1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-12-24 00:49:06 +00:00

Merge branch 'master' into yt-live-from-start-range

This commit is contained in:
bashonly
2024-02-29 04:58:30 -06:00
committed by GitHub
109 changed files with 5788 additions and 1875 deletions

View File

@@ -138,6 +138,10 @@ from .ard import (
ARDMediathekCollectionIE,
ARDIE,
)
from .art19 import (
Art19IE,
Art19ShowIE,
)
from .arte import (
ArteTVIE,
ArteTVEmbedIE,
@@ -253,6 +257,7 @@ from .blogger import BloggerIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
from .boosty import BoostyIE
from .bostonglobe import BostonGlobeIE
from .box import BoxIE
from .boxcast import BoxCastVideoIE
@@ -369,11 +374,11 @@ from .clippit import ClippitIE
from .cliprs import ClipRsIE
from .closertotruth import CloserToTruthIE
from .cloudflarestream import CloudflareStreamIE
from .cloudycdn import CloudyCDNIE
from .clubic import ClubicIE
from .clyp import ClypIE
from .cmt import CMTIE
from .cnbc import (
CNBCIE,
CNBCVideoIE,
)
from .cnn import (
@@ -564,6 +569,7 @@ from .eroprofile import (
EroProfileIE,
EroProfileAlbumIE,
)
from .err import ERRJupiterIE
from .ertgr import (
ERTFlixCodenameIE,
ERTFlixIE,
@@ -588,6 +594,7 @@ from .facebook import (
FacebookPluginsVideoIE,
FacebookRedirectURLIE,
FacebookReelIE,
FacebookAdsIE,
)
from .fancode import (
FancodeVodIE,
@@ -610,6 +617,7 @@ from .filmon import (
from .filmweb import FilmwebIE
from .firsttv import FirstTVIE
from .fivetv import FiveTVIE
from .flextv import FlexTVIE
from .flickr import FlickrIE
from .floatplane import (
FloatplaneIE,
@@ -1000,6 +1008,11 @@ from .lrt import (
LRTVODIE,
LRTStreamIE
)
from .lsm import (
LSMLREmbedIE,
LSMLTVEmbedIE,
LSMReplayIE
)
from .lumni import (
LumniIE
)
@@ -1111,6 +1124,7 @@ from .motherless import (
MotherlessIE,
MotherlessGroupIE,
MotherlessGalleryIE,
MotherlessUploaderIE,
)
from .motorsport import MotorsportIE
from .moviepilot import MoviepilotIE
@@ -1137,6 +1151,11 @@ from .musicdex import (
MusicdexArtistIE,
MusicdexPlaylistIE,
)
from .mx3 import (
Mx3IE,
Mx3NeoIE,
Mx3VolksmusikIE,
)
from .mxplayer import (
MxplayerIE,
MxplayerShowIE,
@@ -1229,7 +1248,10 @@ from .nexx import (
NexxIE,
NexxEmbedIE,
)
from .nfb import NFBIE
from .nfb import (
NFBIE,
NFBSeriesIE,
)
from .nfhsnetwork import NFHSNetworkIE
from .nfl import (
NFLIE,
@@ -1266,6 +1288,7 @@ from .niconico import (
NicovideoTagURLIE,
NiconicoLiveIE,
)
from .ninaprotocol import NinaProtocolIE
from .ninecninemedia import (
NineCNineMediaIE,
CPTwentyFourIE,
@@ -1330,6 +1353,12 @@ from .nytimes import (
NYTimesIE,
NYTimesArticleIE,
NYTimesCookingIE,
NYTimesCookingRecipeIE,
)
from .nuum import (
NuumLiveIE,
NuumTabIE,
NuumMediaIE,
)
from .nuvid import NuvidIE
from .nzherald import NZHeraldIE
@@ -1372,6 +1401,7 @@ from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
ORFFM4StoryIE,
ORFONIE,
ORFRadioIE,
ORFPodcastIE,
ORFIPTVIE,
@@ -1496,7 +1526,7 @@ from .puhutv import (
PuhuTVSerieIE,
)
from .pr0gramm import Pr0grammIE
from .prankcast import PrankCastIE
from .prankcast import PrankCastIE, PrankCastPostIE
from .premiershiprugby import PremiershipRugbyIE
from .presstv import PressTVIE
from .projectveritas import ProjectVeritasIE
@@ -1593,6 +1623,7 @@ from .redbulltv import (
RedBullIE,
)
from .reddit import RedditIE
from .redge import RedCDNLivxIE
from .redgifs import (
RedGifsIE,
RedGifsSearchIE,
@@ -1727,6 +1758,7 @@ from .scte import (
)
from .scrolller import ScrolllerIE
from .seeker import SeekerIE
from .sejmpl import SejmIE
from .senalcolombia import SenalColombiaLiveIE
from .senategov import SenateISVPIE, SenateGovIE
from .sendtonews import SendtoNewsIE
@@ -2289,11 +2321,6 @@ from .washingtonpost import (
WashingtonPostIE,
WashingtonPostArticleIE,
)
from .wasdtv import (
WASDTVStreamIE,
WASDTVRecordIE,
WASDTVClipIE,
)
from .wat import WatIE
from .wdr import (
WDRIE,
@@ -2472,6 +2499,7 @@ from .zee5 import (
Zee5SeriesIE,
)
from .zeenews import ZeeNewsIE
from .zetland import ZetlandDKArticleIE
from .zhihu import ZhihuIE
from .zingmp3 import (
ZingMp3IE,

View File

@@ -3,6 +3,7 @@ import binascii
import json
import os
import random
import time
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
@@ -17,6 +18,7 @@ from ..utils import (
int_or_none,
intlist_to_bytes,
long_to_bytes,
parse_iso8601,
pkcs1pad,
strip_or_none,
str_or_none,
@@ -185,7 +187,10 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
user = options['user']
if not user.get('hasAccess'):
self.raise_login_required()
start_date = traverse_obj(options, ('video', 'startDate', {str}))
if (parse_iso8601(start_date) or 0) > time.time():
raise ExtractorError(f'This video is not available yet. Release date: {start_date}', expected=True)
self.raise_login_required('This video requires a subscription', method='password')
token = self._download_json(
user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'),
@@ -267,6 +272,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
f['language'] = 'de'
formats.extend(m3u8_formats)
if not formats:
self.raise_login_required('This video requires a subscription', method='password')
video = (self._download_json(
self._API_BASE_URL + 'video/%s' % video_id, video_id,
'Downloading additional video metadata', fatal=False) or {}).get('video') or {}

View File

@@ -22,7 +22,7 @@ class AltCensoredIE(InfoExtractor):
'title': "QUELLES SONT LES CONSÉQUENCES DE L'HYPERSEXUALISATION DE LA SOCIÉTÉ ?",
'display_id': 'k0srjLSkga8.webm',
'release_date': '20180403',
'creator': 'Virginie Vota',
'creators': ['Virginie Vota'],
'release_year': 2018,
'upload_date': '20230318',
'uploader': 'admin@altcensored.com',
@@ -32,7 +32,7 @@ class AltCensoredIE(InfoExtractor):
'duration': 926.09,
'thumbnail': 'https://archive.org/download/youtube-k0srjLSkga8/youtube-k0srjLSkga8.thumbs/k0srjLSkga8_000925.jpg',
'view_count': int,
'categories': ['News & Politics'],
'categories': ['News & Politics'], # FIXME
}
}]
@@ -62,14 +62,21 @@ class AltCensoredChannelIE(InfoExtractor):
'title': 'Virginie Vota',
'id': 'UCFPTO55xxHqFqkzRZHu4kcw',
},
'playlist_count': 91
'playlist_count': 85,
}, {
'url': 'https://altcensored.com/channel/UC9CcJ96HKMWn0LZlcxlpFTw',
'info_dict': {
'title': 'yukikaze775',
'id': 'UC9CcJ96HKMWn0LZlcxlpFTw',
},
'playlist_count': 4
'playlist_count': 4,
}, {
'url': 'https://altcensored.com/channel/UCfYbb7nga6-icsFWWgS-kWw',
'info_dict': {
'title': 'Mister Metokur',
'id': 'UCfYbb7nga6-icsFWWgS-kWw',
},
'playlist_count': 121,
}]
def _real_extract(self, url):
@@ -78,7 +85,7 @@ class AltCensoredChannelIE(InfoExtractor):
url, channel_id, 'Download channel webpage', 'Unable to get channel webpage')
title = self._html_search_meta('altcen_title', webpage, 'title', fatal=False)
page_count = int_or_none(self._html_search_regex(
r'<a[^>]+href="/channel/\w+/page/(\d+)">(?:\1)</a>',
r'<a[^>]+href="/channel/[\w-]+/page/(\d+)">(?:\1)</a>',
webpage, 'page count', default='1'))
def page_func(page_num):

View File

@@ -78,14 +78,14 @@ class Ant1NewsGrArticleIE(AntennaBaseIE):
_TESTS = [{
'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
'md5': '294f18331bb516539d72d85a82887dcc',
'md5': '57eb8d12181f0fa2b14b0b138e1de9b6',
'info_dict': {
'id': '_xvg/m_cmbatw=',
'ext': 'mp4',
'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
'timestamp': 1603092840,
'upload_date': '20201019',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
'timestamp': 1666166520,
'upload_date': '20221019',
'thumbnail': 'https://ant1media.azureedge.net/imgHandler/1920/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
},
}, {
'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
@@ -117,7 +117,7 @@ class Ant1NewsGrEmbedIE(AntennaBaseIE):
_BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
_VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
_EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
_API_PATH = '/news/templates/data/jsonPlayer'
_API_PATH = '/templates/data/jsonPlayer'
_TESTS = [{
'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',

View File

@@ -300,7 +300,7 @@ class ArchiveOrgIE(InfoExtractor):
is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))
if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
entry['formats'].append({
'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
'url': 'https://archive.org/download/' + identifier + '/' + urllib.parse.quote(f['name']),
'format': f.get('format'),
'width': int_or_none(f.get('width')),
'height': int_or_none(f.get('height')),

View File

@@ -8,6 +8,7 @@ from ..utils import (
determine_ext,
int_or_none,
join_nonempty,
jwt_decode_hs256,
make_archive_id,
parse_duration,
parse_iso8601,
@@ -238,6 +239,7 @@ class ARDBetaMediathekIE(InfoExtractor):
(?P<id>[a-zA-Z0-9]+)
/?(?:[?#]|$)'''
_GEO_COUNTRIES = ['DE']
_TOKEN_URL = 'https://sso.ardmediathek.de/sso/token'
_TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
@@ -359,12 +361,27 @@ class ARDBetaMediathekIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
query = {'embedded': 'false', 'mcV6': 'true'}
headers = {}
if self._get_cookies(self._TOKEN_URL).get('ams'):
token = self._download_json(
self._TOKEN_URL, display_id, 'Fetching token for age verification',
'Unable to fetch age verification token', fatal=False)
id_token = traverse_obj(token, ('idToken', {str}))
decoded_token = traverse_obj(id_token, ({jwt_decode_hs256}, {dict}))
user_id = traverse_obj(decoded_token, (('user_id', 'sub'), {str}), get_all=False)
if not user_id:
self.report_warning('Unable to extract token, continuing without authentication')
else:
headers['x-authorization'] = f'Bearer {id_token}'
query['userId'] = user_id
if decoded_token.get('age_rating') != 18:
self.report_warning('Account is not verified as 18+; video may be unavailable')
page_data = self._download_json(
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}', display_id, query={
'embedded': 'false',
'mcV6': 'true',
})
f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{display_id}',
display_id, query=query, headers=headers)
# For user convenience we use the old contentId instead of the longer crid
# Ref: https://github.com/yt-dlp/yt-dlp/issues/8731#issuecomment-1874398283
@@ -383,7 +400,7 @@ class ARDBetaMediathekIE(InfoExtractor):
media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))
if player_data.get('blockedByFsk'):
self.raise_no_formats('This video is only available after 22:00', expected=True)
self.raise_login_required('This video is only available for age verified users or after 22:00')
formats = []
subtitles = {}

303
yt_dlp/extractor/art19.py Normal file
View File

@@ -0,0 +1,303 @@
import re
from .common import InfoExtractor
from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj
class Art19IE(InfoExtractor):
_UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}'
_VALID_URL = [
rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P<id>{_UUID_REGEX})',
rf'https?://rss\.art19\.com/episodes/(?P<id>{_UUID_REGEX})\.mp3',
]
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL[0]})']
_TESTS = [{
'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3',
'info_dict': {
'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
'ext': 'mp3',
'title': 'Why Did DeSantis Drop Out?',
'series': 'The Daily Briefing',
'release_timestamp': 1705941275,
'description': 'md5:da38961da4a3f7e419471365e3c6b49f',
'episode': 'Episode 582',
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d',
'upload_date': '20240122',
'timestamp': 1705940815,
'episode_number': 582,
'modified_date': '20240122',
'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
'modified_timestamp': 1705941275,
'release_date': '20240122',
'duration': 527.4,
},
}, {
'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd',
'info_dict': {
'id': '8319b776-4153-4d22-8630-631f204a03dd',
'ext': 'mp3',
'title': 'Martha Stewart: The Homemaker Hustler Part 2',
'modified_date': '20240116',
'upload_date': '20240105',
'modified_timestamp': 1705435802,
'episode_id': '8319b776-4153-4d22-8630-631f204a03dd',
'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'description': 'md5:4aa7cfd1358dc57e729835bc208d7893',
'release_timestamp': 1705305660,
'release_date': '20240115',
'timestamp': 1704481536,
'episode_number': 88,
'series': 'Scamfluencers',
'duration': 2588.37501,
'episode': 'Episode 88',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html',
'info_dict': {
'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
'ext': 'mp3',
'title': "'Verstappen wordt een synoniem voor Formule 1'",
'season': 'Seizoen 6',
'description': 'md5:39a7159a31c4cda312b2e893bdd5c071',
'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
'duration': 3061.82111,
'series_id': '93f4e113-2a60-4609-a564-755058fa40d8',
'release_date': '20231126',
'modified_timestamp': 1701156004,
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'season_number': 6,
'episode_number': 52,
'modified_date': '20231128',
'upload_date': '20231126',
'timestamp': 1701025981,
'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26',
'series': 'De Boordradio',
'release_timestamp': 1701026308,
'episode': 'Episode 52',
},
}, {
'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/',
'info_dict': {
'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
'ext': 'mp3',
'title': 'Larry Bucshon announces retirement from congress',
'upload_date': '20240115',
'episode_number': 148,
'episode': 'Episode 148',
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'release_date': '20240115',
'timestamp': 1705328205,
'release_timestamp': 1705329275,
'series': 'All INdiana Politics',
'modified_date': '20240117',
'modified_timestamp': 1705458901,
'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1',
'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
'description': 'md5:53b5239e4d14973a87125c217c255b2a',
'duration': 1256.18848,
},
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
yield from super()._extract_embed_urls(url, webpage)
for episode_id in re.findall(
rf'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]', webpage):
yield f'https://rss.art19.com/episodes/{episode_id}.mp3'
def _real_extract(self, url):
episode_id = self._match_id(url)
player_metadata = self._download_json(
f'https://art19.com/episodes/{episode_id}', episode_id,
note='Downloading player metadata', fatal=False,
headers={'Accept': 'application/vnd.art19.v0+json'})
rss_metadata = self._download_json(
f'https://rss.art19.com/episodes/{episode_id}.json', episode_id, fatal=False,
note='Downloading RSS metadata')
formats = [{
'format_id': 'direct',
'url': f'https://rss.art19.com/episodes/{episode_id}.mp3',
'vcodec': 'none',
'acodec': 'mp3',
}]
for fmt_id, fmt_data in traverse_obj(rss_metadata, ('content', 'media', {dict.items}, ...)):
if fmt_id == 'waveform_bin':
continue
fmt_url = traverse_obj(fmt_data, ('url', {url_or_none}))
if not fmt_url:
continue
formats.append({
'format_id': fmt_id,
'url': fmt_url,
'vcodec': 'none',
'acodec': fmt_id,
'quality': -2 if fmt_id == 'ogg' else -1,
})
return {
'id': episode_id,
'formats': formats,
**traverse_obj(player_metadata, ('episode', {
'title': ('title', {str}),
'description': ('description_plain', {str}),
'episode_id': ('id', {str}),
'episode_number': ('episode_number', {int_or_none}),
'season_id': ('season_id', {str}),
'series_id': ('series_id', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'release_timestamp': ('released_at', {parse_iso8601}),
'modified_timestamp': ('updated_at', {parse_iso8601})
})),
**traverse_obj(rss_metadata, ('content', {
'title': ('episode_title', {str}),
'description': ('episode_description_plain', {str}),
'episode_id': ('episode_id', {str}),
'episode_number': ('episode_number', {int_or_none}),
'season': ('season_title', {str}),
'season_id': ('season_id', {str}),
'season_number': ('season_number', {int_or_none}),
'series': ('series_title', {str}),
'series_id': ('series_id', {str}),
'thumbnail': ('cover_image', {url_or_none}),
'duration': ('duration', {float_or_none}),
})),
}
class Art19ShowIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P<id>[\w-]+)(?:/embed)?/?'
_VALID_URL = [
rf'{_VALID_URL_BASE}(?:$|[#?])',
r'https?://rss\.art19\.com/(?P<id>[\w-]+)/?(?:$|[#?])',
]
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL_BASE}[^\'"])']
_TESTS = [{
'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/',
'info_dict': {
'_type': 'playlist',
'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
'display_id': 'echt-gebeurd',
'title': 'Echt Gebeurd',
'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
'timestamp': 1492642167,
'upload_date': '20170419',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:7',
},
'playlist_mincount': 425,
}, {
'url': 'https://www.art19.com/shows/echt-gebeurd',
'info_dict': {
'_type': 'playlist',
'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
'display_id': 'echt-gebeurd',
'title': 'Echt Gebeurd',
'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
'timestamp': 1492642167,
'upload_date': '20170419',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:7',
},
'playlist_mincount': 425,
}, {
'url': 'https://rss.art19.com/scamfluencers',
'info_dict': {
'_type': 'playlist',
'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
'display_id': 'scamfluencers',
'title': 'Scamfluencers',
'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7',
'timestamp': 1647368573,
'upload_date': '20220315',
'modified_timestamp': int,
'modified_date': str,
'tags': [],
},
'playlist_mincount': 90,
}, {
'url': 'https://art19.com/shows/enthuellt/embed',
'info_dict': {
'_type': 'playlist',
'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c',
'display_id': 'enthuellt',
'title': 'Enthüllt',
'description': 'md5:17752246643414a2fd51744fc9a1c08e',
'timestamp': 1601645860,
'upload_date': '20201002',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:10',
},
'playlist_mincount': 10,
}]
_WEBPAGE_TESTS = [{
'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast',
'info_dict': {
'_type': 'playlist',
'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21',
'display_id': 'deconstructing-yourself',
'title': 'Deconstructing Yourself',
'description': 'md5:dab5082b28b248a35476abf64768854d',
'timestamp': 1570581181,
'upload_date': '20191009',
'modified_timestamp': int,
'modified_date': str,
'tags': 'count:5',
},
'playlist_mincount': 80,
}, {
'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/',
'info_dict': {
'_type': 'playlist',
'id': '9dfa2c37-ab87-4c13-8388-4897914313ec',
'display_id': 'the-ben-joravsky-show',
'title': 'The Ben Joravsky Show',
'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a',
'timestamp': 1550875095,
'upload_date': '20190222',
'modified_timestamp': int,
'modified_date': str,
'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'],
},
'playlist_mincount': 1900,
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
yield from super()._extract_embed_urls(url, webpage)
for series_id in re.findall(
r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"][^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]', webpage):
yield f'https://art19.com/shows/{series_id}'
def _real_extract(self, url):
series_id = self._match_id(url)
series_metadata = self._download_json(
f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata',
headers={'Accept': 'application/vnd.art19.v0+json'})
return {
'_type': 'playlist',
'entries': [
self.url_result(f'https://rss.art19.com/episodes/{episode_id}.mp3', Art19IE)
for episode_id in traverse_obj(series_metadata, ('series', 'episode_ids', ..., {str}))
],
**traverse_obj(series_metadata, ('series', {
'id': ('id', {str}),
'display_id': ('slug', {str}),
'title': ('title', {str}),
'description': ('description_plain', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'modified_timestamp': ('updated_at', {parse_iso8601}),
})),
'tags': traverse_obj(series_metadata, ('tags', ..., 'name', {str})),
}

View File

@@ -7,6 +7,7 @@ import math
import re
import time
import urllib.parse
import uuid
from .common import InfoExtractor, SearchInfoExtractor
from ..dependencies import Cryptodome
@@ -1304,6 +1305,26 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
'upload_date': '20211127',
},
'playlist_mincount': 513,
}, {
'url': 'https://www.bilibili.com/list/1958703906?sid=547718&oid=687146339&bvid=BV1DU4y1r7tz',
'info_dict': {
'id': 'BV1DU4y1r7tz',
'ext': 'mp4',
'title': '【直播回放】8.20晚9:30 3d发布喵 2022年8月20日21点场',
'upload_date': '20220820',
'description': '',
'timestamp': 1661016330,
'uploader_id': '1958703906',
'uploader': '靡烟miya',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'duration': 9552.903,
'tags': list,
'comment_count': int,
'view_count': int,
'like_count': int,
'_old_archive_ids': ['bilibili 687146339_part1'],
},
'params': {'noplaylist': True},
}, {
'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
'info_dict': {
@@ -1355,6 +1376,11 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
def _real_extract(self, url):
list_id = self._match_id(url)
bvid = traverse_obj(parse_qs(url), ('bvid', 0))
if not self._yes_playlist(list_id, bvid):
return self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE)
webpage = self._download_webpage(url, list_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
@@ -1464,8 +1490,37 @@ class BiliBiliSearchIE(SearchInfoExtractor):
IE_DESC = 'Bilibili video search'
_MAX_RESULTS = 100000
_SEARCH_KEY = 'bilisearch'
_TESTS = [{
'url': 'bilisearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
'playlist_count': 3,
'info_dict': {
'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
},
'playlist': [{
'info_dict': {
'id': 'BV1n44y1Q7sc',
'ext': 'mp4',
'title': '“出道一年我怎么还在等你单推的女人睡觉后开播啊”【一分钟了解靡烟miya】',
'timestamp': 1669889987,
'upload_date': '20221201',
'description': 'md5:43343c0973defff527b5a4b403b4abf9',
'tags': list,
'uploader': '靡烟miya',
'duration': 123.156,
'uploader_id': '1958703906',
'comment_count': int,
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 988222410_part1'],
},
}],
}]
def _search_results(self, query):
if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
for page_num in itertools.count(1):
videos = self._download_json(
'https://api.bilibili.com/x/web-interface/search/type', query,
@@ -1941,7 +1996,7 @@ class BiliIntlIE(BiliIntlBaseIE):
'title': get_element_by_class(
'bstar-meta__title', webpage) or self._html_search_meta('og:title', webpage),
'description': get_element_by_class(
'bstar-meta__desc', webpage) or self._html_search_meta('og:description'),
'bstar-meta__desc', webpage) or self._html_search_meta('og:description', webpage),
}, self._search_json_ld(webpage, video_id, default={}))
def _get_comments_reply(self, root_id, next_id=0, display_id=None):

209
yt_dlp/extractor/boosty.py Normal file
View File

@@ -0,0 +1,209 @@
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
int_or_none,
qualities,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class BoostyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?boosty\.to/(?P<user>[^/#?]+)/posts/(?P<post_id>[^/#?]+)'
_TESTS = [{
# single ok_video
'url': 'https://boosty.to/kuplinov/posts/e55d050c-e3bb-4873-a7db-ac7a49b40c38',
'info_dict': {
'id': 'd7473824-352e-48e2-ae53-d4aa39459968',
'title': 'phasma_3',
'channel': 'Kuplinov',
'channel_id': '7958701',
'timestamp': 1655031975,
'upload_date': '20220612',
'release_timestamp': 1655049000,
'release_date': '20220612',
'modified_timestamp': 1668680993,
'modified_date': '20221117',
'tags': ['куплинов', 'phasmophobia'],
'like_count': int,
'ext': 'mp4',
'duration': 105,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}, {
# multiple ok_video
'url': 'https://boosty.to/maddyson/posts/0c652798-3b35-471f-8b48-a76a0b28736f',
'info_dict': {
'id': '0c652798-3b35-471f-8b48-a76a0b28736f',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
},
'playlist_count': 3,
'playlist': [{
'info_dict': {
'id': 'cc325a9f-a563-41c6-bf47-516c1b506c9a',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
'ext': 'mp4',
'duration': 31204,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}, {
'info_dict': {
'id': 'd07b0a72-9493-4512-b54e-55ce468fd4b7',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
'ext': 'mp4',
'duration': 25704,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}, {
'info_dict': {
'id': '4a3bba32-78c8-422a-9432-2791aff60b42',
'title': 'то что не пропустил юта6',
'channel': 'Илья Давыдов',
'channel_id': '6808257',
'timestamp': 1694017040,
'upload_date': '20230906',
'release_timestamp': 1694017040,
'release_date': '20230906',
'modified_timestamp': 1694071178,
'modified_date': '20230907',
'like_count': int,
'ext': 'mp4',
'duration': 31867,
'view_count': int,
'thumbnail': r're:^https://i\.mycdn\.me/videoPreview\?',
},
}],
}, {
# single external video (youtube)
'url': 'https://boosty.to/denischuzhoy/posts/6094a487-bcec-4cf8-a453-43313b463c38',
'info_dict': {
'id': 'EXelTnve5lY',
'title': 'Послание Президента Федеральному Собранию | Класс народа',
'upload_date': '20210425',
'channel': 'Денис Чужой',
'tags': 'count:10',
'like_count': int,
'ext': 'mp4',
'duration': 816,
'view_count': int,
'thumbnail': r're:^https://i\.ytimg\.com/',
'age_limit': 0,
'availability': 'public',
'categories': list,
'channel_follower_count': int,
'channel_id': 'UCCzVNbWZfYpBfyofCCUD_0w',
'channel_is_verified': bool,
'channel_url': r're:^https://www\.youtube\.com/',
'comment_count': int,
'description': str,
'heatmap': 'count:100',
'live_status': str,
'playable_in_embed': bool,
'uploader': str,
'uploader_id': str,
'uploader_url': r're:^https://www\.youtube\.com/',
},
}]
_MP4_TYPES = ('tiny', 'lowest', 'low', 'medium', 'high', 'full_hd', 'quad_hd', 'ultra_hd')
def _extract_formats(self, player_urls, video_id):
formats = []
quality = qualities(self._MP4_TYPES)
for player_url in traverse_obj(player_urls, lambda _, v: url_or_none(v['url'])):
url = player_url['url']
format_type = player_url.get('type')
if format_type in ('hls', 'hls_live', 'live_ondemand_hls', 'live_playback_hls'):
formats.extend(self._extract_m3u8_formats(url, video_id, m3u8_id='hls', fatal=False))
elif format_type in ('dash', 'dash_live', 'live_playback_dash'):
formats.extend(self._extract_mpd_formats(url, video_id, mpd_id='dash', fatal=False))
elif format_type in self._MP4_TYPES:
formats.append({
'url': url,
'ext': 'mp4',
'format_id': format_type,
'quality': quality(format_type),
})
else:
self.report_warning(f'Unknown format type: {format_type!r}')
return formats
def _real_extract(self, url):
user, post_id = self._match_valid_url(url).group('user', 'post_id')
post = self._download_json(
f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id,
note='Downloading post data', errnote='Unable to download post data')
post_title = post.get('title')
if not post_title:
self.report_warning('Unable to extract post title. Falling back to parsing html page')
webpage = self._download_webpage(url, video_id=post_id)
post_title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
common_metadata = {
'title': post_title,
**traverse_obj(post, {
'channel': ('user', 'name', {str}),
'channel_id': ('user', 'id', {str_or_none}),
'timestamp': ('createdAt', {int_or_none}),
'release_timestamp': ('publishTime', {int_or_none}),
'modified_timestamp': ('updatedAt', {int_or_none}),
'tags': ('tags', ..., 'title', {str}),
'like_count': ('count', 'likes', {int_or_none}),
}),
}
entries = []
for item in traverse_obj(post, ('data', ..., {dict})):
item_type = item.get('type')
if item_type == 'video' and url_or_none(item.get('url')):
entries.append(self.url_result(item['url'], YoutubeIE))
elif item_type == 'ok_video':
video_id = item.get('id') or post_id
entries.append({
'id': video_id,
'formats': self._extract_formats(item.get('playerUrls'), video_id),
**common_metadata,
**traverse_obj(item, {
'title': ('title', {str}),
'duration': ('duration', {int_or_none}),
'view_count': ('viewsCounter', {int_or_none}),
'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}),
}, get_all=False)})
if not entries:
raise ExtractorError('No videos found', expected=True)
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, post_id, post_title, **common_metadata)

View File

@@ -1,6 +1,7 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
determine_ext,
int_or_none,
parse_duration,
parse_resolution,
@@ -60,6 +61,7 @@ class CCMAIE(InfoExtractor):
'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
'media': media_type,
'idint': media_id,
'format': 'dm',
})
formats = []
@@ -69,6 +71,10 @@ class CCMAIE(InfoExtractor):
format_url = url_or_none(format_.get('file'))
if not format_url:
continue
if determine_ext(format_url) == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, media_id, mpd_id='dash', fatal=False))
continue
label = format_.get('label')
f = parse_resolution(label)
f.update({

View File

@@ -67,7 +67,10 @@ class CineverseIE(CineverseBaseIE):
html = self._download_webpage(url, video_id)
idetails = self._search_nextjs_data(html, video_id)['props']['pageProps']['idetails']
if idetails.get('err_code') == 1200:
err_code = idetails.get('err_code')
if err_code == 1002:
self.raise_login_required()
elif err_code == 1200:
self.raise_geo_restricted(
'This video is not available from your location due to geo restriction. '
'You may be able to bypass it by using the /details/ page instead of the /watch/ page',

View File

@@ -4,27 +4,25 @@ from .common import InfoExtractor
class CloudflareStreamIE(InfoExtractor):
_SUBDOMAIN_RE = r'(?:(?:watch|iframe|customer-\w+)\.)?'
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
_EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE
_EMBED_RE = rf'embed\.{_DOMAIN_RE}/embed/[^/]+\.js\?.*?\bvideo='
_ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+'
_VALID_URL = r'''(?x)
https?://
(?:
(?:watch\.)?%s/|
%s
)
(?P<id>%s)
''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
_EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1']
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
_EMBED_REGEX = [
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1',
rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
]
_TESTS = [{
'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
'info_dict': {
'id': '31c9291ab41fac05471db4e73aa11717',
'ext': 'mp4',
'title': '31c9291ab41fac05471db4e73aa11717',
'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': True,
'skip_download': 'm3u8',
},
}, {
'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
@@ -35,6 +33,21 @@ class CloudflareStreamIE(InfoExtractor):
}, {
'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e',
'only_matching': True,
}, {
'url': 'https://customer-aw5py76sw8wyqzmh.cloudflarestream.com/2463f6d3e06fa29710a337f5f5389fd8/iframe',
'only_matching': True,
}]
_WEBPAGE_TESTS = [{
'url': 'https://upride.cc/incident/shoulder-pass-at-light/',
'info_dict': {
'id': 'eaef9dea5159cf968be84241b5cedfe7',
'ext': 'mp4',
'title': 'eaef9dea5159cf968be84241b5cedfe7',
'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
},
}]
def _real_extract(self, url):

View File

@@ -0,0 +1,79 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import traverse_obj
class CloudyCDNIE(InfoExtractor):
_VALID_URL = r'(?:https?:)?//embed\.cloudycdn\.services/(?P<site_id>[^/?#]+)/media/(?P<id>[\w-]+)'
_EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://embed.cloudycdn.services/ltv/media/46k_d23-6000-105?',
'md5': '64f72a360ca530d5ed89c77646c9eee5',
'info_dict': {
'id': '46k_d23-6000-105',
'ext': 'mp4',
'timestamp': 1700589151,
'duration': 1442,
'upload_date': '20231121',
'title': 'D23-6000-105_cetstud',
'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
}
}, {
'url': 'https://embed.cloudycdn.services/izm/media/26e_lv-8-5-1',
'md5': '798828a479151e2444d8dcfbec76e482',
'info_dict': {
'id': '26e_lv-8-5-1',
'ext': 'mp4',
'title': 'LV-8-5-1',
'timestamp': 1669767167,
'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/488306/placeholder1679423604.jpg',
'duration': 1205,
'upload_date': '20221130',
}
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.tavaklase.lv/video/es-esmu-mina-um-2/',
'md5': '63074e8e6c84ac2a01f2fb8bf03b8f43',
'info_dict': {
'id': 'cqd_lib-2',
'ext': 'mp4',
'upload_date': '20230223',
'duration': 629,
'thumbnail': 'https://store.cloudycdn.services/tmsp00120/assets/media/518407/placeholder1678748124.jpg',
'timestamp': 1677181513,
'title': 'LIB-2',
}
}]
def _real_extract(self, url):
site_id, video_id = self._match_valid_url(url).group('site_id', 'id')
data = self._download_json(
f'https://player.cloudycdn.services/player/{site_id}/media/{video_id}/',
video_id, data=urlencode_postdata({
'version': '6.4.0',
'referer': url,
}))
formats, subtitles = [], {}
for m3u8_url in traverse_obj(data, ('source', 'sources', ..., 'src', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(data, {
'title': ('name', {str}),
'duration': ('duration', {int_or_none}),
'timestamp': ('upload_date', {parse_iso8601}),
'thumbnail': ('source', 'poster', {url_or_none}),
}),
}

View File

@@ -1,68 +1,97 @@
from .common import InfoExtractor
from ..utils import smuggle_url
class CNBCIE(InfoExtractor):
_VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://video.cnbc.com/gallery/?video=3000503714',
'info_dict': {
'id': '3000503714',
'ext': 'mp4',
'title': 'Fighting zombies is big business',
'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e',
'timestamp': 1459332000,
'upload_date': '20160330',
'uploader': 'NBCU-CNBC',
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Dead link',
}
def _real_extract(self, url):
video_id = self._match_id(url)
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(
'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id,
{'force_smil_url': True}),
'id': video_id,
}
from ..utils import int_or_none, parse_iso8601, str_or_none, url_or_none
from ..utils.traversal import traverse_obj
class CNBCVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
_VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/?#]+/)+(?P<id>[^./?#&]+)\.html'
_TESTS = [{
'url': 'https://www.cnbc.com/video/2023/12/07/mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand.html',
'info_dict': {
'id': '7000031301',
'ext': 'mp4',
'title': "Trump: I don't necessarily agree with raising rates",
'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3',
'timestamp': 1531958400,
'upload_date': '20180719',
'uploader': 'NBCU-CNBC',
'id': '107344774',
'display_id': 'mcdonalds-just-unveiled-cosmcsits-new-spinoff-brand',
'modified_timestamp': 1702053483,
'timestamp': 1701977810,
'channel': 'News Videos',
'upload_date': '20231207',
'description': 'md5:882c001d85cb43d7579b514307b3e78b',
'release_timestamp': 1701977375,
'modified_date': '20231208',
'release_date': '20231207',
'duration': 65,
'author': 'Sean Conlon',
'title': 'Here\'s a first look at McDonald\'s new spinoff brand, CosMc\'s',
'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107344192-1701894812493-CosMcsskyHero_2336x1040_hero-desktop.jpg?v=1701894855',
},
'params': {
'skip_download': True,
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.cnbc.com/video/2023/12/08/jim-cramer-shares-his-take-on-seattles-tech-scene.html',
'info_dict': {
'author': 'Jim Cramer',
'channel': 'Mad Money with Jim Cramer',
'description': 'md5:72925be21b952e95eba51178dddf4e3e',
'duration': 299.0,
'ext': 'mp4',
'id': '107345451',
'display_id': 'jim-cramer-shares-his-take-on-seattles-tech-scene',
'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345481-1702079431MM-B-120823.jpg?v=1702079430',
'timestamp': 1702080139,
'title': 'Jim Cramer shares his take on Seattle\'s tech scene',
'release_date': '20231208',
'upload_date': '20231209',
'modified_timestamp': 1702080139,
'modified_date': '20231209',
'release_timestamp': 1702073551,
},
'skip': 'Dead link',
}
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.cnbc.com/video/2023/12/08/the-epicenter-of-ai-is-in-seattle-says-jim-cramer.html',
'info_dict': {
'author': 'Jim Cramer',
'channel': 'Mad Money with Jim Cramer',
'description': 'md5:72925be21b952e95eba51178dddf4e3e',
'duration': 113.0,
'ext': 'mp4',
'id': '107345474',
'display_id': 'the-epicenter-of-ai-is-in-seattle-says-jim-cramer',
'thumbnail': 'https://image.cnbcfm.com/api/v1/image/107345486-Screenshot_2023-12-08_at_70339_PM.png?v=1702080248',
'timestamp': 1702080535,
'title': 'The epicenter of AI is in Seattle, says Jim Cramer',
'release_timestamp': 1702077347,
'modified_timestamp': 1702080535,
'release_date': '20231208',
'upload_date': '20231209',
'modified_date': '20231209',
},
'expected_warnings': ['Unable to download f4m manifest'],
}]
def _real_extract(self, url):
path, display_id = self._match_valid_url(url).groups()
video_id = self._download_json(
'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
'query': '''{
page(path: "%s") {
vcpsId
}
}''' % path,
})['data']['page']['vcpsId']
return self.url_result(
'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
data = self._search_json(r'window\.__s_data=', webpage, 'video data', display_id)
player_data = traverse_obj(data, (
'page', 'page', 'layout', ..., 'columns', ..., 'modules',
lambda _, v: v['name'] == 'clipPlayer', 'data', {dict}), get_all=False)
return {
'id': display_id,
'display_id': display_id,
'formats': self._extract_akamai_formats(player_data['playbackURL'], display_id),
**self._search_json_ld(webpage, display_id, fatal=False),
**traverse_obj(player_data, {
'id': ('id', {str_or_none}),
'title': ('title', {str}),
'description': ('description', {str}),
'author': ('author', ..., 'name', {str}),
'timestamp': ('datePublished', {parse_iso8601}),
'release_timestamp': ('uploadDate', {parse_iso8601}),
'modified_timestamp': ('dateLastPublished', {parse_iso8601}),
'thumbnail': ('thumbnail', {url_or_none}),
'duration': ('duration', {int_or_none}),
'channel': ('section', 'title', {str}),
}, get_all=False),
}

View File

@@ -247,6 +247,8 @@ class InfoExtractor:
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
* ffmpeg_args Extra arguments for ffmpeg downloader
* is_dash_periods Whether the format is a result of merging
multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
@@ -278,7 +280,7 @@ class InfoExtractor:
description: Full video description.
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
creators: List of creators of the video.
timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date in UTC (YYYYMMDD).
If not explicitly set, calculated from timestamp
@@ -422,16 +424,16 @@ class InfoExtractor:
track_number: Number of the track within an album or a disc, as an integer.
track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
as a unicode string.
artist: Artist(s) of the track.
genre: Genre(s) of the track.
artists: List of artists of the track.
composers: List of composers of the piece.
genres: List of genres of the track.
album: Title of the album the track belongs to.
album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
album_artist: List of all artists appeared on the album (e.g.
"Ash Borer / Fell Voices" or "Various Artists", useful for splits
and compilations).
album_artists: List of all artists appeared on the album.
E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
Useful for splits and compilations.
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
composer: Composer of the piece
The following fields should only be set for clips that should be cut from the original video:
@@ -442,6 +444,18 @@ class InfoExtractor:
rows: Number of rows in each storyboard fragment, as an integer
columns: Number of columns in each storyboard fragment, as an integer
The following fields are deprecated and should not be set by new code:
composer: Use "composers" instead.
Composer(s) of the piece, comma-separated.
artist: Use "artists" instead.
Artist(s) of the track, comma-separated.
genre: Use "genres" instead.
Genre(s) of the track, comma-separated.
album_artist: Use "album_artists" instead.
All artists appeared on the album, comma-separated.
creator: Use "creators" instead.
The creator of the video.
Unless mentioned otherwise, the fields should be Unicode strings.
Unless mentioned otherwise, None is equivalent to absence of information.
@@ -2530,7 +2544,11 @@ class InfoExtractor:
self._report_ignoring_subs('DASH')
return fmts
def _extract_mpd_formats_and_subtitles(
def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
periods = self._extract_mpd_periods(*args, **kwargs)
return self._merge_mpd_periods(periods)
def _extract_mpd_periods(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
@@ -2543,17 +2561,16 @@ class InfoExtractor:
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return [], {}
return []
mpd_doc, urlh = res
if mpd_doc is None:
return [], {}
return []
# We could have been redirected to a new url when we retrieved our mpd file.
mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats_and_subtitles(
mpd_doc, mpd_id, mpd_base_url, mpd_url)
return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
@@ -2561,8 +2578,39 @@ class InfoExtractor:
self._report_ignoring_subs('DASH')
return fmts
def _parse_mpd_formats_and_subtitles(
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
periods = self._parse_mpd_periods(*args, **kwargs)
return self._merge_mpd_periods(periods)
def _merge_mpd_periods(self, periods):
"""
Combine all formats and subtitles from an MPD manifest into a single list,
by concatenate streams with similar formats.
"""
formats, subtitles = {}, {}
for period in periods:
for f in period['formats']:
assert 'is_dash_periods' not in f, 'format already processed'
f['is_dash_periods'] = True
format_key = tuple(v for k, v in f.items() if k not in (
('format_id', 'fragments', 'manifest_stream_number')))
if format_key not in formats:
formats[format_key] = f
elif 'fragments' in f:
formats[format_key].setdefault('fragments', []).extend(f['fragments'])
if subtitles and period['subtitles']:
self.report_warning(bug_reports_message(
'Found subtitles in multiple periods in the DASH manifest; '
'if part of the subtitles are missing,'
), only_once=True)
for sub_lang, sub_info in period['subtitles'].items():
subtitles.setdefault(sub_lang, []).extend(sub_info)
return list(formats.values()), subtitles
def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@@ -2643,14 +2691,17 @@ class InfoExtractor:
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
availability_start_time = unified_timestamp(
mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0
formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
# segmentIngestTime is completely out of spec, but YT Livestream do this
segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime')
if segment_ingest_time:
availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True)
period_entry = {
'id': period.get('id', f'period-{period_idx}'),
'formats': [],
'subtitles': collections.defaultdict(list),
}
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
@@ -2908,11 +2959,10 @@ class InfoExtractor:
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
formats.append(f)
period_entry['formats'].append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
return formats, subtitles
period_entry['subtitles'][lang or 'und'].append(f)
yield period_entry
def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)

View File

@@ -33,10 +33,7 @@ class CrooksAndLiarsIE(InfoExtractor):
webpage = self._download_webpage(
'http://embed.crooksandliars.com/embed/%s' % video_id, video_id)
manifest = self._parse_json(
self._search_regex(
r'var\s+manifest\s*=\s*({.+?})\n', webpage, 'manifest JSON'),
video_id)
manifest = self._search_json(r'var\s+manifest\s*=', webpage, 'manifest JSON', video_id)
quality = qualities(('webm_low', 'mp4_low', 'webm_high', 'mp4_high'))

View File

@@ -1,8 +1,10 @@
from .common import InfoExtractor
from ..utils import (
encode_base_n,
ExtractorError,
encode_base_n,
get_elements_by_class,
int_or_none,
join_nonempty,
merge_dicts,
parse_duration,
str_to_int,
@@ -81,6 +83,7 @@ class EpornerIE(InfoExtractor):
sources = video['sources']
formats = []
has_av1 = bool(get_elements_by_class('download-av1', webpage))
for kind, formats_dict in sources.items():
if not isinstance(formats_dict, dict):
continue
@@ -106,6 +109,14 @@ class EpornerIE(InfoExtractor):
'height': height,
'fps': fps,
})
if has_av1:
formats.append({
'url': src.replace('.mp4', '-av1.mp4'),
'format_id': join_nonempty('av1', format_id),
'height': height,
'fps': fps,
'vcodec': 'av1',
})
json_ld = self._search_json_ld(webpage, display_id, default={})

224
yt_dlp/extractor/err.py Normal file
View File

@@ -0,0 +1,224 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class ERRJupiterIE(InfoExtractor):
_VALID_URL = r'https?://(?:jupiter(?:pluss)?|lasteekraan)\.err\.ee/(?P<id>\d+)'
_TESTS = [{
'note': 'Jupiter: Movie: siin-me-oleme',
'url': 'https://jupiter.err.ee/1211107/siin-me-oleme',
'md5': '9b45d1682a98853acaa1e1b0c791f425',
'info_dict': {
'id': '1211107',
'ext': 'mp4',
'title': 'Siin me oleme!',
'alt_title': '',
'description': 'md5:1825b795f5f7584241aeb59e5bbb4f70',
'release_date': '20231226',
'upload_date': '20201217',
'modified_date': '20201217',
'release_timestamp': 1703577600,
'timestamp': 1608210000,
'modified_timestamp': 1608220800,
'release_year': 1978,
},
}, {
'note': 'Jupiter: Series: Impulss',
'url': 'https://jupiter.err.ee/1609145945/impulss',
'md5': 'a378486df07ed1ba74e46cc861886243',
'info_dict': {
'id': '1609145945',
'ext': 'mp4',
'title': 'Impulss',
'alt_title': 'Loteriipilet hooldekodusse',
'description': 'md5:fa8a2ed0cdccb130211513443ee4d571',
'release_date': '20231107',
'upload_date': '20231026',
'modified_date': '20231118',
'release_timestamp': 1699380000,
'timestamp': 1698327601,
'modified_timestamp': 1700311802,
'series': 'Impulss',
'season': 'Season 1',
'season_number': 1,
'episode': 'Loteriipilet hooldekodusse',
'episode_number': 6,
'series_id': '1609108187',
'release_year': 2023,
'episode_id': '1609145945',
},
}, {
'note': 'Jupiter: Radio Show: mnemoturniir episode',
'url': 'https://jupiter.err.ee/1037919/mnemoturniir',
'md5': 'f1eb95fe66f9620ff84e81bbac37076a',
'info_dict': {
'id': '1037919',
'ext': 'm4a',
'title': 'Mnemoturniir',
'alt_title': '',
'description': 'md5:626db52394e7583c26ab74d6a34d9982',
'release_date': '20240121',
'upload_date': '20240108',
'modified_date': '20240121',
'release_timestamp': 1705827900,
'timestamp': 1704675602,
'modified_timestamp': 1705827601,
'series': 'Mnemoturniir',
'season': 'Season 0',
'season_number': 0,
'episode': 'Episode 0',
'episode_number': 0,
'series_id': '1037919',
'release_year': 2024,
'episode_id': '1609215101',
},
}, {
'note': 'Jupiter+: Clip: bolee-zelenyj-tallinn',
'url': 'https://jupiterpluss.err.ee/1609180445/bolee-zelenyj-tallinn',
'md5': '1b812270c4daf6ce51c06bfeaf33ed95',
'info_dict': {
'id': '1609180445',
'ext': 'mp4',
'title': 'Более зеленый Таллинн',
'alt_title': '',
'description': 'md5:fd34d9bf939c28c4a725b19a7f0d6320',
'release_date': '20231224',
'upload_date': '20231130',
'modified_date': '20231207',
'release_timestamp': 1703423400,
'timestamp': 1701338400,
'modified_timestamp': 1701967200,
'release_year': 2023,
},
}, {
'note': 'Jupiter+: Series: The Sniffer',
'url': 'https://jupiterpluss.err.ee/1608311387/njuhach',
'md5': '2abdeb7131ce551bce49e8d0cea08536',
'info_dict': {
'id': '1608311387',
'ext': 'mp4',
'title': 'Нюхач',
'alt_title': '',
'description': 'md5:8c5c7d8f32ec6e54cd498c9e59ca83bc',
'release_date': '20230601',
'upload_date': '20210818',
'modified_date': '20210903',
'release_timestamp': 1685633400,
'timestamp': 1629318000,
'modified_timestamp': 1630686000,
'release_year': 2013,
'episode': 'Episode 1',
'episode_id': '1608311390',
'episode_number': 1,
'season': 'Season 1',
'season_number': 1,
'series': 'Нюхач',
'series_id': '1608311387',
},
}, {
'note': 'Jupiter+: Podcast: lesnye-istorii-aisty',
'url': 'https://jupiterpluss.err.ee/1608990335/lesnye-istorii-aisty',
'md5': '8b46d7e4510b254a14b7a52211b5bf96',
'info_dict': {
'id': '1608990335',
'ext': 'm4a',
'title': 'Лесные истории | Аисты',
'alt_title': '',
'description': 'md5:065e721623e271e7a63e6540d409ca6b',
'release_date': '20230609',
'upload_date': '20230527',
'modified_date': '20230608',
'release_timestamp': 1686308700,
'timestamp': 1685145600,
'modified_timestamp': 1686252600,
'release_year': 2023,
'episode': 'Episode 0',
'episode_id': '1608990335',
'episode_number': 0,
'season': 'Season 0',
'season_number': 0,
'series': 'Лесные истории | Аисты',
'series_id': '1037497',
}
}, {
'note': 'Lasteekraan: Pätu',
'url': 'https://lasteekraan.err.ee/1092243/patu',
'md5': 'a67eb9b9bcb3d201718c15d1638edf77',
'info_dict': {
'id': '1092243',
'ext': 'mp4',
'title': 'Pätu',
'alt_title': '',
'description': 'md5:64a7b5a80afd7042d3f8ec48c77befd9',
'release_date': '20230614',
'upload_date': '20200520',
'modified_date': '20200520',
'release_timestamp': 1686745800,
'timestamp': 1589975640,
'modified_timestamp': 1589975640,
'release_year': 1990,
'episode': 'Episode 1',
'episode_id': '1092243',
'episode_number': 1,
'season': 'Season 1',
'season_number': 1,
'series': 'Pätu',
'series_id': '1092236',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
'https://services.err.ee/api/v2/vodContent/getContentPageData', video_id,
query={'contentId': video_id})['data']['mainContent']
media_data = traverse_obj(data, ('medias', ..., {dict}), get_all=False)
if traverse_obj(media_data, ('restrictions', 'drm', {bool})):
self.report_drm(video_id)
formats, subtitles = [], {}
for format_url in set(traverse_obj(media_data, ('src', ('hls', 'hls2', 'hlsNew'), {url_or_none}))):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
for format_url in set(traverse_obj(media_data, ('src', ('dash', 'dashNew'), {url_or_none}))):
fmts, subs = self._extract_mpd_formats_and_subtitles(
format_url, video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if format_url := traverse_obj(media_data, ('src', 'file', {url_or_none})):
formats.append({
'url': format_url,
'format_id': 'http',
})
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(data, {
'title': ('heading', {str}),
'alt_title': ('subHeading', {str}),
'description': (('lead', 'body'), {clean_html}, {lambda x: x or None}),
'timestamp': ('created', {int_or_none}),
'modified_timestamp': ('updated', {int_or_none}),
'release_timestamp': (('scheduleStart', 'publicStart'), {int_or_none}),
'release_year': ('year', {int_or_none}),
}, get_all=False),
**(traverse_obj(data, {
'series': ('heading', {str}),
'series_id': ('rootContentId', {str_or_none}),
'episode': ('subHeading', {str}),
'season_number': ('season', {int_or_none}),
'episode_number': ('episode', {int_or_none}),
'episode_id': ('id', {str_or_none}),
}) if data.get('type') == 'episode' else {}),
}

View File

@@ -20,6 +20,7 @@ from ..utils import (
get_element_by_id,
get_first,
int_or_none,
join_nonempty,
js_to_json,
merge_dicts,
parse_count,
@@ -43,6 +44,7 @@ class FacebookIE(InfoExtractor):
(?:[^#]*?\#!/)?
(?:
(?:
permalink\.php|
video/video\.php|
photo\.php|
video\.php|
@@ -52,6 +54,7 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/|
events/(?:[^/]+/)?|
groups/[^/]+/(?:permalink|posts)/|
watchparty/
)|
@@ -248,6 +251,7 @@ class FacebookIE(InfoExtractor):
'duration': 148.435,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/attn/posts/pfbid0j1Czf2gGDVqeQ8KiMLFm3pWN8GxsQmeRrVhimWDzMuKQoR8r4b1knNsejELmUgyhl',
'info_dict': {
'id': '6968553779868435',
@@ -262,6 +266,22 @@ class FacebookIE(InfoExtractor):
'thumbnail': r're:^https?://.*',
'timestamp': 1701975646,
},
}, {
# data.node.comet_sections.content.story.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/permalink.php?story_fbid=pfbid0fqQuVEQyXRa9Dp4RcaTR14KHU3uULHV1EK7eckNXSH63JMuoALsAvVCJ97zAGitil&id=100068861234290',
'info_dict': {
'id': '270103405756416',
'ext': 'mp4',
'title': 'Lela Evans',
'description': 'Today Makkovik\'s own Pilot Mandy Smith made her inaugural landing on the airstrip in her hometown. What a proud moment as we all cheered and...',
'thumbnail': r're:^https?://.*',
'uploader': 'Lela Evans',
'uploader_id': 'pfbid0shZJipuigyy5mqrUJn9ub5LJFWNHvan5prtyi3LrDuuuJ4NwrURgnQHYR9fywBepl',
'upload_date': '20231228',
'timestamp': 1703804085,
'duration': 394.347,
'view_count': int,
},
}, {
'url': 'https://www.facebook.com/story.php?story_fbid=pfbid0Fnzhm8UuzjBYpPMNFzaSpFE9UmLdU4fJN8qTANi1Dmtj5q7DNrL5NERXfsAzDEV7l&id=100073071055552',
'only_matching': True,
@@ -380,6 +400,18 @@ class FacebookIE(InfoExtractor):
},
'playlist_count': 1,
'skip': 'Requires logging in',
}, {
# data.event.cover_media_renderer.cover_video
'url': 'https://m.facebook.com/events/1509582499515440',
'info_dict': {
'id': '637246984455045',
'ext': 'mp4',
'title': 'ANALISI IN CAMPO OSCURO " Coaguli nel sangue dei vaccinati"',
'description': 'Other event by Comitato Liberi Pensatori on Tuesday, October 18 2022',
'thumbnail': r're:^https?://.*',
'uploader': 'Comitato Liberi Pensatori',
'uploader_id': '100065709540881',
},
}]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
_api_config = {
@@ -454,38 +486,10 @@ class FacebookIE(InfoExtractor):
r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
post = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
automatic_captions, subtitles = {}, {}
subs_data = traverse_obj(post, (..., 'video', ..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')))
is_video_broadcast = get_first(subs_data, 'is_video_broadcast', expected_type=bool)
captions = get_first(subs_data, 'video_available_captions_locales', 'captions_url')
if url_or_none(captions): # if subs_data only had a 'captions_url'
locale = self._html_search_meta(['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
subtitles[locale] = [{'url': captions}]
# or else subs_data had 'video_available_captions_locales', a list of dicts
for caption in traverse_obj(captions, (
{lambda x: sorted(x, key=lambda c: c['locale'])}, lambda _, v: v['captions_url'])
):
lang = caption.get('localized_language') or ''
subs = {
'url': caption['captions_url'],
'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
}
if caption.get('localized_creation_method') or is_video_broadcast:
automatic_captions.setdefault(caption['locale'], []).append(subs)
else:
subtitles.setdefault(caption['locale'], []).append(subs)
media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
title = get_first(media, ('title', 'text'))
description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
uploader_data = (
get_first(media, ('owner', {dict}))
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
or get_first(post, ('node', 'actors', ..., {dict})) or {})
page_title = title or self._html_search_regex((
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
@@ -494,11 +498,16 @@ class FacebookIE(InfoExtractor):
description = description or self._html_search_meta(
['description', 'og:description', 'twitter:description'],
webpage, 'description', default=None)
uploader_data = (
get_first(media, ('owner', {dict}))
or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
or get_first(post, ('node', 'actors', ..., {dict}))
or get_first(post, ('event', 'event_creator', {dict})) or {})
uploader = uploader_data.get('name') or (
clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
or self._search_regex(
(r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None))
@@ -520,8 +529,6 @@ class FacebookIE(InfoExtractor):
webpage, 'view count', default=None)),
'concurrent_view_count': get_first(post, (
('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
'automatic_captions': automatic_captions,
'subtitles': subtitles,
}
info_json_ld = self._search_json_ld(webpage, video_id, default={})
@@ -563,7 +570,11 @@ class FacebookIE(InfoExtractor):
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
for f in info['formats']:
# Downloads with browser's User-Agent are rate limited. Working around
# with non-browser User-Agent.
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
@@ -573,8 +584,8 @@ class FacebookIE(InfoExtractor):
def extract_relay_prefetched_data(_filter):
return traverse_obj(extract_relay_data(_filter), (
'require', (None, (..., ..., ..., '__bbox', 'require')),
lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
'__bbox', 'result', 'data', {dict}), get_all=False) or {}
lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
if not video_data:
server_js_data = self._parse_json(self._search_regex([
@@ -615,6 +626,29 @@ class FacebookIE(InfoExtractor):
'url': playable_url,
})
extract_dash_manifest(video, formats)
automatic_captions, subtitles = {}, {}
is_broadcast = traverse_obj(video, ('is_video_broadcast', {bool}))
for caption in traverse_obj(video, (
'video_available_captions_locales',
{lambda x: sorted(x, key=lambda c: c['locale'])},
lambda _, v: url_or_none(v['captions_url'])
)):
lang = caption.get('localized_language') or 'und'
subs = {
'url': caption['captions_url'],
'name': format_field(caption, 'localized_country', f'{lang} (%s)', default=lang),
}
if caption.get('localized_creation_method') or is_broadcast:
automatic_captions.setdefault(caption['locale'], []).append(subs)
else:
subtitles.setdefault(caption['locale'], []).append(subs)
captions_url = traverse_obj(video, ('captions_url', {url_or_none}))
if captions_url and not automatic_captions and not subtitles:
locale = self._html_search_meta(
['og:locale', 'twitter:locale'], webpage, 'locale', default='en_US')
(automatic_captions if is_broadcast else subtitles)[locale] = [{'url': captions_url}]
info = {
'id': v_id,
'formats': formats,
@@ -624,6 +658,8 @@ class FacebookIE(InfoExtractor):
'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none),
'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000)
or float_or_none(video.get('length_in_second'))),
'automatic_captions': automatic_captions,
'subtitles': subtitles,
}
process_formats(info)
description = try_get(video, lambda x: x['savable_description']['text'])
@@ -658,7 +694,8 @@ class FacebookIE(InfoExtractor):
for edge in edges:
parse_attachment(edge, key='node')
video = data.get('video') or {}
video = traverse_obj(data, (
'event', 'cover_media_renderer', 'cover_video'), 'video', expected_type=dict) or {}
if video:
attachments = try_get(video, [
lambda x: x['story']['attachments'],
@@ -677,6 +714,9 @@ class FacebookIE(InfoExtractor):
# honor precise duration in video info
if video_info.get('duration'):
webpage_info['duration'] = video_info['duration']
# preserve preferred_thumbnail in video info
if video_info.get('thumbnail'):
webpage_info['thumbnail'] = video_info['thumbnail']
return merge_dicts(webpage_info, video_info)
if not video_data:
@@ -907,3 +947,114 @@ class FacebookReelIE(InfoExtractor):
video_id = self._match_id(url)
return self.url_result(
f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id)
class FacebookAdsIE(InfoExtractor):
_VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/ads/library/?\?(?:[^#]+&)?id=(?P<id>\d+)'
IE_NAME = 'facebook:ads'
_TESTS = [{
'url': 'https://www.facebook.com/ads/library/?id=899206155126718',
'info_dict': {
'id': '899206155126718',
'ext': 'mp4',
'title': 'video by Kandao',
'uploader': 'Kandao',
'uploader_id': '774114102743284',
'uploader_url': r're:^https?://.*',
'timestamp': 1702548330,
'thumbnail': r're:^https?://.*',
'upload_date': '20231214',
'like_count': int,
}
}, {
'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
'info_dict': {
'id': '893637265423481',
'title': 'Jusqu\u2019\u00e0 -25% sur une s\u00e9lection de vins p\u00e9tillants italiens ',
'uploader': 'Eataly Paris Marais',
'uploader_id': '2086668958314152',
'uploader_url': r're:^https?://.*',
'timestamp': 1703571529,
'upload_date': '20231226',
'like_count': int,
},
'playlist_count': 3,
}, {
'url': 'https://es-la.facebook.com/ads/library/?id=901230958115569',
'only_matching': True,
}, {
'url': 'https://m.facebook.com/ads/library/?id=901230958115569',
'only_matching': True,
}]
_FORMATS_MAP = {
'watermarked_video_sd_url': ('sd-wmk', 'SD, watermarked'),
'video_sd_url': ('sd', None),
'watermarked_video_hd_url': ('hd-wmk', 'HD, watermarked'),
'video_hd_url': ('hd', None),
}
def _extract_formats(self, video_dict):
formats = []
for format_key, format_url in traverse_obj(video_dict, (
{dict.items}, lambda _, v: v[0] in self._FORMATS_MAP and url_or_none(v[1])
)):
formats.append({
'format_id': self._FORMATS_MAP[format_key][0],
'format_note': self._FORMATS_MAP[format_key][1],
'url': format_url,
'ext': 'mp4',
'quality': qualities(tuple(self._FORMATS_MAP))(format_key),
})
return formats
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
post_data = [self._parse_json(j, video_id, fatal=False)
for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)]
data = traverse_obj(post_data, (
..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False)
if not data:
raise ExtractorError('Unable to extract ad data')
title = data.get('title')
if not title or title == '{{product.name}}':
title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)
info_dict = traverse_obj(data, {
'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}),
'uploader': ('page_name', {str}),
'uploader_id': ('page_id', {str_or_none}),
'uploader_url': ('page_profile_uri', {url_or_none}),
'timestamp': ('creation_time', {int_or_none}),
'like_count': ('page_like_count', {int_or_none}),
})
entries = []
for idx, entry in enumerate(traverse_obj(
data, (('videos', 'cards'), lambda _, v: any([url_or_none(v[f]) for f in self._FORMATS_MAP]))), 1
):
entries.append({
'id': f'{video_id}_{idx}',
'title': entry.get('title') or title,
'description': entry.get('link_description') or info_dict.get('description'),
'thumbnail': url_or_none(entry.get('video_preview_image_url')),
'formats': self._extract_formats(entry),
})
if len(entries) == 1:
info_dict.update(entries[0])
elif len(entries) > 1:
info_dict.update({
'title': entries[0]['title'],
'entries': entries,
'_type': 'playlist',
})
info_dict['id'] = video_id
return info_dict

View File

@@ -0,0 +1,62 @@
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
UserNotLive,
parse_iso8601,
str_or_none,
traverse_obj,
url_or_none,
)
class FlexTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?flextv\.co\.kr/channels/(?P<id>\d+)/live'
_TESTS = [{
'url': 'https://www.flextv.co.kr/channels/231638/live',
'info_dict': {
'id': '231638',
'ext': 'mp4',
'title': r're:^214하나만\.\.\. ',
'thumbnail': r're:^https?://.+\.jpg',
'upload_date': r're:\d{8}',
'timestamp': int,
'live_status': 'is_live',
'channel': 'Hi별',
'channel_id': '244396',
},
'skip': 'The channel is offline',
}, {
'url': 'https://www.flextv.co.kr/channels/746/live',
'only_matching': True,
}]
def _real_extract(self, url):
channel_id = self._match_id(url)
try:
stream_data = self._download_json(
f'https://api.flextv.co.kr/api/channels/{channel_id}/stream',
channel_id, query={'option': 'all'})
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
raise UserNotLive(video_id=channel_id)
raise
playlist_url = stream_data['sources'][0]['url']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
playlist_url, channel_id, 'mp4')
return {
'id': channel_id,
'formats': formats,
'subtitles': subtitles,
'is_live': True,
**traverse_obj(stream_data, {
'title': ('stream', 'title', {str}),
'timestamp': ('stream', 'createdAt', {parse_iso8601}),
'thumbnail': ('thumbUrl', {url_or_none}),
'channel': ('owner', 'name', {str}),
'channel_id': ('owner', 'id', {str_or_none}),
}),
}

View File

@@ -11,6 +11,7 @@ from ..utils import (
join_nonempty,
parse_codecs,
parse_iso8601,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
@@ -108,6 +109,64 @@ class FloatplaneIE(InfoExtractor):
'availability': 'subscriber_only',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.floatplane.com/post/65B5PNoBtf',
'info_dict': {
'id': '65B5PNoBtf',
'description': 'I recorded the inbuilt demo mode for your 90\'s enjoyment, thanks for being Floaties!',
'display_id': '65B5PNoBtf',
'like_count': int,
'release_timestamp': 1701249480,
'uploader': 'The Trash Network',
'availability': 'subscriber_only',
'uploader_id': '61bc20c9a131fb692bf2a513',
'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home',
'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing',
'comment_count': int,
'title': 'The $50 electronic drum kit.',
'channel_id': '64424fe73cd58cbcf8d8e131',
'thumbnail': 'https://pbs.floatplane.com/blogPost_thumbnails/65B5PNoBtf/725555379422705_1701247052743.jpeg',
'dislike_count': int,
'channel': 'The Drum Thing',
'release_date': '20231129',
},
'playlist_count': 2,
'playlist': [{
'info_dict': {
'id': 'ISPJjexylS',
'ext': 'mp4',
'release_date': '20231129',
'release_timestamp': 1701249480,
'title': 'The $50 electronic drum kit. .mov',
'channel_id': '64424fe73cd58cbcf8d8e131',
'thumbnail': 'https://pbs.floatplane.com/video_thumbnails/ISPJjexylS/335202812134041_1701249383392.jpeg',
'availability': 'subscriber_only',
'uploader': 'The Trash Network',
'duration': 622,
'channel': 'The Drum Thing',
'uploader_id': '61bc20c9a131fb692bf2a513',
'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing',
'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home',
},
}, {
'info_dict': {
'id': 'qKfxu6fEpu',
'ext': 'aac',
'release_date': '20231129',
'release_timestamp': 1701249480,
'title': 'Roland TD-7 Demo.m4a',
'channel_id': '64424fe73cd58cbcf8d8e131',
'availability': 'subscriber_only',
'uploader': 'The Trash Network',
'duration': 114,
'channel': 'The Drum Thing',
'uploader_id': '61bc20c9a131fb692bf2a513',
'channel_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home/thedrumthing',
'uploader_url': 'https://www.floatplane.com/channel/TheTrashNetwork/home',
},
}],
'skip': 'requires subscription: "The Trash Network"',
'params': {'skip_download': 'm3u8'},
}]
def _real_initialize(self):
@@ -124,6 +183,22 @@ class FloatplaneIE(InfoExtractor):
if not any(traverse_obj(post_data, ('metadata', ('hasVideo', 'hasAudio')))):
raise ExtractorError('Post does not contain a video or audio track', expected=True)
uploader_url = format_field(
post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None
common_info = {
'uploader_url': uploader_url,
'channel_url': urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname'))),
'availability': self._availability(needs_subscription=True),
**traverse_obj(post_data, {
'uploader': ('creator', 'title', {str}),
'uploader_id': ('creator', 'id', {str}),
'channel': ('channel', 'title', {str}),
'channel_id': ('channel', 'id', {str}),
'release_timestamp': ('releaseDate', {parse_iso8601}),
}),
}
items = []
for media in traverse_obj(post_data, (('videoAttachments', 'audioAttachments'), ...)):
media_id = media['id']
@@ -150,11 +225,11 @@ class FloatplaneIE(InfoExtractor):
formats = []
for quality in traverse_obj(stream, ('resource', 'data', 'qualityLevels', ...)):
url = urljoin(stream['cdn'], format_path(traverse_obj(
stream, ('resource', 'data', 'qualityLevelParams', quality['name']))))
stream, ('resource', 'data', 'qualityLevelParams', quality['name'], {dict}))))
formats.append({
**traverse_obj(quality, {
'format_id': 'name',
'format_note': 'label',
'format_id': ('name', {str}),
'format_note': ('label', {str}),
'width': ('width', {int}),
'height': ('height', {int}),
}),
@@ -164,38 +239,28 @@ class FloatplaneIE(InfoExtractor):
})
items.append({
**common_info,
'id': media_id,
**traverse_obj(metadata, {
'title': 'title',
'title': ('title', {str}),
'duration': ('duration', {int_or_none}),
'thumbnail': ('thumbnail', 'path'),
'thumbnail': ('thumbnail', 'path', {url_or_none}),
}),
'formats': formats,
})
uploader_url = format_field(
post_data, [('creator', 'urlname')], 'https://www.floatplane.com/channel/%s/home') or None
channel_url = urljoin(f'{uploader_url}/', traverse_obj(post_data, ('channel', 'urlname')))
post_info = {
**common_info,
'id': post_id,
'display_id': post_id,
**traverse_obj(post_data, {
'title': 'title',
'title': ('title', {str}),
'description': ('text', {clean_html}),
'uploader': ('creator', 'title'),
'uploader_id': ('creator', 'id'),
'channel': ('channel', 'title'),
'channel_id': ('channel', 'id'),
'like_count': ('likes', {int_or_none}),
'dislike_count': ('dislikes', {int_or_none}),
'comment_count': ('comments', {int_or_none}),
'release_timestamp': ('releaseDate', {parse_iso8601}),
'thumbnail': ('thumbnail', 'path'),
'thumbnail': ('thumbnail', 'path', {url_or_none}),
}),
'uploader_url': uploader_url,
'channel_url': channel_url,
'availability': self._availability(needs_subscription=True),
}
if len(items) > 1:

View File

@@ -1,25 +1,29 @@
from .common import InfoExtractor
from .nexx import NexxIE
from ..utils import (
int_or_none,
str_or_none,
)
class FunkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|origin\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81',
'md5': '8610449476156f338761a75391b0017d',
'info_dict': {
'id': '1155821',
'ext': 'mp4',
'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2',
'description': 'md5:a691d0413ef4835588c5b03ded670c1f',
'description': 'md5:2a03b67596eda0d1b5125c299f45e953',
'timestamp': 1514507395,
'upload_date': '20171229',
'duration': 426.0,
'cast': ['United Creators PMB GmbH'],
'thumbnail': 'https://assets.nexx.cloud/media/75/56/79/3YKUSJN1LACN0CRxL.jpg',
'display_id': 'die-lustigsten-instrumente-aus-dem-internet-teil-2',
'alt_title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet Teil 2',
'season_number': 0,
'season': 'Season 0',
'episode_number': 0,
'episode': 'Episode 0',
},
}, {
'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699',
'only_matching': True,
@@ -27,18 +31,10 @@ class FunkIE(InfoExtractor):
def _real_extract(self, url):
display_id, nexx_id = self._match_valid_url(url).groups()
video = self._download_json(
'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id)
return {
'_type': 'url_transparent',
'url': 'nexx:741:' + nexx_id,
'url': f'nexx:741:{nexx_id}',
'ie_key': NexxIE.ie_key(),
'id': nexx_id,
'title': video.get('title'),
'description': video.get('description'),
'duration': int_or_none(video.get('duration')),
'channel_id': str_or_none(video.get('channelId')),
'display_id': display_id,
'tags': video.get('tags'),
'thumbnail': video.get('imageUrlLandscape'),
}

View File

@@ -66,7 +66,7 @@ class GofileIE(InfoExtractor):
query_params = {
'contentId': file_id,
'token': self._TOKEN,
'websiteToken': '7fd94ds12fds4', # From https://gofile.io/dist/js/alljs.js
'wt': '4fd6sg89d7s6', # From https://gofile.io/dist/js/alljs.js
}
password = self.get_param('videopassword')
if password:

View File

@@ -40,6 +40,22 @@ class GoPlayIE(InfoExtractor):
'title': 'A Family for the Holidays',
},
'skip': 'This video is only available for registered users'
}, {
'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay',
'info_dict': {
'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656',
'ext': 'mp4',
'title': 'S11 - Aflevering 1',
'episode': 'Episode 1',
'series': 'De Mol',
'season_number': 11,
'episode_number': 1,
'season': 'Season 11'
},
'params': {
'skip_download': True
},
'skip': 'This video is only available for registered users'
}]
_id_token = None
@@ -77,16 +93,39 @@ class GoPlayIE(InfoExtractor):
api = self._download_json(
f'https://api.goplay.be/web/v1/videos/long-form/{video_id}',
video_id, headers={'Authorization': 'Bearer %s' % self._id_token})
video_id, headers={
'Authorization': 'Bearer %s' % self._id_token,
**self.geo_verification_headers(),
})
formats, subs = self._extract_m3u8_formats_and_subtitles(
api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')
if 'manifestUrls' in api:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS')
else:
if 'ssai' not in api:
raise ExtractorError('expecting Google SSAI stream')
ssai_content_source_id = api['ssai']['contentSourceID']
ssai_video_id = api['ssai']['videoID']
dai = self._download_json(
f'https://dai.google.com/ondemand/dash/content/{ssai_content_source_id}/vid/{ssai_video_id}/streams',
video_id, data=b'{"api-key":"null"}',
headers={'content-type': 'application/json'})
periods = self._extract_mpd_periods(dai['stream_manifest'], video_id)
# skip pre-roll and mid-roll ads
periods = [p for p in periods if '-ad-' not in p['id']]
formats, subtitles = self._merge_mpd_periods(periods)
info_dict.update({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
})
return info_dict

View File

@@ -13,7 +13,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
_TESTS = [{
'url': 'https://video.lefigaro.fr/embed/figaro/video/les-francais-ne-veulent-ils-plus-travailler-suivez-en-direct-le-club-le-figaro-idees/',
'md5': 'e94de44cd80818084352fcf8de1ce82c',
'md5': 'a0c3069b7e4c4526abf0053a7713f56f',
'info_dict': {
'id': 'g9j7Eovo',
'title': 'Les Français ne veulent-ils plus travailler ? Retrouvez Le Club Le Figaro Idées',
@@ -26,7 +26,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
},
}, {
'url': 'https://video.lefigaro.fr/embed/figaro/video/intelligence-artificielle-faut-il-sen-mefier/',
'md5': '0b3f10332b812034b3a3eda1ef877c5f',
'md5': '319c662943dd777bab835cae1e2d73a5',
'info_dict': {
'id': 'LeAgybyc',
'title': 'Intelligence artificielle : faut-il sen méfier ?',
@@ -41,7 +41,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
_WEBPAGE_TESTS = [{
'url': 'https://video.lefigaro.fr/figaro/video/suivez-en-direct-le-club-le-figaro-international-avec-philippe-gelie-9/',
'md5': '3972ddf2d5f8b98699f191687258e2f9',
'md5': '6289f9489efb969e38245f31721596fe',
'info_dict': {
'id': 'QChnbPYA',
'title': 'Où en est le couple franco-allemand ? Retrouvez Le Club Le Figaro International',
@@ -55,7 +55,7 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
},
}, {
'url': 'https://video.lefigaro.fr/figaro/video/la-philosophe-nathalie-sarthou-lajus-est-linvitee-du-figaro-live/',
'md5': '3ac0a0769546ee6be41ab52caea5d9a9',
'md5': 'f6df814cae53e85937621599d2967520',
'info_dict': {
'id': 'QJzqoNbf',
'title': 'La philosophe Nathalie Sarthou-Lajus est linvitée du Figaro Live',
@@ -73,7 +73,8 @@ class LeFigaroVideoEmbedIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
player_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData']['playerData']
player_data = self._search_nextjs_data(
webpage, display_id)['props']['pageProps']['initialProps']['pageData']['playerData']
return self.url_result(
f'jwplatform:{player_data["videoId"]}', title=player_data.get('title'),

View File

@@ -3,16 +3,15 @@ import re
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
ExtractorError,
extract_attributes,
float_or_none,
get_element_by_class,
int_or_none,
srt_subtitles_timecode,
strip_or_none,
mimetype2ext,
traverse_obj,
try_get,
url_or_none,
urlencode_postdata,
urljoin,
)
@@ -83,15 +82,29 @@ class LinkedInLearningBaseIE(LinkedInBaseIE):
class LinkedInIE(LinkedInBaseIE):
_VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P<id>\d+)-\w{4}/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20',
'info_dict': {
'id': '6850898786781339649',
'ext': 'mp4',
'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing',
'description': 'md5:be125430bab1c574f16aeb186a4d5b19',
'creator': 'Mishal K.'
'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing #nowhiring #sendinblue…',
'description': 'md5:2998a31f6f479376dd62831f53a80f71',
'uploader': 'Mishal K.',
'thumbnail': 're:^https?://media.licdn.com/dms/image/.*$',
'like_count': int
},
}, {
'url': 'https://www.linkedin.com/posts/the-mathworks_2_what-is-mathworks-cloud-center-activity-7151241570371948544-4Gu7',
'info_dict': {
'id': '7151241570371948544',
'ext': 'mp4',
'title': 'MathWorks on LinkedIn: What Is MathWorks Cloud Center?',
'description': 'md5:95f9d4eeb6337882fb47eefe13d7a40c',
'uploader': 'MathWorks',
'thumbnail': 're:^https?://media.licdn.com/dms/image/.*$',
'like_count': int,
'subtitles': 'mincount:1'
},
}]
@@ -99,26 +112,30 @@ class LinkedInIE(LinkedInBaseIE):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_extract_title(webpage)
description = clean_html(get_element_by_class('share-update-card__update-text', webpage))
like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage))
creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage)))
sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id)
video_attrs = extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))
sources = self._parse_json(video_attrs['data-sources'], video_id)
formats = [{
'url': source['src'],
'ext': mimetype2ext(source.get('type')),
'tbr': float_or_none(source.get('data-bitrate'), scale=1000),
} for source in sources]
subtitles = {'en': [{
'url': video_attrs['data-captions-url'],
'ext': 'vtt',
}]} if url_or_none(video_attrs.get('data-captions-url')) else {}
return {
'id': video_id,
'formats': formats,
'title': title,
'like_count': like_count,
'creator': creator,
'title': self._og_search_title(webpage, default=None) or self._html_extract_title(webpage),
'like_count': int_or_none(self._search_regex(
r'\bdata-num-reactions="(\d+)"', webpage, 'reactions', default=None)),
'uploader': traverse_obj(
self._yield_json_ld(webpage, video_id),
(lambda _, v: v['@type'] == 'SocialMediaPosting', 'author', 'name', {str}), get_all=False),
'thumbnail': self._og_search_thumbnail(webpage),
'description': description,
'description': self._og_search_description(webpage, default=None),
'subtitles': subtitles,
}

282
yt_dlp/extractor/lsm.py Normal file
View File

@@ -0,0 +1,282 @@
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
js_to_json,
parse_iso8601,
parse_qs,
str_or_none,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
class LSMLREmbedIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:
(?:latvijasradio|lr1|lr2|klasika|lr4|naba|radioteatris)\.lsm|
pieci
)\.lv/[^/?#]+/(?:
pleijeris|embed
)/?\?(?:[^#]+&)?(?:show|id)=(?P<id>\d+)'''
_TESTS = [{
'url': 'https://latvijasradio.lsm.lv/lv/embed/?theme=black&size=16x9&showCaptions=0&id=183522',
'md5': '719b33875cd1429846eeeaeec6df2830',
'info_dict': {
'id': 'a342781',
'ext': 'mp3',
'duration': 1823,
'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/gallery_fd4675ac.jpg',
}
}, {
'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1270&theme=white&size=16x9',
'info_dict': {
'id': '1270',
},
'playlist_count': 3,
'playlist': [{
'md5': '2e61b6eceff00d14d57fdbbe6ab24cac',
'info_dict': {
'id': 'a297397',
'ext': 'mp3',
'title': 'Eriks Emanuels Šmits "Pilāta evaņģēlijs". 1. daļa',
'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f131ae81e3c.jpg',
'duration': 3300,
},
}],
}, {
'url': 'https://radioteatris.lsm.lv/lv/embed/?id=&show=1269&theme=white&size=16x9',
'md5': '24810d4a961da2295d9860afdcaf4f5a',
'info_dict': {
'id': 'a230690',
'ext': 'mp3',
'title': 'Jens Ahlboms "Spārni". Radioizrāde ar Mārtiņa Freimaņa mūziku',
'thumbnail': 'https://radioteatris.lsm.lv/public/assets/shows/62f13023a457c.jpg',
'duration': 1788,
}
}, {
'url': 'https://lr1.lsm.lv/lv/embed/?id=166557&show=0&theme=white&size=16x9',
'info_dict': {
'id': '166557',
},
'playlist_count': 2,
'playlist': [{
'md5': '6a8b0927572f443f09c6e50a3ad65f2d',
'info_dict': {
'id': 'a303104',
'ext': 'mp3',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg',
'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits',
'duration': 3222,
},
}, {
'md5': '5d5e191e718b7644e5118b7b4e093a6d',
'info_dict': {
'id': 'v303104',
'ext': 'mp4',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/c/5/gallery_a83ad2c2.jpg',
'title': 'Krustpunktā Lielā intervija: Valsts prezidents Egils Levits - Video Version',
'duration': 3222,
},
}],
}, {
'url': 'https://lr1.lsm.lv/lv/embed/?id=183522&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://lr2.lsm.lv/lv/embed/?id=182126&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://klasika.lsm.lv/lv/embed/?id=110806&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://lr4.lsm.lv/lv/embed/?id=184282&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://pieci.lv/lv/embed/?id=168896&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://naba.lsm.lv/lv/embed/?id=182901&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://radioteatris.lsm.lv/lv/embed/?id=176439&show=0&theme=white&size=16x9',
'only_matching': True,
}, {
'url': 'https://lr1.lsm.lv/lv/pleijeris/?embed=0&id=48205&time=00%3A00&idx=0',
'only_matching': True,
}]
def _real_extract(self, url):
query = parse_qs(url)
video_id = traverse_obj(query, (
('show', 'id'), 0, {int_or_none}, {lambda x: x or None}, {str_or_none}), get_all=False)
webpage = self._download_webpage(url, video_id)
player_data, media_data = self._search_regex(
r'LR\.audio\.Player\s*\([^{]*(?P<player>\{.*?\}),(?P<media>\{.*\})\);',
webpage, 'player json', group=('player', 'media'))
player_json = self._parse_json(
player_data, video_id, transform_source=js_to_json, fatal=False) or {}
media_json = self._parse_json(media_data, video_id, transform_source=js_to_json)
entries = []
for item in traverse_obj(media_json, (('audio', 'video'), lambda _, v: v['id'])):
formats = []
for source_url in traverse_obj(item, ('sources', ..., 'file', {url_or_none})):
if determine_ext(source_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(source_url, video_id, fatal=False))
else:
formats.append({'url': source_url})
id_ = item['id']
title = item.get('title')
if id_.startswith('v') and not title:
title = traverse_obj(
media_json, ('audio', lambda _, v: v['id'][1:] == id_[1:], 'title',
{lambda x: x and f'{x} - Video Version'}), get_all=False)
entries.append({
'formats': formats,
'thumbnail': urljoin(url, player_json.get('poster')),
'id': id_,
'title': title,
'duration': traverse_obj(item, ('duration', {int_or_none})),
})
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, video_id)
class LSMLTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://ltv\.lsm\.lv/embed\?(?:[^#]+&)?c=(?P<id>[^#&]+)'
_TESTS = [{
'url': 'https://ltv.lsm.lv/embed?c=eyJpdiI6IjQzbHVUeHAyaDJiamFjcjdSUUFKdnc9PSIsInZhbHVlIjoiMHl3SnJNRmd2TmFIdnZwOGtGUUpzODFzUEZ4SVVsN2xoRjliSW9vckUyMWZIWG8vbWVzaFFkY0lhNmRjbjRpaCIsIm1hYyI6ImMzNjdhMzFhNTFhZmY1ZmE0NWI5YmFjZGI1YmJiNGEyNjgzNDM4MjUzMWEwM2FmMDMyZDMwYWM1MDFjZmM5MGIiLCJ0YWciOiIifQ==',
'md5': '64f72a360ca530d5ed89c77646c9eee5',
'info_dict': {
'id': '46k_d23-6000-105',
'ext': 'mp4',
'timestamp': 1700589151,
'duration': 1442,
'upload_date': '20231121',
'title': 'D23-6000-105_cetstud',
'thumbnail': 'https://store.cloudycdn.services/tmsp00060/assets/media/660858/placeholder1700589200.jpg',
}
}, {
'url': 'https://ltv.lsm.lv/embed?enablesdkjs=1&c=eyJpdiI6IncwVzZmUFk2MU12enVWK1I3SUcwQ1E9PSIsInZhbHVlIjoid3FhV29vamc3T2sxL1RaRmJ5Rm1GTXozU0o2dVczdUtLK0cwZEZJMDQ2a3ZIRG5DK2pneGlnbktBQy9uazVleHN6VXhxdWIweWNvcHRDSnlISlNYOHlVZ1lpcTUrcWZSTUZPQW14TVdkMW9aOUtRWVNDcFF4eWpHNGcrT0VZbUNFQStKQk91cGpndW9FVjJIa0lpbkh3PT0iLCJtYWMiOiIyZGI1NDJlMWRlM2QyMGNhOGEwYTM2MmNlN2JlOGRhY2QyYjdkMmEzN2RlOTEzYTVkNzI1ODlhZDlhZjU4MjQ2IiwidGFnIjoiIn0=',
'md5': 'a1711e190fe680fdb68fd8413b378e87',
'info_dict': {
'id': 'wUnFArIPDSY',
'ext': 'mp4',
'uploader': 'LTV_16plus',
'release_date': '20220514',
'channel_url': 'https://www.youtube.com/channel/UCNMrnafwXD2XKeeQOyfkFCw',
'view_count': int,
'availability': 'public',
'thumbnail': 'https://i.ytimg.com/vi/wUnFArIPDSY/maxresdefault.jpg',
'release_timestamp': 1652544074,
'title': 'EIROVĪZIJA SALĀTOS',
'live_status': 'was_live',
'uploader_id': '@LTV16plus',
'comment_count': int,
'channel_id': 'UCNMrnafwXD2XKeeQOyfkFCw',
'channel_follower_count': int,
'categories': ['Entertainment'],
'duration': 5269,
'upload_date': '20220514',
'age_limit': 0,
'channel': 'LTV_16plus',
'playable_in_embed': True,
'tags': [],
'uploader_url': 'https://www.youtube.com/@LTV16plus',
'like_count': int,
'description': 'md5:7ff0c42ba971e3c13e4b8a2ff03b70b5',
}
}]
def _real_extract(self, url):
video_id = urllib.parse.unquote(self._match_id(url))
webpage = self._download_webpage(url, video_id)
data = self._search_json(
r'window\.ltvEmbedPayload\s*=', webpage, 'embed json', video_id)
embed_type = traverse_obj(data, ('source', 'name', {str}))
if embed_type == 'telia':
ie_key = 'CloudyCDN'
embed_url = traverse_obj(data, ('source', 'embed_url', {url_or_none}))
elif embed_type == 'youtube':
ie_key = 'Youtube'
embed_url = traverse_obj(data, ('source', 'id', {str}))
else:
raise ExtractorError(f'Unsupported embed type {embed_type!r}')
return self.url_result(
embed_url, ie_key, video_id, **traverse_obj(data, {
'title': ('parentInfo', 'title'),
'duration': ('parentInfo', 'duration', {int_or_none}),
'thumbnail': ('source', 'poster', {url_or_none}),
}))
class LSMReplayIE(InfoExtractor):
_VALID_URL = r'https?://replay\.lsm\.lv/[^/?#]+/(?:ieraksts|statja)/[^/?#]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://replay.lsm.lv/lv/ieraksts/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
'md5': '64f72a360ca530d5ed89c77646c9eee5',
'info_dict': {
'id': '46k_d23-6000-105',
'ext': 'mp4',
'timestamp': 1700586300,
'description': 'md5:0f1b14798cc39e1ae578bd0eb268f759',
'duration': 1442,
'upload_date': '20231121',
'title': '4. studija. Zolitūdes traģēdija un Inčupes stacija',
'thumbnail': 'https://ltv.lsm.lv/storage/media/8/7/large/5/1f9604e1.jpg',
}
}, {
'url': 'https://replay.lsm.lv/lv/ieraksts/lr/183522/138-nepilniga-kompensejamo-zalu-sistema-pat-menesiem-dzena-pacientus-pa-aptiekam',
'md5': '719b33875cd1429846eeeaeec6df2830',
'info_dict': {
'id': 'a342781',
'ext': 'mp3',
'duration': 1823,
'title': '#138 Nepilnīgā kompensējamo zāļu sistēma pat mēnešiem dzenā pacientus pa aptiekām',
'thumbnail': 'https://pic.latvijasradio.lv/public/assets/media/9/d/large_fd4675ac.jpg',
'upload_date': '20231102',
'timestamp': 1698921060,
'description': 'md5:7bac3b2dd41e44325032943251c357b1',
}
}, {
'url': 'https://replay.lsm.lv/ru/statja/ltv/311130/4-studija-zolitudes-tragedija-un-incupes-stacija',
'only_matching': True,
}]
def _fix_nuxt_data(self, webpage):
return re.sub(r'Object\.create\(null(?:,(\{.+\}))?\)', lambda m: m.group(1) or 'null', webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._search_nuxt_data(
self._fix_nuxt_data(webpage), video_id, context_name='__REPLAY__')
return {
'_type': 'url_transparent',
'id': video_id,
**traverse_obj(data, {
'url': ('playback', 'service', 'url', {url_or_none}),
'title': ('mediaItem', 'title'),
'description': ('mediaItem', ('lead', 'body')),
'duration': ('mediaItem', 'duration', {int_or_none}),
'timestamp': ('mediaItem', 'aired_at', {parse_iso8601}),
'thumbnail': ('mediaItem', 'largeThumbnail', {url_or_none}),
}, get_all=False),
}

View File

@@ -28,12 +28,24 @@ class MagellanTVIE(InfoExtractor):
'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'],
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.magellantv.com/watch/celebration-nation',
'info_dict': {
'id': 'celebration-nation',
'ext': 'mp4',
'tags': ['Art & Culture', 'Human Interest', 'Anthropology', 'China', 'History'],
'duration': 2640.0,
'title': 'Ancestors',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext']['video']['detail']
data = traverse_obj(self._search_nextjs_data(webpage, video_id), (
'props', 'pageProps', 'reactContext',
(('video', 'detail'), ('series', 'currentEpisode')), {dict}), get_all=False)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id)
return {

View File

@@ -8,7 +8,8 @@ from ..utils import (
float_or_none,
int_or_none,
str_or_none,
traverse_obj
traverse_obj,
update_url_query,
)
@@ -16,7 +17,7 @@ class MedalTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K',
'md5': '6930f8972914b6b9fdc2bb3918098ba0',
'md5': '03e4911fdcf7fce563090705c2e79267',
'info_dict': {
'id': 'jTBFnLKdLy15K',
'ext': 'mp4',
@@ -33,8 +34,8 @@ class MedalTVIE(InfoExtractor):
'duration': 13,
}
}, {
'url': 'https://medal.tv/games/cod%20cold%20war/clips/2mA60jWAGQCBH',
'md5': '3d19d426fe0b2d91c26e412684e66a06',
'url': 'https://medal.tv/games/cod-cold-war/clips/2mA60jWAGQCBH',
'md5': 'fc7a3e4552ae8993c1c4006db46be447',
'info_dict': {
'id': '2mA60jWAGQCBH',
'ext': 'mp4',
@@ -52,7 +53,7 @@ class MedalTVIE(InfoExtractor):
'duration': 23,
}
}, {
'url': 'https://medal.tv/games/cod%20cold%20war/clips/2um24TWdty0NA',
'url': 'https://medal.tv/games/cod-cold-war/clips/2um24TWdty0NA',
'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148',
'info_dict': {
'id': '2um24TWdty0NA',
@@ -81,7 +82,7 @@ class MedalTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
webpage = self._download_webpage(update_url_query(url, {'mobilebypass': 'true'}), video_id)
hydration_data = self._search_json(
r'<script[^>]*>[^<]*\bhydrationData\s*=', webpage,

View File

@@ -177,6 +177,7 @@ class MotherlessIE(InfoExtractor):
class MotherlessPaginatedIE(InfoExtractor):
_EXTRA_QUERY = {}
_PAGE_SIZE = 60
def _correct_path(self, url, item_id):
@@ -199,7 +200,7 @@ class MotherlessPaginatedIE(InfoExtractor):
def get_page(idx):
page = idx + 1
current_page = webpage if not idx else self._download_webpage(
real_url, item_id, note=f'Downloading page {page}', query={'page': page})
real_url, item_id, note=f'Downloading page {page}', query={'page': page, **self._EXTRA_QUERY})
yield from self._extract_entries(current_page, real_url)
return self.playlist_result(
@@ -213,7 +214,7 @@ class MotherlessGroupIE(MotherlessPaginatedIE):
'url': 'http://motherless.com/gv/movie_scenes',
'info_dict': {
'id': 'movie_scenes',
'title': 'Movie Scenes',
'title': 'Movie Scenes - Videos - Hot and sexy scenes from "regular" movies... Beautiful actresses fully',
},
'playlist_mincount': 540,
}, {
@@ -244,7 +245,7 @@ class MotherlessGalleryIE(MotherlessPaginatedIE):
'id': '338999F',
'title': 'Random',
},
'playlist_mincount': 190,
'playlist_mincount': 171,
}, {
'url': 'https://motherless.com/GVABD6213',
'info_dict': {
@@ -270,3 +271,27 @@ class MotherlessGalleryIE(MotherlessPaginatedIE):
def _correct_path(self, url, item_id):
return urllib.parse.urljoin(url, f'/GV{item_id}')
class MotherlessUploaderIE(MotherlessPaginatedIE):
_VALID_URL = r'https?://(?:www\.)?motherless\.com/u/(?P<id>\w+)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://motherless.com/u/Mrgo4hrs2023',
'info_dict': {
'id': 'Mrgo4hrs2023',
'title': "Mrgo4hrs2023's Uploads - Videos",
},
'playlist_mincount': 32,
}, {
'url': 'https://motherless.com/u/Happy_couple?t=v',
'info_dict': {
'id': 'Happy_couple',
'title': "Happy_couple's Uploads - Videos",
},
'playlist_mincount': 8,
}]
_EXTRA_QUERY = {'t': 'v'}
def _correct_path(self, url, item_id):
return urllib.parse.urljoin(url, f'/u/{item_id}?t=v')

171
yt_dlp/extractor/mx3.py Normal file
View File

@@ -0,0 +1,171 @@
import re
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
get_element_by_class,
int_or_none,
try_call,
url_or_none,
urlhandle_detect_ext,
)
from ..utils.traversal import traverse_obj
class Mx3BaseIE(InfoExtractor):
_VALID_URL_TMPL = r'https?://(?:www\.)?%s/t/(?P<id>\w+)'
_FORMATS = [{
'url': 'player_asset',
'format_id': 'default',
'quality': 0,
}, {
'url': 'player_asset?quality=hd',
'format_id': 'hd',
'quality': 1,
}, {
'url': 'download',
'format_id': 'download',
'quality': 2,
}, {
'url': 'player_asset?quality=source',
'format_id': 'source',
'quality': 2,
}]
def _extract_formats(self, track_id):
formats = []
for fmt in self._FORMATS:
format_url = f'https://{self._DOMAIN}/tracks/{track_id}/{fmt["url"]}'
urlh = self._request_webpage(
HEADRequest(format_url), track_id, fatal=False, expected_status=404,
note=f'Checking for format {fmt["format_id"]}')
if urlh and urlh.status == 200:
formats.append({
**fmt,
'url': format_url,
'ext': urlhandle_detect_ext(urlh),
'filesize': int_or_none(urlh.headers.get('Content-Length')),
})
return formats
def _real_extract(self, url):
track_id = self._match_id(url)
webpage = self._download_webpage(url, track_id)
more_info = get_element_by_class('single-more-info', webpage)
data = self._download_json(f'https://{self._DOMAIN}/t/{track_id}.json', track_id, fatal=False)
def get_info_field(name):
return self._html_search_regex(
rf'<dt[^>]*>\s*{name}\s*</dt>\s*<dd[^>]*>(.*?)</dd>',
more_info, name, default=None, flags=re.DOTALL)
return {
'id': track_id,
'formats': self._extract_formats(track_id),
'genre': self._html_search_regex(
r'<div\b[^>]+class="single-band-genre"[^>]*>([^<]+)</div>', webpage, 'genre', default=None),
'release_year': int_or_none(get_info_field('Year of creation')),
'description': get_info_field('Description'),
'tags': try_call(lambda: get_info_field('Tag').split(', '), list),
**traverse_obj(data, {
'title': ('title', {str}),
'artist': (('performer_name', 'artist'), {str}),
'album_artist': ('artist', {str}),
'composer': ('composer_name', {str}),
'thumbnail': (('picture_url_xlarge', 'picture_url'), {url_or_none}),
}, get_all=False),
}
class Mx3IE(Mx3BaseIE):
_DOMAIN = 'mx3.ch'
_VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN)
_TESTS = [{
'url': 'https://mx3.ch/t/1Cru',
'md5': '7ba09e9826b4447d4e1ce9d69e0e295f',
'info_dict': {
'id': '1Cru',
'ext': 'wav',
'artist': 'Godina',
'album_artist': 'Tortue Tortue',
'composer': 'Olivier Godinat',
'genre': 'Rock',
'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/4643/square_xlarge/1-s-envoler-1.jpg?1630272813',
'title': "S'envoler",
'release_year': 2021,
'tags': [],
}
}, {
'url': 'https://mx3.ch/t/1LIY',
'md5': '48293cb908342547827f963a5a2e9118',
'info_dict': {
'id': '1LIY',
'ext': 'mov',
'artist': 'Tania Kimfumu',
'album_artist': 'The Broots',
'composer': 'Emmanuel Diserens',
'genre': 'Electro',
'thumbnail': 'https://mx3.ch/pictures/mx3/file/0110/0003/video_xlarge/frame_0000.png?1686963670',
'title': 'The Broots-Larytta remix "Begging For Help"',
'release_year': 2023,
'tags': ['the broots', 'cassata records', 'larytta'],
'description': '"Begging for Help" Larytta Remix Official Video\nRealized By Kali Donkilie in 2023',
}
}, {
'url': 'https://mx3.ch/t/1C6E',
'md5': '1afcd578493ddb8e5008e94bb6d97e25',
'info_dict': {
'id': '1C6E',
'ext': 'wav',
'artist': 'Alien Bubblegum',
'album_artist': 'Alien Bubblegum',
'composer': 'Alien Bubblegum',
'genre': 'Punk',
'thumbnail': 'https://mx3.ch/pictures/mx3/file/0101/1551/square_xlarge/pandora-s-box-cover-with-title.png?1627054733',
'title': 'Wide Awake',
'release_year': 2021,
'tags': ['alien bubblegum', 'bubblegum', 'alien', 'pop punk', 'poppunk'],
}
}]
class Mx3NeoIE(Mx3BaseIE):
_DOMAIN = 'neo.mx3.ch'
_VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN)
_TESTS = [{
'url': 'https://neo.mx3.ch/t/1hpd',
'md5': '6d9986bbae5cac3296ec8813bf965eb2',
'info_dict': {
'id': '1hpd',
'ext': 'wav',
'artist': 'Baptiste Lopez',
'album_artist': 'Kammerorchester Basel',
'composer': 'Jannik Giger',
'genre': 'Composition, Orchestra',
'title': 'Troisième œil. Für Kammerorchester (2023)',
'thumbnail': 'https://neo.mx3.ch/pictures/neo/file/0000/0241/square_xlarge/kammerorchester-basel-group-photo-2_c_-lukasz-rajchert.jpg?1560341252',
'release_year': 2023,
'tags': [],
}
}]
class Mx3VolksmusikIE(Mx3BaseIE):
_DOMAIN = 'volksmusik.mx3.ch'
_VALID_URL = Mx3BaseIE._VALID_URL_TMPL % re.escape(_DOMAIN)
_TESTS = [{
'url': 'https://volksmusik.mx3.ch/t/Zx',
'md5': 'dd967a7b0c1ef898f3e072cf9c2eae3c',
'info_dict': {
'id': 'Zx',
'ext': 'mp3',
'artist': 'Ländlerkapelle GrischArt',
'album_artist': 'Ländlerkapelle GrischArt',
'composer': 'Urs Glauser',
'genre': 'Instrumental, Graubünden',
'title': 'Chämilouf',
'thumbnail': 'https://volksmusik.mx3.ch/pictures/vxm/file/0000/3815/square_xlarge/grischart1.jpg?1450530120',
'release_year': 2012,
'tags': [],
}
}]

View File

@@ -1,6 +1,7 @@
import itertools
import json
from .art19 import Art19IE
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
@@ -112,7 +113,8 @@ class NebulaBaseIE(InfoExtractor):
class NebulaIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
IE_NAME = 'nebula:video'
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
'info_dict': {
@@ -236,8 +238,8 @@ class NebulaIE(NebulaBaseIE):
class NebulaClassIE(NebulaBaseIE):
IE_NAME = 'nebula:class'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>[-\w]+)/(?P<ep>\d+)'
IE_NAME = 'nebula:media'
_VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P<id>[\w-]+)/(?P<ep>[\w-]+)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
'info_dict': {
@@ -253,6 +255,46 @@ class NebulaClassIE(NebulaBaseIE):
'title': 'Photos, Sculpture, and Video',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town',
'info_dict': {
'ext': 'mp3',
'id': '018f65f0-0033-4021-8f87-2d132beb19aa',
'description': 'md5:05d2b23ab780c955e2511a2b9127acff',
'series_id': '335e8159-d663-491a-888f-1732285706ac',
'modified_timestamp': 1599091504,
'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa',
'series': 'Extremities',
'modified_date': '20200903',
'upload_date': '20200902',
'title': 'Pyramiden: The High-Arctic Soviet Ghost Town',
'release_timestamp': 1571237958,
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'duration': 1546.05714,
'timestamp': 1599085608,
'release_date': '20191016',
},
}, {
'url': 'https://nebula.tv/thelayover/the-layover-episode-1',
'info_dict': {
'ext': 'mp3',
'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
'episode_number': 1,
'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
'release_date': '20230304',
'modified_date': '20230403',
'series': 'The Layover',
'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
'modified_timestamp': 1680554566,
'duration': 3130.46401,
'release_timestamp': 1677943800,
'title': 'The Layover — Episode 1',
'series_id': '874303a5-4900-4626-a4b6-2aacac34466a',
'upload_date': '20230303',
'episode': 'Episode 1',
'timestamp': 1677883672,
'description': 'md5:002cca89258e3bc7c268d5b8c24ba482',
},
}]
def _real_extract(self, url):
@@ -268,16 +310,38 @@ class NebulaClassIE(NebulaBaseIE):
metadata = self._call_api(
f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
slug, note='Fetching video metadata')
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
slug, note='Fetching class/podcast metadata')
content_type = metadata.get('type')
if content_type == 'lesson':
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
elif content_type == 'podcast_episode':
episode_url = metadata['episode_url']
if not episode_url and metadata.get('premium'):
self.raise_login_required()
if Art19IE.suitable(episode_url):
return self.url_result(episode_url, Art19IE)
return traverse_obj(metadata, {
'id': ('id', {str}),
'url': ('episode_url', {url_or_none}),
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('published_at', {parse_iso8601}),
'duration': ('duration', {int_or_none}),
'channel_id': ('channel_id', {str}),
'chnanel': ('channel_title', {str}),
'thumbnail': ('assets', 'regular', {url_or_none}),
})
raise ExtractorError(f'Unexpected content type {content_type!r}')
class NebulaSubscriptionsIE(NebulaBaseIE):
IE_NAME = 'nebula:subscriptions'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://nebula.tv/myshows',
'playlist_mincount': 1,
@@ -310,7 +374,7 @@ class NebulaSubscriptionsIE(NebulaBaseIE):
class NebulaChannelIE(NebulaBaseIE):
IE_NAME = 'nebula:channel'
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P<id>[-\w]+)/?(?:$|[?#])'
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P<id>[\w-]+)/?(?:$|[?#])'
_TESTS = [{
'url': 'https://nebula.tv/tom-scott-presents-money',
'info_dict': {
@@ -343,6 +407,14 @@ class NebulaChannelIE(NebulaBaseIE):
'description': 'md5:6690248223eed044a9f11cd5a24f9742',
},
'playlist_count': 23,
}, {
'url': 'https://nebula.tv/trussissuespodcast',
'info_dict': {
'id': 'trussissuespodcast',
'title': 'The TLDR News Podcast',
'description': 'md5:a08c4483bc0b705881d3e0199e721385',
},
'playlist_mincount': 80,
}]
def _generate_playlist_entries(self, collection_id, collection_slug):
@@ -365,6 +437,17 @@ class NebulaChannelIE(NebulaBaseIE):
lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
{'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
def _generate_podcast_entries(self, collection_id, collection_slug):
next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true'
for page_num in itertools.count(1):
episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}')
for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))):
yield self.url_result(episode['share_url'], NebulaClassIE)
next_url = episodes.get('next')
if not next_url:
break
def _real_extract(self, url):
collection_slug = self._match_id(url)
channel = self._call_api(
@@ -373,6 +456,8 @@ class NebulaChannelIE(NebulaBaseIE):
if channel.get('type') == 'class':
entries = self._generate_class_entries(channel)
elif channel.get('type') == 'podcast_channel':
entries = self._generate_podcast_entries(channel['id'], collection_slug)
else:
entries = self._generate_playlist_entries(channel['id'], collection_slug)

View File

@@ -1,33 +1,38 @@
import datetime
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj
class NerdCubedFeedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json'
_VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/?(?:$|[#?])'
_TEST = {
'url': 'http://www.nerdcubed.co.uk/feed.json',
'url': 'http://www.nerdcubed.co.uk/',
'info_dict': {
'id': 'nerdcubed-feed',
'title': 'nerdcubed.co.uk feed',
},
'playlist_mincount': 1300,
'playlist_mincount': 5500,
}
def _extract_video(self, feed_entry):
return self.url_result(
f'https://www.youtube.com/watch?v={feed_entry["id"]}', YoutubeIE,
**traverse_obj(feed_entry, {
'id': ('id', {str}),
'title': ('title', {str}),
'description': ('description', {str}),
'timestamp': ('publishedAt', {parse_iso8601}),
'channel': ('source', 'name', {str}),
'channel_id': ('source', 'id', {str}),
'channel_url': ('source', 'url', {str}),
'thumbnail': ('thumbnail', 'source', {url_or_none}),
}), url_transparent=True)
def _real_extract(self, url):
feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed')
video_id = 'nerdcubed-feed'
feed = self._download_json('https://www.nerdcubed.co.uk/_/cdn/videos.json', video_id)
entries = [{
'_type': 'url',
'title': feed_entry['title'],
'uploader': feed_entry['source']['name'] if feed_entry['source'] else None,
'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'),
'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'],
} for feed_entry in feed]
return {
'_type': 'playlist',
'title': 'nerdcubed.co.uk feed',
'id': 'nerdcubed-feed',
'entries': entries,
}
return self.playlist_result(
map(self._extract_video, traverse_obj(feed, ('videos', lambda _, v: v['id']))),
video_id, 'nerdcubed.co.uk feed')

View File

@@ -3,15 +3,15 @@ import re
from .common import InfoExtractor
from ..utils import (
OnDemandPagedList,
clean_html,
extract_attributes,
get_element_by_id,
int_or_none,
parse_count,
parse_duration,
traverse_obj,
unified_timestamp,
OnDemandPagedList,
try_get,
)
@@ -263,19 +263,16 @@ class NewgroundsUserIE(InfoExtractor):
def _fetch_page(self, channel_id, url, page):
page += 1
posts_info = self._download_json(
f'{url}/page/{page}', channel_id,
f'{url}?page={page}', channel_id,
note=f'Downloading page {page}', headers={
'Accept': 'application/json, text/javascript, */*; q = 0.01',
'X-Requested-With': 'XMLHttpRequest',
})
sequence = posts_info.get('sequence', [])
for year in sequence:
posts = try_get(posts_info, lambda x: x['years'][str(year)]['items'])
for post in posts:
path, media_id = self._search_regex(
r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>',
post, 'url', group=(1, 2))
yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id)
for post in traverse_obj(posts_info, ('items', ..., ..., {str})):
path, media_id = self._search_regex(
r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>',
post, 'url', group=(1, 2))
yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id)
def _real_extract(self, url):
channel_id = self._match_id(url)

View File

@@ -1,10 +1,54 @@
from .common import InfoExtractor
from ..utils import int_or_none
from ..utils import (
int_or_none,
join_nonempty,
merge_dicts,
parse_count,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
class NFBIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)'
class NFBBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?(?P<site>nfb|onf)\.ca'
_GEO_COUNTRIES = ['CA']
def _extract_ep_data(self, webpage, video_id, fatal=False):
return self._search_json(
r'const\s+episodesData\s*=', webpage, 'episode data', video_id,
contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or []
def _extract_ep_info(self, data, video_id, slug=None):
info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], {
'description': ('description', {str}),
'thumbnail': ('thumbnail_url', {url_or_none}),
'uploader': ('data_layer', 'episodeMaker', {str}),
'release_year': ('data_layer', 'episodeYear', {int_or_none}),
'episode': ('data_layer', 'episodeTitle', {str}),
'season': ('data_layer', 'seasonTitle', {str}),
'season_number': ('data_layer', 'seasonTitle', {parse_count}),
'series': ('data_layer', 'seriesTitle', {str}),
}), get_all=False)
return {
**info,
'id': video_id,
'title': join_nonempty('series', 'episode', from_dict=info, delim=' - '),
'episode_number': int_or_none(self._search_regex(
r'[/-]e(?:pisode)?-?(\d+)(?:[/-]|$)', slug or video_id, 'episode number', default=None)),
}
class NFBIE(NFBBaseIE):
IE_NAME = 'nfb'
IE_DESC = 'nfb.ca and onf.ca films and episodes'
_VALID_URL = [
rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>film)/(?P<id>[^/?#&]+)',
rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+/s(?:ea|ai)son\d+/episode\d+)',
]
_TESTS = [{
'note': 'NFB film',
'url': 'https://www.nfb.ca/film/trafficopter/',
'info_dict': {
'id': 'trafficopter',
@@ -14,29 +58,192 @@ class NFBIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Barrie Howells',
'release_year': 1972,
'duration': 600.0,
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF film',
'url': 'https://www.onf.ca/film/mal-du-siecle/',
'info_dict': {
'id': 'mal-du-siecle',
'ext': 'mp4',
'title': 'Le mal du siècle',
'description': 'md5:1abf774d77569ebe603419f2d344102b',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Catherine Lepage',
'release_year': 2019,
'duration': 300.0,
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB episode with English title',
'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/season1/episode9/',
'info_dict': {
'id': 'true-north-episode9-true-north-finale-making-it',
'ext': 'mp4',
'title': 'True North: Inside the Rise of Toronto Basketball - Finale: Making It',
'description': 'We catch up with each player in the midst of their journey as they reflect on their road ahead.',
'series': 'True North: Inside the Rise of Toronto Basketball',
'release_year': 2018,
'season': 'Season 1',
'season_number': 1,
'episode': 'Finale: Making It',
'episode_number': 9,
'uploader': 'Ryan Sidhoo',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF episode with French title',
'url': 'https://www.onf.ca/serie/direction-nord-la-montee-du-basketball-a-toronto/saison1/episode9/',
'info_dict': {
'id': 'direction-nord-episode-9',
'ext': 'mp4',
'title': 'Direction nord La montée du basketball à Toronto - Finale : Réussir',
'description': 'md5:349a57419b71432b97bf6083d92b029d',
'series': 'Direction nord La montée du basketball à Toronto',
'release_year': 2018,
'season': 'Saison 1',
'season_number': 1,
'episode': 'Finale : Réussir',
'episode_number': 9,
'uploader': 'Ryan Sidhoo',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB episode with French title (needs geo-bypass)',
'url': 'https://www.nfb.ca/series/etoile-du-nord/saison1/episode1/',
'info_dict': {
'id': 'etoile-du-nord-episode-1-lobservation',
'ext': 'mp4',
'title': 'Étoile du Nord - L\'observation',
'description': 'md5:161a4617260dee3de70f509b2c9dd21b',
'series': 'Étoile du Nord',
'release_year': 2023,
'season': 'Saison 1',
'season_number': 1,
'episode': 'L\'observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF episode with English title (needs geo-bypass)',
'url': 'https://www.onf.ca/serie/north-star/season1/episode1/',
'info_dict': {
'id': 'north-star-episode-1-observation',
'ext': 'mp4',
'title': 'North Star - Observation',
'description': 'md5:c727f370839d8a817392b9e3f23655c7',
'series': 'North Star',
'release_year': 2023,
'season': 'Season 1',
'season_number': 1,
'episode': 'Observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB episode with /film/ URL and English title (needs geo-bypass)',
'url': 'https://www.nfb.ca/film/north-star-episode-1-observation/',
'info_dict': {
'id': 'north-star-episode-1-observation',
'ext': 'mp4',
'title': 'North Star - Observation',
'description': 'md5:c727f370839d8a817392b9e3f23655c7',
'series': 'North Star',
'release_year': 2023,
'season': 'Season 1',
'season_number': 1,
'episode': 'Observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'ONF episode with /film/ URL and French title (needs geo-bypass)',
'url': 'https://www.onf.ca/film/etoile-du-nord-episode-1-lobservation/',
'info_dict': {
'id': 'etoile-du-nord-episode-1-lobservation',
'ext': 'mp4',
'title': 'Étoile du Nord - L\'observation',
'description': 'md5:161a4617260dee3de70f509b2c9dd21b',
'series': 'Étoile du Nord',
'release_year': 2023,
'season': 'Saison 1',
'season_number': 1,
'episode': 'L\'observation',
'episode_number': 1,
'uploader': 'Patrick Bossé',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'Season 2 episode w/o episode num in id, extract from json ld',
'url': 'https://www.onf.ca/film/liste-des-choses-qui-existent-saison-2-ours',
'info_dict': {
'id': 'liste-des-choses-qui-existent-saison-2-ours',
'ext': 'mp4',
'title': 'La liste des choses qui existent - L\'ours en peluche',
'description': 'md5:d5e8d8fc5f3a7385a9cf0f509b37e28a',
'series': 'La liste des choses qui existent',
'release_year': 2022,
'season': 'Saison 2',
'season_number': 2,
'episode': 'L\'ours en peluche',
'episode_number': 12,
'uploader': 'Francis Papillon',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB film /embed/player/ page',
'url': 'https://www.nfb.ca/film/afterlife/embed/player/',
'info_dict': {
'id': 'afterlife',
'ext': 'mp4',
'title': 'Afterlife',
'description': 'md5:84951394f594f1fb1e62d9c43242fdf5',
'release_year': 1978,
'duration': 420.0,
'uploader': 'Ishu Patel',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
site, type_, slug = self._match_valid_url(url).group('site', 'type', 'id')
# Need to construct the URL since we match /embed/player/ URLs as well
webpage, urlh = self._download_webpage_handle(f'https://www.{site}.ca/{type_}/{slug}/', slug)
# type_ can change from film to serie(s) after redirect; new slug may have episode number
type_, slug = self._match_valid_url(urlh.url).group('type', 'id')
webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id)
embed_url = urljoin(f'https://www.{site}.ca', self._html_search_regex(
r'<[^>]+\bid=["\']player-iframe["\'][^>]*\bsrc=["\']([^"\']+)', webpage, 'embed url'))
video_id = self._match_id(embed_url) # embed url has unique slug
player = self._download_webpage(embed_url, video_id, 'Downloading player page')
if 'MESSAGE_GEOBLOCKED' in player:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
iframe = self._html_search_regex(
r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)',
webpage, 'iframe', default=None, fatal=True)
if iframe.startswith('/'):
iframe = f'https://www.nfb.ca{iframe}'
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
self._html_search_regex(r'source:\s*\'([^\']+)', player, 'm3u8 url'),
video_id, 'mp4', m3u8_id='hls')
player = self._download_webpage(iframe, video_id)
if dv_source := self._html_search_regex(r'dvSource:\s*\'([^\']+)', player, 'dv', default=None):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False)
for fmt in fmts:
fmt['format_note'] = 'described video'
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
source = self._html_search_regex(
r'source:\s*\'([^\']+)',
player, 'source', default=None, fatal=True)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4')
return {
info = {
'id': video_id,
'title': self._html_search_regex(
r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>',
@@ -45,14 +252,49 @@ class NFBIE(InfoExtractor):
r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
webpage, 'description', default=None),
'thumbnail': self._html_search_regex(
r'poster:\s*\'([^\']+)',
player, 'thumbnail', default=None),
r'poster:\s*\'([^\']+)', player, 'thumbnail', default=None),
'uploader': self._html_search_regex(
r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
webpage, 'uploader', default=None),
r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', webpage, 'uploader', default=None),
'release_year': int_or_none(self._html_search_regex(
r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
webpage, 'release_year', default=None)),
} if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id)
return merge_dicts({
'formats': formats,
'subtitles': subtitles,
}
}, info, self._search_json_ld(webpage, video_id, default={}))
class NFBSeriesIE(NFBBaseIE):
IE_NAME = 'nfb:series'
IE_DESC = 'nfb.ca and onf.ca series'
_VALID_URL = rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/',
'playlist_mincount': 9,
'info_dict': {
'id': 'true-north-inside-the-rise-of-toronto-basketball',
},
}, {
'url': 'https://www.onf.ca/serie/la-liste-des-choses-qui-existent-serie/',
'playlist_mincount': 26,
'info_dict': {
'id': 'la-liste-des-choses-qui-existent-serie',
},
}]
def _entries(self, episodes):
for episode in traverse_obj(episodes, lambda _, v: NFBIE.suitable(v['embed_url'])):
mobj = NFBIE._match_valid_url(episode['embed_url'])
yield self.url_result(
mobj[0], NFBIE, **self._extract_ep_info([episode], mobj.group('id')))
def _real_extract(self, url):
site, type_, series_id = self._match_valid_url(url).group('site', 'type', 'id')
season_path = 'saison' if type_ == 'serie' else 'season'
webpage = self._download_webpage(
f'https://www.{site}.ca/{type_}/{series_id}/{season_path}1/episode1', series_id)
episodes = self._extract_ep_data(webpage, series_id, fatal=True)
return self.playlist_result(self._entries(episodes), series_id)

View File

@@ -9,6 +9,7 @@ from ..utils import (
join_nonempty,
parse_duration,
traverse_obj,
try_call,
unescapeHTML,
unified_timestamp,
url_or_none,
@@ -473,22 +474,21 @@ class NhkRadiruIE(InfoExtractor):
IE_DESC = 'NHK らじる (Radiru/Rajiru)'
_VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?'
_TESTS = [{
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544',
'skip': 'Episode expired on 2023-04-16',
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3926210',
'skip': 'Episode expired on 2024-02-24',
'info_dict': {
'channel': 'NHK-FM',
'uploader': 'NHK-FM',
'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
'title': 'ジャズ・トゥナイト シリーズJAZZジャイアンツ 56 ジョニー・ホッジス',
'id': '0449_01_3926210',
'ext': 'm4a',
'id': '0449_01_3853544',
'series': 'ジャズ・トゥナイト',
'uploader': 'NHK-FM',
'channel': 'NHK-FM',
'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg',
'timestamp': 1680969600,
'title': 'ジャズ・トゥナイト NEWジャズ特集',
'upload_date': '20230408',
'release_timestamp': 1680962400,
'release_date': '20230408',
'was_live': True,
'release_date': '20240217',
'description': 'md5:a456ee8e5e59e6dd2a7d32e62386e811',
'timestamp': 1708185600,
'release_timestamp': 1708178400,
'upload_date': '20240217',
},
}, {
# playlist, airs every weekday so it should _hopefully_ be okay forever
@@ -519,7 +519,8 @@ class NhkRadiruIE(InfoExtractor):
'series': 'らじる文庫 by ラジオ深夜便 ',
'release_timestamp': 1481126700,
'upload_date': '20211101',
}
},
'expected_warnings': ['Unable to download JSON metadata', 'Failed to get extended description'],
}, {
# news
'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109',
@@ -539,9 +540,28 @@ class NhkRadiruIE(InfoExtractor):
},
}]
_API_URL_TMPL = None
def _extract_extended_description(self, episode_id, episode):
service, _, area = traverse_obj(episode, ('aa_vinfo2', {str}, {lambda x: (x or '').partition(',')}))
aa_vinfo3 = traverse_obj(episode, ('aa_vinfo3', {str}))
detail_url = try_call(
lambda: self._API_URL_TMPL.format(service=service, area=area, dateid=aa_vinfo3))
if not detail_url:
return
full_meta = traverse_obj(
self._download_json(detail_url, episode_id, 'Downloading extended metadata', fatal=False),
('list', service, 0, {dict})) or {}
return join_nonempty('subtitle', 'content', 'act', 'music', delim='\n\n', from_dict=full_meta)
def _extract_episode_info(self, headline, programme_id, series_meta):
episode_id = f'{programme_id}_{headline["headline_id"]}'
episode = traverse_obj(headline, ('file_list', 0, {dict}))
description = self._extract_extended_description(episode_id, episode)
if not description:
self.report_warning('Failed to get extended description, falling back to summary')
description = traverse_obj(episode, ('file_title_sub', {str}))
return {
**series_meta,
@@ -551,14 +571,21 @@ class NhkRadiruIE(InfoExtractor):
'was_live': True,
'series': series_meta.get('title'),
'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'),
'description': description,
**traverse_obj(episode, {
'title': 'file_title',
'description': 'file_title_sub',
'timestamp': ('open_time', {unified_timestamp}),
'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}),
}),
}
def _real_initialize(self):
if self._API_URL_TMPL:
return
api_config = self._download_xml(
'https://www.nhk.or.jp/radio/config/config_web.xml', None, 'Downloading API config', fatal=False)
NhkRadiruIE._API_URL_TMPL = try_call(lambda: f'https:{api_config.find(".//url_program_detail").text}')
def _real_extract(self, url):
site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline')
programme_id = f'{site_id}_{corner_id}'

View File

@@ -172,9 +172,6 @@ class NiconicoIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
_COMMENT_API_ENDPOINTS = (
'https://nvcomment.nicovideo.jp/legacy/api.json',
'https://nmsg.nicovideo.jp/api.json',)
_API_HEADERS = {
'X-Frontend-ID': '6',
'X-Frontend-Version': '0',
@@ -470,93 +467,16 @@ class NiconicoIE(InfoExtractor):
parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None))
or get_video_info('duration')),
'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}',
'subtitles': self.extract_subtitles(video_id, api_data, session_api_data),
'subtitles': self.extract_subtitles(video_id, api_data),
}
def _get_subtitles(self, video_id, api_data, session_api_data):
comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey'))
user_id_str = session_api_data.get('serviceUserId')
thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
legacy_danmaku = self._extract_legacy_comments(video_id, thread_ids, user_id_str, comment_user_key) or []
new_comments = traverse_obj(api_data, ('comment', 'nvComment'))
new_danmaku = self._extract_new_comments(
new_comments.get('server'), video_id,
new_comments.get('params'), new_comments.get('threadKey'))
if not legacy_danmaku and not new_danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',
'data': json.dumps(legacy_danmaku + new_danmaku),
}],
}
def _extract_legacy_comments(self, video_id, threads, user_id, user_key):
auth_data = {
'user_id': user_id,
'userkey': user_key,
} if user_id and user_key else {'user_id': ''}
api_url = traverse_obj(threads, (..., 'server'), get_all=False)
# Request Start
post_data = [{'ping': {'content': 'rs:0'}}]
for i, thread in enumerate(threads):
thread_id = thread['id']
thread_fork = thread['fork']
# Post Start (2N)
post_data.append({'ping': {'content': f'ps:{i * 2}'}})
post_data.append({'thread': {
'fork': thread_fork,
'language': 0,
'nicoru': 3,
'scores': 1,
'thread': thread_id,
'version': '20090904',
'with_global': 1,
**auth_data,
}})
# Post Final (2N)
post_data.append({'ping': {'content': f'pf:{i * 2}'}})
# Post Start (2N+1)
post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}})
post_data.append({'thread_leaves': {
# format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments'
# unfortunately NND limits (deletes?) comment returns this way, so you're only able to grab the last 1000 per language
'content': '0-999999:999999,999999,nicoru:999999',
'fork': thread_fork,
'language': 0,
'nicoru': 3,
'scores': 1,
'thread': thread_id,
**auth_data,
}})
# Post Final (2N+1)
post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}})
# Request Final
post_data.append({'ping': {'content': 'rf:0'}})
return self._download_json(
f'{api_url}/api.json', video_id, data=json.dumps(post_data).encode(), fatal=False,
headers={
'Referer': f'https://www.nicovideo.jp/watch/{video_id}',
'Origin': 'https://www.nicovideo.jp',
'Content-Type': 'text/plain;charset=UTF-8',
},
note='Downloading comments', errnote=f'Failed to access endpoint {api_url}')
def _extract_new_comments(self, endpoint, video_id, params, thread_key):
comments = self._download_json(
f'{endpoint}/v1/threads', video_id, data=json.dumps({
def _get_subtitles(self, video_id, api_data):
comments_info = traverse_obj(api_data, ('comment', 'nvComment', {dict})) or {}
danmaku = traverse_obj(self._download_json(
f'{comments_info.get("server")}/v1/threads', video_id, data=json.dumps({
'additionals': {},
'params': params,
'threadKey': thread_key,
'params': comments_info.get('params'),
'threadKey': comments_info.get('threadKey'),
}).encode(), fatal=False,
headers={
'Referer': 'https://www.nicovideo.jp/',
@@ -566,8 +486,19 @@ class NiconicoIE(InfoExtractor):
'x-frontend-id': '6',
'x-frontend-version': '0',
},
note='Downloading comments (new)', errnote='Failed to download comments (new)')
return traverse_obj(comments, ('data', 'threads', ..., 'comments', ...))
note='Downloading comments', errnote='Failed to download comments'),
('data', 'threads', ..., 'comments', ...))
if not danmaku:
self.report_warning(f'Failed to get comments. {bug_reports_message()}')
return
return {
'comments': [{
'ext': 'json',
'data': json.dumps(danmaku),
}],
}
class NiconicoPlaylistBaseIE(InfoExtractor):

View File

@@ -0,0 +1,225 @@
from .common import InfoExtractor
from ..utils import int_or_none, mimetype2ext, parse_iso8601, url_or_none
from ..utils.traversal import traverse_obj
class NinaProtocolIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ninaprotocol\.com/releases/(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'https://www.ninaprotocol.com/releases/3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ',
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ',
'title': 'The Spatulas - March Chant',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'channel': 'ppm',
'description': 'md5:bb9f9d39d8f786449cd5d0ff7c5772db',
'album': 'The Spatulas - March Chant',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'timestamp': 1701417610,
'uploader': 'ppmrecs',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'display_id': 'the-spatulas-march-chant',
'upload_date': '20231201',
'album_artist': 'Post Present Medium ',
},
'playlist': [{
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_1',
'title': 'March Chant In April',
'track': 'March Chant In April',
'ext': 'mp3',
'duration': 152,
'track_number': 1,
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'uploader': 'ppmrecs',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'timestamp': 1701417610,
'channel': 'ppm',
'album': 'The Spatulas - March Chant',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'upload_date': '20231201',
'album_artist': 'Post Present Medium ',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_2',
'title': 'Rescue Mission',
'track': 'Rescue Mission',
'ext': 'mp3',
'duration': 212,
'track_number': 2,
'album_artist': 'Post Present Medium ',
'uploader': 'ppmrecs',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'channel': 'ppm',
'upload_date': '20231201',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'timestamp': 1701417610,
'album': 'The Spatulas - March Chant',
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_3',
'title': 'Slinger Style',
'track': 'Slinger Style',
'ext': 'mp3',
'duration': 179,
'track_number': 3,
'timestamp': 1701417610,
'upload_date': '20231201',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'album_artist': 'Post Present Medium ',
'album': 'The Spatulas - March Chant',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'uploader': 'ppmrecs',
'channel': 'ppm',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_4',
'title': 'Psychic Signal',
'track': 'Psychic Signal',
'ext': 'mp3',
'duration': 220,
'track_number': 4,
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'upload_date': '20231201',
'album': 'The Spatulas - March Chant',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'timestamp': 1701417610,
'album_artist': 'Post Present Medium ',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'channel': 'ppm',
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'uploader': 'ppmrecs',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_5',
'title': 'Curvy Color',
'track': 'Curvy Color',
'ext': 'mp3',
'duration': 148,
'track_number': 5,
'timestamp': 1701417610,
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'album': 'The Spatulas - March Chant',
'album_artist': 'Post Present Medium ',
'channel': 'ppm',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'uploader': 'ppmrecs',
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'upload_date': '20231201',
}
}, {
'info_dict': {
'id': '3SvsMM3y4oTPZ5DXFJnLkCAqkxz34hjzFxqms1vu9XBJ_6',
'title': 'Caveman Star',
'track': 'Caveman Star',
'ext': 'mp3',
'duration': 121,
'track_number': 6,
'channel_id': '4ceG4zsb7VVxBTGPtZMqDZWGHo3VUg2xRvzC2b17ymWP',
'thumbnail': 'https://www.arweave.net/VyZA6CBeUuqP174khvSrD44Eosi3MLVyWN42uaQKg50',
'tags': ['punk', 'postpresentmedium', 'cambridge'],
'album_artist': 'Post Present Medium ',
'uploader': 'ppmrecs',
'timestamp': 1701417610,
'uploader_id': '2bGjgdKUddJoj2shYGqfNcUfoSoABP21RJoiwGMZDq3A',
'album': 'The Spatulas - March Chant',
'channel': 'ppm',
'upload_date': '20231201',
},
}],
}, {
'url': 'https://www.ninaprotocol.com/releases/f-g-s-american-shield',
'info_dict': {
'id': '76PZnJwaMgViQHYfA4NYJXds7CmW6vHQKAtQUxGene6J',
'description': 'md5:63f08d5db558b4b36e1896f317062721',
'title': 'F.G.S. - American Shield',
'uploader_id': 'Ej3rozs11wYqFk1Gs6oggGCkGLz8GzBhmJfnUxf6gPci',
'channel_id': '6JuksCZPXuP16wJ1BUfwuukJzh42C7guhLrFPPkVJfyE',
'channel': 'tinkscough',
'tags': [],
'album_artist': 'F.G.S.',
'album': 'F.G.S. - American Shield',
'thumbnail': 'https://www.arweave.net/YJpgImkXLT9SbpFb576KuZ5pm6bdvs452LMs3Rx6lm8',
'display_id': 'f-g-s-american-shield',
'uploader': 'flannerysilva',
'timestamp': 1702395858,
'upload_date': '20231212',
},
'playlist_count': 1,
}, {
'url': 'https://www.ninaprotocol.com/releases/time-to-figure-things-out',
'info_dict': {
'id': '6Zi1nC5hj6b13NkpxVYwRhFy6mYA7oLBbe9DMrgGDcYh',
'display_id': 'time-to-figure-things-out',
'description': 'md5:960202ed01c3134bb8958f1008527e35',
'timestamp': 1706283607,
'title': 'DJ STEPDAD - time to figure things out',
'album_artist': 'DJ STEPDAD',
'uploader': 'tddvsss',
'upload_date': '20240126',
'album': 'time to figure things out',
'uploader_id': 'AXQNRgTyYsySyAMFDwxzumuGjfmoXshorCesjpquwCBi',
'thumbnail': 'https://www.arweave.net/O4i8bcKVqJVZvNeHHFp6r8knpFGh9ZwEgbeYacr4nss',
'tags': [],
},
'playlist_count': 4,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
release = self._download_json(
f'https://api.ninaprotocol.com/v1/releases/{video_id}', video_id)['release']
video_id = release.get('publicKey') or video_id
common_info = traverse_obj(release, {
'album': ('metadata', 'properties', 'title', {str}),
'album_artist': ((('hub', 'data'), 'publisherAccount'), 'displayName', {str}),
'timestamp': ('datetime', {parse_iso8601}),
'thumbnail': ('metadata', 'image', {url_or_none}),
'uploader': ('publisherAccount', 'handle', {str}),
'uploader_id': ('publisherAccount', 'publicKey', {str}),
'channel': ('hub', 'handle', {str}),
'channel_id': ('hub', 'publicKey', {str}),
}, get_all=False)
common_info['tags'] = traverse_obj(release, ('metadata', 'properties', 'tags', ..., {str}))
entries = []
for track_num, track in enumerate(traverse_obj(release, (
'metadata', 'properties', 'files', lambda _, v: url_or_none(v['uri']))), 1):
entries.append({
'id': f'{video_id}_{track_num}',
'url': track['uri'],
**traverse_obj(track, {
'title': ('track_title', {str}),
'track': ('track_title', {str}),
'ext': ('type', {mimetype2ext}),
'track_number': ('track', {int_or_none}),
'duration': ('duration', {int_or_none}),
}),
'vcodec': 'none',
**common_info,
})
return {
'_type': 'playlist',
'id': video_id,
'entries': entries,
**traverse_obj(release, {
'display_id': ('slug', {str}),
'title': ('metadata', 'name', {str}),
'description': ('metadata', 'description', {str}),
}),
**common_info,
}

View File

@@ -135,14 +135,15 @@ class NovaIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
_TESTS = [{
'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
'md5': '249baab7d0104e186e78b0899c7d5f28',
'md5': 'da8f3f1fcdaf9fb0f112a32a165760a3',
'info_dict': {
'id': '1757139',
'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
'id': '8OvQqEvV3MW',
'display_id': '8OvQqEvV3MW',
'ext': 'mp4',
'title': 'Podzemní nemocnice v pražské Krči',
'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
'thumbnail': r're:^https?://.*\.(?:jpg)',
'duration': 151,
}
}, {
'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
@@ -210,7 +211,7 @@ class NovaIE(InfoExtractor):
# novaplus
embed_id = self._search_regex(
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)',
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media(?:tn)?\.cms\.nova\.cz/embed/([^/?#&"\']+)',
webpage, 'embed url', default=None)
if embed_id:
return {

View File

@@ -35,6 +35,7 @@ class NTVRuIE(InfoExtractor):
'duration': 172,
'view_count': int,
},
'skip': '404 Not Found',
}, {
'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
'md5': '82dbd49b38e3af1d00df16acbeab260c',
@@ -78,7 +79,8 @@ class NTVRuIE(InfoExtractor):
}]
_VIDEO_ID_REGEXES = [
r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
r'<meta property="og:url" content="https?://www\.ntv\.ru/video/(\d+)',
r'<meta property="og:video:(?:url|iframe)" content="https?://www\.ntv\.ru/embed/(\d+)',
r'<video embed=[^>]+><id>(\d+)</id>',
r'<video restriction[^>]+><key>(\d+)</key>',
]

199
yt_dlp/extractor/nuum.py Normal file
View File

@@ -0,0 +1,199 @@
import functools
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
UserNotLive,
filter_dict,
int_or_none,
parse_iso8601,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class NuumBaseIE(InfoExtractor):
def _call_api(self, path, video_id, description, query={}):
response = self._download_json(
f'https://nuum.ru/api/v2/{path}', video_id, query=query,
note=f'Downloading {description} metadata',
errnote=f'Unable to download {description} metadata')
if error := response.get('error'):
raise ExtractorError(f'API returned error: {error!r}')
return response['result']
def _get_channel_info(self, channel_name):
return self._call_api(
'broadcasts/public', video_id=channel_name, description='channel',
query={
'with_extra': 'true',
'channel_name': channel_name,
'with_deleted': 'true',
})
def _parse_video_data(self, container, extract_formats=True):
stream = traverse_obj(container, ('media_container_streams', 0, {dict})) or {}
media = traverse_obj(stream, ('stream_media', 0, {dict})) or {}
media_url = traverse_obj(media, (
'media_meta', ('media_archive_url', 'media_url'), {url_or_none}), get_all=False)
video_id = str(container['media_container_id'])
is_live = media.get('media_status') == 'RUNNING'
formats, subtitles = None, None
if extract_formats:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
media_url, video_id, 'mp4', live=is_live)
return filter_dict({
'id': video_id,
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(container, {
'title': ('media_container_name', {str}),
'description': ('media_container_description', {str}),
'timestamp': ('created_at', {parse_iso8601}),
'channel': ('media_container_channel', 'channel_name', {str}),
'channel_id': ('media_container_channel', 'channel_id', {str_or_none}),
}),
**traverse_obj(stream, {
'view_count': ('stream_total_viewers', {int_or_none}),
'concurrent_view_count': ('stream_current_viewers', {int_or_none}),
}),
**traverse_obj(media, {
'duration': ('media_duration', {int_or_none}),
'thumbnail': ('media_meta', ('media_preview_archive_url', 'media_preview_url'), {url_or_none}),
}, get_all=False),
})
class NuumMediaIE(NuumBaseIE):
IE_NAME = 'nuum:media'
_VALID_URL = r'https?://nuum\.ru/(?:streams|videos|clips)/(?P<id>[\d]+)'
_TESTS = [{
'url': 'https://nuum.ru/streams/1592713-7-days-to-die',
'only_matching': True,
}, {
'url': 'https://nuum.ru/videos/1567547-toxi-hurtz',
'md5': 'f1d9118a30403e32b702a204eb03aca3',
'info_dict': {
'id': '1567547',
'ext': 'mp4',
'title': 'Toxi$ - Hurtz',
'description': '',
'timestamp': 1702631651,
'upload_date': '20231215',
'thumbnail': r're:^https?://.+\.jpg',
'view_count': int,
'concurrent_view_count': int,
'channel_id': '6911',
'channel': 'toxis',
'duration': 116,
},
}, {
'url': 'https://nuum.ru/clips/1552564-pro-misu',
'md5': 'b248ae1565b1e55433188f11beeb0ca1',
'info_dict': {
'id': '1552564',
'ext': 'mp4',
'title': 'Про Мису 🙃',
'timestamp': 1701971828,
'upload_date': '20231207',
'thumbnail': r're:^https?://.+\.jpg',
'view_count': int,
'concurrent_view_count': int,
'channel_id': '3320',
'channel': 'Misalelik',
'duration': 41,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._call_api(f'media-containers/{video_id}', video_id, 'media')
return self._parse_video_data(video_data)
class NuumLiveIE(NuumBaseIE):
IE_NAME = 'nuum:live'
_VALID_URL = r'https?://nuum\.ru/channel/(?P<id>[^/#?]+)/?(?:$|[#?])'
_TESTS = [{
'url': 'https://nuum.ru/channel/mts_live',
'only_matching': True,
}]
def _real_extract(self, url):
channel = self._match_id(url)
channel_info = self._get_channel_info(channel)
if traverse_obj(channel_info, ('channel', 'channel_is_live')) is False:
raise UserNotLive(video_id=channel)
info = self._parse_video_data(channel_info['media_container'])
return {
'webpage_url': f'https://nuum.ru/streams/{info["id"]}',
'extractor_key': NuumMediaIE.ie_key(),
'extractor': NuumMediaIE.IE_NAME,
**info,
}
class NuumTabIE(NuumBaseIE):
IE_NAME = 'nuum:tab'
_VALID_URL = r'https?://nuum\.ru/channel/(?P<id>[^/#?]+)/(?P<type>streams|videos|clips)'
_TESTS = [{
'url': 'https://nuum.ru/channel/dankon_/clips',
'info_dict': {
'id': 'dankon__clips',
'title': 'Dankon_',
},
'playlist_mincount': 29,
}, {
'url': 'https://nuum.ru/channel/dankon_/videos',
'info_dict': {
'id': 'dankon__videos',
'title': 'Dankon_',
},
'playlist_mincount': 2,
}, {
'url': 'https://nuum.ru/channel/dankon_/streams',
'info_dict': {
'id': 'dankon__streams',
'title': 'Dankon_',
},
'playlist_mincount': 1,
}]
_PAGE_SIZE = 50
def _fetch_page(self, channel_id, tab_type, tab_id, page):
CONTAINER_TYPES = {
'clips': ['SHORT_VIDEO', 'REVIEW_VIDEO'],
'videos': ['LONG_VIDEO'],
'streams': ['SINGLE'],
}
media_containers = self._call_api(
'media-containers', video_id=tab_id, description=f'{tab_type} tab page {page + 1}',
query={
'limit': self._PAGE_SIZE,
'offset': page * self._PAGE_SIZE,
'channel_id': channel_id,
'media_container_status': 'STOPPED',
'media_container_type': CONTAINER_TYPES[tab_type],
})
for container in traverse_obj(media_containers, (..., {dict})):
metadata = self._parse_video_data(container, extract_formats=False)
yield self.url_result(f'https://nuum.ru/videos/{metadata["id"]}', NuumMediaIE, **metadata)
def _real_extract(self, url):
channel_name, tab_type = self._match_valid_url(url).group('id', 'type')
tab_id = f'{channel_name}_{tab_type}'
channel_data = self._get_channel_info(channel_name)['channel']
return self.playlist_result(OnDemandPagedList(functools.partial(
self._fetch_page, channel_data['channel_id'], tab_type, tab_id), self._PAGE_SIZE),
playlist_id=tab_id, playlist_title=channel_data.get('channel_name'))

View File

@@ -1,50 +1,93 @@
import hmac
import hashlib
import base64
import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
extract_attributes,
float_or_none,
get_elements_html_by_class,
int_or_none,
js_to_json,
merge_dicts,
mimetype2ext,
parse_iso8601,
remove_end,
remove_start,
str_or_none,
traverse_obj,
url_or_none,
)
class NYTimesBaseIE(InfoExtractor):
_SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'
_DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d')
_TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB'
_GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2'
_GRAPHQL_QUERY = '''query VideoQuery($id: String!) {
video(id: $id) {
... on Video {
bylines {
renderedRepresentation
}
duration
firstPublished
promotionalHeadline
promotionalMedia {
... on Image {
crops {
name
renditions {
name
width
height
url
}
}
}
}
renditions {
type
width
height
url
bitrate
}
summary
}
}
}'''
def _extract_video_from_id(self, video_id):
# Authorization generation algorithm is reverse engineered from `signer` in
# http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
path = '/svc/video/api/v3/video/' + video_id
hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
'X-NYTV': 'vhs',
}, fatal=False)
if not video_data:
video_data = self._download_json(
'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
video_id, 'Downloading video JSON')
def _call_api(self, media_id):
# reference: `id-to-uri.js`
video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video')
media_uuid = uuid.uuid5(video_uuid, media_id)
title = video_data['headline']
return traverse_obj(self._download_json(
self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API', data=json.dumps({
'query': self._GRAPHQL_QUERY,
'variables': {'id': f'nyt://video/{media_uuid}'},
}, separators=(',', ':')).encode(), headers={
'Content-Type': 'application/json',
'Nyt-App-Type': 'vhs',
'Nyt-App-Version': 'v3.52.21',
'Nyt-Token': self._TOKEN,
'Origin': 'https://nytimes.com',
}, fatal=False), ('data', 'video', {dict})) or {}
def get_file_size(file_size):
if isinstance(file_size, int):
return file_size
elif isinstance(file_size, dict):
return int(file_size.get('value', 0))
else:
return None
def _extract_thumbnails(self, thumbs):
return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), {
'url': 'url',
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
}), default=None)
def _extract_formats_and_subtitles(self, video_id, content_media_json):
urls = []
formats = []
subtitles = {}
for video in video_data.get('renditions', []):
for video in traverse_obj(content_media_json, ('renditions', ..., {dict})):
video_url = video.get('url')
format_id = video.get('type')
if not video_url or format_id == 'thumbs' or video_url in urls:
@@ -56,11 +99,9 @@ class NYTimesBaseIE(InfoExtractor):
video_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id or 'hls', fatal=False)
formats.extend(m3u8_fmts)
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
self._merge_subtitles(m3u8_subs, target=subtitles)
elif ext == 'mpd':
continue
# formats.extend(self._extract_mpd_formats(
# video_url, video_id, format_id or 'dash', fatal=False))
continue # all mpd urls give 404 errors
else:
formats.append({
'url': video_url,
@@ -68,55 +109,50 @@ class NYTimesBaseIE(InfoExtractor):
'vcodec': video.get('videoencoding') or video.get('video_codec'),
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
'filesize': traverse_obj(video, (
('file_size', 'fileSize'), (None, ('value')), {int_or_none}), get_all=False),
'tbr': int_or_none(video.get('bitrate'), 1000) or None,
'ext': ext,
})
thumbnails = []
for image in video_data.get('images', []):
image_url = image.get('url')
if not image_url:
continue
thumbnails.append({
'url': 'http://www.nytimes.com/' + image_url,
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
})
return formats, subtitles
publication_date = video_data.get('publication_date')
timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None
def _extract_video(self, media_id):
data = self._call_api(media_id)
formats, subtitles = self._extract_formats_and_subtitles(media_id, data)
return {
'id': video_id,
'title': title,
'description': video_data.get('summary'),
'timestamp': timestamp,
'uploader': video_data.get('byline'),
'duration': float_or_none(video_data.get('duration'), 1000),
'id': media_id,
'title': data.get('promotionalHeadline'),
'description': data.get('summary'),
'timestamp': parse_iso8601(data.get('firstPublished')),
'duration': float_or_none(data.get('duration'), scale=1000),
'creator': ', '.join(traverse_obj(data, ( # TODO: change to 'creators'
'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))),
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
'thumbnails': self._extract_thumbnails(
traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
}
class NYTimesIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
_TESTS = [{
'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
'md5': 'd665342765db043f7e225cff19df0f2d',
'md5': 'a553aa344014e3723d33893d89d4defc',
'info_dict': {
'id': '100000002847155',
'ext': 'mov',
'ext': 'mp4',
'title': 'Verbatim: What Is a Photocopier?',
'description': 'md5:93603dada88ddbda9395632fdc5da260',
'timestamp': 1398631707,
'upload_date': '20140427',
'uploader': 'Brett Weiner',
'timestamp': 1398646132,
'upload_date': '20140428',
'creator': 'Brett Weiner',
'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg',
'duration': 419,
}
},
}, {
'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
'only_matching': True,
@@ -125,138 +161,260 @@ class NYTimesIE(NYTimesBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video_from_id(video_id)
return self._extract_video(video_id)
class NYTimesArticleIE(NYTimesBaseIE):
_VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
_VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P<id>[^./?#]+)(?:\.html)?'
_TESTS = [{
'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': {
'id': '100000003628438',
'ext': 'mov',
'title': 'New Minimum Wage: $70,000 a Year',
'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
'timestamp': 1429033037,
'ext': 'mp4',
'title': 'One Companys New Minimum Wage: $70,000 a Year',
'description': 'md5:89ba9ab67ca767bb92bf823d1f138433',
'timestamp': 1429047468,
'upload_date': '20150414',
'uploader': 'Matthew Williams',
}
'creator': 'Patricia Cohen',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 119.0,
},
}, {
'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
'md5': 'e0d52040cafb07662acf3c9132db3575',
# article with audio and no video
'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
'md5': '2365b3555c8aa7f4dd34ca735ad02e6a',
'info_dict': {
'id': '100000004709062',
'title': 'The Run-Up: He Was Like an Octopus',
'id': '100000009110381',
'ext': 'mp3',
'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
'series': 'The Run-Up',
'episode': 'He Was Like an Octopus',
'episode_number': 20,
'duration': 2130,
}
'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
'timestamp': 1695960700,
'upload_date': '20230929',
'creator': 'Stephanie Nolen, Natalija Gormalova',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 1322,
},
}, {
'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': {
'id': '100000004709479',
'title': 'The Rise of Hitler',
'ext': 'mp3',
'description': 'md5:bce877fd9e3444990cb141875fab0028',
'creator': 'Pamela Paul',
'duration': 3475,
'id': '100000009202270',
'ext': 'mp4',
'title': 'Kamala Harris Defends Biden Policies, but Says More Work Needed to Reach Voters',
'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f',
'timestamp': 1701290997,
'upload_date': '20231129',
'uploader': 'By The New York Times',
'creator': 'Katie Rogers',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 97.631,
},
'params': {
'skip_download': True,
'skip_download': 'm3u8',
},
}, {
'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
# multiple videos in the same article
'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html',
'info_dict': {
'id': 'air-traffic-controllers-safety',
'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
'upload_date': '20231202',
'creator': 'Emily Steel, Sydney Ember',
'timestamp': 1701511264,
},
'playlist_count': 3,
}, {
'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
'only_matching': True,
}]
def _extract_podcast_from_json(self, json, page_id, webpage):
podcast_audio = self._parse_json(
json, page_id, transform_source=js_to_json)
def _extract_content_from_block(self, block):
details = traverse_obj(block, {
'id': ('sourceId', {str}),
'uploader': ('bylines', ..., 'renderedRepresentation', {str}),
'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))),
'timestamp': ('firstPublished', {parse_iso8601}),
'series': ('podcastSeries', {str}),
}, get_all=False)
audio_data = podcast_audio['data']
track = audio_data['track']
episode_title = track['title']
video_url = track['source']
description = track.get('description') or self._html_search_meta(
['og:description', 'twitter:description'], webpage)
podcast_title = audio_data.get('podcast', {}).get('title')
title = ('%s: %s' % (podcast_title, episode_title)
if podcast_title else episode_title)
episode = audio_data.get('podcast', {}).get('episode') or ''
episode_number = int_or_none(self._search_regex(
r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))
formats, subtitles = self._extract_formats_and_subtitles(details.get('id'), block)
# audio articles will have an url and no formats
url = traverse_obj(block, ('fileUrl', {url_or_none}))
if not formats and url:
formats.append({'url': url, 'vcodec': 'none'})
return {
'id': remove_start(podcast_audio.get('target'), 'FT') or page_id,
'url': video_url,
'title': title,
'description': description,
'creator': track.get('credit'),
'series': podcast_title,
'episode': episode_title,
'episode_number': episode_number,
'duration': int_or_none(track.get('duration')),
**details,
'thumbnails': self._extract_thumbnails(traverse_obj(
block, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
'formats': formats,
'subtitles': subtitles
}
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
art_json = self._search_json(
r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
video_id = self._search_regex(
r'data-videoid=["\'](\d+)', webpage, 'video id',
default=None, fatal=False)
if video_id is not None:
return self._extract_video_from_id(video_id)
blocks = traverse_obj(art_json, (
'sprinkledBody', 'content', ..., ('ledeMedia', None),
lambda _, v: v['__typename'] in ('Video', 'Audio')))
if not blocks:
raise ExtractorError('Unable to extract any media blocks from webpage')
podcast_data = self._search_regex(
(r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
webpage, 'podcast data')
return self._extract_podcast_from_json(podcast_data, page_id, webpage)
common_info = {
'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'),
'description': traverse_obj(art_json, (
'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
'creator': ', '.join(
traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list)
'thumbnails': self._extract_thumbnails(traverse_obj(
art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
}
entries = []
for block in blocks:
entries.append(merge_dicts(self._extract_content_from_block(block), common_info))
if len(entries) > 1:
return self.playlist_result(entries, page_id, **common_info)
return {
'id': page_id,
**entries[0],
}
class NYTimesCookingIE(NYTimesBaseIE):
_VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)'
IE_NAME = 'NYTimesCookingGuide'
_VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3',
'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
'info_dict': {
'id': '100000004756089',
'ext': 'mov',
'timestamp': 1479383008,
'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON',
'title': 'Cranberry Tart',
'upload_date': '20161117',
'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.',
'id': '13-how-to-cook-a-turkey',
'title': 'How to Cook a Turkey',
'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0',
},
'playlist_count': 2,
}, {
# single video example
'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese',
'md5': '64415805fe0b8640fce6b0b9def5989a',
'info_dict': {
'id': '100000005835845',
'ext': 'mp4',
'title': 'How to Make Mac and Cheese',
'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1',
'timestamp': 1522950315,
'upload_date': '20180405',
'duration': 9.51,
'creator': 'Alison Roman',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
'md5': '4b2e8c70530a89b8d905a2b572316eb8',
'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake',
'md5': '64415805fe0b8640fce6b0b9def5989a',
'info_dict': {
'id': '100000003951728',
'ext': 'mov',
'timestamp': 1445509539,
'description': 'Turkey guide',
'upload_date': '20151022',
'title': 'Turkey',
}
'id': '20-how-to-frost-a-cake',
'title': 'How to Frost a Cake',
'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd',
},
'playlist_count': 8,
}]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
description = self._html_search_meta(['og:description', 'twitter:description'], webpage)
video_id = self._search_regex(
r'data-video-id=["\'](\d+)', webpage, 'video id')
lead_video_id = self._search_regex(
r'data-video-player-id="(\d+)"></div>', webpage, 'lead video')
media_ids = traverse_obj(
get_elements_html_by_class('video-item', webpage), (..., {extract_attributes}, 'data-video-id'))
return self._extract_video_from_id(video_id)
if media_ids:
media_ids.append(lead_video_id)
return self.playlist_result(
[self._extract_video(media_id) for media_id in media_ids], page_id, title, description)
return {
**self._extract_video(lead_video_id),
'title': title,
'description': description,
'creator': self._search_regex( # TODO: change to 'creators'
r'<span itemprop="author">([^<]+)</span></p>', webpage, 'author', default=None),
}
class NYTimesCookingRecipeIE(InfoExtractor):
_VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)'
_TESTS = [{
'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
'md5': '579e83bbe8e61e9de67f80edba8a78a8',
'info_dict': {
'id': '1017817',
'ext': 'mp4',
'title': 'Cranberry Curd Tart',
'description': 'md5:ad77a3fc321db636256d4343c5742152',
'timestamp': 1447804800,
'upload_date': '20151118',
'creator': 'David Tanis',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies',
'md5': '58df35998241dcf0620e99e646331b42',
'info_dict': {
'id': '1024781',
'ext': 'mp4',
'title': 'Neapolitan Checkerboard Cookies',
'description': 'md5:ba12394c585ababea951cb6d2fcc6631',
'timestamp': 1701302400,
'upload_date': '20231130',
'creator': 'Sue Li',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}, {
'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats',
'md5': '2fe7965a3adc899913b8e25ada360823',
'info_dict': {
'id': '1019516',
'ext': 'mp4',
'timestamp': 1546387200,
'description': 'md5:8856ce10239161bd2596ac335b9f9bfb',
'upload_date': '20190102',
'title': 'Overnight Oats',
'creator': 'Genevieve Ko',
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
},
}]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
recipe_data = self._search_nextjs_data(webpage, page_id)['props']['pageProps']['recipe']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls')
return {
**traverse_obj(recipe_data, {
'id': ('id', {str_or_none}),
'title': ('title', {str}),
'description': ('topnote', {clean_html}),
'timestamp': ('publishedAt', {int_or_none}),
'creator': ('contentAttribution', 'cardByline', {str}),
}),
'formats': formats,
'subtitles': subtitles,
'thumbnails': [{'url': thumb_url} for thumb_url in traverse_obj(
recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))],
}

View File

@@ -1,4 +1,6 @@
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
from ..utils import make_archive_id
class OneFootballIE(InfoExtractor):
@@ -7,41 +9,43 @@ class OneFootballIE(InfoExtractor):
_TESTS = [{
'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334',
'info_dict': {
'id': '34012334',
'id': 'Y2VtcWAT',
'ext': 'mp4',
'title': 'Highlights: FC Zürich 3-3 FC Basel',
'description': 'md5:33d9855cb790702c4fe42a513700aba8',
'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334',
'timestamp': 1635874604,
'upload_date': '20211102'
'thumbnail': 'https://cdn.jwplayer.com/v2/media/Y2VtcWAT/poster.jpg?width=720',
'timestamp': 1635874895,
'upload_date': '20211102',
'duration': 375.0,
'tags': ['Football', 'Soccer', 'OneFootball'],
'_old_archive_ids': ['onefootball 34012334'],
},
'params': {'skip_download': True}
'params': {'skip_download': True},
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020',
'info_dict': {
'id': '34041020',
'id': 'leVJrMho',
'ext': 'mp4',
'title': 'Klopp fumes at VAR decisions in West Ham defeat',
'description': 'md5:9c50371095a01ad3f63311c73d8f51a5',
'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020',
'timestamp': 1636314103,
'upload_date': '20211107'
'thumbnail': 'https://cdn.jwplayer.com/v2/media/leVJrMho/poster.jpg?width=720',
'timestamp': 1636315232,
'upload_date': '20211107',
'duration': 93.0,
'tags': ['Football', 'Soccer', 'OneFootball'],
'_old_archive_ids': ['onefootball 34041020'],
},
'params': {'skip_download': True}
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage = self._download_webpage(url, id)
data_json = self._search_json_ld(webpage, id)
m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id)
return {
'id': id,
'title': data_json.get('title'),
'description': data_json.get('description'),
'thumbnail': data_json.get('thumbnail'),
'timestamp': data_json.get('timestamp'),
'formats': formats,
'subtitles': subtitles,
}
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data_json = self._search_json_ld(webpage, video_id, fatal=False)
data_json.pop('url', None)
m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/\w+\.m3u8)', webpage, 'm3u8_url')
return self.url_result(
m3u8_url, JWPlatformIE, video_id, _old_archive_ids=[make_archive_id(self, video_id)],
**data_json, url_transparent=True)

View File

@@ -12,6 +12,8 @@ from ..compat import compat_str
class OpenRecBaseIE(InfoExtractor):
_M3U8_HEADERS = {'Referer': 'https://www.openrec.tv/'}
def _extract_pagestore(self, webpage, video_id):
return self._parse_json(
self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
@@ -21,7 +23,7 @@ class OpenRecBaseIE(InfoExtractor):
if not m3u8_url:
continue
yield from self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', m3u8_id=name)
m3u8_url, video_id, ext='mp4', m3u8_id=name, headers=self._M3U8_HEADERS)
def _extract_movie(self, webpage, video_id, name, is_live):
window_stores = self._extract_pagestore(webpage, video_id)
@@ -60,6 +62,7 @@ class OpenRecBaseIE(InfoExtractor):
'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')),
'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')),
'is_live': is_live,
'http_headers': self._M3U8_HEADERS,
}
@@ -110,7 +113,7 @@ class OpenRecCaptureIE(OpenRecBaseIE):
raise ExtractorError('Cannot extract title')
formats = self._extract_m3u8_formats(
capture_data.get('source'), video_id, ext='mp4')
capture_data.get('source'), video_id, ext='mp4', headers=self._M3U8_HEADERS)
return {
'id': video_id,
@@ -121,6 +124,7 @@ class OpenRecCaptureIE(OpenRecBaseIE):
'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str),
'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str),
'upload_date': unified_strdate(capture_data.get('createdAt')),
'http_headers': self._M3U8_HEADERS,
}

View File

@@ -1,3 +1,4 @@
import base64
import functools
import re
@@ -565,3 +566,66 @@ class ORFFM4StoryIE(InfoExtractor):
})
return self.playlist_result(entries)
class ORFONIE(InfoExtractor):
IE_NAME = 'orf:on'
_VALID_URL = r'https?://on\.orf\.at/video/(?P<id>\d{8})/(?P<slug>[\w-]+)'
_TESTS = [{
'url': 'https://on.orf.at/video/14210000/school-of-champions-48',
'info_dict': {
'id': '14210000',
'ext': 'mp4',
'duration': 2651.08,
'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0167/98/thumb_16697671_segments_highlight_teaser.jpeg',
'title': 'School of Champions (4/8)',
'description': 'md5:d09ad279fc2e8502611e7648484b6afd',
'media_type': 'episode',
'timestamp': 1706472362,
'upload_date': '20240128',
}
}]
def _extract_video(self, video_id, display_id):
encrypted_id = base64.b64encode(f'3dSlfek03nsLKdj4Jsd{video_id}'.encode()).decode()
api_json = self._download_json(
f'https://api-tvthek.orf.at/api/v4.3/public/episode/encrypted/{encrypted_id}', display_id)
formats, subtitles = [], {}
for manifest_type in traverse_obj(api_json, ('sources', {dict.keys}, ...)):
for manifest_url in traverse_obj(api_json, ('sources', manifest_type, ..., 'src', {url_or_none})):
if manifest_type == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
manifest_url, display_id, fatal=False, m3u8_id='hls')
elif manifest_type == 'dash':
fmts, subs = self._extract_mpd_formats_and_subtitles(
manifest_url, display_id, fatal=False, mpd_id='dash')
else:
continue
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(api_json, {
'duration': ('duration_second', {float_or_none}),
'title': (('title', 'headline'), {str}),
'description': (('description', 'teaser_text'), {str}),
'media_type': ('video_type', {str}),
}, get_all=False),
}
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'slug')
webpage = self._download_webpage(url, display_id)
return {
'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
'description': self._html_search_meta(
['description', 'og:description', 'twitter:description'], webpage, default=None),
**self._search_json_ld(webpage, display_id, fatal=False),
**self._extract_video(video_id, display_id),
}

View File

@@ -275,7 +275,7 @@ class PatreonIE(PatreonBaseIE):
'ext': ext,
'url': post_file['url'],
}
elif name == 'video':
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
return {
**info,

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +1,18 @@
import json
from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj
from ..utils import (
ExtractorError,
int_or_none,
parse_qs,
traverse_obj,
update_url_query,
urlencode_postdata,
)
class PlaySuisseIE(InfoExtractor):
_NETRC_MACHINE = 'playsuisse'
_VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)'
_TESTS = [
{
@@ -134,12 +142,47 @@ class PlaySuisseIE(InfoExtractor):
id
url
}'''
_LOGIN_BASE_URL = 'https://login.srgssr.ch/srgssrlogin.onmicrosoft.com'
_LOGIN_PATH = 'B2C_1A__SignInV2'
_ID_TOKEN = None
def _perform_login(self, username, password):
login_page = self._download_webpage(
'https://www.playsuisse.ch/api/sso/login', None, note='Downloading login page',
query={'x': 'x', 'locale': 'de', 'redirectUrl': 'https://www.playsuisse.ch/'})
settings = self._search_json(r'var\s+SETTINGS\s*=', login_page, 'settings', None)
csrf_token = settings['csrf']
query = {'tx': settings['transId'], 'p': self._LOGIN_PATH}
status = traverse_obj(self._download_json(
f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/SelfAsserted', None, 'Logging in',
query=query, headers={'X-CSRF-TOKEN': csrf_token}, data=urlencode_postdata({
'request_type': 'RESPONSE',
'signInName': username,
'password': password
}), expected_status=400), ('status', {int_or_none}))
if status == 400:
raise ExtractorError('Invalid username or password', expected=True)
urlh = self._request_webpage(
f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/api/CombinedSigninAndSignup/confirmed',
None, 'Downloading ID token', query={
'rememberMe': 'false',
'csrf_token': csrf_token,
**query,
'diags': '',
})
self._ID_TOKEN = traverse_obj(parse_qs(urlh.url), ('id_token', 0))
if not self._ID_TOKEN:
raise ExtractorError('Login failed')
def _get_media_data(self, media_id):
# NOTE In the web app, the "locale" header is used to switch between languages,
# However this doesn't seem to take effect when passing the header here.
response = self._download_json(
'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql',
'https://www.playsuisse.ch/api/graphql',
media_id, data=json.dumps({
'operationName': 'AssetWatch',
'query': self._GRAPHQL_QUERY,
@@ -150,6 +193,9 @@ class PlaySuisseIE(InfoExtractor):
return response['data']['assetV2']
def _real_extract(self, url):
if not self._ID_TOKEN:
self.raise_login_required(method='password')
media_id = self._match_id(url)
media_data = self._get_media_data(media_id)
info = self._extract_single(media_data)
@@ -168,7 +214,8 @@ class PlaySuisseIE(InfoExtractor):
if not media.get('url') or media.get('type') != 'HLS':
continue
f, subs = self._extract_m3u8_formats_and_subtitles(
media['url'], media_data['id'], 'mp4', m3u8_id='HLS', fatal=False)
update_url_query(media['url'], {'id_token': self._ID_TOKEN}),
media_data['id'], 'mp4', m3u8_id='HLS', fatal=False)
formats.extend(f)
self._merge_subtitles(subs, target=subtitles)

View File

@@ -87,8 +87,8 @@ class PornHubBaseIE(InfoExtractor):
def is_logged(webpage):
return any(re.search(p, webpage) for p in (
r'class=["\']signOut',
r'>Sign\s+[Oo]ut\s*<'))
r'id="profileMenuDropdown"',
r'class="ph-icon-logout"'))
if is_logged(login_page):
self._logged_in = True

View File

@@ -18,7 +18,6 @@ from ..utils.traversal import traverse_obj
class Pr0grammIE(InfoExtractor):
_VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
_TESTS = [{
# Tags require account
'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': {
'id': '5466437',
@@ -36,7 +35,6 @@ class Pr0grammIE(InfoExtractor):
'_old_archive_ids': ['pr0grammstatic 5466437'],
},
}, {
# Tags require account
'url': 'https://pr0gramm.com/new/3052805:comment28391322',
'info_dict': {
'id': '3052805',
@@ -71,6 +69,23 @@ class Pr0grammIE(InfoExtractor):
'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
'_old_archive_ids': ['pr0grammstatic 5848332'],
},
}, {
'url': 'https://pr0gramm.com/top/5895149',
'info_dict': {
'id': '5895149',
'ext': 'mp4',
'title': 'pr0gramm-5895149 by algoholigSeeManThrower',
'tags': 'count:19',
'uploader': 'algoholigSeeManThrower',
'uploader_id': 457556,
'upload_timestamp': 1697580902,
'upload_date': '20231018',
'like_count': int,
'dislike_count': int,
'age_limit': 0,
'thumbnail': 'https://thumb.pr0gramm.com/2023/10/18/db47bb3db5e1a1b3.jpg',
'_old_archive_ids': ['pr0grammstatic 5895149'],
},
}, {
'url': 'https://pr0gramm.com/static/5466437',
'only_matching': True,
@@ -92,15 +107,15 @@ class Pr0grammIE(InfoExtractor):
def _maximum_flags(self):
# We need to guess the flags for the content otherwise the api will raise an error
# We can guess the maximum allowed flags for the account from the cookies
# Bitflags are (msbf): nsfp, nsfl, nsfw, sfw
flags = 0b0001
# Bitflags are (msbf): pol, nsfp, nsfl, nsfw, sfw
flags = 0b10001
if self._is_logged_in:
flags |= 0b1000
flags |= 0b01000
cookies = self._get_cookies(self.BASE_URL)
if 'me' not in cookies:
self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
flags |= 0b0110
flags |= 0b00110
return flags
@@ -134,14 +149,12 @@ class Pr0grammIE(InfoExtractor):
if not source or not source.endswith('mp4'):
self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
tags = None
if self._is_logged_in:
metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
# Sorted by "confidence", higher confidence = earlier in list
confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
if confidences:
tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
formats = traverse_obj(video_info, ('variants', ..., {
'format_id': ('name', {str}),

View File

@@ -1,5 +1,8 @@
import json
from .common import InfoExtractor
from ..utils import parse_iso8601, traverse_obj, try_call
from ..utils import float_or_none, parse_iso8601, str_or_none, try_call
from ..utils.traversal import traverse_obj
class PrankCastIE(InfoExtractor):
@@ -64,3 +67,71 @@ class PrankCastIE(InfoExtractor):
'categories': [json_info.get('broadcast_category')],
'tags': try_call(lambda: json_info['broadcast_tags'].split(','))
}
class PrankCastPostIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?prankcast\.com/[^/?#]+/posts/(?P<id>\d+)-(?P<display_id>[^/?#]+)'
_TESTS = [{
'url': 'https://prankcast.com/devonanustart/posts/6214-happy-national-rachel-day-',
'info_dict': {
'id': '6214',
'ext': 'mp3',
'title': 'Happy National Rachel Day!',
'display_id': 'happy-national-rachel-day-',
'timestamp': 1704333938,
'uploader': 'Devonanustart',
'channel_id': '4',
'duration': 13175,
'cast': ['Devonanustart'],
'description': '',
'categories': ['prank call'],
'upload_date': '20240104'
}
}, {
'url': 'https://prankcast.com/despicabledogs/posts/6217-jake-the-work-crow-',
'info_dict': {
'id': '6217',
'ext': 'mp3',
'title': 'Jake the Work Crow!',
'display_id': 'jake-the-work-crow-',
'timestamp': 1704346592,
'uploader': 'despicabledogs',
'channel_id': '957',
'duration': 263.287,
'cast': ['despicabledogs'],
'description': 'https://imgur.com/a/vtxLvKU',
'categories': [],
'upload_date': '20240104'
}
}]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
webpage = self._download_webpage(url, video_id)
post = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_posts']
content = self._parse_json(post['post_contents_json'], video_id)[0]
uploader = post.get('user_name')
guests_json = traverse_obj(content, ('guests_json', {json.loads}, {dict})) or {}
return {
'id': video_id,
'title': post.get('post_title') or self._og_search_title(webpage),
'display_id': display_id,
'url': content.get('url'),
'timestamp': parse_iso8601(content.get('start_date') or content.get('crdate'), ' '),
'uploader': uploader,
'channel_id': str_or_none(post.get('user_id')),
'duration': float_or_none(content.get('duration')),
'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))),
'description': post.get('post_body'),
'categories': list(filter(None, [content.get('category')])),
'tags': try_call(lambda: list(filter('', post['post_tags'].split(',')))),
'subtitles': {
'live_chat': [{
'url': f'https://prankcast.com/api/private/chat/select-broadcast?id={post["content_id"]}&cache=',
'ext': 'json',
}],
} if post.get('content_id') else None
}

View File

@@ -1,5 +1,6 @@
import base64
import random
import re
import urllib.parse
from .common import InfoExtractor
@@ -11,6 +12,7 @@ from ..utils import (
unified_timestamp,
update_url_query,
)
from ..utils.traversal import traverse_obj
class RadikoBaseIE(InfoExtractor):
@@ -159,6 +161,12 @@ class RadikoBaseIE(InfoExtractor):
return formats
def _extract_performers(self, prog):
performers = traverse_obj(prog, (
'pfm/text()', ..., {lambda x: re.split(r'[//、 ,]', x)}, ..., {str.strip}))
# TODO: change 'artist' fields to 'artists' and return traversal list instead of str
return ', '.join(performers) or None
class RadikoIE(RadikoBaseIE):
_VALID_URL = r'https?://(?:www\.)?radiko\.jp/#!/ts/(?P<station>[A-Z0-9-]+)/(?P<id>\d+)'
@@ -186,10 +194,12 @@ class RadikoIE(RadikoBaseIE):
return {
'id': video_id,
'title': try_call(lambda: prog.find('title').text),
'artist': self._extract_performers(prog),
'description': clean_html(try_call(lambda: prog.find('info').text)),
'uploader': try_call(lambda: station_program.find('.//name').text),
'uploader_id': station,
'timestamp': vid_int,
'duration': try_call(lambda: unified_timestamp(radio_end, False) - unified_timestamp(radio_begin, False)),
'is_live': True,
'formats': self._extract_formats(
video_id=video_id, station=station, is_onair=False,
@@ -243,6 +253,7 @@ class RadikoRadioIE(RadikoBaseIE):
return {
'id': station,
'title': title,
'artist': self._extract_performers(prog),
'description': description,
'uploader': station_name,
'uploader_id': station,

View File

@@ -1,6 +1,7 @@
import re
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
clean_html,
determine_ext,
@@ -91,7 +92,7 @@ class RaiBaseIE(InfoExtractor):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
if not audio_only and not is_live:
formats.extend(self._create_http_urls(media_url, relinker_url, formats))
formats.extend(self._create_http_urls(media_url, relinker_url, formats, video_id))
return filter_dict({
'is_live': is_live,
@@ -99,7 +100,7 @@ class RaiBaseIE(InfoExtractor):
'formats': formats,
})
def _create_http_urls(self, manifest_url, relinker_url, fmts):
def _create_http_urls(self, manifest_url, relinker_url, fmts, video_id):
_MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
@@ -166,6 +167,14 @@ class RaiBaseIE(InfoExtractor):
'fps': 25,
}
# Check if MP4 download is available
try:
self._request_webpage(
HEADRequest(_MP4_TMPL % (relinker_url, '*')), video_id, 'Checking MP4 availability')
except ExtractorError as e:
self.to_screen(f'{video_id}: MP4 direct download is not available: {e.cause}')
return []
# filter out single-stream formats
fmts = [f for f in fmts
if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none']

135
yt_dlp/extractor/redge.py Normal file
View File

@@ -0,0 +1,135 @@
import functools
from .common import InfoExtractor
from ..networking import HEADRequest
from ..utils import (
float_or_none,
int_or_none,
join_nonempty,
parse_qs,
update_url_query,
)
from ..utils.traversal import traverse_obj
class RedCDNLivxIE(InfoExtractor):
_VALID_URL = r'https?://[^.]+\.(?:dcs\.redcdn|atmcdn)\.pl/(?:live(?:dash|hls|ss)|nvr)/o2/(?P<tenant>[^/?#]+)/(?P<id>[^?#]+)\.livx'
IE_NAME = 'redcdnlivx'
_TESTS = [{
'url': 'https://r.dcs.redcdn.pl/livedash/o2/senat/ENC02/channel.livx?indexMode=true&startTime=638272860000&stopTime=638292544000',
'info_dict': {
'id': 'ENC02-638272860000-638292544000',
'ext': 'mp4',
'title': 'ENC02',
'duration': 19683.982,
'live_status': 'was_live',
},
}, {
'url': 'https://r.dcs.redcdn.pl/livedash/o2/sejm/ENC18/live.livx?indexMode=true&startTime=722333096000&stopTime=722335562000',
'info_dict': {
'id': 'ENC18-722333096000-722335562000',
'ext': 'mp4',
'title': 'ENC18',
'duration': 2463.995,
'live_status': 'was_live',
},
}, {
'url': 'https://r.dcs.redcdn.pl/livehls/o2/sportevolution/live/triathlon2018/warsaw.livx/playlist.m3u8?startTime=550305000000&stopTime=550327620000',
'info_dict': {
'id': 'triathlon2018-warsaw-550305000000-550327620000',
'ext': 'mp4',
'title': 'triathlon2018/warsaw',
'duration': 22619.98,
'live_status': 'was_live',
},
}, {
'url': 'https://n-25-12.dcs.redcdn.pl/nvr/o2/sejm/Migacz-ENC01/1.livx?startTime=722347200000&stopTime=722367345000',
'only_matching': True,
}, {
'url': 'https://redir.atmcdn.pl/nvr/o2/sejm/ENC08/1.livx?startTime=503831270000&stopTime=503840040000',
'only_matching': True,
}]
"""
Known methods (first in url path):
- `livedash` - DASH MPD
- `livehls` - HTTP Live Streaming
- `livess` - IIS Smooth Streaming
- `nvr` - CCTV mode, directly returns a file, typically flv, avc1, aac
- `sc` - shoutcast/icecast (audio streams, like radio)
"""
def _real_extract(self, url):
tenant, path = self._match_valid_url(url).group('tenant', 'id')
qs = parse_qs(url)
start_time = traverse_obj(qs, ('startTime', 0, {int_or_none}))
stop_time = traverse_obj(qs, ('stopTime', 0, {int_or_none}))
def livx_mode(mode):
suffix = ''
if mode == 'livess':
suffix = '/manifest'
elif mode == 'livehls':
suffix = '/playlist.m3u8'
file_qs = {}
if start_time:
file_qs['startTime'] = start_time
if stop_time:
file_qs['stopTime'] = stop_time
if mode == 'nvr':
file_qs['nolimit'] = 1
elif mode != 'sc':
file_qs['indexMode'] = 'true'
return update_url_query(f'https://r.dcs.redcdn.pl/{mode}/o2/{tenant}/{path}.livx{suffix}', file_qs)
# no id or title for a transmission. making ones up.
title = path \
.replace('/live', '').replace('live/', '') \
.replace('/channel', '').replace('channel/', '') \
.strip('/')
video_id = join_nonempty(title.replace('/', '-'), start_time, stop_time)
formats = []
# downloading the manifest separately here instead of _extract_ism_formats to also get some stream metadata
ism_res = self._download_xml_handle(
livx_mode('livess'), video_id,
note='Downloading ISM manifest',
errnote='Failed to download ISM manifest',
fatal=False)
ism_doc = None
if ism_res is not False:
ism_doc, ism_urlh = ism_res
formats, _ = self._parse_ism_formats_and_subtitles(ism_doc, ism_urlh.url, 'ss')
nvr_urlh = self._request_webpage(
HEADRequest(livx_mode('nvr')), video_id, 'Follow flv file redirect', fatal=False,
expected_status=lambda _: True)
if nvr_urlh and nvr_urlh.status == 200:
formats.append({
'url': nvr_urlh.url,
'ext': 'flv',
'format_id': 'direct-0',
'preference': -1, # might be slow
})
formats.extend(self._extract_mpd_formats(livx_mode('livedash'), video_id, mpd_id='dash', fatal=False))
formats.extend(self._extract_m3u8_formats(
livx_mode('livehls'), video_id, m3u8_id='hls', ext='mp4', fatal=False))
time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000
duration = traverse_obj(
ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None
live_status = None
if traverse_obj(ism_doc, '@IsLive') == 'TRUE':
live_status = 'is_live'
elif duration:
live_status = 'was_live'
return {
'id': video_id,
'title': title,
'formats': formats,
'duration': duration,
'live_status': live_status,
}

View File

@@ -7,11 +7,12 @@ from ..utils import (
str_to_int,
unified_strdate,
url_or_none,
urljoin,
)
class RedTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com(?:\.br)?/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)']
_TESTS = [{
'url': 'https://www.redtube.com/38864951',
@@ -34,6 +35,9 @@ class RedTubeIE(InfoExtractor):
}, {
'url': 'http://it.redtube.com/66418',
'only_matching': True,
}, {
'url': 'https://www.redtube.com.br/103224331',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -79,7 +83,7 @@ class RedTubeIE(InfoExtractor):
'media definitions', default='{}'),
video_id, fatal=False)
for media in medias if isinstance(medias, list) else []:
format_url = url_or_none(media.get('videoUrl'))
format_url = urljoin('https://www.redtube.com', media.get('videoUrl'))
if not format_url:
continue
format_id = media.get('format')

View File

@@ -247,17 +247,17 @@ class MujRozhlasIE(RozhlasBaseIE):
'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli',
'md5': '6f8fd68663e64936623e67c152a669e0',
'info_dict': {
'id': '10739193',
'id': '10787730',
'ext': 'mp3',
'title': 'Ach jo, zase to telecí! Řízek je mnohem míň český, než jsme si mysleli',
'description': 'md5:db7141e9caaedc9041ec7cefb9a62908',
'timestamp': 1684915200,
'modified_timestamp': 1684922446,
'modified_timestamp': 1687550432,
'series': 'Vykopávky',
'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg',
'channel_id': 'radio-wave',
'upload_date': '20230524',
'modified_date': '20230524',
'modified_date': '20230623',
},
}, {
# serial extraction
@@ -277,6 +277,26 @@ class MujRozhlasIE(RozhlasBaseIE):
'title': 'Nespavci',
'description': 'md5:c430adcbf9e2b9eac88b745881e814dc',
},
}, {
# serialPart
'url': 'https://www.mujrozhlas.cz/povidka/gustavo-adolfo-becquer-hora-duchu',
'info_dict': {
'id': '8889035',
'ext': 'm4a',
'title': 'Gustavo Adolfo Bécquer: Hora duchů',
'description': 'md5:343a15257b376c276e210b78e900ffea',
'chapter': 'Hora duchů a Polibek dva tajemné příběhy Gustava Adolfa Bécquera',
'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/2adfe1387fb140634be725c1ccf26214.jpg',
'timestamp': 1708173000,
'episode': 'Episode 1',
'episode_number': 1,
'series': 'Povídka',
'modified_date': '20240217',
'upload_date': '20240217',
'modified_timestamp': 1708173198,
'channel_id': 'vltava',
},
'params': {'skip_download': 'dash'},
}]
def _call_api(self, path, item_id, msg='API JSON'):
@@ -322,7 +342,7 @@ class MujRozhlasIE(RozhlasBaseIE):
entity = info['siteEntityBundle']
if entity == 'episode':
if entity in ('episode', 'serialPart'):
return self._extract_audio_entry(self._call_api(
'episodes', info['contentId'], 'episode info API JSON'))

View File

@@ -9,7 +9,6 @@ from ..utils import (
get_element_html_by_class,
get_elements_by_class,
int_or_none,
join_nonempty,
parse_count,
parse_duration,
unescapeHTML,
@@ -18,10 +17,10 @@ from ..utils.traversal import traverse_obj
class Rule34VideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?rule34video\.com/videos?/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://rule34video.com/videos/3065157/shot-it-mmd-hmv/',
'url': 'https://rule34video.com/video/3065157/shot-it-mmd-hmv/',
'md5': 'ffccac2c23799dabbd192621ae4d04f3',
'info_dict': {
'id': '3065157',
@@ -57,7 +56,7 @@ class Rule34VideoIE(InfoExtractor):
'comment_count': int,
'timestamp': 1640131200,
'description': '',
'creator': 'WildeerStudio',
'creators': ['WildeerStudio'],
'upload_date': '20211222',
'uploader': 'CerZule',
'uploader_url': 'https://rule34video.com/members/36281/',
@@ -81,13 +80,13 @@ class Rule34VideoIE(InfoExtractor):
'quality': quality,
})
categories, creator, uploader, uploader_url = [None] * 4
categories, creators, uploader, uploader_url = [None] * 4
for col in get_elements_by_class('col', webpage):
label = clean_html(get_element_by_class('label', col))
if label == 'Categories:':
categories = list(map(clean_html, get_elements_by_class('item', col)))
elif label == 'Artist:':
creator = join_nonempty(*map(clean_html, get_elements_by_class('item', col)), delim=', ')
creators = list(map(clean_html, get_elements_by_class('item', col)))
elif label == 'Uploaded By:':
uploader = clean_html(get_element_by_class('name', col))
uploader_url = extract_attributes(get_element_html_by_class('name', col) or '').get('href')
@@ -115,7 +114,7 @@ class Rule34VideoIE(InfoExtractor):
'comment_count': int_or_none(self._search_regex(
r'[^(]+\((\d+)\)', get_element_by_attribute('href', '#tab_comments', webpage), 'comment count', fatal=False)),
'age_limit': 18,
'creator': creator,
'creators': creators,
'uploader': uploader,
'uploader_url': uploader_url,
'categories': categories,

View File

@@ -383,7 +383,7 @@ class RumbleChannelIE(InfoExtractor):
if isinstance(e.cause, HTTPError) and e.cause.status == 404:
break
raise
for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage):
for video_url in re.findall(r'class="[^>"]*videostream__link[^>]+href="([^"]+\.html)"', webpage):
yield self.url_result('https://rumble.com' + video_url)
def _real_extract(self, url):

View File

@@ -5,7 +5,10 @@ from ..utils import traverse_obj, update_url_query
class ScreencastifyIE(InfoExtractor):
_VALID_URL = r'https?://watch\.screencastify\.com/v/(?P<id>[^/?#]+)'
_VALID_URL = [
r'https?://watch\.screencastify\.com/v/(?P<id>[^/?#]+)',
r'https?://app\.screencastify\.com/v[23]/watch/(?P<id>[^/?#]+)',
]
_TESTS = [{
'url': 'https://watch.screencastify.com/v/sYVkZip3quLKhHw4Ybk8',
'info_dict': {
@@ -19,6 +22,21 @@ class ScreencastifyIE(InfoExtractor):
'params': {
'skip_download': 'm3u8',
},
}, {
'url': 'https://app.screencastify.com/v3/watch/J5N7H11wofDN1jZUCr3t',
'info_dict': {
'id': 'J5N7H11wofDN1jZUCr3t',
'ext': 'mp4',
'uploader': 'Scott Piesen',
'description': '',
'title': 'Lesson Recording 1-17 Burrr...',
},
'params': {
'skip_download': 'm3u8',
},
}, {
'url': 'https://app.screencastify.com/v2/watch/BQ26VbUdfbQLhKzkktOk',
'only_matching': True,
}]
def _real_extract(self, url):

218
yt_dlp/extractor/sejmpl.py Normal file
View File

@@ -0,0 +1,218 @@
import datetime
from .common import InfoExtractor
from .redge import RedCDNLivxIE
from ..utils import (
clean_html,
join_nonempty,
js_to_json,
strip_or_none,
update_url_query,
)
from ..utils.traversal import traverse_obj
def is_dst(date):
last_march = datetime.datetime(date.year, 3, 31)
last_october = datetime.datetime(date.year, 10, 31)
last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7)
last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7)
return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
def rfc3339_to_atende(date):
date = datetime.datetime.fromisoformat(date)
date = date + datetime.timedelta(hours=1 if is_dst(date) else 0)
return int((date.timestamp() - 978307200) * 1000)
class SejmIE(InfoExtractor):
_VALID_URL = (
r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
)
IE_NAME = 'sejm'
_TESTS = [{
# multiple cameras, polish SL iterpreter
'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
'info_dict': {
'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
'title': '1. posiedzenie Sejmu X kadencji',
'duration': 20145,
'live_status': 'was_live',
'location': 'Sala Posiedzeń',
},
'playlist': [{
'info_dict': {
'id': 'ENC01-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC01',
'live_status': 'was_live',
},
}, {
'info_dict': {
'id': 'ENC30-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC30',
'live_status': 'was_live',
},
}, {
'info_dict': {
'id': 'ENC31-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC31',
'live_status': 'was_live',
},
}, {
'info_dict': {
'id': 'ENC32-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - ENC32',
'live_status': 'was_live',
},
}, {
# sign lang interpreter
'info_dict': {
'id': 'Migacz-ENC01-1-722340000000-722360145000',
'ext': 'mp4',
'duration': 20145,
'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
'live_status': 'was_live',
},
}],
}, {
'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
'info_dict': {
'id': '9377A9D65518E9A5C125808E002E9FF2',
'title': 'Debata "Lepsza Polska: obywatelska"',
'description': 'KP .Nowoczesna',
'duration': 8770,
'live_status': 'was_live',
'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
},
'playlist': [{
'info_dict': {
'id': 'ENC08-1-503831270000-503840040000',
'ext': 'mp4',
'duration': 8770,
'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
'live_status': 'was_live',
},
}],
}, {
# 7th term is very special, since it does not use redcdn livx
'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
'info_dict': {
'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
'description': 'SLD - Biuro Prasowe Klubu',
'duration': 514,
'location': 'sala 101/bud. C',
'live_status': 'was_live',
},
'playlist': [{
'info_dict': {
'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
'ext': 'mp4',
'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
'duration': 514,
},
}],
}, {
'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
'only_matching': True,
}]
def _real_extract(self, url):
term, video_id = self._match_valid_url(url).group('term', 'id')
frame = self._download_webpage(
f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
video_id)
# despite it says "transmisje_arch", it works for live streams too!
data = self._download_json(
f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
video_id)
params = data['params']
title = strip_or_none(data.get('title'))
if data.get('status') == 'VIDEO_ENDED':
live_status = 'was_live'
elif data.get('status') == 'VIDEO_PLAYING':
live_status = 'is_live'
else:
live_status = None
self.report_warning(f'unknown status: {data.get("status")}')
start_time = rfc3339_to_atende(params['start'])
# current streams have a stop time of *expected* end of session, but actual times
# can change during the transmission. setting a stop_time would artificially
# end the stream at that time, while the session actually keeps going.
if live_status == 'was_live':
stop_time = rfc3339_to_atende(params['stop'])
duration = (stop_time - start_time) // 1000
else:
stop_time, duration = None, None
entries = []
def add_entry(file, legacy_file=False):
if not file:
return
file = self._proto_relative_url(file)
if not legacy_file:
file = update_url_query(file, {'startTime': start_time})
if stop_time is not None:
file = update_url_query(file, {'stopTime': stop_time})
stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
common_info = {
'url': file,
'duration': duration,
}
if legacy_file:
entries.append({
**common_info,
'id': video_id,
'title': title,
})
else:
entries.append({
**common_info,
'_type': 'url_transparent',
'ie_key': RedCDNLivxIE.ie_key(),
'id': stream_id,
'title': join_nonempty(title, stream_id, delim=' - '),
})
cameras = self._search_json(
r'var\s+cameras\s*=', frame, 'camera list', video_id,
contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
fatal=False) or []
for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
if camera_file.get('flv'):
add_entry(camera_file['flv'])
elif camera_file.get('mp4'):
# this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
add_entry(camera_file['mp4'], legacy_file=True)
else:
self.report_warning('Unknown camera stream type found')
if params.get('mig'):
add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
return {
'_type': 'playlist',
'entries': entries,
'id': video_id,
'title': title,
'description': clean_html(data.get('desc')) or None,
'duration': duration,
'live_status': live_status,
'location': strip_or_none(data.get('location')),
}

View File

@@ -7,8 +7,6 @@ from ..utils import (
determine_ext,
dict_get,
int_or_none,
str_or_none,
strip_or_none,
traverse_obj,
try_get,
unified_timestamp,
@@ -388,15 +386,55 @@ class SVTSeriesIE(SVTPlayBaseIE):
dict_get(series, ('longDescription', 'shortDescription')))
class SVTPageIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))'
class SVTPageIE(SVTBaseIE):
_VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/?#]+/)*(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://www.svt.se/nyheter/lokalt/skane/viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
'info_dict': {
'title': 'Viktor, 18, förlorade armar och ben i sepsis vill återuppta karaten och bli svetsare',
'id': 'viktor-18-forlorade-armar-och-ben-i-sepsis-vill-ateruppta-karaten-och-bli-svetsare',
},
'playlist_count': 2,
}, {
'url': 'https://www.svt.se/nyheter/lokalt/skane/forsvarsmakten-om-trafikkaoset-pa-e22-kunde-inte-varit-dar-snabbare',
'info_dict': {
'id': 'jXvk42E',
'title': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
'ext': 'mp4',
"duration": 80,
'age_limit': 0,
'timestamp': 1704370009,
'episode': 'Försvarsmakten om trafikkaoset på E22: Kunde inte varit där snabbare',
'series': 'Lokala Nyheter Skåne',
'upload_date': '20240104'
},
'params': {
'skip_download': True,
}
}, {
'url': 'https://www.svt.se/nyheter/svtforum/2023-tungt-ar-for-svensk-media',
'info_dict': {
'title': '2023 tungt år för svensk media',
'id': 'ewqAZv4',
'ext': 'mp4',
"duration": 3074,
'age_limit': 0,
'series': '',
'timestamp': 1702980479,
'upload_date': '20231219',
'episode': 'Mediestudier'
},
'params': {
'skip_download': True,
}
}, {
'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa',
'info_dict': {
'id': '25298267',
'title': 'Bakom masken Lehners kamp mot mental ohälsa',
},
'playlist_count': 4,
'skip': 'Video is gone'
}, {
'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien',
'info_dict': {
@@ -404,6 +442,7 @@ class SVTPageIE(InfoExtractor):
'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien',
},
'playlist_count': 2,
'skip': 'Video is gone'
}, {
# only programTitle
'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
@@ -414,6 +453,7 @@ class SVTPageIE(InfoExtractor):
'duration': 27,
'age_limit': 0,
},
'skip': 'Video is gone'
}, {
'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
'only_matching': True,
@@ -427,26 +467,23 @@ class SVTPageIE(InfoExtractor):
return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
def _real_extract(self, url):
path, display_id = self._match_valid_url(url).groups()
display_id = self._match_id(url)
article = self._download_json(
'https://api.svt.se/nss-api/page/' + path, display_id,
query={'q': 'articles'})['articles']['content'][0]
webpage = self._download_webpage(url, display_id)
title = self._og_search_title(webpage)
entries = []
urql_state = self._search_json(
r'window\.svt\.nyh\.urqlState\s*=', webpage, 'json data', display_id)
def _process_content(content):
if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'):
video_id = compat_str(content['image']['svtId'])
entries.append(self.url_result(
'svt:' + video_id, SVTPlayIE.ie_key(), video_id))
data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {}
for media in article.get('media', []):
_process_content(media)
def entries():
for video_id in set(traverse_obj(data, (
'page', (('topMedia', 'svtId'), ('body', ..., 'video', 'svtId')), {str}
))):
info = self._extract_video(
self._download_json(f'https://api.svt.se/video/{video_id}', video_id), video_id)
info['title'] = title
yield info
for obj in article.get('structuredBody', []):
_process_content(obj.get('content') or {})
return self.playlist_result(
entries, str_or_none(article.get('id')),
strip_or_none(article.get('title')))
return self.playlist_result(entries(), display_id, title)

View File

@@ -1,5 +1,5 @@
from .common import InfoExtractor
from ..utils import int_or_none, traverse_obj
from ..utils import ExtractorError, int_or_none, traverse_obj
class SwearnetEpisodeIE(InfoExtractor):
@@ -51,7 +51,13 @@ class SwearnetEpisodeIE(InfoExtractor):
display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num')
webpage = self._download_webpage(url, display_id)
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
try:
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
except ExtractorError:
if 'Upgrade Now' in webpage:
self.raise_login_required()
raise
json_data = self._download_json(
f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0]

View File

@@ -6,7 +6,7 @@ import string
import time
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
from ..compat import compat_urllib_parse_urlparse
from ..networking import HEADRequest
from ..utils import (
ExtractorError,
@@ -15,7 +15,6 @@ from ..utils import (
UserNotLive,
determine_ext,
format_field,
get_first,
int_or_none,
join_nonempty,
merge_dicts,
@@ -219,8 +218,8 @@ class TikTokBaseIE(InfoExtractor):
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
if res:
known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height'))
known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width'))
known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
parsed_meta.update(known_resolutions.get(res, {}))
add_meta.setdefault('height', int_or_none(res[:-1]))
return [{
@@ -237,22 +236,26 @@ class TikTokBaseIE(InfoExtractor):
# Hack: Add direct video links first to prioritize them when removing duplicate formats
formats = []
width = int_or_none(video_info.get('width'))
height = int_or_none(video_info.get('height'))
if video_info.get('play_addr'):
formats.extend(extract_addr(video_info['play_addr'], {
'format_id': 'play_addr',
'format_note': 'Direct video',
'vcodec': 'h265' if traverse_obj(
video_info, 'is_bytevc1', 'is_h265') else 'h264', # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
'width': video_info.get('width'),
'height': video_info.get('height'),
'width': width,
'height': height,
}))
if video_info.get('download_addr'):
formats.extend(extract_addr(video_info['download_addr'], {
download_addr = video_info['download_addr']
dl_width = int_or_none(download_addr.get('width'))
formats.extend(extract_addr(download_addr, {
'format_id': 'download_addr',
'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
'vcodec': 'h264',
'width': video_info.get('width'),
'height': video_info.get('height'),
'width': dl_width or width,
'height': try_call(lambda: int(dl_width / 0.5625)) or height, # download_addr['height'] is wrong
'preference': -2 if video_info.get('has_watermark') else -1,
}))
if video_info.get('play_addr_h264'):
@@ -315,9 +318,6 @@ class TikTokBaseIE(InfoExtractor):
return {
'id': aweme_id,
'extractor_key': TikTokIE.ie_key(),
'extractor': TikTokIE.IE_NAME,
'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
**traverse_obj(aweme_detail, {
'title': ('desc', {str}),
'description': ('desc', {str}),
@@ -921,20 +921,23 @@ class DouyinIE(TikTokBaseIE):
_VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.douyin.com/video/6961737553342991651',
'md5': 'a97db7e3e67eb57bf40735c022ffa228',
'md5': '9ecce7bc5b302601018ecb2871c63a75',
'info_dict': {
'id': '6961737553342991651',
'ext': 'mp4',
'title': '#杨超越 小小水手带你去远航❤️',
'description': '#杨超越 小小水手带你去远航❤️',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 19782,
'creators': ['杨超越'],
'duration': 19,
'timestamp': 1620905839,
'upload_date': '20210513',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@@ -943,20 +946,23 @@ class DouyinIE(TikTokBaseIE):
},
}, {
'url': 'https://www.douyin.com/video/6982497745948921092',
'md5': '34a87ebff3833357733da3fe17e37c0e',
'md5': '15c5e660b7048af3707304e3cc02bbb5',
'info_dict': {
'id': '6982497745948921092',
'ext': 'mp4',
'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
'uploader': '0731chaoyue',
'uploader_id': '408654318141572',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
'creator': '杨超越工作室',
'duration': 42479,
'creators': ['杨超越工作室'],
'duration': 42,
'timestamp': 1625739481,
'upload_date': '20210708',
'track': '@杨超越工作室创作的原声',
'artists': ['杨超越工作室'],
'view_count': int,
'like_count': int,
'repost_count': int,
@@ -965,20 +971,23 @@ class DouyinIE(TikTokBaseIE):
},
}, {
'url': 'https://www.douyin.com/video/6953975910773099811',
'md5': 'dde3302460f19db59c47060ff013b902',
'md5': '0e6443758b8355db9a3c34864a4276be',
'info_dict': {
'id': '6953975910773099811',
'ext': 'mp4',
'title': '#一起看海 出现在你的夏日里',
'description': '#一起看海 出现在你的夏日里',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 17343,
'creators': ['杨超越'],
'duration': 17,
'timestamp': 1619098692,
'upload_date': '20210422',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@@ -1004,20 +1013,23 @@ class DouyinIE(TikTokBaseIE):
'skip': 'No longer available',
}, {
'url': 'https://www.douyin.com/video/6963263655114722595',
'md5': 'cf9f11f0ec45d131445ec2f06766e122',
'md5': '1440bcf59d8700f8e014da073a4dfea8',
'info_dict': {
'id': '6963263655114722595',
'ext': 'mp4',
'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
'uploader': '6897520xka',
'uploader_id': '110403406559',
'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
'creator': '杨超越',
'duration': 15115,
'creators': ['杨超越'],
'duration': 15,
'timestamp': 1621261163,
'upload_date': '20210517',
'track': '@杨超越创作的原声',
'artists': ['杨超越'],
'view_count': int,
'like_count': int,
'repost_count': int,
@@ -1025,34 +1037,23 @@ class DouyinIE(TikTokBaseIE):
'thumbnail': r're:https?://.+\.jpe?g',
},
}]
_APP_VERSIONS = [('23.3.0', '230300')]
_APP_NAME = 'aweme'
_AID = 1128
_API_HOSTNAME = 'aweme.snssdk.com'
_UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
_WEBPAGE_HOST = 'https://www.douyin.com/'
def _real_extract(self, url):
video_id = self._match_id(url)
try:
return self._extract_aweme_app(video_id)
except ExtractorError as e:
e.expected = True
self.to_screen(f'{e}; trying with webpage')
webpage = self._download_webpage(url, video_id)
render_data = self._search_json(
r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id,
contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote)
if not render_data:
detail = traverse_obj(self._download_json(
'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
'Downloading web detail JSON', 'Failed to download web detail JSON',
query={'aweme_id': video_id}, fatal=False), ('aweme_detail', {dict}))
if not detail:
# TODO: Run verification challenge code to generate signature cookies
cookies = self._get_cookies(self._WEBPAGE_HOST)
expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid')
raise ExtractorError(
'Fresh cookies (not necessarily logged in) are needed', expected=expected)
'Fresh cookies (not necessarily logged in) are needed',
expected=not self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id'))
return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id)
return self._parse_aweme_video_app(detail)
class TikTokVMIE(InfoExtractor):

View File

@@ -21,7 +21,7 @@ from ..utils import (
class TVPIE(InfoExtractor):
IE_NAME = 'tvp'
IE_DESC = 'Telewizja Polska'
_VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|tvpworld\.com|swipeto\.pl)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'
_VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|tvpworld\.com|swipeto\.pl)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)(?:[/?#]|$)'
_TESTS = [{
# TVPlayer 2 in js wrapper
@@ -514,7 +514,7 @@ class TVPVODBaseIE(InfoExtractor):
class TVPVODVideoIE(TVPVODBaseIE):
IE_NAME = 'tvp:vod'
_VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$'
_VALID_URL = r'https?://vod\.tvp\.pl/(?P<category>[a-z\d-]+,\d+)/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
@@ -560,12 +560,23 @@ class TVPVODVideoIE(TVPVODBaseIE):
'thumbnail': 're:https?://.+',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://vod.tvp.pl/live,1/tvp-world,399731',
'info_dict': {
'id': '399731',
'ext': 'mp4',
'title': r're:TVP WORLD \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'live_status': 'is_live',
'thumbnail': 're:https?://.+',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
category, video_id = self._match_valid_url(url).group('category', 'id')
info_dict = self._parse_video(self._call_api(f'vods/{video_id}', video_id), with_url=False)
is_live = category == 'live,1'
entity = 'lives' if is_live else 'vods'
info_dict = self._parse_video(self._call_api(f'{entity}/{video_id}', video_id), with_url=False)
playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'})
@@ -582,6 +593,8 @@ class TVPVODVideoIE(TVPVODBaseIE):
'ext': 'ttml',
})
info_dict['is_live'] = is_live
return info_dict

View File

@@ -100,9 +100,13 @@ class TwitterBaseIE(InfoExtractor):
if not variant_url:
return [], {}
elif '.m3u8' in variant_url:
return self._extract_m3u8_formats_and_subtitles(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
variant_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
for f in traverse_obj(fmts, lambda _, v: v['vcodec'] == 'none' and v.get('tbr') is None):
if mobj := re.match(r'hls-[Aa]udio-(?P<bitrate>\d{4,})', f['format_id']):
f['tbr'] = int_or_none(mobj.group('bitrate'), 1000)
return fmts, subs
else:
tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None
f = {
@@ -471,6 +475,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ',
'channel_id': '549749560',
'uploader': 'FREE THE NIPPLE',
'uploader_id': 'freethenipple',
'duration': 12.922,
@@ -484,6 +489,7 @@ class TwitterIE(TwitterBaseIE):
'age_limit': 18,
'_old_archive_ids': ['twitter 643211948184596480'],
},
'skip': 'Requires authentication',
}, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
@@ -506,6 +512,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': r're:Star Wars.*A new beginning is coming December 18.*',
'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'channel_id': '20106852',
'uploader_id': 'starwars',
'uploader': r're:Star Wars.*',
'timestamp': 1447395772,
@@ -551,6 +558,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'jaydin donte geer - BEAT PROD: @suhmeduh #Damndaniel',
'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
'thumbnail': r're:^https?://.*\.jpg',
'channel_id': '1383165541',
'uploader': 'jaydin donte geer',
'uploader_id': 'jaydingeer',
'duration': 30.0,
@@ -591,6 +599,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.',
'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI',
'channel_id': '701615052',
'uploader_id': 'CaptainAmerica',
'uploader': 'Captain America',
'duration': 3.17,
@@ -627,6 +636,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
'channel_id': '2526757026',
'uploader': 'عالم الأخبار',
'uploader_id': 'news_al3alm',
'duration': 277.4,
@@ -651,6 +661,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.',
'thumbnail': r're:^https?://.*\.jpg',
'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo',
'channel_id': '2319432498',
'uploader': 'Préfet de Guadeloupe',
'uploader_id': 'Prefet971',
'duration': 47.48,
@@ -677,6 +688,7 @@ class TwitterIE(TwitterBaseIE):
'title': 're:.*?Shep is on a roll today.*?',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09',
'channel_id': '255036353',
'uploader': 'Lis Power',
'uploader_id': 'LisPower1',
'duration': 111.278,
@@ -741,6 +753,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'md5:d1c4941658e4caaa6cb579260d85dcba',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:71ead15ec44cee55071547d6447c6a3e',
'channel_id': '18552281',
'uploader': 'Brooklyn Nets',
'uploader_id': 'BrooklynNets',
'duration': 324.484,
@@ -763,10 +776,11 @@ class TwitterIE(TwitterBaseIE):
'id': '1577855447914409984',
'display_id': '1577855540407197696',
'ext': 'mp4',
'title': 'md5:9d198efb93557b8f8d5b78c480407214',
'title': 'md5:466a3a8b049b5f5a13164ce915484b51',
'description': 'md5:b9c3699335447391d11753ab21c70a74',
'upload_date': '20221006',
'uploader': 'oshtru',
'channel_id': '143077138',
'uploader': 'Oshtru',
'uploader_id': 'oshtru',
'uploader_url': 'https://twitter.com/oshtru',
'thumbnail': r're:^https?://.*\.jpg',
@@ -784,9 +798,10 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': {
'id': '1577719286659006464',
'title': 'Ultima - Test',
'title': 'Ultima Reload - Test',
'description': 'Test https://t.co/Y3KEZD7Dad',
'uploader': 'Ultima',
'channel_id': '168922496',
'uploader': 'Ultima Reload',
'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005',
@@ -808,6 +823,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'md5:eec26382babd0f7c18f041db8ae1c9c9',
'thumbnail': r're:^https?://.*\.jpg',
'description': 'md5:95aea692fda36a12081b9629b02daa92',
'channel_id': '1094109584',
'uploader': 'Max Olson',
'uploader_id': 'MesoMax919',
'uploader_url': 'https://twitter.com/MesoMax919',
@@ -830,6 +846,7 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4',
'title': str,
'description': str,
'channel_id': '1217167793541480450',
'uploader': str,
'uploader_id': 'Rizdraws',
'uploader_url': 'https://twitter.com/Rizdraws',
@@ -840,7 +857,8 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int,
'comment_count': int,
'age_limit': 18,
'tags': []
'tags': [],
'_old_archive_ids': ['twitter 1575199173472927762'],
},
'params': {'skip_download': 'The media could not be played'},
'skip': 'Requires authentication',
@@ -852,6 +870,7 @@ class TwitterIE(TwitterBaseIE):
'id': '1395079556562706435',
'title': str,
'tags': [],
'channel_id': '21539378',
'uploader': str,
'like_count': int,
'upload_date': '20210519',
@@ -869,6 +888,7 @@ class TwitterIE(TwitterBaseIE):
'info_dict': {
'id': '1578353380363501568',
'title': str,
'channel_id': '2195866214',
'uploader_id': 'DavidToons_',
'repost_count': int,
'like_count': int,
@@ -888,6 +908,7 @@ class TwitterIE(TwitterBaseIE):
'id': '1578401165338976258',
'title': str,
'description': 'md5:659a6b517a034b4cee5d795381a2dc41',
'channel_id': '19338359',
'uploader': str,
'uploader_id': 'primevideouk',
'timestamp': 1665155137,
@@ -929,6 +950,7 @@ class TwitterIE(TwitterBaseIE):
'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
'comment_count': int,
'uploader_id': 'CTVJLaidlaw',
'channel_id': '80082014',
'repost_count': int,
'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
'upload_date': '20221208',
@@ -946,6 +968,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1',
'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1670459604.0,
'channel_id': '80082014',
'uploader_id': 'CTVJLaidlaw',
'uploader': 'Jocelyn Laidlaw',
'repost_count': int,
@@ -972,6 +995,7 @@ class TwitterIE(TwitterBaseIE):
'title': '뽀 - 아 최우제 이동속도 봐',
'description': '아 최우제 이동속도 봐 https://t.co/dxu2U5vXXB',
'duration': 24.598,
'channel_id': '1281839411068432384',
'uploader': '',
'uploader_id': 's2FAKER',
'uploader_url': 'https://twitter.com/s2FAKER',
@@ -985,6 +1009,7 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int,
'_old_archive_ids': ['twitter 1621117700482416640'],
},
'skip': 'Requires authentication',
}, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
'info_dict': {
@@ -992,6 +1017,7 @@ class TwitterIE(TwitterBaseIE):
'display_id': '1599108751385972737',
'ext': 'mp4',
'title': '\u06ea - \U0001F48B',
'channel_id': '1347791436809441283',
'uploader_url': 'https://twitter.com/hlo_again',
'like_count': int,
'uploader_id': 'hlo_again',
@@ -1014,6 +1040,7 @@ class TwitterIE(TwitterBaseIE):
'id': '1600009362759733248',
'display_id': '1600009574919962625',
'ext': 'mp4',
'channel_id': '211814412',
'uploader_url': 'https://twitter.com/MunTheShinobi',
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
@@ -1061,6 +1088,7 @@ class TwitterIE(TwitterBaseIE):
'display_id': '1695424220702888009',
'title': 'md5:e8daa9527bc2b947121395494f786d9d',
'description': 'md5:004f2d37fd58737724ec75bc7e679938',
'channel_id': '15212187',
'uploader': 'Benny Johnson',
'uploader_id': 'bennyjohnson',
'uploader_url': 'https://twitter.com/bennyjohnson',
@@ -1084,6 +1112,7 @@ class TwitterIE(TwitterBaseIE):
'display_id': '1695424220702888009',
'title': 'md5:e8daa9527bc2b947121395494f786d9d',
'description': 'md5:004f2d37fd58737724ec75bc7e679938',
'channel_id': '15212187',
'uploader': 'Benny Johnson',
'uploader_id': 'bennyjohnson',
'uploader_url': 'https://twitter.com/bennyjohnson',
@@ -1117,7 +1146,7 @@ class TwitterIE(TwitterBaseIE):
},
'add_ie': ['TwitterBroadcast'],
}, {
# Animated gif and quote tweet video, with syndication API
# Animated gif and quote tweet video
'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950',
'playlist_mincount': 2,
'info_dict': {
@@ -1125,6 +1154,7 @@ class TwitterIE(TwitterBaseIE):
'title': 'BAKOON - https://t.co/zom968d0a0',
'description': 'https://t.co/zom968d0a0',
'tags': [],
'channel_id': '1263540390',
'uploader': 'BAKOON',
'uploader_id': 'BAKKOOONN',
'uploader_url': 'https://twitter.com/BAKKOOONN',
@@ -1132,19 +1162,21 @@ class TwitterIE(TwitterBaseIE):
'timestamp': 1693254077.0,
'upload_date': '20230828',
'like_count': int,
'comment_count': int,
'repost_count': int,
},
'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
'expected_warnings': ['Not all metadata'],
'skip': 'Requires authentication',
}, {
# "stale tweet" with typename "TweetWithVisibilityResults"
'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154',
'md5': '62b1e11cdc2cdd0e527f83adb081f536',
'md5': '511377ff8dfa7545307084dca4dce319',
'info_dict': {
'id': '1724883339285544960',
'ext': 'mp4',
'title': 'md5:cc56716f9ed0b368de2ba54c478e493c',
'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164',
'display_id': '1724884212803834154',
'channel_id': '337808606',
'uploader': 'Robert F. Kennedy Jr',
'uploader_id': 'RobertKennedyJr',
'uploader_url': 'https://twitter.com/RobertKennedyJr',
@@ -1386,6 +1418,7 @@ class TwitterIE(TwitterBaseIE):
'description': description,
'uploader': uploader,
'timestamp': unified_timestamp(status.get('created_at')),
'channel_id': str_or_none(status.get('user_id_str')) or str_or_none(user.get('id_str')),
'uploader_id': uploader_id,
'uploader_url': format_field(uploader_id, None, 'https://twitter.com/%s'),
'like_count': int_or_none(status.get('favorite_count')),

View File

@@ -10,6 +10,7 @@ from ..utils import (
parse_duration,
traverse_obj,
try_call,
url_or_none,
urljoin,
variadic,
)
@@ -83,6 +84,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.txxx.tube/contents/videos_sources/16574000/16574965/screenshots/1.jpg',
}
}, {
'url': 'https://txxx.tube/videos/16574965/digital-desire-malena-morgan/',
@@ -98,6 +100,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.txxx.tube/contents/videos_sources/16574000/16574965/screenshots/1.jpg',
}
}, {
'url': 'https://vxxx.com/video-68925/',
@@ -113,6 +116,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.vxxx.com/contents/videos_sources/68000/68925/screenshots/1.jpg',
}
}, {
'url': 'https://hclips.com/videos/6291073/malena-morgan-masturbates-her-sweet/',
@@ -128,6 +132,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/6291000/6291073/screenshots/1.jpg',
}
}, {
'url': 'https://hdzog.com/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/',
@@ -143,6 +148,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hdzog.com/contents/videos_sources/67000/67063/screenshots/1.jpg',
}
}, {
'url': 'https://hdzog.tube/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/',
@@ -158,6 +164,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hdzog.com/contents/videos_sources/67000/67063/screenshots/1.jpg',
}
}, {
'url': 'https://hotmovs.com/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/',
@@ -173,6 +180,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hotmovs.com/contents/videos_sources/8789000/8789287/screenshots/10.jpg',
}
}, {
'url': 'https://hotmovs.tube/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/',
@@ -188,6 +196,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.hotmovs.com/contents/videos_sources/8789000/8789287/screenshots/10.jpg',
}
}, {
'url': 'https://inporn.com/video/517897/malena-morgan-solo/',
@@ -203,6 +212,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://iptn.m3pd.com/media/tn/sources/517897_1.jpg',
}
}, {
'url': 'https://privatehomeclips.com/videos/3630599/malena-morgan-cam-show/',
@@ -218,6 +228,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/3630000/3630599/screenshots/15.jpg',
}
}, {
'url': 'https://tubepornclassic.com/videos/1015455/mimi-rogers-full-body-massage-nude-compilation/',
@@ -233,6 +244,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.tubepornclassic.com/contents/videos_sources/1015000/1015455/screenshots/6.jpg',
}
}, {
'url': 'https://upornia.com/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/',
@@ -248,6 +260,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.upornia.com/contents/videos_sources/1498000/1498858/screenshots/1.jpg',
}
}, {
'url': 'https://upornia.tube/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/',
@@ -263,6 +276,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.upornia.com/contents/videos_sources/1498000/1498858/screenshots/1.jpg',
}
}, {
'url': 'https://vjav.com/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/',
@@ -278,6 +292,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.vjav.com/contents/videos_sources/11000/11761/screenshots/23.jpg',
}
}, {
'url': 'https://vjav.tube/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/',
@@ -293,6 +308,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.vjav.com/contents/videos_sources/11000/11761/screenshots/23.jpg',
}
}, {
'url': 'https://voyeurhit.com/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/',
@@ -308,6 +324,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.voyeurhit.com/contents/videos_sources/332000/332875/screenshots/1.jpg',
}
}, {
'url': 'https://voyeurhit.tube/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/',
@@ -323,6 +340,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://tn.voyeurhit.com/contents/videos_sources/332000/332875/screenshots/1.jpg',
}
}]
_WEBPAGE_TESTS = [{
@@ -338,6 +356,7 @@ class TxxxIE(InfoExtractor):
'like_count': int,
'dislike_count': int,
'age_limit': 18,
'thumbnail': 'https://hctn.nv7s.com/contents/videos_sources/5119000/5119660/screenshots/1.jpg',
}
}]
@@ -371,6 +390,7 @@ class TxxxIE(InfoExtractor):
'like_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'likes'))),
'dislike_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'dislikes'))),
'age_limit': 18,
'thumbnail': traverse_obj(video_info, ('video', 'thumbsrc', {url_or_none})),
'formats': get_formats(host, video_file),
}

View File

@@ -10,7 +10,8 @@ from ..utils import (
class UtreonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?utreon\.com/v/(?P<id>[\w-]+)'
IE_NAME = 'playeur'
_VALID_URL = r'https?://(?:www\.)?(?:utreon|playeur)\.com/v/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://utreon.com/v/z_I7ikQbuDw',
'info_dict': {
@@ -19,8 +20,9 @@ class UtreonIE(InfoExtractor):
'title': 'Freedom Friday meditation - Rising in the wind',
'description': 'md5:a9bf15a42434a062fe313b938343ad1b',
'uploader': 'Heather Dawn Elemental Health',
'thumbnail': 'https://data-1.utreon.com/v/MG/M2/NT/z_I7ikQbuDw/z_I7ikQbuDw_preview.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210723',
'duration': 586,
}
}, {
'url': 'https://utreon.com/v/jerJw5EOOVU',
@@ -28,10 +30,11 @@ class UtreonIE(InfoExtractor):
'id': 'jerJw5EOOVU',
'ext': 'mp4',
'title': 'When I\'m alone, I love to reflect in peace, to make my dreams come true... [Quotes and Poems]',
'description': 'md5:61ee6c2da98be51b04b969ca80273aaa',
'description': 'md5:4026aa3a2c10169c3649926ac8ef62b6',
'uploader': 'Frases e Poemas Quotes and Poems',
'thumbnail': 'https://data-1.utreon.com/v/Mz/Zh/ND/jerJw5EOOVU/jerJw5EOOVU_89af85470a4b16eededde7f8674c96d9_cover.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210723',
'duration': 60,
}
}, {
'url': 'https://utreon.com/v/C4ZxXhYBBmE',
@@ -39,10 +42,11 @@ class UtreonIE(InfoExtractor):
'id': 'C4ZxXhYBBmE',
'ext': 'mp4',
'title': 'Bidens Capital Gains Tax Rate to Test Worlds Highest',
'description': 'md5:fb5a6c2e506f013cc76f133f673bc5c8',
'description': 'md5:995aa9ad0733c0e5863ebdeff954f40e',
'uploader': 'Nomad Capitalist',
'thumbnail': 'https://data-1.utreon.com/v/ZD/k1/Mj/C4ZxXhYBBmE/C4ZxXhYBBmE_628342076198c9c06dd6b2c665978584_cover.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210723',
'duration': 884,
}
}, {
'url': 'https://utreon.com/v/Y-stEH-FBm8',
@@ -52,15 +56,28 @@ class UtreonIE(InfoExtractor):
'title': 'Creeper-Chan Pranks Steve! 💚 [MINECRAFT ANIME]',
'description': 'md5:7a48450b0d761b96dec194be0c5ecb5f',
'uploader': 'Merryweather Comics',
'thumbnail': 'https://data-1.utreon.com/v/MT/E4/Zj/Y-stEH-FBm8/Y-stEH-FBm8_5290676a41a4a1096db133b09f54f77b_cover.jpg',
'thumbnail': r're:^https?://.+\.jpg',
'release_date': '20210718',
}},
]
'duration': 151,
}
}, {
'url': 'https://playeur.com/v/Wzqp-UrxSeu',
'info_dict': {
'id': 'Wzqp-UrxSeu',
'ext': 'mp4',
'title': 'Update: Clockwork Basilisk Books on the Way!',
'description': 'md5:d9756b0b1884c904655b0e170d17cea5',
'uploader': 'Forgotten Weapons',
'release_date': '20240208',
'thumbnail': r're:^https?://.+\.jpg',
'duration': 262,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._download_json(
'https://api.utreon.com/v1/videos/' + video_id,
'https://api.playeur.com/v1/videos/' + video_id,
video_id)
videos_json = json_data['videos']
formats = [{

View File

@@ -1,5 +1,6 @@
from .common import InfoExtractor
from ..utils import ExtractorError
from ..utils import ExtractorError, base_url, int_or_none, url_basename
from ..utils.traversal import traverse_obj
class Vbox7IE(InfoExtractor):
@@ -19,7 +20,7 @@ class Vbox7IE(InfoExtractor):
_GEO_COUNTRIES = ['BG']
_TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
'md5': '50ca1f78345a9c15391af47d8062d074',
'info_dict': {
'id': '0946fff23c',
'ext': 'mp4',
@@ -29,19 +30,25 @@ class Vbox7IE(InfoExtractor):
'timestamp': 1470982814,
'upload_date': '20160812',
'uploader': 'zdraveibulgaria',
},
'params': {
'proxy': '127.0.0.1:8118',
'view_count': int,
'duration': 2640,
},
}, {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
'md5': 'da1dd2eb245200cb86e6d09d43232116',
'info_dict': {
'id': '249bb972c2',
'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера',
'uploader': 'svideteliat_ot_varshava',
'view_count': int,
'timestamp': 1360215023,
'thumbnail': 'https://i49.vbox7.com/design/iconci/png/noimg6.png',
'description': 'Смях! Чудо - чист за секунди - Скрита камера',
'upload_date': '20130207',
'duration': 83,
},
'skip': 'georestricted',
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
'only_matching': True,
@@ -53,41 +60,38 @@ class Vbox7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
response = self._download_json(
'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id,
video_id)
data = self._download_json(
'https://www.vbox7.com/aj/player/item/options', video_id,
query={'vid': video_id})['options']
if 'error' in response:
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
src_url = data.get('src')
if src_url in (None, '', 'blank'):
raise ExtractorError('Video is unavailable', expected=True)
video = response['options']
fmt_base = url_basename(src_url).rsplit('.', 1)[0].rsplit('_', 1)[0]
if fmt_base == 'vn':
self.raise_geo_restricted()
title = video['title']
video_url = video['src']
fmt_base = base_url(src_url) + fmt_base
if '/na.mp4' in video_url:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
formats = self._extract_m3u8_formats(
f'{fmt_base}.m3u8', video_id, m3u8_id='hls', fatal=False)
# TODO: Add MPD formats, when dash range support is added
for res in traverse_obj(data, ('resolutions', lambda _, v: v != 0, {int})):
formats.append({
'url': f'{fmt_base}_{res}.mp4',
'format_id': f'http-{res}',
'height': res,
})
uploader = video.get('uploader')
webpage = self._download_webpage(
'http://vbox7.com/play:%s' % video_id, video_id, fatal=None)
info = {}
if webpage:
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False)
info.update({
return {
'id': video_id,
'title': title,
'url': video_url,
'uploader': uploader,
'thumbnail': self._proto_relative_url(
info.get('thumbnail') or self._og_search_thumbnail(webpage),
'http:'),
})
return info
'formats': formats,
**self._search_json_ld(self._download_webpage(
f'https://www.vbox7.com/play:{video_id}', video_id, fatal=False) or '', video_id, fatal=False),
**traverse_obj(data, {
'title': ('title', {str}),
'uploader': ('uploader', {str}),
'duration': ('duration', {int_or_none}),
}),
}

View File

@@ -12,7 +12,7 @@ from ..utils import (
class ViewLiftBaseIE(InfoExtractor):
_API_BASE = 'https://prod-api.viewlift.com/'
_DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv'
_DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb|chorki)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv'
_SITE_MAP = {
'ftfnext': 'lax',
'funnyforfree': 'snagfilms',
@@ -27,6 +27,7 @@ class ViewLiftBaseIE(InfoExtractor):
'snagxtreme': 'snagfilms',
'theidentitytb': 'tampabay',
'vayafilm': 'snagfilms',
'chorki': 'prothomalo',
}
_TOKENS = {}
@@ -296,6 +297,33 @@ class ViewLiftIE(ViewLiftBaseIE):
}, { # Premium movie
'url': 'https://www.hoichoi.tv/movies/detective-2020',
'only_matching': True
}, { # Chorki Premium series
'url': 'https://www.chorki.com/bn/series/sinpaat',
'playlist_mincount': 7,
'info_dict': {
'id': 'bn/series/sinpaat',
},
}, { # Chorki free movie
'url': 'https://www.chorki.com/bn/videos/bangla-movie-bikkhov',
'info_dict': {
'id': '564e755b-f5c7-4515-aee6-8959bee18c93',
'title': 'Bikkhov',
'ext': 'mp4',
'upload_date': '20230824',
'timestamp': 1692860553,
'categories': ['Action Movies', 'Salman Special'],
'tags': 'count:14',
'thumbnail': 'https://snagfilms-a.akamaihd.net/dd078ff5-b16e-45e4-9723-501b56b9df0a/images/2023/08/24/1692860450729_1920x1080_16x9Images.jpg',
'display_id': 'bn/videos/bangla-movie-bikkhov',
'description': 'md5:71492b086450625f4374a3eb824f27dc',
'duration': 8002,
},
'params': {
'skip_download': True,
},
}, { # Chorki Premium movie
'url': 'https://www.chorki.com/bn/videos/something-like-an-autobiography',
'only_matching': True,
}]
@classmethod

View File

@@ -48,17 +48,15 @@ class VimeoBaseInfoExtractor(InfoExtractor):
return url, data, headers
def _perform_login(self, username, password):
webpage = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
token, vuid = self._extract_xsrft_and_vuid(webpage)
viewer = self._download_json('https://vimeo.com/_next/viewer', None, 'Downloading login token')
data = {
'action': 'login',
'email': username,
'password': password,
'service': 'vimeo',
'token': token,
'token': viewer['xsrft'],
}
self._set_vimeo_cookie('vuid', vuid)
self._set_vimeo_cookie('vuid', viewer['vuid'])
try:
self._download_webpage(
self._LOGIN_URL, None, 'Logging in',
@@ -269,7 +267,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {}
if not jwt_response.get('jwt'):
return
headers = {'Authorization': 'jwt %s' % jwt_response['jwt']}
headers = {'Authorization': 'jwt %s' % jwt_response['jwt'], 'Accept': 'application/json'}
original_response = self._download_json(
f'https://api.vimeo.com/videos/{video_id}', video_id,
headers=headers, fatal=False, expected_status=(403, 404)) or {}
@@ -751,6 +749,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
video = self._download_json(
api_url, video_id, headers={
'Authorization': 'jwt ' + token,
'Accept': 'application/json',
}, query={
'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
})
@@ -785,7 +784,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,
album_id, headers={'Authorization': 'jwt ' + jwt},
album_id, headers={'Authorization': 'jwt ' + jwt, 'Accept': 'application/json'},
query={'fields': 'description,name,privacy'})
if try_get(album, lambda x: x['privacy']['view']) == 'password':
password = self.get_param('videopassword')
@@ -1147,10 +1146,12 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
'https://api.vimeo.com/albums/%s/videos' % album_id,
album_id, 'Downloading page %d' % api_page, query=query, headers={
'Authorization': 'jwt ' + authorization,
'Accept': 'application/json',
})['data']
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 400:
return
raise
for video in videos:
link = video.get('link')
if not link:
@@ -1171,7 +1172,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,
album_id, headers={'Authorization': 'jwt ' + jwt},
album_id, headers={'Authorization': 'jwt ' + jwt, 'Accept': 'application/json'},
query={'fields': 'description,name,privacy'})
hashed_pass = None
if try_get(album, lambda x: x['privacy']['view']) == 'password':

View File

@@ -1,159 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
parse_iso8601,
traverse_obj,
try_get,
)
class WASDTVBaseIE(InfoExtractor):
def _fetch(self, path, video_id, description, query={}):
response = self._download_json(
f'https://wasd.tv/api/{path}', video_id, query=query,
note=f'Downloading {description} metadata',
errnote=f'Unable to download {description} metadata')
error = response.get('error')
if error:
raise ExtractorError(f'{self.IE_NAME} returned error: {error}', expected=True)
return response.get('result')
def _extract_thumbnails(self, thumbnails_dict):
return [{
'url': url,
'preference': index,
} for index, url in enumerate(
traverse_obj(thumbnails_dict, (('small', 'medium', 'large'),))) if url]
def _real_extract(self, url):
container = self._get_container(url)
stream = traverse_obj(container, ('media_container_streams', 0))
media = try_get(stream, lambda x: x['stream_media'][0])
if not media:
raise ExtractorError('Can not extract media data.', expected=True)
media_meta = media.get('media_meta')
media_url, is_live = self._get_media_url(media_meta)
video_id = media.get('media_id') or container.get('media_container_id')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4')
return {
'id': str(video_id),
'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)),
'description': container.get('media_container_description'),
'thumbnails': self._extract_thumbnails(media_meta.get('media_preview_images')),
'timestamp': parse_iso8601(container.get('created_at')),
'view_count': int_or_none(stream.get('stream_current_viewers' if is_live else 'stream_total_viewers')),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
}
def _get_container(self, url):
raise NotImplementedError('Subclass for get media container')
def _get_media_url(self, media_meta):
raise NotImplementedError('Subclass for get media url')
class WASDTVStreamIE(WASDTVBaseIE):
IE_NAME = 'wasdtv:stream'
_VALID_URL = r'https?://wasd\.tv/(?P<id>[^/#?]+)$'
_TESTS = [{
'url': 'https://wasd.tv/24_7',
'info_dict': {
'id': '559738',
'ext': 'mp4',
'title': 'Live 24/7 Music',
'description': '24&#x2F;7 Music',
'timestamp': int,
'upload_date': r're:^\d{8}$',
'is_live': True,
'view_count': int,
},
}]
def _get_container(self, url):
nickname = self._match_id(url)
channel = self._fetch(f'channels/nicknames/{nickname}', video_id=nickname, description='channel')
channel_id = channel.get('channel_id')
containers = self._fetch(
'v2/media-containers', channel_id, 'running media containers',
query={
'channel_id': channel_id,
'media_container_type': 'SINGLE',
'media_container_status': 'RUNNING',
})
if not containers:
raise ExtractorError(f'{nickname} is offline', expected=True)
return containers[0]
def _get_media_url(self, media_meta):
return media_meta['media_url'], True
class WASDTVRecordIE(WASDTVBaseIE):
IE_NAME = 'wasdtv:record'
_VALID_URL = r'https?://wasd\.tv/[^/#?]+(?:/videos)?\?record=(?P<id>\d+)$'
_TESTS = [{
'url': 'https://wasd.tv/spacemita/videos?record=907755',
'md5': 'c9899dd85be4cc997816ff9f9ca516ce',
'info_dict': {
'id': '906825',
'ext': 'mp4',
'title': 'Музыкальный',
'description': 'md5:f510388d929ff60ae61d4c3cab3137cc',
'timestamp': 1645812079,
'upload_date': '20220225',
'thumbnail': r're:^https?://.+\.jpg',
'is_live': False,
'view_count': int,
},
}, {
'url': 'https://wasd.tv/spacemita?record=907755',
'only_matching': True,
}]
def _get_container(self, url):
container_id = self._match_id(url)
return self._fetch(
f'v2/media-containers/{container_id}', container_id, 'media container')
def _get_media_url(self, media_meta):
media_archive_url = media_meta.get('media_archive_url')
if media_archive_url:
return media_archive_url, False
return media_meta['media_url'], True
class WASDTVClipIE(WASDTVBaseIE):
IE_NAME = 'wasdtv:clip'
_VALID_URL = r'https?://wasd\.tv/[^/#?]+/clips\?clip=(?P<id>\d+)$'
_TESTS = [{
'url': 'https://wasd.tv/spacemita/clips?clip=26804',
'md5': '818885e720143d7a4e776ff66fcff148',
'info_dict': {
'id': '26804',
'ext': 'mp4',
'title': 'Пуш флексит на голове стримера',
'timestamp': 1646682908,
'upload_date': '20220307',
'thumbnail': r're:^https?://.+\.jpg',
'view_count': int,
},
}]
def _real_extract(self, url):
clip_id = self._match_id(url)
clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip')
clip_data = clip.get('clip_data')
formats, subtitles = self._extract_m3u8_formats_and_subtitles(clip_data.get('url'), video_id=clip_id, ext='mp4')
return {
'id': clip_id,
'title': clip.get('clip_title') or self._og_search_title(self._download_webpage(url, clip_id, fatal=False)),
'thumbnails': self._extract_thumbnails(clip_data.get('preview')),
'timestamp': parse_iso8601(clip.get('created_at')),
'view_count': int_or_none(clip.get('clip_views_count')),
'formats': formats,
'subtitles': subtitles,
}

View File

@@ -2068,7 +2068,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Voyeur Girl',
'description': 'md5:7ae382a65843d6df2685993e90a8628f',
'upload_date': '20190312',
'artist': 'Stephen',
'artists': ['Stephen'],
'creators': ['Stephen'],
'track': 'Voyeur Girl',
'album': 'it\'s too much love to know my dear',
'release_date': '20190313',
@@ -2081,7 +2082,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'channel': 'Stephen', # TODO: should be "Stephen - Topic"
'uploader': 'Stephen',
'availability': 'public',
'creator': 'Stephen',
'duration': 169,
'thumbnail': 'https://i.ytimg.com/vi_webp/MgNrAu2pzNs/maxresdefault.webp',
'age_limit': 0,
@@ -3669,15 +3669,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return orderedSet(requested_clients)
def _invalid_player_response(self, pr, video_id):
# YouTube may return a different video player response than expected.
# See: https://github.com/TeamNewPipe/NewPipe/issues/8713
if (pr_id := traverse_obj(pr, ('videoDetails', 'videoId'))) != video_id:
return pr_id
def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data):
initial_pr = None
if webpage:
initial_pr = self._search_json(
self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', video_id, fatal=False)
prs = []
if initial_pr and not self._invalid_player_response(initial_pr, video_id):
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
# stripped out even if not requested by the user
# See: https://github.com/yt-dlp/yt-dlp/issues/501
prs.append({**initial_pr, 'streamingData': None})
all_clients = set(clients)
clients = clients[::-1]
prs = []
def append_client(*client_names):
""" Append the first client name that exists but not already used """
@@ -3689,18 +3702,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
all_clients.add(actual_client)
return
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
# stripped out even if not requested by the user
# See: https://github.com/yt-dlp/yt-dlp/issues/501
if initial_pr:
pr = dict(initial_pr)
pr['streamingData'] = None
prs.append(pr)
last_error = None
tried_iframe_fallback = False
player_url = None
skipped_clients = {}
while clients:
client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = master_ytcfg if client == 'web' else {}
@@ -3721,26 +3725,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response(
client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data)
except ExtractorError as e:
if last_error:
self.report_warning(last_error)
last_error = e
self.report_warning(e)
continue
if pr:
# YouTube may return a different video player response than expected.
# See: https://github.com/TeamNewPipe/NewPipe/issues/8713
pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId'))
if pr_video_id and pr_video_id != video_id:
self.report_warning(
f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message())
else:
# Save client name for introspection later
name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {}
sd[STREAMING_DATA_CLIENT_NAME] = name
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = name
prs.append(pr)
if pr_id := self._invalid_player_response(pr, video_id):
skipped_clients[client] = pr_id
elif pr:
# Save client name for introspection later
name = short_client_name(client)
sd = traverse_obj(pr, ('streamingData', {dict})) or {}
sd[STREAMING_DATA_CLIENT_NAME] = name
for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
f[STREAMING_DATA_CLIENT_NAME] = name
prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated:
@@ -3751,10 +3748,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
elif not variant:
append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded')
if last_error:
if not len(prs):
raise last_error
self.report_warning(last_error)
if skipped_clients:
self.report_warning(
f'Skipping player responses from {"/".join(skipped_clients)} clients '
f'(got player responses for video "{"/".join(set(skipped_clients.values()))}" instead of "{video_id}")')
if not prs:
raise ExtractorError(
'All player responses are invalid. Your IP is likely being blocked by Youtube', expected=True)
elif not prs:
raise ExtractorError('Failed to extract any player response')
return prs, player_url
def _needs_live_processing(self, live_status, duration):
@@ -4418,7 +4420,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
release_year = release_date[:4]
info.update({
'album': mobj.group('album'.strip()),
'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
'artists': ([a] if (a := mobj.group('clean_artist'))
else [a.strip() for a in mobj.group('artist').split('·')]),
'track': mobj.group('track').strip(),
'release_date': release_date,
'release_year': int_or_none(release_year),
@@ -4564,7 +4567,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if mrr_title == 'Album':
info['album'] = mrr_contents_text
elif mrr_title == 'Artist':
info['artist'] = mrr_contents_text
info['artists'] = [mrr_contents_text] if mrr_contents_text else None
elif mrr_title == 'Song':
info['track'] = mrr_contents_text
owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 'videoOwnerRenderer', 'badges')))
@@ -4598,7 +4601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if fmt.get('protocol') == 'm3u8_native':
fmt['__needs_testing'] = True
for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
for s_k, d_k in [('artists', 'creators'), ('track', 'alt_title')]:
v = info.get(s_k)
if v:
info[d_k] = v

View File

@@ -0,0 +1,71 @@
from .common import InfoExtractor
from ..utils import merge_dicts, unified_timestamp, url_or_none
from ..utils.traversal import traverse_obj
class ZetlandDKArticleIE(InfoExtractor):
_VALID_URL = r'https?://www\.zetland\.dk/\w+/(?P<id>(?P<story_id>\w{8})-(?P<uploader_id>\w{8})-(?:\w{5}))'
_TESTS = [{
'url': 'https://www.zetland.dk/historie/sO9aq2MY-a81VP3BY-66e69?utm_source=instagram&utm_medium=linkibio&utm_campaign=artikel',
'info_dict': {
'id': 'sO9aq2MY-a81VP3BY-66e69',
'ext': 'mp3',
'modified_date': '20240118',
'title': 'Afsnit 1: “Det føltes som en kidnapning.” ',
'upload_date': '20240116',
'uploader_id': 'a81VP3BY',
'modified_timestamp': 1705568739,
'release_timestamp': 1705377592,
'uploader_url': 'https://www.zetland.dk/skribent/a81VP3BY',
'uploader': 'Helle Fuusager',
'release_date': '20240116',
'thumbnail': r're:https://zetland\.imgix\.net/2aafe500-b14e-11ee-bf83-65d5e1283a57/Zetland_Image_1\.jpg',
'description': 'md5:9619d426772c133f5abb26db27f26a01',
'timestamp': 1705377592,
'series_id': '62d54630-e87b-4ab1-a255-8de58dbe1b14',
}
}]
def _real_extract(self, url):
display_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
webpage = self._download_webpage(url, display_id)
next_js_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']
story_data = traverse_obj(next_js_data, ('initialState', 'consume', 'story', 'story'))
formats = []
for audio_url in traverse_obj(story_data, ('story_content', 'meta', 'audioFiles', ..., {url_or_none})):
formats.append({
'url': audio_url,
'vcodec': 'none',
})
return merge_dicts({
'id': display_id,
'formats': formats,
'uploader_id': uploader_id
}, traverse_obj(story_data, {
'title': ((('story_content', 'content', 'title'), 'title'), {str}),
'uploader': ('sharer', 'name'),
'uploader_id': ('sharer', 'sharer_id'),
'description': ('story_content', 'content', 'socialDescription'),
'series_id': ('story_content', 'meta', 'seriesId'),
'release_timestamp': ('published_at', {unified_timestamp}),
'modified_timestamp': ('revised_at', {unified_timestamp}),
}, get_all=False), traverse_obj(next_js_data, ('metaInfo', {
'title': ((('meta', 'title'), ('ld', 'headline'), ('og', 'og:title'), ('og', 'twitter:title')), {str}),
'description': ((('meta', 'description'), ('ld', 'description'), ('og', 'og:description'), ('og', 'twitter:description')), {str}),
'uploader': ((('meta', 'author'), ('ld', 'author', 'name')), {str}),
'uploader_url': ('ld', 'author', 'url', {url_or_none}),
'thumbnail': ((('ld', 'image'), ('og', 'og:image'), ('og', 'twitter:image')), {url_or_none}),
'modified_timestamp': ('ld', 'dateModified', {unified_timestamp}),
'release_timestamp': ('ld', 'datePublished', {unified_timestamp}),
'timestamp': ('ld', 'dateCreated', {unified_timestamp}),
}), get_all=False), {
'title': self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage),
'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage),
'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage),
'uploader': self._html_search_meta(['author'], webpage),
'release_timestamp': unified_timestamp(self._html_search_meta(['article:published_time'], webpage)),
}, self._search_json_ld(webpage, display_id, fatal=False))