diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 6aa52c595..ea391bc15 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -775,3 +775,7 @@ GeoffreyFrogeye Pawka v3DJG6GL yozel +brian6932 +iednod55 +maxbin123 +nullpos diff --git a/Changelog.md b/Changelog.md index 80b72da05..dd95abc86 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,61 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.06.09 + +#### Extractor changes +- [Improve JSON LD thumbnails extraction](https://github.com/yt-dlp/yt-dlp/commit/85c8a405e3651dc041b758f4744d4fb3c4c55e01) ([#13368](https://github.com/yt-dlp/yt-dlp/issues/13368)) by [bashonly](https://github.com/bashonly), [doe1080](https://github.com/doe1080) +- **10play**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6d265388c6e943419ac99e9151cf75a3265f980f) ([#13349](https://github.com/yt-dlp/yt-dlp/issues/13349)) by [bashonly](https://github.com/bashonly) +- **adobepass** + - [Add Fubo MSO](https://github.com/yt-dlp/yt-dlp/commit/eee90acc47d7f8de24afaa8b0271ccaefdf6e88c) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [maxbin123](https://github.com/maxbin123) + - [Always add newer user-agent when required](https://github.com/yt-dlp/yt-dlp/commit/0ee1102268cf31b07f8a8318a47424c66b2f7378) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) + - [Fix Philo MSO authentication](https://github.com/yt-dlp/yt-dlp/commit/943083edcd3df45aaa597a6967bc6c95b720f54c) ([#13335](https://github.com/yt-dlp/yt-dlp/issues/13335)) by [Sipherdrakon](https://github.com/Sipherdrakon) + - [Rework to require software statement](https://github.com/yt-dlp/yt-dlp/commit/711c5d5d098fee2992a1a624b1c4b30364b91426) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly), [maxbin123](https://github.com/maxbin123) + - [Validate login URL before sending 
credentials](https://github.com/yt-dlp/yt-dlp/commit/89c1b349ad81318d9d3bea76c01c891696e58d38) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) +- **aenetworks** + - [Fix playlist extractors](https://github.com/yt-dlp/yt-dlp/commit/f37d599a697e82fe68b423865897d55bae34f373) ([#13408](https://github.com/yt-dlp/yt-dlp/issues/13408)) by [Sipherdrakon](https://github.com/Sipherdrakon) + - [Fix provider-locked content extraction](https://github.com/yt-dlp/yt-dlp/commit/6693d6603358ae6beca834dbd822a7917498b813) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [maxbin123](https://github.com/maxbin123) +- **bilibilibangumi**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/13e55162719528d42d2133e16b65ff59a667a6e4) ([#13416](https://github.com/yt-dlp/yt-dlp/issues/13416)) by [c-basalt](https://github.com/c-basalt) +- **brightcove**: new: [Adapt to new AdobePass requirement](https://github.com/yt-dlp/yt-dlp/commit/98f8eec956e3b16cb66a3d49cc71af3807db795e) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) +- **cu.ntv.co.jp**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/aa863ddab9b1d104678e9cf39bb76f5b14fca660) ([#13302](https://github.com/yt-dlp/yt-dlp/issues/13302)) by [doe1080](https://github.com/doe1080), [nullpos](https://github.com/nullpos) +- **go**: [Fix provider-locked content extraction](https://github.com/yt-dlp/yt-dlp/commit/2e5bf002dad16f5ce35aa2023d392c9e518fcd8f) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly), [maxbin123](https://github.com/maxbin123) +- **nbc**: [Rework and adapt extractors to new AdobePass flow](https://github.com/yt-dlp/yt-dlp/commit/2d7949d5642bc37d1e71bf00c9a55260e5505d58) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) +- **nobelprize**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/97ddfefeb4faba6e61cd80996c16952b8eab16f3) ([#13205](https://github.com/yt-dlp/yt-dlp/issues/13205)) by [doe1080](https://github.com/doe1080) +- **odnoklassniki**: [Detect and raise when login is required](https://github.com/yt-dlp/yt-dlp/commit/148a1eb4c59e127965396c7a6e6acf1979de459e) ([#13361](https://github.com/yt-dlp/yt-dlp/issues/13361)) by [bashonly](https://github.com/bashonly) +- **patreon**: [Fix m3u8 formats extraction](https://github.com/yt-dlp/yt-dlp/commit/e0d6c0822930f6e63f574d46d946a58b73ecd10c) ([#13266](https://github.com/yt-dlp/yt-dlp/issues/13266)) by [bashonly](https://github.com/bashonly) (With fixes in [1a8a03e](https://github.com/yt-dlp/yt-dlp/commit/1a8a03ea8d827107319a18076ee3505090667c5a)) +- **podchaser**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/538eb305673c26bff6a2b12f1c96375fe02ce41a) ([#13271](https://github.com/yt-dlp/yt-dlp/issues/13271)) by [bashonly](https://github.com/bashonly) +- **sr**: mediathek: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/e3c605a61f4cc2de9059f37434fa108c3c20f58e) ([#13294](https://github.com/yt-dlp/yt-dlp/issues/13294)) by [doe1080](https://github.com/doe1080) +- **stacommu**: [Avoid partial stream formats](https://github.com/yt-dlp/yt-dlp/commit/5d96527be80dc1ed1702d9cd548ff86de570ad70) ([#13412](https://github.com/yt-dlp/yt-dlp/issues/13412)) by [bashonly](https://github.com/bashonly) +- **startrek**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a8bf0011bde92b3f1324a98bfbd38932fd3ebe18) ([#13188](https://github.com/yt-dlp/yt-dlp/issues/13188)) by [doe1080](https://github.com/doe1080) +- **svt**: play: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/e1b6062f8c4a3fa33c65269d48d09ec78de765a2) ([#13329](https://github.com/yt-dlp/yt-dlp/issues/13329)) by [barsnick](https://github.com/barsnick), [bashonly](https://github.com/bashonly) +- **telecinco**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/03dba2012d9bd3f402fa8c2f122afba89bbd22a4) ([#13379](https://github.com/yt-dlp/yt-dlp/issues/13379)) by [bashonly](https://github.com/bashonly) +- **theplatform**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/ed108b3ea481c6a4b5215a9302ba92d74baa2425) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) +- **toutiao**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f8051e3a61686c5db1de5f5746366ecfbc3ad20c) ([#13246](https://github.com/yt-dlp/yt-dlp/issues/13246)) by [doe1080](https://github.com/doe1080) +- **turner**: [Adapt extractors to new AdobePass flow](https://github.com/yt-dlp/yt-dlp/commit/0daddc780d3ac5bebc3a3ec5b884d9243cbc0745) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) +- **twitcasting**: [Fix password-protected livestream support](https://github.com/yt-dlp/yt-dlp/commit/52f9729c9a92ad4656d746ff0b1acecb87b3e96d) ([#13097](https://github.com/yt-dlp/yt-dlp/issues/13097)) by [bashonly](https://github.com/bashonly) +- **twitter**: broadcast: [Support events URLs](https://github.com/yt-dlp/yt-dlp/commit/7794374de8afb20499b023107e2abfd4e6b93ee4) ([#13248](https://github.com/yt-dlp/yt-dlp/issues/13248)) by [doe1080](https://github.com/doe1080) +- **umg**: de: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/4e7c1ea346b510280218b47e8653dbbca3a69870) ([#13373](https://github.com/yt-dlp/yt-dlp/issues/13373)) by [doe1080](https://github.com/doe1080) +- **vice**: [Mark extractors as broken](https://github.com/yt-dlp/yt-dlp/commit/6121559e027a04574690799c1776bc42bb51af31) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [bashonly](https://github.com/bashonly) +- **vimeo**: [Extract subtitles from player subdomain](https://github.com/yt-dlp/yt-dlp/commit/c723c4e5e78263df178dbe69844a3d05f3ef9e35) ([#13350](https://github.com/yt-dlp/yt-dlp/issues/13350)) by 
[bashonly](https://github.com/bashonly) +- **watchespn**: [Fix provider-locked content extraction](https://github.com/yt-dlp/yt-dlp/commit/b094747e93cfb0a2c53007120e37d0d84d41f030) ([#13131](https://github.com/yt-dlp/yt-dlp/issues/13131)) by [maxbin123](https://github.com/maxbin123) +- **weverse**: [Support login with oauth refresh tokens](https://github.com/yt-dlp/yt-dlp/commit/3fe72e9eea38d9a58211cde42cfaa577ce020e2c) ([#13284](https://github.com/yt-dlp/yt-dlp/issues/13284)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `tv_simply` player client](https://github.com/yt-dlp/yt-dlp/commit/1fd0e88b67db53ad163393d6965f68e908fa70e3) ([#13389](https://github.com/yt-dlp/yt-dlp/issues/13389)) by [gamer191](https://github.com/gamer191) + - [Extract srt subtitles](https://github.com/yt-dlp/yt-dlp/commit/231349786e8c42089c2e079ec94c0ea866c37999) ([#13411](https://github.com/yt-dlp/yt-dlp/issues/13411)) by [gamer191](https://github.com/gamer191) + - [Fix `--mark-watched` support](https://github.com/yt-dlp/yt-dlp/commit/b5be29fa58ec98226e11621fd9c58585bcff6879) ([#13222](https://github.com/yt-dlp/yt-dlp/issues/13222)) by [brian6932](https://github.com/brian6932), [iednod55](https://github.com/iednod55) + - [Fix automatic captions for some client combinations](https://github.com/yt-dlp/yt-dlp/commit/53ea743a9c158f8ca2d75a09ca44ba68606042d8) ([#13268](https://github.com/yt-dlp/yt-dlp/issues/13268)) by [bashonly](https://github.com/bashonly) + - [Improve signature extraction debug output](https://github.com/yt-dlp/yt-dlp/commit/d30a49742cfa22e61c47df4ac0e7334d648fb85d) ([#13327](https://github.com/yt-dlp/yt-dlp/issues/13327)) by [bashonly](https://github.com/bashonly) + - [Rework nsig function name extraction](https://github.com/yt-dlp/yt-dlp/commit/9e38b273b7ac942e7e9fc05a651ed810ab7d30ba) ([#13403](https://github.com/yt-dlp/yt-dlp/issues/13403)) by [Grub4K](https://github.com/Grub4K) + - [nsig code improvements and 
cleanup](https://github.com/yt-dlp/yt-dlp/commit/f7bbf5a617f9ab54ef51eaef99be36e175b5e9c3) ([#13280](https://github.com/yt-dlp/yt-dlp/issues/13280)) by [bashonly](https://github.com/bashonly) +- **zdf**: [Fix language extraction and format sorting](https://github.com/yt-dlp/yt-dlp/commit/db162b76f6bdece50babe2e0cacfe56888c2e125) ([#13313](https://github.com/yt-dlp/yt-dlp/issues/13313)) by [InvalidUsernameException](https://github.com/InvalidUsernameException) + +#### Misc. changes +- **build** + - [Exclude `pkg_resources` from being collected](https://github.com/yt-dlp/yt-dlp/commit/cc749a8a3b8b6e5c05318868c72a403f376a1b38) ([#13320](https://github.com/yt-dlp/yt-dlp/issues/13320)) by [bashonly](https://github.com/bashonly) + - [Fix macOS requirements caching](https://github.com/yt-dlp/yt-dlp/commit/201812100f315c6727a4418698d5b4e8a79863d4) ([#13328](https://github.com/yt-dlp/yt-dlp/issues/13328)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [339614a](https://github.com/yt-dlp/yt-dlp/commit/339614a173c74b42d63e858c446a9cae262a13af) by [bashonly](https://github.com/bashonly) +- **test**: postprocessors: [Remove binary thumbnail test data](https://github.com/yt-dlp/yt-dlp/commit/a9b370069838e84d44ac7ad095d657003665885a) ([#13341](https://github.com/yt-dlp/yt-dlp/issues/13341)) by [bashonly](https://github.com/bashonly) + ### 2025.05.22 #### Core changes diff --git a/README.md b/README.md index 6e2dc6243..0f9a7d556 100644 --- a/README.md +++ b/README.md @@ -1795,9 +1795,9 @@ # EXTRACTOR ARGUMENTS The following extractors use this feature: #### youtube -* `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. 
See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes +* `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube/_base.py](https://github.com/yt-dlp/yt-dlp/blob/415b4c9f955b1a0391204bd24a7132590e7b3bdb/yt_dlp/extractor/youtube/_base.py#L402-L409) for the list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` +* `player_client`: Clients to extract video data from. 
The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv`, `tv_simply` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. 
The default is to use what is prescribed by the site, and can be selected with `actual` diff --git a/supportedsites.md b/supportedsites.md index c2d7b4555..1fe381603 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -5,6 +5,8 @@ # Supported sites Not all sites listed here are guaranteed to work; websites are constantly changing and sometimes this breaks yt-dlp's support for them. The only reliable way to check if a site is supported is to try it. + - **10play**: [*10play*](## "netrc machine") + - **10play:season** - **17live** - **17live:clip** - **17live:vod** @@ -295,7 +297,7 @@ # Supported sites - **CNNIndonesia** - **ComedyCentral** - **ComedyCentralTV** - - **ConanClassic** + - **ConanClassic**: (**Currently broken**) - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** - **CookingChannel** @@ -317,7 +319,7 @@ # Supported sites - **CtsNews**: 華視新聞 - **CTV** - **CTVNews** - - **cu.ntv.co.jp**: Nippon Television Network + - **cu.ntv.co.jp**: 日テレ無料TADA! 
- **CultureUnplugged** - **curiositystream**: [*curiositystream*](## "netrc machine") - **curiositystream:collections**: [*curiositystream*](## "netrc machine") @@ -882,19 +884,19 @@ # Supported sites - **Naver** - **Naver:live** - **navernow** - - **nba** - - **nba:channel** - - **nba:embed** - - **nba:watch** - - **nba:​watch:collection** - - **nba:​watch:embed** + - **nba**: (**Currently broken**) + - **nba:channel**: (**Currently broken**) + - **nba:embed**: (**Currently broken**) + - **nba:watch**: (**Currently broken**) + - **nba:​watch:collection**: (**Currently broken**) + - **nba:​watch:embed**: (**Currently broken**) - **NBC** - **NBCNews** - **nbcolympics** - - **nbcolympics:stream** - - **NBCSports** - - **NBCSportsStream** - - **NBCSportsVPlayer** + - **nbcolympics:stream**: (**Currently broken**) + - **NBCSports**: (**Currently broken**) + - **NBCSportsStream**: (**Currently broken**) + - **NBCSportsVPlayer**: (**Currently broken**) - **NBCStations** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** @@ -970,7 +972,7 @@ # Supported sites - **Nitter** - **njoy**: N-JOY - **njoy:embed** - - **NobelPrize**: (**Currently broken**) + - **NobelPrize** - **NoicePodcast** - **NonkTube** - **NoodleMagazine** @@ -1393,14 +1395,14 @@ # Supported sites - **SpreakerShow** - **SpringboardPlatform** - **SproutVideo** - - **sr:mediathek**: Saarländischer Rundfunk (**Currently broken**) + - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **StacommuLive**: [*stacommu*](## "netrc machine") - **StacommuVOD**: [*stacommu*](## "netrc machine") - **StagePlusVODConcert**: [*stageplus*](## "netrc machine") - **stanfordoc**: Stanford Open ClassRoom - - **StarTrek**: (**Currently broken**) + - **startrek**: STAR TREK - **startv** - **Steam** - **SteamCommunityBroadcast** @@ -1423,12 +1425,11 @@ # Supported sites - **SunPorno** - **sverigesradio:episode** - **sverigesradio:publication** 
- - **SVT** - - **SVTPage** - - **SVTPlay**: SVT Play and Öppet arkiv - - **SVTSeries** + - **svt:page** + - **svt:play**: SVT Play and Öppet arkiv + - **svt:​play:series** - **SwearnetEpisode** - - **Syfy**: (**Currently broken**) + - **Syfy** - **SYVDK** - **SztvHu** - **t-online.de**: (**Currently broken**) @@ -1472,8 +1473,6 @@ # Supported sites - **Telewebion**: (**Currently broken**) - **Tempo** - **TennisTV**: [*tennistv*](## "netrc machine") - - **TenPlay**: [*10play*](## "netrc machine") - - **TenPlaySeason** - **TF1** - **TFO** - **theatercomplextown:ppv**: [*theatercomplextown*](## "netrc machine") @@ -1511,6 +1510,7 @@ # Supported sites - **tokfm:podcast** - **ToonGoggles** - **tou.tv**: [*toutv*](## "netrc machine") + - **toutiao**: 今日头条 - **Toypics**: Toypics video (**Currently broken**) - **ToypicsUser**: Toypics user profile (**Currently broken**) - **TrailerAddict**: (**Currently broken**) @@ -1600,7 +1600,7 @@ # Supported sites - **UKTVPlay** - **UlizaPlayer** - **UlizaPortal**: ulizaportal.jp - - **umg:de**: Universal Music Deutschland (**Currently broken**) + - **umg:de**: Universal Music Deutschland - **Unistra** - **Unity**: (**Currently broken**) - **uol.com.br** @@ -1623,9 +1623,9 @@ # Supported sites - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **vhx:embed**: [*vimeo*](## "netrc machine") - - **vice** - - **vice:article** - - **vice:show** + - **vice**: (**Currently broken**) + - **vice:article**: (**Currently broken**) + - **vice:show**: (**Currently broken**) - **Viddler** - **Videa** - **video.arnes.si**: Arnes Video diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index bc89b2955..e6c8d574e 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1947,6 +1947,137 @@ def test_search_nextjs_data(self): with self.assertWarns(DeprecationWarning): self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) + def test_search_nuxt_json(self): + HTML_TMPL = '<script data-ssr="true" id="__NUXT_DATA__" type="application/json">[{}]</script>' + 
VALID_DATA = ''' + ["ShallowReactive",1], + {"data":2,"state":21,"once":25,"_errors":28,"_server_errors":30}, + ["ShallowReactive",3], + {"$abcdef123456":4}, + {"podcast":5,"activeEpisodeData":7}, + {"podcast":6,"seasons":14}, + {"title":10,"id":11}, + ["Reactive",8], + {"episode":9,"creators":18,"empty_list":20}, + {"title":12,"id":13,"refs":34,"empty_refs":35}, + "Series Title", + "podcast-id-01", + "Episode Title", + "episode-id-99", + [15,16,17], + 1, + 2, + 3, + [19], + "Podcast Creator", + [], + {"$ssite-config":22}, + {"env":23,"name":24,"map":26,"numbers":14}, + "production", + "podcast-website", + ["Set"], + ["Reactive",27], + ["Map"], + ["ShallowReactive",29], + {}, + ["NuxtError",31], + {"status":32,"message":33}, + 503, + "Service Unavailable", + [36,37], + [38,39], + ["Ref",40], + ["ShallowRef",41], + ["EmptyRef",42], + ["EmptyShallowRef",43], + "ref", + "shallow_ref", + "{\\"ref\\":1}", + "{\\"shallow_ref\\":2}" + ''' + PAYLOAD = { + 'data': { + '$abcdef123456': { + 'podcast': { + 'podcast': { + 'title': 'Series Title', + 'id': 'podcast-id-01', + }, + 'seasons': [1, 2, 3], + }, + 'activeEpisodeData': { + 'episode': { + 'title': 'Episode Title', + 'id': 'episode-id-99', + 'refs': ['ref', 'shallow_ref'], + 'empty_refs': [{'ref': 1}, {'shallow_ref': 2}], + }, + 'creators': ['Podcast Creator'], + 'empty_list': [], + }, + }, + }, + 'state': { + '$ssite-config': { + 'env': 'production', + 'name': 'podcast-website', + 'map': [], + 'numbers': [1, 2, 3], + }, + }, + 'once': [], + '_errors': {}, + '_server_errors': { + 'status': 503, + 'message': 'Service Unavailable', + }, + } + PARTIALLY_INVALID = [( + ''' + {"data":1}, + {"invalid_raw_list":2}, + [15,16,17] + ''', + {'data': {'invalid_raw_list': [None, None, None]}}, + ), ( + ''' + {"data":1}, + ["EmptyRef",2], + "not valid JSON" + ''', + {'data': None}, + ), ( + ''' + {"data":1}, + ["EmptyShallowRef",2], + "not valid JSON" + ''', + {'data': None}, + )] + INVALID = [ + ''' + [] + ''', + ''' + 
["unsupported",1], + {"data":2}, + {} + ''', + ] + DEFAULT = object() + + self.assertEqual(self.ie._search_nuxt_json(HTML_TMPL.format(VALID_DATA), None), PAYLOAD) + self.assertEqual(self.ie._search_nuxt_json('', None, fatal=False), {}) + self.assertIs(self.ie._search_nuxt_json('', None, default=DEFAULT), DEFAULT) + + for data, expected in PARTIALLY_INVALID: + self.assertEqual( + self.ie._search_nuxt_json(HTML_TMPL.format(data), None, fatal=False), expected) + + for data in INVALID: + self.assertIs( + self.ie._search_nuxt_json(HTML_TMPL.format(data), None, default=DEFAULT), DEFAULT) + if __name__ == '__main__': unittest.main() diff --git a/test/test_devalue.py b/test/test_devalue.py new file mode 100644 index 000000000..29eb89e87 --- /dev/null +++ b/test/test_devalue.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 + +# Allow direct execution +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +import datetime as dt +import json +import math +import re +import unittest + +from yt_dlp.utils.jslib import devalue + + +TEST_CASES_EQUALS = [{ + 'name': 'int', + 'unparsed': [-42], + 'parsed': -42, +}, { + 'name': 'str', + 'unparsed': ['woo!!!'], + 'parsed': 'woo!!!', +}, { + 'name': 'Number', + 'unparsed': [['Object', 42]], + 'parsed': 42, +}, { + 'name': 'String', + 'unparsed': [['Object', 'yar']], + 'parsed': 'yar', +}, { + 'name': 'Infinity', + 'unparsed': -4, + 'parsed': math.inf, +}, { + 'name': 'negative Infinity', + 'unparsed': -5, + 'parsed': -math.inf, +}, { + 'name': 'negative zero', + 'unparsed': -6, + 'parsed': -0.0, +}, { + 'name': 'RegExp', + 'unparsed': [['RegExp', 'regexp', 'gim']], # XXX: flags are ignored + 'parsed': re.compile('regexp'), +}, { + 'name': 'Date', + 'unparsed': [['Date', '2001-09-09T01:46:40.000Z']], + 'parsed': dt.datetime.fromtimestamp(1e9, tz=dt.timezone.utc), +}, { + 'name': 'Array', + 'unparsed': [[1, 2, 3], 'a', 'b', 'c'], + 'parsed': ['a', 'b', 'c'], +}, { + 'name': 'Array 
(empty)', + 'unparsed': [[]], + 'parsed': [], +}, { + 'name': 'Array (sparse)', + 'unparsed': [[-2, 1, -2], 'b'], + 'parsed': [None, 'b', None], +}, { + 'name': 'Object', + 'unparsed': [{'foo': 1, 'x-y': 2}, 'bar', 'z'], + 'parsed': {'foo': 'bar', 'x-y': 'z'}, +}, { + 'name': 'Set', + 'unparsed': [['Set', 1, 2, 3], 1, 2, 3], + 'parsed': [1, 2, 3], +}, { + 'name': 'Map', + 'unparsed': [['Map', 1, 2], 'a', 'b'], + 'parsed': [['a', 'b']], +}, { + 'name': 'BigInt', + 'unparsed': [['BigInt', '1']], + 'parsed': 1, +}, { + 'name': 'Uint8Array', + 'unparsed': [['Uint8Array', 'AQID']], + 'parsed': [1, 2, 3], +}, { + 'name': 'ArrayBuffer', + 'unparsed': [['ArrayBuffer', 'AQID']], + 'parsed': [1, 2, 3], +}, { + 'name': 'str (repetition)', + 'unparsed': [[1, 1], 'a string'], + 'parsed': ['a string', 'a string'], +}, { + 'name': 'None (repetition)', + 'unparsed': [[1, 1], None], + 'parsed': [None, None], +}, { + 'name': 'dict (repetition)', + 'unparsed': [[1, 1], {}], + 'parsed': [{}, {}], +}, { + 'name': 'Object without prototype', + 'unparsed': [['null']], + 'parsed': {}, +}, { + 'name': 'cross-realm POJO', + 'unparsed': [{}], + 'parsed': {}, +}] + +TEST_CASES_IS = [{ + 'name': 'bool', + 'unparsed': [True], + 'parsed': True, +}, { + 'name': 'Boolean', + 'unparsed': [['Object', False]], + 'parsed': False, +}, { + 'name': 'undefined', + 'unparsed': -1, + 'parsed': None, +}, { + 'name': 'null', + 'unparsed': [None], + 'parsed': None, +}, { + 'name': 'NaN', + 'unparsed': -3, + 'parsed': math.nan, +}] + +TEST_CASES_INVALID = [{ + 'name': 'empty string', + 'unparsed': '', + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'hole', + 'unparsed': -2, + 'error': ValueError, + 'pattern': r'invalid integer input', +}, { + 'name': 'string', + 'unparsed': 'hello', + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'number', + 'unparsed': 42, + 'error': ValueError, + 'pattern': r'invalid integer input', +}, { + 'name': 
'boolean', + 'unparsed': True, + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'null', + 'unparsed': None, + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'object', + 'unparsed': {}, + 'error': ValueError, + 'pattern': r'expected int or list as input', +}, { + 'name': 'empty array', + 'unparsed': [], + 'error': ValueError, + 'pattern': r'expected a non-empty list as input', +}, { + 'name': 'Python negative indexing', + 'unparsed': [[1, 2, 3, 4, 5, 6, 7, -7], 1, 2, 3, 4, 5, 6, 7], + 'error': IndexError, + 'pattern': r'invalid index: -7', +}] + + +class TestDevalue(unittest.TestCase): + def test_devalue_parse_equals(self): + for tc in TEST_CASES_EQUALS: + self.assertEqual(devalue.parse(tc['unparsed']), tc['parsed'], tc['name']) + + def test_devalue_parse_is(self): + for tc in TEST_CASES_IS: + self.assertIs(devalue.parse(tc['unparsed']), tc['parsed'], tc['name']) + + def test_devalue_parse_invalid(self): + for tc in TEST_CASES_INVALID: + with self.assertRaisesRegex(tc['error'], tc['pattern'], msg=tc['name']): + devalue.parse(tc['unparsed']) + + def test_devalue_parse_cyclical(self): + name = 'Map (cyclical)' + result = devalue.parse([['Map', 1, 0], 'self']) + self.assertEqual(result[0][0], 'self', name) + self.assertIs(result, result[0][1], name) + + name = 'Set (cyclical)' + result = devalue.parse([['Set', 0, 1], 42]) + self.assertEqual(result[1], 42, name) + self.assertIs(result, result[0], name) + + result = devalue.parse([[0]]) + self.assertIs(result, result[0], 'Array (cyclical)') + + name = 'Object (cyclical)' + result = devalue.parse([{'self': 0}]) + self.assertIs(result, result['self'], name) + + name = 'Object with null prototype (cyclical)' + result = devalue.parse([['null', 'self', 0]]) + self.assertIs(result, result['self'], name) + + name = 'Objects (cyclical)' + result = devalue.parse([[1, 2], {'second': 2}, {'first': 1}]) + self.assertIs(result[0], result[1]['first'], name) + 
self.assertIs(result[1], result[0]['second'], name) + + def test_devalue_parse_revivers(self): + self.assertEqual( + devalue.parse([['indirect', 1], {'a': 2}, 'b'], revivers={'indirect': lambda x: x}), + {'a': 'b'}, 'revivers (indirect)') + + self.assertEqual( + devalue.parse([['parse', 1], '{"a":0}'], revivers={'parse': lambda x: json.loads(x)}), + {'a': 0}, 'revivers (parse)') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_pot/test_pot_builtin_utils.py b/test/test_pot/test_pot_builtin_utils.py index a95fc4e15..7645ba601 100644 --- a/test/test_pot/test_pot_builtin_utils.py +++ b/test/test_pot/test_pot_builtin_utils.py @@ -11,7 +11,7 @@ class TestGetWebPoContentBinding: @pytest.mark.parametrize('client_name, context, is_authenticated, expected', [ *[(client, context, is_authenticated, expected) for client in [ - 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER'] + 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'TVHTML5_SIMPLY'] for context, is_authenticated, expected in [ (PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), (PoTokenContext.PLAYER, False, ('example-video-id', ContentBindingType.VIDEO_ID)), diff --git a/test/test_pot/test_pot_builtin_webpospec.py b/test/test_pot/test_pot_builtin_webpospec.py index c5fb6f382..078008415 100644 --- a/test/test_pot/test_pot_builtin_webpospec.py +++ b/test/test_pot/test_pot_builtin_webpospec.py @@ -49,7 +49,7 @@ def test_not_supports(self, ie, logger, pot_request, client_name, context, is_au @pytest.mark.parametrize('client_name, context, is_authenticated, remote_host, source_address, request_proxy, expected', [ *[(client, context, is_authenticated, remote_host, source_address, request_proxy, expected) for client in [ - 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER'] + 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 
'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'TVHTML5_SIMPLY'] for context, is_authenticated, remote_host, source_address, request_proxy, expected in [ (PoTokenContext.GVS, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id'}), (PoTokenContext.PLAYER, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'video_id'}), diff --git a/test/test_traversal.py b/test/test_traversal.py index bc433029d..52215f5a7 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -416,18 +416,8 @@ def test_traversal_unbranching(self): '`any` should allow further branching' def test_traversal_morsel(self): - values = { - 'expires': 'a', - 'path': 'b', - 'comment': 'c', - 'domain': 'd', - 'max-age': 'e', - 'secure': 'f', - 'httponly': 'g', - 'version': 'h', - 'samesite': 'i', - } morsel = http.cookies.Morsel() + values = dict(zip(morsel, 'abcdefghijklmnop')) morsel.set('item_key', 'item_value', 'coded_value') morsel.update(values) values['key'] = 'item_key' diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 3f777aed7..3336b6bff 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -320,6 +320,14 @@ 'https://www.youtube.com/s/player/59b252b9/player_ias.vflset/en_US/base.js', 'D3XWVpYgwhLLKNK4AGX', 'aZrQ1qWJ5yv5h', ), + ( + 'https://www.youtube.com/s/player/fc2a56a5/player_ias.vflset/en_US/base.js', + 'qTKWg_Il804jd2kAC', 'OtUAm2W6gyzJjB9u', + ), + ( + 'https://www.youtube.com/s/player/fc2a56a5/tv-player-ias.vflset/tv-player-ias.js', + 'qTKWg_Il804jd2kAC', 'OtUAm2W6gyzJjB9u', + ), ] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index ea6264a0d..309489672 100644 --- 
a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -490,7 +490,7 @@ class YoutubeDL: The template is mapped on a dictionary with keys 'progress' and 'info' retry_sleep_functions: Dictionary of functions that takes the number of attempts as argument and returns the time to sleep in seconds. - Allowed keys are 'http', 'fragment', 'file_access' + Allowed keys are 'http', 'fragment', 'file_access', 'extractor' download_ranges: A callback function that gets called for every video with the signature (info_dict, ydl) -> Iterable[Section]. Only the returned sections will be downloaded. diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index c6a1b1509..e5c922b41 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -1,3 +1,5 @@ +import json + from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, @@ -6,7 +8,6 @@ remove_start, traverse_obj, update_url_query, - urlencode_postdata, ) @@ -204,18 +205,19 @@ def _real_extract(self, url): class AENetworksListBaseIE(AENetworksBaseIE): def _call_api(self, resource, slug, brand, fields): return self._download_json( - 'https://yoga.appsvcs.aetnd.com/graphql', - slug, query={'brand': brand}, data=urlencode_postdata({ + 'https://yoga.appsvcs.aetnd.com/graphql', slug, + query={'brand': brand}, headers={'Content-Type': 'application/json'}, + data=json.dumps({ 'query': '''{ %s(slug: "%s") { %s } }''' % (resource, slug, fields), # noqa: UP031 - }))['data'][resource] + }).encode())['data'][resource] def _real_extract(self, url): domain, slug = self._match_valid_url(url).groups() - _, brand = self._DOMAIN_MAP[domain] + _, brand, _ = self._DOMAIN_MAP[domain] playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) base_url = f'http://watch.{domain}' diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 6508942a4..43c9000ce 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -816,6 +816,26 @@ class 
BiliBiliBangumiIE(BilibiliBaseIE): 'upload_date': '20111104', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, + }, { + 'note': 'new playurlSSRData scheme', + 'url': 'https://www.bilibili.com/bangumi/play/ep678060', + 'info_dict': { + 'id': '678060', + 'ext': 'mp4', + 'series': '去你家吃饭好吗', + 'series_id': '6198', + 'season': '第二季', + 'season_id': '42542', + 'season_number': 2, + 'episode': '吴老二:你家大公鸡养不熟,能煮熟吗…', + 'episode_id': '678060', + 'episode_number': 61, + 'title': '一只小九九丫 吴老二:你家大公鸡养不熟,能煮熟吗…', + 'duration': 266.123, + 'timestamp': 1663315904, + 'upload_date': '20220916', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, }, { 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { @@ -879,12 +899,26 @@ def _real_extract(self, url): 'Extracting episode', query={'fnval': 12240, 'ep_id': episode_id}, headers=headers)) + geo_blocked = traverse_obj(play_info, ( + 'raw', 'data', 'plugins', lambda _, v: v['name'] == 'AreaLimitPanel', 'config', 'is_block', {bool}, any)) premium_only = play_info.get('code') == -10403 - play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} - formats = self.extract_formats(play_info) - if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): - self.raise_login_required('This video is for premium members only') + video_info = traverse_obj(play_info, (('result', ('raw', 'data')), 'video_info', {dict}, any)) or {} + formats = self.extract_formats(video_info) + + if not formats: + if geo_blocked: + self.raise_geo_restricted() + elif premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage: + self.raise_login_required('This video is for premium members only') + + if traverse_obj(play_info, (( + ('result', 'play_check', 'play_detail'), # 'PLAY_PREVIEW' vs 'PLAY_WHOLE' + ('raw', 'data', 'play_video_type'), # 'preview' vs 'whole' + ), any, {lambda x: x in ('PLAY_PREVIEW', 'preview')})): + self.report_warning( + 'Only preview format is available, ' + f'you have to 
become a premium member to access full video. {self._login_hint()}') bangumi_info = self._download_json( 'https://api.bilibili.com/pgc/view/web/season', episode_id, 'Get episode details', @@ -922,7 +956,7 @@ def _real_extract(self, url): 'season': str_or_none(season_title), 'season_id': str_or_none(season_id), 'season_number': season_number, - 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'duration': float_or_none(video_info.get('timelength'), scale=1000), 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index d4ac7a0c2..c0f2f8b57 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -495,8 +495,6 @@ def _real_extract(self, url): class BrightcoveNewBaseIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): - title = json_data['name'].strip() - formats, subtitles = [], {} sources = json_data.get('sources') or [] for source in sources: @@ -600,16 +598,18 @@ def build_format_id(kind): return { 'id': video_id, - 'title': title, - 'description': clean_html(json_data.get('description')), 'thumbnails': thumbnails, 'duration': duration, - 'timestamp': parse_iso8601(json_data.get('published_at')), - 'uploader_id': json_data.get('account_id'), 'formats': formats, 'subtitles': subtitles, - 'tags': json_data.get('tags', []), 'is_live': is_live, + **traverse_obj(json_data, { + 'title': ('name', {clean_html}), + 'description': ('description', {clean_html}), + 'tags': ('tags', ..., {str}, filter, all, filter), + 'timestamp': ('published_at', {parse_iso8601}), + 'uploader_id': ('account_id', {str}), + }), } @@ -645,10 +645,7 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): 'uploader_id': '4036320279001', 'formats': 'mincount:39', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': '404 
Not Found', }, { # playlist stream 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', @@ -709,7 +706,6 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): 'ext': 'mp4', 'title': 'TGD_01-032_5', 'thumbnail': r're:^https?://.*\.jpg$', - 'tags': [], 'timestamp': 1646078943, 'uploader_id': '1569565978001', 'upload_date': '20220228', @@ -721,7 +717,6 @@ class BrightcoveNewIE(BrightcoveNewBaseIE): 'ext': 'mp4', 'title': 'TGD 01-087 (Airs 05.25.22)_Segment 5', 'thumbnail': r're:^https?://.*\.jpg$', - 'tags': [], 'timestamp': 1651604591, 'uploader_id': '1569565978001', 'upload_date': '20220503', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 1174bd4f5..6058f66ae 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -101,6 +101,7 @@ xpath_with_ns, ) from ..utils._utils import _request_dump_filename +from ..utils.jslib import devalue class InfoExtractor: @@ -1795,6 +1796,63 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) return traverse_obj(ret, traverse) or {} + def _resolve_nuxt_array(self, array, video_id, *, fatal=True, default=NO_DEFAULT): + """Resolves Nuxt rich JSON payload arrays""" + # Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57 + # https://github.com/nuxt/nuxt/pull/19205 + if default is not NO_DEFAULT: + fatal = False + + if not isinstance(array, list) or not array: + error_msg = 'Unable to resolve Nuxt JSON data: invalid input' + if fatal: + raise ExtractorError(error_msg, video_id=video_id) + elif default is NO_DEFAULT: + self.report_warning(error_msg, video_id=video_id) + return {} if default is NO_DEFAULT else default + + def indirect_reviver(data): + return data + + def json_reviver(data): + return json.loads(data) + + gen = devalue.parse_iter(array, revivers={ + 'NuxtError': 
indirect_reviver, + 'EmptyShallowRef': json_reviver, + 'EmptyRef': json_reviver, + 'ShallowRef': indirect_reviver, + 'ShallowReactive': indirect_reviver, + 'Ref': indirect_reviver, + 'Reactive': indirect_reviver, + }) + + while True: + try: + error_msg = f'Error resolving Nuxt JSON: {gen.send(None)}' + if fatal: + raise ExtractorError(error_msg, video_id=video_id) + elif default is NO_DEFAULT: + self.report_warning(error_msg, video_id=video_id, only_once=True) + else: + self.write_debug(f'{video_id}: {error_msg}', only_once=True) + except StopIteration as error: + return error.value or ({} if default is NO_DEFAULT else default) + + def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT): + """Parses metadata from Nuxt rich JSON payloads embedded in HTML""" + passed_default = default is not NO_DEFAULT + + array = self._search_json( + r']+\bid="__NUXT_DATA__"[^>]*>', webpage, + 'Nuxt JSON data', video_id, contains_pattern=r'\[(?s:.+)\]', + fatal=fatal, default=NO_DEFAULT if not passed_default else None) + + if not array: + return default if passed_default else {} + + return self._resolve_nuxt_array(array, video_id, fatal=fatal, default=default) + @staticmethod def _hidden_inputs(html): html = re.sub(r'', '', html) diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index e36eac919..68ace240c 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -206,7 +206,7 @@ def _real_extract(self, url): 'is_live': True, **traverse_obj(room, { 'display_id': ('url', {str}, {lambda i: i[1:]}), - 'title': ('room_name', {unescapeHTML}), + 'title': ('room_name', {str}, {unescapeHTML}), 'description': ('show_details', {str}), 'uploader': ('nickname', {str}), 'thumbnail': ('room_src', {url_or_none}), diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py index edd66e46c..fb8a8e87c 100644 --- a/yt_dlp/extractor/dreisat.py +++ b/yt_dlp/extractor/dreisat.py @@ -64,7 +64,7 @@ class DreiSatIE(ZDFBaseIE): 
'title': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1', 'description': 'md5:bae51bfc22f15563ce3acbf97d2e8844', 'duration': 5399.0, - 'thumbnail': 'https://www.3sat.de/assets/buchmesse-kerkeling-100~original?cb=1743329640903', + 'thumbnail': 'https://www.3sat.de/assets/buchmesse-kerkeling-100~original?cb=1747256996338', 'chapters': 'count:24', 'episode': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1', 'episode_id': 'POS_1ef236cc-b390-401e-acd0-4fb4b04315fb', diff --git a/yt_dlp/extractor/hypergryph.py b/yt_dlp/extractor/hypergryph.py index 1fb2e9a98..f405d14b5 100644 --- a/yt_dlp/extractor/hypergryph.py +++ b/yt_dlp/extractor/hypergryph.py @@ -1,32 +1,66 @@ from .common import InfoExtractor -from ..utils import js_to_json, traverse_obj +from ..utils import ( + ExtractorError, + clean_html, + url_or_none, +) +from ..utils.traversal import subs_list_to_dict, traverse_obj class MonsterSirenHypergryphMusicIE(InfoExtractor): + IE_NAME = 'monstersiren' + IE_DESC = '塞壬唱片' + _API_BASE = 'https://monster-siren.hypergryph.com/api' _VALID_URL = r'https?://monster-siren\.hypergryph\.com/music/(?P\d+)' _TESTS = [{ 'url': 'https://monster-siren.hypergryph.com/music/514562', 'info_dict': { 'id': '514562', 'ext': 'wav', - 'artists': ['塞壬唱片-MSR'], - 'album': 'Flame Shadow', 'title': 'Flame Shadow', + 'album': 'Flame Shadow', + 'artists': ['塞壬唱片-MSR'], + 'description': 'md5:19e2acfcd1b65b41b29e8079ab948053', + 'thumbnail': r're:https?://web\.hycdn\.cn/siren/pic/.+\.jpg', + }, + }, { + 'url': 'https://monster-siren.hypergryph.com/music/514518', + 'info_dict': { + 'id': '514518', + 'ext': 'wav', + 'title': 'Heavenly Me (Instrumental)', + 'album': 'Heavenly Me', + 'artists': ['塞壬唱片-MSR', 'AIYUE blessed : 理名'], + 'description': 'md5:ce790b41c932d1ad72eb791d1d8ae598', + 'thumbnail': r're:https?://web\.hycdn\.cn/siren/pic/.+\.jpg', }, }] def _real_extract(self, url): audio_id = self._match_id(url) - webpage = self._download_webpage(url, audio_id) - 
json_data = self._search_json( - r'window\.g_initialProps\s*=', webpage, 'data', audio_id, transform_source=js_to_json) + song = self._download_json(f'{self._API_BASE}/song/{audio_id}', audio_id) + if traverse_obj(song, 'code') != 0: + msg = traverse_obj(song, ('msg', {str}, filter)) + raise ExtractorError( + msg or 'API returned an error response', expected=bool(msg)) + + album = None + if album_id := traverse_obj(song, ('data', 'albumCid', {str})): + album = self._download_json( + f'{self._API_BASE}/album/{album_id}/detail', album_id, fatal=False) return { 'id': audio_id, - 'title': traverse_obj(json_data, ('player', 'songDetail', 'name')), - 'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')), - 'ext': 'wav', 'vcodec': 'none', - 'artists': traverse_obj(json_data, ('player', 'songDetail', 'artists', ...)), - 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')), + **traverse_obj(song, ('data', { + 'title': ('name', {str}), + 'artists': ('artists', ..., {str}), + 'subtitles': ({'url': 'lyricUrl'}, all, {subs_list_to_dict(lang='en')}), + 'url': ('sourceUrl', {url_or_none}), + })), + **traverse_obj(album, ('data', { + 'album': ('name', {str}), + 'description': ('intro', {clean_html}), + 'thumbnail': ('coverUrl', {url_or_none}), + })), } diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py index 55fa83b51..0dded38c6 100644 --- a/yt_dlp/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py @@ -1,7 +1,5 @@ from .telecinco import TelecincoBaseIE -from ..networking.exceptions import HTTPError from ..utils import ( - ExtractorError, int_or_none, parse_iso8601, ) @@ -81,17 +79,7 @@ class MiTeleIE(TelecincoBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - - try: # yt-dlp's default user-agents are too old and blocked by akamai - webpage = self._download_webpage(url, display_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', - }) - except ExtractorError 
as e: - if not isinstance(e.cause, HTTPError) or e.cause.status != 403: - raise - # Retry with impersonation if hardcoded UA is insufficient to bypass akamai - webpage = self._download_webpage(url, display_id, impersonate=True) - + webpage = self._download_akamai_webpage(url, display_id) pre_player = self._search_json( r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=', webpage, 'Pre Player', display_id)['prePlayer'] diff --git a/yt_dlp/extractor/nobelprize.py b/yt_dlp/extractor/nobelprize.py index 536ca27f7..833bab094 100644 --- a/yt_dlp/extractor/nobelprize.py +++ b/yt_dlp/extractor/nobelprize.py @@ -1,59 +1,57 @@ from .common import InfoExtractor from ..utils import ( - determine_ext, - get_element_by_attribute, + UnsupportedError, + clean_html, int_or_none, - js_to_json, - mimetype2ext, - update_url_query, + parse_duration, + parse_qs, + str_or_none, + update_url, ) +from ..utils.traversal import find_element, traverse_obj class NobelPrizeIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?nobelprize\.org/mediaplayer.*?\bid=(?P\d+)' - _TEST = { - 'url': 'http://www.nobelprize.org/mediaplayer/?id=2636', - 'md5': '04c81e5714bb36cc4e2232fee1d8157f', + _VALID_URL = r'https?://(?:(?:mediaplayer|www)\.)?nobelprize\.org/mediaplayer/' + _TESTS = [{ + 'url': 'https://www.nobelprize.org/mediaplayer/?id=2636', 'info_dict': { 'id': '2636', 'ext': 'mp4', 'title': 'Announcement of the 2016 Nobel Prize in Physics', - 'description': 'md5:05beba57f4f5a4bbd4cf2ef28fcff739', + 'description': 'md5:1a2d8a6ca80c88fb3b9a326e0b0e8e43', + 'duration': 1560.0, + 'thumbnail': r're:https?://www\.nobelprize\.org/images/.+\.jpg', + 'timestamp': 1504883793, + 'upload_date': '20170908', }, - } + }, { + 'url': 'https://mediaplayer.nobelprize.org/mediaplayer/?qid=12693', + 'info_dict': { + 'id': '12693', + 'ext': 'mp4', + 'title': 'Nobel Lecture by Peter Higgs', + 'description': 'md5:9b12e275dbe3a8138484e70e00673a05', + 'duration': 1800.0, + 'thumbnail': 
r're:https?://www\.nobelprize\.org/images/.+\.jpg', + 'timestamp': 1504883793, + 'upload_date': '20170908', + }, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - media = self._parse_json(self._search_regex( - r'(?s)var\s*config\s*=\s*({.+?});', webpage, - 'config'), video_id, js_to_json)['media'] - title = media['title'] - - formats = [] - for source in media.get('source', []): - source_src = source.get('src') - if not source_src: - continue - ext = mimetype2ext(source.get('type')) or determine_ext(source_src) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(source_src, {'hdcore': '3.7.0'}), - video_id, f4m_id='hds', fatal=False)) - else: - formats.append({ - 'url': source_src, - }) + video_id = traverse_obj(parse_qs(url), ( + ('id', 'qid'), -1, {int_or_none}, {str_or_none}, any)) + if not video_id: + raise UnsupportedError(url) + webpage = self._download_webpage( + update_url(url, netloc='mediaplayer.nobelprize.org'), video_id) return { + **self._search_json_ld(webpage, video_id), 'id': video_id, - 'title': title, - 'description': get_element_by_attribute('itemprop', 'description', webpage), - 'duration': int_or_none(media.get('duration')), - 'formats': formats, + 'title': self._html_search_meta('caption', webpage), + 'description': traverse_obj(webpage, ( + {find_element(tag='span', attr='itemprop', value='description')}, {clean_html})), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), } diff --git a/yt_dlp/extractor/ntvcojp.py b/yt_dlp/extractor/ntvcojp.py index 422ec6eb0..76c5936ba 100644 --- a/yt_dlp/extractor/ntvcojp.py +++ b/yt_dlp/extractor/ntvcojp.py @@ -1,55 +1,82 @@ -from .common import InfoExtractor +from .streaks import StreaksBaseIE from ..utils import ( - ExtractorError, - 
smuggle_url, - traverse_obj, + int_or_none, + parse_iso8601, + str_or_none, + url_or_none, ) +from ..utils.traversal import require, traverse_obj -class NTVCoJpCUIE(InfoExtractor): +class NTVCoJpCUIE(StreaksBaseIE): IE_NAME = 'cu.ntv.co.jp' - IE_DESC = 'Nippon Television Network' - _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program)(?P[^/?&#]+)' - _TEST = { - 'url': 'https://cu.ntv.co.jp/televiva-chill-gohan_181031/', + IE_DESC = '日テレ無料TADA!' + _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program-list|search)(?P[\w-]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://cu.ntv.co.jp/gaki_20250525/', 'info_dict': { - 'id': '5978891207001', + 'id': 'gaki_20250525', 'ext': 'mp4', - 'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸', - 'upload_date': '20181213', - 'description': 'md5:1985b51a9abc285df0104d982a325f2a', - 'uploader_id': '3855502814001', - 'timestamp': 1544669941, + 'title': '放送開始36年!方正ココリコが選ぶ神回&地獄回!', + 'cast': 'count:2', + 'description': 'md5:1e1db556224d627d4d2f74370c650927', + 'display_id': 'ref:gaki_20250525', + 'duration': 1450, + 'episode': '放送開始36年!方正ココリコが選ぶ神回&地獄回!', + 'episode_id': '000000010172808', + 'episode_number': 255, + 'genres': ['variety'], + 'live_status': 'not_live', + 'modified_date': '20250525', + 'modified_timestamp': 1748145537, + 'release_date': '20250525', + 'release_timestamp': 1748145539, + 'series': 'ダウンタウンのガキの使いやあらへんで!', + 'series_id': 'gaki', + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1748145197, + 'upload_date': '20250525', + 'uploader': '日本テレビ放送網', + 'uploader_id': '0x7FE2', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_config = self._search_nuxt_data(webpage, display_id) - video_id = traverse_obj(player_config, ('movie', 'video_id')) - if not video_id: - raise 
ExtractorError('Failed to extract video ID for Brightcove') - account_id = traverse_obj(player_config, ('player', 'account')) or '3855502814001' - title = traverse_obj(player_config, ('movie', 'name')) - if not title: - og_title = self._og_search_title(webpage, fatal=False) or traverse_obj(player_config, ('player', 'title')) - if og_title: - title = og_title.split('(', 1)[0].strip() - description = (traverse_obj(player_config, ('movie', 'description')) - or self._html_search_meta(['description', 'og:description'], webpage)) + + info = self._search_json( + r'window\.app\s*=', webpage, 'video info', + display_id)['falcorCache']['catalog']['episode'][display_id]['value'] + media_id = traverse_obj(info, ( + 'streaks_data', 'mediaid', {str_or_none}, {require('Streaks media ID')})) + non_phonetic = (lambda _, v: v['is_phonetic'] is False, 'value', {str}) + return { - '_type': 'url_transparent', - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}), - 'ie_key': 'BrightcoveNew', + **self._extract_from_streaks_api('ntv-tada', media_id, headers={ + 'X-Streaks-Api-Key': 'df497719056b44059a0483b8faad1f4a', + }), + **traverse_obj(info, { + 'id': ('content_id', {str_or_none}), + 'title': ('title', *non_phonetic, any), + 'age_limit': ('is_adult_only_content', {lambda x: 18 if x else None}), + 'cast': ('credit', ..., 'name', *non_phonetic), + 'genres': ('genre', ..., {str}), + 'release_timestamp': ('pub_date', {parse_iso8601}), + 'tags': ('tags', ..., {str}), + 'thumbnail': ('artwork', ..., 'url', any, {url_or_none}), + }), + **traverse_obj(info, ('tv_episode_info', { + 'duration': ('duration', {int_or_none}), + 'episode_number': ('episode_number', {int}), + 'series': ('parent_show_title', *non_phonetic, any), + 'series_id': ('show_content_id', {str}), + })), + **traverse_obj(info, ('custom_data', { + 'description': ('program_detail', 
{str}), + 'episode': ('episode_title', {str}), + 'episode_id': ('episode_id', {str_or_none}), + 'uploader': ('network_name', {str}), + 'uploader_id': ('network_id', {str}), + })), } diff --git a/yt_dlp/extractor/qqmusic.py b/yt_dlp/extractor/qqmusic.py index fb46e0d12..56a8e7300 100644 --- a/yt_dlp/extractor/qqmusic.py +++ b/yt_dlp/extractor/qqmusic.py @@ -15,7 +15,6 @@ str_or_none, strip_jsonp, traverse_obj, - unescapeHTML, url_or_none, urljoin, ) @@ -425,7 +424,7 @@ def _real_extract(self, url): return self.playlist_result(entries, list_id, **traverse_obj(list_json, ('cdlist', 0, { 'title': ('dissname', {str}), - 'description': ('desc', {unescapeHTML}, {clean_html}), + 'description': ('desc', {clean_html}), }))) diff --git a/yt_dlp/extractor/srmediathek.py b/yt_dlp/extractor/srmediathek.py index fc63d9b1a..d6cab6ae7 100644 --- a/yt_dlp/extractor/srmediathek.py +++ b/yt_dlp/extractor/srmediathek.py @@ -1,57 +1,102 @@ from .ard import ARDMediathekBaseIE from ..utils import ( ExtractorError, - get_element_by_attribute, + clean_html, + extract_attributes, + parse_duration, + parse_qs, + unified_strdate, +) +from ..utils.traversal import ( + find_element, + require, + traverse_obj, ) class SRMediathekIE(ARDMediathekBaseIE): - _WORKING = False IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' - _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P[0-9]+)' + _CLS_COMMON = 'teaser__image__caption__text teaser__image__caption__text--' + _VALID_URL = r'https?://(?:www\.)?sr-mediathek\.de/index\.php\?.*?&id=(?P\d+)' _TESTS = [{ - 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', + 'url': 'https://www.sr-mediathek.de/index.php?seite=7&id=141317', 'info_dict': { - 'id': '28455', + 'id': '141317', 'ext': 'mp4', - 'title': 'sportarena (26.10.2014)', - 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. 
FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'no longer available', - }, { - 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682', - 'info_dict': { - 'id': '37682', - 'ext': 'mp4', - 'title': 'Love, Cakes and Rock\'n\'Roll', - 'description': 'md5:18bf9763631c7d326c22603681e1123d', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'title': 'Kärnten, da will ich hin!', + 'channel': 'SR Fernsehen', + 'description': 'md5:7732e71e803379a499732864a572a456', + 'duration': 1788.0, + 'release_date': '20250525', + 'series': 'da will ich hin!', + 'series_id': 'DWIH', + 'thumbnail': r're:https?://.+\.jpg', }, }, { - 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', - 'only_matching': True, + 'url': 'https://www.sr-mediathek.de/index.php?seite=7&id=153853', + 'info_dict': { + 'id': '153853', + 'ext': 'mp3', + 'title': 'Kappes, Klöße, Kokosmilch: Bruschetta mit Nduja', + 'channel': 'SR 3', + 'description': 'md5:3935798de3562b10c4070b408a15e225', + 'duration': 139.0, + 'release_date': '20250523', + 'series': 'Kappes, Klöße, Kokosmilch', + 'series_id': 'SR3_KKK_A', + 'thumbnail': r're:https?://.+\.jpg', + }, + }, { + 'url': 'https://www.sr-mediathek.de/index.php?seite=7&id=31406&pnr=&tbl=pf', + 'info_dict': { + 'id': '31406', + 'ext': 'mp3', + 'title': 'Das Leben schwer nehmen, ist einfach zu anstrengend', + 'channel': 'SR 1', + 'description': 'md5:3e03fd556af831ad984d0add7175fb0c', + 'duration': 1769.0, + 'release_date': '20230717', + 'series': 'Abendrot', + 'series_id': 'SR1_AB_P', + 'thumbnail': r're:https?://.+\.jpg', + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + description = self._og_search_description(webpage) - if '>Der gewünschte Beitrag ist leider nicht 
mehr verfügbar.<' in webpage: + if description == 'Der gewünschte Beitrag ist leider nicht mehr vorhanden.': raise ExtractorError(f'Video {video_id} is no longer available', expected=True) - media_collection_url = self._search_regex( - r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url') - info = self._extract_media_info(media_collection_url, webpage, video_id) - info.update({ + player_url = traverse_obj(webpage, ( + {find_element(tag='div', id=f'player{video_id}', html=True)}, + {extract_attributes}, 'data-mediacollection-ardplayer', + {self._proto_relative_url}, {require('player URL')})) + article = traverse_obj(webpage, ( + {find_element(cls='article__content')}, + {find_element(tag='p')}, {clean_html})) + + return { + **self._extract_media_info(player_url, webpage, video_id), 'id': video_id, - 'title': get_element_by_attribute('class', 'ardplayer-title', webpage), - 'description': self._og_search_description(webpage), + 'title': traverse_obj(webpage, ( + {find_element(cls='ardplayer-title')}, {clean_html})), + 'channel': traverse_obj(webpage, ( + {find_element(cls=f'{self._CLS_COMMON}subheadline')}, + {lambda x: x.split('|')[0]}, {clean_html})), + 'description': description, + 'duration': parse_duration(self._search_regex( + r'(\d{2}:\d{2}:\d{2})', article, 'duration')), + 'release_date': unified_strdate(self._search_regex( + r'(\d{2}\.\d{2}\.\d{4})', article, 'release_date')), + 'series': traverse_obj(webpage, ( + {find_element(cls=f'{self._CLS_COMMON}headline')}, {clean_html})), + 'series_id': traverse_obj(webpage, ( + {find_element(cls='teaser__link', html=True)}, + {extract_attributes}, 'href', {parse_qs}, 'sen', ..., {str}, any)), 'thumbnail': self._og_search_thumbnail(webpage), - }) - return info + } diff --git a/yt_dlp/extractor/stacommu.py b/yt_dlp/extractor/stacommu.py index 830018518..e6866f151 100644 --- a/yt_dlp/extractor/stacommu.py +++ b/yt_dlp/extractor/stacommu.py @@ -4,6 +4,7 @@ from ..utils import ( int_or_none, 
traverse_obj, + url_basename, url_or_none, ) @@ -65,9 +66,19 @@ def _extract_ppv(self, url): hls_info, decrypt = self._call_encrypted_api( video_id, ':watchArchive', 'stream information', data={'method': 1}) + formats = self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id) + for f in formats: + # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values + if f.get('tbr'): + f['tbr'] = int(f['tbr'] / 2.5) + # prefer variants with the same basename as the master playlist to avoid partial streams + f['format_id'] = url_basename(f['url']).partition('.')[0] + if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]): + f['preference'] = -10 + return { 'id': video_id, - 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id), + 'formats': formats, 'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt), **traverse_obj(video_info, { 'title': ('displayName', {str}), diff --git a/yt_dlp/extractor/startrek.py b/yt_dlp/extractor/startrek.py index c59187173..802702d44 100644 --- a/yt_dlp/extractor/startrek.py +++ b/yt_dlp/extractor/startrek.py @@ -1,76 +1,76 @@ from .common import InfoExtractor -from ..utils import int_or_none, urljoin +from .youtube import YoutubeIE +from ..utils import ( + clean_html, + parse_iso8601, + update_url, + url_or_none, +) +from ..utils.traversal import subs_list_to_dict, traverse_obj class StarTrekIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'(?Phttps?://(?:intl|www)\.startrek\.com)/videos/(?P[^/]+)' + IE_NAME = 'startrek' + IE_DESC = 'STAR TREK' + _VALID_URL = r'https?://(?:www\.)?startrek\.com(?:/en-(?:ca|un))?/videos/(?P[^/?#]+)' _TESTS = [{ - 'url': 'https://intl.startrek.com/videos/watch-welcoming-jess-bush-to-the-ready-room', - 'md5': '491df5035c9d4dc7f63c79caaf9c839e', + 'url': 'https://www.startrek.com/en-un/videos/official-trailer-star-trek-lower-decks-season-4', 'info_dict': { - 'id': 
'watch-welcoming-jess-bush-to-the-ready-room', + 'id': 'official-trailer-star-trek-lower-decks-season-4', 'ext': 'mp4', - 'title': 'WATCH: Welcoming Jess Bush to The Ready Room', - 'duration': 1888, - 'timestamp': 1655388000, - 'upload_date': '20220616', - 'description': 'md5:1ffee884e3920afbdd6dd04e926a1221', - 'thumbnail': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/styles/video_1920x1080/public/images/2022-06/pp_14794_rr_thumb_107_yt_16x9\.jpg(?:\?.+)?', - 'subtitles': {'en-US': [{ - 'url': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/video/captions/2022-06/TRR_SNW_107_v4\.vtt', - }, { - 'url': 'https://media.startrek.com/2022/06/16/2043801155561/1069981_hls/trr_snw_107_v4-c4bfc25d/stream_vtt.m3u8', - }]}, + 'title': 'Official Trailer | Star Trek: Lower Decks - Season 4', + 'alt_title': 'md5:dd7e3191aaaf9e95db16fc3abd5ef68b', + 'categories': ['TRAILERS'], + 'description': 'md5:563d7856ddab99bee7a5e50f45531757', + 'release_date': '20230722', + 'release_timestamp': 1690033200, + 'series': 'Star Trek: Lower Decks', + 'series_id': 'star-trek-lower-decks', + 'thumbnail': r're:https?://.+\.(?:jpg|png)', }, }, { - 'url': 'https://www.startrek.com/videos/watch-ethan-peck-and-gia-sandhu-beam-down-to-the-ready-room', - 'md5': 'f5ad74fbb86e91e0882fc0a333178d1d', + 'url': 'https://www.startrek.com/en-ca/videos/my-first-contact-senator-cory-booker', 'info_dict': { - 'id': 'watch-ethan-peck-and-gia-sandhu-beam-down-to-the-ready-room', + 'id': 'my-first-contact-senator-cory-booker', 'ext': 'mp4', - 'title': 'WATCH: Ethan Peck and Gia Sandhu Beam Down to The Ready Room', - 'duration': 1986, - 'timestamp': 1654221600, - 'upload_date': '20220603', - 'description': 'md5:b3aa0edacfe119386567362dec8ed51b', - 'thumbnail': r're:https://www\.startrek\.com/sites/default/files/styles/video_1920x1080/public/images/2022-06/pp_14792_rr_thumb_105_yt_16x9_1.jpg(?:\?.+)?', - 'subtitles': {'en-US': [{ - 'url': 
r're:https://(?:intl|www)\.startrek\.com/sites/default/files/video/captions/2022-06/TRR_SNW_105_v5\.vtt', - }]}, + 'title': 'My First Contact: Senator Cory Booker', + 'alt_title': 'md5:fe74a8bdb0afab421c6e159a7680db4d', + 'categories': ['MY FIRST CONTACT'], + 'description': 'md5:a3992ab3b3e0395925d71156bbc018ce', + 'release_date': '20250401', + 'release_timestamp': 1743512400, + 'series': 'Star Trek: The Original Series', + 'series_id': 'star-trek-the-original-series', + 'thumbnail': r're:https?://.+\.(?:jpg|png)', }, }] def _real_extract(self, url): - urlbase, video_id = self._match_valid_url(url).group('base', 'id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - player = self._search_regex( - r'(<\s*div\s+id\s*=\s*"cvp-player-[^<]+<\s*/div\s*>)', webpage, 'player') + page_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + video_data = page_props['video']['data'] + if youtube_id := video_data.get('youtube_video_id'): + return self.url_result(youtube_id, YoutubeIE) - hls = self._html_search_regex(r'\bdata-hls\s*=\s*"([^"]+)"', player, 'HLS URL') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls, video_id, 'mp4') - - captions = self._html_search_regex( - r'\bdata-captions-url\s*=\s*"([^"]+)"', player, 'captions URL', fatal=False) - if captions: - subtitles.setdefault('en-US', [])[:0] = [{'url': urljoin(urlbase, captions)}] - - # NB: Most of the data in the json_ld is undesirable - json_ld = self._search_json_ld(webpage, video_id, fatal=False) + series_id = traverse_obj(video_data, ( + 'series_and_movies', ..., 'series_or_movie', 'slug', {str}, any)) return { 'id': video_id, - 'title': self._html_search_regex( - r'\bdata-title\s*=\s*"([^"]+)"', player, 'title', json_ld.get('title')), - 'description': self._html_search_regex( - r'(?s)<\s*div\s+class\s*=\s*"header-body"\s*>(.+?)<\s*/div\s*>', - webpage, 'description', fatal=False), - 'duration': int_or_none(self._html_search_regex( - 
r'\bdata-duration\s*=\s*"(\d+)"', player, 'duration', fatal=False)), - 'formats': formats, - 'subtitles': subtitles, - 'thumbnail': urljoin(urlbase, self._html_search_regex( - r'\bdata-poster-url\s*=\s*"([^"]+)"', player, 'thumbnail', fatal=False)), - 'timestamp': json_ld.get('timestamp'), + 'series': traverse_obj(page_props, ( + 'queried', 'header', 'tab3', 'slices', ..., 'items', + lambda _, v: v['link']['slug'] == series_id, 'link_copy', {str}, any)), + 'series_id': series_id, + **traverse_obj(video_data, { + 'title': ('title', ..., 'text', {clean_html}, any), + 'alt_title': ('subhead', ..., 'text', {clean_html}, any), + 'categories': ('category', 'data', 'category_name', {str.upper}, filter, all), + 'description': ('slices', ..., 'primary', 'content', ..., 'text', {clean_html}, any), + 'release_timestamp': ('published', {parse_iso8601}), + 'subtitles': ({'url': 'legacy_subtitle_file'}, all, {subs_list_to_dict(lang='en')}), + 'thumbnail': ('poster_frame', 'url', {url_or_none}, {update_url(query=None)}), + 'url': ('legacy_video_url', {url_or_none}), + }), } diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py index a34f2afd4..2dbe2a776 100644 --- a/yt_dlp/extractor/telecinco.py +++ b/yt_dlp/extractor/telecinco.py @@ -63,6 +63,17 @@ def _parse_content(self, content, url): 'http_headers': headers, } + def _download_akamai_webpage(self, url, display_id): + try: # yt-dlp's default user-agents are too old and blocked by akamai + return self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + # Retry with impersonation if hardcoded UA is insufficient to bypass akamai + return self._download_webpage(url, display_id, impersonate=True) + class TelecincoIE(TelecincoBaseIE): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' @@ -140,7 +151,7 @@ class 
TelecincoIE(TelecincoBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage = self._download_akamai_webpage(url, display_id) article = self._search_json( r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=', webpage, 'article', display_id)['article'] diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index c269802b3..8a106adb9 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -548,21 +548,21 @@ def _real_extract(self, url): 'formats': formats, 'subtitles': subtitles, **traverse_obj(mv_data, { - 'title': ('title', {unescapeHTML}), + 'title': ('title', {str}, {unescapeHTML}), 'description': ('desc', {clean_html}, filter), 'duration': ('duration', {int_or_none}), 'like_count': ('likes', {int_or_none}), 'comment_count': ('commcount', {int_or_none}), }), **traverse_obj(data, { - 'title': ('md_title', {unescapeHTML}), + 'title': ('md_title', {str}, {unescapeHTML}), 'description': ('description', {clean_html}, filter), 'thumbnail': ('jpg', {url_or_none}), - 'uploader': ('md_author', {unescapeHTML}), + 'uploader': ('md_author', {str}, {unescapeHTML}), 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), 'duration': ('duration', {int_or_none}), 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { - 'title': ('text', {unescapeHTML}), + 'title': ('text', {str}, {unescapeHTML}), 'start_time': 'time', }), }), diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 9c5bb75fe..90e392715 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -175,6 +175,15 @@ class _PoTokenContext(enum.Enum): 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, 'SUPPORTS_COOKIES': True, }, + 'tv_simply': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'TVHTML5_SIMPLY', + 'clientVersion': '1.0', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 75, + }, # This client now requires sign-in for every video # It 
was previously an age-gate workaround for videos that were `playable_in_embed` # It may still be useful if signed into an EU account that is not age-verified diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index d82225718..55ebdce1b 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -250,7 +250,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, } - _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') + _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt') _DEFAULT_CLIENTS = ('tv', 'ios', 'web') _DEFAULT_AUTHED_CLIENTS = ('tv', 'web') @@ -2229,20 +2229,20 @@ def _decrypt_nsig(self, s, video_id, player_url): def _extract_n_function_name(self, jscode, player_url=None): varname, global_list = self._interpret_player_js_global_var(jscode, player_url) if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('-_w8_'), any)): - funcname = self._search_regex( - r'''(?xs) - [;\n](?: - (?Pfunction\s+)| - (?:var\s+)? - )(?P[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*) - \((?P[a-zA-Z0-9_$]+)\)\s*\{ - (?:(?!\}[;\n]).)+ - \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s* - \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n] - ''' % (re.escape(varname), global_list.index(debug_str)), - jscode, 'nsig function name', group='funcname', default=None) - if funcname: - return funcname + pattern = r'''(?x) + \{\s*return\s+%s\[%d\]\s*\+\s*(?P[a-zA-Z0-9_$]+)\s*\} + ''' % (re.escape(varname), global_list.index(debug_str)) + if match := re.search(pattern, jscode): + pattern = r'''(?x) + \{\s*\)%s\(\s* + (?: + (?P[a-zA-Z0-9_$]+)\s*noitcnuf\s* + |noitcnuf\s*=\s*(?P[a-zA-Z0-9_$]+)(?:\s+rav)? 
+ )[;\n] + ''' % re.escape(match.group('argname')[::-1]) + if match := re.search(pattern, jscode[match.start()::-1]): + a, b = match.group('funcname_a', 'funcname_b') + return (a or b)[::-1] self.write_debug(join_nonempty( 'Initial search was unable to find nsig function name', player_url and f' player = {player_url}', delim='\n'), only_once=True) diff --git a/yt_dlp/extractor/youtube/pot/utils.py b/yt_dlp/extractor/youtube/pot/utils.py index 7a5b7d4ab..a27921d4a 100644 --- a/yt_dlp/extractor/youtube/pot/utils.py +++ b/yt_dlp/extractor/youtube/pot/utils.py @@ -20,6 +20,7 @@ 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'WEB_REMIX', + 'TVHTML5_SIMPLY', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', ) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 10be582a3..24c562ab6 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + ISO639Utils, determine_ext, filter_dict, float_or_none, @@ -118,10 +119,7 @@ def _extract_ptmd(self, ptmd_urls, video_id, api_token=None, aspect_ratio=None): if ext == 'm3u8': fmts = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) - elif ext == 'mpd': - fmts = self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False) - else: + elif ext in ('mp4', 'webm'): height = int_or_none(quality.get('highestVerticalResolution')) width = round(aspect_ratio * height) if aspect_ratio and height else None fmts = [{ @@ -132,16 +130,31 @@ def _extract_ptmd(self, ptmd_urls, video_id, api_token=None, aspect_ratio=None): 'format_id': join_nonempty('http', stream.get('type')), 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), }] + else: + self.report_warning(f'Skipping unsupported extension "{ext}"', video_id=video_id) + fmts = [] + f_class = variant.get('class') for f in fmts: + f_lang = ISO639Utils.short2long( + (f.get('language') or variant.get('language') or '').lower()) + 
is_audio_only = f.get('vcodec') == 'none' formats.append({ **f, - 'format_id': join_nonempty(f.get('format_id'), is_dgs and 'dgs'), + 'format_id': join_nonempty(f['format_id'], is_dgs and 'dgs'), 'format_note': join_nonempty( - f_class, is_dgs and 'German Sign Language', f.get('format_note'), delim=', '), - 'language': variant.get('language') or f.get('language'), + not is_audio_only and f_class, + is_dgs and 'German Sign Language', + f.get('format_note'), delim=', '), 'preference': -2 if is_dgs else -1, - 'language_preference': 10 if f_class == 'main' else -10 if f_class == 'ad' else -1, + 'language': f_lang, + 'language_preference': ( + -10 if ((is_audio_only and f.get('format_note') == 'Audiodeskription') + or (not is_audio_only and f_class == 'ad')) + else 10 if f_lang == 'deu' and f_class == 'main' + else 5 if f_lang == 'deu' + else 1 if f_class == 'main' + else -1), }) return { @@ -333,12 +346,13 @@ class ZDFIE(ZDFBaseIE): 'title': 'Dobrindt schließt Steuererhöhungen aus', 'description': 'md5:9a117646d7b8df6bc902eb543a9c9023', 'duration': 325, - 'thumbnail': 'https://www.zdf.de/assets/dobrindt-csu-berlin-direkt-100~1920x1080?cb=1743357653736', + 'thumbnail': 'https://www.zdfheute.de/assets/dobrindt-csu-berlin-direkt-100~1920x1080?cb=1743357653736', 'timestamp': 1743374520, 'upload_date': '20250330', '_old_archive_ids': ['zdf 250330_clip_2_bdi'], }, }, { + # FUNK video (hosted on a different CDN, has atypical PTMD and HLS files) 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', 'md5': '57af4423db0455a3975d2dc4578536bc', 'info_dict': { @@ -651,6 +665,7 @@ class ZDFChannelIE(ZDFBaseIE): 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', }, 'playlist_count': 242, + 'skip': 'Video count changes daily, needs support for playlist_maxcount', }] _PAGE_SIZE = 24 diff --git a/yt_dlp/utils/jslib/__init__.py b/yt_dlp/utils/jslib/__init__.py new file mode 100644 index 000000000..19df08b12 --- /dev/null +++ b/yt_dlp/utils/jslib/__init__.py 
@@ -0,0 +1 @@ +# Utility functions for handling web input based on commonly used JavaScript libraries diff --git a/yt_dlp/utils/jslib/devalue.py b/yt_dlp/utils/jslib/devalue.py new file mode 100644 index 000000000..d82880d92 --- /dev/null +++ b/yt_dlp/utils/jslib/devalue.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import array +import base64 +import datetime as dt +import math +import re + +from .._utils import parse_iso8601 + +TYPE_CHECKING = False +if TYPE_CHECKING: + import collections.abc + import typing + + T = typing.TypeVar('T') + + +_ARRAY_TYPE_LOOKUP = { + 'Int8Array': 'b', + 'Uint8Array': 'B', + 'Uint8ClampedArray': 'B', + 'Int16Array': 'h', + 'Uint16Array': 'H', + 'Int32Array': 'i', + 'Uint32Array': 'I', + 'Float32Array': 'f', + 'Float64Array': 'd', + 'BigInt64Array': 'l', + 'BigUint64Array': 'L', + 'ArrayBuffer': 'B', +} + + +def parse_iter(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Callable[[list], typing.Any]] | None = None): + # based on https://github.com/Rich-Harris/devalue/blob/f3fd2aa93d79f21746555671f955a897335edb1b/src/parse.js + resolved = { + -1: None, + -2: None, + -3: math.nan, + -4: math.inf, + -5: -math.inf, + -6: -0.0, + } + + if isinstance(parsed, int) and not isinstance(parsed, bool): + if parsed not in resolved or parsed == -2: + raise ValueError('invalid integer input') + return resolved[parsed] + elif not isinstance(parsed, list): + raise ValueError('expected int or list as input') + elif not parsed: + raise ValueError('expected a non-empty list as input') + + if revivers is None: + revivers = {} + return_value = [None] + stack: list[tuple] = [(return_value, 0, 0)] + + while stack: + target, index, source = stack.pop() + if isinstance(source, tuple): + name, source, reviver = source + try: + resolved[source] = target[index] = reviver(target[index]) + except Exception as error: + yield TypeError(f'failed to parse {source} as {name!r}: {error}') + resolved[source] = target[index] = None + continue + 
+ if source in resolved: + target[index] = resolved[source] + continue + + # guard against Python negative indexing + if source < 0: + yield IndexError(f'invalid index: {source!r}') + continue + + try: + value = parsed[source] + except IndexError as error: + yield error + continue + + if isinstance(value, list): + if value and isinstance(value[0], str): + # TODO: implement zips `strict=True` + if reviver := revivers.get(value[0]): + if value[1] == source: + # XXX: avoid infinite loop + yield IndexError(f'{value[0]!r} cannot point to itself (index: {source})') + continue + # inverse order: resolve index, revive value + stack.append((target, index, (value[0], value[1], reviver))) + stack.append((target, index, value[1])) + continue + + elif value[0] == 'Date': + try: + result = dt.datetime.fromtimestamp(parse_iso8601(value[1]), tz=dt.timezone.utc) + except Exception: + yield ValueError(f'invalid date: {value[1]!r}') + result = None + + elif value[0] == 'Set': + result = [None] * (len(value) - 1) + for offset, new_source in enumerate(value[1:]): + stack.append((result, offset, new_source)) + + elif value[0] == 'Map': + result = [] + for key, new_source in zip(*(iter(value[1:]),) * 2): + pair = [None, None] + stack.append((pair, 0, key)) + stack.append((pair, 1, new_source)) + result.append(pair) + + elif value[0] == 'RegExp': + # XXX: use jsinterp to translate regex flags + # currently ignores `value[2]` + result = re.compile(value[1]) + + elif value[0] == 'Object': + result = value[1] + + elif value[0] == 'BigInt': + result = int(value[1]) + + elif value[0] == 'null': + result = {} + for key, new_source in zip(*(iter(value[1:]),) * 2): + stack.append((result, key, new_source)) + + elif value[0] in _ARRAY_TYPE_LOOKUP: + typecode = _ARRAY_TYPE_LOOKUP[value[0]] + data = base64.b64decode(value[1]) + result = array.array(typecode, data).tolist() + + else: + yield TypeError(f'invalid type at {source}: {value[0]!r}') + result = None + else: + result = len(value) * [None] + 
for offset, new_source in enumerate(value): + stack.append((result, offset, new_source)) + + elif isinstance(value, dict): + result = {} + for key, new_source in value.items(): + stack.append((result, key, new_source)) + + else: + result = value + + target[index] = resolved[source] = result + + return return_value[0] + + +def parse(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Callable[[typing.Any], typing.Any]] | None = None): + generator = parse_iter(parsed, revivers=revivers) + while True: + try: + raise generator.send(None) + except StopIteration as error: + return error.value diff --git a/yt_dlp/version.py b/yt_dlp/version.py index c375cc6ad..b97c01499 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.05.22' +__version__ = '2025.06.09' -RELEASE_GIT_HEAD = '7977b329ed97b216e37bd402f4935f28c00eac9e' +RELEASE_GIT_HEAD = '339614a173c74b42d63e858c446a9cae262a13af' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.05.22' +_pkg_version = '2025.06.09'