diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a211ae165..4b71a621c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -192,7 +192,7 @@ jobs: with: path: ./repo - name: Virtualized Install, Prepare & Build - uses: yt-dlp/run-on-arch-action@v2 + uses: yt-dlp/run-on-arch-action@v3 with: # Ref: https://github.com/uraimo/run-on-arch-action/issues/55 env: | @@ -411,7 +411,7 @@ jobs: run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.11.1-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.13.0-py3-none-any.whl" - name: Prepare run: | @@ -460,7 +460,7 @@ jobs: run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.11.1-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.13.0-py3-none-any.whl" - name: Prepare run: | diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 9a4342a58..dd2c6f481 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -6,7 +6,7 @@ on: - devscripts/** - test/** - yt_dlp/**.py - - '!yt_dlp/extractor/*.py' + - '!yt_dlp/extractor/**.py' - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py @@ -16,7 +16,7 @@ on: - devscripts/** - test/** - yt_dlp/**.py - - '!yt_dlp/extractor/*.py' + - '!yt_dlp/extractor/**.py' - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 1a32bbfe3..8a7b24033 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -38,3 +38,5 @@ jobs: run: ruff check --output-format github . - name: Run autopep8 run: autopep8 --diff . + - name: Check file mode + run: git ls-files --format="%(objectmode) %(path)" yt_dlp/ | ( ! grep -v "^100644" ) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index f22625e3f..6aa52c595 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -760,3 +760,18 @@ vallovic arabcoders mireq mlabeeb03 +1271 +CasperMcFadden95 +Kicer86 +Kiritomo +leeblackc +meGAmeS1 +NeonMan +pj47x +troex +WouterGordts +baierjan +GeoffreyFrogeye +Pawka +v3DJG6GL +yozel diff --git a/Changelog.md b/Changelog.md index 7aec62771..80b72da05 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,131 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.05.22 + +#### Core changes +- **cookies**: [Fix Linux desktop environment detection](https://github.com/yt-dlp/yt-dlp/commit/e491fd4d090db3af52a82863fb0553dd5e17fb85) ([#13197](https://github.com/yt-dlp/yt-dlp/issues/13197)) by [mbway](https://github.com/mbway) +- **jsinterp**: [Fix increment/decrement evaluation](https://github.com/yt-dlp/yt-dlp/commit/167d7a9f0ffd1b4fe600193441bdb7358db2740b) ([#13238](https://github.com/yt-dlp/yt-dlp/issues/13238)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **1tv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/41c0a1fb89628696f8bb88e2b9f3a68f355b8c26) ([#13168](https://github.com/yt-dlp/yt-dlp/issues/13168)) by [bashonly](https://github.com/bashonly) +- **amcnetworks**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/464c84fedf78eef822a431361155f108b5df96d7) ([#13147](https://github.com/yt-dlp/yt-dlp/issues/13147)) by [bashonly](https://github.com/bashonly) +- **bitchute**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1d0f6539c47e5d5c68c3c47cdb7075339e2885ac) ([#13081](https://github.com/yt-dlp/yt-dlp/issues/13081)) by [bashonly](https://github.com/bashonly) +- **cartoonnetwork**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/7dbb47f84f0ee1266a3a01f58c9bc4c76d76794a) ([#13148](https://github.com/yt-dlp/yt-dlp/issues/13148)) by [bashonly](https://github.com/bashonly) +- **iprima**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/a7d9a5eb79ceeecb851389f3f2c88597871ca3f2) ([#12937](https://github.com/yt-dlp/yt-dlp/issues/12937)) by [baierjan](https://github.com/baierjan) +- **jiosaavn** + - artist: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/586b557b124f954d3f625360ebe970989022ad97) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) + - playlist, show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/317f4b8006c2c0f0f64f095b1485163ad97c9053) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) + - show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6839276496d8814cf16f58b637e45663467928e6) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) +- **lrtradio**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/abf58dcd6a09e14eec4ea82ae12f79a0337cb383) ([#13200](https://github.com/yt-dlp/yt-dlp/issues/13200)) by [Pawka](https://github.com/Pawka) +- **nebula**: [Support `--mark-watched`](https://github.com/yt-dlp/yt-dlp/commit/20f288bdc2173c7cc58d709d25ca193c1f6001e7) ([#13120](https://github.com/yt-dlp/yt-dlp/issues/13120)) by [GeoffreyFrogeye](https://github.com/GeoffreyFrogeye) +- **niconico** + - [Fix error handling](https://github.com/yt-dlp/yt-dlp/commit/f569be4602c2a857087e495d5d7ed6060cd97abe) ([#13236](https://github.com/yt-dlp/yt-dlp/issues/13236)) by [bashonly](https://github.com/bashonly) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7a7b85c9014d96421e18aa7ea5f4c1bee5ceece0) ([#13045](https://github.com/yt-dlp/yt-dlp/issues/13045)) by [doe1080](https://github.com/doe1080) +- **nytimesarticle**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/b26bc32579c00ef579d75a835807ccc87d20ee0a) ([#13104](https://github.com/yt-dlp/yt-dlp/issues/13104)) by [bashonly](https://github.com/bashonly) +- **once**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/f475e8b529d18efdad603ffda02a56e707fe0e2c) ([#13164](https://github.com/yt-dlp/yt-dlp/issues/13164)) by [bashonly](https://github.com/bashonly) +- **picarto**: vod: [Support `/profile/` video URLs](https://github.com/yt-dlp/yt-dlp/commit/31e090cb787f3504ec25485adff9a2a51d056734) ([#13227](https://github.com/yt-dlp/yt-dlp/issues/13227)) by [subrat-lima](https://github.com/subrat-lima) +- **playsuisse**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/d880e060803ae8ed5a047e578cca01e1f0e630ce) ([#12466](https://github.com/yt-dlp/yt-dlp/issues/12466)) by [v3DJG6GL](https://github.com/v3DJG6GL) +- **sprout**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/cbcfe6378dde33a650e3852ab17ad4503b8e008d) ([#13149](https://github.com/yt-dlp/yt-dlp/issues/13149)) by [bashonly](https://github.com/bashonly) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ea8498ed534642dd7e925961b97b934987142fd3) ([#12957](https://github.com/yt-dlp/yt-dlp/issues/12957)) by [diman8](https://github.com/diman8) +- **twitch**: [Support `--live-from-start`](https://github.com/yt-dlp/yt-dlp/commit/00b1bec55249cf2ad6271d36492c51b34b6459d1) ([#13202](https://github.com/yt-dlp/yt-dlp/issues/13202)) by [bashonly](https://github.com/bashonly) +- **vimeo**: event: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/545c1a5b6f2fe88722b41aef0e7485bf3be3f3f9) ([#13216](https://github.com/yt-dlp/yt-dlp/issues/13216)) by [bashonly](https://github.com/bashonly) +- **wat.tv**: [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/f123cc83b3aea45053f5fa1d9141048b01fc2774) ([#13111](https://github.com/yt-dlp/yt-dlp/issues/13111)) by [bashonly](https://github.com/bashonly) +- **weverse**: [Fix live extraction](https://github.com/yt-dlp/yt-dlp/commit/5328eda8820cc5f21dcf917684d23fbdca41831d) ([#13084](https://github.com/yt-dlp/yt-dlp/issues/13084)) by [bashonly](https://github.com/bashonly) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/83fabf352489d52843f67e6e9cc752db86d27e6e) ([#13245](https://github.com/yt-dlp/yt-dlp/issues/13245)) by [garret1317](https://github.com/garret1317) +- **youtube** + - [Add PO token support for subtitles](https://github.com/yt-dlp/yt-dlp/commit/32ed5f107c6c641958d1cd2752e130de4db55a13) ([#13234](https://github.com/yt-dlp/yt-dlp/issues/13234)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz) + - [Add `web_embedded` client for age-restricted videos](https://github.com/yt-dlp/yt-dlp/commit/0feec6dc131f488428bf881519e7c69766fbb9ae) ([#13089](https://github.com/yt-dlp/yt-dlp/issues/13089)) by [bashonly](https://github.com/bashonly) + - [Add a PO Token Provider Framework](https://github.com/yt-dlp/yt-dlp/commit/2685654a37141cca63eda3a92da0e2706e23ccfd) ([#12840](https://github.com/yt-dlp/yt-dlp/issues/12840)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract `media_type` for all videos](https://github.com/yt-dlp/yt-dlp/commit/ded11ebc9afba6ba33923375103e9be2d7c804e7) ([#13136](https://github.com/yt-dlp/yt-dlp/issues/13136)) by [bashonly](https://github.com/bashonly) + - [Fix `--live-from-start` support for premieres](https://github.com/yt-dlp/yt-dlp/commit/8f303afb43395be360cafd7ad4ce2b6e2eedfb8a) ([#13079](https://github.com/yt-dlp/yt-dlp/issues/13079)) by [arabcoders](https://github.com/arabcoders) + - [Fix geo-restriction error handling](https://github.com/yt-dlp/yt-dlp/commit/c7e575e31608c19c5b26c10a4229db89db5fc9a8) ([#13217](https://github.com/yt-dlp/yt-dlp/issues/13217)) by [yozel](https://github.com/yozel) + +#### Misc. changes +- **build** + - [Bump PyInstaller to v6.13.0](https://github.com/yt-dlp/yt-dlp/commit/17cf9088d0d535e4a7feffbf02bd49cd9dae5ab9) ([#13082](https://github.com/yt-dlp/yt-dlp/issues/13082)) by [bashonly](https://github.com/bashonly) + - [Bump run-on-arch-action to v3](https://github.com/yt-dlp/yt-dlp/commit/9064d2482d1fe722bbb4a49731fe0711c410d1c8) ([#13088](https://github.com/yt-dlp/yt-dlp/issues/13088)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [7977b32](https://github.com/yt-dlp/yt-dlp/commit/7977b329ed97b216e37bd402f4935f28c00eac9e) by [bashonly](https://github.com/bashonly) + +### 2025.04.30 + +#### Important changes +- **New option `--preset-alias`/`-t` has been added** +This provides convenient predefined aliases for common use cases. Available presets include `mp4`, `mp3`, `mkv`, `aac`, and `sleep`. See [the README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#preset-aliases) for more details. + +#### Core changes +- [Add `--preset-alias` option](https://github.com/yt-dlp/yt-dlp/commit/88eb1e7a9a2720ac89d653c0d0e40292388823bb) ([#12839](https://github.com/yt-dlp/yt-dlp/issues/12839)) by [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **utils** + - `_yield_json_ld`: [Make function less fatal](https://github.com/yt-dlp/yt-dlp/commit/45f01de00e1bc076b7f676a669736326178647b1) ([#12855](https://github.com/yt-dlp/yt-dlp/issues/12855)) by [seproDev](https://github.com/seproDev) + - `url_or_none`: [Support WebSocket URLs](https://github.com/yt-dlp/yt-dlp/commit/a473e592337edb8ca40cde52c1fcaee261c54df9) ([#12848](https://github.com/yt-dlp/yt-dlp/issues/12848)) by [doe1080](https://github.com/doe1080) + +#### Extractor changes +- **abematv**: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/f5736bb35bde62348caebf7b188668655e316deb) ([#12859](https://github.com/yt-dlp/yt-dlp/issues/12859)) by [Kiritomo](https://github.com/Kiritomo) +- **atresplayer**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/839d64325356310e6de6cd9cad28fb546619ca63) ([#11424](https://github.com/yt-dlp/yt-dlp/issues/11424)) by [meGAmeS1](https://github.com/meGAmeS1), [seproDev](https://github.com/seproDev) +- **bpb**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/80736b9c90818adee933a155079b8535bc06819f) ([#13015](https://github.com/yt-dlp/yt-dlp/issues/13015)) by [bashonly](https://github.com/bashonly) +- **cda**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/9032f981362ea0be90626fab51ec37934feded6d) ([#12975](https://github.com/yt-dlp/yt-dlp/issues/12975)) by [bashonly](https://github.com/bashonly) +- **cdafolder**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/cb271d445bc2d866c9a3404b1d8f59bcb77447df) ([#12919](https://github.com/yt-dlp/yt-dlp/issues/12919)) by [fireattack](https://github.com/fireattack), [Kicer86](https://github.com/Kicer86) +- **crowdbunker**: [Make format extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/4ebf41309d04a6e196944f1c0f5f0154cff0055a) ([#12836](https://github.com/yt-dlp/yt-dlp/issues/12836)) by [seproDev](https://github.com/seproDev) +- **dacast**: [Support tokenized URLs](https://github.com/yt-dlp/yt-dlp/commit/e7e3b7a55c456da4a5a812b4fefce4dce8e6a616) ([#12979](https://github.com/yt-dlp/yt-dlp/issues/12979)) by [bashonly](https://github.com/bashonly) +- **dzen.ru**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/a3f2b54c2535d862de6efa9cfaa6ca9a2b2f7dd6) ([#12852](https://github.com/yt-dlp/yt-dlp/issues/12852)) by [seproDev](https://github.com/seproDev) +- **generic**: [Fix MPD extraction for `file://` URLs](https://github.com/yt-dlp/yt-dlp/commit/34a061a295d156934417c67ee98070b94943006b) ([#12978](https://github.com/yt-dlp/yt-dlp/issues/12978)) by [bashonly](https://github.com/bashonly) +- **getcourseru**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/741fd809bc4d301c19b53877692ae510334a6750) ([#12943](https://github.com/yt-dlp/yt-dlp/issues/12943)) by [troex](https://github.com/troex) +- **ivoox**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/7faa18b83dcfc74a1a1e2034e6b0369c495ca645) ([#12768](https://github.com/yt-dlp/yt-dlp/issues/12768)) by [NeonMan](https://github.com/NeonMan), [seproDev](https://github.com/seproDev) +- **kika**: [Add playlist extractor](https://github.com/yt-dlp/yt-dlp/commit/3c1c75ecb8ab352f422b59af46fff2be992e4115) ([#12832](https://github.com/yt-dlp/yt-dlp/issues/12832)) by [1100101](https://github.com/1100101) +- **linkedin** + - [Support feed URLs](https://github.com/yt-dlp/yt-dlp/commit/73a26f9ee68610e33c0b4407b77355f2ab7afd0e) ([#12927](https://github.com/yt-dlp/yt-dlp/issues/12927)) by [seproDev](https://github.com/seproDev) + - events: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37ff4de5baf4e4e70c6a0ec34e136a279ad20af) ([#12926](https://github.com/yt-dlp/yt-dlp/issues/12926)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **loco**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f5a37ea40e20865b976ffeeff13eeae60292eb23) ([#12934](https://github.com/yt-dlp/yt-dlp/issues/12934)) by [seproDev](https://github.com/seproDev) +- **lrtradio**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/74e90dd9b8f9c1a5c48a2515126654f4d398d687) ([#12801](https://github.com/yt-dlp/yt-dlp/issues/12801)) by [subrat-lima](https://github.com/subrat-lima) +- **manyvids**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/77aa15e98f34c4ad425aabf39dd1ee37b48f772c) ([#10907](https://github.com/yt-dlp/yt-dlp/issues/10907)) by [pj47x](https://github.com/pj47x) +- **mixcloud**: [Refactor extractor](https://github.com/yt-dlp/yt-dlp/commit/db6d1f145ad583e0220637726029f8f2fa6200a0) ([#12830](https://github.com/yt-dlp/yt-dlp/issues/12830)) by [seproDev](https://github.com/seproDev), [WouterGordts](https://github.com/WouterGordts) +- **mlbtv**: [Fix device ID caching](https://github.com/yt-dlp/yt-dlp/commit/36da6360e130197df927ee93409519ce3f4075f5) ([#12980](https://github.com/yt-dlp/yt-dlp/issues/12980)) by [bashonly](https://github.com/bashonly) +- **niconico** + - [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/25cd7c1ecbb6cbf21dd3a6e59608e4af94715ecc) ([#13008](https://github.com/yt-dlp/yt-dlp/issues/13008)) by [doe1080](https://github.com/doe1080) + - [Remove DMC formats support](https://github.com/yt-dlp/yt-dlp/commit/7d05aa99c65352feae1cd9a3ff8784b64bfe382a) ([#12916](https://github.com/yt-dlp/yt-dlp/issues/12916)) by [doe1080](https://github.com/doe1080) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1d45e30537bf83e069184a440703e4c43b2e0198) ([#12809](https://github.com/yt-dlp/yt-dlp/issues/12809)) by [Snack-X](https://github.com/Snack-X) +- **panopto**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/9d26daa04ad5108257bc5e30f7f040c7f1fe7a5a) ([#12925](https://github.com/yt-dlp/yt-dlp/issues/12925)) by [seproDev](https://github.com/seproDev) +- **parti**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/425017531fbc3369becb5a44013e26f26efabf45) ([#12769](https://github.com/yt-dlp/yt-dlp/issues/12769)) by [benfaerber](https://github.com/benfaerber) +- **raiplay**: [Fix DRM detection](https://github.com/yt-dlp/yt-dlp/commit/dce82346245e35a46fda836ca2089805d2347935) ([#12971](https://github.com/yt-dlp/yt-dlp/issues/12971)) by [DTrombett](https://github.com/DTrombett) +- **reddit**: [Support `--ignore-no-formats-error`](https://github.com/yt-dlp/yt-dlp/commit/28f04e8a5e383ff531db646190b4be45554610d6) ([#12993](https://github.com/yt-dlp/yt-dlp/issues/12993)) by [bashonly](https://github.com/bashonly) +- **royalive**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e1847535e28788414a25546a45bebcada2f34558) ([#12817](https://github.com/yt-dlp/yt-dlp/issues/12817)) by [CasperMcFadden95](https://github.com/CasperMcFadden95) +- **rtve**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/f07ee91c71920ab1187a7ea756720e81aa406a9d) ([#10388](https://github.com/yt-dlp/yt-dlp/issues/10388)) by [meGAmeS1](https://github.com/meGAmeS1), [seproDev](https://github.com/seproDev) +- **rumble**: [Improve format extraction](https://github.com/yt-dlp/yt-dlp/commit/58d0c83457b93b3c9a81eb6bc5a4c65f25e949df) ([#12838](https://github.com/yt-dlp/yt-dlp/issues/12838)) by [seproDev](https://github.com/seproDev) +- **tokfmpodcast**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/91832111a12d87499294a0f430829b8c2254c339) ([#12842](https://github.com/yt-dlp/yt-dlp/issues/12842)) by [selfisekai](https://github.com/selfisekai) +- **tv2dk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a3e91df30a45943f40759d2c1e0b6c2ca4b2a263) ([#12945](https://github.com/yt-dlp/yt-dlp/issues/12945)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **tvp**: vod: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/4e69a626cce51428bc1d66dc606a56d9498b03a5) ([#12923](https://github.com/yt-dlp/yt-dlp/issues/12923)) by [seproDev](https://github.com/seproDev) +- **tvw**: tvchannels: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ed8ad1b4d6b9d7a1426ff5192ff924f3371e4721) ([#12721](https://github.com/yt-dlp/yt-dlp/issues/12721)) by [fries1234](https://github.com/fries1234) +- **twitcasting**: [Fix livestream extraction](https://github.com/yt-dlp/yt-dlp/commit/de271a06fd6d20d4f55597ff7f90e4d913de0a52) ([#12977](https://github.com/yt-dlp/yt-dlp/issues/12977)) by [bashonly](https://github.com/bashonly) +- **twitch**: clips: [Fix uploader metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/1ae6bff564a65af41e94f1a4727892471ecdd05a) ([#13022](https://github.com/yt-dlp/yt-dlp/issues/13022)) by [1271](https://github.com/1271) +- **twitter** + - [Fix extraction when logged-in](https://github.com/yt-dlp/yt-dlp/commit/1cf39ddf3d10b6512daa7dd139e5f6c0dc548bbc) ([#13024](https://github.com/yt-dlp/yt-dlp/issues/13024)) by [bashonly](https://github.com/bashonly) + - spaces: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/70599e53b736bb75922b737e6e0d4f76e419bb20) ([#12911](https://github.com/yt-dlp/yt-dlp/issues/12911)) by [doe1080](https://github.com/doe1080) +- **vimeo**: [Extract from mobile API](https://github.com/yt-dlp/yt-dlp/commit/22ac81a0692019ac833cf282e4ef99718e9ef3fa) ([#13034](https://github.com/yt-dlp/yt-dlp/issues/13034)) by [bashonly](https://github.com/bashonly) +- **vk** + - [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/5361a7c6e2933c919716e0cb1e3116c28c40419f) ([#12821](https://github.com/yt-dlp/yt-dlp/issues/12821)) by [seproDev](https://github.com/seproDev) + - [Fix uploader extraction](https://github.com/yt-dlp/yt-dlp/commit/2381881fe58a723853350a6ab750a5efc9f10c85) ([#12985](https://github.com/yt-dlp/yt-dlp/issues/12985)) by [seproDev](https://github.com/seproDev) +- **youtube** + - [Add context to video request rate limit error](https://github.com/yt-dlp/yt-dlp/commit/26feac3dd142536ad08ad1ed731378cb88e63602) ([#12958](https://github.com/yt-dlp/yt-dlp/issues/12958)) by [coletdjnz](https://github.com/coletdjnz) + - [Add extractor arg to skip "initial_data" request](https://github.com/yt-dlp/yt-dlp/commit/ed6c6d7eefbc78fa72e4e60ad6edaa3ee2acc715) ([#12865](https://github.com/yt-dlp/yt-dlp/issues/12865)) by [leeblackc](https://github.com/leeblackc) + - [Add warning on video captcha challenge](https://github.com/yt-dlp/yt-dlp/commit/f484c51599a6cd01eb078ea7dc9bbba942967774) ([#12939](https://github.com/yt-dlp/yt-dlp/issues/12939)) by [coletdjnz](https://github.com/coletdjnz) + - [Cache signature timestamps](https://github.com/yt-dlp/yt-dlp/commit/61c9a938b390b8334ee3a879fe2d93f714e30138) ([#13047](https://github.com/yt-dlp/yt-dlp/issues/13047)) by [bashonly](https://github.com/bashonly) + - [Detect and warn when account cookies are rotated](https://github.com/yt-dlp/yt-dlp/commit/8cb08028f5be2acb9835ce1670b196b9b077052f) ([#13014](https://github.com/yt-dlp/yt-dlp/issues/13014)) by [coletdjnz](https://github.com/coletdjnz) + - [Detect player JS variants for any locale](https://github.com/yt-dlp/yt-dlp/commit/c2d6659d1069f8cff97e1fd61d1c59e949e1e63d) ([#13003](https://github.com/yt-dlp/yt-dlp/issues/13003)) by [bashonly](https://github.com/bashonly) + - [Do not strictly deprioritize `missing_pot` formats](https://github.com/yt-dlp/yt-dlp/commit/74fc2ae12c24eb6b4e02c6360c89bd05f3c8f740) ([#13061](https://github.com/yt-dlp/yt-dlp/issues/13061)) by [bashonly](https://github.com/bashonly) + - [Improve warning for SABR-only/SSAP player responses](https://github.com/yt-dlp/yt-dlp/commit/fd8394bc50301ac5e930aa65aa71ab1b8372b8ab) ([#13049](https://github.com/yt-dlp/yt-dlp/issues/13049)) by [bashonly](https://github.com/bashonly) + - tab: [Extract continuation from empty page](https://github.com/yt-dlp/yt-dlp/commit/72ba4879304c2082fecbb472e6cc05ee2d154a3b) ([#12938](https://github.com/yt-dlp/yt-dlp/issues/12938)) by [coletdjnz](https://github.com/coletdjnz) +- **zdf**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/7be14109a6bd493a2e881da4f9e30adaf3e7e5d5) ([#12779](https://github.com/yt-dlp/yt-dlp/issues/12779)) by [bashonly](https://github.com/bashonly), [InvalidUsernameException](https://github.com/InvalidUsernameException) + +#### Downloader changes +- **niconicodmc**: [Remove downloader](https://github.com/yt-dlp/yt-dlp/commit/8d127b18f81131453eaba05d3bb810d9b73adb75) ([#12916](https://github.com/yt-dlp/yt-dlp/issues/12916)) by [doe1080](https://github.com/doe1080) + +#### Networking changes +- [Add PATCH request shortcut](https://github.com/yt-dlp/yt-dlp/commit/ceab4d5ed63a1f135a1816fe967c9d9a1ec7e6e8) ([#12884](https://github.com/yt-dlp/yt-dlp/issues/12884)) by [doe1080](https://github.com/doe1080) + +#### Misc. changes +- **ci**: [Add file mode test to code check](https://github.com/yt-dlp/yt-dlp/commit/3690e91265d1d0bbeffaf6a9b8cc9baded1367bd) ([#13036](https://github.com/yt-dlp/yt-dlp/issues/13036)) by [Grub4K](https://github.com/Grub4K) +- **cleanup**: Miscellaneous: [505b400](https://github.com/yt-dlp/yt-dlp/commit/505b400795af557bdcfd9d4fa7e9133b26ef431c) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + ### 2025.03.31 #### Core changes diff --git a/README.md b/README.md index c0329f539..6e2dc6243 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ * [Post-processing Options](#post-processing-options) * [SponsorBlock Options](#sponsorblock-options) * [Extractor Options](#extractor-options) + * [Preset Aliases](#preset-aliases) * [CONFIGURATION](#configuration) * [Configuration file encoding](#configuration-file-encoding) * [Authentication with netrc](#authentication-with-netrc) @@ -348,8 +349,8 @@ ## General Options: --no-flat-playlist Fully extract the videos of a playlist (default) --live-from-start Download livestreams from the start. - Currently only supported for YouTube - (Experimental) + Currently experimental and only supported + for YouTube and Twitch --no-live-from-start Download livestreams from the current time (default) --wait-for-video MIN[-MAX] Wait for scheduled streams to become @@ -375,17 +376,23 @@ ## General Options: an alias starts with a dash "-", it is prefixed with "--". Arguments are parsed according to the Python string formatting - mini-language. E.g. --alias get-audio,-X - "-S=aext:{0},abr -x --audio-format {0}" - creates options "--get-audio" and "-X" that - takes an argument (ARG0) and expands to - "-S=aext:ARG0,abr -x --audio-format ARG0". - All defined aliases are listed in the --help + mini-language. E.g. --alias get-audio,-X "-S + aext:{0},abr -x --audio-format {0}" creates + options "--get-audio" and "-X" that takes an + argument (ARG0) and expands to "-S + aext:ARG0,abr -x --audio-format ARG0". All + defined aliases are listed in the --help output. Alias options can trigger more aliases; so be careful to avoid defining recursive options. As a safety measure, each alias may be triggered a maximum of 100 times. This option can be used multiple times + -t, --preset-alias PRESET Applies a predefined set of options. e.g. + --preset-alias mp3. The following presets + are available: mp3, aac, mp4, mkv, sleep. + See the "Preset Aliases" section at the end + for more info. This option can be used + multiple times ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. To @@ -1098,6 +1105,27 @@ ## Extractor Options: can use this option multiple times to give arguments for different extractors +## Preset Aliases: +Predefined aliases for convenience and ease of use. Note that future + versions of yt-dlp may add or adjust presets, but the existing preset + names will not be changed or removed + + -t mp3 -f 'ba[acodec^=mp3]/ba/b' -x --audio-format + mp3 + + -t aac -f + 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b' + -x --audio-format aac + + -t mp4 --merge-output-format mp4 --remux-video mp4 + -S vcodec:h264,lang,quality,res,fps,hdr:12,a + codec:aac + + -t mkv --merge-output-format mkv --remux-video mkv + + -t sleep --sleep-subtitles 5 --sleep-requests 0.75 + --sleep-interval 10 --max-sleep-interval 20 + # CONFIGURATION You can configure yt-dlp by placing any supported command line option in a configuration file. The configuration is loaded from the following locations: @@ -1769,9 +1797,10 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` -* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details +* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` +* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. +* `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, and can be selected with `actual` * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total @@ -1781,8 +1810,12 @@ #### youtube * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) -* `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request) -* `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, and can be selected with `actual` +* `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be any of `gvs` (Google Video Server URLs), `player` (Innertube player request) or `subs` (Subtitles) +* `pot_trace`: Enable debug logging for PO Token fetching. Either `true` or `false` (default) +* `fetch_pot`: Policy to use for fetching a PO Token from providers. One of `always` (always try fetch a PO Token regardless if the client requires one for the given context), `never` (never fetch a PO Token), or `auto` (default; only fetch a PO Token if the client requires one for the given context) + +#### youtubepot-webpo +* `bind_to_visitor_id`: Whether to use the Visitor ID instead of Visitor Data for caching WebPO tokens. Either `true` (default) or `false` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) @@ -1799,9 +1832,6 @@ #### generic #### vikichannel * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` -#### niconico -* `segment_duration`: Segment duration in milliseconds for HLS-DMC formats. Use it at your own risk since this feature **may result in your account termination.** - #### youtubewebarchive * `check_all`: Try to check more at the cost of more requests. One or more of `thumbnails`, `captures` @@ -2153,7 +2183,7 @@ ### New features * **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will be now preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples)) -* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. +* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. * **YouTube improvements**: * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 8aa7b7e2b..269de2c68 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -245,5 +245,14 @@ "when": "76ac023ff02f06e8c003d104f02a03deeddebdcd", "short": "[ie/youtube:tab] Improve shorts title extraction (#11997)", "authors": ["bashonly", "d3d9"] + }, + { + "action": "add", + "when": "88eb1e7a9a2720ac89d653c0d0e40292388823bb", + "short": "[priority] **New option `--preset-alias`/`-t` has been added**\nThis provides convenient predefined aliases for common use cases. Available presets include `mp4`, `mp3`, `mkv`, `aac`, and `sleep`. See [the README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#preset-aliases) for more details." + }, + { + "action": "remove", + "when": "d596824c2f8428362c072518856065070616e348" } ] diff --git a/pyproject.toml b/pyproject.toml index 5e987a6fd..7accaeeb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ test = [ "pytest-rerunfailures~=14.0", ] pyinstaller = [ - "pyinstaller>=6.11.1", # Windows temp cleanup fixed in 6.11.1 + "pyinstaller>=6.13.0", # Windows temp cleanup fixed in 6.13.0 ] [project.urls] diff --git a/supportedsites.md b/supportedsites.md index 44c6ef5a1..c2d7b4555 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -246,7 +246,6 @@ # Supported sites - **Canalplus**: mycanal.fr and piwiplus.fr - **Canalsurmas** - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** - **cbc.ca:​player:playlist** @@ -394,6 +393,8 @@ # Supported sites - **dvtv**: http://video.aktualne.cz/ - **dw**: (**Currently broken**) - **dw:article**: (**Currently broken**) + - **dzen.ru**: Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen) + - **dzen.ru:channel** - **EaglePlatform** - **EbaumsWorld** - **Ebay** @@ -634,6 +635,7 @@ # Supported sites - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV + - **Ivoox** - **IVXPlayer** - **iwara**: [*iwara*](## "netrc machine") - **iwara:playlist**: [*iwara*](## "netrc machine") @@ -646,7 +648,10 @@ # Supported sites - **jiocinema**: [*jiocinema*](## "netrc machine") - **jiocinema:series**: [*jiocinema*](## "netrc machine") - **jiosaavn:album** + - **jiosaavn:artist** - **jiosaavn:playlist** + - **jiosaavn:show** + - **jiosaavn:​show:playlist** - **jiosaavn:song** - **Joj** - **JoqrAg**: 超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR) @@ -671,6 +676,7 @@ # Supported sites - **Kicker** - **KickStarter** - **Kika**: KiKA.de + - **KikaPlaylist** - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -723,6 +729,7 @@ # Supported sites - **limelight:channel** - **limelight:channel_list** - **LinkedIn**: [*linkedin*](## "netrc machine") + - **linkedin:events**: [*linkedin*](## "netrc machine") - **linkedin:learning**: [*linkedin*](## "netrc machine") - **linkedin:​learning:course**: [*linkedin*](## "netrc machine") - **Liputan6** @@ -738,6 +745,7 @@ # Supported sites - **loom** - **loom:folder** - **LoveHomePorn** + - **LRTRadio** - **LRTStream** - **LRTVOD** - **LSMLREmbed** @@ -759,7 +767,7 @@ # Supported sites - **ManotoTV**: Manoto TV (Episode) - **ManotoTVLive**: Manoto TV (Live) - **ManotoTVShow**: Manoto TV (Show) - - **ManyVids**: (**Currently broken**) + - **ManyVids** - **MaoriTV** - **Markiza**: (**Currently broken**) - **MarkizaPage**: (**Currently broken**) @@ -946,7 +954,7 @@ # Supported sites - **nickelodeonru** - **niconico**: [*niconico*](## "netrc machine") ニコニコ動画 - **niconico:history**: NicoNico user history or likes. Requires cookies. - - **niconico:live**: ニコニコ生放送 + - **niconico:live**: [*niconico*](## "netrc machine") ニコニコ生放送 - **niconico:playlist** - **niconico:series** - **niconico:tag**: NicoNico video tag URLs @@ -1053,6 +1061,8 @@ # Supported sites - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview**: (**Currently broken**) + - **parti:livestream** + - **parti:video** - **patreon** - **patreon:campaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) @@ -1073,8 +1083,8 @@ # Supported sites - **Photobucket** - **PiaLive** - **Piapro**: [*piapro*](## "netrc machine") - - **Picarto** - - **PicartoVod** + - **picarto** + - **picarto:vod** - **Piksel** - **Pinkbike** - **Pinterest** @@ -1227,6 +1237,7 @@ # Supported sites - **RoosterTeeth**: [*roosterteeth*](## "netrc machine") - **RoosterTeethSeries**: [*roosterteeth*](## "netrc machine") - **RottenTomatoes** + - **RoyaLive** - **Rozhlas** - **RozhlasVltava** - **RTBF**: [*rtbf*](## "netrc machine") (**Currently broken**) @@ -1247,9 +1258,8 @@ # Supported sites - **RTVCKaltura** - **RTVCPlay** - **RTVCPlayEmbed** - - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:alacarta**: RTVE a la carta and Play - **rtve.es:audio**: RTVE audio - - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - **rtvslo.si** @@ -1382,7 +1392,6 @@ # Supported sites - **Spreaker** - **SpreakerShow** - **SpringboardPlatform** - - **Sprout** - **SproutVideo** - **sr:mediathek**: Saarländischer Rundfunk (**Currently broken**) - **SRGSSR** @@ -1562,7 +1571,8 @@ # Supported sites - **tvp:​vod:series** - **TVPlayer** - **TVPlayHome** - - **Tvw** + - **tvw** + - **tvw:tvchannels** - **Tweakers** - **TwitCasting** - **TwitCastingLive** @@ -1647,6 +1657,7 @@ # Supported sites - **vimeo**: [*vimeo*](## "netrc machine") - **vimeo:album**: [*vimeo*](## "netrc machine") - **vimeo:channel**: [*vimeo*](## "netrc machine") + - **vimeo:event**: [*vimeo*](## "netrc machine") - **vimeo:group**: [*vimeo*](## "netrc machine") - **vimeo:likes**: [*vimeo*](## "netrc machine") Vimeo user likes - **vimeo:ondemand**: [*vimeo*](## "netrc machine") @@ -1821,14 +1832,12 @@ # Supported sites - **ZattooLive**: [*zattoo*](## "netrc machine") - **ZattooMovies**: [*zattoo*](## "netrc machine") - **ZattooRecordings**: [*zattoo*](## "netrc machine") - - **ZDF** - - **ZDFChannel** + - **zdf** + - **zdf:channel** - **Zee5**: [*zee5*](## "netrc machine") - **zee5:series** - **ZeeNews**: (**Currently broken**) - **ZenPorn** - - **ZenYandex** - - **ZenYandexChannel** - **ZetlandDKArticle** - **Zhihu** - **zingmp3**: zingmp3.vn diff --git a/test/helper.py b/test/helper.py index 4169af799..e4cb478e2 100644 --- a/test/helper.py +++ b/test/helper.py @@ -136,7 +136,7 @@ def _iter_differences(got, expected, field): return if op == 'startswith': - if not val.startswith(got): + if not got.startswith(val): yield field, f'should start with {val!r}, got {got!r}' return diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 708a04f92..91312e4e5 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1435,6 +1435,27 @@ def test_load_plugins_compat(self): FakeYDL().close() assert all_plugins_loaded.value + def test_close_hooks(self): + # Should call all registered close hooks on close + close_hook_called = False + close_hook_two_called = False + + def close_hook(): + nonlocal close_hook_called + close_hook_called = True + + def close_hook_two(): + nonlocal close_hook_two_called + close_hook_two_called = True + + ydl = FakeYDL() + ydl.add_close_hook(close_hook) + ydl.add_close_hook(close_hook_two) + + ydl.close() + self.assertTrue(close_hook_called, 'Close hook was not called') + self.assertTrue(close_hook_two_called, 'Close hook two was not called') + if __name__ == '__main__': unittest.main() diff --git a/test/test_cookies.py b/test/test_cookies.py index 4b9b9b5a9..f956ab187 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -58,6 +58,14 @@ def test_get_desktop_environment(self): ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'my_custom_de', 'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), + ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE3), ({'KDE_FULL_SESSION': 1, 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b14069ccc..2e3cdc2a5 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -478,6 +478,14 @@ def test_extract_function_with_global_stack(self): func = jsi.extract_function('c', {'e': 10}, {'f': 100, 'g': 1000}) self.assertEqual(func([1]), 1111) + def test_increment_decrement(self): + self._test('function f() { var x = 1; return ++x; }', 2) + self._test('function f() { var x = 1; return x++; }', 1) + self._test('function f() { var x = 1; x--; return x }', 0) + self._test('function f() { var y; var x = 1; x++, --x, x--, x--, y="z", "abc", x++; return --x }', -1) + self._test('function f() { var a = "test--"; return a; }', 'test--') + self._test('function f() { var b = 1; var a = "b--"; return a; }', 'b--') + if __name__ == '__main__': unittest.main() diff --git a/test/test_networking.py b/test/test_networking.py index 3ab60fe83..2f441fced 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -39,6 +39,7 @@ from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, @@ -1856,6 +1857,7 @@ def test_method(self): def test_request_helpers(self): assert HEADRequest('http://example.com').method == 'HEAD' + assert PATCHRequest('http://example.com').method == 'PATCH' assert PUTRequest('http://example.com').method == 'PUT' def test_headers(self): diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index 204fe87bd..a2feacba7 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -20,7 +20,6 @@ add_accept_encoding_header, get_redirect_method, make_socks_proxy_opts, - select_proxy, ssl_load_certs, ) from yt_dlp.networking.exceptions import ( @@ -28,7 +27,7 @@ IncompleteRead, ) from yt_dlp.socks import ProxyType -from yt_dlp.utils.networking import HTTPHeaderDict +from yt_dlp.utils.networking import HTTPHeaderDict, select_proxy TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/test_pot/conftest.py b/test/test_pot/conftest.py new file mode 100644 index 000000000..ff0667e92 --- /dev/null +++ b/test/test_pot/conftest.py @@ -0,0 +1,71 @@ +import collections + +import pytest + +from yt_dlp import YoutubeDL +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.extractor.youtube.pot._provider import IEContentProviderLogger +from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest, PoTokenContext +from yt_dlp.utils.networking import HTTPHeaderDict + + +class MockLogger(IEContentProviderLogger): + + log_level = IEContentProviderLogger.LogLevel.TRACE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.messages = collections.defaultdict(list) + + def trace(self, message: str): + self.messages['trace'].append(message) + + def debug(self, message: str): + self.messages['debug'].append(message) + + def info(self, message: str): + self.messages['info'].append(message) + + def warning(self, message: str, *, once=False): + self.messages['warning'].append(message) + + def error(self, message: str): + self.messages['error'].append(message) + + +@pytest.fixture +def ie() -> InfoExtractor: + ydl = YoutubeDL() + return ydl.get_info_extractor('Youtube') + + +@pytest.fixture +def logger() -> MockLogger: + return MockLogger() + + +@pytest.fixture() +def pot_request() -> PoTokenRequest: + return PoTokenRequest( + context=PoTokenContext.GVS, + innertube_context={'client': {'clientName': 'WEB'}}, + innertube_host='youtube.com', + session_index=None, + player_url=None, + is_authenticated=False, + video_webpage=None, + + visitor_data='example-visitor-data', + data_sync_id='example-data-sync-id', + video_id='example-video-id', + + request_cookiejar=YoutubeDLCookieJar(), + request_proxy=None, + request_headers=HTTPHeaderDict(), + request_timeout=None, + request_source_address=None, + request_verify_tls=True, + + bypass_cache=False, + ) diff --git a/test/test_pot/test_pot_builtin_memorycache.py b/test/test_pot/test_pot_builtin_memorycache.py new file mode 100644 index 000000000..ea19fbe29 --- /dev/null +++ b/test/test_pot/test_pot_builtin_memorycache.py @@ -0,0 +1,117 @@ +import threading +import time +from collections import OrderedDict +import pytest +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider, BuiltinIEContentProvider +from yt_dlp.utils import bug_reports_message +from yt_dlp.extractor.youtube.pot._builtin.memory_cache import MemoryLRUPCP, memorylru_preference, initialize_global_cache +from yt_dlp.version import __version__ +from yt_dlp.extractor.youtube.pot._registry import _pot_cache_providers, _pot_memory_cache + + +class TestMemoryLRUPCS: + + def test_base_type(self): + assert issubclass(MemoryLRUPCP, IEContentProvider) + assert issubclass(MemoryLRUPCP, BuiltinIEContentProvider) + + @pytest.fixture + def pcp(self, ie, logger) -> MemoryLRUPCP: + return MemoryLRUPCP(ie, logger, {}, initialize_cache=lambda max_size: (OrderedDict(), threading.Lock(), max_size)) + + def test_is_registered(self): + assert _pot_cache_providers.value.get('MemoryLRU') == MemoryLRUPCP + + def test_initialization(self, pcp): + assert pcp.PROVIDER_NAME == 'memory' + assert pcp.PROVIDER_VERSION == __version__ + assert pcp.BUG_REPORT_MESSAGE == bug_reports_message(before='') + assert pcp.is_available() + + def test_store_and_get(self, pcp): + pcp.store('key1', 'value1', int(time.time()) + 60) + assert pcp.get('key1') == 'value1' + assert len(pcp.cache) == 1 + + def test_store_ignore_expired(self, pcp): + pcp.store('key1', 'value1', int(time.time()) - 1) + assert len(pcp.cache) == 0 + assert pcp.get('key1') is None + assert len(pcp.cache) == 0 + + def test_store_override_existing_key(self, ie, logger): + MAX_SIZE = 2 + pcp = MemoryLRUPCP(ie, logger, {}, initialize_cache=lambda max_size: (OrderedDict(), threading.Lock(), MAX_SIZE)) + pcp.store('key1', 'value1', int(time.time()) + 60) + pcp.store('key2', 'value2', int(time.time()) + 60) + assert len(pcp.cache) == 2 + pcp.store('key1', 'value2', int(time.time()) + 60) + # Ensure that the override key gets added to the end of the cache instead of in the same position + pcp.store('key3', 'value3', int(time.time()) + 60) + assert pcp.get('key1') == 'value2' + + def test_store_ignore_expired_existing_key(self, pcp): + pcp.store('key1', 'value2', int(time.time()) + 60) + pcp.store('key1', 'value1', int(time.time()) - 1) + assert len(pcp.cache) == 1 + assert pcp.get('key1') == 'value2' + assert len(pcp.cache) == 1 + + def test_get_key_expired(self, pcp): + pcp.store('key1', 'value1', int(time.time()) + 60) + assert pcp.get('key1') == 'value1' + assert len(pcp.cache) == 1 + pcp.cache['key1'] = ('value1', int(time.time()) - 1) + assert pcp.get('key1') is None + assert len(pcp.cache) == 0 + + def test_lru_eviction(self, ie, logger): + MAX_SIZE = 2 + provider = MemoryLRUPCP(ie, logger, {}, initialize_cache=lambda max_size: (OrderedDict(), threading.Lock(), MAX_SIZE)) + provider.store('key1', 'value1', int(time.time()) + 5) + provider.store('key2', 'value2', int(time.time()) + 5) + assert len(provider.cache) == 2 + + assert provider.get('key1') == 'value1' + + provider.store('key3', 'value3', int(time.time()) + 5) + assert len(provider.cache) == 2 + + assert provider.get('key2') is None + + provider.store('key4', 'value4', int(time.time()) + 5) + assert len(provider.cache) == 2 + + assert provider.get('key1') is None + assert provider.get('key3') == 'value3' + assert provider.get('key4') == 'value4' + + def test_delete(self, pcp): + pcp.store('key1', 'value1', int(time.time()) + 5) + assert len(pcp.cache) == 1 + assert pcp.get('key1') == 'value1' + pcp.delete('key1') + assert len(pcp.cache) == 0 + assert pcp.get('key1') is None + + def test_use_global_cache_default(self, ie, logger): + pcp = MemoryLRUPCP(ie, logger, {}) + assert pcp.max_size == _pot_memory_cache.value['max_size'] == 25 + assert pcp.cache is _pot_memory_cache.value['cache'] + assert pcp.lock is _pot_memory_cache.value['lock'] + + pcp2 = MemoryLRUPCP(ie, logger, {}) + assert pcp.max_size == pcp2.max_size == _pot_memory_cache.value['max_size'] == 25 + assert pcp.cache is pcp2.cache is _pot_memory_cache.value['cache'] + assert pcp.lock is pcp2.lock is _pot_memory_cache.value['lock'] + + def test_fail_max_size_change_global(self, ie, logger): + pcp = MemoryLRUPCP(ie, logger, {}) + assert pcp.max_size == _pot_memory_cache.value['max_size'] == 25 + with pytest.raises(ValueError, match='Cannot change max_size of initialized global memory cache'): + initialize_global_cache(50) + + assert pcp.max_size == _pot_memory_cache.value['max_size'] == 25 + + def test_memory_lru_preference(self, pcp, ie, pot_request): + assert memorylru_preference(pcp, pot_request) == 10000 diff --git a/test/test_pot/test_pot_builtin_utils.py b/test/test_pot/test_pot_builtin_utils.py new file mode 100644 index 000000000..a95fc4e15 --- /dev/null +++ b/test/test_pot/test_pot_builtin_utils.py @@ -0,0 +1,47 @@ +import pytest +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenContext, + +) + +from yt_dlp.extractor.youtube.pot.utils import get_webpo_content_binding, ContentBindingType + + +class TestGetWebPoContentBinding: + + @pytest.mark.parametrize('client_name, context, is_authenticated, expected', [ + *[(client, context, is_authenticated, expected) for client in [ + 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER'] + for context, is_authenticated, expected in [ + (PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), + (PoTokenContext.PLAYER, False, ('example-video-id', ContentBindingType.VIDEO_ID)), + (PoTokenContext.SUBS, False, ('example-video-id', ContentBindingType.VIDEO_ID)), + (PoTokenContext.GVS, True, ('example-data-sync-id', ContentBindingType.DATASYNC_ID)), + ]], + ('WEB_REMIX', PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), + ('WEB_REMIX', PoTokenContext.PLAYER, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), + ('ANDROID', PoTokenContext.GVS, False, (None, None)), + ('IOS', PoTokenContext.GVS, False, (None, None)), + ]) + def test_get_webpo_content_binding(self, pot_request, client_name, context, is_authenticated, expected): + pot_request.innertube_context['client']['clientName'] = client_name + pot_request.context = context + pot_request.is_authenticated = is_authenticated + assert get_webpo_content_binding(pot_request) == expected + + def test_extract_visitor_id(self, pot_request): + pot_request.visitor_data = 'CgsxMjNhYmNYWVpfLSiA4s%2DqBg%3D%3D' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == ('123abcXYZ_-', ContentBindingType.VISITOR_ID) + + def test_invalid_visitor_id(self, pot_request): + # visitor id not alphanumeric (i.e. protobuf extraction failed) + pot_request.visitor_data = 'CggxMjM0NTY3OCiA4s-qBg%3D%3D' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) + + def test_no_visitor_id(self, pot_request): + pot_request.visitor_data = 'KIDiz6oG' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) + + def test_invalid_base64(self, pot_request): + pot_request.visitor_data = 'invalid-base64' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) diff --git a/test/test_pot/test_pot_builtin_webpospec.py b/test/test_pot/test_pot_builtin_webpospec.py new file mode 100644 index 000000000..c5fb6f382 --- /dev/null +++ b/test/test_pot/test_pot_builtin_webpospec.py @@ -0,0 +1,92 @@ +import pytest + +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider, BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot.cache import CacheProviderWritePolicy +from yt_dlp.utils import bug_reports_message +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + +) +from yt_dlp.version import __version__ + +from yt_dlp.extractor.youtube.pot._builtin.webpo_cachespec import WebPoPCSP +from yt_dlp.extractor.youtube.pot._registry import _pot_pcs_providers + + +@pytest.fixture() +def pot_request(pot_request) -> PoTokenRequest: + pot_request.visitor_data = 'CgsxMjNhYmNYWVpfLSiA4s%2DqBg%3D%3D' # visitor_id=123abcXYZ_- + return pot_request + + +class TestWebPoPCSP: + def test_base_type(self): + assert issubclass(WebPoPCSP, IEContentProvider) + assert issubclass(WebPoPCSP, BuiltinIEContentProvider) + + def test_init(self, ie, logger): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + assert pcs.PROVIDER_NAME == 'webpo' + assert pcs.PROVIDER_VERSION == __version__ + assert pcs.BUG_REPORT_MESSAGE == bug_reports_message(before='') + assert pcs.is_available() + + def test_is_registered(self): + assert _pot_pcs_providers.value.get('WebPo') == WebPoPCSP + + @pytest.mark.parametrize('client_name, context, is_authenticated', [ + ('ANDROID', PoTokenContext.GVS, False), + ('IOS', PoTokenContext.GVS, False), + ('IOS', PoTokenContext.PLAYER, False), + ]) + def test_not_supports(self, ie, logger, pot_request, client_name, context, is_authenticated): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + pot_request.innertube_context['client']['clientName'] = client_name + pot_request.context = context + pot_request.is_authenticated = is_authenticated + assert pcs.generate_cache_spec(pot_request) is None + + @pytest.mark.parametrize('client_name, context, is_authenticated, remote_host, source_address, request_proxy, expected', [ + *[(client, context, is_authenticated, remote_host, source_address, request_proxy, expected) for client in [ + 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER'] + for context, is_authenticated, remote_host, source_address, request_proxy, expected in [ + (PoTokenContext.GVS, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id'}), + (PoTokenContext.PLAYER, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'video_id'}), + (PoTokenContext.GVS, True, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': 'example-data-sync-id', 'cbt': 'datasync_id'}), + ]], + ('WEB_REMIX', PoTokenContext.PLAYER, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id'}), + ('WEB', PoTokenContext.GVS, False, None, None, None, {'t': 'webpo', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id', 'ip': None, 'sa': None, 'px': None}), + ('TVHTML5', PoTokenContext.PLAYER, False, None, None, 'http://example.com', {'t': 'webpo', 'cb': '123abcXYZ_-', 'cbt': 'video_id', 'ip': None, 'sa': None, 'px': 'http://example.com'}), + + ]) + def test_generate_key_bindings(self, ie, logger, pot_request, client_name, context, is_authenticated, remote_host, source_address, request_proxy, expected): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + pot_request.innertube_context['client']['clientName'] = client_name + pot_request.context = context + pot_request.is_authenticated = is_authenticated + pot_request.innertube_context['client']['remoteHost'] = remote_host + pot_request.request_source_address = source_address + pot_request.request_proxy = request_proxy + pot_request.video_id = '123abcXYZ_-' # same as visitor id to test type + + assert pcs.generate_cache_spec(pot_request).key_bindings == expected + + def test_no_bind_visitor_id(self, ie, logger, pot_request): + # Should not bind to visitor id if setting is set to False + pcs = WebPoPCSP(ie=ie, logger=logger, settings={'bind_to_visitor_id': ['false']}) + pot_request.innertube_context['client']['clientName'] = 'WEB' + pot_request.context = PoTokenContext.GVS + pot_request.is_authenticated = False + assert pcs.generate_cache_spec(pot_request).key_bindings == {'t': 'webpo', 'ip': None, 'sa': None, 'px': None, 'cb': 'CgsxMjNhYmNYWVpfLSiA4s%2DqBg%3D%3D', 'cbt': 'visitor_data'} + + def test_default_ttl(self, ie, logger, pot_request): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + assert pcs.generate_cache_spec(pot_request).default_ttl == 6 * 60 * 60 # should default to 6 hours + + def test_write_policy(self, ie, logger, pot_request): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + pot_request.context = PoTokenContext.GVS + assert pcs.generate_cache_spec(pot_request).write_policy == CacheProviderWritePolicy.WRITE_ALL + pot_request.context = PoTokenContext.PLAYER + assert pcs.generate_cache_spec(pot_request).write_policy == CacheProviderWritePolicy.WRITE_FIRST diff --git a/test/test_pot/test_pot_director.py b/test/test_pot/test_pot_director.py new file mode 100644 index 000000000..bbfdd0e98 --- /dev/null +++ b/test/test_pot/test_pot_director.py @@ -0,0 +1,1529 @@ +from __future__ import annotations +import abc +import base64 +import dataclasses +import hashlib +import json +import time +import pytest + +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider, IEContentProvider + +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + PoTokenProviderError, + PoTokenProviderRejectedRequest, +) +from yt_dlp.extractor.youtube.pot._director import ( + PoTokenCache, + validate_cache_spec, + clean_pot, + validate_response, + PoTokenRequestDirector, + provider_display_list, +) + +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + PoTokenCacheProvider, + CacheProviderWritePolicy, + PoTokenCacheProviderError, +) + + +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenResponse, + PoTokenProvider, +) + + +class BaseMockPoTokenProvider(PoTokenProvider, abc.ABC): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.available_called_times = 0 + self.request_called_times = 0 + self.close_called = False + + def is_available(self) -> bool: + self.available_called_times += 1 + return True + + def request_pot(self, *args, **kwargs): + self.request_called_times += 1 + return super().request_pot(*args, **kwargs) + + def close(self): + self.close_called = True + super().close() + + +class ExamplePTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + _SUPPORTED_CLIENTS = ('WEB',) + _SUPPORTED_CONTEXTS = (PoTokenContext.GVS, ) + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + if request.data_sync_id == 'example': + return PoTokenResponse(request.video_id) + return PoTokenResponse(EXAMPLE_PO_TOKEN) + + +def success_ptp(response: PoTokenResponse | None = None, key: str | None = None): + class SuccessPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'success' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://success.example.com/issues' + + _SUPPORTED_CLIENTS = ('WEB',) + _SUPPORTED_CONTEXTS = (PoTokenContext.GVS,) + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + return response or PoTokenResponse(EXAMPLE_PO_TOKEN) + + if key: + SuccessPTP.PROVIDER_KEY = key + return SuccessPTP + + +@pytest.fixture +def pot_provider(ie, logger): + return success_ptp()(ie=ie, logger=logger, settings={}) + + +class UnavailablePTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'unavailable' + BUG_REPORT_LOCATION = 'https://unavailable.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def is_available(self) -> bool: + super().is_available() + return False + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderError('something went wrong') + + +class UnsupportedPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'unsupported' + BUG_REPORT_LOCATION = 'https://unsupported.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('unsupported request') + + +class ErrorPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'error' + BUG_REPORT_LOCATION = 'https://error.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + expected = request.video_id == 'expected' + raise PoTokenProviderError('an error occurred', expected=expected) + + +class UnexpectedErrorPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'unexpected_error' + BUG_REPORT_LOCATION = 'https://unexpected.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise ValueError('an unexpected error occurred') + + +class InvalidPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'invalid' + BUG_REPORT_LOCATION = 'https://invalid.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + if request.video_id == 'invalid_type': + return 'invalid-response' + else: + return PoTokenResponse('example-token?', expires_at='123') + + +class BaseMockCacheSpecProvider(PoTokenCacheSpecProvider, abc.ABC): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.generate_called_times = 0 + self.is_available_called_times = 0 + self.close_called = False + + def is_available(self) -> bool: + self.is_available_called_times += 1 + return super().is_available() + + def generate_cache_spec(self, request: PoTokenRequest): + self.generate_called_times += 1 + + def close(self): + self.close_called = True + super().close() + + +class ExampleCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=60, + ) + + +class UnavailableCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'unavailable' + PROVIDER_VERSION = '0.0.1' + + def is_available(self) -> bool: + super().is_available() + return False + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return None + + +class UnsupportedCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'unsupported' + PROVIDER_VERSION = '0.0.1' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return None + + +class InvalidSpecCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'invalid' + PROVIDER_VERSION = '0.0.1' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return 'invalid-spec' + + +class ErrorSpecCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'invalid' + PROVIDER_VERSION = '0.0.1' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + raise ValueError('something went wrong') + + +class BaseMockCacheProvider(PoTokenCacheProvider, abc.ABC): + BUG_REPORT_MESSAGE = 'example bug report message' + + def __init__(self, *args, available=True, **kwargs): + super().__init__(*args, **kwargs) + self.store_calls = 0 + self.delete_calls = 0 + self.get_calls = 0 + self.available_called_times = 0 + self.available = available + + def is_available(self) -> bool: + self.available_called_times += 1 + return self.available + + def store(self, *args, **kwargs): + self.store_calls += 1 + + def delete(self, *args, **kwargs): + self.delete_calls += 1 + + def get(self, *args, **kwargs): + self.get_calls += 1 + + def close(self): + self.close_called = True + super().close() + + +class ErrorPCP(BaseMockCacheProvider): + PROVIDER_NAME = 'error' + + def store(self, *args, **kwargs): + super().store(*args, **kwargs) + raise PoTokenCacheProviderError('something went wrong') + + def get(self, *args, **kwargs): + super().get(*args, **kwargs) + raise PoTokenCacheProviderError('something went wrong') + + +class UnexpectedErrorPCP(BaseMockCacheProvider): + PROVIDER_NAME = 'unexpected_error' + + def store(self, *args, **kwargs): + super().store(*args, **kwargs) + raise ValueError('something went wrong') + + def get(self, *args, **kwargs): + super().get(*args, **kwargs) + raise ValueError('something went wrong') + + +class MockMemoryPCP(BaseMockCacheProvider): + PROVIDER_NAME = 'memory' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cache = {} + + def store(self, key, value, expires_at): + super().store(key, value, expires_at) + self.cache[key] = (value, expires_at) + + def delete(self, key): + super().delete(key) + self.cache.pop(key, None) + + def get(self, key): + super().get(key) + return self.cache.get(key, [None])[0] + + +def create_memory_pcp(ie, logger, provider_key='memory', provider_name='memory', available=True): + cache = MockMemoryPCP(ie, logger, {}, available=available) + cache.PROVIDER_KEY = provider_key + cache.PROVIDER_NAME = provider_name + return cache + + +@pytest.fixture +def memorypcp(ie, logger) -> MockMemoryPCP: + return create_memory_pcp(ie, logger) + + +@pytest.fixture +def pot_cache(ie, logger): + class MockPoTokenCache(PoTokenCache): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.get_calls = 0 + self.store_calls = 0 + self.close_called = False + + def get(self, *args, **kwargs): + self.get_calls += 1 + return super().get(*args, **kwargs) + + def store(self, *args, **kwargs): + self.store_calls += 1 + return super().store(*args, **kwargs) + + def close(self): + self.close_called = True + super().close() + + return MockPoTokenCache( + cache_providers=[MockMemoryPCP(ie, logger, {})], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie, logger, settings={})], + logger=logger, + ) + + +EXAMPLE_PO_TOKEN = base64.urlsafe_b64encode(b'example-token').decode() + + +class TestPoTokenCache: + + def test_cache_success(self, memorypcp, pot_request, ie, logger): + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + + cached_response = cache.get(pot_request) + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert cache.get(dataclasses.replace(pot_request, video_id='another-video-id')) is None + + def test_unsupported_cache_spec_no_fallback(self, memorypcp, pot_request, ie, logger): + unsupported_provider = UnsupportedCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unsupported_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + assert cache.get(pot_request) is None + assert unsupported_provider.generate_called_times == 1 + cache.store(pot_request, response) + assert len(memorypcp.cache) == 0 + assert unsupported_provider.generate_called_times == 2 + assert cache.get(pot_request) is None + assert unsupported_provider.generate_called_times == 3 + assert len(logger.messages.get('error', [])) == 0 + + def test_unsupported_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + unsupported_provider = UnsupportedCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unsupported_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert unsupported_provider.generate_called_times == 1 + assert example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert unsupported_provider.generate_called_times == 2 + assert example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert unsupported_provider.generate_called_times == 3 + assert example_provider.generate_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert len(logger.messages.get('error', [])) == 0 + + def test_invalid_cache_spec_no_fallback(self, memorypcp, pot_request, ie, logger): + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[InvalidSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + + assert cache.get(pot_request) is None + + assert 'PoTokenCacheSpecProvider "InvalidSpecCacheSpecProvider" generate_cache_spec() returned invalid spec invalid-spec; please report this issue to the provider developer at (developer has not provided a bug report location) .' in logger.messages['error'] + + def test_invalid_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + + invalid_provider = InvalidSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[invalid_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert invalid_provider.generate_called_times == example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert invalid_provider.generate_called_times == example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert invalid_provider.generate_called_times == example_provider.generate_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert 'PoTokenCacheSpecProvider "InvalidSpecCacheSpecProvider" generate_cache_spec() returned invalid spec invalid-spec; please report this issue to the provider developer at (developer has not provided a bug report location) .' in logger.messages['error'] + + def test_unavailable_cache_spec_no_fallback(self, memorypcp, pot_request, ie, logger): + unavailable_provider = UnavailableCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unavailable_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert unavailable_provider.generate_called_times == 0 + + def test_unavailable_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + unavailable_provider = UnavailableCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unavailable_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert unavailable_provider.generate_called_times == 0 + assert unavailable_provider.is_available_called_times == 1 + assert example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert unavailable_provider.generate_called_times == 0 + assert unavailable_provider.is_available_called_times == 2 + assert example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert unavailable_provider.generate_called_times == 0 + assert unavailable_provider.is_available_called_times == 3 + assert example_provider.generate_called_times == 3 + assert example_provider.is_available_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + def test_unexpected_error_cache_spec(self, memorypcp, pot_request, ie, logger): + error_provider = ErrorSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[error_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert error_provider.generate_called_times == 3 + assert error_provider.is_available_called_times == 3 + + assert 'Error occurred with "invalid" PO Token cache spec provider: ValueError(\'something went wrong\'); please report this issue to the provider developer at (developer has not provided a bug report location) .' in logger.messages['error'] + + def test_unexpected_error_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + error_provider = ErrorSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[error_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert error_provider.generate_called_times == 1 + assert error_provider.is_available_called_times == 1 + assert example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert error_provider.generate_called_times == 2 + assert error_provider.is_available_called_times == 2 + assert example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert error_provider.generate_called_times == 3 + assert error_provider.is_available_called_times == 3 + assert example_provider.generate_called_times == 3 + assert example_provider.is_available_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert 'Error occurred with "invalid" PO Token cache spec provider: ValueError(\'something went wrong\'); please report this issue to the provider developer at (developer has not provided a bug report location) .' in logger.messages['error'] + + def test_key_bindings_spec_provider(self, memorypcp, pot_request, ie, logger): + + class ExampleProviderPCSP(PoTokenCacheSpecProvider): + PROVIDER_NAME = 'example' + + def generate_cache_spec(self, request: PoTokenRequest): + return PoTokenCacheSpec( + key_bindings={'v': request.video_id}, + default_ttl=60, + ) + + class ExampleProviderTwoPCSP(ExampleProviderPCSP): + pass + + example_provider = ExampleProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider_two = ExampleProviderTwoPCSP(ie=ie, logger=logger, settings={}) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[example_provider], + logger=logger, + ) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert len(memorypcp.cache) == 1 + assert hashlib.sha256( + f"{{'_dlp_cache': 'v1', '_p': 'ExampleProvider', 'v': '{pot_request.video_id}'}}".encode()).hexdigest() in memorypcp.cache + + # The second spec provider returns the exact same key bindings as the first one, + # however the PoTokenCache should use the provider key to differentiate between them + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[example_provider_two], + logger=logger, + ) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert len(memorypcp.cache) == 2 + assert hashlib.sha256( + f"{{'_dlp_cache': 'v1', '_p': 'ExampleProviderTwo', 'v': '{pot_request.video_id}'}}".encode()).hexdigest() in memorypcp.cache + + def test_cache_provider_preferences(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert len(pcp_one.cache) == 1 + assert len(pcp_two.cache) == 0 + + assert cache.get(pot_request) + assert pcp_one.get_calls == 1 + assert pcp_two.get_calls == 0 + + standard_preference_called = False + pcp_one_preference_claled = False + + def standard_preference(provider, request, *_, **__): + nonlocal standard_preference_called + standard_preference_called = True + assert isinstance(provider, PoTokenCacheProvider) + assert isinstance(request, PoTokenRequest) + return 1 + + def pcp_one_preference(provider, request, *_, **__): + nonlocal pcp_one_preference_claled + pcp_one_preference_claled = True + assert isinstance(provider, PoTokenCacheProvider) + assert isinstance(request, PoTokenRequest) + if provider.PROVIDER_KEY == pcp_one.PROVIDER_KEY: + return -100 + return 0 + + # test that it can hanldle multiple preferences + cache.cache_provider_preferences.append(standard_preference) + cache.cache_provider_preferences.append(pcp_one_preference) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert cache.get(pot_request) + assert len(pcp_one.cache) == len(pcp_two.cache) == 1 + assert pcp_two.get_calls == pcp_one.get_calls == 1 + assert pcp_one.store_calls == pcp_two.store_calls == 1 + assert standard_preference_called + assert pcp_one_preference_claled + + def test_secondary_cache_provider_hit(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + # Given the lower priority provider has the cache hit, store the response in the higher priority provider + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + assert cache.get(pot_request) + + cache.cache_providers[pcp_one.PROVIDER_KEY] = pcp_one + + def pcp_one_pref(provider, *_, **__): + if provider.PROVIDER_KEY == pcp_one.PROVIDER_KEY: + return 1 + return -1 + + cache.cache_provider_preferences.append(pcp_one_pref) + + assert cache.get(pot_request) + assert pcp_one.get_calls == 1 + assert pcp_two.get_calls == 2 + # Should write back to pcp_one (now the highest priority cache provider) + assert pcp_one.store_calls == pcp_two.store_calls == 1 + assert 'Writing PO Token response to highest priority cache provider' in logger.messages['trace'] + + def test_cache_provider_no_hits(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + assert cache.get(pot_request) is None + assert pcp_one.get_calls == pcp_two.get_calls == 1 + + def test_get_invalid_po_token_response(self, pot_request, ie, logger): + # Test various scenarios where the po token response stored in the cache provider is invalid + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + valid_response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, valid_response) + assert len(pcp_one.cache) == len(pcp_two.cache) == 1 + # Overwrite the valid response with an invalid one in the cache + pcp_one.store(next(iter(pcp_one.cache.keys())), json.dumps(dataclasses.asdict(PoTokenResponse(None))), int(time.time() + 1000)) + assert cache.get(pot_request).po_token == valid_response.po_token + assert pcp_one.get_calls == pcp_two.get_calls == 1 + assert pcp_one.delete_calls == 1 # Invalid response should be deleted from cache + assert pcp_one.store_calls == 3 # Since response was fetched from second cache provider, it should be stored in the first one + assert len(pcp_one.cache) == 1 + assert 'Invalid PO Token response retrieved from cache provider "memory": {"po_token": null, "expires_at": null}; example bug report message' in logger.messages['error'] + + # Overwrite the valid response with an invalid json in the cache + pcp_one.store(next(iter(pcp_one.cache.keys())), 'invalid-json', int(time.time() + 1000)) + assert cache.get(pot_request).po_token == valid_response.po_token + assert pcp_one.get_calls == pcp_two.get_calls == 2 + assert pcp_one.delete_calls == 2 + assert pcp_one.store_calls == 5 # 3 + 1 store we made in the test + 1 store from lower priority cache provider + assert len(pcp_one.cache) == 1 + + assert 'Invalid PO Token response retrieved from cache provider "memory": invalid-json; example bug report message' in logger.messages['error'] + + # Valid json, but missing required fields + pcp_one.store(next(iter(pcp_one.cache.keys())), '{"unknown_param": 0}', int(time.time() + 1000)) + assert cache.get(pot_request).po_token == valid_response.po_token + assert pcp_one.get_calls == pcp_two.get_calls == 3 + assert pcp_one.delete_calls == 3 + assert pcp_one.store_calls == 7 # 5 + 1 store from test + 1 store from lower priority cache provider + assert len(pcp_one.cache) == 1 + + assert 'Invalid PO Token response retrieved from cache provider "memory": {"unknown_param": 0}; example bug report message' in logger.messages['error'] + + def test_store_invalid_po_token_response(self, pot_request, ie, logger): + # Should not store an invalid po token response + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + + cache = PoTokenCache( + cache_providers=[pcp_one], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(po_token=EXAMPLE_PO_TOKEN, expires_at=80)) + assert cache.get(pot_request) is None + assert pcp_one.store_calls == 0 + assert 'Invalid PO Token response provided to PoTokenCache.store()' in logger.messages['error'][0] + + def test_store_write_policy(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert pcp_one.store_calls == 1 + assert pcp_two.store_calls == 0 + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_ALL) + assert pcp_one.store_calls == 2 + assert pcp_two.store_calls == 1 + + def test_store_write_first_policy_cache_spec(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + class WriteFirstPCSP(BaseMockCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=60, + write_policy=CacheProviderWritePolicy.WRITE_FIRST, + ) + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[WriteFirstPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + assert pcp_one.store_calls == 1 + assert pcp_two.store_calls == 0 + + def test_store_write_all_policy_cache_spec(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + class WriteAllPCSP(BaseMockCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=60, + write_policy=CacheProviderWritePolicy.WRITE_ALL, + ) + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[WriteAllPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + assert pcp_one.store_calls == 1 + assert pcp_two.store_calls == 1 + + def test_expires_at_pot_response(self, pot_request, memorypcp, ie, logger): + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=10000000000) + cache.store(pot_request, response) + assert next(iter(memorypcp.cache.values()))[1] == 10000000000 + + def test_expires_at_default_spec(self, pot_request, memorypcp, ie, logger): + + class TtlPCSP(BaseMockCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=10000000000, + ) + + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[TtlPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert next(iter(memorypcp.cache.values()))[1] >= 10000000000 + + def test_cache_provider_error_no_fallback(self, pot_request, ie, logger): + error_pcp = ErrorPCP(ie, logger, {}) + cache = PoTokenCache( + cache_providers=[error_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 1 + + assert logger.messages['warning'].count("Error from \"error\" PO Token cache provider: PoTokenCacheProviderError('something went wrong'); example bug report message") == 2 + + def test_cache_provider_error_fallback(self, pot_request, ie, logger): + error_pcp = ErrorPCP(ie, logger, {}) + memory_pcp = create_memory_pcp(ie, logger, provider_key='memory') + + cache = PoTokenCache( + cache_providers=[error_pcp, memory_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + + # 1. Store fails for error_pcp, stored in memory_pcp + # 2. Get fails for error_pcp, fetched from memory_pcp + # 3. Since fetched from lower priority, it should be stored in the highest priority cache provider + # 4. Store fails in error_pcp. Since write policy is WRITE_FIRST, it should not try to store in memory_pcp regardless of if the store in error_pcp fails + + assert cache.get(pot_request) + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 2 # since highest priority, when fetched from lower priority, it should be stored in the highest priority cache provider + assert memory_pcp.get_calls == 1 + assert memory_pcp.store_calls == 1 + + assert logger.messages['warning'].count("Error from \"error\" PO Token cache provider: PoTokenCacheProviderError('something went wrong'); example bug report message") == 3 + + def test_cache_provider_unexpected_error_no_fallback(self, pot_request, ie, logger): + error_pcp = UnexpectedErrorPCP(ie, logger, {}) + cache = PoTokenCache( + cache_providers=[error_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 1 + + assert logger.messages['error'].count("Error occurred with \"unexpected_error\" PO Token cache provider: ValueError('something went wrong'); example bug report message") == 2 + + def test_cache_provider_unexpected_error_fallback(self, pot_request, ie, logger): + error_pcp = UnexpectedErrorPCP(ie, logger, {}) + memory_pcp = create_memory_pcp(ie, logger, provider_key='memory') + + cache = PoTokenCache( + cache_providers=[error_pcp, memory_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + + # 1. Store fails for error_pcp, stored in memory_pcp + # 2. Get fails for error_pcp, fetched from memory_pcp + # 3. Since fetched from lower priority, it should be stored in the highest priority cache provider + # 4. Store fails in error_pcp. Since write policy is WRITE_FIRST, it should not try to store in memory_pcp regardless of if the store in error_pcp fails + + assert cache.get(pot_request) + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 2 # since highest priority, when fetched from lower priority, it should be stored in the highest priority cache provider + assert memory_pcp.get_calls == 1 + assert memory_pcp.store_calls == 1 + + assert logger.messages['error'].count("Error occurred with \"unexpected_error\" PO Token cache provider: ValueError('something went wrong'); example bug report message") == 3 + + def test_cache_provider_unavailable_no_fallback(self, pot_request, ie, logger): + provider = create_memory_pcp(ie, logger, available=False) + + cache = PoTokenCache( + cache_providers=[provider], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert provider.get_calls == 0 + assert provider.store_calls == 0 + assert provider.available_called_times + + def test_cache_provider_unavailable_fallback(self, pot_request, ie, logger): + provider_unavailable = create_memory_pcp(ie, logger, provider_key='unavailable', provider_name='unavailable', available=False) + provider_available = create_memory_pcp(ie, logger, provider_key='available', provider_name='available') + + cache = PoTokenCache( + cache_providers=[provider_unavailable, provider_available], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is not None + assert provider_unavailable.get_calls == 0 + assert provider_unavailable.store_calls == 0 + assert provider_available.get_calls == 1 + assert provider_available.store_calls == 1 + assert provider_unavailable.available_called_times + assert provider_available.available_called_times + + # should not even try to use the provider for the request + assert 'Attempting to fetch a PO Token response from "unavailable" provider' not in logger.messages['trace'] + assert 'Attempting to fetch a PO Token response from "available" provider' not in logger.messages['trace'] + + def test_available_not_called(self, ie, pot_request, logger): + # Test that the available method is not called when provider higher in the list is available + provider_unavailable = create_memory_pcp( + ie, logger, provider_key='unavailable', provider_name='unavailable', available=False) + provider_available = create_memory_pcp(ie, logger, provider_key='available', provider_name='available') + + logger.log_level = logger.LogLevel.INFO + + cache = PoTokenCache( + cache_providers=[provider_available, provider_unavailable], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response, write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert cache.get(pot_request) is not None + assert provider_unavailable.get_calls == 0 + assert provider_unavailable.store_calls == 0 + assert provider_available.get_calls == 1 + assert provider_available.store_calls == 1 + assert provider_unavailable.available_called_times == 0 + assert provider_available.available_called_times + assert 'PO Token Cache Providers: available-0.0.0 (external), unavailable-0.0.0 (external, unavailable)' not in logger.messages.get('trace', []) + + def test_available_called_trace(self, ie, pot_request, logger): + # But if logging level is trace should call available (as part of debug logging) + provider_unavailable = create_memory_pcp( + ie, logger, provider_key='unavailable', provider_name='unavailable', available=False) + provider_available = create_memory_pcp(ie, logger, provider_key='available', provider_name='available') + + logger.log_level = logger.LogLevel.TRACE + + cache = PoTokenCache( + cache_providers=[provider_available, provider_unavailable], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response, write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert cache.get(pot_request) is not None + assert provider_unavailable.get_calls == 0 + assert provider_unavailable.store_calls == 0 + assert provider_available.get_calls == 1 + assert provider_available.store_calls == 1 + assert provider_unavailable.available_called_times + assert provider_available.available_called_times + assert 'PO Token Cache Providers: available-0.0.0 (external), unavailable-0.0.0 (external, unavailable)' in logger.messages.get('trace', []) + + def test_close(self, ie, pot_request, logger): + # Should call close on the cache providers and cache specs + memory_pcp = create_memory_pcp(ie, logger, provider_key='memory') + memory2_pcp = create_memory_pcp(ie, logger, provider_key='memory2') + + spec1 = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + spec2 = UnavailableCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + + cache = PoTokenCache( + cache_providers=[memory2_pcp, memory_pcp], + cache_spec_providers=[spec1, spec2], + logger=logger, + ) + + cache.close() + assert memory_pcp.close_called + assert memory2_pcp.close_called + assert spec1.close_called + assert spec2.close_called + + +class TestPoTokenRequestDirector: + + def test_request_pot_success(self, ie, pot_request, pot_cache, pot_provider, logger): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + + def test_request_and_cache(self, ie, pot_request, pot_cache, pot_provider, logger): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 1 + assert pot_cache.get_calls == 1 + assert pot_cache.store_calls == 1 + + # Second request, should be cached + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.get_calls == 2 + assert pot_cache.store_calls == 1 + assert pot_provider.request_called_times == 1 + + def test_bypass_cache(self, ie, pot_request, pot_cache, logger, pot_provider): + pot_request.bypass_cache = True + + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 1 + assert pot_cache.get_calls == 0 + assert pot_cache.store_calls == 1 + + # Second request, should not get from cache + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 2 + assert pot_cache.get_calls == 0 + assert pot_cache.store_calls == 2 + + # POT is still cached, should get from cache + pot_request.bypass_cache = False + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 2 + assert pot_cache.get_calls == 1 + assert pot_cache.store_calls == 2 + + def test_clean_pot_generate(self, ie, pot_request, pot_cache, logger): + # Token should be cleaned before returning + base_token = base64.urlsafe_b64encode(b'token').decode() + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(base_token + '?extra=params'))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == base_token + assert provider.request_called_times == 1 + + # Confirm the cleaned version was stored in the cache + cached_token = pot_cache.get(pot_request) + assert cached_token.po_token == base_token + + def test_clean_pot_cache(self, ie, pot_request, pot_cache, logger, pot_provider): + # Token retrieved from cache should be cleaned before returning + base_token = base64.urlsafe_b64encode(b'token').decode() + pot_cache.store(pot_request, PoTokenResponse(base_token + '?extra=params')) + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == base_token + assert pot_cache.get_calls == 1 + assert pot_provider.request_called_times == 0 + + def test_cache_expires_at_none(self, ie, pot_request, pot_cache, logger, pot_provider): + # Should cache if expires_at=None in the response + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=None))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.store_calls == 1 + assert pot_cache.get(pot_request).po_token == EXAMPLE_PO_TOKEN + + def test_cache_expires_at_positive(self, ie, pot_request, pot_cache, logger, pot_provider): + # Should cache if expires_at is a positive number in the response + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=99999999999))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.store_calls == 1 + assert pot_cache.get(pot_request).po_token == EXAMPLE_PO_TOKEN + + @pytest.mark.parametrize('expires_at', [0, -1]) + def test_not_cache_expires_at(self, ie, pot_request, pot_cache, logger, pot_provider, expires_at): + # Should not cache if expires_at <= 0 in the response + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=expires_at))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.store_calls == 0 + assert pot_cache.get(pot_request) is None + + def test_no_providers(self, ie, pot_request, pot_cache, logger): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + response = director.get_po_token(pot_request) + assert response is None + + def test_try_cache_no_providers(self, ie, pot_request, pot_cache, logger): + # Should still try the cache even if no providers are configured + pot_cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + + def test_close(self, ie, pot_request, pot_cache, pot_provider, logger): + # Should call close on the pot cache and any providers + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + + provider2 = UnavailablePTP(ie, logger, {}) + director.register_provider(pot_provider) + director.register_provider(provider2) + + director.close() + assert pot_provider.close_called + assert provider2.close_called + assert pot_cache.close_called + + def test_pot_provider_preferences(self, pot_request, pot_cache, ie, logger): + pot_request.bypass_cache = True + provider_two_pot = base64.urlsafe_b64encode(b'token2').decode() + + example_provider = success_ptp(response=PoTokenResponse(EXAMPLE_PO_TOKEN), key='exampleone')(ie, logger, settings={}) + example_provider_two = success_ptp(response=PoTokenResponse(provider_two_pot), key='exampletwo')(ie, logger, settings={}) + + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(example_provider) + director.register_provider(example_provider_two) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert example_provider.request_called_times == 1 + assert example_provider_two.request_called_times == 0 + + standard_preference_called = False + example_preference_called = False + + # Test that the provider preferences are respected + def standard_preference(provider, request, *_, **__): + nonlocal standard_preference_called + standard_preference_called = True + assert isinstance(provider, PoTokenProvider) + assert isinstance(request, PoTokenRequest) + return 1 + + def example_preference(provider, request, *_, **__): + nonlocal example_preference_called + example_preference_called = True + assert isinstance(provider, PoTokenProvider) + assert isinstance(request, PoTokenRequest) + if provider.PROVIDER_KEY == example_provider.PROVIDER_KEY: + return -100 + return 0 + + # test that it can handle multiple preferences + director.register_preference(example_preference) + director.register_preference(standard_preference) + + response = director.get_po_token(pot_request) + assert response == provider_two_pot + assert example_provider.request_called_times == 1 + assert example_provider_two.request_called_times == 1 + assert standard_preference_called + assert example_preference_called + + def test_unsupported_request_no_fallback(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnsupportedPTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + + def test_unsupported_request_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one does not support the request + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnsupportedPTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 1 + assert pot_provider.request_called_times == 1 + assert 'PO Token Provider "unsupported" rejected this request, trying next available provider. Reason: unsupported request' in logger.messages['trace'] + + def test_unavailable_request_no_fallback(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 0 + assert provider.available_called_times + + def test_unavailable_request_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one is unavailable + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 0 + assert provider.available_called_times + assert pot_provider.request_called_times == 1 + assert pot_provider.available_called_times + # should not even try use the provider for the request + assert 'Attempting to fetch a PO Token from "unavailable" provider' not in logger.messages['trace'] + assert 'Attempting to fetch a PO Token from "success" provider' in logger.messages['trace'] + + def test_available_not_called(self, ie, logger, pot_cache, pot_request, pot_provider): + # Test that the available method is not called when provider higher in the list is available + logger.log_level = logger.LogLevel.INFO + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(pot_provider) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 0 + assert provider.available_called_times == 0 + assert pot_provider.request_called_times == 1 + assert pot_provider.available_called_times == 2 + assert 'PO Token Providers: success-0.0.1 (external), unavailable-0.0.0 (external, unavailable)' not in logger.messages.get('trace', []) + + def test_available_called_trace(self, ie, logger, pot_cache, pot_request, pot_provider): + # But if logging level is trace should call available (as part of debug logging) + logger.log_level = logger.LogLevel.TRACE + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(pot_provider) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 0 + assert provider.available_called_times == 1 + assert pot_provider.request_called_times == 1 + assert pot_provider.available_called_times == 3 + assert 'PO Token Providers: success-0.0.1 (external), unavailable-0.0.0 (external, unavailable)' in logger.messages['trace'] + + def test_provider_error_no_fallback_unexpected(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = ErrorPTP(ie, logger, {}) + director.register_provider(provider) + pot_request.video_id = 'unexpected' + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Error fetching PO Token from \"error\" provider: PoTokenProviderError('an error occurred'); please report this issue to the provider developer at https://error.example.com/issues ." in logger.messages['warning'] + + def test_provider_error_no_fallback_expected(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = ErrorPTP(ie, logger, {}) + director.register_provider(provider) + pot_request.video_id = 'expected' + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Error fetching PO Token from \"error\" provider: PoTokenProviderError('an error occurred')" in logger.messages['warning'] + + def test_provider_error_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one raises an error + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = ErrorPTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 1 + assert pot_provider.request_called_times == 1 + assert "Error fetching PO Token from \"error\" provider: PoTokenProviderError('an error occurred'); please report this issue to the provider developer at https://error.example.com/issues ." in logger.messages['warning'] + + def test_provider_unexpected_error_no_fallback(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnexpectedErrorPTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Unexpected error when fetching PO Token from \"unexpected_error\" provider: ValueError('an unexpected error occurred'); please report this issue to the provider developer at https://unexpected.example.com/issues ." in logger.messages['error'] + + def test_provider_unexpected_error_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one raises an unexpected error + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnexpectedErrorPTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 1 + assert pot_provider.request_called_times == 1 + assert "Unexpected error when fetching PO Token from \"unexpected_error\" provider: ValueError('an unexpected error occurred'); please report this issue to the provider developer at https://unexpected.example.com/issues ." in logger.messages['error'] + + def test_invalid_po_token_response_type(self, ie, logger, pot_cache, pot_request, pot_provider): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = InvalidPTP(ie, logger, {}) + director.register_provider(provider) + + pot_request.video_id = 'invalid_type' + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert 'Invalid PO Token response received from "invalid" provider: invalid-response; please report this issue to the provider developer at https://invalid.example.com/issues .' in logger.messages['error'] + + # Should fallback to next available provider + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 2 + assert pot_provider.request_called_times == 1 + + def test_invalid_po_token_response(self, ie, logger, pot_cache, pot_request, pot_provider): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = InvalidPTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Invalid PO Token response received from \"invalid\" provider: PoTokenResponse(po_token='example-token?', expires_at='123'); please report this issue to the provider developer at https://invalid.example.com/issues ." in logger.messages['error'] + + # Should fallback to next available provider + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 2 + assert pot_provider.request_called_times == 1 + + def test_copy_request_provider(self, ie, logger, pot_cache, pot_request): + + class BadProviderPTP(BaseMockPoTokenProvider): + _SUPPORTED_CONTEXTS = None + _SUPPORTED_CLIENTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + # Providers should not modify the request object, but we should guard against it + request.video_id = 'bad' + raise PoTokenProviderRejectedRequest('bad request') + + class GoodProviderPTP(BaseMockPoTokenProvider): + _SUPPORTED_CONTEXTS = None + _SUPPORTED_CLIENTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + return PoTokenResponse(base64.urlsafe_b64encode(request.video_id.encode()).decode()) + + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + + bad_provider = BadProviderPTP(ie, logger, {}) + good_provider = GoodProviderPTP(ie, logger, {}) + + director.register_provider(bad_provider) + director.register_provider(good_provider) + + pot_request.video_id = 'good' + response = director.get_po_token(pot_request) + assert response == base64.urlsafe_b64encode(b'good').decode() + assert bad_provider.request_called_times == 1 + assert good_provider.request_called_times == 1 + assert pot_request.video_id == 'good' + + +@pytest.mark.parametrize('spec, expected', [ + (None, False), + (PoTokenCacheSpec(key_bindings={'v': 'video-id'}, default_ttl=60, write_policy=None), False), # type: ignore + (PoTokenCacheSpec(key_bindings={'v': 'video-id'}, default_ttl='invalid'), False), # type: ignore + (PoTokenCacheSpec(key_bindings='invalid', default_ttl=60), False), # type: ignore + (PoTokenCacheSpec(key_bindings={2: 'video-id'}, default_ttl=60), False), # type: ignore + (PoTokenCacheSpec(key_bindings={'v': 2}, default_ttl=60), False), # type: ignore + (PoTokenCacheSpec(key_bindings={'v': None}, default_ttl=60), False), # type: ignore + + (PoTokenCacheSpec(key_bindings={'v': 'video_id', 'e': None}, default_ttl=60), True), + (PoTokenCacheSpec(key_bindings={'v': 'video_id'}, default_ttl=60, write_policy=CacheProviderWritePolicy.WRITE_FIRST), True), +]) +def test_validate_cache_spec(spec, expected): + assert validate_cache_spec(spec) == expected + + +@pytest.mark.parametrize('po_token', [ + 'invalid-token?', + '123', +]) +def test_clean_pot_fail(po_token): + with pytest.raises(ValueError, match='Invalid PO Token'): + clean_pot(po_token) + + +@pytest.mark.parametrize('po_token,expected', [ + ('TwAA/+8=', 'TwAA_-8='), + ('TwAA%5F%2D9VA6Q92v%5FvEQ4==?extra-param=2', 'TwAA_-9VA6Q92v_vEQ4='), +]) +def test_clean_pot(po_token, expected): + assert clean_pot(po_token) == expected + + +@pytest.mark.parametrize( + 'response, expected', + [ + (None, False), + (PoTokenResponse(None), False), + (PoTokenResponse(1), False), + (PoTokenResponse('invalid-token?'), False), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at='abc'), False), # type: ignore + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=100), False), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=time.time() + 10000.0), False), # type: ignore + (PoTokenResponse(EXAMPLE_PO_TOKEN), True), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=-1), True), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=0), True), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=int(time.time()) + 10000), True), + ], +) +def test_validate_pot_response(response, expected): + assert validate_response(response) == expected + + +def test_built_in_provider(ie, logger): + class BuiltinProviderDefaultT(BuiltinIEContentProvider, suffix='T'): + def is_available(self): + return True + + class BuiltinProviderCustomNameT(BuiltinIEContentProvider, suffix='T'): + PROVIDER_NAME = 'CustomName' + + def is_available(self): + return True + + class ExternalProviderDefaultT(IEContentProvider, suffix='T'): + def is_available(self): + return True + + class ExternalProviderCustomT(IEContentProvider, suffix='T'): + PROVIDER_NAME = 'custom' + PROVIDER_VERSION = '5.4b2' + + def is_available(self): + return True + + class ExternalProviderUnavailableT(IEContentProvider, suffix='T'): + def is_available(self) -> bool: + return False + + class BuiltinProviderUnavailableT(IEContentProvider, suffix='T'): + def is_available(self) -> bool: + return False + + built_in_default = BuiltinProviderDefaultT(ie=ie, logger=logger, settings={}) + built_in_custom_name = BuiltinProviderCustomNameT(ie=ie, logger=logger, settings={}) + built_in_unavailable = BuiltinProviderUnavailableT(ie=ie, logger=logger, settings={}) + external_default = ExternalProviderDefaultT(ie=ie, logger=logger, settings={}) + external_custom = ExternalProviderCustomT(ie=ie, logger=logger, settings={}) + external_unavailable = ExternalProviderUnavailableT(ie=ie, logger=logger, settings={}) + + assert provider_display_list([]) == 'none' + assert provider_display_list([built_in_default]) == 'BuiltinProviderDefault' + assert provider_display_list([external_unavailable]) == 'ExternalProviderUnavailable-0.0.0 (external, unavailable)' + assert provider_display_list([ + built_in_default, + built_in_custom_name, + external_default, + external_custom, + external_unavailable, + built_in_unavailable], + ) == 'BuiltinProviderDefault, CustomName, ExternalProviderDefault-0.0.0 (external), custom-5.4b2 (external), ExternalProviderUnavailable-0.0.0 (external, unavailable), BuiltinProviderUnavailable-0.0.0 (external, unavailable)' diff --git a/test/test_pot/test_pot_framework.py b/test/test_pot/test_pot_framework.py new file mode 100644 index 000000000..bc94653f4 --- /dev/null +++ b/test/test_pot/test_pot_framework.py @@ -0,0 +1,629 @@ +import pytest + +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.utils.networking import HTTPHeaderDict +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + ExternalRequestFeature, + +) + +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheProvider, + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + CacheProviderWritePolicy, +) + +import yt_dlp.extractor.youtube.pot.cache as cache + +from yt_dlp.networking import Request +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenResponse, + PoTokenProvider, + PoTokenProviderRejectedRequest, + provider_bug_report_message, + register_provider, + register_preference, +) + +from yt_dlp.extractor.youtube.pot._registry import _pot_providers, _ptp_preferences, _pot_pcs_providers, _pot_cache_providers, _pot_cache_provider_preferences + + +class ExamplePTP(PoTokenProvider): + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + _SUPPORTED_CLIENTS = ('WEB',) + _SUPPORTED_CONTEXTS = (PoTokenContext.GVS, ) + + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.PROXY_SCHEME_HTTP, + ExternalRequestFeature.PROXY_SCHEME_SOCKS5H, + ) + + def is_available(self) -> bool: + return True + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + return PoTokenResponse('example-token', expires_at=123) + + +class ExampleCacheProviderPCP(PoTokenCacheProvider): + + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + def is_available(self) -> bool: + return True + + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + +class ExampleCacheSpecProviderPCSP(PoTokenCacheSpecProvider): + + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + def generate_cache_spec(self, request: PoTokenRequest): + return PoTokenCacheSpec( + key_bindings={'field': 'example-key'}, + default_ttl=60, + write_policy=CacheProviderWritePolicy.WRITE_FIRST, + ) + + +class TestPoTokenProvider: + + def test_base_type(self): + assert issubclass(PoTokenProvider, IEContentProvider) + + def test_create_provider_missing_fetch_method(self, ie, logger): + class MissingMethodsPTP(PoTokenProvider): + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPTP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_available_method(self, ie, logger): + class MissingMethodsPTP(PoTokenProvider): + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + with pytest.raises(TypeError): + MissingMethodsPTP(ie=ie, logger=logger, settings={}) + + def test_barebones_provider(self, ie, logger): + class BarebonesProviderPTP(PoTokenProvider): + def is_available(self) -> bool: + return True + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + provider = BarebonesProviderPTP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' + + def test_example_provider_success(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'example' + assert provider.PROVIDER_KEY == 'Example' + assert provider.PROVIDER_VERSION == '0.0.1' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' + assert provider.is_available() + + response = provider.request_pot(pot_request) + + assert response.po_token == 'example-token' + assert response.expires_at == 123 + + def test_provider_unsupported_context(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + pot_request.context = PoTokenContext.PLAYER + + with pytest.raises(PoTokenProviderRejectedRequest): + provider.request_pot(pot_request) + + def test_provider_unsupported_client(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + pot_request.innertube_context['client']['clientName'] = 'ANDROID' + + with pytest.raises(PoTokenProviderRejectedRequest): + provider.request_pot(pot_request) + + def test_provider_unsupported_proxy_scheme(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + pot_request.request_proxy = 'socks4://example.com' + + with pytest.raises( + PoTokenProviderRejectedRequest, + match='External requests by "example" provider do not support proxy scheme "socks4". Supported proxy ' + 'schemes: http, socks5h', + ): + provider.request_pot(pot_request) + + pot_request.request_proxy = 'http://example.com' + + assert provider.request_pot(pot_request) + + def test_provider_ignore_external_request_features(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = None + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_proxy = 'socks5://example.com' + assert provider.request_pot(pot_request) + pot_request.request_source_address = '0.0.0.0' + assert provider.request_pot(pot_request) + + def test_provider_unsupported_external_request_source_address(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = tuple() + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_source_address = None + assert provider.request_pot(pot_request) + + pot_request.request_source_address = '0.0.0.0' + with pytest.raises( + PoTokenProviderRejectedRequest, + match='External requests by "example" provider do not support setting source address', + ): + provider.request_pot(pot_request) + + def test_provider_supported_external_request_source_address(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.SOURCE_ADDRESS, + ) + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_source_address = None + assert provider.request_pot(pot_request) + + pot_request.request_source_address = '0.0.0.0' + assert provider.request_pot(pot_request) + + def test_provider_unsupported_external_request_tls_verification(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = tuple() + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_verify_tls = True + assert provider.request_pot(pot_request) + + pot_request.request_verify_tls = False + with pytest.raises( + PoTokenProviderRejectedRequest, + match='External requests by "example" provider do not support ignoring TLS certificate failures', + ): + provider.request_pot(pot_request) + + def test_provider_supported_external_request_tls_verification(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.DISABLE_TLS_VERIFICATION, + ) + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_verify_tls = True + assert provider.request_pot(pot_request) + + pot_request.request_verify_tls = False + assert provider.request_pot(pot_request) + + def test_provider_request_webpage(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + cookiejar = YoutubeDLCookieJar() + pot_request.request_headers = HTTPHeaderDict({'User-Agent': 'example-user-agent'}) + pot_request.request_proxy = 'socks5://example-proxy.com' + pot_request.request_cookiejar = cookiejar + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + ), pot_request=pot_request) + + assert sent_request.url == 'https://example.com' + assert sent_request.headers['User-Agent'] == 'example-user-agent' + assert sent_request.proxies == {'all': 'socks5://example-proxy.com'} + assert sent_request.extensions['cookiejar'] is cookiejar + assert 'Requesting webpage' in logger.messages['info'] + + def test_provider_request_webpage_override(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + cookiejar_request = YoutubeDLCookieJar() + pot_request.request_headers = HTTPHeaderDict({'User-Agent': 'example-user-agent'}) + pot_request.request_proxy = 'socks5://example-proxy.com' + pot_request.request_cookiejar = cookiejar_request + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + headers={'User-Agent': 'override-user-agent-override'}, + proxies={'http': 'http://example-proxy-override.com'}, + extensions={'cookiejar': YoutubeDLCookieJar()}, + ), pot_request=pot_request, note='Custom requesting webpage') + + assert sent_request.url == 'https://example.com' + assert sent_request.headers['User-Agent'] == 'override-user-agent-override' + assert sent_request.proxies == {'http': 'http://example-proxy-override.com'} + assert sent_request.extensions['cookiejar'] is not cookiejar_request + assert 'Custom requesting webpage' in logger.messages['info'] + + def test_provider_request_webpage_no_log(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + ), note=False) + + assert sent_request.url == 'https://example.com' + assert 'info' not in logger.messages + + def test_provider_request_webpage_no_pot_request(self, ie, logger): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + ), pot_request=None) + + assert sent_request.url == 'https://example.com' + + def test_get_config_arg(self, ie, logger): + provider = ExamplePTP(ie=ie, logger=logger, settings={'abc': ['123D'], 'xyz': ['456a', '789B']}) + + assert provider._configuration_arg('abc') == ['123d'] + assert provider._configuration_arg('abc', default=['default']) == ['123d'] + assert provider._configuration_arg('ABC', default=['default']) == ['default'] + assert provider._configuration_arg('abc', casesense=True) == ['123D'] + assert provider._configuration_arg('xyz', casesense=False) == ['456a', '789b'] + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(PoTokenProvider): + PROVIDER_NAME = 'invalid-suffix' + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + def is_available(self) -> bool: + return True + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +class TestPoTokenCacheProvider: + + def test_base_type(self): + assert issubclass(PoTokenCacheProvider, IEContentProvider) + + def test_create_provider_missing_get_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_store_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def get(self, key: str): + pass + + def delete(self, key: str): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_delete_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def get(self, key: str): + pass + + def store(self, key: str, value: str, expires_at: int): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_is_available_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def get(self, key: str): + pass + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_barebones_provider(self, ie, logger): + class BarebonesProviderPCP(PoTokenCacheProvider): + + def is_available(self) -> bool: + return True + + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + provider = BarebonesProviderPCP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' + + def test_create_provider_example(self, ie, logger): + provider = ExampleCacheProviderPCP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'example' + assert provider.PROVIDER_KEY == 'ExampleCacheProvider' + assert provider.PROVIDER_VERSION == '0.0.1' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' + assert provider.is_available() + + def test_get_config_arg(self, ie, logger): + provider = ExampleCacheProviderPCP(ie=ie, logger=logger, settings={'abc': ['123D'], 'xyz': ['456a', '789B']}) + assert provider._configuration_arg('abc') == ['123d'] + assert provider._configuration_arg('abc', default=['default']) == ['123d'] + assert provider._configuration_arg('ABC', default=['default']) == ['default'] + assert provider._configuration_arg('abc', casesense=True) == ['123D'] + assert provider._configuration_arg('xyz', casesense=False) == ['456a', '789b'] + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(PoTokenCacheProvider): + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + def is_available(self) -> bool: + return True + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +class TestPoTokenCacheSpecProvider: + + def test_base_type(self): + assert issubclass(PoTokenCacheSpecProvider, IEContentProvider) + + def test_create_provider_missing_supports_method(self, ie, logger): + class MissingMethodsPCS(PoTokenCacheSpecProvider): + pass + + with pytest.raises(TypeError): + MissingMethodsPCS(ie=ie, logger=logger, settings={}) + + def test_create_provider_barebones(self, ie, pot_request, logger): + class BarebonesProviderPCSP(PoTokenCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + return PoTokenCacheSpec( + default_ttl=100, + key_bindings={}, + ) + + provider = BarebonesProviderPCSP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' + assert provider.is_available() + assert provider.generate_cache_spec(request=pot_request).default_ttl == 100 + assert provider.generate_cache_spec(request=pot_request).key_bindings == {} + assert provider.generate_cache_spec(request=pot_request).write_policy == CacheProviderWritePolicy.WRITE_ALL + + def test_create_provider_example(self, ie, pot_request, logger): + provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'example' + assert provider.PROVIDER_KEY == 'ExampleCacheSpecProvider' + assert provider.PROVIDER_VERSION == '0.0.1' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' + assert provider.is_available() + assert provider.generate_cache_spec(pot_request) + assert provider.generate_cache_spec(pot_request).key_bindings == {'field': 'example-key'} + assert provider.generate_cache_spec(pot_request).default_ttl == 60 + assert provider.generate_cache_spec(pot_request).write_policy == CacheProviderWritePolicy.WRITE_FIRST + + def test_get_config_arg(self, ie, logger): + provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={'abc': ['123D'], 'xyz': ['456a', '789B']}) + + assert provider._configuration_arg('abc') == ['123d'] + assert provider._configuration_arg('abc', default=['default']) == ['123d'] + assert provider._configuration_arg('ABC', default=['default']) == ['default'] + assert provider._configuration_arg('abc', casesense=True) == ['123D'] + assert provider._configuration_arg('xyz', casesense=False) == ['456a', '789b'] + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(PoTokenCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + return None + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +class TestPoTokenRequest: + def test_copy_request(self, pot_request): + copied_request = pot_request.copy() + + assert copied_request is not pot_request + assert copied_request.context == pot_request.context + assert copied_request.innertube_context == pot_request.innertube_context + assert copied_request.innertube_context is not pot_request.innertube_context + copied_request.innertube_context['client']['clientName'] = 'ANDROID' + assert pot_request.innertube_context['client']['clientName'] != 'ANDROID' + assert copied_request.innertube_host == pot_request.innertube_host + assert copied_request.session_index == pot_request.session_index + assert copied_request.player_url == pot_request.player_url + assert copied_request.is_authenticated == pot_request.is_authenticated + assert copied_request.visitor_data == pot_request.visitor_data + assert copied_request.data_sync_id == pot_request.data_sync_id + assert copied_request.video_id == pot_request.video_id + assert copied_request.request_cookiejar is pot_request.request_cookiejar + assert copied_request.request_proxy == pot_request.request_proxy + assert copied_request.request_headers == pot_request.request_headers + assert copied_request.request_headers is not pot_request.request_headers + assert copied_request.request_timeout == pot_request.request_timeout + assert copied_request.request_source_address == pot_request.request_source_address + assert copied_request.request_verify_tls == pot_request.request_verify_tls + assert copied_request.bypass_cache == pot_request.bypass_cache + + +def test_provider_bug_report_message(ie, logger): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' + + message = provider_bug_report_message(provider) + assert message == '; please report this issue to the provider developer at https://example.com/issues .' + + message_before = provider_bug_report_message(provider, before='custom message!') + assert message_before == 'custom message! Please report this issue to the provider developer at https://example.com/issues .' + + +def test_register_provider(ie): + + @register_provider + class UnavailableProviderPTP(PoTokenProvider): + def is_available(self) -> bool: + return False + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + assert _pot_providers.value.get('UnavailableProvider') == UnavailableProviderPTP + _pot_providers.value.pop('UnavailableProvider') + + +def test_register_pot_preference(ie): + before = len(_ptp_preferences.value) + + @register_preference(ExamplePTP) + def unavailable_preference(provider: PoTokenProvider, request: PoTokenRequest): + return 1 + + assert len(_ptp_preferences.value) == before + 1 + + +def test_register_cache_provider(ie): + + @cache.register_provider + class UnavailableCacheProviderPCP(PoTokenCacheProvider): + def is_available(self) -> bool: + return False + + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + assert _pot_cache_providers.value.get('UnavailableCacheProvider') == UnavailableCacheProviderPCP + _pot_cache_providers.value.pop('UnavailableCacheProvider') + + +def test_register_cache_provider_spec(ie): + + @cache.register_spec + class UnavailableCacheProviderPCSP(PoTokenCacheSpecProvider): + def is_available(self) -> bool: + return False + + def generate_cache_spec(self, request: PoTokenRequest): + return None + + assert _pot_pcs_providers.value.get('UnavailableCacheProvider') == UnavailableCacheProviderPCSP + _pot_pcs_providers.value.pop('UnavailableCacheProvider') + + +def test_register_cache_provider_preference(ie): + before = len(_pot_cache_provider_preferences.value) + + @cache.register_preference(ExampleCacheProviderPCP) + def unavailable_preference(provider: PoTokenCacheProvider, request: PoTokenRequest): + return 1 + + assert len(_pot_cache_provider_preferences.value) == before + 1 + + +def test_logger_log_level(logger): + assert logger.LogLevel('INFO') == logger.LogLevel.INFO + assert logger.LogLevel('debuG') == logger.LogLevel.DEBUG + assert logger.LogLevel(10) == logger.LogLevel.DEBUG + assert logger.LogLevel('UNKNOWN') == logger.LogLevel.INFO diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 0f0885366..3f777aed7 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -316,6 +316,10 @@ 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', ), + ( + 'https://www.youtube.com/s/player/59b252b9/player_ias.vflset/en_US/base.js', + 'D3XWVpYgwhLLKNK4AGX', 'aZrQ1qWJ5yv5h', + ), ] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 63e6e11b2..ea6264a0d 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -640,6 +640,7 @@ def __init__(self, params=None, auto_init=True): self._printed_messages = set() self._first_webpage_request = True self._post_hooks = [] + self._close_hooks = [] self._progress_hooks = [] self._postprocessor_hooks = [] self._download_retcode = 0 @@ -908,6 +909,11 @@ def add_post_hook(self, ph): """Add the post hook""" self._post_hooks.append(ph) + def add_close_hook(self, ch): + """Add a close hook, called when YoutubeDL.close() is called""" + assert callable(ch), 'Close hook must be callable' + self._close_hooks.append(ch) + def add_progress_hook(self, ph): """Add the download progress hook""" self._progress_hooks.append(ph) @@ -1016,6 +1022,9 @@ def close(self): self._request_director.close() del self._request_director + for close_hook in self._close_hooks: + close_hook() + def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index fad323c90..5675445ac 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -764,11 +764,11 @@ def _get_linux_desktop_environment(env, logger): GetDesktopEnvironment """ xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) - desktop_session = env.get('DESKTOP_SESSION', None) + desktop_session = env.get('DESKTOP_SESSION', '') if xdg_current_desktop is not None: for part in map(str.strip, xdg_current_desktop.split(':')): if part == 'Unity': - if desktop_session is not None and 'gnome-fallback' in desktop_session: + if 'gnome-fallback' in desktop_session: return _LinuxDesktopEnvironment.GNOME else: return _LinuxDesktopEnvironment.UNITY @@ -797,35 +797,34 @@ def _get_linux_desktop_environment(env, logger): return _LinuxDesktopEnvironment.UKUI elif part == 'LXQt': return _LinuxDesktopEnvironment.LXQT - logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + logger.debug(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') - elif desktop_session is not None: - if desktop_session == 'deepin': - return _LinuxDesktopEnvironment.DEEPIN - elif desktop_session in ('mate', 'gnome'): - return _LinuxDesktopEnvironment.GNOME - elif desktop_session in ('kde4', 'kde-plasma'): + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: return _LinuxDesktopEnvironment.KDE4 - elif desktop_session == 'kde': - if 'KDE_SESSION_VERSION' in env: - return _LinuxDesktopEnvironment.KDE4 - else: - return _LinuxDesktopEnvironment.KDE3 - elif 'xfce' in desktop_session or desktop_session == 'xubuntu': - return _LinuxDesktopEnvironment.XFCE - elif desktop_session == 'ukui': - return _LinuxDesktopEnvironment.UKUI else: - logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') - + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI else: - if 'GNOME_DESKTOP_SESSION_ID' in env: - return _LinuxDesktopEnvironment.GNOME - elif 'KDE_FULL_SESSION' in env: - if 'KDE_SESSION_VERSION' in env: - return _LinuxDesktopEnvironment.KDE4 - else: - return _LinuxDesktopEnvironment.KDE3 + logger.debug(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + return _LinuxDesktopEnvironment.OTHER diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 1b12bd4be..9c34bd289 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -30,7 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .http import HttpFD from .ism import IsmFD from .mhtml import MhtmlFD -from .niconico import NiconicoDmcFD, NiconicoLiveFD +from .niconico import NiconicoLiveFD from .rtmp import RtmpFD from .rtsp import RtspFD from .websocket import WebSocketFragmentFD @@ -50,7 +50,6 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N 'http_dash_segments_generator': DashSegmentsFD, 'ism': IsmFD, 'mhtml': MhtmlFD, - 'niconico_dmc': NiconicoDmcFD, 'niconico_live': NiconicoLiveFD, 'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, @@ -67,7 +66,6 @@ def shorten_protocol_name(proto, simplify=False): 'rtmp_ffmpeg': 'rtmpF', 'http_dash_segments': 'dash', 'http_dash_segments_generator': 'dashG', - 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } if simplify: diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 310144e7d..33cf15df8 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -2,60 +2,12 @@ import threading import time -from . import get_suitable_downloader from .common import FileDownloader from .external import FFmpegFD from ..networking import Request from ..utils import DownloadError, str_or_none, try_get -class NiconicoDmcFD(FileDownloader): - """ Downloading niconico douga from DMC with heartbeat """ - - def real_download(self, filename, info_dict): - from ..extractor.niconico import NiconicoIE - - self.to_screen(f'[{self.FD_NAME}] Downloading from DMC') - ie = NiconicoIE(self.ydl) - info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) - - fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) - - success = download_complete = False - timer = [None] - heartbeat_lock = threading.Lock() - heartbeat_url = heartbeat_info_dict['url'] - heartbeat_data = heartbeat_info_dict['data'].encode() - heartbeat_interval = heartbeat_info_dict.get('interval', 30) - - request = Request(heartbeat_url, heartbeat_data) - - def heartbeat(): - try: - self.ydl.urlopen(request).read() - except Exception: - self.to_screen(f'[{self.FD_NAME}] Heartbeat failed') - - with heartbeat_lock: - if not download_complete: - timer[0] = threading.Timer(heartbeat_interval, heartbeat) - timer[0].start() - - heartbeat_info_dict['ping']() - self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval)) - try: - heartbeat() - if type(fd).__name__ == 'HlsFD': - info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) - success = fd.real_download(filename, info_dict) - finally: - if heartbeat_lock: - with heartbeat_lock: - timer[0].cancel() - download_complete = True - return success - - class NiconicoLiveFD(FileDownloader): """ Downloads niconico live without being stopped """ diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f7e3f25c3..c516c79ce 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -338,7 +338,6 @@ from .canalplus import CanalplusIE from .canalsurmas import CanalsurmasIE from .caracoltv import CaracolTvPlayIE -from .cartoonnetwork import CartoonNetworkIE from .cbc import ( CBCIE, CBCGemIE, @@ -929,7 +928,10 @@ ) from .jiosaavn import ( JioSaavnAlbumIE, + JioSaavnArtistIE, JioSaavnPlaylistIE, + JioSaavnShowIE, + JioSaavnShowPlaylistIE, JioSaavnSongIE, ) from .joj import JojIE @@ -1042,6 +1044,7 @@ LimelightMediaIE, ) from .linkedin import ( + LinkedInEventsIE, LinkedInIE, LinkedInLearningCourseIE, LinkedInLearningIE, @@ -1783,7 +1786,6 @@ from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, - RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE, ) @@ -1964,7 +1966,6 @@ SpreakerShowIE, ) from .springboardplatform import SpringboardPlatformIE -from .sprout import SproutIE from .sproutvideo import ( SproutVideoIE, VidsIoIE, @@ -2146,6 +2147,7 @@ from .toggo import ToggoIE from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE +from .toutiao import ToutiaoIE from .toutv import TouTvIE from .toypics import ( ToypicsIE, @@ -2237,7 +2239,10 @@ TVPlayIE, ) from .tvplayer import TVPlayerIE -from .tvw import TvwIE +from .tvw import ( + TvwIE, + TvwTvChannelsIE, +) from .tweakers import TweakersIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE @@ -2365,6 +2370,7 @@ VHXEmbedIE, VimeoAlbumIE, VimeoChannelIE, + VimeoEventIE, VimeoGroupsIE, VimeoIE, VimeoLikesIE, diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 8c7131b10..8f2fc4c80 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -21,6 +21,7 @@ int_or_none, time_seconds, traverse_obj, + update_url, update_url_query, ) @@ -417,6 +418,10 @@ def _real_extract(self, url): 'is_live': is_live, 'availability': availability, }) + + if thumbnail := update_url(self._og_search_thumbnail(webpage, default=''), query=None): + info['thumbnails'] = [{'url': thumbnail}] + return info diff --git a/yt_dlp/extractor/amcnetworks.py b/yt_dlp/extractor/amcnetworks.py index 15a86e245..3817f3516 100644 --- a/yt_dlp/extractor/amcnetworks.py +++ b/yt_dlp/extractor/amcnetworks.py @@ -1,32 +1,24 @@ -import re - -from .theplatform import ThePlatformIE -from ..utils import ( - int_or_none, - parse_age_limit, - try_get, - update_url_query, -) +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils.traversal import traverse_obj -class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?(?Pamc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' +class AMCNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/?#]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', + 'url': 'https://www.amc.com/shows/dark-winds/videos/dark-winds-a-look-at-season-3--1072027', 'info_dict': { - 'id': '4Lq1dzOnZGt0', + 'id': '6369261343112', 'ext': 'mp4', - 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", - 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", - 'upload_date': '20201120', - 'timestamp': 1605904350, - 'uploader': 'AMCN', + 'title': 'Dark Winds: A Look at Season 3', + 'uploader_id': '6240731308001', + 'duration': 176.427, + 'thumbnail': r're:https://[^/]+\.boltdns\.net/.+/image\.jpg', + 'tags': [], + 'timestamp': 1740414792, + 'upload_date': '20250224', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': '404 Not Found', + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -52,96 +44,18 @@ class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] - _REQUESTOR_ID_MAP = { - 'amc': 'AMC', - 'bbcamerica': 'BBCA', - 'ifc': 'IFC', - 'sundancetv': 'SUNDANCE', - 'wetv': 'WETV', - } def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() - requestor_id = self._REQUESTOR_ID_MAP[site] - page_data = self._download_json( - f'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/{requestor_id.lower()}/url/{display_id}', - display_id)['data'] - properties = page_data.get('properties') or {} - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + initial_data = self._search_json( + r'window\.initialData\s*=\s*JSON\.parse\(String\.raw`', webpage, 'initial data', display_id) + video_id = traverse_obj(initial_data, ('initialData', 'properties', 'videoId', {str})) + if not video_id: # All locked videos are now DRM-protected + self.report_drm(display_id) + account_id = initial_data['config']['brightcove']['accountId'] + player_id = initial_data['config']['brightcove']['playerId'] - video_player_count = 0 - try: - for v in page_data['children']: - if v.get('type') == 'video-player': - release_pid = v['properties']['currentVideo']['meta']['releasePid'] - tp_path = 'M_UwQC/' + release_pid - media_url = 'https://link.theplatform.com/s/' + tp_path - video_player_count += 1 - except KeyError: - pass - if video_player_count > 1: - self.report_warning( - f'The JSON data has {video_player_count} video players. Only one will be extracted') - - # Fall back to videoPid if releasePid not found. - # TODO: Fall back to videoPid if releasePid manifest uses DRM. - if not video_player_count: - tp_path = 'M_UwQC/media/' + properties['videoPid'] - media_url = 'https://link.theplatform.com/s/' + tp_path - - theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - video_id = theplatform_metadata['pid'] - title = theplatform_metadata['title'] - rating = try_get( - theplatform_metadata, lambda x: x['ratings'][0]['rating']) - video_category = properties.get('videoCategory') - if video_category and video_category.endswith('-Auth'): - resource = self._get_mvpd_resource( - requestor_id, title, video_id, rating) - query['auth'] = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - media_url = update_url_query(media_url, query) - formats, subtitles = self._extract_theplatform_smil( - media_url, video_id) - - thumbnails = [] - thumbnail_urls = [properties.get('imageDesktop')] - if 'thumbnail' in info: - thumbnail_urls.append(info.pop('thumbnail')) - for thumbnail_url in thumbnail_urls: - if not thumbnail_url: - continue - mobj = re.search(r'(\d+)x(\d+)', thumbnail_url) - thumbnails.append({ - 'url': thumbnail_url, - 'width': int(mobj.group(1)) if mobj else None, - 'height': int(mobj.group(2)) if mobj else None, - }) - - info.update({ - 'age_limit': parse_age_limit(rating), - 'formats': formats, - 'id': video_id, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - }) - ns_keys = theplatform_metadata.get('$xmlns', {}).keys() - if ns_keys: - ns = next(iter(ns_keys)) - episode = theplatform_metadata.get(ns + '$episodeTitle') or None - episode_number = int_or_none( - theplatform_metadata.get(ns + '$episode')) - season_number = int_or_none( - theplatform_metadata.get(ns + '$season')) - series = theplatform_metadata.get(ns + '$show') or None - info.update({ - 'episode': episode, - 'episode_number': episode_number, - 'season_number': season_number, - 'series': series, - }) - return info + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE, video_id) diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 0fe95bec5..1258a5704 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -1,64 +1,105 @@ +import urllib.parse + from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + parse_age_limit, + url_or_none, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P.+?)_(?P[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/(?:[^/?#]+/){4}(?P.+?)_(?P[0-9a-f]{24})' _NETRC_MACHINE = 'atresplayer' - _TESTS = [ - { - 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', - 'info_dict': { - 'id': '5d4aa2c57ed1a88fc715a615', - 'ext': 'mp4', - 'title': 'Capítulo 7: Asuntos pendientes', - 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', - 'duration': 3413, - }, - 'skip': 'This video is only available for registered users', + _TESTS = [{ + 'url': 'https://www.atresplayer.com/lasexta/programas/el-objetivo/clips/mbappe-describe-como-entrenador-a-carlo-ancelotti-sabe-cuando-tiene-que-ser-padre-jefe-amigo-entrenador_67f2dfb2fb6ab0e4c7203849/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f2dfb2fb6ab0e4c7203849', + 'display_id': 'md5:c203f8d4e425ed115ba56a1c6e4b3e6c', + 'title': 'Mbappé describe como entrenador a Carlo Ancelotti: "Sabe cuándo tiene que ser padre, jefe, amigo, entrenador..."', + 'channel': 'laSexta', + 'duration': 31, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/06/B02DBE1E-D59B-4683-8404-1A9595D15269/1920x1080.jpg', + 'tags': ['Entrevista informativa', 'Actualidad', 'Debate informativo', 'Política', 'Economía', 'Sociedad', 'Cara a cara', 'Análisis', 'Más periodismo'], + 'series': 'El Objetivo', + 'season': 'Temporada 12', + 'timestamp': 1743970079, + 'upload_date': '20250406', }, - { - 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/programas/el-hormiguero/clips/revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero_67f836baa4a5b0e4147ca59a/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f836baa4a5b0e4147ca59a', + 'display_id': 'revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero', + 'title': 'Revive la entrevista completa a Miguel Bosé en El Hormiguero', + 'description': 'md5:c6d2b591408d45a7bc2986dfb938eb72', + 'channel': 'Antena 3', + 'duration': 2556, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/10/9076395F-F1FD-48BE-9F18-540DBA10EBAD/1920x1080.jpg', + 'tags': ['Entrevista', 'Variedades', 'Humor', 'Entretenimiento', 'Te sigo', 'Buen rollo', 'Cara a cara'], + 'series': 'El Hormiguero ', + 'season': 'Temporada 14', + 'timestamp': 1744320111, + 'upload_date': '20250410', }, - { - 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/flooxer/series/biara-proyecto-lazarus/temporada-1/capitulo-3-supervivientes_67a6038b64ceca00070f4f69/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67a6038b64ceca00070f4f69', + 'display_id': 'capitulo-3-supervivientes', + 'title': 'Capítulo 3: Supervivientes', + 'description': 'md5:65b231f20302f776c2b0dd24594599a1', + 'channel': 'Flooxer', + 'duration': 1196, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages01/2025/02/14/17CF90D3-FE67-40C5-A941-7825B3E13992/1920x1080.jpg', + 'tags': ['Juvenil', 'Terror', 'Piel de gallina', 'Te sigo', 'Un break', 'Del tirón'], + 'series': 'BIARA: Proyecto Lázarus', + 'season': 'Temporada 1', + 'season_number': 1, + 'episode': 'Episode 3', + 'episode_number': 3, + 'timestamp': 1743095191, + 'upload_date': '20250327', }, - ] + }, { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }] _API_BASE = 'https://api.atresplayer.com/' def _perform_login(self, username, password): - self._request_webpage( - self._API_BASE + 'login', None, 'Downloading login page') - try: - target_url = self._download_json( - 'https://account.atresmedia.com/api/login', None, - 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=urlencode_postdata({ + self._download_webpage( + 'https://account.atresplayer.com/auth/v1/login', None, + 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'username': username, 'password': password, - }))['targetUrl'] + })) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError('Invalid username and/or password', expected=True) raise - self._request_webpage(target_url, None, 'Following Target URL') - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() + metadata_url = self._download_json( + self._API_BASE + 'client/v1/url', video_id, 'Downloading API endpoint data', + query={'href': urllib.parse.urlparse(url).path})['href'] + metadata = self._download_json(metadata_url, video_id) + try: - episode = self._download_json( - self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + video_data = self._download_json(metadata['urlVideo'], video_id, 'Downloading video data') except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 403: error = self._parse_json(e.cause.response.read(), None) @@ -67,37 +108,45 @@ def _real_extract(self, url): raise ExtractorError(error['error_description'], expected=True) raise - title = episode['titulo'] - formats = [] subtitles = {} - for source in episode.get('sources', []): - src = source.get('src') - if not src: - continue + for source in traverse_obj(video_data, ('sources', lambda _, v: url_or_none(v['src']))): + src_url = source['src'] src_type = source.get('type') - if src_type == 'application/vnd.apple.mpegurl': - formats, subtitles = self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - elif src_type == 'application/dash+xml': - formats, subtitles = self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False) - - heartbeat = episode.get('heartbeat') or {} - omniture = episode.get('omniture') or {} - get_meta = lambda x: heartbeat.get(x) or omniture.get(x) + if src_type in ('application/vnd.apple.mpegurl', 'application/hls+legacy', 'application/hls+hevc'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif src_type in ('application/dash+xml', 'application/dash+hevc'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'display_id': display_id, 'id': video_id, - 'title': title, - 'description': episode.get('descripcion'), - 'thumbnail': episode.get('imgPoster'), - 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'channel': get_meta('channel'), - 'season': get_meta('season'), - 'episode_number': int_or_none(get_meta('episodeNumber')), 'subtitles': subtitles, + **traverse_obj(video_data, { + 'title': ('titulo', {str}), + 'description': ('descripcion', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('imgPoster', {url_or_none}, {lambda v: f'{v}1920x1080.jpg'}), + 'age_limit': ('ageRating', {parse_age_limit}), + }), + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'title', {str}), + 'age_limit': ('ageRating', {parse_age_limit}), + 'series': ('format', 'title', {str}), + 'season': ('currentSeason', 'title', {str}), + 'season_number': ('currentSeason', 'seasonNumber', {int_or_none}), + 'episode_number': ('numberOfEpisode', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none(scale=1000)}), + 'channel': ('channel', 'title', {str}), + }), } diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index c83222ea5..981eebfbb 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -1,30 +1,32 @@ import functools +import json import re from .common import InfoExtractor from ..networking import HEADRequest +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, clean_html, - extract_attributes, + determine_ext, + format_field, get_element_by_class, - get_element_by_id, - get_element_html_by_class, get_elements_html_by_class, int_or_none, orderedSet, parse_count, parse_duration, - traverse_obj, - unified_strdate, + parse_iso8601, + url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import traverse_obj class BitChuteIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/?#]+)/(?P[^/?#&]+)' _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', @@ -34,12 +36,17 @@ class BitChuteIE(InfoExtractor): 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', 'channel': 'BitChute', 'channel_url': 'https://www.bitchute.com/channel/bitchute/', + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 'view_count': int, + 'duration': 16.0, + 'timestamp': 1483425443, }, }, { # test case: video with different channel and uploader @@ -49,13 +56,18 @@ class BitChuteIE(InfoExtractor): 'id': 'Yti_j9A-UZ4', 'ext': 'mp4', 'title': 'Israel at War | Full Measure', - 'description': 'md5:38cf7bc6f42da1a877835539111c69ef', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:e60198b89971966d6030d22b3268f08f', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'sharylattkisson', 'upload_date': '20231106', 'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/', 'channel': 'Full Measure with Sharyl Attkisson', 'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/', + 'uploader_id': '9K0kUWA9zmd9', + 'channel_id': 'NpdxoCRv3ZLb', + 'view_count': int, + 'duration': 554.0, + 'timestamp': 1699296106, }, }, { # video not downloadable in browser, but we can recover it @@ -66,25 +78,21 @@ class BitChuteIE(InfoExtractor): 'ext': 'mp4', 'filesize': 71537926, 'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control', - 'description': 'md5:228ee93bd840a24938f536aeac9cf749', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:2029c7c212ccd4b040f52bb2d036ef4e', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20181113', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', 'channel': 'BitChute', 'channel_url': 'https://www.bitchute.com/channel/bitchute/', + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 'view_count': int, + 'duration': 1701.0, + 'tags': ['bitchute'], + 'timestamp': 1542130287, }, 'params': {'check_formats': None}, - }, { - # restricted video - 'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/', - 'info_dict': { - 'id': 'WEnQU7XGcTdl', - 'ext': 'mp4', - 'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft', - }, - 'params': {'skip_download': True}, - 'skip': 'Georestricted in DE', }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'only_matching': True, @@ -96,11 +104,8 @@ class BitChuteIE(InfoExtractor): 'only_matching': True, }] _GEO_BYPASS = False - - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', - 'Referer': 'https://www.bitchute.com/', - } + _UPLOADER_URL_TMPL = 'https://www.bitchute.com/profile/%s/' + _CHANNEL_URL_TMPL = 'https://www.bitchute.com/channel/%s/' def _check_format(self, video_url, video_id): urls = orderedSet( @@ -112,7 +117,7 @@ def _check_format(self, video_url, video_id): for url in urls: try: response = self._request_webpage( - HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS) + HEADRequest(url), video_id=video_id, note=f'Checking {url}') except ExtractorError as e: self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}') continue @@ -121,54 +126,79 @@ def _check_format(self, video_url, video_id): 'filesize': int_or_none(response.headers.get('Content-Length')), } - def _raise_if_restricted(self, webpage): - page_title = clean_html(get_element_by_class('page-title', webpage)) or '' - if re.fullmatch(r'(?:Channel|Video) Restricted', page_title): - reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title - self.raise_geo_restricted(reason) - - @staticmethod - def _make_url(html): - path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href') - return urljoin('https://www.bitchute.com', path) + def _call_api(self, endpoint, data, display_id, fatal=True): + note = endpoint.rpartition('/')[2] + try: + return self._download_json( + f'https://api.bitchute.com/api/beta/{endpoint}', display_id, + f'Downloading {note} API JSON', f'Unable to download {note} API JSON', + data=json.dumps(data).encode(), + headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + errors = '. '.join(traverse_obj(e.cause.response.read().decode(), ( + {json.loads}, 'errors', lambda _, v: v['context'] == 'reason', 'message', {str}))) + if errors and 'location' in errors: + # Can always be fatal since the video/media call will reach this code first + self.raise_geo_restricted(errors) + if fatal: + raise + self.report_warning(e.msg) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - f'https://old.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) - - self._raise_if_restricted(webpage) - publish_date = clean_html(get_element_by_class('video-publish-date', webpage)) - entries = self._parse_html5_media_entries(url, webpage, video_id) + data = {'video_id': video_id} + media_url = self._call_api('video/media', data, video_id)['media_url'] formats = [] - for format_ in traverse_obj(entries, (0, 'formats', ...)): + if determine_ext(media_url) == 'm3u8': + formats.extend( + self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls', live=True)) + else: if self.get_param('check_formats') is not False: - format_.update(self._check_format(format_.pop('url'), video_id) or {}) - if 'url' not in format_: - continue - formats.append(format_) + if fmt := self._check_format(media_url, video_id): + formats.append(fmt) + else: + formats.append({'url': media_url}) if not formats: self.raise_no_formats( 'Video is unavailable. Please make sure this video is playable in the browser ' 'before reporting this issue.', expected=True, video_id=video_id) - details = get_element_by_class('details', webpage) or '' - uploader_html = get_element_html_by_class('creator', details) or '' - channel_html = get_element_html_by_class('name', details) or '' + video = self._call_api('video', data, video_id, fatal=False) + channel = None + if channel_id := traverse_obj(video, ('channel', 'channel_id', {str})): + channel = self._call_api('channel', {'channel_id': channel_id}, video_id, fatal=False) return { + **traverse_obj(video, { + 'title': ('video_name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'channel': ('channel', 'channel_name', {str}), + 'channel_id': ('channel', 'channel_id', {str}), + 'channel_url': ('channel', 'channel_url', {urljoin('https://www.bitchute.com/')}), + 'uploader_id': ('profile_id', {str}), + 'uploader_url': ('profile_id', {format_field(template=self._UPLOADER_URL_TMPL)}, filter), + 'timestamp': ('date_published', {parse_iso8601}), + 'duration': ('duration', {parse_duration}), + 'tags': ('hashtags', ..., {str}, filter, all, filter), + 'view_count': ('view_count', {int_or_none}), + 'is_live': ('state_id', {lambda x: x == 'live'}), + }), + **traverse_obj(channel, { + 'channel': ('channel_name', {str}), + 'channel_id': ('channel_id', {str}), + 'channel_url': ('url_slug', {format_field(template=self._CHANNEL_URL_TMPL)}, filter), + 'uploader': ('profile_name', {str}), + 'uploader_id': ('profile_id', {str}), + 'uploader_url': ('profile_id', {format_field(template=self._UPLOADER_URL_TMPL)}, filter), + }), 'id': video_id, - 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': clean_html(uploader_html), - 'uploader_url': self._make_url(uploader_html), - 'channel': clean_html(channel_html), - 'channel_url': self._make_url(channel_html), - 'upload_date': unified_strdate(self._search_regex( - r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)), 'formats': formats, } @@ -190,7 +220,7 @@ class BitChuteChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', @@ -198,6 +228,9 @@ class BitChuteChannelIE(InfoExtractor): 'channel_url': 'https://www.bitchute.com/channel/bitchute/', 'duration': 16, 'view_count': int, + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 'timestamp': 1483425443, }, }, ], @@ -213,6 +246,7 @@ class BitChuteChannelIE(InfoExtractor): 'title': 'Bruce MacDonald and "The Light of Darkness"', 'description': 'md5:747724ef404eebdfc04277714f81863e', }, + 'skip': '404 Not Found', }, { 'url': 'https://old.bitchute.com/playlist/wV9Imujxasw9/', 'only_matching': True, diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py index d7bf58b36..a2a908252 100644 --- a/yt_dlp/extractor/bpb.py +++ b/yt_dlp/extractor/bpb.py @@ -7,6 +7,7 @@ join_nonempty, js_to_json, mimetype2ext, + parse_resolution, unified_strdate, url_or_none, urljoin, @@ -110,24 +111,23 @@ def _parse_vue_attributes(self, name, string, video_id): return attributes - @staticmethod - def _process_source(source): + def _process_source(self, source): url = url_or_none(source['src']) if not url: return None source_type = source.get('type', '') extension = mimetype2ext(source_type) - is_video = source_type.startswith('video') - note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None + note = self._search_regex(r'[_-]([a-z]+)\.[\da-z]+(?:$|\?)', url, 'note', default=None) return { 'url': url, 'ext': extension, - 'vcodec': None if is_video else 'none', + 'vcodec': None if source_type.startswith('video') else 'none', 'quality': 10 if note == 'high' else 0, 'format_note': note, 'format_id': join_nonempty(extension, note), + **parse_resolution(source.get('label')), } def _real_extract(self, url): diff --git a/yt_dlp/extractor/cartoonnetwork.py b/yt_dlp/extractor/cartoonnetwork.py deleted file mode 100644 index 1749a008a..000000000 --- a/yt_dlp/extractor/cartoonnetwork.py +++ /dev/null @@ -1,59 +0,0 @@ -from .turner import TurnerBaseIE -from ..utils import int_or_none - - -class CartoonNetworkIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P[^/?#]+)-(?:clip|episode)\.html' - _TEST = { - 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', - 'info_dict': { - 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', - 'ext': 'mp4', - 'title': 'How to Draw Upgrade', - 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): - metadata_re = '' - if content_re: - metadata_re = r'|video_metadata\.content_' + content_re - return self._search_regex( - rf'(?:_cnglobal\.currentVideo\.{global_re}{metadata_re})\s*=\s*"({value_re})";', - webpage, name, fatal=fatal) - - media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) - title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) - - info = self._extract_ngtv_info( - media_id, {'networkId': 'cartoonnetwork'}, { - 'url': url, - 'site_name': 'CartoonNetwork', - 'auth_required': find_field('authType', 'auth type') != 'unauth', - }) - - series = find_field( - 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) - info.update({ - 'id': media_id, - 'display_id': display_id, - 'title': title, - 'description': self._html_search_meta('description', webpage), - 'series': series, - 'episode': title, - }) - - for field in ('season', 'episode'): - field_name = field + 'Number' - info[field + '_number'] = int_or_none(find_field( - field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) - - return info diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 96f25c22a..027b37d44 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -13,16 +13,17 @@ from ..utils import ( ExtractorError, OnDemandPagedList, + determine_ext, float_or_none, int_or_none, merge_dicts, multipart_encode, parse_duration, - traverse_obj, try_call, - try_get, + url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj class CDAIE(InfoExtractor): @@ -290,34 +291,47 @@ def extract_format(page, version): if not video or 'file' not in video: self.report_warning(f'Unable to extract {version} version information') return - if video['file'].startswith('uggc'): - video['file'] = codecs.decode(video['file'], 'rot_13') - if video['file'].endswith('adc.mp4'): - video['file'] = video['file'].replace('adc.mp4', '.mp4') - elif not video['file'].startswith('http'): - video['file'] = decrypt_file(video['file']) video_quality = video.get('quality') qualities = video.get('qualities', {}) video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality) - info_dict['formats'].append({ - 'url': video['file'], - 'format_id': video_quality, - 'height': int_or_none(video_quality[:-1]), - }) + if video.get('file'): + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) + info_dict['formats'].append({ + 'url': video['file'], + 'format_id': video_quality, + 'height': int_or_none(video_quality[:-1]), + }) for quality, cda_quality in qualities.items(): if quality == video_quality: continue data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2, 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]} data = json.dumps(data).encode() - video_url = self._download_json( + response = self._download_json( f'https://www.cda.pl/video/{video_id}', video_id, headers={ 'Content-Type': 'application/json', 'X-Requested-With': 'XMLHttpRequest', }, data=data, note=f'Fetching {quality} url', errnote=f'Failed to fetch {quality} url', fatal=False) - if try_get(video_url, lambda x: x['result']['status']) == 'ok': - video_url = try_get(video_url, lambda x: x['result']['resp']) + if ( + traverse_obj(response, ('result', 'status')) != 'ok' + or not traverse_obj(response, ('result', 'resp', {url_or_none})) + ): + continue + video_url = response['result']['resp'] + ext = determine_ext(video_url) + if ext == 'mpd': + info_dict['formats'].extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'm3u8': + info_dict['formats'].extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: info_dict['formats'].append({ 'url': video_url, 'format_id': quality, @@ -353,7 +367,7 @@ def extract_format(page, version): class CDAFolderIE(InfoExtractor): _MAX_PAGE_SIZE = 36 - _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P\w+)/folder/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P[\w-]+)/folder/(?P\d+)' _TESTS = [ { 'url': 'https://www.cda.pl/domino264/folder/31188385', @@ -378,6 +392,9 @@ class CDAFolderIE(InfoExtractor): 'title': 'TESTY KOSMETYKÓW', }, 'playlist_mincount': 139, + }, { + 'url': 'https://www.cda.pl/FILMY-SERIALE-ANIME-KRESKOWKI-BAJKI/folder/18493422', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py index 537352e5f..ffc7ca824 100644 --- a/yt_dlp/extractor/dacast.py +++ b/yt_dlp/extractor/dacast.py @@ -9,6 +9,7 @@ ExtractorError, classproperty, float_or_none, + parse_qs, traverse_obj, url_or_none, ) @@ -91,11 +92,15 @@ def _usp_signing_secret(self): # Rotates every so often, but hardcode a fallback in case of JS change/breakage before rotation return self._search_regex( r'\bUSP_SIGNING_SECRET\s*=\s*(["\'])(?P(?:(?!\1).)+)', player_js, - 'usp signing secret', group='secret', fatal=False) or 'odnInCGqhvtyRTtIiddxtuRtawYYICZP' + 'usp signing secret', group='secret', fatal=False) or 'hGDtqMKYVeFdofrAfFmBcrsakaZELajI' def _real_extract(self, url): user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} + query = { + 'contentId': f'{user_id}-vod-{video_id}', + 'provider': 'universe', + **traverse_obj(url, ({parse_qs}, 'uss_token', {'signedKey': -1})), + } info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False) access = self._download_json( 'https://playback.dacast.com/content/access', video_id, diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py index 376ff672d..edd66e46c 100644 --- a/yt_dlp/extractor/dreisat.py +++ b/yt_dlp/extractor/dreisat.py @@ -1,9 +1,15 @@ from .zdf import ZDFBaseIE +from ..utils import ( + int_or_none, + merge_dicts, + parse_iso8601, +) +from ..utils.traversal import require, traverse_obj class DreiSatIE(ZDFBaseIE): IE_NAME = '3sat' - _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' + _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/?#]+/)*(?P[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.3sat.de/dokumentation/reise/traumziele-suedostasiens-die-philippinen-und-vietnam-102.html', 'info_dict': { @@ -12,40 +18,59 @@ class DreiSatIE(ZDFBaseIE): 'title': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', 'description': 'md5:26329ce5197775b596773b939354079d', 'duration': 2625.0, - 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~2400x1350?cb=1699870351148', + 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~original?cb=1699870351148', 'episode': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', 'episode_id': 'POS_cc7ff51c-98cf-4d12-b99d-f7a551de1c95', - 'timestamp': 1738593000, - 'upload_date': '20250203', + 'timestamp': 1747920900, + 'upload_date': '20250522', }, }, { - # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html - 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', - 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'url': 'https://www.3sat.de/film/ab-18/ab-18---mein-fremdes-ich-100.html', + 'md5': 'f92638413a11d759bdae95c9d8ec165c', 'info_dict': { - 'id': '141007_ab18_10wochensommer_film', + 'id': '221128_mein_fremdes_ich2_ab18', 'ext': 'mp4', - 'title': 'Ab 18! - 10 Wochen Sommer', - 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', - 'duration': 2660, - 'timestamp': 1608604200, - 'upload_date': '20201222', + 'title': 'Ab 18! - Mein fremdes Ich', + 'description': 'md5:cae0c0b27b7426d62ca0dda181738bf0', + 'duration': 2625.0, + 'thumbnail': 'https://www.3sat.de/assets/ab-18---mein-fremdes-ich-106~original?cb=1666081865812', + 'episode': 'Ab 18! - Mein fremdes Ich', + 'episode_id': 'POS_6225d1ca-a0d5-45e3-870b-e783ee6c8a3f', + 'timestamp': 1695081600, + 'upload_date': '20230919', }, - 'skip': '410 Gone', }, { - 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', + 'url': 'https://www.3sat.de/gesellschaft/37-grad-leben/aus-dem-leben-gerissen-102.html', + 'md5': 'a903eaf8d1fd635bd3317cd2ad87ec84', 'info_dict': { - 'id': '140913_sendung_schweizweit', + 'id': '250323_0903_sendung_sgl', 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'timestamp': 1410623100, - 'upload_date': '20140913', + 'title': 'Plötzlich ohne dich', + 'description': 'md5:380cc10659289dd91510ad8fa717c66b', + 'duration': 1620.0, + 'thumbnail': 'https://www.3sat.de/assets/37-grad-leben-106~original?cb=1645537156810', + 'episode': 'Plötzlich ohne dich', + 'episode_id': 'POS_faa7a93c-c0f2-4d51-823f-ce2ac3ee191b', + 'timestamp': 1743162540, + 'upload_date': '20250328', }, - 'params': { - 'skip_download': True, + }, { + # Video with chapters + 'url': 'https://www.3sat.de/kultur/buchmesse/dein-buch-das-beste-von-der-leipziger-buchmesse-2025-teil-1-100.html', + 'md5': '6b95790ce52e75f0d050adcdd2711ee6', + 'info_dict': { + 'id': '250330_dein_buch1_bum', + 'ext': 'mp4', + 'title': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1', + 'description': 'md5:bae51bfc22f15563ce3acbf97d2e8844', + 'duration': 5399.0, + 'thumbnail': 'https://www.3sat.de/assets/buchmesse-kerkeling-100~original?cb=1743329640903', + 'chapters': 'count:24', + 'episode': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1', + 'episode_id': 'POS_1ef236cc-b390-401e-acd0-4fb4b04315fb', + 'timestamp': 1743327000, + 'upload_date': '20250330', }, - 'skip': '404 Not Found', }, { # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', @@ -58,11 +83,42 @@ class DreiSatIE(ZDFBaseIE): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player = self._search_json( + r'data-zdfplayer-jsb=(["\'])', webpage, 'player JSON', video_id) + player_url = player['content'] + api_token = f'Bearer {player["apiToken"]}' - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id) + content = self._call_api(player_url, video_id, 'video metadata', api_token) - return self._extract_mobile(video_id) + video_target = content['mainVideoContent']['http://zdf.de/rels/target'] + ptmd_path = traverse_obj(video_target, ( + (('streams', 'default'), None), + ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), + {str}, any, {require('ptmd path')})) + ptmd_url = self._expand_ptmd_template(player_url, ptmd_path) + aspect_ratio = self._parse_aspect_ratio(video_target.get('aspectRatio')) + info = self._extract_ptmd(ptmd_url, video_id, api_token, aspect_ratio) + + return merge_dicts(info, { + **traverse_obj(content, { + 'title': (('title', 'teaserHeadline'), {str}, any), + 'episode': (('title', 'teaserHeadline'), {str}, any), + 'description': (('leadParagraph', 'teasertext'), {str}, any), + 'timestamp': ('editorialDate', {parse_iso8601}), + }), + **traverse_obj(video_target, { + 'duration': ('duration', {int_or_none}), + 'chapters': ('streamAnchorTag', {self._extract_chapters}), + }), + 'thumbnails': self._extract_thumbnails(traverse_obj(content, ('teaserImageRef', 'layouts', {dict}))), + **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { + 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), + 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), + 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), + 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), + 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode_id': ('contentId', {str}), + })), + }) diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 552f9af12..37e74bc08 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -5,7 +5,6 @@ from .adobepass import AdobePassIE from .common import InfoExtractor -from .once import OnceIE from ..utils import ( determine_ext, dict_get, @@ -16,7 +15,7 @@ ) -class ESPNIE(OnceIE): +class ESPNIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -131,9 +130,7 @@ def extract_source(source_url, source_id=None): return format_urls.add(source_url) ext = determine_ext(source_url) - if OnceIE.suitable(source_url): - formats.extend(self._extract_once_formats(source_url)) - elif ext == 'smil': + if ext == 'smil': formats.extend(self._extract_smil_formats( source_url, video_id, fatal=False)) elif ext == 'f4m': diff --git a/yt_dlp/extractor/firsttv.py b/yt_dlp/extractor/firsttv.py index ac7697bb3..878732c49 100644 --- a/yt_dlp/extractor/firsttv.py +++ b/yt_dlp/extractor/firsttv.py @@ -2,11 +2,15 @@ from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, - qualities, + join_nonempty, + mimetype2ext, + parse_qs, unified_strdate, url_or_none, ) +from ..utils.traversal import traverse_obj class FirstTVIE(InfoExtractor): @@ -15,40 +19,51 @@ class FirstTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:sport)?1tv\.ru/(?:[^/?#]+/)+(?P[^/?#]+)' _TESTS = [{ - # single format - 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + # single format; has item.id + 'url': 'https://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', + 'md5': '8011ae8e88ff4150107ab9c5a8f5b659', 'info_dict': { 'id': '40049', 'ext': 'mp4', 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'upload_date': '20150212', 'duration': 2694, }, + 'params': {'skip_download': 'm3u8'}, }, { - # multiple formats - 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', + # multiple formats; has item.id + 'url': 'https://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', 'info_dict': { 'id': '364746', 'ext': 'mp4', 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'upload_date': '20160407', 'duration': 179, 'formats': 'mincount:3', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00', + 'url': 'https://www.1tv.ru/news/issue/2016-12-01/14:00', 'info_dict': { 'id': '14:00', - 'title': 'Выпуск новостей в 14:00 1 декабря 2016 года. Новости. Первый канал', - 'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd', + 'title': 'Выпуск программы «Время» в 20:00 1 декабря 2016 года. Новости. Первый канал', + 'thumbnail': 'https://static.1tv.ru/uploads/photo/image/8/big/338448_big_8fc7eb236f.jpg', }, 'playlist_count': 13, + }, { + # has timestamp; has item.uid but not item.id + 'url': 'https://www.1tv.ru/shows/segodnya-vecherom/vypuski/avtory-odnogo-hita-segodnya-vecherom-vypusk-ot-03-05-2025', + 'info_dict': { + 'id': '270411', + 'ext': 'mp4', + 'title': 'Авторы одного хита. Сегодня вечером. Выпуск от 03.05.2025', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1746286020, + 'upload_date': '20250503', + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016', 'only_matching': True, @@ -57,96 +72,60 @@ class FirstTVIE(InfoExtractor): 'only_matching': True, }] + def _entries(self, items): + for item in items: + video_id = str(item.get('id') or item['uid']) + + formats, subtitles = [], {} + for f in traverse_obj(item, ('sources', lambda _, v: url_or_none(v['src']))): + src = f['src'] + ext = mimetype2ext(f.get('type'), default=determine_ext(src)) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + src, video_id, mpd_id='dash', fatal=False) + else: + tbr = self._search_regex(fr'_(\d{{3,}})\.{ext}', src, 'tbr', default=None) + formats.append({ + 'url': src, + 'ext': ext, + 'format_id': join_nonempty('http', ext, tbr), + 'tbr': int_or_none(tbr), + # quality metadata of http formats may be incorrect + 'quality': -10, + }) + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + yield { + **traverse_obj(item, { + 'title': ('title', {str}), + 'thumbnail': ('poster', {url_or_none}), + 'timestamp': ('dvr_begin_at', {int_or_none}), + 'upload_date': ('date_air', {unified_strdate}), + 'duration': ('duration', {int_or_none}), + }), + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - playlist_url = urllib.parse.urljoin(url, self._search_regex( + playlist_url = urllib.parse.urljoin(url, self._html_search_regex( r'data-playlist-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'playlist url', group='url')) - parsed_url = urllib.parse.urlparse(playlist_url) - qs = urllib.parse.parse_qs(parsed_url.query) - item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') + item_ids = traverse_obj(parse_qs(playlist_url), 'video_id', 'videos_ids[]', 'news_ids[]') + items = traverse_obj( + self._download_json(playlist_url, display_id), + lambda _, v: v['uid'] and (str(v['uid']) in item_ids if item_ids else True)) - items = self._download_json(playlist_url, display_id) - - if item_ids: - items = [ - item for item in items - if item.get('uid') and str(item['uid']) in item_ids] - else: - items = [items[0]] - - entries = [] - QUALITIES = ('ld', 'sd', 'hd') - - for item in items: - title = item['title'] - quality = qualities(QUALITIES) - formats = [] - path = None - for f in item.get('mbr', []): - src = url_or_none(f.get('src')) - if not src: - continue - tbr = int_or_none(self._search_regex( - r'_(\d{3,})\.mp4', src, 'tbr', default=None)) - if not path: - path = self._search_regex( - r'//[^/]+/(.+?)_\d+\.mp4', src, - 'm3u8 path', default=None) - formats.append({ - 'url': src, - 'format_id': f.get('name'), - 'tbr': tbr, - 'source_preference': quality(f.get('name')), - # quality metadata of http formats may be incorrect - 'preference': -10, - }) - # m3u8 URL format is reverse engineered from [1] (search for - # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru) - # is taken from [2]. - # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted - # 2. http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834 - if not path and len(formats) == 1: - path = self._search_regex( - r'//[^/]+/(.+?$)', formats[0]['url'], - 'm3u8 path', default=None) - if path: - if len(formats) == 1: - m3u8_path = ',' - else: - tbrs = [str(t) for t in sorted(f['tbr'] for f in formats)] - m3u8_path = '_,{},{}'.format(','.join(tbrs), '.mp4') - formats.extend(self._extract_m3u8_formats( - f'http://balancer-vod.1tv.ru/{path}{m3u8_path}.urlset/master.m3u8', - display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - - thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) - duration = int_or_none(item.get('duration') or self._html_search_meta( - 'video:duration', webpage, 'video duration', fatal=False)) - upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', default=None)) - - entries.append({ - 'id': str(item.get('id') or item['uid']), - 'thumbnail': thumbnail, - 'title': title, - 'upload_date': upload_date, - 'duration': int_or_none(duration), - 'formats': formats, - }) - - title = self._html_search_regex( - (r'
\s*

([^<]*)', - r"'title'\s*:\s*'([^']+)'"), - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) - description = self._html_search_regex( - r'
\s*
 
\s*

([^<]*)

', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description', default=None) - - return self.playlist_result(entries, display_id, title, description) + return self.playlist_result( + self._entries(items), display_id, self._og_search_title(webpage, default=None), + thumbnail=self._og_search_thumbnail(webpage, default=None)) diff --git a/yt_dlp/extractor/gamespot.py b/yt_dlp/extractor/gamespot.py index cd3f9655d..2799a27ba 100644 --- a/yt_dlp/extractor/gamespot.py +++ b/yt_dlp/extractor/gamespot.py @@ -1,9 +1,9 @@ import urllib.parse -from .once import OnceIE +from .common import InfoExtractor -class GameSpotIE(OnceIE): +class GameSpotIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index b0c7be462..721d04e31 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -16,7 +16,6 @@ MEDIA_EXTENSIONS, ExtractorError, UnsupportedError, - base_url, determine_ext, determine_protocol, dict_get, @@ -38,6 +37,7 @@ unescapeHTML, unified_timestamp, unsmuggle_url, + update_url, update_url_query, url_or_none, urlhandle_detect_ext, @@ -2538,12 +2538,13 @@ def _real_extract(self, url): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.url), + xspf_base_url=new_url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=base_url(full_response.url), + # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs + mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0], mpd_url=url) info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None self._extra_manifest_info(info_dict, url) diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index b7581d77e..2d923cf54 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -8,7 +8,7 @@ class GetCourseRuPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _VALID_URL = r'https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+' _EMBED_REGEX = [rf']+\bsrc=[\'"](?P{_VALID_URL}[^\'"]*)'] _TESTS = [{ 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', @@ -20,6 +20,16 @@ class GetCourseRuPlayerIE(InfoExtractor): 'duration': 1693, }, 'skip': 'JWT expired', + }, { + 'url': 'https://cf-api-2.vhcdn.com/sign-player/?json=example', + 'info_dict': { + 'id': '435735291', + 'title': '8afd7c489952108e00f019590f3711f3', + 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/8afd7c489952108e00f019590f3711f3/preview.jpg?version=1682170973&host=vh-72', + 'duration': 777, + }, + 'skip': 'JWT expired', }] def _real_extract(self, url): @@ -168,7 +178,7 @@ def _real_extract(self, url): playlist_id = self._search_regex( r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) - title = self._og_search_title(webpage) or self._html_extract_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) return self.playlist_from_matches( re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 9b91a454b..bf2344f72 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -1,3 +1,4 @@ +import json import re import time @@ -6,9 +7,7 @@ ExtractorError, determine_ext, js_to_json, - parse_qs, traverse_obj, - urlencode_postdata, ) @@ -16,7 +15,6 @@ class IPrimaIE(InfoExtractor): _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_BYPASS = False _NETRC_MACHINE = 'iprima' - _AUTH_ROOT = 'https://auth.iprima.cz' access_token = None _TESTS = [{ @@ -86,48 +84,18 @@ def _perform_login(self, username, password): if self.access_token: return - login_page = self._download_webpage( - f'{self._AUTH_ROOT}/oauth2/login', None, note='Downloading login page', - errnote='Downloading login page failed') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - '_email': username, - '_password': password}) - - profile_select_html, login_handle = self._download_webpage_handle( - f'{self._AUTH_ROOT}/oauth2/login', None, data=urlencode_postdata(login_form), - note='Logging in') - - # a profile may need to be selected first, even when there is only a single one - if '/profile-select' in login_handle.url: - profile_id = self._search_regex( - r'data-identifier\s*=\s*["\']?(\w+)', profile_select_html, 'profile id') - - login_handle = self._request_webpage( - f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None, - query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile') - - code = traverse_obj(login_handle.url, ({parse_qs}, 'code', 0)) - if not code: - raise ExtractorError('Login failed', expected=True) - - token_request_data = { - 'scope': 'openid+email+profile+phone+address+offline_access', - 'client_id': 'prima_sso', - 'grant_type': 'authorization_code', - 'code': code, - 'redirect_uri': f'{self._AUTH_ROOT}/sso/auth-check'} - token_data = self._download_json( - f'{self._AUTH_ROOT}/oauth2/token', None, - note='Downloading token', errnote='Downloading token failed', - data=urlencode_postdata(token_request_data)) + 'https://ucet.iprima.cz/api/session/create', None, + note='Logging in', errnote='Failed to log in', + data=json.dumps({ + 'email': username, + 'password': password, + 'deviceName': 'Windows Chrome', + }).encode(), headers={'content-type': 'application/json'}) - self.access_token = token_data.get('access_token') - if self.access_token is None: - raise ExtractorError('Getting token failed', expected=True) + self.access_token = token_data['accessToken']['value'] + if not self.access_token: + raise ExtractorError('Failed to fetch access token') def _real_initialize(self): if not self.access_token: diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 030fe686b..3760e9abe 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,23 +1,33 @@ import functools +import itertools import math import re from .common import InfoExtractor from ..utils import ( InAdvancePagedList, + ISO639Utils, + OnDemandPagedList, clean_html, int_or_none, + js_to_json, make_archive_id, + orderedSet, smuggle_url, + unified_strdate, + unified_timestamp, unsmuggle_url, url_basename, url_or_none, urlencode_postdata, + urljoin, + variadic, ) from ..utils.traversal import traverse_obj class JioSaavnBaseIE(InfoExtractor): + _URL_BASE_RE = r'https?://(?:www\.)?(?:jio)?saavn\.com' _API_URL = 'https://www.jiosaavn.com/api.php' _VALID_BITRATES = {'16', '32', '64', '128', '320'} @@ -30,16 +40,20 @@ def requested_bitrates(self): f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}') return requested_bitrates - def _extract_formats(self, song_data): + def _extract_formats(self, item_data): + # Show/episode JSON data has a slightly different structure than song JSON data + if media_url := traverse_obj(item_data, ('more_info', 'encrypted_media_url', {str})): + item_data.setdefault('encrypted_media_url', media_url) + for bitrate in self.requested_bitrates: media_data = self._download_json( - self._API_URL, song_data['id'], + self._API_URL, item_data['id'], f'Downloading format info for {bitrate}', fatal=False, data=urlencode_postdata({ '__call': 'song.generateAuthToken', '_format': 'json', 'bitrate': bitrate, - 'url': song_data['encrypted_media_url'], + 'url': item_data['encrypted_media_url'], })) if not traverse_obj(media_data, ('auth_url', {url_or_none})): self.report_warning(f'Unable to extract format info for {bitrate}') @@ -53,24 +67,6 @@ def _extract_formats(self, song_data): 'vcodec': 'none', } - def _extract_song(self, song_data, url=None): - info = traverse_obj(song_data, { - 'id': ('id', {str}), - 'title': ('song', {clean_html}), - 'album': ('album', {clean_html}), - 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), - 'duration': ('duration', {int_or_none}), - 'view_count': ('play_count', {int_or_none}), - 'release_year': ('year', {int_or_none}), - 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), - 'webpage_url': ('perma_url', {url_or_none}), - }) - if webpage_url := info.get('webpage_url') or url: - info['display_id'] = url_basename(webpage_url) - info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] - - return info - def _call_api(self, type_, token, note='API', params={}): return self._download_json( self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON', @@ -84,19 +80,89 @@ def _call_api(self, type_, token, note='API', params={}): **params, }) - def _yield_songs(self, playlist_data): - for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): - song_info = self._extract_song(song_data) - url = smuggle_url(song_info['webpage_url'], { - 'id': song_data['id'], - 'encrypted_media_url': song_data['encrypted_media_url'], - }) - yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + @staticmethod + def _extract_song(song_data, url=None): + info = traverse_obj(song_data, { + 'id': ('id', {str}), + 'title': (('song', 'title'), {clean_html}, any), + 'album': ((None, 'more_info'), 'album', {clean_html}, any), + 'duration': ((None, 'more_info'), 'duration', {int_or_none}, any), + 'channel': ((None, 'more_info'), 'label', {str}, any), + 'channel_id': ((None, 'more_info'), 'label_id', {str}, any), + 'channel_url': ((None, 'more_info'), 'label_url', {urljoin('https://www.jiosaavn.com/')}, any), + 'release_date': ((None, 'more_info'), 'release_date', {unified_strdate}, any), + 'release_year': ('year', {int_or_none}), + 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), + 'view_count': ('play_count', {int_or_none}), + 'language': ('language', {lambda x: ISO639Utils.short2long(x.casefold()) or 'und'}), + 'webpage_url': ('perma_url', {url_or_none}), + 'artists': ('more_info', 'artistMap', 'primary_artists', ..., 'name', {str}, filter, all), + }) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + + if primary_artists := traverse_obj(song_data, ('primary_artists', {lambda x: x.split(', ') if x else None})): + info['artists'].extend(primary_artists) + if featured_artists := traverse_obj(song_data, ('featured_artists', {str}, filter)): + info['artists'].extend(featured_artists.split(', ')) + info['artists'] = orderedSet(info['artists']) or None + + return info + + @staticmethod + def _extract_episode(episode_data, url=None): + info = JioSaavnBaseIE._extract_song(episode_data, url) + info.pop('_old_archive_ids', None) + info.update(traverse_obj(episode_data, { + 'description': ('more_info', 'description', {str}), + 'timestamp': ('more_info', 'release_time', {unified_timestamp}), + 'series': ('more_info', 'show_title', {str}), + 'series_id': ('more_info', 'show_id', {str}), + 'season': ('more_info', 'season_title', {str}), + 'season_number': ('more_info', 'season_no', {int_or_none}), + 'season_id': ('more_info', 'season_id', {str}), + 'episode_number': ('more_info', 'episode_number', {int_or_none}), + 'cast': ('starring', {lambda x: x.split(', ') if x else None}), + })) + return info + + def _extract_jiosaavn_result(self, url, endpoint, response_key, parse_func): + url, smuggled_data = unsmuggle_url(url) + data = traverse_obj(smuggled_data, ({ + 'id': ('id', {str}), + 'encrypted_media_url': ('encrypted_media_url', {str}), + })) + + if 'id' in data and 'encrypted_media_url' in data: + result = {'id': data['id']} + else: + # only extract metadata if this is not a url_transparent result + data = self._call_api(endpoint, self._match_id(url))[response_key][0] + result = parse_func(data, url) + + result['formats'] = list(self._extract_formats(data)) + return result + + def _yield_items(self, playlist_data, keys=None, parse_func=None): + """Subclasses using this method must set _ENTRY_IE""" + if parse_func is None: + parse_func = self._extract_song + + for item_data in traverse_obj(playlist_data, ( + *variadic(keys, (str, bytes, dict, set)), lambda _, v: v['id'] and v['perma_url'], + )): + info = parse_func(item_data) + url = smuggle_url(info['webpage_url'], traverse_obj(item_data, { + 'id': ('id', {str}), + 'encrypted_media_url': ((None, 'more_info'), 'encrypted_media_url', {str}, any), + })) + yield self.url_result(url, self._ENTRY_IE, url_transparent=True, **info) class JioSaavnSongIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:song' - _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'(?:/song/[^/?#]+/|/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', 'md5': '3b84396d15ed9e083c3106f1fa589c04', @@ -106,12 +172,38 @@ class JioSaavnSongIE(JioSaavnBaseIE): 'ext': 'm4a', 'title': 'Leja Re', 'album': 'Leja Re', - 'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'thumbnail': r're:https?://.+/.+\.jpg', 'duration': 205, 'view_count': int, 'release_year': 2018, 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'], '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'], + 'channel': 'T-Series', + 'language': 'hin', + 'channel_id': '34297', + 'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_', + 'release_date': '20181124', + }, + }, { + 'url': 'https://www.jiosaavn.com/song/chuttamalle/P1FfWjZkQ0Q', + 'md5': '96296c58d6ce488a417ef0728fd2d680', + 'info_dict': { + 'id': 'O94kBTtw', + 'display_id': 'P1FfWjZkQ0Q', + 'ext': 'm4a', + 'title': 'Chuttamalle', + 'album': 'Devara Part 1 - Telugu', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'duration': 222, + 'view_count': int, + 'release_year': 2024, + 'artists': 'count:3', + '_old_archive_ids': ['jiosaavnsong P1FfWjZkQ0Q'], + 'channel': 'T-Series', + 'language': 'tel', + 'channel_id': '34297', + 'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_', + 'release_date': '20240926', }, }, { 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', @@ -119,26 +211,51 @@ class JioSaavnSongIE(JioSaavnBaseIE): }] def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url) - song_data = traverse_obj(smuggled_data, ({ - 'id': ('id', {str}), - 'encrypted_media_url': ('encrypted_media_url', {str}), - })) + return self._extract_jiosaavn_result(url, 'song', 'songs', self._extract_song) - if 'id' in song_data and 'encrypted_media_url' in song_data: - result = {'id': song_data['id']} - else: - # only extract metadata if this is not a url_transparent result - song_data = self._call_api('song', self._match_id(url))['songs'][0] - result = self._extract_song(song_data, url) - result['formats'] = list(self._extract_formats(song_data)) - return result +class JioSaavnShowIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:show' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/[^/?#]+/(?P[^/?#]{11,})/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/shows/non-food-ways-to-boost-your-energy/XFMcKICOCgc_', + 'md5': '0733cd254cfe74ef88bea1eaedcf1f4f', + 'info_dict': { + 'id': 'qqzh3RKZ', + 'display_id': 'XFMcKICOCgc_', + 'ext': 'mp3', + 'title': 'Non-Food Ways To Boost Your Energy', + 'description': 'md5:26e7129644b5c6aada32b8851c3997c8', + 'episode': 'Episode 1', + 'timestamp': 1640563200, + 'series': 'Holistic Lifestyle With Neha Ranglani', + 'series_id': '52397', + 'season': 'Holistic Lifestyle With Neha Ranglani', + 'season_number': 1, + 'season_id': '61273', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'duration': 311, + 'view_count': int, + 'release_year': 2021, + 'language': 'eng', + 'channel': 'Saavn OG', + 'channel_id': '1953876', + 'episode_number': 1, + 'upload_date': '20211227', + 'release_date': '20211227', + }, + }, { + 'url': 'https://www.jiosaavn.com/shows/himesh-reshammiya/Kr8fmfSN4vo_', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self._extract_jiosaavn_result(url, 'episode', 'episodes', self._extract_episode) class JioSaavnAlbumIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:album' - _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/album/[^/?#]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', 'info_dict': { @@ -147,18 +264,19 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): }, 'playlist_count': 10, }] + _ENTRY_IE = JioSaavnSongIE def _real_extract(self, url): display_id = self._match_id(url) album_data = self._call_api('album', display_id) return self.playlist_result( - self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str}))) + self._yield_items(album_data, 'songs'), display_id, traverse_obj(album_data, ('title', {str}))) class JioSaavnPlaylistIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:playlist' - _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', 'info_dict': { @@ -172,15 +290,16 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE): 'id': 'DVR,pFUOwyXqIp77B1JF,A__', 'title': 'Mood Hindi', }, - 'playlist_mincount': 801, + 'playlist_mincount': 750, }, { 'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_', 'info_dict': { 'id': 'Me5RridRfDk_', 'title': 'Taaza Tunes', }, - 'playlist_mincount': 301, + 'playlist_mincount': 50, }] + _ENTRY_IE = JioSaavnSongIE _PAGE_SIZE = 50 def _fetch_page(self, token, page): @@ -189,7 +308,7 @@ def _fetch_page(self, token, page): def _entries(self, token, first_page_data, page): page_data = first_page_data if not page else self._fetch_page(token, page + 1) - yield from self._yield_songs(page_data) + yield from self._yield_items(page_data, 'songs') def _real_extract(self, url): display_id = self._match_id(url) @@ -199,3 +318,95 @@ def _real_extract(self, url): return self.playlist_result(InAdvancePagedList( functools.partial(self._entries, display_id, playlist_data), total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) + + +class JioSaavnShowPlaylistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:show:playlist' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/(?P[^#/?]+)/(?P\d+)/[^/?#]+' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/shows/talking-music/1/PjReFP-Sguk_', + 'info_dict': { + 'id': 'talking-music-1', + 'title': 'Talking Music', + }, + 'playlist_mincount': 11, + }] + _ENTRY_IE = JioSaavnShowIE + _PAGE_SIZE = 10 + + def _fetch_page(self, show_id, season_id, page): + return self._call_api('show', show_id, f'show page {page}', { + 'p': page, + '__call': 'show.getAllEpisodes', + 'show_id': show_id, + 'season_number': season_id, + 'api_version': '4', + 'sort_order': 'desc', + }) + + def _entries(self, show_id, season_id, page): + page_data = self._fetch_page(show_id, season_id, page + 1) + yield from self._yield_items(page_data, keys=None, parse_func=self._extract_episode) + + def _real_extract(self, url): + show_slug, season_id = self._match_valid_url(url).group('show', 'season') + playlist_id = f'{show_slug}-{season_id}' + webpage = self._download_webpage(url, playlist_id) + + show_info = self._search_json( + r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', + playlist_id, transform_source=js_to_json)['showView'] + show_id = show_info['current_id'] + + entries = OnDemandPagedList(functools.partial(self._entries, show_id, season_id), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, traverse_obj(show_info, ('show', 'title', 'text', {str}))) + + +class JioSaavnArtistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:artist' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/artist/[^/?#]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/artist/krsna-songs/rYLBEve2z3U_', + 'info_dict': { + 'id': 'rYLBEve2z3U_', + 'title': 'KR$NA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'https://www.jiosaavn.com/artist/sanam-puri-songs/SkNEv3qRhDE_', + 'info_dict': { + 'id': 'SkNEv3qRhDE_', + 'title': 'Sanam Puri', + }, + 'playlist_mincount': 51, + }] + _ENTRY_IE = JioSaavnSongIE + _PAGE_SIZE = 50 + + def _fetch_page(self, artist_id, page): + return self._call_api('artist', artist_id, f'artist page {page + 1}', { + 'p': page, + 'n_song': self._PAGE_SIZE, + 'n_album': self._PAGE_SIZE, + 'sub_type': '', + 'includeMetaTags': '', + 'api_version': '4', + 'category': 'alphabetical', + 'sort_order': 'asc', + }) + + def _entries(self, artist_id, first_page): + for page in itertools.count(): + playlist_data = first_page if not page else self._fetch_page(artist_id, page) + if not traverse_obj(playlist_data, ('topSongs', ..., {dict})): + break + yield from self._yield_items(playlist_data, 'topSongs') + + def _real_extract(self, url): + artist_id = self._match_id(url) + first_page = self._fetch_page(artist_id, 0) + + return self.playlist_result( + self._entries(artist_id, first_page), artist_id, + traverse_obj(first_page, ('name', {str}))) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index c8c8ae52a..2974f4026 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,4 +1,5 @@ import itertools +import json import re from .common import InfoExtractor @@ -9,12 +10,12 @@ int_or_none, mimetype2ext, srt_subtitles_timecode, - traverse_obj, try_get, url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import find_elements, require, traverse_obj class LinkedInBaseIE(InfoExtractor): @@ -82,7 +83,10 @@ def _get_video_id(self, video_data, course_slug, video_slug): class LinkedInIE(LinkedInBaseIE): - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)' + _VALID_URL = [ + r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)', + r'https?://(?:www\.)?linkedin\.com/feed/update/urn:li:activity:(?P\d+)', + ] _TESTS = [{ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', 'info_dict': { @@ -106,6 +110,9 @@ class LinkedInIE(LinkedInBaseIE): 'like_count': int, 'subtitles': 'mincount:1', }, + }, { + 'url': 'https://www.linkedin.com/feed/update/urn:li:activity:7016901149999955968/?utm_source=share&utm_medium=member_desktop', + 'only_matching': True, }] def _real_extract(self, url): @@ -271,3 +278,110 @@ def _real_extract(self, url): entries, course_slug, course_data.get('title'), course_data.get('description')) + + +class LinkedInEventsIE(LinkedInBaseIE): + IE_NAME = 'linkedin:events' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/events/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.linkedin.com/events/7084656651378536448/comments/', + 'info_dict': { + 'id': '7084656651378536448', + 'ext': 'mp4', + 'title': '#37 Aprende a hacer una entrevista en inglés para tu próximo trabajo remoto', + 'description': '¡Agarra para anotar que se viene tremendo evento!', + 'duration': 1765, + 'timestamp': 1689113772, + 'upload_date': '20230711', + 'release_timestamp': 1689174012, + 'release_date': '20230712', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://www.linkedin.com/events/27-02energyfreedombyenergyclub7295762520814874625/comments/', + 'info_dict': { + 'id': '27-02energyfreedombyenergyclub7295762520814874625', + 'ext': 'mp4', + 'title': '27.02 Energy Freedom by Energy Club', + 'description': 'md5:1292e6f31df998914c293787a02c3b91', + 'duration': 6420, + 'timestamp': 1739445333, + 'upload_date': '20250213', + 'release_timestamp': 1740657620, + 'release_date': '20250227', + 'live_status': 'was_live', + }, + }] + + def _real_initialize(self): + if not self._get_cookies('https://www.linkedin.com/').get('li_at'): + self.raise_login_required() + + def _real_extract(self, url): + event_id = self._match_id(url) + webpage = self._download_webpage(url, event_id) + + base_data = traverse_obj(webpage, ( + {find_elements(tag='code', attr='style', value='display: none')}, ..., {json.loads}, 'included', ...)) + meta_data = traverse_obj(base_data, ( + lambda _, v: v['$type'] == 'com.linkedin.voyager.dash.events.ProfessionalEvent', any)) or {} + + live_status = { + 'PAST': 'was_live', + 'ONGOING': 'is_live', + 'FUTURE': 'is_upcoming', + }.get(meta_data.get('lifecycleState')) + + if live_status == 'is_upcoming': + player_data = {} + if event_time := traverse_obj(meta_data, ('displayEventTime', {str})): + message = f'This live event is scheduled for {event_time}' + else: + message = 'This live event has not yet started' + self.raise_no_formats(message, expected=True, video_id=event_id) + else: + # TODO: Add support for audio-only live events + player_data = traverse_obj(base_data, ( + lambda _, v: v['$type'] == 'com.linkedin.videocontent.VideoPlayMetadata', + any, {require('video player data')})) + + formats, subtitles = [], {} + for prog_fmts in traverse_obj(player_data, ('progressiveStreams', ..., {dict})): + for fmt_url in traverse_obj(prog_fmts, ('streamingLocations', ..., 'url', {url_or_none})): + formats.append({ + 'url': fmt_url, + **traverse_obj(prog_fmts, { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'tbr': ('bitRate', {int_or_none(scale=1000)}), + 'filesize': ('size', {int_or_none}), + 'ext': ('mediaType', {mimetype2ext}), + }), + }) + + for m3u8_url in traverse_obj(player_data, ( + 'adaptiveStreams', lambda _, v: v['protocol'] == 'HLS', 'masterPlaylists', ..., 'url', {url_or_none}, + )): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, event_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': event_id, + 'formats': formats, + 'subtitles': subtitles, + 'live_status': live_status, + **traverse_obj(meta_data, { + 'title': ('name', {str}), + 'description': ('description', 'text', {str}), + 'timestamp': ('createdAt', {int_or_none(scale=1000)}), + # timeRange.start is available when the stream is_upcoming + 'release_timestamp': ('timeRange', 'start', {int_or_none(scale=1000)}), + }), + **traverse_obj(player_data, { + 'duration': ('duration', {int_or_none(scale=1000)}), + # liveStreamCreatedAt is only available when the stream is_live or was_live + 'release_timestamp': ('liveStreamCreatedAt', {int_or_none(scale=1000)}), + }), + } diff --git a/yt_dlp/extractor/loco.py b/yt_dlp/extractor/loco.py index a648f7e13..6c9a25567 100644 --- a/yt_dlp/extractor/loco.py +++ b/yt_dlp/extractor/loco.py @@ -1,5 +1,9 @@ +import json +import random +import time + from .common import InfoExtractor -from ..utils import int_or_none, url_or_none +from ..utils import int_or_none, jwt_decode_hs256, try_call, url_or_none from ..utils.traversal import require, traverse_obj @@ -55,13 +59,81 @@ class LocoIE(InfoExtractor): 'upload_date': '20250226', 'modified_date': '20250226', }, + }, { + # Requires video authorization + 'url': 'https://loco.com/stream/ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'md5': '0513edf85c1e65c9521f555f665387d5', + 'info_dict': { + 'id': 'ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'ext': 'mp4', + 'title': 'DUAS CONTAS DESAFIANTE, RUSH TOP 1 NO BRASIL!', + 'description': 'md5:aa77818edd6fe00dd4b6be75cba5f826', + 'uploader_id': '7Y9JNAZC3Q', + 'channel': 'ayellol', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 1229, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/f5aa678b-6d04-45d9-a89a-859af0a8028f.jpg', + 'tags': ['Gameplay', 'Carry'], + 'series': 'League of Legends', + 'timestamp': 1741182253, + 'upload_date': '20250305', + 'modified_timestamp': 1741182419, + 'modified_date': '20250305', + }, }] + # From _app.js + _CLIENT_ID = 'TlwKp1zmF6eKFpcisn3FyR18WkhcPkZtzwPVEEC3' + _CLIENT_SECRET = 'Kp7tYlUN7LXvtcSpwYvIitgYcLparbtsQSe5AdyyCdiEJBP53Vt9J8eB4AsLdChIpcO2BM19RA3HsGtqDJFjWmwoonvMSG3ZQmnS8x1YIM8yl82xMXZGbE3NKiqmgBVU' + + def _is_jwt_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 300 + + def _get_access_token(self, video_id): + access_token = try_call(lambda: self._get_cookies('https://loco.com')['access_token'].value) + if access_token and not self._is_jwt_expired(access_token): + return access_token + access_token = traverse_obj(self._download_json( + 'https://api.getloconow.com/v3/user/device_profile/', video_id, + 'Downloading access token', fatal=False, data=json.dumps({ + 'platform': 7, + 'client_id': self._CLIENT_ID, + 'client_secret': self._CLIENT_SECRET, + 'model': 'Mozilla', + 'os_name': 'Win32', + 'os_ver': '5.0 (Windows)', + 'app_ver': '5.0 (Windows)', + }).encode(), headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'DEVICE-ID': ''.join(random.choices('0123456789abcdef', k=32)) + 'live', + 'X-APP-LANG': 'en', + 'X-APP-LOCALE': 'en-US', + 'X-CLIENT-ID': self._CLIENT_ID, + 'X-CLIENT-SECRET': self._CLIENT_SECRET, + 'X-PLATFORM': '7', + }), 'access_token') + if access_token and not self._is_jwt_expired(access_token): + self._set_cookie('.loco.com', 'access_token', access_token) + return access_token + def _real_extract(self, url): video_type, video_id = self._match_valid_url(url).group('type', 'id') webpage = self._download_webpage(url, video_id) stream = traverse_obj(self._search_nextjs_data(webpage, video_id), ( - 'props', 'pageProps', ('liveStreamData', 'stream'), {dict}, any, {require('stream info')})) + 'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) + + if access_token := self._get_access_token(video_id): + self._request_webpage( + 'https://drm.loco.com/v1/streams/playback/', video_id, + 'Downloading video authorization', fatal=False, headers={ + 'authorization': access_token, + }, query={ + 'stream_uid': stream['uid'], + }) return { 'formats': self._extract_m3u8_formats(stream['conf']['hls'], video_id), diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py index e50194f88..caff9125e 100644 --- a/yt_dlp/extractor/lrt.py +++ b/yt_dlp/extractor/lrt.py @@ -2,7 +2,6 @@ from ..utils import ( clean_html, merge_dicts, - str_or_none, traverse_obj, unified_timestamp, url_or_none, @@ -138,13 +137,15 @@ def _real_extract(self, url): 'https://www.lrt.lt/radioteka/api/media', video_id, query={'url': f'/mediateka/irasas/{video_id}/{path}'}) - return traverse_obj(media, { - 'id': ('id', {int}, {str_or_none}), - 'title': ('title', {str}), - 'tags': ('tags', ..., 'name', {str}), - 'categories': ('playlist_item', 'category', {str}, filter, all, filter), - 'description': ('content', {clean_html}, {str}), - 'timestamp': ('date', {lambda x: x.replace('.', '/')}, {unified_timestamp}), - 'thumbnail': ('playlist_item', 'image', {urljoin('https://www.lrt.lt')}), - 'formats': ('playlist_item', 'file', {lambda x: self._extract_m3u8_formats(x, video_id)}), - }) + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(media['playlist_item']['file'], video_id), + **traverse_obj(media, { + 'title': ('title', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'categories': ('playlist_item', 'category', {str}, filter, all, filter), + 'description': ('content', {clean_html}, {str}), + 'timestamp': ('date', {lambda x: x.replace('.', '/')}, {unified_timestamp}), + 'thumbnail': ('playlist_item', 'image', {urljoin('https://www.lrt.lt')}), + }), + } diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index 8caa8f87f..1356169bf 100644 --- a/yt_dlp/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py @@ -1,31 +1,38 @@ -import re - from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, - extract_attributes, int_or_none, - str_to_int, + join_nonempty, + parse_count, + parse_duration, + parse_iso8601, url_or_none, - urlencode_postdata, ) +from ..utils.traversal import traverse_obj class ManyVidsIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' _TESTS = [{ # preview video - 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', - 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'url': 'https://www.manyvids.com/Video/530341/mv-tips-tricks', + 'md5': '738dc723f7735ee9602f7ea352a6d058', 'info_dict': { - 'id': '133957', + 'id': '530341-preview', 'ext': 'mp4', - 'title': 'everthing about me (Preview)', - 'uploader': 'ellyxxix', + 'title': 'MV Tips & Tricks (Preview)', + 'description': r're:I will take you on a tour around .{1313}$', + 'thumbnail': r're:https://cdn5\.manyvids\.com/php_uploads/video_images/DestinyDiaz/.+\.jpg', + 'uploader': 'DestinyDiaz', 'view_count': int, 'like_count': int, + 'release_timestamp': 1508419904, + 'tags': ['AdultSchool', 'BBW', 'SFW', 'TeacherFetish'], + 'release_date': '20171019', + 'duration': 3167.0, }, + 'expected_warnings': ['Only extracting preview'], }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', @@ -34,129 +41,68 @@ class ManyVidsIE(InfoExtractor): 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', - 'description': 'md5:ec5901d41808b3746fed90face161612', + 'description': r're:Today is the day!! I am finally taking off my mask .{445}$', + 'thumbnail': r're:https://ods\.manyvids\.com/1001061960/3aa5397f2a723ec4597e344df66ab845/screenshots/.+\.jpg', 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, + 'release_date': '20181110', + 'tags': ['EyeContact', 'Interviews', 'MaskFetish', 'MouthFetish', 'Redhead'], + 'release_timestamp': 1541851200, + 'duration': 224.0, }, }] + _API_BASE = 'https://www.manyvids.com/bff/store/video' def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_json(f'{self._API_BASE}/{video_id}/private', video_id)['data'] + formats, preview_only = [], True - real_url = f'https://www.manyvids.com/video/{video_id}/gtm.js' - try: - webpage = self._download_webpage(real_url, video_id) - except Exception: - # probably useless fallback - webpage = self._download_webpage(url, video_id) - - info = self._search_regex( - r'''(]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', - webpage, 'meta details', default='') - info = extract_attributes(info) - - player = self._search_regex( - r'''(]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', - webpage, 'player details', default='') - player = extract_attributes(player) - - video_urls_and_ids = ( - (info.get('data-meta-video'), 'video'), - (player.get('data-video-transcoded'), 'transcoded'), - (player.get('data-video-filepath'), 'filepath'), - (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), - ) - - def txt_or_none(s, default=None): - return (s.strip() or default) if isinstance(s, str) else default - - uploader = txt_or_none(info.get('data-meta-author')) - - def mung_title(s): - if uploader: - s = re.sub(rf'^\s*{re.escape(uploader)}\s+[|-]', '', s) - return txt_or_none(s) - - title = ( - mung_title(info.get('data-meta-title')) - or self._html_search_regex( - (r']+class=["\']item-title[^>]+>([^<]+)', - r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) - or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True)) - - title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title - - if any(p in webpage for p in ('preview_videos', '_preview.mp4')): - title += ' (Preview)' - - mv_token = self._search_regex( - r'data-mvtoken=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'mv token', default=None, group='value') - - if mv_token: - # Sets some cookies - self._download_webpage( - 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, note='Setting format cookies', fatal=False, - data=urlencode_postdata({ - 'mvtoken': mv_token, - 'vid': video_id, - }), headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - formats = [] - for v_url, fmt in video_urls_and_ids: - v_url = url_or_none(v_url) - if not v_url: + for format_id, path in [ + ('preview', ['teaser', 'filepath']), + ('transcoded', ['transcodedFilepath']), + ('filepath', ['filepath']), + ]: + format_url = traverse_obj(video_data, (*path, {url_or_none})) + if not format_url: continue - if determine_ext(v_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id=format_id)) else: formats.append({ - 'url': v_url, - 'format_id': fmt, + 'url': format_url, + 'format_id': format_id, + 'preference': -10 if format_id == 'preview' else None, + 'quality': 10 if format_id == 'filepath' else None, + 'height': int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', format_url, 'height', default=None)), }) + if format_id != 'preview': + preview_only = False - self._remove_duplicate_formats(formats) + metadata = traverse_obj( + self._download_json(f'{self._API_BASE}/{video_id}', video_id, fatal=False), 'data') + title = traverse_obj(metadata, ('title', {clean_html})) - for f in formats: - if f.get('height') is None: - f['height'] = int_or_none( - self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) - if '/preview/' in f['url']: - f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) - f['preference'] = -10 - if 'transcoded' in f['format_id']: - f['preference'] = f.get('preference', -1) - 1 - - def get_likes(): - likes = self._search_regex( - rf'''(]*\bdata-id\s*=\s*(['"]){video_id}\2[^>]*>)''', - webpage, 'likes', default='') - likes = extract_attributes(likes) - return int_or_none(likes.get('data-likes')) - - def get_views(): - return str_to_int(self._html_search_regex( - r'''(?s)]*\bclass\s*=["']views-wrapper\b[^>]+>.+?]+>\s*(\d[\d,.]*)\s*''', - webpage, 'view count', default=None)) + if preview_only: + title = join_nonempty(title, '(Preview)', delim=' ') + video_id += '-preview' + self.report_warning( + f'Only extracting preview. Video may be paid or subscription only. {self._login_hint()}') return { 'id': video_id, 'title': title, 'formats': formats, - 'description': txt_or_none(info.get('data-meta-description')), - 'uploader': txt_or_none(info.get('data-meta-author')), - 'thumbnail': ( - url_or_none(info.get('data-meta-image')) - or url_or_none(player.get('data-video-screenshot'))), - 'view_count': get_views(), - 'like_count': get_likes(), + **traverse_obj(metadata, { + 'description': ('description', {clean_html}), + 'uploader': ('model', 'displayName', {clean_html}), + 'thumbnail': (('screenshot', 'thumbnail'), {url_or_none}, any), + 'view_count': ('views', {parse_count}), + 'like_count': ('likes', {parse_count}), + 'release_timestamp': ('launchDate', {parse_iso8601}), + 'duration': ('videoDuration', {parse_duration}), + 'tags': ('tagList', ..., 'label', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index f7726e5b1..562b93fc7 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -365,13 +365,15 @@ def _real_initialize(self): 'All videos are only available to registered users', method='password') def _set_device_id(self, username): - if not self._device_id: - self._device_id = self.cache.load( - self._NETRC_MACHINE, 'device_ids', default={}).get(username) + if self._device_id: + return + device_id_cache = self.cache.load(self._NETRC_MACHINE, 'device_ids', default={}) + self._device_id = device_id_cache.get(username) if self._device_id: return self._device_id = str(uuid.uuid4()) - self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id}) + device_id_cache[username] = self._device_id + self.cache.store(self._NETRC_MACHINE, 'device_ids', device_id_cache) def _perform_login(self, username, password): try: diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 42ef25f17..6ced19d6a 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -3,6 +3,7 @@ from .art19 import Art19IE from .common import InfoExtractor +from ..networking import PATCHRequest from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, @@ -74,7 +75,7 @@ def _extract_formats(self, content_id, slug): 'app_version': '23.10.0', 'platform': 'ios', }) - return {'formats': fmts, 'subtitles': subs} + break except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_login_required() @@ -84,6 +85,9 @@ def _extract_formats(self, content_id, slug): continue raise + self.mark_watched(content_id, slug) + return {'formats': fmts, 'subtitles': subs} + def _extract_video_metadata(self, episode): channel_url = traverse_obj( episode, (('channel_slug', 'class_slug'), {urljoin('https://nebula.tv/')}), get_all=False) @@ -111,6 +115,13 @@ def _extract_video_metadata(self, episode): 'uploader_url': channel_url, } + def _mark_watched(self, content_id, slug): + self._call_api( + PATCHRequest(f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/progress/'), + slug, 'Marking watched', 'Unable to mark watched', fatal=False, + data=json.dumps({'completed': True}).encode(), + headers={'content-type': 'application/json'}) + class NebulaIE(NebulaBaseIE): IE_NAME = 'nebula:video' @@ -322,6 +333,7 @@ def _real_extract(self, url): if not episode_url and metadata.get('premium'): self.raise_login_required() + self.mark_watched(metadata['id'], slug) if Art19IE.suitable(episode_url): return self.url_result(episode_url, Art19IE) return traverse_obj(metadata, { diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 5e66aebeb..0d0f7ceef 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -16,7 +16,7 @@ determine_ext, float_or_none, int_or_none, - join_nonempty, + parse_bitrate, parse_duration, parse_iso8601, parse_qs, @@ -24,8 +24,6 @@ qualities, remove_start, str_or_none, - traverse_obj, - try_get, unescapeHTML, unified_timestamp, update_url_query, @@ -34,13 +32,70 @@ urlencode_postdata, urljoin, ) +from ..utils.traversal import find_element, require, traverse_obj -class NiconicoIE(InfoExtractor): +class NiconicoBaseIE(InfoExtractor): + _GEO_BYPASS = False + _GEO_COUNTRIES = ['JP'] + _LOGIN_BASE = 'https://account.nicovideo.jp' + _NETRC_MACHINE = 'niconico' + + @property + def is_logged_in(self): + return bool(self._get_cookies('https://www.nicovideo.jp').get('user_session')) + + def _raise_login_error(self, message, expected=True): + raise ExtractorError(f'Unable to login: {message}', expected=expected) + + def _perform_login(self, username, password): + if self.is_logged_in: + return + + self._request_webpage( + f'{self._LOGIN_BASE}/login', None, 'Requesting session cookies') + webpage = self._download_webpage( + f'{self._LOGIN_BASE}/login/redirector', None, + 'Logging in', 'Unable to log in', headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': f'{self._LOGIN_BASE}/login', + }, data=urlencode_postdata({ + 'mail_tel': username, + 'password': password, + })) + + if self.is_logged_in: + return + elif err_msg := traverse_obj(webpage, ( + {find_element(cls='notice error')}, {find_element(cls='notice__text')}, {clean_html}, + )): + self._raise_login_error(err_msg or 'Invalid username or password') + elif 'oneTimePw' in webpage: + post_url = self._search_regex( + r']+action=(["\'])(?P.+?)\1', webpage, 'post url', group='url') + mfa, urlh = self._download_webpage_handle( + urljoin(self._LOGIN_BASE, post_url), None, + 'Performing MFA', 'Unable to complete MFA', headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }, data=urlencode_postdata({ + 'otp': self._get_tfa_info('6 digit number shown on app'), + })) + if self.is_logged_in: + return + elif 'error-code' in parse_qs(urlh.url): + err_msg = traverse_obj(mfa, ({find_element(cls='pageMainMsg')}, {clean_html})) + self._raise_login_error(err_msg or 'MFA session expired') + elif 'formError' in mfa: + err_msg = traverse_obj(mfa, ( + {find_element(cls='formError')}, {find_element(tag='div')}, {clean_html})) + self._raise_login_error(err_msg or 'MFA challenge failed') + + self._raise_login_error('Unexpected login error', expected=False) + + +class NiconicoIE(NiconicoBaseIE): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' - _GEO_COUNTRIES = ['JP'] - _GEO_BYPASS = False _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', @@ -180,229 +235,6 @@ class NiconicoIE(InfoExtractor): }] _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P(?:[a-z]{2})?[0-9]+)' - _NETRC_MACHINE = 'niconico' - _API_HEADERS = { - 'X-Frontend-ID': '6', - 'X-Frontend-Version': '0', - 'X-Niconico-Language': 'en-us', - 'Referer': 'https://www.nicovideo.jp/', - 'Origin': 'https://www.nicovideo.jp', - } - - def _perform_login(self, username, password): - login_ok = True - login_form_strs = { - 'mail_tel': username, - 'password': password, - } - self._request_webpage( - 'https://account.nicovideo.jp/login', None, - note='Acquiring Login session') - page = self._download_webpage( - 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(login_form_strs), - headers={ - 'Referer': 'https://account.nicovideo.jp/login', - 'Content-Type': 'application/x-www-form-urlencoded', - }) - if 'oneTimePw' in page: - post_url = self._search_regex( - r']+action=(["\'])(?P.+?)\1', page, 'post url', group='url') - page = self._download_webpage( - urljoin('https://account.nicovideo.jp', post_url), None, - note='Performing MFA', errnote='Unable to complete MFA', - data=urlencode_postdata({ - 'otp': self._get_tfa_info('6 digits code'), - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - if 'oneTimePw' in page or 'formError' in page: - err_msg = self._html_search_regex( - r'formError["\']+>(.*?)

', page, 'form_error', - default='There\'s an error but the message can\'t be parsed.', - flags=re.DOTALL) - self.report_warning(f'Unable to log in: MFA challenge failed, "{err_msg}"') - return False - login_ok = 'class="notice error"' not in page - if not login_ok: - self.report_warning('Unable to log in: bad username or password') - return login_ok - - def _get_heartbeat_info(self, info_dict): - video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') - dmc_protocol = info_dict['expected_protocol'] - - api_data = ( - info_dict.get('_api_data') - or self._parse_json( - self._html_search_regex( - 'data-api-data="([^"]+)"', - self._download_webpage('https://www.nicovideo.jp/watch/' + video_id, video_id), - 'API data', default='{}'), - video_id)) - - session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session']) - session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) - - def ping(): - tracking_id = traverse_obj(api_data, ('media', 'delivery', 'trackingId')) - if tracking_id: - tracking_url = update_url_query('https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', {'t': tracking_id}) - watch_request_response = self._download_json( - tracking_url, video_id, - note='Acquiring permission for downloading video', fatal=False, - headers=self._API_HEADERS) - if traverse_obj(watch_request_response, ('meta', 'status')) != 200: - self.report_warning('Failed to acquire permission for playing video. Video download may fail.') - - yesno = lambda x: 'yes' if x else 'no' - - if dmc_protocol == 'http': - protocol = 'http' - protocol_parameters = { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), - 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), - }, - } - elif dmc_protocol == 'hls': - protocol = 'm3u8' - segment_duration = try_get(self._configuration_arg('segment_duration'), lambda x: int(x[0])) or 6000 - parsed_token = self._parse_json(session_api_data['token'], video_id) - encryption = traverse_obj(api_data, ('media', 'delivery', 'encryption')) - protocol_parameters = { - 'hls_parameters': { - 'segment_duration': segment_duration, - 'transfer_preset': '', - 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), - 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), - }, - } - if 'hls_encryption' in parsed_token and encryption: - protocol_parameters['hls_parameters']['encryption'] = { - parsed_token['hls_encryption']: { - 'encrypted_key': encryption['encryptedKey'], - 'key_uri': encryption['keyUri'], - }, - } - else: - protocol = 'm3u8_native' - else: - raise ExtractorError(f'Unsupported DMC protocol: {dmc_protocol}') - - session_response = self._download_json( - session_api_endpoint['url'], video_id, - query={'_format': 'json'}, - headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for {}'.format(info_dict['format_id']), - data=json.dumps({ - 'session': { - 'client_info': { - 'player_id': session_api_data.get('playerId'), - }, - 'content_auth': { - 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]), - 'content_key_timeout': session_api_data.get('contentKeyTimeout'), - 'service_id': 'nicovideo', - 'service_user_id': session_api_data.get('serviceUserId'), - }, - 'content_id': session_api_data.get('contentId'), - 'content_src_id_sets': [{ - 'content_src_ids': [{ - 'src_id_to_mux': { - 'audio_src_ids': [audio_src_id], - 'video_src_ids': [video_src_id], - }, - }], - }], - 'content_type': 'movie', - 'content_uri': '', - 'keep_method': { - 'heartbeat': { - 'lifetime': session_api_data.get('heartbeatLifetime'), - }, - }, - 'priority': session_api_data['priority'], - 'protocol': { - 'name': 'http', - 'parameters': { - 'http_parameters': { - 'parameters': protocol_parameters, - }, - }, - }, - 'recipe_id': session_api_data.get('recipeId'), - 'session_operation_auth': { - 'session_operation_auth_by_signature': { - 'signature': session_api_data.get('signature'), - 'token': session_api_data.get('token'), - }, - }, - 'timing_constraint': 'unlimited', - }, - }).encode()) - - info_dict['url'] = session_response['data']['session']['content_uri'] - info_dict['protocol'] = protocol - - # get heartbeat info - heartbeat_info_dict = { - 'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT', - 'data': json.dumps(session_response['data']), - # interval, convert milliseconds to seconds, then halve to make a buffer. - 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000), - 'ping': ping, - } - - return info_dict, heartbeat_info_dict - - def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dmc_protocol): - - if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): - return None - - format_id = '-'.join( - [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol]) - - vid_qual_label = traverse_obj(video_quality, ('metadata', 'label')) - - return { - 'url': 'niconico_dmc:{}/{}/{}'.format(video_id, video_quality['id'], audio_quality['id']), - 'format_id': format_id, - 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '), - 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'acodec': 'aac', - 'vcodec': 'h264', - **traverse_obj(audio_quality, ('metadata', { - 'abr': ('bitrate', {float_or_none(scale=1000)}), - 'asr': ('samplingRate', {int_or_none}), - })), - **traverse_obj(video_quality, ('metadata', { - 'vbr': ('bitrate', {float_or_none(scale=1000)}), - 'height': ('resolution', 'height', {int_or_none}), - 'width': ('resolution', 'width', {int_or_none}), - })), - 'quality': -2 if 'low' in video_quality['id'] else None, - 'protocol': 'niconico_dmc', - 'expected_protocol': dmc_protocol, # XXX: This is not a documented field - 'http_headers': { - 'Origin': 'https://www.nicovideo.jp', - 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, - }, - } - - def _yield_dmc_formats(self, api_data, video_id): - dmc_data = traverse_obj(api_data, ('media', 'delivery', 'movie')) - audios = traverse_obj(dmc_data, ('audios', ..., {dict})) - videos = traverse_obj(dmc_data, ('videos', ..., {dict})) - protocols = traverse_obj(dmc_data, ('session', 'protocols', ..., {str})) - if not all((audios, videos, protocols)): - return - - for audio_quality, video_quality, protocol in itertools.product(audios, videos, protocols): - if fmt := self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol): - yield fmt def _yield_dms_formats(self, api_data, video_id): fmt_filter = lambda _, v: v['isAvailable'] and v['id'] @@ -451,42 +283,61 @@ def _yield_dms_formats(self, api_data, video_id): lambda _, v: v['id'] == video_fmt['format_id'], 'qualityLevel', {int_or_none}, any)) or -1 yield video_fmt + def _extract_server_response(self, webpage, video_id, fatal=True): + try: + return traverse_obj( + self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id), + ('data', 'response', {dict}, {require('server response')})) + except ExtractorError: + if not fatal: + return {} + raise + def _real_extract(self, url): video_id = self._match_id(url) try: webpage, handle = self._download_webpage_handle( - 'https://www.nicovideo.jp/watch/' + video_id, video_id) + f'https://www.nicovideo.jp/watch/{video_id}', video_id, + headers=self.geo_verification_headers()) if video_id.startswith('so'): video_id = self._match_id(handle.url) - api_data = traverse_obj( - self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id), - ('data', 'response', {dict})) - if not api_data: - raise ExtractorError('Server response data not found') + api_data = self._extract_server_response(webpage, video_id) except ExtractorError as e: try: api_data = self._download_json( - f'https://www.nicovideo.jp/api/watch/v3/{video_id}?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_{round(time.time() * 1000)}', video_id, - note='Downloading API JSON', errnote='Unable to fetch data')['data'] + f'https://www.nicovideo.jp/api/watch/v3/{video_id}', video_id, + 'Downloading API JSON', 'Unable to fetch data', query={ + '_frontendId': '6', + '_frontendVersion': '0', + 'actionTrackId': f'AAAAAAAAAA_{round(time.time() * 1000)}', + }, headers=self.geo_verification_headers())['data'] except ExtractorError: if not isinstance(e.cause, HTTPError): + # Raise if original exception was from _parse_json or utils.traversal.require raise + # The webpage server response has more detailed error info than the API response webpage = e.cause.response.read().decode('utf-8', 'replace') - error_msg = self._html_search_regex( - r'(?s)(.+?)', - webpage, 'error reason', default=None) - if not error_msg: + reason_code = self._extract_server_response( + webpage, video_id, fatal=False).get('reasonCode') + if not reason_code: raise - raise ExtractorError(clean_html(error_msg), expected=True) + if reason_code in ('DOMESTIC_VIDEO', 'HIGH_RISK_COUNTRY_VIDEO'): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + elif reason_code == 'HIDDEN_VIDEO': + raise ExtractorError( + 'The viewing period of this video has expired', expected=True) + elif reason_code == 'DELETED_VIDEO': + raise ExtractorError('This video has been deleted', expected=True) + raise ExtractorError(f'Niconico says: {reason_code}') availability = self._availability(**(traverse_obj(api_data, ('payment', 'video', { 'needs_premium': ('isPremium', {bool}), 'needs_subscription': ('isAdmission', {bool}), })) or {'needs_auth': True})) - formats = [*self._yield_dmc_formats(api_data, video_id), - *self._yield_dms_formats(api_data, video_id)] + + formats = list(self._yield_dms_formats(api_data, video_id)) if not formats: fail_msg = clean_html(self._html_search_regex( r']+\bclass="fail-message"[^>]*>(?P.+?)

', @@ -921,7 +772,7 @@ def _real_extract(self, url): return self.playlist_result(self._entries(list_id), list_id) -class NiconicoLiveIE(InfoExtractor): +class NiconicoLiveIE(NiconicoBaseIE): IE_NAME = 'niconico:live' IE_DESC = 'ニコニコ生放送' _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?Plv\d+)' @@ -953,8 +804,6 @@ class NiconicoLiveIE(InfoExtractor): 'only_matching': True, }] - _KNOWN_LATENCY = ('high', 'low') - def _real_extract(self, url): video_id = self._match_id(url) webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) @@ -970,22 +819,19 @@ def _real_extract(self, url): }) hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.') - latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) - if latency not in self._KNOWN_LATENCY: - latency = 'high' ws = self._request_webpage( Request(ws_url, headers={'Origin': f'https://{hostname}'}), video_id=video_id, note='Connecting to WebSocket server') - self.write_debug('[debug] Sending HLS server request') + self.write_debug('Sending HLS server request') ws.send(json.dumps({ 'type': 'startWatching', 'data': { 'stream': { 'quality': 'abr', - 'protocol': 'hls+fmp4', - 'latency': latency, + 'protocol': 'hls', + 'latency': 'high', 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, @@ -1049,18 +895,29 @@ def _real_extract(self, url): for cookie in cookies: self._set_cookie( cookie['domain'], cookie['name'], cookie['value'], - expire_time=unified_timestamp(cookie['expires']), path=cookie['path'], secure=cookie['secure']) + expire_time=unified_timestamp(cookie.get('expires')), path=cookie['path'], secure=cookie['secure']) + + fmt_common = { + 'live_latency': 'high', + 'origin': hostname, + 'protocol': 'niconico_live', + 'video_id': video_id, + 'ws': ws, + } + q_iter = (q for q in qualities[1:] if not q.startswith('audio_')) # ignore initial 'abr' + a_map = {96: 'audio_low', 192: 'audio_high'} formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) - for fmt, q in zip(formats, reversed(qualities[1:])): - fmt.update({ - 'format_id': q, - 'protocol': 'niconico_live', - 'ws': ws, - 'video_id': video_id, - 'live_latency': latency, - 'origin': hostname, - }) + for fmt in formats: + if fmt.get('acodec') == 'none': + fmt['format_id'] = next(q_iter, fmt['format_id']) + elif fmt.get('vcodec') == 'none': + abr = parse_bitrate(fmt['url'].lower()) + fmt.update({ + 'abr': abr, + 'format_id': a_map.get(abr, fmt['format_id']), + }) + fmt.update(fmt_common) return { 'id': video_id, diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index a97add71a..3472a2159 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 119.0, }, + 'skip': 'HTTP Error 500: Internal Server Error', }, { # article with audio and no video 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', @@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE): 'ext': 'mp3', 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', - 'timestamp': 1695960700, + 'timestamp': 1696008129, 'upload_date': '20230929', - 'creator': 'Stephanie Nolen, Natalija Gormalova', + 'creators': ['Stephanie Nolen', 'Natalija Gormalova'], 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 1322, }, }, { + # lede_media_block already has sourceId 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { @@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'timestamp': 1701290997, 'upload_date': '20231129', 'uploader': 'By The New York Times', - 'creator': 'Katie Rogers', + 'creators': ['Katie Rogers'], 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 97.631, }, @@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE): 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', 'upload_date': '20231202', - 'creator': 'Emily Steel, Sydney Ember', + 'creators': ['Emily Steel', 'Sydney Ember'], 'timestamp': 1701511264, }, 'playlist_count': 3, + }, { + # lede_media_block does not have sourceId + 'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html', + 'info_dict': { + 'id': 'hip-mobility-routine', + 'title': 'Tight Hips? These Moves Can Help.', + 'description': 'Sitting all day is hard on your hips. Try this simple routine for better mobility.', + 'creators': ['Alyssa Ages', 'Theodore Tae'], + 'timestamp': 1746003629, + 'upload_date': '20250430', + }, + 'playlist_count': 7, }, { 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'only_matching': True, @@ -256,14 +270,18 @@ def _extract_content_from_block(self, block): def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + webpage = self._download_webpage(url, page_id, impersonate=True) art_json = self._search_json( r'window\.__preloadedData\s*=', webpage, 'media details', page_id, transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article'] + content = art_json['sprinkledBody']['content'] - blocks = traverse_obj(art_json, ( - 'sprinkledBody', 'content', ..., ('ledeMedia', None), - lambda _, v: v['__typename'] in ('Video', 'Audio'))) + blocks = [] + block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio') + if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)): + lede_media_block.setdefault('sourceId', art_json.get('sourceId')) + blocks.append(lede_media_block) + blocks.extend(traverse_obj(content, (..., block_filter))) if not blocks: raise ExtractorError('Unable to extract any media blocks from webpage') @@ -273,8 +291,7 @@ def _real_extract(self, url): 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}), get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage), 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})), - 'creator': ', '.join( - traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list) + 'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})), 'thumbnails': self._extract_thumbnails(traverse_obj( art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))), } diff --git a/yt_dlp/extractor/once.py b/yt_dlp/extractor/once.py deleted file mode 100644 index 989f10abb..000000000 --- a/yt_dlp/extractor/once.py +++ /dev/null @@ -1,40 +0,0 @@ -import re - -from .common import InfoExtractor - - -class OnceIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor - _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/(?:ads/vmap/)?[^/]+/[^/]+/(?P[^/]+)/(?P[^/]+)/(?:[^/]+/)?(?P[^/]+)/content\.(?:once|m3u8|mp4)' - ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' - PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' - - def _extract_once_formats(self, url, http_formats_preference=None): - domain_id, application_id, media_item_id = re.match( - OnceIE._VALID_URL, url).groups() - formats = self._extract_m3u8_formats( - self.ADAPTIVE_URL_TEMPLATE % ( - domain_id, application_id, media_item_id), - media_item_id, 'mp4', m3u8_id='hls', fatal=False) - progressive_formats = [] - for adaptive_format in formats: - # Prevent advertisement from embedding into m3u8 playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/8893#issuecomment-199912684) - adaptive_format['url'] = re.sub( - r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url']) - rendition_id = self._search_regex( - r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', - adaptive_format['url'], 'redition id', default=None) - if rendition_id: - progressive_format = adaptive_format.copy() - progressive_format.update({ - 'url': self.PROGRESSIVE_URL_TEMPLATE % ( - domain_id, application_id, rendition_id, media_item_id), - 'format_id': adaptive_format['format_id'].replace( - 'hls', 'http'), - 'protocol': 'http', - 'preference': http_formats_preference, - }) - progressive_formats.append(progressive_format) - self._check_formats(progressive_formats, media_item_id) - formats.extend(progressive_formats) - return formats diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 91f105519..9f307a53e 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -14,8 +14,9 @@ int_or_none, parse_qs, srt_subtitles_timecode, - traverse_obj, + url_or_none, ) +from ..utils.traversal import traverse_obj class PanoptoBaseIE(InfoExtractor): @@ -345,21 +346,16 @@ def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs subtitles = {} for stream in streams or []: stream_formats = [] - http_stream_url = stream.get('StreamHttpUrl') - stream_url = stream.get('StreamUrl') - - if http_stream_url: - stream_formats.append({'url': http_stream_url}) - - if stream_url: + for stream_url in set(traverse_obj(stream, (('StreamHttpUrl', 'StreamUrl'), {url_or_none}))): media_type = stream.get('ViewerMediaFileTypeName') if media_type in ('hls', ): - m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) - stream_formats.extend(m3u8_formats) - subtitles = self._merge_subtitles(subtitles, stream_subtitles) + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, m3u8_id='hls', fatal=False) + stream_formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: stream_formats.append({ 'url': stream_url, + 'ext': media_type, }) for fmt in stream_formats: fmt.update({ diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 7794cae6c..2c1436cac 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -340,8 +340,9 @@ def _real_extract(self, url): 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), })) - # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo - headers = {'referer': 'https://patreon.com/'} + # Must be all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, and Vimeo. + # patreon.com URLs redirect to www.patreon.com; this matters when requesting mux.com m3u8s + headers = {'referer': 'https://www.patreon.com/'} # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': @@ -352,7 +353,7 @@ def _real_extract(self, url): v_url, video_id, 'Checking Vimeo embed URL', headers=headers, fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection entries.append(self.url_result( - VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), + VimeoIE._smuggle_referrer(v_url, headers['referer']), VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) @@ -379,11 +380,13 @@ def _real_extract(self, url): 'url': post_file['url'], }) elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': - formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + post_file['url'], video_id, headers=headers) entries.append({ 'id': video_id, 'formats': formats, 'subtitles': subtitles, + 'http_headers': headers, }) can_view_post = traverse_obj(attributes, 'current_user_can_view') diff --git a/yt_dlp/extractor/phoenix.py b/yt_dlp/extractor/phoenix.py index 63c256019..9df34f8c9 100644 --- a/yt_dlp/extractor/phoenix.py +++ b/yt_dlp/extractor/phoenix.py @@ -1,5 +1,3 @@ -import re - from .youtube import YoutubeIE from .zdf import ZDFBaseIE from ..utils import ( @@ -7,44 +5,27 @@ merge_dicts, try_get, unified_timestamp, - urljoin, ) class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/?#]+/)*[^/?#&]*-a-(?P\d+)\.html' _TESTS = [{ - # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html - 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/spitzbergen-a-893349.html', + 'md5': 'a79e86d9774d0b3f2102aff988a0bd32', 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', + 'id': '221215_phx_spitzbergen', 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613902500, - 'upload_date': '20210221', + 'title': 'Spitzbergen', + 'description': 'Film von Tilmann Bünz', + 'duration': 728.0, + 'timestamp': 1555600500, + 'upload_date': '20190418', 'uploader': 'Phoenix', - 'series': 'corona nachgehakt', - 'episode': 'Wohin führt der Protest in der Pandemie?', - }, - }, { - # Youtube embed - 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', - 'info_dict': { - 'id': 'hMQtqFYjomk', - 'ext': 'mp4', - 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', - 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', - 'duration': 3509, - 'upload_date': '20201219', - 'uploader': 'phoenix', - 'uploader_id': 'phoenix', - }, - 'params': { - 'skip_download': True, + 'thumbnail': 'https://www.phoenix.de/sixcms/media.php/21/Bergspitzen1.png', + 'series': 'Dokumentationen', + 'episode': 'Spitzbergen', }, }, { 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', @@ -90,8 +71,8 @@ def _real_extract(self, url): content_id = details['tracking']['nielsen']['content']['assetid'] info = self._extract_ptmd( - f'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/{content_id}', - content_id, None, url) + f'https://tmd.phoenix.de/tmd/2/android_native_6/vod/ptmd/phoenix/{content_id}', + content_id) duration = int_or_none(try_get( details, lambda x: x['tracking']['nielsen']['content']['length'])) @@ -101,20 +82,8 @@ def _real_extract(self, url): str) episode = title if details.get('contentType') == 'episode' else None - thumbnails = [] teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {} - for thumbnail_key, thumbnail_url in teaser_images.items(): - thumbnail_url = urljoin(url, thumbnail_url) - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) + thumbnails = self._extract_thumbnails(teaser_images) return merge_dicts(info, { 'id': content_id, diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index 72e89c31e..92431fa24 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -10,7 +10,8 @@ class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' + IE_NAME = 'picarto' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[^/#?]+)/?(?:$|[?#])' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -89,7 +90,8 @@ def _real_extract(self, url): class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P[^/?#&]+)' + IE_NAME = 'picarto:vod' + _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+(?:/profile)?/videos)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', @@ -111,6 +113,18 @@ class PicartoVodIE(InfoExtractor): 'channel': 'ArtofZod', 'age_limit': 18, }, + }, { + 'url': 'https://picarto.tv/DrechuArt/profile/videos/400347', + 'md5': 'f9ea54868b1d9dec40eb554b484cc7bf', + 'info_dict': { + 'id': '400347', + 'ext': 'mp4', + 'title': 'Welcome to the Show', + 'thumbnail': r're:^https?://.*\.jpg', + 'channel': 'DrechuArt', + 'age_limit': 0, + }, + }, { 'url': 'https://picarto.tv/videopopout/Plague', 'only_matching': True, diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index 59231d840..46e3a5b8f 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -7,11 +7,12 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, parse_qs, - traverse_obj, update_url_query, urlencode_postdata, ) +from ..utils.traversal import traverse_obj, unpack class PlaySuisseIE(InfoExtractor): @@ -26,12 +27,12 @@ class PlaySuisseIE(InfoExtractor): { # episode in a series 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', - 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', + 'md5': 'e20d1ede6872a03b41905ca1060a1ef2', 'info_dict': { 'id': '763211', 'ext': 'mp4', 'title': 'Knochen', - 'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8', + 'description': 'md5:3bdd80e2ce20227c47aab1df2a79a519', 'duration': 3344, 'series': 'Wilder', 'season': 'Season 1', @@ -42,24 +43,33 @@ class PlaySuisseIE(InfoExtractor): }, }, { # film - 'url': 'https://www.playsuisse.ch/watch/808675', - 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', + 'url': 'https://www.playsuisse.ch/detail/2573198', + 'md5': '1f115bb0a5191477b1a5771643a4283d', 'info_dict': { - 'id': '808675', + 'id': '2573198', 'ext': 'mp4', - 'title': 'Der Läufer', - 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', - 'duration': 5280, + 'title': 'Azor', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'genres': ['Fiction'], + 'creators': ['Andreas Fontana'], + 'cast': ['Fabrizio Rongione', 'Stéphanie Cléau', 'Gilles Privat', 'Alexandre Trocki'], + 'location': 'France; Argentine', + 'release_year': 2021, + 'duration': 5981, 'thumbnail': 're:https://playsuisse-img.akamaized.net/', }, }, { # series (treated as a playlist) 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', 'id': '1115687', 'series': 'They all came out to Montreux', 'title': 'They all came out to Montreux', + 'description': 'md5:0fefd8c5b4468a0bb35e916887681520', + 'genres': ['Documentary'], + 'creators': ['Oliver Murray'], + 'location': 'Switzerland', + 'release_year': 2021, }, 'playlist': [{ 'info_dict': { @@ -120,6 +130,12 @@ class PlaySuisseIE(InfoExtractor): id name description + descriptionLong + year + contentTypes + directors + mainCast + productionCountries duration episodeNumber seasonNumber @@ -215,9 +231,7 @@ def _perform_login(self, username, password): if not self._ID_TOKEN: raise ExtractorError('Login failed') - def _get_media_data(self, media_id): - # NOTE In the web app, the "locale" header is used to switch between languages, - # However this doesn't seem to take effect when passing the header here. + def _get_media_data(self, media_id, locale=None): response = self._download_json( 'https://www.playsuisse.ch/api/graphql', media_id, data=json.dumps({ @@ -225,7 +239,7 @@ def _get_media_data(self, media_id): 'query': self._GRAPHQL_QUERY, 'variables': {'assetId': media_id}, }).encode(), - headers={'Content-Type': 'application/json', 'locale': 'de'}) + headers={'Content-Type': 'application/json', 'locale': locale or 'de'}) return response['data']['assetV2'] @@ -234,7 +248,7 @@ def _real_extract(self, url): self.raise_login_required(method='password') media_id = self._match_id(url) - media_data = self._get_media_data(media_id) + media_data = self._get_media_data(media_id, traverse_obj(parse_qs(url), ('locale', 0))) info = self._extract_single(media_data) if media_data.get('episodes'): info.update({ @@ -257,15 +271,22 @@ def _extract_single(self, media_data): self._merge_subtitles(subs, target=subtitles) return { - 'id': media_data['id'], - 'title': media_data.get('name'), - 'description': media_data.get('description'), 'thumbnails': thumbnails, - 'duration': int_or_none(media_data.get('duration')), 'formats': formats, 'subtitles': subtitles, - 'series': media_data.get('seriesName'), - 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, - 'episode_number': int_or_none(media_data.get('episodeNumber')), + **traverse_obj(media_data, { + 'id': ('id', {str}), + 'title': ('name', {str}), + 'description': (('descriptionLong', 'description'), {str}, any), + 'genres': ('contentTypes', ..., {str}), + 'creators': ('directors', ..., {str}), + 'cast': ('mainCast', ..., {str}), + 'location': ('productionCountries', ..., {str}, all, {unpack(join_nonempty, delim='; ')}, filter), + 'release_year': ('year', {str}, {lambda x: x[:4]}, {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'series': ('seriesName', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('name', {str}, {lambda x: x if media_data['episodeNumber'] is not None else None}), + 'episode_number': ('episodeNumber', {int_or_none}), + }), } diff --git a/yt_dlp/extractor/podchaser.py b/yt_dlp/extractor/podchaser.py index 4570f0f17..6c125f9ba 100644 --- a/yt_dlp/extractor/podchaser.py +++ b/yt_dlp/extractor/podchaser.py @@ -5,11 +5,13 @@ from ..utils import ( OnDemandPagedList, float_or_none, + int_or_none, + orderedSet, str_or_none, - str_to_int, - traverse_obj, unified_timestamp, + url_or_none, ) +from ..utils.traversal import require, traverse_obj class PodchaserIE(InfoExtractor): @@ -21,24 +23,25 @@ class PodchaserIE(InfoExtractor): 'id': '104365585', 'title': 'Ep. 285 – freeze me off', 'description': 'cam ahn', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'ext': 'mp3', - 'categories': ['Comedy'], + 'categories': ['Comedy', 'News', 'Politics', 'Arts'], 'tags': ['comedy', 'dark humor'], - 'series': 'Cum Town', + 'series': 'The Adam Friedland Show Podcast', 'duration': 3708, 'timestamp': 1636531259, 'upload_date': '20211110', 'average_rating': 4.0, + 'series_id': '36924', }, }, { 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', 'info_dict': { 'id': '28853', 'title': 'The Bone Zone', - 'description': 'Podcast by The Bone Zone', + 'description': r're:The official home of the Bone Zone podcast.+', }, - 'playlist_count': 275, + 'playlist_mincount': 275, }, { 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', 'info_dict': { @@ -51,19 +54,33 @@ class PodchaserIE(InfoExtractor): @staticmethod def _parse_episode(episode, podcast): - return { - 'id': str(episode.get('id')), - 'title': episode.get('title'), - 'description': episode.get('description'), - 'url': episode.get('audio_url'), - 'thumbnail': episode.get('image_url'), - 'duration': str_to_int(episode.get('length')), - 'timestamp': unified_timestamp(episode.get('air_date')), - 'average_rating': float_or_none(episode.get('rating')), - 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), - 'tags': traverse_obj(podcast, ('tags', ..., 'text')), - 'series': podcast.get('title'), - } + info = traverse_obj(episode, { + 'id': ('id', {int}, {str_or_none}, {require('episode ID')}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'url': ('audio_url', {url_or_none}), + 'thumbnail': ('image_url', {url_or_none}), + 'duration': ('length', {int_or_none}), + 'timestamp': ('air_date', {unified_timestamp}), + 'average_rating': ('rating', {float_or_none}), + }) + info.update(traverse_obj(podcast, { + 'series': ('title', {str}), + 'series_id': ('id', {int}, {str_or_none}), + 'categories': (('summary', None), 'categories', ..., 'text', {str}, filter, all, {orderedSet}), + 'tags': ('tags', ..., 'text', {str}), + })) + info['vcodec'] = 'none' + + if info.get('series_id'): + podcast_slug = traverse_obj(podcast, ('slug', {str})) or 'podcast' + episode_slug = traverse_obj(episode, ('slug', {str})) or 'episode' + info['webpage_url'] = '/'.join(( + 'https://www.podchaser.com/podcasts', + '-'.join((podcast_slug[:30].rstrip('-'), info['series_id'])), + '-'.join((episode_slug[:30].rstrip('-'), info['id'])))) + + return info def _call_api(self, path, *args, **kwargs): return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs) @@ -93,5 +110,5 @@ def _real_extract(self, url): OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE), str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description')) - episode = self._call_api(f'episodes/{episode_id}', episode_id) + episode = self._call_api(f'podcasts/{podcast_id}/episodes/{episode_id}/player_ids', episode_id) return self._parse_episode(episode, podcast) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index efb47affc..c489dc731 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -321,6 +321,27 @@ class RaiPlayIE(RaiBaseIE): 'timestamp': 1348495020, 'upload_date': '20120924', }, + }, { + # checking program_info gives false positive for DRM + 'url': 'https://www.raiplay.it/video/2022/10/Ad-ogni-costo---Un-giorno-in-Pretura---Puntata-del-15102022-1dfd1295-ea38-4bac-b51e-f87e2881693b.html', + 'md5': '572c6f711b7c5f2d670ba419b4ae3b08', + 'info_dict': { + 'id': '1dfd1295-ea38-4bac-b51e-f87e2881693b', + 'ext': 'mp4', + 'title': 'Ad ogni costo - Un giorno in Pretura - Puntata del 15/10/2022', + 'alt_title': 'St 2022/23 - Un giorno in pretura - Ad ogni costo', + 'description': 'md5:4046d97b2687f74f06a8b8270ba5599f', + 'uploader': 'Rai 3', + 'duration': 3773.0, + 'thumbnail': 'https://www.raiplay.it/dl/img/2022/10/12/1665586539957_2048x2048.png', + 'creators': ['Rai 3'], + 'series': 'Un giorno in pretura', + 'season': '2022/23', + 'episode': 'Ad ogni costo', + 'timestamp': 1665507240, + 'upload_date': '20221011', + 'release_year': 2025, + }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, @@ -340,9 +361,8 @@ def _real_extract(self, url): media = self._download_json( f'{base}.json', video_id, 'Downloading video JSON') - if not self.get_param('allow_unplayable_formats'): - if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): - self.report_drm(video_id) + if traverse_obj(media, ('rights_management', 'rights', 'drm')): + self.report_drm(video_id) video = media['video'] relinker_info = self._extract_relinker_info(video['content_url'], video_id) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index a7a6a75dc..be4c32e13 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -388,7 +388,8 @@ def add_thumbnail(src): }) if entries: return self.playlist_result(entries, video_id, **info) - raise ExtractorError('No media found', expected=True) + self.raise_no_formats('No media found', expected=True, video_id=video_id) + return {**info, 'id': video_id} # Check if media is hosted on reddit: reddit_video = traverse_obj(data, ( diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 7e0b666ab..2812d9305 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,35 +1,142 @@ import base64 import io import struct +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, determine_ext, float_or_none, + make_archive_id, + parse_iso8601, qualities, - remove_end, - remove_start, - try_get, + url_or_none, ) +from ..utils.traversal import subs_list_to_dict, traverse_obj -class RTVEALaCartaIE(InfoExtractor): +class RTVEBaseIE(InfoExtractor): + # Reimplementation of https://js2.rtve.es/pages/app-player/3.5.1/js/pf_video.js + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) + while True: + length_data = encrypted_data.read(4) + length = struct.unpack('!I', length_data)[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + data = bytes(filter(None, data)) + alphabet_data, _, url_data = data.partition(b'#') + quality_str, _, url_data = url_data.rpartition(b'%%') + quality_str = quality_str.decode() or '' + alphabet = RTVEBaseIE._get_alphabet(alphabet_data) + url = RTVEBaseIE._get_url(alphabet, url_data) + yield quality_str, url + encrypted_data.read(4) # CRC + + @staticmethod + def _get_url(alphabet, url_data): + url = '' + f = 0 + e = 3 + b = 1 + for char in url_data.decode('iso-8859-1'): + if f == 0: + l = int(char) * 10 + f = 1 + else: + if e == 0: + l += int(char) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + return url + + @staticmethod + def _get_alphabet(alphabet_data): + alphabet = [] + e = 0 + d = 0 + for char in alphabet_data.decode('iso-8859-1'): + if d == 0: + alphabet.append(char) + d = e = (e + 1) % 4 + else: + d -= 1 + return alphabet + + def _extract_png_formats_and_subtitles(self, video_id, media_type='videos'): + formats, subtitles = [], {} + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + for manager in ('rtveplayw', 'default'): + png = self._download_webpage( + f'http://www.rtve.es/ztnr/movil/thumbnail/{manager}/{media_type}/{video_id}.png', + video_id, 'Downloading url information', query={'q': 'v2'}, fatal=False) + if not png: + continue + + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, 'dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + return formats, subtitles + + def _parse_metadata(self, metadata): + return traverse_obj(metadata, { + 'title': ('title', {str.strip}), + 'alt_title': ('alt', {str.strip}), + 'description': ('description', {clean_html}), + 'timestamp': ('dateOfEmission', {parse_iso8601(delimiter=' ')}), + 'release_timestamp': ('publicationDate', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('modificationDate', {parse_iso8601(delimiter=' ')}), + 'thumbnail': (('thumbnail', 'image', 'imageSEO'), {url_or_none}, any), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'is_live': ('live', {bool}), + 'series': (('programTitle', ('programInfo', 'title')), {clean_html}, any), + }) + + +class RTVEALaCartaIE(RTVEBaseIE): IE_NAME = 'rtve.es:alacarta' - IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P\d+)' + IE_DESC = 'RTVE a la carta and Play' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/(?:m/)?(?:(?:alacarta|play)/videos|filmoteca)/(?!directo)(?:[^/?#]+/){2}(?P\d+)', + r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/?#]+/video/[^/?#]+/(?P\d+)', + ] _TESTS = [{ - 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'url': 'http://www.rtve.es/alacarta/videos/la-aventura-del-saber/aventuraentornosilla/3088905/', + 'md5': 'a964547824359a5753aef09d79fe984b', 'info_dict': { - 'id': '2491869', + 'id': '3088905', 'ext': 'mp4', - 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', - 'duration': 5024.566, - 'series': 'Balonmano', + 'title': 'En torno a la silla', + 'duration': 1216.981, + 'series': 'La aventura del Saber', + 'thumbnail': 'https://img2.rtve.es/v/aventuraentornosilla_3088905.png', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', @@ -38,140 +145,88 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'live_status': 'is_live', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', }, 'params': { 'skip_download': 'live stream', }, }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'md5': 'f3cf0d1902d008c48c793e736706c174', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104', - 'duration': 3222.0, + 'title': 'Episodio 104', + 'duration': 3222.8, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'Servir y proteger', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, }, { 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', 'only_matching': True, + }, { + 'url': 'https://www.rtve.es/play/videos/saber-vivir/07-07-24/16177116/', + 'md5': 'a5b24fcdfa3ff5cb7908aba53d22d4b6', + 'info_dict': { + 'id': '16177116', + 'ext': 'mp4', + 'title': 'Saber vivir - 07/07/24', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 2162.68, + 'series': 'Saber vivir', + }, + }, { + 'url': 'https://www.rtve.es/infantil/serie/agus-lui-churros-crafts/video/gusano/7048976/', + 'info_dict': { + 'id': '7048976', + 'ext': 'mp4', + 'title': 'Gusano', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 292.86, + 'series': 'Agus & Lui: Churros y Crafts', + '_old_archive_ids': ['rtveinfantil 7048976'], + }, }] - def _real_initialize(self): - user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode()).decode('utf-8') - self._manager = self._download_json( - 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info')['manager'] - - @staticmethod - def _decrypt_url(png): - encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) - while True: - length = struct.unpack('!I', encrypted_data.read(4))[0] - chunk_type = encrypted_data.read(4) - if chunk_type == b'IEND': - break - data = encrypted_data.read(length) - if chunk_type == b'tEXt': - alphabet_data, text = data.split(b'\0') - quality, url_data = text.split(b'%%') - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data.decode('iso-8859-1'): - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data.decode('iso-8859-1'): - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - yield quality.decode(), url - encrypted_data.read(4) # CRC - - def _extract_png_formats(self, video_id): - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/videos/{video_id}.png', - video_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, video_url in self._decrypt_url(png): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': video_url, - }) - return formats + def _get_subtitles(self, video_id): + subtitle_data = self._download_json( + f'https://api2.rtve.es/api/videos/{video_id}/subtitulos.json', video_id, + 'Downloading subtitles info') + return traverse_obj(subtitle_data, ('page', 'items', ..., { + 'id': ('lang', {str}), + 'url': ('src', {url_or_none}), + }, all, {subs_list_to_dict(lang='es')})) def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_json( + metadata = self._download_json( f'http://www.rtve.es/api/videos/{video_id}/config/alacarta_videos.json', video_id)['page']['items'][0] - if info['state'] == 'DESPU': + if metadata['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'].strip() - formats = self._extract_png_formats(video_id) + formats, subtitles = self._extract_png_formats_and_subtitles(video_id) - subtitles = None - sbt_file = info.get('sbtFile') - if sbt_file: - subtitles = self.extract_subtitles(video_id, sbt_file) + self._merge_subtitles(self.extract_subtitles(video_id), target=subtitles) - is_live = info.get('live') is True + is_infantil = urllib.parse.urlparse(url).path.startswith('/infantil/') return { 'id': video_id, - 'title': title, 'formats': formats, - 'thumbnail': info.get('image'), 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), 1000), - 'is_live': is_live, - 'series': info.get('programTitle'), + **self._parse_metadata(metadata), + '_old_archive_ids': [make_archive_id('rtveinfantil', video_id)] if is_infantil else None, } - def _get_subtitles(self, video_id, sub_file): - subs = self._download_json( - sub_file + '.json', video_id, - 'Downloading subtitles info')['page']['items'] - return dict( - (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) - for s in subs) - -class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVEAudioIE(RTVEBaseIE): IE_NAME = 'rtve.es:audio' IE_DESC = 'RTVE audio' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/(?:[^/?#]+/){2}(?P\d+)' _TESTS = [{ 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', @@ -180,9 +235,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5889192', 'ext': 'mp3', 'title': 'Códigos informáticos', - 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'alt_title': 'Códigos informáticos - Escuchar ahora', 'duration': 349.440, 'series': 'A hombros de gigantes', + 'description': 'md5:72b0d7c1ca20fd327bdfff7ac0171afb', + 'thumbnail': 'https://img2.rtve.es/a/palabra-ingeniero-codigos-informaticos-270421_5889192.png', }, }, { 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', @@ -191,9 +248,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5791165', 'ext': 'mp3', 'title': 'Ignatius Farray', + 'alt_title': 'En Radio 3 - Ignatius Farray - 13/02/21 - escuchar ahora', 'thumbnail': r're:https?://.+/1613243011863.jpg', 'duration': 3559.559, 'series': 'En Radio 3', + 'description': 'md5:124aa60b461e0b1724a380bad3bc4040', }, }, { 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', @@ -202,126 +261,101 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '6082623', 'ext': 'mp3', 'title': 'Capítulo 26 y último: La muerte de Victor', + 'alt_title': 'Frankenstein o el moderno Prometeo - Capítulo 26 y último: La muerte de Victor', 'thumbnail': r're:https?://.+/1632147445707.jpg', 'duration': 3174.086, 'series': 'Frankenstein o el moderno Prometeo', + 'description': 'md5:4ee6fcb82ebe2e46d267e1d1c1a8f7b5', }, }] - def _extract_png_formats(self, audio_id): - """ - This function retrieves media related png thumbnail which obfuscate - valuable information about the media. This information is decrypted - via base class _decrypt_url function providing media quality and - media url - """ - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/audios/{audio_id}.png', - audio_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, audio_url in self._decrypt_url(png): - ext = determine_ext(audio_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - audio_url, audio_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - audio_url, audio_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': audio_url, - }) - return formats - def _real_extract(self, url): audio_id = self._match_id(url) - info = self._download_json( - f'https://www.rtve.es/api/audios/{audio_id}.json', - audio_id)['page']['items'][0] + metadata = self._download_json( + f'https://www.rtve.es/api/audios/{audio_id}.json', audio_id)['page']['items'][0] + + formats, subtitles = self._extract_png_formats_and_subtitles(audio_id, media_type='audios') return { 'id': audio_id, - 'title': info['title'].strip(), - 'thumbnail': info.get('thumbnail'), - 'duration': float_or_none(info.get('duration'), 1000), - 'series': try_get(info, lambda x: x['programInfo']['title']), - 'formats': self._extract_png_formats(audio_id), + 'formats': formats, + 'subtitles': subtitles, + **self._parse_metadata(metadata), } -class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'rtve.es:infantil' - IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' - - _TESTS = [{ - 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '5747454717aedf9f9fdf212d1bcfc48d', - 'info_dict': { - 'id': '3040283', - 'ext': 'mp4', - 'title': 'Maneras de vivir', - 'thumbnail': r're:https?://.+/1426182947956\.JPG', - 'duration': 357.958, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }] - - -class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVELiveIE(RTVEBaseIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)', + r'https?://(?:www\.)?rtve\.es/play/videos/directo/[^/?#]+/(?P[a-zA-Z0-9-]+)', + ] _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, - 'params': { - 'skip_download': 'live stream', + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'https://www.rtve.es/play/videos/directo/deportes/tdp/', + 'info_dict': { + 'id': 'tdp', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img2\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'http://www.rtve.es/play/videos/directo/canales-lineales/la-1/', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') - title = remove_start(title, 'Estoy viendo ') - vidplayer_id = self._search_regex( - (r'playerId=player([0-9]+)', - r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', - r'data-id=["\'](\d+)'), - webpage, 'internal video ID') + data_setup = self._search_json( + r']+class="[^"]*videoPlayer[^"]*"[^>]*data-setup=\'', + webpage, 'data_setup', video_id) + + formats, subtitles = self._extract_png_formats_and_subtitles(data_setup['idAsset']) return { 'id': video_id, - 'title': title, - 'formats': self._extract_png_formats(vidplayer_id), + **self._search_json_ld(webpage, video_id, fatal=False), + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'subtitles': subtitles, 'is_live': True, } class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/?#]+/[^/?#]+/(?P\d+).shtml' _TEST = { - 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'url': 'https://www.rtve.es/television/20091103/video-inedito-del-8o-programa/299020.shtml', 'info_dict': { - 'id': '3069778', + 'id': '572515', 'ext': 'mp4', - 'title': 'Documentos TV - La revolución del móvil', - 'duration': 3496.948, + 'title': 'Clase inédita', + 'duration': 335.817, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'El coro de la cárcel', }, 'params': { 'skip_download': True, @@ -332,11 +366,8 @@ def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - alacarta_url = self._search_regex( - r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', - webpage, 'alacarta url', default=None) - if alacarta_url is None: - raise ExtractorError( - 'The webpage doesn\'t contain any video', expected=True) + play_url = self._html_search_meta('contentUrl', webpage) + if play_url is None: + raise ExtractorError('The webpage doesn\'t contain any video', expected=True) - return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) + return self.url_result(play_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index c70940a60..3496a08ef 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -697,7 +697,7 @@ def _real_extract(self, url): try: return self._extract_info_dict(info, full_title, token) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + if not isinstance(e.cause, HTTPError) or e.cause.status != 429: raise self.report_warning( 'You have reached the API rate limit, which is ~600 requests per ' diff --git a/yt_dlp/extractor/sprout.py b/yt_dlp/extractor/sprout.py deleted file mode 100644 index 444a6c270..000000000 --- a/yt_dlp/extractor/sprout.py +++ /dev/null @@ -1,61 +0,0 @@ -from .adobepass import AdobePassIE -from ..utils import ( - int_or_none, - smuggle_url, - update_url_query, -) - - -class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', - 'info_dict': { - 'id': 'bm0foJFaTKqb', - 'ext': 'mp4', - 'title': 'Robot Bike Race', - 'description': 'md5:436b1d97117cc437f54c383f4debc66d', - 'timestamp': 1606148940, - 'upload_date': '20201123', - 'uploader': 'NBCU-MPAT', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'only_matching': True, - }, { - 'url': 'https://www.universalkids.com/watch/robot-bike-race', - 'only_matching': True, - }] - _GEO_COUNTRIES = ['US'] - - def _real_extract(self, url): - display_id = self._match_id(url) - mpx_metadata = self._download_json( - # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ - 'https://www.universalkids.com/_api/videos/' + display_id, - display_id)['mpxMetadata'] - media_pid = mpx_metadata['mediaPid'] - theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if mpx_metadata.get('entitlement') == 'auth': - query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') - theplatform_url = smuggle_url( - update_url_query(theplatform_url, query), { - 'force_smil_url': True, - 'geo_countries': self._GEO_COUNTRIES, - }) - return { - '_type': 'url_transparent', - 'id': media_pid, - 'url': theplatform_url, - 'series': mpx_metadata.get('seriesName'), - 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), - 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), - 'ie_key': 'ThePlatform', - } diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index b5df2e1a1..6a72f8d42 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -471,8 +471,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage) - urql_state = self._search_json( - r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) + urql_state = self._search_json(r'urqlState\s*[=:]', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index b73bea18f..ebe2ac296 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -4,7 +4,6 @@ import time from .adobepass import AdobePassIE -from .once import OnceIE from ..networking import HEADRequest, Request from ..utils import ( ExtractorError, @@ -26,7 +25,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(OnceIE): +class ThePlatformBaseIE(AdobePassIE): _TP_TLD = 'com' def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): @@ -54,16 +53,13 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d formats = [] for _format in smil_formats: - if OnceIE.suitable(_format['url']): - formats.extend(self._extract_once_formats(_format['url'])) - else: - media_url = _format['url'] - if determine_ext(media_url) == 'm3u8': - hdnea2 = self._get_cookies(media_url).get('hdnea2') - if hdnea2: - _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) + media_url = _format['url'] + if determine_ext(media_url) == 'm3u8': + hdnea2 = self._get_cookies(media_url).get('hdnea2') + if hdnea2: + _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) - formats.append(_format) + formats.append(_format) return formats, subtitles @@ -129,7 +125,7 @@ def _extract_theplatform_metadata(self, path, video_id): return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): +class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)?|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? diff --git a/yt_dlp/extractor/toutiao.py b/yt_dlp/extractor/toutiao.py new file mode 100644 index 000000000..b2a5aa236 --- /dev/null +++ b/yt_dlp/extractor/toutiao.py @@ -0,0 +1,121 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_call, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj + + +class ToutiaoIE(InfoExtractor): + IE_NAME = 'toutiao' + IE_DESC = '今日头条' + + _VALID_URL = r'https?://www\.toutiao\.com/video/(?P\d+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.toutiao.com/video/7505382061495176511/', + 'info_dict': { + 'id': '7505382061495176511', + 'ext': 'mp4', + 'title': '新疆多地现不明飞行物,目击者称和月亮一样亮,几秒内突然加速消失,气象部门回应', + 'comment_count': int, + 'duration': 9.753, + 'like_count': int, + 'release_date': '20250517', + 'release_timestamp': 1747483344, + 'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$', + 'uploader': '极目新闻', + 'uploader_id': 'MS4wLjABAAAAeateBb9Su8I3MJOZozmvyzWktmba5LMlliRDz1KffnM', + 'view_count': int, + }, + }, { + 'url': 'https://www.toutiao.com/video/7479446610359878153/', + 'info_dict': { + 'id': '7479446610359878153', + 'ext': 'mp4', + 'title': '小伙竟然利用两块磁铁制作成磁力减震器,简直太有创意了!', + 'comment_count': int, + 'duration': 118.374, + 'like_count': int, + 'release_date': '20250308', + 'release_timestamp': 1741444368, + 'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$', + 'uploader': '小莉创意发明', + 'uploader_id': 'MS4wLjABAAAA4f7d4mwtApALtHIiq-QM20dwXqe32NUz0DeWF7wbHKw', + 'view_count': int, + }, + }] + + def _real_initialize(self): + if self._get_cookies('https://www.toutiao.com').get('ttwid'): + return + + urlh = self._request_webpage( + 'https://ttwid.bytedance.com/ttwid/union/register/', None, + 'Fetching ttwid', 'Unable to fetch ttwid', headers={ + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'aid': 24, + 'needFid': False, + 'region': 'cn', + 'service': 'www.toutiao.com', + 'union': True, + }).encode(), + ) + + if ttwid := try_call(lambda: self._get_cookies(urlh.url)['ttwid'].value): + self._set_cookie('.toutiao.com', 'ttwid', ttwid) + return + + self.raise_login_required() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = traverse_obj(webpage, ( + {find_element(tag='script', id='RENDER_DATA')}, + {urllib.parse.unquote}, {json.loads}, 'data', 'initialVideo', + )) + + formats = [] + for video in traverse_obj(video_data, ( + 'videoPlayInfo', 'video_list', lambda _, v: v['main_url'], + )): + formats.append({ + 'url': video['main_url'], + **traverse_obj(video, ('video_meta', { + 'acodec': ('audio_profile', {str}), + 'asr': ('audio_sample_rate', {int_or_none}), + 'audio_channels': ('audio_channels', {float_or_none}, {int_or_none}), + 'ext': ('vtype', {str}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('definition', {str}), + 'fps': ('fps', {int_or_none}), + 'height': ('vheight', {int_or_none}), + 'tbr': ('real_bitrate', {float_or_none(scale=1000)}), + 'vcodec': ('codec_type', {str}), + 'width': ('vwidth', {int_or_none}), + })), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'comment_count': ('commentCount', {int_or_none}), + 'duration': ('videoPlayInfo', 'video_duration', {float_or_none}), + 'like_count': ('repinCount', {int_or_none}), + 'release_timestamp': ('publishTime', {int_or_none}), + 'thumbnail': (('poster', 'coverUrl'), {url_or_none}, any), + 'title': ('title', {str}), + 'uploader': ('userInfo', 'name', {str}), + 'uploader_id': ('userInfo', 'userId', {str_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'webpage_url': ('detailUrl', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/tv2dk.py b/yt_dlp/extractor/tv2dk.py index 9cd7606b0..bad120f7b 100644 --- a/yt_dlp/extractor/tv2dk.py +++ b/yt_dlp/extractor/tv2dk.py @@ -2,12 +2,13 @@ import re from .common import InfoExtractor +from .jwplatform import JWPlatformIE from ..utils import ( determine_ext, - extract_attributes, js_to_json, url_or_none, ) +from ..utils.traversal import find_element, traverse_obj class TV2DKIE(InfoExtractor): @@ -21,35 +22,46 @@ class TV2DKIE(InfoExtractor): tv2fyn| tv2east| tv2lorry| - tv2nord + tv2nord| + tv2kosmopol )\.dk/ - (:[^/]+/)* + (?:[^/?#]+/)* (?P[^/?\#&]+) ''' _TESTS = [{ 'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player', 'info_dict': { - 'id': '0_52jmwa0p', + 'id': 'sPp5z21q', 'ext': 'mp4', 'title': '19:30 - 28. okt. 2019', - 'timestamp': 1572290248, + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/sPp5z21q/poster.jpg?width=720', + 'timestamp': 1572287400, 'upload_date': '20191028', - 'uploader_id': 'tvsyd', - 'duration': 1347, - 'view_count': int, }, - 'add_ie': ['Kaltura'], }, { 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', 'info_dict': { - 'id': '1_7iwll9n0', + 'id': 'oD9cyq0m', 'ext': 'mp4', - 'upload_date': '20211027', 'title': 'Gadekamp #6 - Højhuse i København', - 'uploader_id': 'tv2lorry', - 'timestamp': 1635345229, + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/oD9cyq0m/poster.jpg?width=720', + 'timestamp': 1635348600, + 'upload_date': '20211027', }, - 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tvsyd.dk/haderslev/x-factor-brodre-fulde-af-selvtillid-er-igen-hjemme-hos-mor-vores-diagnoser-har-vaeret-en-fordel', + 'info_dict': { + 'id': 'x-factor-brodre-fulde-af-selvtillid-er-igen-hjemme-hos-mor-vores-diagnoser-har-vaeret-en-fordel', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.tv2ostjylland.dk/aarhus/dom-kan-fa-alvorlige-konsekvenser', + 'info_dict': { + 'id': 'dom-kan-fa-alvorlige-konsekvenser', + }, + 'playlist_count': 3, }, { 'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi', 'only_matching': True, @@ -71,40 +83,22 @@ class TV2DKIE(InfoExtractor): }, { 'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt', 'only_matching': True, + }, { + 'url': 'https://www.tv2kosmopol.dk/metropolen/chaufforer-beordres-til-at-kore-videre-i-ulovlige-busser-med-rode-advarselslamper', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + search_space = traverse_obj(webpage, {find_element(tag='article')}) or webpage - entries = [] + player_ids = traverse_obj( + re.findall(r'x-data="(?:video_player|simple_player)\(({[^"]+})', search_space), + (..., {js_to_json}, {json.loads}, ('jwpMediaId', 'videoId'), {str})) - def add_entry(partner_id, kaltura_id): - entries.append(self.url_result( - f'kaltura:{partner_id}:{kaltura_id}', 'Kaltura', - video_id=kaltura_id)) - - for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): - video = extract_attributes(video_el) - kaltura_id = video.get('data-entryid') - if not kaltura_id: - continue - partner_id = video.get('data-partnerid') - if not partner_id: - continue - add_entry(partner_id, kaltura_id) - if not entries: - kaltura_id = self._search_regex( - (r'entry_id\s*:\s*["\']([0-9a-z_]+)', - r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') - partner_id = self._search_regex( - (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, - 'partner id') - add_entry(partner_id, kaltura_id) - if len(entries) == 1: - return entries[0] - return self.playlist_result(entries) + return self.playlist_from_matches( + player_ids, video_id, getter=lambda x: f'jwplatform:{x}', ie=JWPlatformIE) class TV2DKBornholmPlayIE(InfoExtractor): diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index da3082907..416cbab3c 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -513,7 +513,7 @@ def _parse_video(self, video, with_url=True): class TVPVODVideoIE(TVPVODBaseIE): IE_NAME = 'tvp:vod' - _VALID_URL = r'https?://vod\.tvp\.pl/(?P[a-z\d-]+,\d+)/[a-z\d-]+(?\d+)/?(?:[?#]|$)' + _VALID_URL = r'https?://vod\.tvp\.pl/(?P[a-z\d-]+,\d+)/[a-z\d-]+(?\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', @@ -568,6 +568,9 @@ class TVPVODVideoIE(TVPVODBaseIE): 'live_status': 'is_live', 'thumbnail': 're:https?://.+', }, + }, { + 'url': 'https://vod.tvp.pl/informacje-i-publicystyka,205/konskie-2025-debata-przedwyborcza-odcinki,2028435/odcinek--1,S01E-1,2028419', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 1c060cd7a..0ab926dbd 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,13 +1,21 @@ import json from .common import InfoExtractor -from ..utils import clean_html, remove_end, unified_timestamp, url_or_none -from ..utils.traversal import traverse_obj +from ..utils import ( + clean_html, + extract_attributes, + parse_qs, + remove_end, + require, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj class TvwIE(InfoExtractor): + IE_NAME = 'tvw' _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P[^/?#]+)' - _TESTS = [{ 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', 'md5': '9ceb94fe2bb7fd726f74f16356825703', @@ -115,3 +123,43 @@ def _real_extract(self, url): 'is_live': ('eventStatus', {lambda x: x == 'live'}), }), } + + +class TvwTvChannelsIE(InfoExtractor): + IE_NAME = 'tvw:tvchannels' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/tvchannels/air/', + 'info_dict': { + 'id': 'air', + 'ext': 'mp4', + 'title': r're:TVW Cable Channel Live Stream', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://tvw.org/tvchannels/tvw2/', + 'info_dict': { + 'id': 'tvw2', + 'ext': 'mp4', + 'title': r're:TVW-2 Broadcast Channel', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = traverse_obj(webpage, ( + {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes}, + 'src', {parse_qs}, 'encoder', 0, {json.loads}, 'live247URI', {url_or_none}, {require('stream url')})) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True), + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'is_live': True, + } diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index bf9c6348c..ebc2963b0 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -1,4 +1,5 @@ import base64 +import hashlib import itertools import re @@ -14,12 +15,14 @@ parse_duration, qualities, str_to_int, - traverse_obj, try_get, unified_timestamp, + update_url_query, + url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import traverse_obj class TwitCastingIE(InfoExtractor): @@ -138,13 +141,7 @@ def _real_extract(self, url): r'data-toggle="true"[^>]+datetime="([^"]+)"', webpage, 'datetime', None)) - stream_server_data = self._download_json( - f'https://twitcasting.tv/streamserver.php?target={uploader_id}&mode=client', video_id, - 'Downloading live info', fatal=False) - is_live = any(f'data-{x}' in webpage for x in ['is-onlive="true"', 'live-type="live"', 'status="online"']) - if not traverse_obj(stream_server_data, 'llfmp4') and is_live: - self.raise_login_required(method='cookies') base_dict = { 'title': title, @@ -165,30 +162,43 @@ def find_dmu(x): return [data_movie_url] m3u8_urls = (try_get(webpage, find_dmu, list) - or traverse_obj(video_js_data, (..., 'source', 'url')) - or ([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None)) - if not m3u8_urls: - raise ExtractorError('Failed to get m3u8 playlist') + or traverse_obj(video_js_data, (..., 'source', 'url'))) if is_live: - m3u8_url = m3u8_urls[0] - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='hls', - live=True, headers=self._M3U8_HEADERS) + stream_data = self._download_json( + 'https://twitcasting.tv/streamserver.php', + video_id, 'Downloading live info', query={ + 'target': uploader_id, + 'mode': 'client', + 'player': 'pc_web', + }) - if traverse_obj(stream_server_data, ('hls', 'source')): - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='source', - live=True, query={'mode': 'source'}, - note='Downloading source quality m3u8', - headers=self._M3U8_HEADERS, fatal=False)) + password_params = { + 'word': hashlib.md5(video_password.encode()).hexdigest(), + } if video_password else None + + formats = [] + # low: 640x360, medium: 1280x720, high: 1920x1080 + qq = qualities(['low', 'medium', 'high']) + for quality, m3u8_url in traverse_obj(stream_data, ( + 'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): + formats.append({ + 'url': update_url_query(m3u8_url, password_params), + 'format_id': f'hls-{quality}', + 'ext': 'mp4', + 'quality': qq(quality), + 'protocol': 'm3u8', + 'http_headers': self._M3U8_HEADERS, + }) if websockets: qq = qualities(['base', 'mobilesource', 'main']) - streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {} - for mode, ws_url in streams.items(): + for mode, ws_url in traverse_obj(stream_data, ( + 'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): formats.append({ - 'url': ws_url, + 'url': update_url_query(ws_url, password_params), 'format_id': f'ws-{mode}', 'ext': 'mp4', 'quality': qq(mode), @@ -197,10 +207,15 @@ def find_dmu(x): 'protocol': 'websocket_frag', }) + if not formats: + self.raise_login_required() + infodict = { 'formats': formats, '_format_sort_fields': ('source', ), } + elif not m3u8_urls: + raise ExtractorError('Failed to get m3u8 playlist') elif len(m3u8_urls) == 1: formats = self._extract_m3u8_formats( m3u8_urls[0], video_id, 'mp4', headers=self._M3U8_HEADERS) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index a36de3c01..e4f2aec46 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -187,7 +187,7 @@ def _get_thumbnails(self, thumbnail): 'url': thumbnail, }] if thumbnail else None - def _extract_twitch_m3u8_formats(self, path, video_id, token, signature): + def _extract_twitch_m3u8_formats(self, path, video_id, token, signature, live_from_start=False): formats = self._extract_m3u8_formats( f'{self._USHER_BASE}/{path}/{video_id}.m3u8', video_id, 'mp4', query={ 'allow_source': 'true', @@ -204,7 +204,10 @@ def _extract_twitch_m3u8_formats(self, path, video_id, token, signature): for fmt in formats: if fmt.get('vcodec') and fmt['vcodec'].startswith('av01'): # mpegts does not yet have proper support for av1 - fmt['downloader_options'] = {'ffmpeg_args_out': ['-f', 'mp4']} + fmt.setdefault('downloader_options', {}).update({'ffmpeg_args_out': ['-f', 'mp4']}) + if live_from_start: + fmt.setdefault('downloader_options', {}).update({'ffmpeg_args': ['-live_start_index', '0']}) + fmt['is_from_start'] = True return formats @@ -550,7 +553,8 @@ def _real_extract(self, url): access_token = self._download_access_token(vod_id, 'video', 'id') formats = self._extract_twitch_m3u8_formats( - 'vod', vod_id, access_token['value'], access_token['signature']) + 'vod', vod_id, access_token['value'], access_token['signature'], + live_from_start=self.get_param('live_from_start')) formats.extend(self._extract_storyboard(vod_id, video.get('storyboard'), info.get('duration'))) self._prefer_source(formats) @@ -633,6 +637,10 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): _PAGE_LIMIT = 100 def _entries(self, channel_name, *args): + """ + Subclasses must define _make_variables() and _extract_entry(), + as well as set _OPERATION_NAME, _ENTRY_KIND, _EDGE_KIND, and _NODE_KIND + """ cursor = None variables_common = self._make_variables(channel_name, *args) entries_key = f'{self._ENTRY_KIND}s' @@ -672,7 +680,22 @@ def _entries(self, channel_name, *args): break -class TwitchVideosIE(TwitchPlaylistBaseIE): +class TwitchVideosBaseIE(TwitchPlaylistBaseIE): + _OPERATION_NAME = 'FilterableVideoTower_Videos' + _ENTRY_KIND = 'video' + _EDGE_KIND = 'VideoEdge' + _NODE_KIND = 'Video' + + @staticmethod + def _make_variables(channel_name, broadcast_type, sort): + return { + 'channelOwnerLogin': channel_name, + 'broadcastType': broadcast_type, + 'videoSort': sort.upper(), + } + + +class TwitchVideosIE(TwitchVideosBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P[^/]+)/(?:videos|profile)' _TESTS = [{ @@ -751,11 +774,6 @@ class TwitchVideosIE(TwitchPlaylistBaseIE): 'views': 'Popular', } - _OPERATION_NAME = 'FilterableVideoTower_Videos' - _ENTRY_KIND = 'video' - _EDGE_KIND = 'VideoEdge' - _NODE_KIND = 'Video' - @classmethod def suitable(cls, url): return (False @@ -764,14 +782,6 @@ def suitable(cls, url): TwitchVideosCollectionsIE)) else super().suitable(url)) - @staticmethod - def _make_variables(channel_name, broadcast_type, sort): - return { - 'channelOwnerLogin': channel_name, - 'broadcastType': broadcast_type, - 'videoSort': sort.upper(), - } - @staticmethod def _extract_entry(node): return _make_video_result(node) @@ -919,7 +929,7 @@ def _real_extract(self, url): playlist_title=f'{channel_name} - Collections') -class TwitchStreamIE(TwitchBaseIE): +class TwitchStreamIE(TwitchVideosBaseIE): IE_NAME = 'twitch:stream' _VALID_URL = r'''(?x) https?:// @@ -982,6 +992,7 @@ class TwitchStreamIE(TwitchBaseIE): 'skip_download': 'Livestream', }, }] + _PAGE_LIMIT = 1 @classmethod def suitable(cls, url): @@ -995,6 +1006,20 @@ def suitable(cls, url): TwitchClipsIE)) else super().suitable(url)) + @staticmethod + def _extract_entry(node): + if not isinstance(node, dict) or not node.get('id'): + return None + video_id = node['id'] + return { + '_type': 'url', + 'ie_key': TwitchVodIE.ie_key(), + 'id': 'v' + video_id, + 'url': f'https://www.twitch.tv/videos/{video_id}', + 'title': node.get('title'), + 'timestamp': unified_timestamp(node.get('publishedAt')) or 0, + } + def _real_extract(self, url): channel_name = self._match_id(url).lower() @@ -1029,6 +1054,16 @@ def _real_extract(self, url): if not stream: raise UserNotLive(video_id=channel_name) + timestamp = unified_timestamp(stream.get('createdAt')) + + if self.get_param('live_from_start'): + self.to_screen(f'{channel_name}: Extracting VOD to download live from start') + entry = next(self._entries(channel_name, None, 'time'), None) + if entry and entry.pop('timestamp') >= (timestamp or float('inf')): + return entry + self.report_warning( + 'Unable to extract the VOD associated with this livestream', video_id=channel_name) + access_token = self._download_access_token( channel_name, 'stream', 'channelName') @@ -1038,7 +1073,6 @@ def _real_extract(self, url): self._prefer_source(formats) view_count = stream.get('viewers') - timestamp = unified_timestamp(stream.get('createdAt')) sq_user = try_get(gql, lambda x: x[1]['data']['user'], dict) or {} uploader = sq_user.get('displayName') @@ -1225,8 +1259,8 @@ def _real_extract(self, url): 'channel_id': ('broadcaster', 'id', {str}), 'channel_follower_count': ('broadcaster', 'followers', 'totalCount', {int_or_none}), 'channel_is_verified': ('broadcaster', 'isPartner', {bool}), - 'uploader': ('broadcaster', 'displayName', {str}), - 'uploader_id': ('broadcaster', 'id', {str}), + 'uploader': ('curator', 'displayName', {str}), + 'uploader_id': ('curator', 'id', {str}), 'categories': ('game', 'displayName', {str}, filter, all, filter), }), } diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index b76cfae1d..65182b971 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -20,7 +20,6 @@ remove_end, str_or_none, strip_or_none, - traverse_obj, truncate_string, try_call, try_get, @@ -29,6 +28,7 @@ url_or_none, xpath_text, ) +from ..utils.traversal import require, traverse_obj class TwitterBaseIE(InfoExtractor): @@ -1221,20 +1221,10 @@ class TwitterIE(TwitterBaseIE): }] _MEDIA_ID_RE = re.compile(r'_video/(\d+)/') - - @property - def _GRAPHQL_ENDPOINT(self): - if self.is_logged_in: - return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail' - return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' + _GRAPHQL_ENDPOINT = '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' def _graphql_to_legacy(self, data, twid): - result = traverse_obj(data, ( - 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', - lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', - 'tweet_results', 'result', ('tweet', None), {dict}, - ), default={}, get_all=False) if self.is_logged_in else traverse_obj( - data, ('tweetResult', 'result', {dict}), default={}) + result = traverse_obj(data, ('tweetResult', 'result', {dict})) or {} typename = result.get('__typename') if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None): @@ -1278,37 +1268,6 @@ def _graphql_to_legacy(self, data, twid): def _build_graphql_query(self, media_id): return { - 'variables': { - 'focalTweetId': media_id, - 'includePromotedContent': True, - 'with_rux_injections': False, - 'withBirdwatchNotes': True, - 'withCommunity': True, - 'withDownvotePerspective': False, - 'withQuickPromoteEligibilityTweetFields': True, - 'withReactionsMetadata': False, - 'withReactionsPerspective': False, - 'withSuperFollowsTweetFields': True, - 'withSuperFollowsUserFields': True, - 'withV2Timeline': True, - 'withVoice': True, - }, - 'features': { - 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': False, - 'interactive_text_enabled': True, - 'responsive_web_edit_tweet_api_enabled': True, - 'responsive_web_enhance_cards_enabled': True, - 'responsive_web_graphql_timeline_navigation_enabled': False, - 'responsive_web_text_conversations_enabled': False, - 'responsive_web_uc_gql_enabled': True, - 'standardized_nudges_misinfo': True, - 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, - 'tweetypie_unmention_optimization_enabled': True, - 'unified_cards_ad_metadata_container_dynamic_card_content_query_enabled': True, - 'verified_phone_label_enabled': False, - 'vibe_api_enabled': True, - }, - } if self.is_logged_in else { 'variables': { 'tweetId': media_id, 'withCommunity': False, @@ -1383,7 +1342,7 @@ def _extract_status(self, twid): 'tweet_mode': 'extended', }) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + if not isinstance(e.cause, HTTPError) or e.cause.status != 429: raise self.report_warning('Rate-limit exceeded; falling back to syndication endpoint') status = self._call_syndication_api(twid) @@ -1637,8 +1596,8 @@ def _find_dimension(target): class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): IE_NAME = 'twitter:broadcast' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P[0-9a-zA-Z]{13})' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?Pbroadcasts|events)/(?P\w+)' _TESTS = [{ # untitled Periscope video 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', @@ -1646,6 +1605,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1yNGaQLWpejGj', 'ext': 'mp4', 'title': 'Andrea May Sahouri - Periscope Broadcast', + 'display_id': '1yNGaQLWpejGj', 'uploader': 'Andrea May Sahouri', 'uploader_id': 'andreamsahouri', 'uploader_url': 'https://twitter.com/andreamsahouri', @@ -1653,6 +1613,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20200601', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, }, { 'url': 'https://twitter.com/i/broadcasts/1ZkKzeyrPbaxv', @@ -1660,6 +1622,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1ZkKzeyrPbaxv', 'ext': 'mp4', 'title': 'Starship | SN10 | High-Altitude Flight Test', + 'display_id': '1ZkKzeyrPbaxv', 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', @@ -1667,6 +1630,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20210303', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, }, { 'url': 'https://twitter.com/i/broadcasts/1OyKAVQrgzwGb', @@ -1674,6 +1639,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1OyKAVQrgzwGb', 'ext': 'mp4', 'title': 'Starship Flight Test', + 'display_id': '1OyKAVQrgzwGb', 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', @@ -1681,21 +1647,58 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20230420', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://x.com/i/events/1910629646300762112', + 'info_dict': { + 'id': '1LyxBWDRNqyKN', + 'ext': 'mp4', + 'title': '#ガンニバル ウォッチパーティー', + 'concurrent_view_count': int, + 'display_id': '1910629646300762112', + 'live_status': 'was_live', + 'release_date': '20250423', + 'release_timestamp': 1745409000, + 'tags': ['ガンニバル'], + 'thumbnail': r're:https?://[^?#]+\.jpg\?token=', + 'timestamp': 1745403328, + 'upload_date': '20250423', + 'uploader': 'ディズニープラス公式', + 'uploader_id': 'DisneyPlusJP', + 'uploader_url': 'https://twitter.com/DisneyPlusJP', + 'view_count': int, }, }] def _real_extract(self, url): - broadcast_id = self._match_id(url) + broadcast_type, display_id = self._match_valid_url(url).group('type', 'id') + + if broadcast_type == 'events': + timeline = self._call_api( + f'live_event/1/{display_id}/timeline.json', display_id) + broadcast_id = traverse_obj(timeline, ( + 'twitter_objects', 'broadcasts', ..., ('id', 'broadcast_id'), + {str}, any, {require('broadcast ID')})) + else: + broadcast_id = display_id + broadcast = self._call_api( 'broadcasts/show.json', broadcast_id, {'ids': broadcast_id})['broadcasts'][broadcast_id] if not broadcast: raise ExtractorError('Broadcast no longer exists', expected=True) info = self._parse_broadcast_data(broadcast, broadcast_id) - info['title'] = broadcast.get('status') or info.get('title') - info['uploader_id'] = broadcast.get('twitter_username') or info.get('uploader_id') - info['uploader_url'] = format_field(broadcast, 'twitter_username', 'https://twitter.com/%s', default=None) + info.update({ + 'display_id': display_id, + 'title': broadcast.get('status') or info.get('title'), + 'uploader_id': broadcast.get('twitter_username') or info.get('uploader_id'), + 'uploader_url': format_field( + broadcast, 'twitter_username', 'https://twitter.com/%s', default=None), + }) if info['live_status'] == 'is_upcoming': + self.raise_no_formats('This live broadcast has not yet started', expected=True) return info media_key = broadcast['media_key'] @@ -1717,21 +1720,22 @@ class TwitterSpacesIE(TwitterBaseIE): _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/spaces/(?P[0-9a-zA-Z]{13})' _TESTS = [{ - 'url': 'https://twitter.com/i/spaces/1RDxlgyvNXzJL', + 'url': 'https://twitter.com/i/spaces/1OwxWwQOPlNxQ', 'info_dict': { - 'id': '1RDxlgyvNXzJL', + 'id': '1OwxWwQOPlNxQ', 'ext': 'm4a', - 'title': 'King Carlo e la mossa Kansas City per fare il Grande Centro', - 'description': 'Twitter Space participated by annarita digiorgio, Signor Ernesto, Raffaello Colosimo, Simone M. Sepe', - 'uploader': r're:Lucio Di Gaetano.*?', - 'uploader_id': 'luciodigaetano', + 'title': 'Everybody in: @mtbarra & @elonmusk discuss the future of EV charging', + 'description': 'Twitter Space participated by Elon Musk', 'live_status': 'was_live', - 'timestamp': 1659877956, - 'upload_date': '20220807', - 'release_timestamp': 1659904215, - 'release_date': '20220807', + 'release_date': '20230608', + 'release_timestamp': 1686256230, + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', + 'timestamp': 1686254250, + 'upload_date': '20230608', + 'uploader': 'Mary Barra', + 'uploader_id': 'mtbarra', }, - 'skip': 'No longer available', + 'params': {'skip_download': 'm3u8'}, }, { # post_live/TimedOut but downloadable 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', @@ -1743,9 +1747,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': 'Google Cloud', 'uploader_id': 'googlecloud', 'live_status': 'post_live', + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1681409554, 'upload_date': '20230413', - 'release_timestamp': 1681839000, + 'release_timestamp': 1681839082, 'release_date': '20230418', 'protocol': 'm3u8', # ffmpeg is forced 'container': 'm4a_dash', # audio-only format fixup is applied @@ -1762,6 +1767,9 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': '息根とめる', 'uploader_id': 'tomeru_ikinone', 'live_status': 'was_live', + 'release_date': '20230601', + 'release_timestamp': 1685617200, + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1685617198, 'upload_date': '20230601', 'protocol': 'm3u8', # ffmpeg is forced @@ -1779,9 +1787,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': 'Candace Owens', 'uploader_id': 'RealCandaceO', 'live_status': 'was_live', + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1723931351, 'upload_date': '20240817', - 'release_timestamp': 1723932000, + 'release_timestamp': 1723932056, 'release_date': '20240817', 'protocol': 'm3u8_native', # not ffmpeg, detected as video space }, @@ -1861,18 +1870,21 @@ def _real_extract(self, url): return { 'id': space_id, - 'title': metadata.get('title'), 'description': f'Twitter Space participated by {participants}', - 'uploader': traverse_obj( - metadata, ('creator_results', 'result', 'legacy', 'name')), - 'uploader_id': traverse_obj( - metadata, ('creator_results', 'result', 'legacy', 'screen_name')), - 'live_status': live_status, - 'release_timestamp': try_call( - lambda: int_or_none(metadata['scheduled_start'], scale=1000)), - 'timestamp': int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, 'http_headers': headers, + 'live_status': live_status, + **traverse_obj(metadata, { + 'title': ('title', {str}), + # started_at is None when stream is_upcoming so fallback to scheduled_start for --wait-for-video + 'release_timestamp': (('started_at', 'scheduled_start'), {int_or_none(scale=1000)}, any), + 'timestamp': ('created_at', {int_or_none(scale=1000)}), + }), + **traverse_obj(metadata, ('creator_results', 'result', 'legacy', { + 'uploader': ('name', {str}), + 'uploader_id': ('screen_name', {str_or_none}), + 'thumbnail': ('profile_image_url_https', {lambda x: x.replace('_normal', '_400x400')}, {url_or_none}), + })), } diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 8a0aaaa46..09497b699 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -3,6 +3,7 @@ import itertools import json import re +import time import urllib.parse from .common import InfoExtractor @@ -13,10 +14,12 @@ OnDemandPagedList, clean_html, determine_ext, + filter_dict, get_element_by_class, int_or_none, join_nonempty, js_to_json, + jwt_decode_hs256, merge_dicts, parse_filesize, parse_iso8601, @@ -39,6 +42,18 @@ class VimeoBaseInfoExtractor(InfoExtractor): _NETRC_MACHINE = 'vimeo' _LOGIN_REQUIRED = False _LOGIN_URL = 'https://vimeo.com/log_in' + _REFERER_HINT = ( + 'Cannot download embed-only video without embedding URL. Please call yt-dlp ' + 'with the URL of the page that embeds this video.') + _IOS_CLIENT_AUTH = 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw==' + _IOS_CLIENT_HEADERS = { + 'Accept': 'application/vnd.vimeo.*+json; version=3.4.10', + 'Accept-Language': 'en', + 'User-Agent': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 VimeoNetworking/5.0.0', + } + _IOS_OAUTH_CACHE_KEY = 'oauth-token-ios' + _ios_oauth_token = None + _viewer_info = None @staticmethod def _smuggle_referrer(url, referrer_url): @@ -52,8 +67,21 @@ def _unsmuggle_headers(self, url): headers['Referer'] = data['referer'] return url, data, headers + def _jwt_is_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 120 + + def _fetch_viewer_info(self, display_id=None, fatal=True): + if self._viewer_info and not self._jwt_is_expired(self._viewer_info['jwt']): + return self._viewer_info + + self._viewer_info = self._download_json( + 'https://vimeo.com/_next/viewer', display_id, 'Downloading web token info', + 'Failed to download web token info', fatal=fatal, headers={'Accept': 'application/json'}) + + return self._viewer_info + def _perform_login(self, username, password): - viewer = self._download_json('https://vimeo.com/_next/viewer', None, 'Downloading login token') + viewer = self._fetch_viewer_info() data = { 'action': 'login', 'email': username, @@ -88,13 +116,15 @@ def _get_video_password(self): expected=True) return password - def _verify_video_password(self, video_id, password, token): - url = f'https://vimeo.com/{video_id}' + def _verify_video_password(self, video_id, path=None): + video_password = self._get_video_password() + token = self._fetch_viewer_info(video_id)['xsrft'] + url = join_nonempty('https://vimeo.com', path, video_id, delim='/') try: - return self._download_webpage( + self._request_webpage( f'{url}/password', video_id, 'Submitting video password', data=json.dumps({ - 'password': password, + 'password': video_password, 'token': token, }, separators=(',', ':')).encode(), headers={ 'Accept': '*/*', @@ -106,6 +136,10 @@ def _verify_video_password(self, video_id, password, token): raise ExtractorError('Wrong password', expected=True) raise + def _extract_config_url(self, webpage, **kwargs): + return self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL', **kwargs) + def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs): vimeo_config = self._search_regex( r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', @@ -153,6 +187,7 @@ def _parse_config(self, config, video_id): sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): + # TODO: Also extract 'avc_url'? Investigate if there are 'hevc_url', 'av1_url'? manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -233,26 +268,48 @@ def _parse_config(self, config, video_id): 'formats': formats, 'subtitles': subtitles, 'live_status': live_status, - 'release_timestamp': traverse_obj(live_event, ('ingest', 'scheduled_start_time', {parse_iso8601})), + 'release_timestamp': traverse_obj(live_event, ('ingest', ( + ('scheduled_start_time', {parse_iso8601}), + ('start_time', {int_or_none}), + ), any)), # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps # at the same time without actual units specified. '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + def _fetch_oauth_token(self): + if not self._ios_oauth_token: + self._ios_oauth_token = self.cache.load(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY) + + if not self._ios_oauth_token: + self._ios_oauth_token = self._download_json( + 'https://api.vimeo.com/oauth/authorize/client', None, + 'Fetching OAuth token', 'Failed to fetch OAuth token', + headers={ + 'Authorization': f'Basic {self._IOS_CLIENT_AUTH}', + **self._IOS_CLIENT_HEADERS, + }, data=urlencode_postdata({ + 'grant_type': 'client_credentials', + 'scope': 'private public create edit delete interact upload purchased stats', + }, quote_via=urllib.parse.quote))['access_token'] + self.cache.store(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY, self._ios_oauth_token) + + return self._ios_oauth_token + + def _call_videos_api(self, video_id, unlisted_hash=None, **kwargs): return self._download_json( join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', + 'Authorization': f'Bearer {self._fetch_oauth_token()}', + **self._IOS_CLIENT_HEADERS, }, query={ 'fields': ','.join(( - 'config_url', 'created_time', 'description', 'download', 'license', - 'metadata.connections.comments.total', 'metadata.connections.likes.total', - 'release_time', 'stats.plays')), + 'config_url', 'embed_player_config_url', 'player_embed_url', 'download', 'play', + 'files', 'description', 'license', 'release_time', 'created_time', 'stats.plays', + 'metadata.connections.comments.total', 'metadata.connections.likes.total')), }, **kwargs) - def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + def _extract_original_format(self, url, video_id, unlisted_hash=None, api_data=None): # Original/source formats are only available when logged in if not self._get_cookies('https://vimeo.com/').get('vimeo'): return @@ -283,12 +340,8 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, 'quality': 1, } - jwt = jwt or traverse_obj(self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) - if not jwt: - return original_response = api_data or self._call_videos_api( - video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + video_id, unlisted_hash, fatal=False, expected_status=(403, 404)) for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': @@ -327,7 +380,7 @@ class VimeoIE(VimeoBaseInfoExtractor): (?: (?Puser)| (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)?? + (?:(?!event/).*?/)?? (?P (?: play_redirect_hls| @@ -410,6 +463,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'comment_count': int, 'like_count': int, + 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d', }, 'params': { @@ -500,15 +554,16 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'The DMCI', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', - 'timestamp': 1324343742, + 'timestamp': 1324361742, 'upload_date': '20111220', - 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + 'description': 'md5:f37b4ad0f3ded6fa16f38ecde16c3c44', 'duration': 60, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d', 'like_count': int, - 'tags': 'count:11', + 'release_timestamp': 1324361742, + 'release_date': '20111220', }, # 'params': {'format': 'Original'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -521,15 +576,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '393756517', # 'ext': 'mov', 'ext': 'mp4', - 'timestamp': 1582642091, + 'timestamp': 1582660091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', 'uploader': 'Framework Studio', - 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', 'upload_date': '20200225', 'duration': 176, 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d', 'uploader_url': 'https://vimeo.com/frameworkla', + 'comment_count': int, + 'like_count': int, + 'release_timestamp': 1582660091, + 'release_date': '20200225', }, # 'params': {'format': 'source'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -630,7 +688,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, - 'thumbnail': 'https://i.vimeocdn.com/video/default', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/default', 'duration': 10, 'like_count': int, 'uploader_url': 'https://vimeo.com/user20132939', @@ -667,6 +725,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/aliniamedia', 'release_date': '20160329', + 'view_count': int, }, 'params': {'skip_download': True}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -678,18 +737,19 @@ class VimeoIE(VimeoBaseInfoExtractor): # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', - 'description': 'md5:5967e090768a831488f6e74b7821b3c1', + 'description': 'md5:9441e6829ae94f380cc6417d982f63ac', 'uploader_id': 'fireworkchampions', 'uploader': 'Firework Champions', 'upload_date': '20150910', - 'timestamp': 1441901895, + 'timestamp': 1441916295, 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d', 'uploader_url': 'https://vimeo.com/fireworkchampions', - 'tags': 'count:6', 'duration': 229, 'view_count': int, 'like_count': int, 'comment_count': int, + 'release_timestamp': 1441916295, + 'release_date': '20150910', }, 'params': { 'skip_download': True, @@ -820,7 +880,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Raja Virdi', 'uploader_id': 'rajavirdi', 'uploader_url': 'https://vimeo.com/rajavirdi', - 'duration': 309, + 'duration': 300, 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d', }, # 'params': {'format': 'source'}, @@ -860,12 +920,9 @@ def _verify_player_video_password(self, url, video_id, headers): return checked def _extract_from_api(self, video_id, unlisted_hash=None): - viewer = self._download_json( - 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') - for retry in (False, True): try: - video = self._call_videos_api(video_id, viewer['jwt'], unlisted_hash) + video = self._call_videos_api(video_id, unlisted_hash) break except ExtractorError as e: if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 @@ -873,15 +930,14 @@ def _extract_from_api(self, video_id, unlisted_hash=None): self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), ({json.loads}, 'invalid_parameters', ..., 'field'), )): - self._verify_video_password( - video_id, self._get_video_password(), viewer['xsrft']) + self._verify_video_password(video_id) continue raise info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, api_data=video) if source_format: info['formats'].append(source_format) @@ -904,8 +960,7 @@ def _try_album_password(self, url): r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None) if not album_id: return - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + viewer = self._fetch_viewer_info(album_id, fatal=False) if not viewer: webpage = self._download_webpage(url, album_id) viewer = self._parse_json(self._search_regex( @@ -963,9 +1018,7 @@ def _real_extract(self, url): raise errmsg = error.cause.response.read() if b'Because of its privacy settings, this video cannot be played here' in errmsg: - raise ExtractorError( - 'Cannot download embed-only video without embedding URL. Please call yt-dlp ' - 'with the URL of the page that embeds this video.', expected=True) + raise ExtractorError(self._REFERER_HINT, expected=True) # 403 == vimeo.com TLS fingerprint or DC IP block; 429 == player.vimeo.com TLS FP block status = error.cause.status dcip_msg = 'If you are using a data center IP or VPN/proxy, your IP may be blocked' @@ -1010,8 +1063,7 @@ def _real_extract(self, url): channel_id = self._search_regex( r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) if channel_id: - config_url = self._html_search_regex( - r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None) + config_url = self._extract_config_url(webpage, default=None) video_description = clean_html(get_element_by_class('description', webpage)) info_dict.update({ 'channel_id': channel_id, @@ -1122,7 +1174,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998', 'upload_date': '20140906', 'timestamp': 1410032453, - 'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'comment_count': int, 'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'duration': 53, @@ -1132,7 +1184,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', @@ -1149,13 +1201,14 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'duration': 121, 'comment_count': int, 'view_count': int, - 'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'like_count': int, + 'tags': 'count:5', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -1237,7 +1290,7 @@ class VimeoUserIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', 'info_dict': { - 'title': 'Nki', + 'title': 'AKAMA', 'id': 'nkistudio', }, 'playlist_mincount': 66, @@ -1303,8 +1356,7 @@ def _fetch_page(self, album_id, authorization, hashed_pass, page): def _real_extract(self, url): album_id = self._match_id(url) - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + viewer = self._fetch_viewer_info(album_id, fatal=False) if not viewer: webpage = self._download_webpage(url, album_id) viewer = self._parse_json(self._search_regex( @@ -1370,10 +1422,10 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'uploader_id': 'user170863801', 'uploader_url': 'https://vimeo.com/user170863801', 'duration': 30, - 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', }, 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Failed to parse XML'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -1423,12 +1475,8 @@ def _real_extract(self, url): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) - viewer = {} if data.get('isLocked') is True: - video_password = self._get_video_password() - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id) - self._verify_video_password(video_id, video_password, viewer['xsrft']) + self._verify_video_password(video_id) data = self._download_json(data_url, video_id) clip_data = data['clipData'] config_url = clip_data['configUrl'] @@ -1436,7 +1484,7 @@ def _real_extract(self, url): info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', - video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) + video_id, unlisted_hash=clip_data.get('unlistedHash')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) @@ -1528,20 +1576,22 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'description': 'md5:8cf69a1a435f2d763f4adf601e9c3125', 'duration': 1595, 'upload_date': '20130610', - 'timestamp': 1370893156, + 'timestamp': 1370907556, 'license': 'by', - 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'view_count': int, 'comment_count': int, 'like_count': int, - 'tags': 'count:1', + 'release_timestamp': 1370907556, + 'release_date': '20130610', }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # password-protected VimeoPro page with Vimeo player embed 'url': 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion', @@ -1549,7 +1599,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'id': '764543723', 'ext': 'mp4', 'title': 'Mechanische Systeme in Perfektion: Realität erfassen, Innovation treiben', - 'thumbnail': 'https://i.vimeocdn.com/video/1543784598-a1a750494a485e601110136b9fe11e28c2131942452b3a5d30391cb3800ca8fd-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'description': 'md5:2a9d195cd1b0f6f79827107dc88c2420', 'uploader': 'CADFEM', 'uploader_id': 'cadfem', @@ -1561,6 +1611,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'videopassword': 'Conference2022', 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }] def _real_extract(self, url): @@ -1597,3 +1648,377 @@ def _real_extract(self, url): return self.url_result(vimeo_url, VimeoIE, video_id, url_transparent=True, description=description) + + +class VimeoEventIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:event' + _VALID_URL = r'''(?x) + https?://(?:www\.)?vimeo\.com/event/(?P\d+)(?:/ + (?: + (?:embed/)?(?P[\da-f]{10})| + videos/(?P\d+) + ) + )?''' + _EMBED_REGEX = [r']+\bsrc=["\'](?Phttps?://vimeo\.com/event/\d+/embed(?:[/?][^"\']*)?)["\'][^>]*>'] + _TESTS = [{ + # stream_privacy.view: 'anybody' + 'url': 'https://vimeo.com/event/5116195', + 'info_dict': { + 'id': '1082194134', + 'ext': 'mp4', + 'display_id': '5116195', + 'title': 'Skidmore College Commencement 2025', + 'description': 'md5:1902dd5165d21f98aa198297cc729d23', + 'uploader': 'Skidmore College', + 'uploader_id': 'user116066434', + 'uploader_url': 'https://vimeo.com/user116066434', + 'comment_count': int, + 'like_count': int, + 'duration': 9810, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1747502974, + 'upload_date': '20250517', + 'release_timestamp': 1747502998, + 'release_date': '20250517', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # stream_privacy.view: 'embed_only' + 'url': 'https://vimeo.com/event/5034253/embed', + 'info_dict': { + 'id': '1071439154', + 'ext': 'mp4', + 'display_id': '5034253', + 'title': 'Advancing Humans with AI', + 'description': r're:AI is here to stay, but how do we ensure that people flourish in a world of pervasive AI use.{322}$', + 'uploader': 'MIT Media Lab', + 'uploader_id': 'mitmedialab', + 'uploader_url': 'https://vimeo.com/mitmedialab', + 'duration': 23235, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'chapters': 'count:37', + 'release_timestamp': 1744290000, + 'release_date': '20250410', + 'live_status': 'was_live', + }, + 'params': { + 'skip_download': 'm3u8', + 'http_headers': {'Referer': 'https://www.media.mit.edu/events/aha-symposium/'}, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # Last entry on 2nd page of the 37 video playlist, but use clip_to_play_id API param shortcut + 'url': 'https://vimeo.com/event/4753126/videos/1046153257', + 'info_dict': { + 'id': '1046153257', + 'ext': 'mp4', + 'display_id': '4753126', + 'title': 'January 12, 2025 The True Vine (Pastor John Mindrup)', + 'description': 'The True Vine (Pastor \tJohn Mindrup)', + 'uploader': 'Salem United Church of Christ', + 'uploader_id': 'user230181094', + 'uploader_url': 'https://vimeo.com/user230181094', + 'comment_count': int, + 'like_count': int, + 'duration': 4962, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1736702464, + 'upload_date': '20250112', + 'release_timestamp': 1736702543, + 'release_date': '20250112', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # "24/7" livestream + 'url': 'https://vimeo.com/event/4768062', + 'info_dict': { + 'id': '1079901414', + 'ext': 'mp4', + 'display_id': '4768062', + 'title': r're:GRACELAND CAM \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': '24/7 camera at Graceland Mansion', + 'uploader': 'Elvis Presley\'s Graceland', + 'uploader_id': 'visitgraceland', + 'uploader_url': 'https://vimeo.com/visitgraceland', + 'release_timestamp': 1745975450, + 'release_date': '20250430', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'livestream'}, + }, { + # stream_privacy.view: 'unlisted' with unlisted_hash in URL path (stream_privacy.embed: 'whitelist') + 'url': 'https://vimeo.com/event/4259978/3db517c479', + 'info_dict': { + 'id': '939104114', + 'ext': 'mp4', + 'display_id': '4259978', + 'title': 'Enhancing Credibility in Your Community Science Project', + 'description': 'md5:eab953341168b9c146bc3cfe3f716070', + 'uploader': 'NOAA Research', + 'uploader_id': 'noaaresearch', + 'uploader_url': 'https://vimeo.com/noaaresearch', + 'comment_count': int, + 'like_count': int, + 'duration': 3961, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1716408008, + 'upload_date': '20240522', + 'release_timestamp': 1716408062, + 'release_date': '20240522', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # "done" event with video_id in URL and unlisted_hash in VimeoIE URL + 'url': 'https://vimeo.com/event/595460/videos/498149131/', + 'info_dict': { + 'id': '498149131', + 'ext': 'mp4', + 'display_id': '595460', + 'title': '2021 Eighth Annual John Cardinal Foley Lecture on Social Communications', + 'description': 'Replay: https://vimeo.com/catholicphilly/review/498149131/544f26a12f', + 'uploader': 'Kearns Media Consulting LLC', + 'uploader_id': 'kearnsmediaconsulting', + 'uploader_url': 'https://vimeo.com/kearnsmediaconsulting', + 'comment_count': int, + 'like_count': int, + 'duration': 4466, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1612228466, + 'upload_date': '20210202', + 'release_timestamp': 1612228538, + 'release_date': '20210202', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # stream_privacy.view: 'password'; stream_privacy.embed: 'public' + 'url': 'https://vimeo.com/event/4940578', + 'info_dict': { + 'id': '1059263570', + 'ext': 'mp4', + 'display_id': '4940578', + 'title': 'TMAC AKC AGILITY 2-22-2025', + 'uploader': 'Paws \'N Effect', + 'uploader_id': 'pawsneffect', + 'uploader_url': 'https://vimeo.com/pawsneffect', + 'comment_count': int, + 'like_count': int, + 'duration': 33115, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1740261836, + 'upload_date': '20250222', + 'release_timestamp': 1740261873, + 'release_date': '20250222', + 'live_status': 'was_live', + }, + 'params': { + 'videopassword': '22', + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # API serves a playlist of 37 videos, but the site only streams the newest one (changes every Sunday) + 'url': 'https://vimeo.com/event/4753126', + 'only_matching': True, + }, { + # Scheduled for 2025.05.15 but never started; "unavailable"; stream_privacy.view: "anybody" + 'url': 'https://vimeo.com/event/5120811/embed', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/event/5112969/embed?muted=1', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/event/5097437/embed/interaction?muted=1', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/event/5113032/embed?autoplay=1&muted=1', + 'only_matching': True, + }, { + # Ended livestream with video_id + 'url': 'https://vimeo.com/event/595460/videos/507329569/', + 'only_matching': True, + }, { + # stream_privacy.view: 'unlisted' with unlisted_hash in URL path (stream_privacy.embed: 'public') + 'url': 'https://vimeo.com/event/4606123/embed/358d60ce2e', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + # Same result as https://vimeo.com/event/5034253/embed + 'url': 'https://www.media.mit.edu/events/aha-symposium/', + 'info_dict': { + 'id': '1071439154', + 'ext': 'mp4', + 'display_id': '5034253', + 'title': 'Advancing Humans with AI', + 'description': r're:AI is here to stay, but how do we ensure that people flourish in a world of pervasive AI use.{322}$', + 'uploader': 'MIT Media Lab', + 'uploader_id': 'mitmedialab', + 'uploader_url': 'https://vimeo.com/mitmedialab', + 'duration': 23235, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'chapters': 'count:37', + 'release_timestamp': 1744290000, + 'release_date': '20250410', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }] + + _EVENT_FIELDS = ( + 'title', 'uri', 'schedule', 'stream_description', 'stream_privacy.embed', 'stream_privacy.view', + 'clip_to_play.name', 'clip_to_play.uri', 'clip_to_play.config_url', 'clip_to_play.live.status', + 'clip_to_play.privacy.embed', 'clip_to_play.privacy.view', 'clip_to_play.password', + 'streamable_clip.name', 'streamable_clip.uri', 'streamable_clip.config_url', 'streamable_clip.live.status', + ) + _VIDEOS_FIELDS = ('items', 'uri', 'name', 'config_url', 'duration', 'live.status') + + def _call_events_api( + self, event_id, ep=None, unlisted_hash=None, note=None, + fields=(), referrer=None, query=None, headers=None, + ): + resource = join_nonempty('event', ep, note, 'API JSON', delim=' ') + + return self._download_json( + join_nonempty( + 'https://api.vimeo.com/live_events', + join_nonempty(event_id, unlisted_hash, delim=':'), ep, delim='/'), + event_id, f'Downloading {resource}', f'Failed to download {resource}', + query=filter_dict({ + 'fields': ','.join(fields) or [], + # Correct spelling with 4 R's is deliberate + 'referrer': referrer, + **(query or {}), + }), headers=filter_dict({ + 'Accept': 'application/json', + 'Authorization': f'jwt {self._fetch_viewer_info(event_id)["jwt"]}', + 'Referer': referrer, + **(headers or {}), + })) + + @staticmethod + def _extract_video_id_and_unlisted_hash(video): + if not traverse_obj(video, ('uri', {lambda x: x.startswith('/videos/')})): + return None, None + video_id, _, unlisted_hash = video['uri'][8:].partition(':') + return video_id, unlisted_hash or None + + def _vimeo_url_result(self, video_id, unlisted_hash=None, event_id=None): + # VimeoIE can extract more metadata and formats for was_live event videos + return self.url_result( + join_nonempty('https://vimeo.com', video_id, unlisted_hash, delim='/'), VimeoIE, + video_id, display_id=event_id, live_status='was_live', url_transparent=True) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield cls._smuggle_referrer(embed_url, url) + + def _real_extract(self, url): + url, _, headers = self._unsmuggle_headers(url) + # XXX: Keep key name in sync with _unsmuggle_headers + referrer = headers.get('Referer') + event_id, unlisted_hash, video_id = self._match_valid_url(url).group('id', 'unlisted_hash', 'video_id') + + for retry in (False, True): + try: + live_event_data = self._call_events_api( + event_id, unlisted_hash=unlisted_hash, fields=self._EVENT_FIELDS, + referrer=referrer, query={'clip_to_play_id': video_id or '0'}, + headers={'Accept': 'application/vnd.vimeo.*+json;version=3.4.9'}) + break + except ExtractorError as e: + if retry or not isinstance(e.cause, HTTPError) or e.cause.status not in (400, 403): + raise + response = traverse_obj(e.cause.response.read(), ({json.loads}, {dict})) or {} + error_code = response.get('error_code') + if error_code == 2204: + self._verify_video_password(event_id, path='event') + continue + if error_code == 3200: + raise ExtractorError(self._REFERER_HINT, expected=True) + if error_msg := response.get('error'): + raise ExtractorError(f'Vimeo says: {error_msg}', expected=True) + raise + + # stream_privacy.view can be: 'anybody', 'embed_only', 'nobody', 'password', 'unlisted' + view_policy = live_event_data['stream_privacy']['view'] + if view_policy == 'nobody': + raise ExtractorError('This event has not been made available to anyone', expected=True) + + clip_data = traverse_obj(live_event_data, ('clip_to_play', {dict})) or {} + # live.status can be: 'streaming' (is_live), 'done' (was_live), 'unavailable' (is_upcoming OR dead) + clip_status = traverse_obj(clip_data, ('live', 'status', {str})) + start_time = traverse_obj(live_event_data, ('schedule', 'start_time', {str})) + release_timestamp = parse_iso8601(start_time) + + if clip_status == 'unavailable' and release_timestamp and release_timestamp > time.time(): + self.raise_no_formats(f'This live event is scheduled for {start_time}', expected=True) + live_status = 'is_upcoming' + config_url = None + + elif view_policy == 'embed_only': + webpage = self._download_webpage( + join_nonempty('https://vimeo.com/event', event_id, 'embed', unlisted_hash, delim='/'), + event_id, 'Downloading embed iframe webpage', impersonate=True, headers=headers) + # The _parse_config result will overwrite live_status w/ 'is_live' if livestream is active + live_status = 'was_live' + config_url = self._extract_config_url(webpage) + + else: # view_policy in ('anybody', 'password', 'unlisted') + if video_id: + clip_id, clip_hash = self._extract_video_id_and_unlisted_hash(clip_data) + if video_id == clip_id and clip_status == 'done' and (clip_hash or view_policy != 'unlisted'): + return self._vimeo_url_result(clip_id, clip_hash, event_id) + + video_filter = lambda _, v: self._extract_video_id_and_unlisted_hash(v)[0] == video_id + else: + video_filter = lambda _, v: v['live']['status'] in ('streaming', 'done') + + for page in itertools.count(1): + videos_data = self._call_events_api( + event_id, 'videos', unlisted_hash=unlisted_hash, note=f'page {page}', + fields=self._VIDEOS_FIELDS, referrer=referrer, query={'page': page}, + headers={'Accept': 'application/vnd.vimeo.*;version=3.4.1'}) + + video = traverse_obj(videos_data, ('data', video_filter, any)) + if video or not traverse_obj(videos_data, ('paging', 'next', {str})): + break + + live_status = { + 'streaming': 'is_live', + 'done': 'was_live', + }.get(traverse_obj(video, ('live', 'status', {str}))) + + if not live_status: # requested video_id is unavailable or no videos are available + raise ExtractorError('This event video is unavailable', expected=True) + elif live_status == 'was_live': + return self._vimeo_url_result(*self._extract_video_id_and_unlisted_hash(video), event_id) + config_url = video['config_url'] + + if config_url: # view_policy == 'embed_only' or live_status == 'is_live' + info = filter_dict(self._parse_config( + self._download_json(config_url, event_id, 'Downloading config JSON'), event_id)) + else: # live_status == 'is_upcoming' + info = {'id': event_id} + + if info.get('live_status') == 'post_live': + self.report_warning('This live event recently ended and some formats may not yet be available') + + return { + **traverse_obj(live_event_data, { + 'title': ('title', {str}), + 'description': ('stream_description', {str}), + }), + 'display_id': event_id, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + **info, + } diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index ef67c210b..c269802b3 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -300,6 +300,24 @@ class VKIE(VKBaseIE): 'upload_date': '20250130', }, }, + { + 'url': 'https://vkvideo.ru/video-50883936_456244102', + 'info_dict': { + 'id': '-50883936_456244102', + 'ext': 'mp4', + 'title': 'Добивание Украины // Техник в коме // МОЯ ЗЛОСТЬ №140', + 'description': 'md5:a9bc46181e9ebd0fdd82cef6c0191140', + 'uploader': 'Стас Ай, Как Просто!', + 'uploader_id': '-50883936', + 'comment_count': int, + 'like_count': int, + 'duration': 4651, + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:59', + 'timestamp': 1743333869, + 'upload_date': '20250330', + }, + }, { # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment @@ -540,7 +558,7 @@ def _real_extract(self, url): 'title': ('md_title', {unescapeHTML}), 'description': ('description', {clean_html}, filter), 'thumbnail': ('jpg', {url_or_none}), - 'uploader': ('md_author', {str}), + 'uploader': ('md_author', {unescapeHTML}), 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), 'duration': ('duration', {int_or_none}), 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index e9b0047a4..995ef2185 100644 --- a/yt_dlp/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py @@ -1,7 +1,6 @@ import urllib.parse from .common import InfoExtractor -from .once import OnceIE from ..utils import ( ExtractorError, int_or_none, @@ -10,7 +9,7 @@ ) -class VoxMediaVolumeIE(OnceIE): +class VoxMediaVolumeIE(InfoExtractor): _VALID_URL = r'https?://volume\.vox-cdn\.com/embed/(?P[0-9a-f]{9})' def _real_extract(self, url): @@ -57,7 +56,8 @@ def _real_extract(self, url): if not provider_video_id: continue if provider_video_type == 'brightcove': - info['formats'] = self._extract_once_formats(provider_video_id) + # TODO: Find embed example or confirm that Vox has stopped using Brightcove + raise ExtractorError('Vox Brightcove embeds are currently unsupported') else: info.update({ '_type': 'url_transparent', @@ -155,20 +155,6 @@ class VoxMediaIE(InfoExtractor): }, }], 'skip': 'Page no longer contain videos', - }, { - # volume embed, Brightcove Once - 'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya', - 'md5': '2dbc77b8b0bff1894c2fce16eded637d', - 'info_dict': { - 'id': '1231c973d', - 'ext': 'mp4', - 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', - 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', - 'timestamp': 1402938000, - 'upload_date': '20140616', - 'duration': 4114, - }, - 'add_ie': ['VoxMediaVolume'], }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py index 03bac66ac..c1c3af800 100644 --- a/yt_dlp/extractor/wat.py +++ b/yt_dlp/extractor/wat.py @@ -2,9 +2,11 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, try_get, unified_strdate, ) +from ..utils.traversal import traverse_obj class WatIE(InfoExtractor): @@ -70,8 +72,14 @@ def _real_extract(self, url): error_desc = video_info.get('error_desc') if error_desc: - if video_info.get('error_code') == 'GEOBLOCKED': + error_code = video_info.get('error_code') + if error_code == 'GEOBLOCKED': self.raise_geo_restricted(error_desc, video_info.get('geoList')) + elif error_code == 'DELIVERY_ERROR': + if traverse_obj(video_data, ('delivery', 'code')) == 500: + self.report_drm(video_id) + error_desc = join_nonempty( + error_desc, traverse_obj(video_data, ('delivery', 'error', {str})), delim=': ') raise ExtractorError(error_desc, expected=True) title = video_info['title'] diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index 53ad1100d..42b1189fe 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -290,12 +290,14 @@ def _real_extract(self, url): elif live_status == 'is_live': video_info = self._call_api( - f'/video/v1.2/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + f'/video/v1.3/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', video_id, note='Downloading live JSON') playback = self._parse_json(video_info['lipPlayback'], video_id) m3u8_url = traverse_obj(playback, ( 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) + # Live subtitles are not downloadable, but extract to silence "ignoring subs" warning + formats, _ = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) elif live_status == 'post_live': if availability in ('premium_only', 'subscriber_only'): diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 23ed9270d..a4263579a 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -45,7 +45,7 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id=video_id) + webpage = self._download_webpage(url, video_id=video_id, headers={'Referer': url}) video_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] data = self._download_json( diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 8ca7d898e..9c5bb75fe 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -35,6 +35,7 @@ class _PoTokenContext(enum.Enum): PLAYER = 'player' GVS = 'gvs' + SUBS = 'subs' # any clients starting with _ cannot be explicitly requested by the user @@ -417,6 +418,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _NETRC_MACHINE = 'youtube' + _COOKIE_HOWTO_WIKI_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies' + def ucid_or_none(self, ucid): return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) @@ -451,17 +454,15 @@ def _preferred_lang(self): return preferred_lang def _initialize_consent(self): - cookies = self._get_cookies('https://www.youtube.com/') - if cookies.get('__Secure-3PSID'): + if self._has_auth_cookies: return - socs = cookies.get('SOCS') + socs = self._youtube_cookies.get('SOCS') if socs and not socs.value.startswith('CAA'): # not consented return self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) def _initialize_pref(self): - cookies = self._get_cookies('https://www.youtube.com/') - pref_cookie = cookies.get('PREF') + pref_cookie = self._youtube_cookies.get('PREF') pref = {} if pref_cookie: try: @@ -472,8 +473,9 @@ def _initialize_pref(self): self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) def _initialize_cookie_auth(self): - yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() - if yt_sapisid or yt_1psapisid or yt_3psapisid: + self._passed_auth_cookies = False + if self._has_auth_cookies: + self._passed_auth_cookies = True self.write_debug('Found YouTube account cookies') def _real_initialize(self): @@ -492,8 +494,7 @@ def _perform_login(self, username, password): @property def _youtube_login_hint(self): - return (f'{self._login_hint(method="cookies")}. Also see ' - 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies ' + return (f'{self._login_hint(method="cookies")}. Also see {self._COOKIE_HOWTO_WIKI_URL} ' 'for tips on effectively exporting YouTube cookies') def _check_login_required(self): @@ -553,12 +554,16 @@ def _make_sid_authorization(scheme, sid, origin, additional_parts): return f'{scheme} {"_".join(parts)}' + @property + def _youtube_cookies(self): + return self._get_cookies('https://www.youtube.com') + def _get_sid_cookies(self): """ Get SAPISID, 1PSAPISID, 3PSAPISID cookie values @returns sapisid, 1psapisid, 3psapisid """ - yt_cookies = self._get_cookies('https://www.youtube.com') + yt_cookies = self._youtube_cookies yt_sapisid = try_call(lambda: yt_cookies['SAPISID'].value) yt_3papisid = try_call(lambda: yt_cookies['__Secure-3PAPISID'].value) yt_1papisid = try_call(lambda: yt_cookies['__Secure-1PAPISID'].value) @@ -595,6 +600,31 @@ def _get_sid_authorization_header(self, origin='https://www.youtube.com', user_s return ' '.join(authorizations) + @property + def is_authenticated(self): + return self._has_auth_cookies + + @property + def _has_auth_cookies(self): + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + # YouTube doesn't appear to clear 3PSAPISID when rotating cookies (as of 2025-04-26) + # But LOGIN_INFO is cleared and should exist if logged in + has_login_info = 'LOGIN_INFO' in self._youtube_cookies + return bool(has_login_info and (yt_sapisid or yt_1psapisid or yt_3psapisid)) + + def _request_webpage(self, *args, **kwargs): + response = super()._request_webpage(*args, **kwargs) + + # Check that we are still logged-in and cookies have not rotated after every request + if getattr(self, '_passed_auth_cookies', None) and not self._has_auth_cookies: + self.report_warning( + 'The provided YouTube account cookies are no longer valid. ' + 'They have likely been rotated in the browser as a security measure. ' + f'For tips on how to effectively export YouTube cookies, refer to {self._COOKIE_HOWTO_WIKI_URL} .', + only_once=False) + + return response + def _call_api(self, ep, query, video_id, fatal=True, headers=None, note='Downloading API JSON', errnote='Unable to download API page', context=None, api_key=None, api_hostname=None, default_client='web'): @@ -695,10 +725,6 @@ def _extract_visitor_data(self, *args): args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) - @functools.cached_property - def is_authenticated(self): - return bool(self._get_sid_authorization_header()) - def extract_ytcfg(self, video_id, webpage): if not webpage: return {} @@ -762,6 +788,7 @@ def _download_webpage_with_retries(self, *args, retry_fatal=False, retry_on_stat def _download_ytcfg(self, client, video_id): url = { + 'mweb': 'https://m.youtube.com', 'web': 'https://www.youtube.com', 'web_music': 'https://music.youtube.com', 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', diff --git a/yt_dlp/extractor/youtube/_clip.py b/yt_dlp/extractor/youtube/_clip.py index 7d063700e..1a708cd01 100644 --- a/yt_dlp/extractor/youtube/_clip.py +++ b/yt_dlp/extractor/youtube/_clip.py @@ -37,6 +37,7 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'chapters': 'count:20', 'comment_count': int, 'heatmap': 'count:100', + 'media_type': 'clip', }, }] @@ -59,6 +60,7 @@ def _real_extract(self, url): 'url': f'https://www.youtube.com/watch?v={video_id}', 'ie_key': YoutubeIE.ie_key(), 'id': clip_id, + 'media_type': 'clip', 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility diff --git a/yt_dlp/extractor/youtube/_redirect.py b/yt_dlp/extractor/youtube/_redirect.py index 1908df124..14e565b42 100644 --- a/yt_dlp/extractor/youtube/_redirect.py +++ b/yt_dlp/extractor/youtube/_redirect.py @@ -35,6 +35,7 @@ class YoutubeYtBeIE(YoutubeBaseInfoExtractor): 'duration': 59, 'comment_count': int, 'channel_follower_count': int, + 'media_type': 'short', }, 'params': { 'noplaylist': True, diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py index 122300e60..c018ee8cf 100644 --- a/yt_dlp/extractor/youtube/_tab.py +++ b/yt_dlp/extractor/youtube/_tab.py @@ -524,10 +524,16 @@ def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): response = self._extract_response( item_id=f'{item_id} page {page_num}', query=continuation, headers=headers, ytcfg=ytcfg, - check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')) + check_get_keys=( + 'continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints', + # Playlist recommendations may return with no data - ignore + ('responseContext', 'serviceTrackingParams', ..., 'params', ..., lambda k, v: k == 'key' and v == 'GetRecommendedMusicPlaylists_rid'), + )) if not response: break + + continuation = None # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) or visitor_data @@ -564,7 +570,13 @@ def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) - if not video_items_renderer: + # In the case only a continuation is returned, try to follow it. + # We extract this after trying to extract non-continuation items as otherwise this + # may be prioritized over other continuations. + # see: https://github.com/yt-dlp/yt-dlp/issues/12933 + continuation = continuation or self._extract_continuation({'contents': [continuation_item]}) + + if not continuation and not video_items_renderer: break @staticmethod @@ -999,14 +1011,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, @@ -1016,18 +1028,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, }, { + # TODO: fix channel_is_verified extraction 'note': 'playlists, series', 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', 'playlist_mincount': 5, @@ -1066,22 +1079,23 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', 'info_dict': { - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', + 'id': 'PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', + 'title': 'single video playlist', 'description': '', 'tags': [], 'view_count': int, - 'modified_date': '20201130', - 'channel': 'Sergey M.', - 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'availability': 'public', - 'uploader': 'Sergey M.', - 'uploader_url': 'https://www.youtube.com/@sergeym.6173', - 'uploader_id': '@sergeym.6173', + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', }, 'playlist_count': 1, }, { @@ -1171,11 +1185,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 17, }, { - 'note': 'Community tab', + 'note': 'Posts tab', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', + 'title': 'lex will - Posts', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', @@ -1188,30 +1202,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 18, }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'uploader_id': '@lexwill718', - 'uploader': 'lex will', - }, - 'playlist_mincount': 12, - }, { + # TODO: fix channel_is_verified extraction 'note': 'Search tab', 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', 'playlist_mincount': 40, 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'tags': ['Mathematics'], 'channel': '3Blue1Brown', @@ -1232,6 +1230,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { @@ -1294,24 +1293,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 21, }, { + # TODO: fix availability extraction 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'url': 'https://www.youtube.com/playlist?list=PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'title': 'The Memes Of 2010s.....', + 'id': 'PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'view_count': int, - 'channel': 'Phim Siêu Nhân Nhật Bản', + 'channel': "I'm Not JiNxEd", 'tags': [], - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', - 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + 'description': 'md5:44dc3b315ba69394feaafa2f40e7b2a1', + 'channel_url': 'https://www.youtube.com/channel/UC5H5H85D1QE5-fuWWQ1hdNg', + 'channel_id': 'UC5H5H85D1QE5-fuWWQ1hdNg', 'modified_date': r're:\d{8}', 'availability': 'public', - 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban', - 'uploader_id': '@phimsieunhannhatban', - 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_url': 'https://www.youtube.com/@imnotjinxed1998', + 'uploader_id': '@imnotjinxed1998', + 'uploader': "I'm Not JiNxEd", }, - 'playlist_mincount': 200, + 'playlist_mincount': 150, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { 'note': 'Playlist with unavailable videos in page 7', @@ -1334,6 +1334,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix availability extraction 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'info_dict': { @@ -1384,7 +1385,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'hGkQjiJLjWQ', # This will keep changing + 'id': 'YDvsBbKfLPA', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -1409,6 +1410,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@SkyNews', 'uploader': 'Sky News', 'channel_is_verified': True, + 'media_type': 'livestream', + 'timestamp': int, }, 'params': { 'skip_download': True, @@ -1496,6 +1499,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'VLPL, should redirect to playlist?list=PL...', 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'info_dict': { @@ -1537,6 +1541,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) # Treat as a general feed + # TODO: fix extraction 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', 'info_dict': { 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', @@ -1560,21 +1565,21 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', 'info_dict': { - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', + 'id': 'PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', + 'title': 'unlisted playlist', 'availability': 'unlisted', 'tags': [], - 'modified_date': '20220418', - 'channel': 'colethedj', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', 'view_count': int, 'description': '', - 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader_url': 'https://www.youtube.com/@colethedj1894', - 'uploader_id': '@colethedj1894', - 'uploader': 'colethedj', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', }, 'playlist': [{ 'info_dict': { @@ -1596,6 +1601,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 1, 'params': {'extract_flat': True}, }, { + # By default, recommended is always empty. 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', 'info_dict': { @@ -1603,7 +1609,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'recommended', 'tags': [], }, - 'playlist_mincount': 50, + 'playlist_count': 0, 'params': { 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, @@ -1628,6 +1634,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'skip': 'Query for sorting no longer works', }, { + # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', 'info_dict': { @@ -1654,11 +1661,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', 'only_matching': True, }, { + # TODO: fix metadata extraction 'note': 'collaborative playlist (uploader name in the form "by and x other(s)")', 'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', 'info_dict': { 'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', - 'modified_date': '20220407', + 'modified_date': '20250115', 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', 'tags': [], 'availability': 'unlisted', @@ -1692,6 +1700,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['Preferring "ja"'], }, { # XXX: this should really check flat playlist entries, but the test suite doesn't support that + # TODO: fix availability extraction 'note': 'preferred lang set with playlist with translated video titles', 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', 'info_dict': { @@ -1714,6 +1723,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # shorts audio pivot for 2GtVksBMYFM. 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + # TODO: fix extraction 'info_dict': { 'id': 'sfv_audio_pivot', 'title': 'sfv_audio_pivot', @@ -1751,6 +1761,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 8, }, { # Should get three playlists for videos, shorts and streams tabs + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', 'info_dict': { 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', @@ -1758,7 +1769,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_follower_count': int, 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'description': 'md5:49809d8bf9da539bc48ed5d1f83c33f2', + 'description': 'md5:01e53f350ab8ad6fcf7c4fedb3c1b99f', 'channel': 'Polka Ch. 尾丸ポルカ', 'tags': 'count:35', 'uploader_url': 'https://www.youtube.com/@OmaruPolka', @@ -1769,14 +1780,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 3, }, { # Shorts tab with channel with handle - # TODO: fix channel description + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@NotJustBikes/shorts', 'info_dict': { 'id': 'UC0intLFzLaudFG-xAvUEO-A', 'title': 'Not Just Bikes - Shorts', 'tags': 'count:10', 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:5e82545b3a041345927a92d0585df247', + 'description': 'md5:1d9fc1bad7f13a487299d1fe1712e031', 'channel_follower_count': int, 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', 'channel': 'Not Just Bikes', @@ -1797,7 +1808,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', 'channel': '中村悠一', 'channel_follower_count': int, - 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + 'description': 'md5:e8fd705073a594f27d6d6d020da560dc', 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura', 'uploader_id': '@Yuichi-Nakamura', 'uploader': '中村悠一', @@ -1815,6 +1826,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'only_matching': True, }, { # No videos tab but has a shorts tab + # TODO: fix metadata extraction 'url': 'https://www.youtube.com/c/TKFShorts', 'info_dict': { 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', @@ -1851,6 +1863,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Shorts url result in shorts tab # TODO: Fix channel id extraction + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1879,6 +1892,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'params': {'extract_flat': True}, }, { # Live video status should be extracted + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', 'info_dict': { 'id': 'UCQvWX73GQygcwXOTSf_VDVg', @@ -1907,6 +1921,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1, }, { # Channel renderer metadata. Contains number of videos on the channel + # TODO: channels tab removed, change this test to use another page with channel renderer 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1940,7 +1955,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, }], 'params': {'extract_flat': True}, + 'skip': 'channels tab removed', }, { + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@3blue1brown/about', 'info_dict': { 'id': '@3blue1brown', @@ -1950,7 +1967,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', @@ -1976,6 +1993,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 5, }, { # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@AHimitsu/releases', 'info_dict': { 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', @@ -2015,6 +2033,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 100, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix channel_is_verified extraction 'note': 'Tags containing spaces', 'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', 'playlist_count': 3, @@ -2035,6 +2054,24 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'challenges', 'sketches', 'scary games', 'funny games', 'rage games', 'mark fischbach'], }, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/12933 + 'note': 'streams tab, some scheduled streams. Empty intermediate response with only continuation - must follow', + 'url': 'https://www.youtube.com/@sbcitygov/streams', + 'playlist_mincount': 150, + 'info_dict': { + 'id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'channel': 'sbcitygov', + 'channel_id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'title': 'sbcitygov - Live', + 'channel_follower_count': int, + 'description': 'md5:ca1a92059835c071e33b3db52f4a6d67', + 'uploader_id': '@sbcitygov', + 'uploader_url': 'https://www.youtube.com/@sbcitygov', + 'uploader': 'sbcitygov', + 'channel_url': 'https://www.youtube.com/channel/UCH6-qfQwlUgz9SAf05jvc_w', + 'tags': [], + }, }] @classmethod diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 074a2a0d8..840829be6 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -23,6 +23,8 @@ _split_innertube_client, short_client_name, ) +from .pot._director import initialize_pot_director +from .pot.provider import PoTokenContext, PoTokenRequest from ..openload import PhantomJSwrapper from ...jsinterp import JSInterpreter from ...networking.exceptions import HTTPError @@ -66,9 +68,13 @@ urljoin, variadic, ) +from ...utils.networking import clean_headers, clean_proxies, select_proxy STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' +STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token' +STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context' + PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' @@ -376,6 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Afrojack', 'uploader_url': 'https://www.youtube.com/@Afrojack', 'uploader_id': '@Afrojack', + 'media_type': 'video', }, 'params': { 'youtube_include_dash_manifest': True, @@ -413,10 +420,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1401991663, + 'media_type': 'video', }, }, { - 'note': 'Age-gate video with embed allowed in public site', + 'note': 'Formerly an age-gate video with embed allowed in public site', 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', 'info_dict': { 'id': 'HsUATh_Nc2U', @@ -424,8 +432,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Godzilla 2 (Official Video)', 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'upload_date': '20200408', - 'age_limit': 18, - 'availability': 'needs_auth', + 'age_limit': 0, + 'availability': 'public', 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', 'channel': 'FlyingKitty', 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', @@ -443,8 +451,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@FlyingKitty900', 'comment_count': int, 'channel_is_verified': True, + 'media_type': 'video', }, - 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video embedable only with clientScreen=EMBED', @@ -507,6 +515,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Herr Lurik', 'uploader_url': 'https://www.youtube.com/@HerrLurik', 'uploader_id': '@HerrLurik', + 'media_type': 'video', }, }, { @@ -546,6 +555,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'deadmau5', 'uploader_url': 'https://www.youtube.com/@deadmau5', 'uploader_id': '@deadmau5', + 'media_type': 'video', }, 'expected_warnings': [ 'DASH manifest missing', @@ -581,6 +591,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@Olympics', 'channel_is_verified': True, 'timestamp': 1440707674, + 'media_type': 'livestream', }, 'params': { 'skip_download': 'requires avconv', @@ -615,6 +626,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@AllenMeow', 'uploader_id': '@AllenMeow', 'timestamp': 1299776999, + 'media_type': 'video', }, }, # url_encoded_fmt_stream_map is empty string @@ -809,6 +821,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'age_limit': 0, 'channel_follower_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -868,6 +881,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@BKCHarvard', 'uploader_url': 'https://www.youtube.com/@BKCHarvard', 'timestamp': 1422422076, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -904,6 +918,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1447987198, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -968,6 +983,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'timestamp': 1484761047, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1070,6 +1086,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': 'count:11', 'live_status': 'not_live', 'channel_follower_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1124,6 +1141,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@ElevageOrVert', 'uploader_id': '@ElevageOrVert', 'timestamp': 1497343210, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1163,6 +1181,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1377976349, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1207,6 +1226,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_follower_count': int, 'uploader': 'The Cinematic Orchestra', 'comment_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1275,6 +1295,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124', 'uploader_id': '@walkaroundjapan7124', 'timestamp': 1605884416, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1371,6 +1392,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1395685455, + 'media_type': 'video', }, 'params': {'format': 'mhtml', 'skip_download': True}, }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -1401,6 +1423,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@LeonNguyen', 'heatmap': 'count:100', 'timestamp': 1641170939, + 'media_type': 'video', }, }, { # date text is premiered video, ensure upload date in UTC (published 1641172509) @@ -1434,6 +1457,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1641172509, + 'media_type': 'video', }, }, { # continuous livestream. @@ -1495,6 +1519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Lesmiscore', 'uploader_url': 'https://www.youtube.com/@lesmiscore', 'timestamp': 1648005313, + 'media_type': 'short', }, }, { # Prefer primary title+description language metadata by default @@ -1523,6 +1548,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@coletdjnz', 'uploader': 'cole-dlp-test-acc', 'timestamp': 1662677394, + 'media_type': 'video', }, 'params': {'skip_download': True}, }, { @@ -1551,6 +1577,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'cole-dlp-test-acc', 'timestamp': 1659073275, 'like_count': int, + 'media_type': 'video', }, 'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}}, 'expected_warnings': [r'Preferring "fr" translated fields'], @@ -1587,6 +1614,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'heatmap': 'count:100', + 'media_type': 'video', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -1687,6 +1715,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'heatmap': 'count:100', + 'media_type': 'video', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, @@ -1719,6 +1748,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_follower_count': int, 'categories': ['People & Blogs'], 'tags': [], + 'media_type': 'short', }, }, ] @@ -1754,6 +1784,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@ChristopherSykesDocumentaries', 'heatmap': 'count:100', 'timestamp': 1211825920, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1784,6 +1815,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._code_cache = {} self._player_cache = {} + self._pot_director = None + + def _real_initialize(self): + super()._real_initialize() + self._pot_director = initialize_pot_director(self) def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live): lock = threading.Lock() @@ -1819,6 +1855,12 @@ def mpd_feed(format_id, delay): else: retry.error = f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}' continue + + # Formats from ended premieres will be missing a manifest_url + # See https://github.com/yt-dlp/yt-dlp/issues/8543 + if not f.get('manifest_url'): + break + return f['manifest_url'], f['manifest_stream_number'], is_live return None @@ -1982,7 +2024,9 @@ def _download_player_url(self, video_id, fatal=False): def _player_js_cache_key(self, player_url): player_id = self._extract_player_info(player_url) player_path = remove_start(urllib.parse.urlparse(player_url).path, f'/s/player/{player_id}/') - variant = self._INVERSE_PLAYER_JS_VARIANT_MAP.get(player_path) + variant = self._INVERSE_PLAYER_JS_VARIANT_MAP.get(player_path) or next(( + v for k, v in self._INVERSE_PLAYER_JS_VARIANT_MAP.items() + if re.fullmatch(re.escape(k).replace('en_US', r'[a-zA-Z0-9_]+'), player_path)), None) if not variant: self.write_debug( f'Unable to determine player JS variant\n' @@ -2120,23 +2164,23 @@ def inner(*args, **kwargs): return ret return inner - def _load_nsig_code_from_cache(self, player_url): - cache_id = ('youtube-nsig', self._player_js_cache_key(player_url)) + def _load_player_data_from_cache(self, name, player_url): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) - if func_code := self._player_cache.get(cache_id): - return func_code + if data := self._player_cache.get(cache_id): + return data - func_code = self.cache.load(*cache_id, min_ver='2025.03.31') - if func_code: - self._player_cache[cache_id] = func_code + data = self.cache.load(*cache_id, min_ver='2025.03.31') + if data: + self._player_cache[cache_id] = data - return func_code + return data - def _store_nsig_code_to_cache(self, player_url, func_code): - cache_id = ('youtube-nsig', self._player_js_cache_key(player_url)) + def _store_player_data_to_cache(self, name, player_url, data): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) if cache_id not in self._player_cache: - self.cache.store(*cache_id, func_code) - self._player_cache[cache_id] = func_code + self.cache.store(*cache_id, data) + self._player_cache[cache_id] = data def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" @@ -2179,7 +2223,7 @@ def _decrypt_nsig(self, s, video_id, player_url): self.write_debug(f'Decrypted nsig {s} => {ret}') # Only cache nsig func JS code to disk if successful, and only once - self._store_nsig_code_to_cache(player_url, func_code) + self._store_player_data_to_cache('nsig', player_url, func_code) return ret def _extract_n_function_name(self, jscode, player_url=None): @@ -2298,7 +2342,7 @@ def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url): def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self._load_nsig_code_from_cache(player_url) + func_code = self._load_player_data_from_cache('nsig', player_url) jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -2334,23 +2378,27 @@ def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=F Extract signatureTimestamp (sts) Required to tell API what sig/player version is in use. """ - sts = None - if isinstance(ytcfg, dict): - sts = int_or_none(ytcfg.get('STS')) + if sts := traverse_obj(ytcfg, ('STS', {int_or_none})): + return sts + + if not player_url: + error_msg = 'Cannot extract signature timestamp without player url' + if fatal: + raise ExtractorError(error_msg) + self.report_warning(error_msg) + return None + + sts = self._load_player_data_from_cache('sts', player_url) + if sts: + return sts + + if code := self._load_player(video_id, player_url, fatal=fatal): + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P[0-9]{5})', code, + 'JS player signature timestamp', group='sts', fatal=fatal)) + if sts: + self._store_player_data_to_cache('sts', player_url, sts) - if not sts: - # Attempt to extract from player - if player_url is None: - error_msg = 'Cannot extract signature timestamp without player_url.' - if fatal: - raise ExtractorError(error_msg) - self.report_warning(error_msg) - return - code = self._load_player(video_id, player_url, fatal=fatal) - if code: - sts = int_or_none(self._search_regex( - r'(?:signatureTimestamp|sts)\s*:\s*(?P[0-9]{5})', code, - 'JS player signature timestamp', group='sts', fatal=fatal)) return sts def _mark_watched(self, video_id, player_responses): @@ -2818,7 +2866,8 @@ def _get_config_po_token(self, client: str, context: _PoTokenContext): continue def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, - data_sync_id=None, session_index=None, player_url=None, video_id=None, **kwargs): + data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, + required=False, **kwargs): """ Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client. @@ -2832,10 +2881,15 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, @param session_index: session index. @param player_url: player URL. @param video_id: video ID. + @param webpage: video webpage. + @param required: Whether the PO Token is required (i.e. try to fetch unless policy is "never"). @param kwargs: Additional arguments to pass down. May be more added in the future. @return: The fetched PO Token. None if it could not be fetched. """ + # TODO(future): This validation should be moved into pot framework. + # Some sort of middleware or validation provider perhaps? + # GVS WebPO Token is bound to visitor_data / Visitor ID when logged out. # Must have visitor_data for it to function. if player_url and context == _PoTokenContext.GVS and not visitor_data and not self.is_authenticated: @@ -2857,6 +2911,7 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, f'Got a GVS PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + self.write_debug(f'{video_id}: Retrieved a {context.value} PO Token for {client} client from config') return config_po_token # Require GVS WebPO Token if logged in for external fetching @@ -2866,7 +2921,7 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') return - return self._fetch_po_token( + po_token = self._fetch_po_token( client=client, context=context.value, ytcfg=ytcfg, @@ -2875,11 +2930,68 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, session_index=session_index, player_url=player_url, video_id=video_id, + video_webpage=webpage, + required=required, **kwargs, ) + if po_token: + self.write_debug(f'{video_id}: Retrieved a {context.value} PO Token for {client} client') + return po_token + def _fetch_po_token(self, client, **kwargs): - """(Unstable) External PO Token fetch stub""" + context = kwargs.get('context') + + # Avoid fetching PO Tokens when not required + fetch_pot_policy = self._configuration_arg('fetch_pot', [''], ie_key=YoutubeIE)[0] + if fetch_pot_policy not in ('never', 'auto', 'always'): + fetch_pot_policy = 'auto' + if ( + fetch_pot_policy == 'never' + or ( + fetch_pot_policy == 'auto' + and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] + and not kwargs.get('required', False) + ) + ): + return None + + headers = self.get_param('http_headers').copy() + proxies = self._downloader.proxies.copy() + clean_headers(headers) + clean_proxies(proxies, headers) + + innertube_host = self._select_api_hostname(None, default_client=client) + + pot_request = PoTokenRequest( + context=PoTokenContext(context), + innertube_context=traverse_obj(kwargs, ('ytcfg', 'INNERTUBE_CONTEXT')), + innertube_host=innertube_host, + internal_client_name=client, + session_index=kwargs.get('session_index'), + player_url=kwargs.get('player_url'), + video_webpage=kwargs.get('video_webpage'), + is_authenticated=self.is_authenticated, + visitor_data=kwargs.get('visitor_data'), + data_sync_id=kwargs.get('data_sync_id'), + video_id=kwargs.get('video_id'), + request_cookiejar=self._downloader.cookiejar, + + # All requests that would need to be proxied should be in the + # context of www.youtube.com or the innertube host + request_proxy=( + select_proxy('https://www.youtube.com', proxies) + or select_proxy(f'https://{innertube_host}', proxies) + ), + request_headers=headers, + request_timeout=self.get_param('socket_timeout'), + request_verify_tls=not self.get_param('nocheckcertificate'), + request_source_address=self.get_param('source_address'), + + bypass_cache=False, + ) + + return self._pot_director.get_po_token(pot_request) @staticmethod def _is_agegated(player_response): @@ -3028,6 +3140,8 @@ def append_client(*client_names): player_url = self._download_player_url(video_id) tried_iframe_fallback = True + pr = initial_pr if client == 'web' else None + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) @@ -3037,16 +3151,24 @@ def append_client(*client_names): 'video_id': video_id, 'data_sync_id': data_sync_id if self.is_authenticated else None, 'player_url': player_url if require_js_player else None, + 'webpage': webpage, 'session_index': self._extract_session_index(master_ytcfg, player_ytcfg), - 'ytcfg': player_ytcfg, + 'ytcfg': player_ytcfg or self._get_default_ytcfg(client), } - player_po_token = self.fetch_po_token( + # Don't need a player PO token for WEB if using player response from webpage + player_po_token = None if pr else self.fetch_po_token( context=_PoTokenContext.PLAYER, **fetch_po_token_args) gvs_po_token = self.fetch_po_token( context=_PoTokenContext.GVS, **fetch_po_token_args) + fetch_subs_po_token_func = functools.partial( + self.fetch_po_token, + context=_PoTokenContext.SUBS, + **fetch_po_token_args, + ) + required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] if ( @@ -3073,7 +3195,6 @@ def append_client(*client_names): only_once=True) deprioritize_pr = True - pr = initial_pr if client == 'web' else None try: pr = pr or self._extract_player_response( client, video_id, @@ -3091,10 +3212,13 @@ def append_client(*client_names): if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: - # Save client name for introspection later - sd = traverse_obj(pr, ('streamingData', {dict})) or {} + # Save client details for introspection later + innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT') + sd = pr.setdefault('streamingData', {}) sd[STREAMING_DATA_CLIENT_NAME] = client sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token + sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context + sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token @@ -3103,9 +3227,19 @@ def append_client(*client_names): else: prs.append(pr) + # web_embedded can work around age-gate and age-verification for some embeddable videos + if self._is_agegated(pr) and variant != 'web_embedded': + append_client(f'web_embedded.{base_client}') + # Unauthenticated users will only get web_embedded client formats if age-gated + if self._is_agegated(pr) and not self.is_authenticated: + self.to_screen( + f'{video_id}: This video is age-restricted; some formats may be missing ' + f'without authentication. {self._youtube_login_hint}', only_once=True) + # EU countries require age-verification for accounts to access age-restricted videos # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients - if self.is_authenticated and self._is_agegated(pr): + embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr) + if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): self.to_screen( f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) @@ -3146,6 +3280,25 @@ def _report_pot_format_skipped(self, video_id, client_name, proto): else: self.report_warning(msg, only_once=True) + def _report_pot_subtitles_skipped(self, video_id, client_name, msg=None): + msg = msg or ( + f'{video_id}: Some {client_name} client subtitles require a PO Token which was not provided. ' + 'They will be discarded since they are not downloadable as-is. ' + f'You can manually pass a Subtitles PO Token for this client with ' + f'--extractor-args "youtube:po_token={client_name}.subs+XXX" . ' + f'For more information, refer to {PO_TOKEN_GUIDE_URL}') + + subs_wanted = any(( + self.get_param('writesubtitles'), + self.get_param('writeautomaticsub'), + self.get_param('listsubtitles'))) + + # Only raise a warning for non-default clients, to not confuse users. + if not subs_wanted or client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS): + self.write_debug(msg, only_once=True) + else: + self.report_warning(msg, only_once=True) + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 @@ -3232,12 +3385,16 @@ def build_fragments(f): fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) if not all((sc, fmt_url, player_url, encrypted_sig)): - self.report_warning( - f'Some {client_name} client https formats have been skipped as they are missing a url. ' - f'{"Your account" if self.is_authenticated else "The current session"} may have ' - f'the SSAP (server-side ads) experiment which interferes with yt-dlp. ' - f'Please see https://github.com/yt-dlp/yt-dlp/issues/12482 for more details.', - video_id, only_once=True) + msg = f'Some {client_name} client https formats have been skipped as they are missing a url. ' + if client_name == 'web': + msg += 'YouTube is forcing SABR streaming for this client. ' + else: + msg += ( + f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for ' + f'{"your account" if self.is_authenticated else "the current session"}. ' + ) + msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details' + self.report_warning(msg, video_id, only_once=True) continue try: fmt_url += '&{}={}'.format( @@ -3324,8 +3481,8 @@ def build_fragments(f): 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, - # Strictly de-prioritize broken, damaged and 3gp formats - 'preference': -20 if require_po_token else -10 if is_damaged else -2 if itag == '17' else None, + # Strictly de-prioritize damaged and 3gp formats + 'preference': -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -3433,6 +3590,9 @@ def process_manifest_format(f, proto, client_name, itag, po_token): hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + for sub in traverse_obj(subs, (..., ..., {dict})): + # HLS subs (m3u8) do not need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( @@ -3444,6 +3604,9 @@ def process_manifest_format(f, proto, client_name, itag, po_token): if po_token: dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + for sub in traverse_obj(subs, (..., ..., {dict})): + # TODO: Investigate if DASH subs ever need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): @@ -3635,7 +3798,7 @@ def feed_entry(name): reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason') subreason = clean_html(self._get_text(pemr, 'subreason') or '') if subreason: - if subreason == 'The uploader has not made this video available in your country.': + if subreason.startswith('The uploader has not made this video available in your country'): countries = get_first(microformats, 'availableCountries') if not countries: regions_allowed = search_meta('regionsAllowed') @@ -3646,6 +3809,15 @@ def feed_entry(name): if 'sign in' in reason.lower(): reason = remove_end(reason, 'This helps protect our community. Learn more') reason = f'{remove_end(reason.strip(), ".")}. {self._youtube_login_hint}' + elif get_first(playability_statuses, ('errorScreen', 'playerCaptchaViewModel', {dict})): + reason += '. YouTube is requiring a captcha challenge before playback' + elif "This content isn't available, try again later" in reason: + reason = ( + f'{remove_end(reason.strip(), ".")}. {"Your account" if self.is_authenticated else "The current session"} ' + f'has been rate-limited by YouTube for up to an hour. It is recommended to use `-t sleep` to add a delay ' + f'between video requests to avoid exceeding the rate limit. For more information, refer to ' + f'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#this-content-isnt-available-try-again-later' + ) self.raise_no_formats(reason, expected=True) keywords = get_first(video_details, 'keywords', expected_type=list) or [] @@ -3752,53 +3924,94 @@ def is_bad_format(fmt): 'tags': keywords, 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), 'live_status': live_status, - 'media_type': 'livestream' if get_first(video_details, 'isLiveContent') else None, + 'media_type': ( + 'livestream' if get_first(video_details, 'isLiveContent') + else 'short' if get_first(microformats, 'isShortsEligible') + else 'video'), 'release_timestamp': live_start_time, '_format_sort_fields': ( # source_preference is lower for potentially damaged formats 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), } + def get_lang_code(track): + return (remove_start(track.get('vssId') or '', '.').replace('.', '-') + or track.get('languageCode')) + + def process_language(container, base_url, lang_code, sub_name, client_name, query): + lang_subs = container.setdefault(lang_code, []) + for fmt in self._SUBTITLE_FORMATS: + query = {**query, 'fmt': fmt} + lang_subs.append({ + 'ext': fmt, + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), + 'name': sub_name, + STREAMING_DATA_CLIENT_NAME: client_name, + }) + subtitles = {} - pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) - if pctr: - def get_lang_code(track): - return (remove_start(track.get('vssId') or '', '.').replace('.', '-') - or track.get('languageCode')) + skipped_subs_clients = set() - # Converted into dicts to remove duplicates - captions = { - get_lang_code(sub): sub - for sub in traverse_obj(pctr, (..., 'captionTracks', ...))} - translation_languages = { - lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) - for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))} + # Only web/mweb clients provide translationLanguages, so include initial_pr in the traversal + translation_languages = { + lang['languageCode']: self._get_text(lang['languageName'], max_runs=1) + for lang in traverse_obj(player_responses, ( + ..., 'captions', 'playerCaptionsTracklistRenderer', 'translationLanguages', + lambda _, v: v['languageCode'] and v['languageName'])) + } + # NB: Constructing the full subtitle dictionary is slow + get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( + self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - def process_language(container, base_url, lang_code, sub_name, query): - lang_subs = container.setdefault(lang_code, []) - for fmt in self._SUBTITLE_FORMATS: - query.update({ - 'fmt': fmt, - }) - lang_subs.append({ - 'ext': fmt, - 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), - 'name': sub_name, - }) + # Filter out initial_pr which does not have streamingData (smuggled client context) + prs = traverse_obj(player_responses, ( + lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer'])) + all_captions = traverse_obj(prs, ( + ..., 'captions', 'playerCaptionsTracklistRenderer', 'captionTracks', ..., {dict})) + need_subs_langs = {get_lang_code(sub) for sub in all_captions if sub.get('kind') != 'asr'} + need_caps_langs = { + remove_start(get_lang_code(sub), 'a-') + for sub in all_captions if sub.get('kind') == 'asr'} - # NB: Constructing the full subtitle dictionary is slow - get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( - self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - for lang_code, caption_track in captions.items(): - base_url = caption_track.get('baseUrl') - orig_lang = parse_qs(base_url).get('lang', [None])[-1] - if not base_url: - continue + for pr in prs: + pctr = pr['captions']['playerCaptionsTracklistRenderer'] + client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME] + innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName'] + required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN] + + pot_params = {} + already_fetched_pot = False + + for caption_track in traverse_obj(pctr, ('captionTracks', lambda _, v: v['baseUrl'])): + base_url = caption_track['baseUrl'] + qs = parse_qs(base_url) + lang_code = get_lang_code(caption_track) + requires_pot = ( + # We can detect the experiment for now + any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv')) + or _PoTokenContext.SUBS in required_contexts) + + if not already_fetched_pot: + already_fetched_pot = True + if subs_po_token := fetch_subs_po_token_func(required=requires_pot): + pot_params.update({ + 'pot': subs_po_token, + 'potc': '1', + 'c': innertube_client_name, + }) + + if not pot_params and requires_pot: + skipped_subs_clients.add(client_name) + self._report_pot_subtitles_skipped(video_id, client_name) + break + + orig_lang = qs.get('lang', [None])[-1] lang_name = self._get_text(caption_track, 'name', max_runs=1) if caption_track.get('kind') != 'asr': if not lang_code: continue process_language( - subtitles, base_url, lang_code, lang_name, {}) + subtitles, base_url, lang_code, lang_name, client_name, pot_params) if not caption_track.get('isTranslatable'): continue for trans_code, trans_name in translation_languages.items(): @@ -3818,10 +4031,25 @@ def process_language(container, base_url, lang_code, sub_name, query): # Add an "-orig" label to the original language so that it can be distinguished. # The subs are returned without "-orig" as well for compatibility process_language( - automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) + automatic_captions, base_url, f'{trans_code}-orig', + f'{trans_name} (Original)', client_name, pot_params) # Setting tlang=lang returns damaged subtitles. - process_language(automatic_captions, base_url, trans_code, trans_name, - {} if orig_lang == orig_trans_code else {'tlang': trans_code}) + process_language( + automatic_captions, base_url, trans_code, trans_name, client_name, + pot_params if orig_lang == orig_trans_code else {'tlang': trans_code, **pot_params}) + + # Avoid duplication if we've already got everything we need + need_subs_langs.difference_update(subtitles) + need_caps_langs.difference_update(automatic_captions) + if not (need_subs_langs or need_caps_langs): + break + + if skipped_subs_clients and (need_subs_langs or need_caps_langs): + self._report_pot_subtitles_skipped(video_id, True, msg=join_nonempty( + f'{video_id}: There are missing subtitles languages because a PO token was not provided.', + need_subs_langs and f'Subtitles for these languages are missing: {", ".join(need_subs_langs)}.', + need_caps_langs and f'Automatic captions for {len(need_caps_langs)} languages are missing.', + delim=' ')) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles @@ -3874,7 +4102,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if not traverse_obj(initial_data, 'contents'): self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') initial_data = None - if not initial_data: + if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): query = {'videoId': video_id} query.update(self._get_checkok_params()) initial_data = self._extract_response( diff --git a/yt_dlp/extractor/youtube/pot/README.md b/yt_dlp/extractor/youtube/pot/README.md new file mode 100644 index 000000000..f39e29071 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/README.md @@ -0,0 +1,309 @@ +# YoutubeIE PO Token Provider Framework + +As part of the YouTube extractor, we have a framework for providing PO Tokens programmatically. This can be used by plugins. + +Refer to the [PO Token Guide](https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide) for more information on PO Tokens. + +> [!TIP] +> If publishing a PO Token Provider plugin to GitHub, add the [yt-dlp-pot-provider](https://github.com/topics/yt-dlp-pot-provider) topic to your repository to help users find it. + + +## Public APIs + +- `yt_dlp.extractor.youtube.pot.cache` +- `yt_dlp.extractor.youtube.pot.provider` +- `yt_dlp.extractor.youtube.pot.utils` + +Everything else is internal-only and no guarantees are made about the API stability. + +> [!WARNING] +> We will try our best to maintain stability with the public APIs. +> However, due to the nature of extractors and YouTube, we may need to remove or change APIs in the future. +> If you are using these APIs outside yt-dlp plugins, please account for this by importing them safely. + +## PO Token Provider + +`yt_dlp.extractor.youtube.pot.provider` + +```python +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + PoTokenProvider, + PoTokenResponse, + PoTokenProviderError, + PoTokenProviderRejectedRequest, + register_provider, + register_preference, + ExternalRequestFeature, +) +from yt_dlp.networking.common import Request +from yt_dlp.extractor.youtube.pot.utils import get_webpo_content_binding +from yt_dlp.utils import traverse_obj +from yt_dlp.networking.exceptions import RequestError +import json + + +@register_provider +class MyPoTokenProviderPTP(PoTokenProvider): # Provider class name must end with "PTP" + PROVIDER_VERSION = '0.2.1' + # Define a unique display name for the provider + PROVIDER_NAME = 'my-provider' + BUG_REPORT_LOCATION = 'https://issues.example.com/report' + + # -- Validation shortcuts. Set these to None to disable. -- + + # Innertube Client Name. + # For example, "WEB", "ANDROID", "TVHTML5". + # For a list of WebPO client names, + # see yt_dlp.extractor.youtube.pot.utils.WEBPO_CLIENTS. + # Also see yt_dlp.extractor.youtube._base.INNERTUBE_CLIENTS + # for a list of client names currently supported by the YouTube extractor. + _SUPPORTED_CLIENTS = ('WEB', 'TVHTML5') + + _SUPPORTED_CONTEXTS = ( + PoTokenContext.GVS, + ) + + # If your provider makes external requests to websites (i.e. to youtube.com) + # using another library or service (i.e., not _request_webpage), + # set the request features that are supported here. + # If only using _request_webpage to make external requests, set this to None. + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.PROXY_SCHEME_HTTP, + ExternalRequestFeature.SOURCE_ADDRESS, + ExternalRequestFeature.DISABLE_TLS_VERIFICATION + ) + + def is_available(self) -> bool: + """ + Check if the provider is available (e.g. all required dependencies are available) + This is used to determine if the provider should be used and to provide debug information. + + IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations. + + Since this is called multiple times, we recommend caching the result. + """ + return True + + def close(self): + # Optional close hook, called when YoutubeDL is closed. + pass + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + # ℹ️ If you need to validate the request before making the request to the external source. + # Raise yt_dlp.extractor.youtube.pot.provider.PoTokenProviderRejectedRequest if the request is not supported. + if request.is_authenticated: + raise PoTokenProviderRejectedRequest( + 'This provider does not support authenticated requests' + ) + + # ℹ️ Settings are pulled from extractor args passed to yt-dlp with the key `youtubepot-`. + # For this example, the extractor arg would be: + # `--extractor-args "youtubepot-mypotokenprovider:url=https://custom.example.com/get_pot"` + external_provider_url = self._configuration_arg( + 'url', default=['https://provider.example.com/get_pot'])[0] + + # See below for logging guidelines + self.logger.trace(f'Using external provider URL: {external_provider_url}') + + # You should use the internal HTTP client to make requests where possible, + # as it will handle cookies and other networking settings passed to yt-dlp. + try: + # See docstring in _request_webpage method for request tips + response = self._request_webpage( + Request(external_provider_url, data=json.dumps({ + 'content_binding': get_webpo_content_binding(request), + 'proxy': request.request_proxy, + 'headers': request.request_headers, + 'source_address': request.request_source_address, + 'verify_tls': request.request_verify_tls, + # Important: If your provider has its own caching, please respect `bypass_cache`. + # This may be used in the future to request a fresh PO Token if required. + 'do_not_cache': request.bypass_cache, + }).encode(), proxies={'all': None}), + pot_request=request, + note=( + f'Requesting {request.context.value} PO Token ' + f'for {request.internal_client_name} client from external provider'), + ) + + except RequestError as e: + # ℹ️ If there is an error, raise PoTokenProviderError. + # You can specify whether it is expected or not. If it is unexpected, + # the log will include a link to the bug report location (BUG_REPORT_LOCATION). + raise PoTokenProviderError( + 'Networking error while fetching to get PO Token from external provider', + expected=True + ) from e + + # Note: PO Token is expected to be base64url encoded + po_token = traverse_obj(response, 'po_token') + if not po_token: + raise PoTokenProviderError( + 'Bad PO Token Response from external provider', + expected=False + ) + + return PoTokenResponse( + po_token=po_token, + # Optional, add a custom expiration timestamp for the token. Use for caching. + # By default, yt-dlp will use the default ttl from a registered cache spec (see below) + # Set to 0 or -1 to not cache this response. + expires_at=None, + ) + + +# If there are multiple PO Token Providers that can handle the same PoTokenRequest, +# you can define a preference function to increase/decrease the priority of providers. + +@register_preference(MyPoTokenProviderPTP) +def my_provider_preference(provider: PoTokenProvider, request: PoTokenRequest) -> int: + return 50 +``` + +## Logging Guidelines + +- Use the `self.logger` object to log messages. +- When making HTTP requests or any other expensive operation, use `self.logger.info` to log a message to standard non-verbose output. + - This lets users know what is happening when a time-expensive operation is taking place. + - It is recommended to include the PO Token context and internal client name in the message if possible. + - For example, `self.logger.info(f'Requesting {request.context.value} PO Token for {request.internal_client_name} client from external provider')`. +- Use `self.logger.debug` to log a message to the verbose output (`--verbose`). + - For debugging information visible to users posting verbose logs. + - Try to not log too much, prefer using trace logging for detailed debug messages. +- Use `self.logger.trace` to log a message to the PO Token debug output (`--extractor-args "youtube:pot_trace=true"`). + - Log as much as you like here as needed for debugging your provider. +- Avoid logging PO Tokens or any sensitive information to debug or info output. + +## Debugging + +- Use `-v --extractor-args "youtube:pot_trace=true"` to enable PO Token debug output. + +## Caching + +> [!WARNING] +> The following describes more advance features that most users/developers will not need to use. + +> [!IMPORTANT] +> yt-dlp currently has a built-in LRU Memory Cache Provider and a cache spec provider for WebPO Tokens. +> You should only need to implement cache providers if you want an external cache, or a cache spec if you are handling non-WebPO Tokens. + +### Cache Providers + +`yt_dlp.extractor.youtube.pot.cache` + +```python +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheProvider, + register_preference, + register_provider +) + +from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest + + +@register_provider +class MyCacheProviderPCP(PoTokenCacheProvider): # Provider class name must end with "PCP" + PROVIDER_VERSION = '0.1.0' + # Define a unique display name for the provider + PROVIDER_NAME = 'my-cache-provider' + BUG_REPORT_LOCATION = 'https://issues.example.com/report' + + def is_available(self) -> bool: + """ + Check if the provider is available (e.g. all required dependencies are available) + This is used to determine if the provider should be used and to provide debug information. + + IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations. + + Since this is called multiple times, we recommend caching the result. + """ + return True + + def get(self, key: str): + # ℹ️ Similar to PO Token Providers, Cache Providers and Cache Spec Providers + # are passed down extractor args matching key youtubepot-. + some_setting = self._configuration_arg('some_setting', default=['default_value'])[0] + return self.my_cache.get(key) + + def store(self, key: str, value: str, expires_at: int): + # ⚠ expires_at MUST be respected. + # Cache entries should not be returned if they have expired. + self.my_cache.store(key, value, expires_at) + + def delete(self, key: str): + self.my_cache.delete(key) + + def close(self): + # Optional close hook, called when the YoutubeDL instance is closed. + pass + +# If there are multiple PO Token Cache Providers available, you can +# define a preference function to increase/decrease the priority of providers. + +# IMPORTANT: Providers should be in preference of cache lookup time. +# For example, a memory cache should have a higher preference than a disk cache. + +# VERY IMPORTANT: yt-dlp has a built-in memory cache with a priority of 10000. +# Your cache provider should be lower than this. + + +@register_preference(MyCacheProviderPCP) +def my_cache_preference(provider: PoTokenCacheProvider, request: PoTokenRequest) -> int: + return 50 +``` + +### Cache Specs + +`yt_dlp.extractor.youtube.pot.cache` + +These are used to provide information on how to cache a particular PO Token Request. +You might have a different cache spec for different kinds of PO Tokens. + +```python +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + CacheProviderWritePolicy, + register_spec, +) +from yt_dlp.utils import traverse_obj +from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest + + +@register_spec +class MyCacheSpecProviderPCSP(PoTokenCacheSpecProvider): # Provider class name must end with "PCSP" + PROVIDER_VERSION = '0.1.0' + # Define a unique display name for the provider + PROVIDER_NAME = 'mycachespec' + BUG_REPORT_LOCATION = 'https://issues.example.com/report' + + def generate_cache_spec(self, request: PoTokenRequest): + + client_name = traverse_obj(request.innertube_context, ('client', 'clientName')) + if client_name != 'ANDROID': + # ℹ️ If the request is not supported by the cache spec, return None + return None + + # Generate a cache spec for the request + return PoTokenCacheSpec( + # Key bindings to uniquely identify the request. These are used to generate a cache key. + key_bindings={ + 'client_name': client_name, + 'content_binding': 'unique_content_binding', + 'ip': traverse_obj(request.innertube_context, ('client', 'remoteHost')), + 'source_address': request.request_source_address, + 'proxy': request.request_proxy, + }, + # Default Cache TTL in seconds + default_ttl=21600, + + # Optional: Specify a write policy. + # WRITE_FIRST will write to the highest priority provider only, + # whereas WRITE_ALL will write to all providers. + # WRITE_FIRST may be useful if the PO Token is short-lived + # and there is no use writing to all providers. + write_policy=CacheProviderWritePolicy.WRITE_ALL, + ) +``` \ No newline at end of file diff --git a/yt_dlp/extractor/youtube/pot/__init__.py b/yt_dlp/extractor/youtube/pot/__init__.py new file mode 100644 index 000000000..febcee010 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/__init__.py @@ -0,0 +1,3 @@ +# Trigger import of built-in providers +from ._builtin.memory_cache import MemoryLRUPCP as _MemoryLRUPCP # noqa: F401 +from ._builtin.webpo_cachespec import WebPoPCSP as _WebPoPCSP # noqa: F401 diff --git a/yt_dlp/extractor/youtube/pot/_builtin/__init__.py b/yt_dlp/extractor/youtube/pot/_builtin/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/yt_dlp/extractor/youtube/pot/_builtin/memory_cache.py b/yt_dlp/extractor/youtube/pot/_builtin/memory_cache.py new file mode 100644 index 000000000..9c913e8c9 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_builtin/memory_cache.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import datetime as dt +import typing +from threading import Lock + +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot._registry import _pot_memory_cache +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheProvider, + register_preference, + register_provider, +) + + +def initialize_global_cache(max_size: int): + if _pot_memory_cache.value.get('cache') is None: + _pot_memory_cache.value['cache'] = {} + _pot_memory_cache.value['lock'] = Lock() + _pot_memory_cache.value['max_size'] = max_size + + if _pot_memory_cache.value['max_size'] != max_size: + raise ValueError('Cannot change max_size of initialized global memory cache') + + return ( + _pot_memory_cache.value['cache'], + _pot_memory_cache.value['lock'], + _pot_memory_cache.value['max_size'], + ) + + +@register_provider +class MemoryLRUPCP(PoTokenCacheProvider, BuiltinIEContentProvider): + PROVIDER_NAME = 'memory' + DEFAULT_CACHE_SIZE = 25 + + def __init__( + self, + *args, + initialize_cache: typing.Callable[[int], tuple[dict[str, tuple[str, int]], Lock, int]] = initialize_global_cache, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.cache, self.lock, self.max_size = initialize_cache(self.DEFAULT_CACHE_SIZE) + + def is_available(self) -> bool: + return True + + def get(self, key: str) -> str | None: + with self.lock: + if key not in self.cache: + return None + value, expires_at = self.cache.pop(key) + if expires_at < int(dt.datetime.now(dt.timezone.utc).timestamp()): + return None + self.cache[key] = (value, expires_at) + return value + + def store(self, key: str, value: str, expires_at: int): + with self.lock: + if expires_at < int(dt.datetime.now(dt.timezone.utc).timestamp()): + return + if key in self.cache: + self.cache.pop(key) + self.cache[key] = (value, expires_at) + if len(self.cache) > self.max_size: + oldest_key = next(iter(self.cache)) + self.cache.pop(oldest_key) + + def delete(self, key: str): + with self.lock: + self.cache.pop(key, None) + + +@register_preference(MemoryLRUPCP) +def memorylru_preference(*_, **__): + # Memory LRU Cache SHOULD be the highest priority + return 10000 diff --git a/yt_dlp/extractor/youtube/pot/_builtin/webpo_cachespec.py b/yt_dlp/extractor/youtube/pot/_builtin/webpo_cachespec.py new file mode 100644 index 000000000..426b815c7 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_builtin/webpo_cachespec.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot.cache import ( + CacheProviderWritePolicy, + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + register_spec, +) +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, +) +from yt_dlp.extractor.youtube.pot.utils import ContentBindingType, get_webpo_content_binding +from yt_dlp.utils import traverse_obj + + +@register_spec +class WebPoPCSP(PoTokenCacheSpecProvider, BuiltinIEContentProvider): + PROVIDER_NAME = 'webpo' + + def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None: + bind_to_visitor_id = self._configuration_arg( + 'bind_to_visitor_id', default=['true'])[0] == 'true' + + content_binding, content_binding_type = get_webpo_content_binding( + request, bind_to_visitor_id=bind_to_visitor_id) + + if not content_binding or not content_binding_type: + return None + + write_policy = CacheProviderWritePolicy.WRITE_ALL + if content_binding_type == ContentBindingType.VIDEO_ID: + write_policy = CacheProviderWritePolicy.WRITE_FIRST + + return PoTokenCacheSpec( + key_bindings={ + 't': 'webpo', + 'cb': content_binding, + 'cbt': content_binding_type.value, + 'ip': traverse_obj(request.innertube_context, ('client', 'remoteHost')), + 'sa': request.request_source_address, + 'px': request.request_proxy, + }, + # Integrity token response usually states it has a ttl of 12 hours (43200 seconds). + # We will default to 6 hours to be safe. + default_ttl=21600, + write_policy=write_policy, + ) diff --git a/yt_dlp/extractor/youtube/pot/_director.py b/yt_dlp/extractor/youtube/pot/_director.py new file mode 100644 index 000000000..aaf1d5290 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_director.py @@ -0,0 +1,468 @@ +from __future__ import annotations + +import base64 +import binascii +import dataclasses +import datetime as dt +import hashlib +import json +import typing +import urllib.parse +from collections.abc import Iterable + +from yt_dlp.extractor.youtube.pot._provider import ( + BuiltinIEContentProvider, + IEContentProvider, + IEContentProviderLogger, +) +from yt_dlp.extractor.youtube.pot._registry import ( + _pot_cache_provider_preferences, + _pot_cache_providers, + _pot_pcs_providers, + _pot_providers, + _ptp_preferences, +) +from yt_dlp.extractor.youtube.pot.cache import ( + CacheProviderWritePolicy, + PoTokenCacheProvider, + PoTokenCacheProviderError, + PoTokenCacheSpec, + PoTokenCacheSpecProvider, +) +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenProvider, + PoTokenProviderError, + PoTokenProviderRejectedRequest, + PoTokenRequest, + PoTokenResponse, + provider_bug_report_message, +) +from yt_dlp.utils import bug_reports_message, format_field, join_nonempty + +if typing.TYPE_CHECKING: + from yt_dlp.extractor.youtube.pot.cache import CacheProviderPreference + from yt_dlp.extractor.youtube.pot.provider import Preference + + +class YoutubeIEContentProviderLogger(IEContentProviderLogger): + def __init__(self, ie, prefix, log_level: IEContentProviderLogger.LogLevel | None = None): + self.__ie = ie + self.prefix = prefix + self.log_level = log_level if log_level is not None else self.LogLevel.INFO + + def _format_msg(self, message: str): + prefixstr = format_field(self.prefix, None, '[%s] ') + return f'{prefixstr}{message}' + + def trace(self, message: str): + if self.log_level <= self.LogLevel.TRACE: + self.__ie.write_debug(self._format_msg('TRACE: ' + message)) + + def debug(self, message: str): + if self.log_level <= self.LogLevel.DEBUG: + self.__ie.write_debug(self._format_msg(message)) + + def info(self, message: str): + if self.log_level <= self.LogLevel.INFO: + self.__ie.to_screen(self._format_msg(message)) + + def warning(self, message: str, *, once=False): + if self.log_level <= self.LogLevel.WARNING: + self.__ie.report_warning(self._format_msg(message), only_once=once) + + def error(self, message: str): + if self.log_level <= self.LogLevel.ERROR: + self.__ie._downloader.report_error(self._format_msg(message), is_error=False) + + +class PoTokenCache: + + def __init__( + self, + logger: IEContentProviderLogger, + cache_providers: list[PoTokenCacheProvider], + cache_spec_providers: list[PoTokenCacheSpecProvider], + cache_provider_preferences: list[CacheProviderPreference] | None = None, + ): + self.cache_providers: dict[str, PoTokenCacheProvider] = { + provider.PROVIDER_KEY: provider for provider in (cache_providers or [])} + self.cache_provider_preferences: list[CacheProviderPreference] = cache_provider_preferences or [] + self.cache_spec_providers: dict[str, PoTokenCacheSpecProvider] = { + provider.PROVIDER_KEY: provider for provider in (cache_spec_providers or [])} + self.logger = logger + + def _get_cache_providers(self, request: PoTokenRequest) -> Iterable[PoTokenCacheProvider]: + """Sorts available cache providers by preference, given a request""" + preferences = { + provider: sum(pref(provider, request) for pref in self.cache_provider_preferences) + for provider in self.cache_providers.values() + } + if self.logger.log_level <= self.logger.LogLevel.TRACE: + # calling is_available() for every PO Token provider upfront may have some overhead + self.logger.trace(f'PO Token Cache Providers: {provider_display_list(self.cache_providers.values())}') + self.logger.trace('Cache Provider preferences for this request: {}'.format(', '.join( + f'{provider.PROVIDER_KEY}={pref}' for provider, pref in preferences.items()))) + + return ( + provider for provider in sorted( + self.cache_providers.values(), key=preferences.get, reverse=True) if provider.is_available()) + + def _get_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None: + for provider in self.cache_spec_providers.values(): + if not provider.is_available(): + continue + try: + spec = provider.generate_cache_spec(request) + if not spec: + continue + if not validate_cache_spec(spec): + self.logger.error( + f'PoTokenCacheSpecProvider "{provider.PROVIDER_KEY}" generate_cache_spec() ' + f'returned invalid spec {spec}{provider_bug_report_message(provider)}') + continue + spec = dataclasses.replace(spec, _provider=provider) + self.logger.trace( + f'Retrieved cache spec {spec} from cache spec provider "{provider.PROVIDER_NAME}"') + return spec + except Exception as e: + self.logger.error( + f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache spec provider: ' + f'{e!r}{provider_bug_report_message(provider)}') + continue + return None + + def _generate_key_bindings(self, spec: PoTokenCacheSpec) -> dict[str, str]: + bindings_cleaned = { + **{k: v for k, v in spec.key_bindings.items() if v is not None}, + # Allow us to invalidate caches if such need arises + '_dlp_cache': 'v1', + } + if spec._provider: + bindings_cleaned['_p'] = spec._provider.PROVIDER_KEY + self.logger.trace(f'Generated cache key bindings: {bindings_cleaned}') + return bindings_cleaned + + def _generate_key(self, bindings: dict) -> str: + binding_string = ''.join(repr(dict(sorted(bindings.items())))) + return hashlib.sha256(binding_string.encode()).hexdigest() + + def get(self, request: PoTokenRequest) -> PoTokenResponse | None: + spec = self._get_cache_spec(request) + if not spec: + self.logger.trace('No cache spec available for this request, unable to fetch from cache') + return None + + cache_key = self._generate_key(self._generate_key_bindings(spec)) + self.logger.trace(f'Attempting to access PO Token cache using key: {cache_key}') + + for idx, provider in enumerate(self._get_cache_providers(request)): + try: + self.logger.trace( + f'Attempting to fetch PO Token response from "{provider.PROVIDER_NAME}" cache provider') + cache_response = provider.get(cache_key) + if not cache_response: + continue + try: + po_token_response = PoTokenResponse(**json.loads(cache_response)) + except (TypeError, ValueError, json.JSONDecodeError): + po_token_response = None + if not validate_response(po_token_response): + self.logger.error( + f'Invalid PO Token response retrieved from cache provider "{provider.PROVIDER_NAME}": ' + f'{cache_response}{provider_bug_report_message(provider)}') + provider.delete(cache_key) + continue + self.logger.trace( + f'PO Token response retrieved from cache using "{provider.PROVIDER_NAME}" provider: ' + f'{po_token_response}') + if idx > 0: + # Write back to the highest priority cache provider, + # so we stop trying to fetch from lower priority providers + self.logger.trace('Writing PO Token response to highest priority cache provider') + self.store(request, po_token_response, write_policy=CacheProviderWritePolicy.WRITE_FIRST) + + return po_token_response + except PoTokenCacheProviderError as e: + self.logger.warning( + f'Error from "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}') + continue + except Exception as e: + self.logger.error( + f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider)}', + ) + continue + return None + + def store( + self, + request: PoTokenRequest, + response: PoTokenResponse, + write_policy: CacheProviderWritePolicy | None = None, + ): + spec = self._get_cache_spec(request) + if not spec: + self.logger.trace('No cache spec available for this request. Not caching.') + return + + if not validate_response(response): + self.logger.error( + f'Invalid PO Token response provided to PoTokenCache.store(): ' + f'{response}{bug_reports_message()}') + return + + cache_key = self._generate_key(self._generate_key_bindings(spec)) + self.logger.trace(f'Attempting to access PO Token cache using key: {cache_key}') + + default_expires_at = int(dt.datetime.now(dt.timezone.utc).timestamp()) + spec.default_ttl + cache_response = dataclasses.replace(response, expires_at=response.expires_at or default_expires_at) + + write_policy = write_policy or spec.write_policy + self.logger.trace(f'Using write policy: {write_policy}') + + for idx, provider in enumerate(self._get_cache_providers(request)): + try: + self.logger.trace( + f'Caching PO Token response in "{provider.PROVIDER_NAME}" cache provider ' + f'(key={cache_key}, expires_at={cache_response.expires_at})') + provider.store( + key=cache_key, + value=json.dumps(dataclasses.asdict(cache_response)), + expires_at=cache_response.expires_at) + except PoTokenCacheProviderError as e: + self.logger.warning( + f'Error from "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}') + except Exception as e: + self.logger.error( + f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider)}') + + # WRITE_FIRST should not write to lower priority providers in the case the highest priority provider fails + if idx == 0 and write_policy == CacheProviderWritePolicy.WRITE_FIRST: + return + + def close(self): + for provider in self.cache_providers.values(): + provider.close() + for spec_provider in self.cache_spec_providers.values(): + spec_provider.close() + + +class PoTokenRequestDirector: + + def __init__(self, logger: IEContentProviderLogger, cache: PoTokenCache): + self.providers: dict[str, PoTokenProvider] = {} + self.preferences: list[Preference] = [] + self.cache = cache + self.logger = logger + + def register_provider(self, provider: PoTokenProvider): + self.providers[provider.PROVIDER_KEY] = provider + + def register_preference(self, preference: Preference): + self.preferences.append(preference) + + def _get_providers(self, request: PoTokenRequest) -> Iterable[PoTokenProvider]: + """Sorts available providers by preference, given a request""" + preferences = { + provider: sum(pref(provider, request) for pref in self.preferences) + for provider in self.providers.values() + } + if self.logger.log_level <= self.logger.LogLevel.TRACE: + # calling is_available() for every PO Token provider upfront may have some overhead + self.logger.trace(f'PO Token Providers: {provider_display_list(self.providers.values())}') + self.logger.trace('Provider preferences for this request: {}'.format(', '.join( + f'{provider.PROVIDER_NAME}={pref}' for provider, pref in preferences.items()))) + + return ( + provider for provider in sorted( + self.providers.values(), key=preferences.get, reverse=True) + if provider.is_available() + ) + + def _get_po_token(self, request) -> PoTokenResponse | None: + for provider in self._get_providers(request): + try: + self.logger.trace( + f'Attempting to fetch a PO Token from "{provider.PROVIDER_NAME}" provider') + response = provider.request_pot(request.copy()) + except PoTokenProviderRejectedRequest as e: + self.logger.trace( + f'PO Token Provider "{provider.PROVIDER_NAME}" rejected this request, ' + f'trying next available provider. Reason: {e}') + continue + except PoTokenProviderError as e: + self.logger.warning( + f'Error fetching PO Token from "{provider.PROVIDER_NAME}" provider: ' + f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}') + continue + except Exception as e: + self.logger.error( + f'Unexpected error when fetching PO Token from "{provider.PROVIDER_NAME}" provider: ' + f'{e!r}{provider_bug_report_message(provider)}') + continue + + self.logger.trace(f'PO Token response from "{provider.PROVIDER_NAME}" provider: {response}') + + if not validate_response(response): + self.logger.error( + f'Invalid PO Token response received from "{provider.PROVIDER_NAME}" provider: ' + f'{response}{provider_bug_report_message(provider)}') + continue + + return response + + self.logger.trace('No PO Token providers were able to provide a valid PO Token') + return None + + def get_po_token(self, request: PoTokenRequest) -> str | None: + if not request.bypass_cache: + if pot_response := self.cache.get(request): + return clean_pot(pot_response.po_token) + + if not self.providers: + self.logger.trace('No PO Token providers registered') + return None + + pot_response = self._get_po_token(request) + if not pot_response: + return None + + pot_response.po_token = clean_pot(pot_response.po_token) + + if pot_response.expires_at is None or pot_response.expires_at > 0: + self.cache.store(request, pot_response) + else: + self.logger.trace( + f'PO Token response will not be cached (expires_at={pot_response.expires_at})') + + return pot_response.po_token + + def close(self): + for provider in self.providers.values(): + provider.close() + self.cache.close() + + +EXTRACTOR_ARG_PREFIX = 'youtubepot' + + +def initialize_pot_director(ie): + assert ie._downloader is not None, 'Downloader not set' + + enable_trace = ie._configuration_arg( + 'pot_trace', ['false'], ie_key='youtube', casesense=False)[0] == 'true' + + if enable_trace: + log_level = IEContentProviderLogger.LogLevel.TRACE + elif ie.get_param('verbose', False): + log_level = IEContentProviderLogger.LogLevel.DEBUG + else: + log_level = IEContentProviderLogger.LogLevel.INFO + + def get_provider_logger_and_settings(provider, logger_key): + logger_prefix = f'{logger_key}:{provider.PROVIDER_NAME}' + extractor_key = f'{EXTRACTOR_ARG_PREFIX}-{provider.PROVIDER_KEY.lower()}' + return ( + YoutubeIEContentProviderLogger(ie, logger_prefix, log_level=log_level), + ie.get_param('extractor_args', {}).get(extractor_key, {})) + + cache_providers = [] + for cache_provider in _pot_cache_providers.value.values(): + logger, settings = get_provider_logger_and_settings(cache_provider, 'pot:cache') + cache_providers.append(cache_provider(ie, logger, settings)) + cache_spec_providers = [] + for cache_spec_provider in _pot_pcs_providers.value.values(): + logger, settings = get_provider_logger_and_settings(cache_spec_provider, 'pot:cache:spec') + cache_spec_providers.append(cache_spec_provider(ie, logger, settings)) + + cache = PoTokenCache( + logger=YoutubeIEContentProviderLogger(ie, 'pot:cache', log_level=log_level), + cache_providers=cache_providers, + cache_spec_providers=cache_spec_providers, + cache_provider_preferences=list(_pot_cache_provider_preferences.value), + ) + + director = PoTokenRequestDirector( + logger=YoutubeIEContentProviderLogger(ie, 'pot', log_level=log_level), + cache=cache, + ) + + ie._downloader.add_close_hook(director.close) + + for provider in _pot_providers.value.values(): + logger, settings = get_provider_logger_and_settings(provider, 'pot') + director.register_provider(provider(ie, logger, settings)) + + for preference in _ptp_preferences.value: + director.register_preference(preference) + + if director.logger.log_level <= director.logger.LogLevel.DEBUG: + # calling is_available() for every PO Token provider upfront may have some overhead + director.logger.debug(f'PO Token Providers: {provider_display_list(director.providers.values())}') + director.logger.debug(f'PO Token Cache Providers: {provider_display_list(cache.cache_providers.values())}') + director.logger.debug(f'PO Token Cache Spec Providers: {provider_display_list(cache.cache_spec_providers.values())}') + director.logger.trace(f'Registered {len(director.preferences)} provider preferences') + director.logger.trace(f'Registered {len(cache.cache_provider_preferences)} cache provider preferences') + + return director + + +def provider_display_list(providers: Iterable[IEContentProvider]): + def provider_display_name(provider): + display_str = join_nonempty( + provider.PROVIDER_NAME, + provider.PROVIDER_VERSION if not isinstance(provider, BuiltinIEContentProvider) else None) + statuses = [] + if not isinstance(provider, BuiltinIEContentProvider): + statuses.append('external') + if not provider.is_available(): + statuses.append('unavailable') + if statuses: + display_str += f' ({", ".join(statuses)})' + return display_str + + return ', '.join(provider_display_name(provider) for provider in providers) or 'none' + + +def clean_pot(po_token: str): + # Clean and validate the PO Token. This will strip invalid characters off + # (e.g. additional url params the user may accidentally include) + try: + return base64.urlsafe_b64encode( + base64.urlsafe_b64decode(urllib.parse.unquote(po_token))).decode() + except (binascii.Error, ValueError): + raise ValueError('Invalid PO Token') + + +def validate_response(response: PoTokenResponse | None): + if ( + not isinstance(response, PoTokenResponse) + or not isinstance(response.po_token, str) + or not response.po_token + ): # noqa: SIM103 + return False + + try: + clean_pot(response.po_token) + except ValueError: + return False + + if not isinstance(response.expires_at, int): + return response.expires_at is None + + return response.expires_at <= 0 or response.expires_at > int(dt.datetime.now(dt.timezone.utc).timestamp()) + + +def validate_cache_spec(spec: PoTokenCacheSpec): + return ( + isinstance(spec, PoTokenCacheSpec) + and isinstance(spec.write_policy, CacheProviderWritePolicy) + and isinstance(spec.default_ttl, int) + and isinstance(spec.key_bindings, dict) + and all(isinstance(k, str) for k in spec.key_bindings) + and all(v is None or isinstance(v, str) for v in spec.key_bindings.values()) + and bool([v for v in spec.key_bindings.values() if v is not None]) + ) diff --git a/yt_dlp/extractor/youtube/pot/_provider.py b/yt_dlp/extractor/youtube/pot/_provider.py new file mode 100644 index 000000000..af7034d22 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_provider.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import abc +import enum +import functools + +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.utils import NO_DEFAULT, bug_reports_message, classproperty, traverse_obj +from yt_dlp.version import __version__ + +# xxx: these could be generalized outside YoutubeIE eventually + + +class IEContentProviderLogger(abc.ABC): + + class LogLevel(enum.IntEnum): + TRACE = 0 + DEBUG = 10 + INFO = 20 + WARNING = 30 + ERROR = 40 + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.upper() + if value in dir(cls): + return cls[value] + + return cls.INFO + + log_level = LogLevel.INFO + + @abc.abstractmethod + def trace(self, message: str): + pass + + @abc.abstractmethod + def debug(self, message: str): + pass + + @abc.abstractmethod + def info(self, message: str): + pass + + @abc.abstractmethod + def warning(self, message: str, *, once=False): + pass + + @abc.abstractmethod + def error(self, message: str): + pass + + +class IEContentProviderError(Exception): + def __init__(self, msg=None, expected=False): + super().__init__(msg) + self.expected = expected + + +class IEContentProvider(abc.ABC): + PROVIDER_VERSION: str = '0.0.0' + BUG_REPORT_LOCATION: str = '(developer has not provided a bug report location)' + + def __init__( + self, + ie: InfoExtractor, + logger: IEContentProviderLogger, + settings: dict[str, list[str]], *_, **__, + ): + self.ie = ie + self.settings = settings or {} + self.logger = logger + super().__init__() + + @classmethod + def __init_subclass__(cls, *, suffix=None, **kwargs): + if suffix: + cls._PROVIDER_KEY_SUFFIX = suffix + return super().__init_subclass__(**kwargs) + + @classproperty + def PROVIDER_NAME(cls) -> str: + return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)] + + @classproperty + def BUG_REPORT_MESSAGE(cls): + return f'please report this issue to the provider developer at {cls.BUG_REPORT_LOCATION} .' + + @classproperty + def PROVIDER_KEY(cls) -> str: + assert hasattr(cls, '_PROVIDER_KEY_SUFFIX'), 'Content Provider implementation must define a suffix for the provider key' + assert cls.__name__.endswith(cls._PROVIDER_KEY_SUFFIX), f'PoTokenProvider class names must end with "{cls._PROVIDER_KEY_SUFFIX}"' + return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)] + + @abc.abstractmethod + def is_available(self) -> bool: + """ + Check if the provider is available (e.g. all required dependencies are available) + This is used to determine if the provider should be used and to provide debug information. + + IMPORTANT: This method should not make any network requests or perform any expensive operations. + It is called multiple times. + """ + raise NotImplementedError + + def close(self): # noqa: B027 + pass + + def _configuration_arg(self, key, default=NO_DEFAULT, *, casesense=False): + """ + @returns A list of values for the setting given by "key" + or "default" if no such key is present + @param default The default value to return when the key is not present (default: []) + @param casesense When false, the values are converted to lower case + """ + val = traverse_obj(self.settings, key) + if val is None: + return [] if default is NO_DEFAULT else default + return list(val) if casesense else [x.lower() for x in val] + + +class BuiltinIEContentProvider(IEContentProvider, abc.ABC): + PROVIDER_VERSION = __version__ + BUG_REPORT_MESSAGE = bug_reports_message(before='') + + +def register_provider_generic( + provider, + base_class, + registry, +): + """Generic function to register a provider class""" + assert issubclass(provider, base_class), f'{provider} must be a subclass of {base_class.__name__}' + assert provider.PROVIDER_KEY not in registry, f'{base_class.__name__} {provider.PROVIDER_KEY} already registered' + registry[provider.PROVIDER_KEY] = provider + return provider + + +def register_preference_generic( + base_class, + registry, + *providers, +): + """Generic function to register a preference for a provider""" + assert all(issubclass(provider, base_class) for provider in providers) + + def outer(preference): + @functools.wraps(preference) + def inner(provider, *args, **kwargs): + if not providers or isinstance(provider, providers): + return preference(provider, *args, **kwargs) + return 0 + registry.add(inner) + return preference + return outer diff --git a/yt_dlp/extractor/youtube/pot/_registry.py b/yt_dlp/extractor/youtube/pot/_registry.py new file mode 100644 index 000000000..c72a622c1 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_registry.py @@ -0,0 +1,8 @@ +from yt_dlp.globals import Indirect + +_pot_providers = Indirect({}) +_ptp_preferences = Indirect(set()) +_pot_pcs_providers = Indirect({}) +_pot_cache_providers = Indirect({}) +_pot_cache_provider_preferences = Indirect(set()) +_pot_memory_cache = Indirect({}) diff --git a/yt_dlp/extractor/youtube/pot/cache.py b/yt_dlp/extractor/youtube/pot/cache.py new file mode 100644 index 000000000..6d69316ad --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/cache.py @@ -0,0 +1,97 @@ +"""PUBLIC API""" + +from __future__ import annotations + +import abc +import dataclasses +import enum +import typing + +from yt_dlp.extractor.youtube.pot._provider import ( + IEContentProvider, + IEContentProviderError, + register_preference_generic, + register_provider_generic, +) +from yt_dlp.extractor.youtube.pot._registry import ( + _pot_cache_provider_preferences, + _pot_cache_providers, + _pot_pcs_providers, +) +from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest + + +class PoTokenCacheProviderError(IEContentProviderError): + """An error occurred while fetching a PO Token""" + + +class PoTokenCacheProvider(IEContentProvider, abc.ABC, suffix='PCP'): + @abc.abstractmethod + def get(self, key: str) -> str | None: + pass + + @abc.abstractmethod + def store(self, key: str, value: str, expires_at: int): + pass + + @abc.abstractmethod + def delete(self, key: str): + pass + + +class CacheProviderWritePolicy(enum.Enum): + WRITE_ALL = enum.auto() # Write to all cache providers + WRITE_FIRST = enum.auto() # Write to only the first cache provider + + +@dataclasses.dataclass +class PoTokenCacheSpec: + key_bindings: dict[str, str | None] + default_ttl: int + write_policy: CacheProviderWritePolicy = CacheProviderWritePolicy.WRITE_ALL + + # Internal + _provider: PoTokenCacheSpecProvider | None = None + + +class PoTokenCacheSpecProvider(IEContentProvider, abc.ABC, suffix='PCSP'): + + def is_available(self) -> bool: + return True + + @abc.abstractmethod + def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None: + """Generate a cache spec for the given request""" + pass + + +def register_provider(provider: type[PoTokenCacheProvider]): + """Register a PoTokenCacheProvider class""" + return register_provider_generic( + provider=provider, + base_class=PoTokenCacheProvider, + registry=_pot_cache_providers.value, + ) + + +def register_spec(provider: type[PoTokenCacheSpecProvider]): + """Register a PoTokenCacheSpecProvider class""" + return register_provider_generic( + provider=provider, + base_class=PoTokenCacheSpecProvider, + registry=_pot_pcs_providers.value, + ) + + +def register_preference( + *providers: type[PoTokenCacheProvider]) -> typing.Callable[[CacheProviderPreference], CacheProviderPreference]: + """Register a preference for a PoTokenCacheProvider""" + return register_preference_generic( + PoTokenCacheProvider, + _pot_cache_provider_preferences.value, + *providers, + ) + + +if typing.TYPE_CHECKING: + CacheProviderPreference = typing.Callable[[PoTokenCacheProvider, PoTokenRequest], int] diff --git a/yt_dlp/extractor/youtube/pot/provider.py b/yt_dlp/extractor/youtube/pot/provider.py new file mode 100644 index 000000000..13b3b1f9b --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/provider.py @@ -0,0 +1,281 @@ +"""PUBLIC API""" + +from __future__ import annotations + +import abc +import copy +import dataclasses +import enum +import functools +import typing +import urllib.parse + +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.extractor.youtube.pot._provider import ( + IEContentProvider, + IEContentProviderError, + register_preference_generic, + register_provider_generic, +) +from yt_dlp.extractor.youtube.pot._registry import _pot_providers, _ptp_preferences +from yt_dlp.networking import Request, Response +from yt_dlp.utils import traverse_obj +from yt_dlp.utils.networking import HTTPHeaderDict + +__all__ = [ + 'ExternalRequestFeature', + 'PoTokenContext', + 'PoTokenProvider', + 'PoTokenProviderError', + 'PoTokenProviderRejectedRequest', + 'PoTokenRequest', + 'PoTokenResponse', + 'provider_bug_report_message', + 'register_preference', + 'register_provider', +] + + +class PoTokenContext(enum.Enum): + GVS = 'gvs' + PLAYER = 'player' + SUBS = 'subs' + + +@dataclasses.dataclass +class PoTokenRequest: + # YouTube parameters + context: PoTokenContext + innertube_context: InnertubeContext + innertube_host: str | None = None + session_index: str | None = None + player_url: str | None = None + is_authenticated: bool = False + video_webpage: str | None = None + internal_client_name: str | None = None + + # Content binding parameters + visitor_data: str | None = None + data_sync_id: str | None = None + video_id: str | None = None + + # Networking parameters + request_cookiejar: YoutubeDLCookieJar = dataclasses.field(default_factory=YoutubeDLCookieJar) + request_proxy: str | None = None + request_headers: HTTPHeaderDict = dataclasses.field(default_factory=HTTPHeaderDict) + request_timeout: float | None = None + request_source_address: str | None = None + request_verify_tls: bool = True + + # Generate a new token, do not used a cached token + # The token should still be cached for future requests + bypass_cache: bool = False + + def copy(self): + return dataclasses.replace( + self, + request_headers=HTTPHeaderDict(self.request_headers), + innertube_context=copy.deepcopy(self.innertube_context), + ) + + +@dataclasses.dataclass +class PoTokenResponse: + po_token: str + expires_at: int | None = None + + +class PoTokenProviderRejectedRequest(IEContentProviderError): + """Reject the PoTokenRequest (cannot handle the request)""" + + +class PoTokenProviderError(IEContentProviderError): + """An error occurred while fetching a PO Token""" + + +class ExternalRequestFeature(enum.Enum): + PROXY_SCHEME_HTTP = enum.auto() + PROXY_SCHEME_HTTPS = enum.auto() + PROXY_SCHEME_SOCKS4 = enum.auto() + PROXY_SCHEME_SOCKS4A = enum.auto() + PROXY_SCHEME_SOCKS5 = enum.auto() + PROXY_SCHEME_SOCKS5H = enum.auto() + SOURCE_ADDRESS = enum.auto() + DISABLE_TLS_VERIFICATION = enum.auto() + + +class PoTokenProvider(IEContentProvider, abc.ABC, suffix='PTP'): + + # Set to None to disable the check + _SUPPORTED_CONTEXTS: tuple[PoTokenContext] | None = () + + # Innertube Client Name. + # For example, "WEB", "ANDROID", "TVHTML5". + # For a list of WebPO client names, see yt_dlp.extractor.youtube.pot.utils.WEBPO_CLIENTS. + # Also see yt_dlp.extractor.youtube._base.INNERTUBE_CLIENTS + # for a list of client names currently supported by the YouTube extractor. + _SUPPORTED_CLIENTS: tuple[str] | None = () + + # If making external requests to websites (i.e. to youtube.com) + # using another library or service (i.e., not _request_webpage), + # add the request features that are supported. + # If only using _request_webpage to make external requests, set this to None. + _SUPPORTED_EXTERNAL_REQUEST_FEATURES: tuple[ExternalRequestFeature] | None = () + + def __validate_request(self, request: PoTokenRequest): + if not self.is_available(): + raise PoTokenProviderRejectedRequest(f'{self.PROVIDER_NAME} is not available') + + # Validate request using built-in settings + if ( + self._SUPPORTED_CONTEXTS is not None + and request.context not in self._SUPPORTED_CONTEXTS + ): + raise PoTokenProviderRejectedRequest( + f'PO Token Context "{request.context}" is not supported by {self.PROVIDER_NAME}') + + if self._SUPPORTED_CLIENTS is not None: + client_name = traverse_obj( + request.innertube_context, ('client', 'clientName')) + if client_name not in self._SUPPORTED_CLIENTS: + raise PoTokenProviderRejectedRequest( + f'Client "{client_name}" is not supported by {self.PROVIDER_NAME}. ' + f'Supported clients: {", ".join(self._SUPPORTED_CLIENTS) or "none"}') + + self.__validate_external_request_features(request) + + @functools.cached_property + def _supported_proxy_schemes(self): + return { + scheme: feature + for scheme, feature in { + 'http': ExternalRequestFeature.PROXY_SCHEME_HTTP, + 'https': ExternalRequestFeature.PROXY_SCHEME_HTTPS, + 'socks4': ExternalRequestFeature.PROXY_SCHEME_SOCKS4, + 'socks4a': ExternalRequestFeature.PROXY_SCHEME_SOCKS4A, + 'socks5': ExternalRequestFeature.PROXY_SCHEME_SOCKS5, + 'socks5h': ExternalRequestFeature.PROXY_SCHEME_SOCKS5H, + }.items() + if feature in (self._SUPPORTED_EXTERNAL_REQUEST_FEATURES or []) + } + + def __validate_external_request_features(self, request: PoTokenRequest): + if self._SUPPORTED_EXTERNAL_REQUEST_FEATURES is None: + return + + if request.request_proxy: + scheme = urllib.parse.urlparse(request.request_proxy).scheme + if scheme.lower() not in self._supported_proxy_schemes: + raise PoTokenProviderRejectedRequest( + f'External requests by "{self.PROVIDER_NAME}" provider do not ' + f'support proxy scheme "{scheme}". Supported proxy schemes: ' + f'{", ".join(self._supported_proxy_schemes) or "none"}') + + if ( + request.request_source_address + and ExternalRequestFeature.SOURCE_ADDRESS not in self._SUPPORTED_EXTERNAL_REQUEST_FEATURES + ): + raise PoTokenProviderRejectedRequest( + f'External requests by "{self.PROVIDER_NAME}" provider ' + f'do not support setting source address') + + if ( + not request.request_verify_tls + and ExternalRequestFeature.DISABLE_TLS_VERIFICATION not in self._SUPPORTED_EXTERNAL_REQUEST_FEATURES + ): + raise PoTokenProviderRejectedRequest( + f'External requests by "{self.PROVIDER_NAME}" provider ' + f'do not support ignoring TLS certificate failures') + + def request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + self.__validate_request(request) + return self._real_request_pot(request) + + @abc.abstractmethod + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + """To be implemented by subclasses""" + pass + + # Helper functions + + def _request_webpage(self, request: Request, pot_request: PoTokenRequest | None = None, note=None, **kwargs) -> Response: + """Make a request using the internal HTTP Client. + Use this instead of calling requests, urllib3 or other HTTP client libraries directly! + + YouTube cookies will be automatically applied if this request is made to YouTube. + + @param request: The request to make + @param pot_request: The PoTokenRequest to use. Request parameters will be merged from it. + @param note: Custom log message to display when making the request. Set to `False` to disable logging. + + Tips: + - Disable proxy (e.g. if calling local service): Request(..., proxies={'all': None}) + - Set request timeout: Request(..., extensions={'timeout': 5.0}) + """ + req = request.copy() + + # Merge some ctx request settings into the request + # Most of these will already be used by the configured ydl instance, + # however, the YouTube extractor may override some. + if pot_request is not None: + req.headers = HTTPHeaderDict(pot_request.request_headers, req.headers) + req.proxies = req.proxies or ({'all': pot_request.request_proxy} if pot_request.request_proxy else {}) + + if pot_request.request_cookiejar is not None: + req.extensions['cookiejar'] = req.extensions.get('cookiejar', pot_request.request_cookiejar) + + if note is not False: + self.logger.info(str(note) if note else 'Requesting webpage') + return self.ie._downloader.urlopen(req) + + +def register_provider(provider: type[PoTokenProvider]): + """Register a PoTokenProvider class""" + return register_provider_generic( + provider=provider, + base_class=PoTokenProvider, + registry=_pot_providers.value, + ) + + +def provider_bug_report_message(provider: IEContentProvider, before=';'): + msg = provider.BUG_REPORT_MESSAGE + + before = before.rstrip() + if not before or before.endswith(('.', '!', '?')): + msg = msg[0].title() + msg[1:] + + return f'{before} {msg}' if before else msg + + +def register_preference(*providers: type[PoTokenProvider]) -> typing.Callable[[Preference], Preference]: + """Register a preference for a PoTokenProvider""" + return register_preference_generic( + PoTokenProvider, + _ptp_preferences.value, + *providers, + ) + + +if typing.TYPE_CHECKING: + Preference = typing.Callable[[PoTokenProvider, PoTokenRequest], int] + __all__.append('Preference') + + # Barebones innertube context. There may be more fields. + class ClientInfo(typing.TypedDict, total=False): + hl: str | None + gl: str | None + remoteHost: str | None + deviceMake: str | None + deviceModel: str | None + visitorData: str | None + userAgent: str | None + clientName: str + clientVersion: str + osName: str | None + osVersion: str | None + + class InnertubeContext(typing.TypedDict, total=False): + client: ClientInfo + request: dict + user: dict diff --git a/yt_dlp/extractor/youtube/pot/utils.py b/yt_dlp/extractor/youtube/pot/utils.py new file mode 100644 index 000000000..7a5b7d4ab --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/utils.py @@ -0,0 +1,73 @@ +"""PUBLIC API""" + +from __future__ import annotations + +import base64 +import contextlib +import enum +import re +import urllib.parse + +from yt_dlp.extractor.youtube.pot.provider import PoTokenContext, PoTokenRequest +from yt_dlp.utils import traverse_obj + +__all__ = ['WEBPO_CLIENTS', 'ContentBindingType', 'get_webpo_content_binding'] + +WEBPO_CLIENTS = ( + 'WEB', + 'MWEB', + 'TVHTML5', + 'WEB_EMBEDDED_PLAYER', + 'WEB_CREATOR', + 'WEB_REMIX', + 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', +) + + +class ContentBindingType(enum.Enum): + VISITOR_DATA = 'visitor_data' + DATASYNC_ID = 'datasync_id' + VIDEO_ID = 'video_id' + VISITOR_ID = 'visitor_id' + + +def get_webpo_content_binding( + request: PoTokenRequest, + webpo_clients=WEBPO_CLIENTS, + bind_to_visitor_id=False, +) -> tuple[str | None, ContentBindingType | None]: + + client_name = traverse_obj(request.innertube_context, ('client', 'clientName')) + if not client_name or client_name not in webpo_clients: + return None, None + + if request.context == PoTokenContext.GVS or client_name in ('WEB_REMIX', ): + if request.is_authenticated: + return request.data_sync_id, ContentBindingType.DATASYNC_ID + else: + if bind_to_visitor_id: + visitor_id = _extract_visitor_id(request.visitor_data) + if visitor_id: + return visitor_id, ContentBindingType.VISITOR_ID + return request.visitor_data, ContentBindingType.VISITOR_DATA + + elif request.context in (PoTokenContext.PLAYER, PoTokenContext.SUBS): + return request.video_id, ContentBindingType.VIDEO_ID + + return None, None + + +def _extract_visitor_id(visitor_data): + if not visitor_data: + return None + + # Attempt to extract the visitor ID from the visitor_data protobuf + # xxx: ideally should use a protobuf parser + with contextlib.suppress(Exception): + visitor_id = base64.urlsafe_b64decode( + urllib.parse.unquote_plus(visitor_data))[2:13].decode() + # check that visitor id is all letters and numbers + if re.fullmatch(r'[A-Za-z0-9_-]{11}', visitor_id): + return visitor_id + + return None diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 4bef1017c..10be582a3 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -1,331 +1,410 @@ +import itertools +import json import re +import time from .common import InfoExtractor from ..utils import ( - NO_DEFAULT, ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, join_nonempty, - merge_dicts, + make_archive_id, parse_codecs, - qualities, - traverse_obj, - try_get, + parse_iso8601, + parse_qs, + smuggle_url, unified_timestamp, - update_url_query, + unsmuggle_url, url_or_none, urljoin, + variadic, ) +from ..utils.traversal import require, traverse_obj class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') + _TOKEN_CACHE_PARAMS = ('zdf', 'api-token') + _token_cache = {} - def _download_v2_doc(self, document_id): - return self._download_json( - f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', - document_id) + def _get_api_token(self): + # As of 2025-03, this API is used by the Android app for getting tokens. + # An equivalent token could be extracted from the webpage should the API become unavailable. + # For now this allows the extractor to avoid dealing with Next.js hydration data. + if not self._token_cache: + self._token_cache.update(self.cache.load(*self._TOKEN_CACHE_PARAMS, default={})) - def _call_api(self, url, video_id, item, api_token=None, referrer=None): - headers = {} - if api_token: - headers['Api-Auth'] = f'Bearer {api_token}' - if referrer: - headers['Referer'] = referrer + if traverse_obj(self._token_cache, ('expires', {int_or_none}), default=0) < int(time.time()): + self._token_cache.update(self._download_json( + 'https://zdf-prod-futura.zdf.de/mediathekV2/token', None, + 'Downloading API token', 'Failed to download API token')) + self.cache.store(*self._TOKEN_CACHE_PARAMS, self._token_cache) + + return f'{self._token_cache["type"]} {self._token_cache["token"]}' + + def _call_api(self, url, video_id, item, api_token=None): return self._download_json( - url, video_id, f'Downloading JSON {item}', headers=headers) + url, video_id, f'Downloading {item}', f'Failed to download {item}', + headers=filter_dict({'Api-Auth': api_token})) + + def _parse_aspect_ratio(self, aspect_ratio): + if not aspect_ratio or not isinstance(aspect_ratio, str): + return None + mobj = re.match(r'(?P\d+):(?P\d+)', aspect_ratio) + return int(mobj.group('width')) / int(mobj.group('height')) if mobj else None + + def _extract_chapters(self, data): + return traverse_obj(data, (lambda _, v: v['anchorOffset'], { + 'start_time': ('anchorOffset', {float_or_none}), + 'title': ('anchorLabel', {str}), + })) or None @staticmethod def _extract_subtitles(src): + seen_urls = set() subtitles = {} - for caption in try_get(src, lambda x: x['captions'], list) or []: + for caption in src: subtitle_url = url_or_none(caption.get('uri')) - if subtitle_url: - lang = caption.get('language', 'deu') - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - }) + if not subtitle_url or subtitle_url in seen_urls: + continue + seen_urls.add(subtitle_url) + lang = caption.get('language') or 'deu' + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + }) return subtitles - def _extract_format(self, video_id, formats, format_urls, meta): - format_url = url_or_none(meta.get('url')) - if not format_url or format_url in format_urls: - return - format_urls.add(format_url) + def _expand_ptmd_template(self, api_base_url, template): + return urljoin(api_base_url, template.replace('{playerId}', 'android_native_6')) - mime_type, ext = meta.get('mimeType'), determine_ext(format_url) - if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - new_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False) - elif mime_type == 'application/f4m+xml' or ext == 'f4m': - new_formats = self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) - elif ext == 'mpd': - new_formats = self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False) - else: - f = parse_codecs(meta.get('mimeCodec')) - if not f and meta.get('type'): - data = meta['type'].split('_') - if try_get(data, lambda x: x[2]) == ext: - f = {'vcodec': data[0], 'acodec': data[1]} - f.update({ - 'url': format_url, - 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), - 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), - }) - new_formats = [f] - formats.extend(merge_dicts(f, { - 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '), - 'language': meta.get('language'), - 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - }) for f in new_formats) + def _extract_ptmd(self, ptmd_urls, video_id, api_token=None, aspect_ratio=None): + content_id = None + duration = None + formats, src_captions = [], [] + seen_urls = set() - def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): - ptmd = self._call_api( - ptmd_url, video_id, 'metadata', api_token, referrer) + for ptmd_url in variadic(ptmd_urls): + ptmd_url, smuggled_data = unsmuggle_url(ptmd_url, {}) + # Is it a DGS variant? (*D*eutsche *G*ebärden*s*prache' / German Sign Language) + is_dgs = smuggled_data.get('vod_media_type') == 'DGS' + ptmd = self._call_api(ptmd_url, video_id, 'PTMD data', api_token) - content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] + basename = ( + ptmd.get('basename') + # ptmd_url examples: + # https://api.zdf.de/tmd/2/android_native_6/vod/ptmd/mediathek/250328_sendung_hsh/3 + # https://tmd.phoenix.de/tmd/2/android_native_6/vod/ptmd/phoenix/221215_phx_spitzbergen + or self._search_regex(r'/vod/ptmd/[^/?#]+/(\w+)', ptmd_url, 'content ID', default=None)) + # If this is_dgs, then it's from ZDFIE and it only uses content_id for _old_archive_ids, + # and the old version of the extractor didn't extract DGS variants, so ignore basename + if not content_id and not is_dgs: + content_id = basename - formats = [] - track_uris = set() - for p in ptmd['priorityList']: - formitaeten = p.get('formitaeten') - if not isinstance(formitaeten, list): - continue - for f in formitaeten: - f_qualities = f.get('qualities') - if not isinstance(f_qualities, list): - continue - for quality in f_qualities: - tracks = try_get(quality, lambda x: x['audio']['tracks'], list) - if not tracks: - continue - for track in tracks: - self._extract_format( - content_id, formats, track_uris, { - 'url': track.get('uri'), - 'type': f.get('type'), - 'mimeType': f.get('mimeType'), - 'quality': quality.get('quality'), - 'class': track.get('class'), - 'language': track.get('language'), + if not duration: + duration = traverse_obj(ptmd, ('attributes', 'duration', 'value', {float_or_none(scale=1000)})) + src_captions += traverse_obj(ptmd, ('captions', ..., {dict})) + + for stream in traverse_obj(ptmd, ('priorityList', ..., 'formitaeten', ..., {dict})): + for quality in traverse_obj(stream, ('qualities', ..., {dict})): + for variant in traverse_obj(quality, ('audio', 'tracks', lambda _, v: url_or_none(v['uri']))): + format_url = variant['uri'] + if format_url in seen_urls: + continue + seen_urls.add(format_url) + ext = determine_ext(format_url) + if ext == 'm3u8': + fmts = self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif ext == 'mpd': + fmts = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + else: + height = int_or_none(quality.get('highestVerticalResolution')) + width = round(aspect_ratio * height) if aspect_ratio and height else None + fmts = [{ + 'url': format_url, + **parse_codecs(quality.get('mimeCodec')), + 'height': height, + 'width': width, + 'format_id': join_nonempty('http', stream.get('type')), + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), + }] + f_class = variant.get('class') + for f in fmts: + formats.append({ + **f, + 'format_id': join_nonempty(f.get('format_id'), is_dgs and 'dgs'), + 'format_note': join_nonempty( + f_class, is_dgs and 'German Sign Language', f.get('format_note'), delim=', '), + 'language': variant.get('language') or f.get('language'), + 'preference': -2 if is_dgs else -1, + 'language_preference': 10 if f_class == 'main' else -10 if f_class == 'ad' else -1, }) - duration = float_or_none(try_get( - ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) - return { - 'extractor_key': ZDFIE.ie_key(), - 'id': content_id, + 'id': content_id or video_id, 'duration': duration, 'formats': formats, - 'subtitles': self._extract_subtitles(ptmd), - '_format_sort_fields': ('tbr', 'res', 'quality', 'language_preference'), + 'subtitles': self._extract_subtitles(src_captions), } - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) + def _download_graphql(self, item_id, data_desc, query=None, body=None): + assert query or body, 'One of query or body is required' - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] + return self._download_json( + 'https://api.zdf.de/graphql', item_id, + f'Downloading {data_desc}', f'Failed to download {data_desc}', + query=query, data=json.dumps(body).encode() if body else None, + headers=filter_dict({ + 'Api-Auth': self._get_api_token(), + 'Apollo-Require-Preflight': True, + 'Content-Type': 'application/json' if body else None, + })) - t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = traverse_obj(t, ( - (('streams', 'default'), None), - ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), - ), get_all=False) - if not ptmd_path: - raise ExtractorError('Could not extract ptmd_path') - - info = self._extract_ptmd( - urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) - - thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P\d+)x(?P\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) - - chapter_marks = t.get('streamAnchorTag') or [] - chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) - chapters = [{ - 'start_time': chap.get('anchorOffset'), - 'end_time': next_chap.get('anchorOffset'), - 'title': chap.get('anchorLabel'), - } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] - - return merge_dicts(info, { - 'title': title, - 'description': content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - 'chapters': chapters or None, - 'episode': title, - **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { - 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), - 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), - 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), - 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), - 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), - 'episode_number': ('episodeNumber', {int_or_none}), - 'episode_id': ('contentId', {str}), - })), - }) - - def _extract_regular(self, url, player, video_id, query=None): - player_url = player['content'] - - content = self._call_api( - update_url_query(player_url, query), - video_id, 'content', player['apiToken'], url) - - return self._extract_entry(player_url, player, content, video_id) - - def _extract_mobile(self, video_id): - video = self._download_v2_doc(video_id) - - formats = [] - formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) - document = formitaeten and video['document'] - if formitaeten: - title = document['titel'] - content_id = document['basename'] - - format_urls = set() - for f in formitaeten or []: - self._extract_format(content_id, formats, format_urls, f) - - thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': content_id, - 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( - try_get(video, lambda x: x['meta']['editorialDate'], str)), - 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), - 'formats': formats, - } + @staticmethod + def _extract_thumbnails(source): + return [{ + 'id': str(format_id), + 'url': url, + 'preference': 1 if format_id == 'original' else 0, + **traverse_obj(re.search(r'(?P\d+|auto)[Xx](?P\d+|auto)', str(format_id)), { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + } for format_id, url in traverse_obj(source, ({dict.items}, lambda _, v: url_or_none(v[1])))] class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' + _VALID_URL = [ + r'https?://(?:www\.)?zdf\.de/(?:video|play)/(?:[^/?#]+/)*(?P[^/?#]+)', + # /nachrichten/ sub-site URLs and legacy redirects from before the redesign in 2025-03 + r'https?://(?:www\.)?zdf\.de/(?:[^/?#]+/)*(?P[^/?#]+)\.html', + ] + IE_NAME = 'zdf' _TESTS = [{ - # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', + # Standalone video (i.e. not part of a playlist), video URL + 'url': 'https://www.zdf.de/video/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100', 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613948400, - 'upload_date': '20210221', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { - # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html - 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', - 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + # Standalone video (i.e. not part of a playlist), play URL + 'url': 'https://www.zdf.de/play/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100', 'info_dict': { - 'id': '141007_ab18_10wochensommer_film', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'title': 'Ab 18! - 10 Wochen Sommer', - 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', - 'duration': 2660, - 'timestamp': 1608604200, - 'upload_date': '20201222', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + 'params': {'skip_download': True}, }, { - 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html', + # Standalone video (i.e. not part of a playlist), legacy URL before website redesign in 2025-03 + 'url': 'https://www.zdf.de/dokumentation/dokumentation-sonstige/sylt-deutschlands-edles-nordlicht-100.html', 'info_dict': { - 'id': '211230_sendung_hjo', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'description': 'md5:47dff85977bde9fb8cba9e9c9b929839', - 'duration': 1890.0, - 'upload_date': '20211230', - 'chapters': list, - 'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e', - 'title': 'heute journal vom 30.12.2021', - 'timestamp': 1640897100, + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + 'params': {'skip_download': True}, }, { - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + # Video belongs to a playlist, video URL + 'url': 'https://www.zdf.de/video/dokus/die-magie-der-farben-116/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', 'info_dict': { - 'id': '151025_magie_farben2_tex', + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', 'ext': 'mp4', - 'duration': 2615.0, - 'title': 'Die Magie der Farben (2/2)', + 'title': 'Von Königspurpur bis Jeansblau', 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'timestamp': 1465021200, - 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', - 'upload_date': '20160604', - 'episode': 'Die Magie der Farben (2/2)', - 'episode_id': 'POS_954f4170-36a5-4a41-a6cf-78f1f3b1f127', - 'season': 'Staffel 1', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', 'season_number': 1, - 'series_id': 'a39900dd-cdbd-4a6a-a413-44e8c6ae18bc', - 'season_id': '5a92e619-8a0f-4410-a3d5-19c76fbebb37', + 'episode': 'Episode 2', 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + }, { + # Video belongs to a playlist, play URL + 'url': 'https://www.zdf.de/play/dokus/die-magie-der-farben-116/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Von Königspurpur bis Jeansblau', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + 'params': {'skip_download': True}, + }, { + # Video belongs to a playlist, legacy URL before website redesign in 2025-03 + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Von Königspurpur bis Jeansblau', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + 'params': {'skip_download': True}, + }, { + # Video with chapters + # Also: video with sign-language variant + 'url': 'https://www.zdf.de/video/magazine/heute-journal-104/heute-journal-vom-19-12-2021-100', + 'md5': '6ada39465497a84fb98d48ffff69e7b7', + 'info_dict': { + 'id': 'heute-journal-vom-19-12-2021-100', + 'ext': 'mp4', + 'title': 'heute journal vom 19.12.2021', + 'description': 'md5:02504cf3b03777ff32fcc927d260c5dd', + 'duration': 1770.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/273e5545-16e7-4ca3-898e-52fe9e06d964?layout=1920x1080', + 'chapters': 'count:11', + 'series': 'heute journal', + 'series_id': 'heute-journal-104', + 'season': 'Season 2021', + 'season_number': 2021, + 'episode': 'Episode 370', + 'episode_number': 370, + 'timestamp': 1639946700, + 'upload_date': '20211219', + # Videos with sign language variants must not have a 'dgs' suffix on their old archive IDs. + '_old_archive_ids': ['zdf 211219_sendung_hjo'], + }, + }, { + # Video that requires fallback extraction + 'url': 'https://www.zdf.de/nachrichten/politik/deutschland/koalitionsverhandlungen-spd-cdu-csu-dobrindt-100.html', + 'md5': 'c3a78514dd993a5781aa3afe50db51e2', + 'info_dict': { + 'id': 'koalitionsverhandlungen-spd-cdu-csu-dobrindt-100', + 'ext': 'mp4', + 'title': 'Dobrindt schließt Steuererhöhungen aus', + 'description': 'md5:9a117646d7b8df6bc902eb543a9c9023', + 'duration': 325, + 'thumbnail': 'https://www.zdf.de/assets/dobrindt-csu-berlin-direkt-100~1920x1080?cb=1743357653736', + 'timestamp': 1743374520, + 'upload_date': '20250330', + '_old_archive_ids': ['zdf 250330_clip_2_bdi'], }, }, { 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', 'md5': '57af4423db0455a3975d2dc4578536bc', 'info_dict': { + 'id': 'funk-alles-ist-verzaubert-102', 'ext': 'mp4', - 'id': 'video_funk_1770473', - 'duration': 1278.0, 'title': 'Alles ist verzaubert', 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'duration': 1278.0, + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~original?cb=1663848412907', + 'series': 'DRUCK', + 'series_id': 'funk-collection-funk-11790-1590', + 'season': 'Season 7', + 'season_number': 7, + 'episode': 'Episode 1', + 'episode_number': 1, 'timestamp': 1635520560, - 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~1920x1080?cb=1663848412907', 'upload_date': '20211029', - 'episode': 'Alles ist verzaubert', + '_old_archive_ids': ['zdf video_funk_1770473'], }, + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': 'das-geld-anderer-leute-100', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=1920x1080', + 'series': 'SOKO Stuttgart', + 'series_id': 'soko-stuttgart-104', + 'season': 'Season 11', + 'season_number': 11, + 'episode': 'Episode 10', + 'episode_number': 10, + 'timestamp': 1728983700, + 'upload_date': '20241015', + '_old_archive_ids': ['zdf 191205_1800_sendung_sok8'], + }, + }, { + 'url': 'https://www.zdf.de/serien/northern-lights/begegnung-auf-der-bruecke-100.html', + 'info_dict': { + 'id': 'begegnung-auf-der-bruecke-100', + 'ext': 'webm', + 'title': 'Begegnung auf der Brücke', + 'description': 'md5:e53a555da87447f7f1207f10353f8e45', + 'duration': 3083.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/c5ff1d1f-f5c8-4468-86ac-1b2f1dbecc76?layout=1920x1080', + 'series': 'Northern Lights', + 'series_id': 'northern-lights-100', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + 'timestamp': 1738546500, + 'upload_date': '20250203', + '_old_archive_ids': ['zdf 240319_2310_sendung_not'], + }, + 'params': {'skip_download': 'geo-restricted http format'}, + }, { + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'only_matching': True, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', @@ -349,155 +428,323 @@ class ZDFIE(ZDFBaseIE): 'only_matching': True, }, { 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', - 'info_dict': { - 'id': 'video_artede_083871-001-A', - 'ext': 'mp4', - 'title': 'Tödliche Flucht (1/6)', - 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', - 'duration': 3193.0, - 'timestamp': 1641355200, - 'upload_date': '20220105', - }, - 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"', - }, { - 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', - 'info_dict': { - 'id': '191205_1800_sendung_sok8', - 'ext': 'mp4', - 'title': 'Das Geld anderer Leute', - 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', - 'duration': 2581.0, - 'timestamp': 1728983700, - 'upload_date': '20241015', - 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', - 'series': 'SOKO Stuttgart', - 'series_id': 'f862ce9a-6dd1-4388-a698-22b36ac4c9e9', - 'season': 'Staffel 11', - 'season_number': 11, - 'season_id': 'ae1b4990-6d87-4970-a571-caccf1ba2879', - 'episode': 'Das Geld anderer Leute', - 'episode_number': 10, - 'episode_id': 'POS_7f367934-f2f0-45cb-9081-736781ff2d23', - }, + 'only_matching': True, }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/unser-gruener-planet-wuesten-doku-100.html', - 'info_dict': { - 'id': '220525_green_planet_makingof_1_tropen_tex', - 'ext': 'mp4', - 'title': 'Making-of Unser grüner Planet - Tropen', - 'description': 'md5:d7c6949dc7c75c73c4ad51c785fb0b79', - 'duration': 435.0, - 'timestamp': 1653811200, - 'upload_date': '20220529', - 'format_note': 'hd, main', - 'thumbnail': 'https://www.zdf.de/assets/unser-gruener-planet-making-of-1-tropen-100~3840x2160?cb=1653493335577', - 'episode': 'Making-of Unser grüner Planet - Tropen', - }, - 'skip': 'No longer available: "Leider kein Video verfügbar"', - }, { - 'url': 'https://www.zdf.de/serien/northern-lights/begegnung-auf-der-bruecke-100.html', - 'info_dict': { - 'id': '240319_2310_sendung_not', - 'ext': 'mp4', - 'title': 'Begegnung auf der Brücke', - 'description': 'md5:e53a555da87447f7f1207f10353f8e45', - 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/c5ff1d1f-f5c8-4468-86ac-1b2f1dbecc76?layout=2400x1350', - 'upload_date': '20250203', - 'duration': 3083.0, - 'timestamp': 1738546500, - 'series_id': '1d7a1879-01ee-4468-8237-c6b4ecd633c7', - 'series': 'Northern Lights', - 'season': 'Staffel 1', - 'season_number': 1, - 'season_id': '22ac26a2-4ea2-4055-ac0b-98b755cdf718', - 'episode': 'Begegnung auf der Brücke', - 'episode_number': 1, - 'episode_id': 'POS_71049438-024b-471f-b472-4fe2e490d1fb', - }, + 'only_matching': True, }] + _GRAPHQL_QUERY = ''' +query VideoByCanonical($canonical: String!) { + videoByCanonical(canonical: $canonical) { + canonical + title + leadParagraph + editorialDate + teaser { + description + image { + list + } + } + episodeInfo { + episodeNumber + seasonNumber + } + smartCollection { + canonical + title + } + currentMedia { + nodes { + ptmdTemplate + ... on VodMedia { + duration + aspectRatio + streamAnchorTags { + nodes { + anchorOffset + anchorLabel + } + } + vodMediaType + label + } + ... on LiveMedia { + start + stop + encryption + liveMediaType + label + } + id + } + } + } +} + ''' + + def _extract_ptmd(self, *args, **kwargs): + ptmd_data = super()._extract_ptmd(*args, **kwargs) + # This was the video id before the graphql redesign, other extractors still use it as such + old_archive_id = ptmd_data.pop('id') + ptmd_data['_old_archive_ids'] = [make_archive_id(self, old_archive_id)] + return ptmd_data + + # This fallback should generally only happen for pages under `zdf.de/nachrichten`. + # They are on a separate website for which GraphQL often doesn't return results. + # The API used here is no longer in use by official clients and likely deprecated. + # Long-term, news documents probably should use the API used by the mobile apps: + # https://zdf-prod-futura.zdf.de/news/documents/ (note 'news' vs 'mediathekV2') + def _extract_fallback(self, document_id): + video = self._download_json( + f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', + document_id, note='Downloading fallback metadata', + errnote='Failed to download fallback metadata') + document = video['document'] + + ptmd_url = traverse_obj(document, ( + ('streamApiUrlAndroid', ('streams', 0, 'streamApiUrlAndroid')), + {url_or_none}, any, {require('PTMD URL')})) + + thumbnails = [] + for thumbnail_key, thumbnail in traverse_obj(document, ('teaserBild', {dict.items}, ...)): + thumbnail_url = traverse_obj(thumbnail, ('url', {url_or_none})) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'thumbnails': thumbnails, + **traverse_obj(video, { + 'title': ('document', 'titel', {str}), + 'description': ('document', 'beschreibung', {str}), + 'timestamp': ( + (('document', 'date'), ('meta', 'editorialDate')), + {unified_timestamp}, any), + 'subtitles': ('document', 'captions', {self._extract_subtitles}), + }), + **self._extract_ptmd(ptmd_url, document_id, self._get_api_token()), + 'id': document_id, + } + def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_graphql(video_id, 'video metadata', body={ + 'operationName': 'VideoByCanonical', + 'query': self._GRAPHQL_QUERY, + 'variables': {'canonical': video_id}, + })['data']['videoByCanonical'] - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id, query={'profile': 'player-3'}) + if not video_data: + return self._extract_fallback(video_id) - return self._extract_mobile(video_id) + aspect_ratio = None + ptmd_urls = [] + for node in traverse_obj(video_data, ('currentMedia', 'nodes', lambda _, v: v['ptmdTemplate'])): + ptmd_url = self._expand_ptmd_template('https://api.zdf.de', node['ptmdTemplate']) + # Smuggle vod_media_type so that _extract_ptmd is aware of 'DGS' variants + if vod_media_type := node.get('vodMediaType'): + ptmd_url = smuggle_url(ptmd_url, {'vod_media_type': vod_media_type}) + ptmd_urls.append(ptmd_url) + if not aspect_ratio: + aspect_ratio = self._parse_aspect_ratio(node.get('aspectRatio')) + + return { + **traverse_obj(video_data, { + 'title': ('title', {str}), + 'description': (('leadParagraph', ('teaser', 'description')), any, {str}), + 'timestamp': ('editorialDate', {parse_iso8601}), + 'thumbnails': ('teaser', 'image', 'list', {self._extract_thumbnails}), + 'episode_number': ('episodeInfo', 'episodeNumber', {int_or_none}), + 'season_number': ('episodeInfo', 'seasonNumber', {int_or_none}), + 'series': ('smartCollection', 'title', {str}), + 'series_id': ('smartCollection', 'canonical', {str}), + 'chapters': ('currentMedia', 'nodes', 0, 'streamAnchorTags', 'nodes', {self._extract_chapters}), + }), + **self._extract_ptmd(ptmd_urls, video_id, self._get_api_token(), aspect_ratio), + 'id': video_id, + } class ZDFChannelIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P[^/?#]+)' + IE_NAME = 'zdf:channel' _TESTS = [{ + # Playlist, legacy URL before website redesign in 2025-03 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { - 'id': 'das-aktuelle-sportstudio', + 'id': 'das-aktuelle-sportstudio-220', 'title': 'das aktuelle sportstudio', + 'description': 'md5:e46c785324238a03edcf8b301c5fd5dc', }, - 'playlist_mincount': 18, + 'playlist_mincount': 25, }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e', + # Playlist, current URL + 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio-220', 'info_dict': { - 'id': 'planet-e', - 'title': 'planet e.', - 'description': 'md5:87e3b9c66a63cf1407ee443d2c4eb88e', + 'id': 'das-aktuelle-sportstudio-220', + 'title': 'das aktuelle sportstudio', + 'description': 'md5:e46c785324238a03edcf8b301c5fd5dc', }, - 'playlist_mincount': 50, + 'playlist_mincount': 25, + }, { + # Standalone video (i.e. not part of a playlist), collection URL + 'add_ie': [ZDFIE.ie_key()], + 'url': 'https://www.zdf.de/dokus/sylt---deutschlands-edles-nordlicht-movie-100', + 'info_dict': { + 'id': 'sylt-deutschlands-edles-nordlicht-100', + 'ext': 'mp4', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], + }, + 'params': {'skip_download': True}, }, { 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', 'info_dict': { - 'id': 'aktenzeichen-xy-ungeloest', + 'id': 'aktenzeichen-xy-ungeloest-110', 'title': 'Aktenzeichen XY... Ungelöst', - 'description': 'md5:623ede5819c400c6d04943fa8100e6e7', + 'description': 'md5:b79ac0d64b979e53cbe510c0ca9cb7be', }, 'playlist_mincount': 2, }, { 'url': 'https://www.zdf.de/serien/taunuskrimi/', - 'only_matching': True, + 'info_dict': { + 'id': 'taunuskrimi-100', + 'title': 'Taunuskrimi', + 'description': 'md5:ee7204e9c625c3b611d1274f9d0e3070', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://www.zdf.de/serien/taunuskrimi/?staffel=1', + 'info_dict': { + 'id': 'taunuskrimi-100-s1', + 'title': 'Taunuskrimi - Season 1', + 'description': 'md5:ee7204e9c625c3b611d1274f9d0e3070', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.zdf.de/magazine/heute-journal-104', + 'info_dict': { + 'id': 'heute-journal-104', + 'title': 'heute journal', + 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', + }, + 'playlist_mincount': 500, + }, { + 'url': 'https://www.zdf.de/magazine/heute-journal-104?staffel=2024', + 'info_dict': { + 'id': 'heute-journal-104-s2024', + 'title': 'heute journal - Season 2024', + 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', + }, + 'playlist_count': 242, }] + _PAGE_SIZE = 24 + @classmethod def suitable(cls, url): return False if ZDFIE.suitable(url) else super().suitable(url) - def _extract_entry(self, entry): - return self.url_result( - entry['sharingUrl'], ZDFIE, **traverse_obj(entry, { - 'id': ('basename', {str}), - 'title': ('titel', {str}), - 'description': ('beschreibung', {str}), - 'duration': ('length', {float_or_none}), - 'season_number': ('seasonNumber', {int_or_none}), - 'episode_number': ('episodeNumber', {int_or_none}), - })) + def _fetch_page(self, playlist_id, canonical_id, season_idx, season_number, page_number, cursor=None): + return self._download_graphql( + playlist_id, f'season {season_number} page {page_number} JSON', query={ + 'operationName': 'seasonByCanonical', + 'variables': json.dumps(filter_dict({ + 'seasonIndex': season_idx, + 'canonical': canonical_id, + 'episodesPageSize': self._PAGE_SIZE, + 'episodesAfter': cursor, + })), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': '9412a0f4ac55dc37d46975d461ec64bfd14380d815df843a1492348f77b5c99a', + }, + }), + })['data']['smartCollectionByCanonical'] - def _entries(self, data, document_id): - for entry in traverse_obj(data, ( - 'cluster', lambda _, v: v['type'] == 'teaser', - # If 'brandId' differs, it is a 'You might also like' video. Filter these out - 'teaser', lambda _, v: v['type'] == 'video' and v['brandId'] == document_id and v['sharingUrl'], - )): - yield self._extract_entry(entry) + def _entries(self, playlist_id, canonical_id, season_numbers, requested_season_number): + for season_idx, season_number in enumerate(season_numbers): + if requested_season_number is not None and requested_season_number != season_number: + continue + + cursor = None + for page_number in itertools.count(1): + page = self._fetch_page( + playlist_id, canonical_id, season_idx, season_number, page_number, cursor) + + nodes = traverse_obj(page, ('seasons', 'nodes', ...)) + + for episode in traverse_obj(nodes, ( + ..., 'episodes', 'nodes', lambda _, v: url_or_none(v['sharingUrl']), + )): + yield self.url_result( + episode['sharingUrl'], ZDFIE, + **traverse_obj(episode, { + 'id': ('canonical', {str}), + 'title': ('teaser', 'title', {str}), + 'description': (('leadParagraph', ('teaser', 'description')), any, {str}), + 'timestamp': ('editorialDate', {parse_iso8601}), + 'episode_number': ('episodeInfo', 'episodeNumber', {int_or_none}), + 'season_number': ('episodeInfo', 'seasonNumber', {int_or_none}), + })) + + page_info = traverse_obj(nodes, (-1, 'episodes', 'pageInfo', {dict})) or {} + if not page_info.get('hasNextPage') or not page_info.get('endCursor'): + break + cursor = page_info['endCursor'] def _real_extract(self, url): - channel_id = self._match_id(url) - webpage = self._download_webpage(url, channel_id) - document_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'document id', group='doc_id') - data = self._download_v2_doc(document_id) + canonical_id = self._match_id(url) + # Make sure to get the correct ID in case of redirects + urlh = self._request_webpage(url, canonical_id) + canonical_id = self._search_regex(self._VALID_URL, urlh.url, 'channel id', group='id') + season_number = traverse_obj(parse_qs(url), ('staffel', -1, {int_or_none})) + playlist_id = join_nonempty(canonical_id, season_number and f's{season_number}') - main_video = traverse_obj(data, ( - 'cluster', lambda _, v: v['type'] == 'teaserContent', - 'teaser', lambda _, v: v['type'] == 'video' and v['basename'] and v['sharingUrl'], any)) or {} + collection_data = self._download_graphql( + playlist_id, 'smart collection data', query={ + 'operationName': 'GetSmartCollectionByCanonical', + 'variables': json.dumps({ + 'canonical': canonical_id, + 'videoPageSize': 100, # Use max page size to get episodes from all seasons + }), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': 'cb49420e133bd668ad895a8cea0e65cba6aa11ac1cacb02341ff5cf32a17cd02', + }, + }), + })['data']['smartCollectionByCanonical'] + video_data = traverse_obj(collection_data, ('video', {dict})) or {} + season_numbers = traverse_obj(collection_data, ('seasons', 'seasons', ..., 'number', {int_or_none})) - if not self._yes_playlist(channel_id, main_video.get('basename')): - return self._extract_entry(main_video) + if not self._yes_playlist( + season_numbers and playlist_id, + url_or_none(video_data.get('sharingUrl')) and video_data.get('canonical'), + ): + return self.url_result(video_data['sharingUrl'], ZDFIE, video_data['canonical']) + + if season_number is not None and season_number not in season_numbers: + raise ExtractorError(f'Season {season_number} was not found in the collection data') return self.playlist_result( - self._entries(data, document_id), channel_id, - re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', self._og_search_title(webpage) or '')[0] or None, - join_nonempty( - 'headline', 'text', delim='\n\n', - from_dict=traverse_obj(data, ('shortText', {dict}), default={})) or None) + self._entries(playlist_id, canonical_id, season_numbers, season_number), + playlist_id, join_nonempty( + traverse_obj(collection_data, ('title', {str})), + season_number and f'Season {season_number}', delim=' - '), + traverse_obj(collection_data, ('infoText', {str}))) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index b59fb2c61..45aeffa22 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -590,39 +590,12 @@ def dict_item(key, val): return ret, True return ret, False - for m in re.finditer(rf'''(?x) - (?P\+\+|--)(?P{_NAME_RE})| - (?P{_NAME_RE})(?P\+\+|--)''', expr): - var = m.group('var1') or m.group('var2') - start, end = m.span() - sign = m.group('pre_sign') or m.group('post_sign') - ret = local_vars[var] - local_vars[var] += 1 if sign[0] == '+' else -1 - if m.group('pre_sign'): - ret = local_vars[var] - expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] - - if not expr: - return None, should_return - m = re.match(fr'''(?x) - (?P (?P{_NAME_RE})(?:\[(?P{_NESTED_BRACKETS})\])?\s* (?P{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? =(?!=)(?P.*)$ - )|(?P - (?!if|return|true|false|null|undefined|NaN)(?P{_NAME_RE})$ - )|(?P - (?P{_NAME_RE})(?: - (?P\?)?\.(?P[^(]+)| - \[(?P{_NESTED_BRACKETS})\] - )\s* - )|(?P - (?P{_NAME_RE})\[(?P.+)\]$ - )|(?P - (?P{_NAME_RE})\((?P.*)\)$ - )''', expr) - if m and m.group('assign'): + ''', expr) + if m: # We are assigning a value to a variable left_val = local_vars.get(m.group('out')) if not m.group('index'): @@ -640,7 +613,35 @@ def dict_item(key, val): m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion) return left_val[idx], should_return - elif expr.isdigit(): + for m in re.finditer(rf'''(?x) + (?P\+\+|--)(?P{_NAME_RE})| + (?P{_NAME_RE})(?P\+\+|--)''', expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] + local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] + + if not expr: + return None, should_return + + m = re.match(fr'''(?x) + (?P + (?!if|return|true|false|null|undefined|NaN)(?P{_NAME_RE})$ + )|(?P + (?P{_NAME_RE})(?: + (?P\?)?\.(?P[^(]+)| + \[(?P{_NESTED_BRACKETS})\] + )\s* + )|(?P + (?P{_NAME_RE})\[(?P.+)\]$ + )|(?P + (?P{_NAME_RE})\((?P.*)\)$ + )''', expr) + if expr.isdigit(): return int(expr), should_return elif expr == 'break': diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 1eaa0ee5f..39158a8cc 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -3,6 +3,7 @@ from .common import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index c800f2c09..747879da8 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -6,7 +6,8 @@ import re import urllib.parse -from ._helper import InstanceStoreMixin, select_proxy +from ._helper import InstanceStoreMixin +from ..utils.networking import select_proxy from .common import ( Features, Request, diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index b86d3606d..ef9c8bafa 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -13,7 +13,6 @@ from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket -from ..utils import format_field, traverse_obj if typing.TYPE_CHECKING: from collections.abc import Iterable @@ -82,19 +81,6 @@ def unquote_if_non_empty(s): } -def select_proxy(url, proxies): - """Unified proxy selector for all backends""" - url_components = urllib.parse.urlparse(url) - if 'no' in proxies: - hostport = url_components.hostname + format_field(url_components.port, None, ':%s') - if urllib.request.proxy_bypass_environment(hostport, {'no': proxies['no']}): - return - elif urllib.request.proxy_bypass(hostport): # check system settings - return - - return traverse_obj(proxies, url_components.scheme or 'http', 'all') - - def get_redirect_method(method, status): """Unified redirect method handling""" diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 5b6b264a6..d02e976b5 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -10,7 +10,7 @@ from ..dependencies import brotli, requests, urllib3 from ..utils import bug_reports_message, int_or_none, variadic -from ..utils.networking import normalize_url +from ..utils.networking import normalize_url, select_proxy if requests is None: raise ImportError('requests module is not installed') @@ -41,7 +41,6 @@ create_socks_proxy_socket, get_redirect_method, make_socks_proxy_opts, - select_proxy, ) from .common import ( Features, diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index a188b35f5..cb7a430bb 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -26,7 +26,6 @@ create_socks_proxy_socket, get_redirect_method, make_socks_proxy_opts, - select_proxy, ) from .common import Features, RequestHandler, Response, register_rh from .exceptions import ( @@ -41,7 +40,7 @@ from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError from ..utils import update_url_query -from ..utils.networking import normalize_url +from ..utils.networking import normalize_url, select_proxy SUPPORTED_ENCODINGS = ['gzip', 'deflate'] CONTENT_DECODE_ERRORS = [zlib.error, OSError] diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index d29f8e45a..fd8730ac7 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -11,8 +11,8 @@ create_connection, create_socks_proxy_socket, make_socks_proxy_opts, - select_proxy, ) +from ..utils.networking import select_proxy from .common import Features, Response, register_rh from .exceptions import ( CertificateVerifyError, diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index ddceaa9a9..e33769422 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -505,6 +505,7 @@ def copy(self): HEADRequest = functools.partial(Request, method='HEAD') +PATCHRequest = functools.partial(Request, method='PATCH') PUTRequest = functools.partial(Request, method='PUT') diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 1742cbdfa..b4d3d4d66 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -150,6 +150,15 @@ def format_option_strings(option): return opts +_PRESET_ALIASES = { + 'mp3': ['-f', 'ba[acodec^=mp3]/ba/b', '-x', '--audio-format', 'mp3'], + 'aac': ['-f', 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b', '-x', '--audio-format', 'aac'], + 'mp4': ['--merge-output-format', 'mp4', '--remux-video', 'mp4', '-S', 'vcodec:h264,lang,quality,res,fps,hdr:12,acodec:aac'], + 'mkv': ['--merge-output-format', 'mkv', '--remux-video', 'mkv'], + 'sleep': ['--sleep-subtitles', '5', '--sleep-requests', '0.75', '--sleep-interval', '10', '--max-sleep-interval', '20'], +} + + class _YoutubeDLOptionParser(optparse.OptionParser): # optparse is deprecated since Python 3.2. So assume a stable interface even for private methods ALIAS_DEST = '_triggered_aliases' @@ -215,6 +224,25 @@ def _match_long_opt(self, opt): return e.possibilities[0] raise + def format_option_help(self, formatter=None): + assert formatter, 'Formatter can not be None' + formatted_help = super().format_option_help(formatter=formatter) + formatter.indent() + heading = formatter.format_heading('Preset Aliases') + formatter.indent() + description = formatter.format_description( + 'Predefined aliases for convenience and ease of use. Note that future versions of yt-dlp ' + 'may add or adjust presets, but the existing preset names will not be changed or removed') + result = [] + for name, args in _PRESET_ALIASES.items(): + option = optparse.Option('-t', help=shlex.join(args)) + formatter.option_strings[option] = f'-t {name}' + result.append(formatter.format_option(option)) + formatter.dedent() + formatter.dedent() + help_lines = '\n'.join(result) + return f'{formatted_help}\n{heading}{description}\n{help_lines}' + def create_parser(): def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): @@ -317,6 +345,13 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): parser.rargs[:0] = shlex.split( opts if value is None else opts.format(*map(shlex.quote, value))) + def _preset_alias_callback(option, opt_str, value, parser): + if not value: + return + if value not in _PRESET_ALIASES: + raise optparse.OptionValueError(f'Unknown preset alias: {value}') + parser.rargs[:0] = _PRESET_ALIASES[value] + general = optparse.OptionGroup(parser, 'General Options') general.add_option( '-h', '--help', dest='print_help', action='store_true', @@ -438,7 +473,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): general.add_option( '--live-from-start', action='store_true', dest='live_from_start', - help='Download livestreams from the start. Currently only supported for YouTube (Experimental)') + help='Download livestreams from the start. Currently experimental and only supported for YouTube and Twitch') general.add_option( '--no-live-from-start', action='store_false', dest='live_from_start', @@ -513,12 +548,21 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=( 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". ' 'Arguments are parsed according to the Python string formatting mini-language. ' - 'E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' + 'E.g. --alias get-audio,-X "-S aext:{0},abr -x --audio-format {0}" creates options ' '"--get-audio" and "-X" that takes an argument (ARG0) and expands to ' - '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' + '"-S aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. ' 'This option can be used multiple times')) + general.add_option( + '-t', '--preset-alias', + metavar='PRESET', dest='_', type='str', + action='callback', callback=_preset_alias_callback, + help=( + 'Applies a predefined set of options. e.g. --preset-alias mp3. ' + f'The following presets are available: {", ".join(_PRESET_ALIASES)}. ' + 'See the "Preset Aliases" section at the end for more info. ' + 'This option can be used multiple times')) network = optparse.OptionGroup(parser, 'Network Options') network.add_option( diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 8e887ec03..de289cb78 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -202,7 +202,7 @@ class UpdateInfo: requested_version: str | None = None commit: str | None = None - binary_name: str | None = _get_binary_name() # noqa: RUF009: Always returns the same value + binary_name: str | None = _get_binary_name() # noqa: RUF009 # Always returns the same value checksum: str | None = None diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 99d725087..20aa341ca 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -54,7 +54,7 @@ from ..dependencies import xattr from ..globals import IN_CLI -__name__ = __name__.rsplit('.', 1)[0] # noqa: A001: Pretend to be the parent module +__name__ = __name__.rsplit('.', 1)[0] # noqa: A001 # Pretend to be the parent module class NO_DEFAULT: diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index 542abace8..9fcab6456 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -10,7 +10,8 @@ if typing.TYPE_CHECKING: T = typing.TypeVar('T') -from ._utils import NO_DEFAULT, remove_start +from ._utils import NO_DEFAULT, remove_start, format_field +from .traversal import traverse_obj def random_user_agent(): @@ -278,3 +279,16 @@ def normalize_url(url): query=escape_rfc3986(url_parsed.query), fragment=escape_rfc3986(url_parsed.fragment), ).geturl() + + +def select_proxy(url, proxies): + """Unified proxy selector for all backends""" + url_components = urllib.parse.urlparse(url) + if 'no' in proxies: + hostport = url_components.hostname + format_field(url_components.port, None, ':%s') + if urllib.request.proxy_bypass_environment(hostport, {'no': proxies['no']}): + return + elif urllib.request.proxy_bypass(hostport): # check system settings + return + + return traverse_obj(proxies, url_components.scheme or 'http', 'all') diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 1479cd960..c375cc6ad 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.03.31' +__version__ = '2025.05.22' -RELEASE_GIT_HEAD = '5e457af57fae9645b1b8fa0ed689229c8fb9656b' +RELEASE_GIT_HEAD = '7977b329ed97b216e37bd402f4935f28c00eac9e' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.03.31' +_pkg_version = '2025.05.22'