diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a211ae1652..e2411ecfad 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -192,7 +192,7 @@ jobs: with: path: ./repo - name: Virtualized Install, Prepare & Build - uses: yt-dlp/run-on-arch-action@v2 + uses: yt-dlp/run-on-arch-action@v3 with: # Ref: https://github.com/uraimo/run-on-arch-action/issues/55 env: | @@ -256,7 +256,7 @@ jobs: with: path: | ~/yt-dlp-build-venv - key: cache-reqs-${{ github.job }} + key: cache-reqs-${{ github.job }}-${{ github.ref }} - name: Install Requirements run: | @@ -331,19 +331,16 @@ jobs: if: steps.restore-cache.outputs.cache-hit == 'true' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - cache_key: cache-reqs-${{ github.job }} - repository: ${{ github.repository }} - branch: ${{ github.ref }} + cache_key: cache-reqs-${{ github.job }}-${{ github.ref }} run: | - gh extension install actions/gh-actions-cache - gh actions-cache delete "${cache_key}" -R "${repository}" -B "${branch}" --confirm + gh cache delete "${cache_key}" - name: Cache requirements uses: actions/cache/save@v4 with: path: | ~/yt-dlp-build-venv - key: cache-reqs-${{ github.job }} + key: cache-reqs-${{ github.job }}-${{ github.ref }} macos_legacy: needs: process @@ -411,7 +408,7 @@ jobs: run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.11.1-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.13.0-py3-none-any.whl" - name: Prepare run: | @@ -460,7 +457,7 @@ jobs: run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.11.1-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.13.0-py3-none-any.whl" - name: Prepare run: | diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 9a4342a585..dd2c6f481e 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -6,7 +6,7 @@ on: - devscripts/** - test/** - yt_dlp/**.py - - '!yt_dlp/extractor/*.py' + - '!yt_dlp/extractor/**.py' - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py @@ -16,7 +16,7 @@ on: - devscripts/** - test/** - yt_dlp/**.py - - '!yt_dlp/extractor/*.py' + - '!yt_dlp/extractor/**.py' - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 1a32bbfe31..8a7b24033f 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -38,3 +38,5 @@ jobs: run: ruff check --output-format github . - name: Run autopep8 run: autopep8 --diff . + - name: Check file mode + run: git ls-files --format="%(objectmode) %(path)" yt_dlp/ | ( ! 
grep -v "^100644" ) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 0b90c0e6cc..6aa52c5958 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -742,3 +742,36 @@ lfavole mp3butcher slipinthedove YoshiTabletopGamer +Arc8ne +benfaerber +chrisellsworth +fries1234 +Kenshin9977 +MichaelDeBoey +msikma +pedro +pferreir +red-acid +refack +rysson +somini +thedenv +vallovic +arabcoders +mireq +mlabeeb03 +1271 +CasperMcFadden95 +Kicer86 +Kiritomo +leeblackc +meGAmeS1 +NeonMan +pj47x +troex +WouterGordts +baierjan +GeoffreyFrogeye +Pawka +v3DJG6GL +yozel diff --git a/Changelog.md b/Changelog.md index 9c544feb92..80b72da05a 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,267 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.05.22 + +#### Core changes +- **cookies**: [Fix Linux desktop environment detection](https://github.com/yt-dlp/yt-dlp/commit/e491fd4d090db3af52a82863fb0553dd5e17fb85) ([#13197](https://github.com/yt-dlp/yt-dlp/issues/13197)) by [mbway](https://github.com/mbway) +- **jsinterp**: [Fix increment/decrement evaluation](https://github.com/yt-dlp/yt-dlp/commit/167d7a9f0ffd1b4fe600193441bdb7358db2740b) ([#13238](https://github.com/yt-dlp/yt-dlp/issues/13238)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **1tv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/41c0a1fb89628696f8bb88e2b9f3a68f355b8c26) ([#13168](https://github.com/yt-dlp/yt-dlp/issues/13168)) by [bashonly](https://github.com/bashonly) +- **amcnetworks**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/464c84fedf78eef822a431361155f108b5df96d7) ([#13147](https://github.com/yt-dlp/yt-dlp/issues/13147)) by [bashonly](https://github.com/bashonly) +- **bitchute**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1d0f6539c47e5d5c68c3c47cdb7075339e2885ac) ([#13081](https://github.com/yt-dlp/yt-dlp/issues/13081)) by [bashonly](https://github.com/bashonly) +- **cartoonnetwork**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/7dbb47f84f0ee1266a3a01f58c9bc4c76d76794a) ([#13148](https://github.com/yt-dlp/yt-dlp/issues/13148)) by [bashonly](https://github.com/bashonly) +- **iprima**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/a7d9a5eb79ceeecb851389f3f2c88597871ca3f2) ([#12937](https://github.com/yt-dlp/yt-dlp/issues/12937)) by [baierjan](https://github.com/baierjan) +- **jiosaavn** + - artist: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/586b557b124f954d3f625360ebe970989022ad97) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) + - playlist, show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/317f4b8006c2c0f0f64f095b1485163ad97c9053) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) + - show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6839276496d8814cf16f58b637e45663467928e6) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) +- **lrtradio**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/abf58dcd6a09e14eec4ea82ae12f79a0337cb383) ([#13200](https://github.com/yt-dlp/yt-dlp/issues/13200)) by [Pawka](https://github.com/Pawka) +- **nebula**: [Support `--mark-watched`](https://github.com/yt-dlp/yt-dlp/commit/20f288bdc2173c7cc58d709d25ca193c1f6001e7) ([#13120](https://github.com/yt-dlp/yt-dlp/issues/13120)) by 
[GeoffreyFrogeye](https://github.com/GeoffreyFrogeye) +- **niconico** + - [Fix error handling](https://github.com/yt-dlp/yt-dlp/commit/f569be4602c2a857087e495d5d7ed6060cd97abe) ([#13236](https://github.com/yt-dlp/yt-dlp/issues/13236)) by [bashonly](https://github.com/bashonly) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7a7b85c9014d96421e18aa7ea5f4c1bee5ceece0) ([#13045](https://github.com/yt-dlp/yt-dlp/issues/13045)) by [doe1080](https://github.com/doe1080) +- **nytimesarticle**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/b26bc32579c00ef579d75a835807ccc87d20ee0a) ([#13104](https://github.com/yt-dlp/yt-dlp/issues/13104)) by [bashonly](https://github.com/bashonly) +- **once**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/f475e8b529d18efdad603ffda02a56e707fe0e2c) ([#13164](https://github.com/yt-dlp/yt-dlp/issues/13164)) by [bashonly](https://github.com/bashonly) +- **picarto**: vod: [Support `/profile/` video URLs](https://github.com/yt-dlp/yt-dlp/commit/31e090cb787f3504ec25485adff9a2a51d056734) ([#13227](https://github.com/yt-dlp/yt-dlp/issues/13227)) by [subrat-lima](https://github.com/subrat-lima) +- **playsuisse**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/d880e060803ae8ed5a047e578cca01e1f0e630ce) ([#12466](https://github.com/yt-dlp/yt-dlp/issues/12466)) by [v3DJG6GL](https://github.com/v3DJG6GL) +- **sprout**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/cbcfe6378dde33a650e3852ab17ad4503b8e008d) ([#13149](https://github.com/yt-dlp/yt-dlp/issues/13149)) by [bashonly](https://github.com/bashonly) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ea8498ed534642dd7e925961b97b934987142fd3) ([#12957](https://github.com/yt-dlp/yt-dlp/issues/12957)) by [diman8](https://github.com/diman8) +- **twitch**: [Support `--live-from-start`](https://github.com/yt-dlp/yt-dlp/commit/00b1bec55249cf2ad6271d36492c51b34b6459d1) ([#13202](https://github.com/yt-dlp/yt-dlp/issues/13202)) by [bashonly](https://github.com/bashonly) +- **vimeo**: event: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/545c1a5b6f2fe88722b41aef0e7485bf3be3f3f9) ([#13216](https://github.com/yt-dlp/yt-dlp/issues/13216)) by [bashonly](https://github.com/bashonly) +- **wat.tv**: [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/f123cc83b3aea45053f5fa1d9141048b01fc2774) ([#13111](https://github.com/yt-dlp/yt-dlp/issues/13111)) by [bashonly](https://github.com/bashonly) +- **weverse**: [Fix live extraction](https://github.com/yt-dlp/yt-dlp/commit/5328eda8820cc5f21dcf917684d23fbdca41831d) ([#13084](https://github.com/yt-dlp/yt-dlp/issues/13084)) by [bashonly](https://github.com/bashonly) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/83fabf352489d52843f67e6e9cc752db86d27e6e) ([#13245](https://github.com/yt-dlp/yt-dlp/issues/13245)) by [garret1317](https://github.com/garret1317) +- **youtube** + - [Add PO token support for subtitles](https://github.com/yt-dlp/yt-dlp/commit/32ed5f107c6c641958d1cd2752e130de4db55a13) ([#13234](https://github.com/yt-dlp/yt-dlp/issues/13234)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz) + - [Add `web_embedded` client for age-restricted videos](https://github.com/yt-dlp/yt-dlp/commit/0feec6dc131f488428bf881519e7c69766fbb9ae) ([#13089](https://github.com/yt-dlp/yt-dlp/issues/13089)) by [bashonly](https://github.com/bashonly) + - [Add a PO Token Provider 
Framework](https://github.com/yt-dlp/yt-dlp/commit/2685654a37141cca63eda3a92da0e2706e23ccfd) ([#12840](https://github.com/yt-dlp/yt-dlp/issues/12840)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract `media_type` for all videos](https://github.com/yt-dlp/yt-dlp/commit/ded11ebc9afba6ba33923375103e9be2d7c804e7) ([#13136](https://github.com/yt-dlp/yt-dlp/issues/13136)) by [bashonly](https://github.com/bashonly) + - [Fix `--live-from-start` support for premieres](https://github.com/yt-dlp/yt-dlp/commit/8f303afb43395be360cafd7ad4ce2b6e2eedfb8a) ([#13079](https://github.com/yt-dlp/yt-dlp/issues/13079)) by [arabcoders](https://github.com/arabcoders) + - [Fix geo-restriction error handling](https://github.com/yt-dlp/yt-dlp/commit/c7e575e31608c19c5b26c10a4229db89db5fc9a8) ([#13217](https://github.com/yt-dlp/yt-dlp/issues/13217)) by [yozel](https://github.com/yozel) + +#### Misc. changes +- **build** + - [Bump PyInstaller to v6.13.0](https://github.com/yt-dlp/yt-dlp/commit/17cf9088d0d535e4a7feffbf02bd49cd9dae5ab9) ([#13082](https://github.com/yt-dlp/yt-dlp/issues/13082)) by [bashonly](https://github.com/bashonly) + - [Bump run-on-arch-action to v3](https://github.com/yt-dlp/yt-dlp/commit/9064d2482d1fe722bbb4a49731fe0711c410d1c8) ([#13088](https://github.com/yt-dlp/yt-dlp/issues/13088)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [7977b32](https://github.com/yt-dlp/yt-dlp/commit/7977b329ed97b216e37bd402f4935f28c00eac9e) by [bashonly](https://github.com/bashonly) + +### 2025.04.30 + +#### Important changes +- **New option `--preset-alias`/`-t` has been added** +This provides convenient predefined aliases for common use cases. Available presets include `mp4`, `mp3`, `mkv`, `aac`, and `sleep`. See [the README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#preset-aliases) for more details. 
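A quick illustration of the new option (an editorial sketch, not part of this patch; the URL below is a placeholder): per the "Preset Aliases" table added to the README later in this diff, `-t mp3` expands to the equivalent explicit options:

```sh
# Equivalent invocations: the preset form and its expansion
yt-dlp -t mp3 'https://example.com/some-video'
yt-dlp -f 'ba[acodec^=mp3]/ba/b' -x --audio-format mp3 'https://example.com/some-video'
```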
+ +#### Core changes +- [Add `--preset-alias` option](https://github.com/yt-dlp/yt-dlp/commit/88eb1e7a9a2720ac89d653c0d0e40292388823bb) ([#12839](https://github.com/yt-dlp/yt-dlp/issues/12839)) by [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **utils** + - `_yield_json_ld`: [Make function less fatal](https://github.com/yt-dlp/yt-dlp/commit/45f01de00e1bc076b7f676a669736326178647b1) ([#12855](https://github.com/yt-dlp/yt-dlp/issues/12855)) by [seproDev](https://github.com/seproDev) + - `url_or_none`: [Support WebSocket URLs](https://github.com/yt-dlp/yt-dlp/commit/a473e592337edb8ca40cde52c1fcaee261c54df9) ([#12848](https://github.com/yt-dlp/yt-dlp/issues/12848)) by [doe1080](https://github.com/doe1080) + +#### Extractor changes +- **abematv**: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/f5736bb35bde62348caebf7b188668655e316deb) ([#12859](https://github.com/yt-dlp/yt-dlp/issues/12859)) by [Kiritomo](https://github.com/Kiritomo) +- **atresplayer**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/839d64325356310e6de6cd9cad28fb546619ca63) ([#11424](https://github.com/yt-dlp/yt-dlp/issues/11424)) by [meGAmeS1](https://github.com/meGAmeS1), [seproDev](https://github.com/seproDev) +- **bpb**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/80736b9c90818adee933a155079b8535bc06819f) ([#13015](https://github.com/yt-dlp/yt-dlp/issues/13015)) by [bashonly](https://github.com/bashonly) +- **cda**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/9032f981362ea0be90626fab51ec37934feded6d) ([#12975](https://github.com/yt-dlp/yt-dlp/issues/12975)) by [bashonly](https://github.com/bashonly) +- **cdafolder**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/cb271d445bc2d866c9a3404b1d8f59bcb77447df) ([#12919](https://github.com/yt-dlp/yt-dlp/issues/12919)) by [fireattack](https://github.com/fireattack), [Kicer86](https://github.com/Kicer86) +- **crowdbunker**: [Make format extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/4ebf41309d04a6e196944f1c0f5f0154cff0055a) ([#12836](https://github.com/yt-dlp/yt-dlp/issues/12836)) by [seproDev](https://github.com/seproDev) +- **dacast**: [Support tokenized URLs](https://github.com/yt-dlp/yt-dlp/commit/e7e3b7a55c456da4a5a812b4fefce4dce8e6a616) ([#12979](https://github.com/yt-dlp/yt-dlp/issues/12979)) by [bashonly](https://github.com/bashonly) +- **dzen.ru**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/a3f2b54c2535d862de6efa9cfaa6ca9a2b2f7dd6) ([#12852](https://github.com/yt-dlp/yt-dlp/issues/12852)) by [seproDev](https://github.com/seproDev) +- **generic**: [Fix MPD extraction for `file://` URLs](https://github.com/yt-dlp/yt-dlp/commit/34a061a295d156934417c67ee98070b94943006b) ([#12978](https://github.com/yt-dlp/yt-dlp/issues/12978)) by [bashonly](https://github.com/bashonly) +- **getcourseru**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/741fd809bc4d301c19b53877692ae510334a6750) ([#12943](https://github.com/yt-dlp/yt-dlp/issues/12943)) by [troex](https://github.com/troex) +- **ivoox**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/7faa18b83dcfc74a1a1e2034e6b0369c495ca645) ([#12768](https://github.com/yt-dlp/yt-dlp/issues/12768)) by [NeonMan](https://github.com/NeonMan), [seproDev](https://github.com/seproDev) +- **kika**: [Add playlist extractor](https://github.com/yt-dlp/yt-dlp/commit/3c1c75ecb8ab352f422b59af46fff2be992e4115) ([#12832](https://github.com/yt-dlp/yt-dlp/issues/12832)) by 
[1100101](https://github.com/1100101) +- **linkedin** + - [Support feed URLs](https://github.com/yt-dlp/yt-dlp/commit/73a26f9ee68610e33c0b4407b77355f2ab7afd0e) ([#12927](https://github.com/yt-dlp/yt-dlp/issues/12927)) by [seproDev](https://github.com/seproDev) + - events: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37ff4de5baf4e4e70c6a0ec34e136a279ad20af) ([#12926](https://github.com/yt-dlp/yt-dlp/issues/12926)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **loco**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f5a37ea40e20865b976ffeeff13eeae60292eb23) ([#12934](https://github.com/yt-dlp/yt-dlp/issues/12934)) by [seproDev](https://github.com/seproDev) +- **lrtradio**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/74e90dd9b8f9c1a5c48a2515126654f4d398d687) ([#12801](https://github.com/yt-dlp/yt-dlp/issues/12801)) by [subrat-lima](https://github.com/subrat-lima) +- **manyvids**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/77aa15e98f34c4ad425aabf39dd1ee37b48f772c) ([#10907](https://github.com/yt-dlp/yt-dlp/issues/10907)) by [pj47x](https://github.com/pj47x) +- **mixcloud**: [Refactor extractor](https://github.com/yt-dlp/yt-dlp/commit/db6d1f145ad583e0220637726029f8f2fa6200a0) ([#12830](https://github.com/yt-dlp/yt-dlp/issues/12830)) by [seproDev](https://github.com/seproDev), [WouterGordts](https://github.com/WouterGordts) +- **mlbtv**: [Fix device ID caching](https://github.com/yt-dlp/yt-dlp/commit/36da6360e130197df927ee93409519ce3f4075f5) ([#12980](https://github.com/yt-dlp/yt-dlp/issues/12980)) by [bashonly](https://github.com/bashonly) +- **niconico** + - [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/25cd7c1ecbb6cbf21dd3a6e59608e4af94715ecc) ([#13008](https://github.com/yt-dlp/yt-dlp/issues/13008)) by [doe1080](https://github.com/doe1080) + - [Remove DMC formats support](https://github.com/yt-dlp/yt-dlp/commit/7d05aa99c65352feae1cd9a3ff8784b64bfe382a) ([#12916](https://github.com/yt-dlp/yt-dlp/issues/12916)) by [doe1080](https://github.com/doe1080) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1d45e30537bf83e069184a440703e4c43b2e0198) ([#12809](https://github.com/yt-dlp/yt-dlp/issues/12809)) by [Snack-X](https://github.com/Snack-X) +- **panopto**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/9d26daa04ad5108257bc5e30f7f040c7f1fe7a5a) ([#12925](https://github.com/yt-dlp/yt-dlp/issues/12925)) by [seproDev](https://github.com/seproDev) +- **parti**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/425017531fbc3369becb5a44013e26f26efabf45) ([#12769](https://github.com/yt-dlp/yt-dlp/issues/12769)) by [benfaerber](https://github.com/benfaerber) +- **raiplay**: [Fix DRM detection](https://github.com/yt-dlp/yt-dlp/commit/dce82346245e35a46fda836ca2089805d2347935) ([#12971](https://github.com/yt-dlp/yt-dlp/issues/12971)) by [DTrombett](https://github.com/DTrombett) +- **reddit**: [Support `--ignore-no-formats-error`](https://github.com/yt-dlp/yt-dlp/commit/28f04e8a5e383ff531db646190b4be45554610d6) ([#12993](https://github.com/yt-dlp/yt-dlp/issues/12993)) by [bashonly](https://github.com/bashonly) +- **royalive**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e1847535e28788414a25546a45bebcada2f34558) ([#12817](https://github.com/yt-dlp/yt-dlp/issues/12817)) by [CasperMcFadden95](https://github.com/CasperMcFadden95) +- **rtve**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/f07ee91c71920ab1187a7ea756720e81aa406a9d) 
([#10388](https://github.com/yt-dlp/yt-dlp/issues/10388)) by [meGAmeS1](https://github.com/meGAmeS1), [seproDev](https://github.com/seproDev) +- **rumble**: [Improve format extraction](https://github.com/yt-dlp/yt-dlp/commit/58d0c83457b93b3c9a81eb6bc5a4c65f25e949df) ([#12838](https://github.com/yt-dlp/yt-dlp/issues/12838)) by [seproDev](https://github.com/seproDev) +- **tokfmpodcast**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/91832111a12d87499294a0f430829b8c2254c339) ([#12842](https://github.com/yt-dlp/yt-dlp/issues/12842)) by [selfisekai](https://github.com/selfisekai) +- **tv2dk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a3e91df30a45943f40759d2c1e0b6c2ca4b2a263) ([#12945](https://github.com/yt-dlp/yt-dlp/issues/12945)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **tvp**: vod: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/4e69a626cce51428bc1d66dc606a56d9498b03a5) ([#12923](https://github.com/yt-dlp/yt-dlp/issues/12923)) by [seproDev](https://github.com/seproDev) +- **tvw**: tvchannels: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ed8ad1b4d6b9d7a1426ff5192ff924f3371e4721) ([#12721](https://github.com/yt-dlp/yt-dlp/issues/12721)) by [fries1234](https://github.com/fries1234) +- **twitcasting**: [Fix livestream extraction](https://github.com/yt-dlp/yt-dlp/commit/de271a06fd6d20d4f55597ff7f90e4d913de0a52) ([#12977](https://github.com/yt-dlp/yt-dlp/issues/12977)) by [bashonly](https://github.com/bashonly) +- **twitch**: clips: [Fix uploader metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/1ae6bff564a65af41e94f1a4727892471ecdd05a) ([#13022](https://github.com/yt-dlp/yt-dlp/issues/13022)) by [1271](https://github.com/1271) +- **twitter** + - [Fix extraction when logged-in](https://github.com/yt-dlp/yt-dlp/commit/1cf39ddf3d10b6512daa7dd139e5f6c0dc548bbc) ([#13024](https://github.com/yt-dlp/yt-dlp/issues/13024)) by [bashonly](https://github.com/bashonly) + - spaces: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/70599e53b736bb75922b737e6e0d4f76e419bb20) ([#12911](https://github.com/yt-dlp/yt-dlp/issues/12911)) by [doe1080](https://github.com/doe1080) +- **vimeo**: [Extract from mobile API](https://github.com/yt-dlp/yt-dlp/commit/22ac81a0692019ac833cf282e4ef99718e9ef3fa) ([#13034](https://github.com/yt-dlp/yt-dlp/issues/13034)) by [bashonly](https://github.com/bashonly) +- **vk** + - [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/5361a7c6e2933c919716e0cb1e3116c28c40419f) ([#12821](https://github.com/yt-dlp/yt-dlp/issues/12821)) by [seproDev](https://github.com/seproDev) + - [Fix uploader extraction](https://github.com/yt-dlp/yt-dlp/commit/2381881fe58a723853350a6ab750a5efc9f10c85) ([#12985](https://github.com/yt-dlp/yt-dlp/issues/12985)) by [seproDev](https://github.com/seproDev) +- **youtube** + - [Add context to video request rate limit error](https://github.com/yt-dlp/yt-dlp/commit/26feac3dd142536ad08ad1ed731378cb88e63602) ([#12958](https://github.com/yt-dlp/yt-dlp/issues/12958)) by [coletdjnz](https://github.com/coletdjnz) + - [Add extractor arg to skip "initial_data" request](https://github.com/yt-dlp/yt-dlp/commit/ed6c6d7eefbc78fa72e4e60ad6edaa3ee2acc715) ([#12865](https://github.com/yt-dlp/yt-dlp/issues/12865)) by [leeblackc](https://github.com/leeblackc) + - [Add warning on video captcha challenge](https://github.com/yt-dlp/yt-dlp/commit/f484c51599a6cd01eb078ea7dc9bbba942967774) 
([#12939](https://github.com/yt-dlp/yt-dlp/issues/12939)) by [coletdjnz](https://github.com/coletdjnz) + - [Cache signature timestamps](https://github.com/yt-dlp/yt-dlp/commit/61c9a938b390b8334ee3a879fe2d93f714e30138) ([#13047](https://github.com/yt-dlp/yt-dlp/issues/13047)) by [bashonly](https://github.com/bashonly) + - [Detect and warn when account cookies are rotated](https://github.com/yt-dlp/yt-dlp/commit/8cb08028f5be2acb9835ce1670b196b9b077052f) ([#13014](https://github.com/yt-dlp/yt-dlp/issues/13014)) by [coletdjnz](https://github.com/coletdjnz) + - [Detect player JS variants for any locale](https://github.com/yt-dlp/yt-dlp/commit/c2d6659d1069f8cff97e1fd61d1c59e949e1e63d) ([#13003](https://github.com/yt-dlp/yt-dlp/issues/13003)) by [bashonly](https://github.com/bashonly) + - [Do not strictly deprioritize `missing_pot` formats](https://github.com/yt-dlp/yt-dlp/commit/74fc2ae12c24eb6b4e02c6360c89bd05f3c8f740) ([#13061](https://github.com/yt-dlp/yt-dlp/issues/13061)) by [bashonly](https://github.com/bashonly) + - [Improve warning for SABR-only/SSAP player responses](https://github.com/yt-dlp/yt-dlp/commit/fd8394bc50301ac5e930aa65aa71ab1b8372b8ab) ([#13049](https://github.com/yt-dlp/yt-dlp/issues/13049)) by [bashonly](https://github.com/bashonly) + - tab: [Extract continuation from empty page](https://github.com/yt-dlp/yt-dlp/commit/72ba4879304c2082fecbb472e6cc05ee2d154a3b) ([#12938](https://github.com/yt-dlp/yt-dlp/issues/12938)) by [coletdjnz](https://github.com/coletdjnz) +- **zdf**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/7be14109a6bd493a2e881da4f9e30adaf3e7e5d5) ([#12779](https://github.com/yt-dlp/yt-dlp/issues/12779)) by [bashonly](https://github.com/bashonly), [InvalidUsernameException](https://github.com/InvalidUsernameException) + +#### Downloader changes +- **niconicodmc**: [Remove downloader](https://github.com/yt-dlp/yt-dlp/commit/8d127b18f81131453eaba05d3bb810d9b73adb75) ([#12916](https://github.com/yt-dlp/yt-dlp/issues/12916)) by [doe1080](https://github.com/doe1080) + +#### Networking changes +- [Add PATCH request shortcut](https://github.com/yt-dlp/yt-dlp/commit/ceab4d5ed63a1f135a1816fe967c9d9a1ec7e6e8) ([#12884](https://github.com/yt-dlp/yt-dlp/issues/12884)) by [doe1080](https://github.com/doe1080) + +#### Misc. 
changes +- **ci**: [Add file mode test to code check](https://github.com/yt-dlp/yt-dlp/commit/3690e91265d1d0bbeffaf6a9b8cc9baded1367bd) ([#13036](https://github.com/yt-dlp/yt-dlp/issues/13036)) by [Grub4K](https://github.com/Grub4K) +- **cleanup**: Miscellaneous: [505b400](https://github.com/yt-dlp/yt-dlp/commit/505b400795af557bdcfd9d4fa7e9133b26ef431c) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +### 2025.03.31 + +#### Core changes +- [Add `--compat-options 2024`](https://github.com/yt-dlp/yt-dlp/commit/22e34adbd741e1c7072015debd615dc3fb71c401) ([#12789](https://github.com/yt-dlp/yt-dlp/issues/12789)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **francaisfacile**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bb321cfdc3fd4400598ddb12a15862bc2ac8fc10) ([#12787](https://github.com/yt-dlp/yt-dlp/issues/12787)) by [mlabeeb03](https://github.com/mlabeeb03) +- **generic**: [Validate response before checking m3u8 live status](https://github.com/yt-dlp/yt-dlp/commit/9a1ec1d36e172d252714cef712a6d091e0a0c4f2) ([#12784](https://github.com/yt-dlp/yt-dlp/issues/12784)) by [bashonly](https://github.com/bashonly) +- **microsoftlearnepisode**: [Extract more formats](https://github.com/yt-dlp/yt-dlp/commit/d63696f23a341ee36a3237ccb5d5e14b34c2c579) ([#12799](https://github.com/yt-dlp/yt-dlp/issues/12799)) by [bashonly](https://github.com/bashonly) +- **mlbtv**: [Fix radio-only extraction](https://github.com/yt-dlp/yt-dlp/commit/f033d86b96b36f8c5289dd7c3304f42d4d9f6ff4) ([#12792](https://github.com/yt-dlp/yt-dlp/issues/12792)) by [bashonly](https://github.com/bashonly) +- **on24**: [Support `mainEvent` URLs](https://github.com/yt-dlp/yt-dlp/commit/e465b078ead75472fcb7b86f6ccaf2b5d3bc4c21) ([#12800](https://github.com/yt-dlp/yt-dlp/issues/12800)) by [bashonly](https://github.com/bashonly) +- **sbs**: [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/29560359120f28adaaac67c86fa8442eb72daa0d) ([#12785](https://github.com/yt-dlp/yt-dlp/issues/12785)) by [bashonly](https://github.com/bashonly) +- **stvr**: [Rename extractor from RTVS to STVR](https://github.com/yt-dlp/yt-dlp/commit/5fc521cbd0ce7b2410d0935369558838728e205d) ([#12788](https://github.com/yt-dlp/yt-dlp/issues/12788)) by [mireq](https://github.com/mireq) +- **twitch**: clips: [Extract portrait formats](https://github.com/yt-dlp/yt-dlp/commit/61046c31612b30c749cbdae934b7fe26abe659d7) ([#12763](https://github.com/yt-dlp/yt-dlp/issues/12763)) by [DmitryScaletta](https://github.com/DmitryScaletta) +- **youtube** + - [Add `player_js_variant` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/07f04005e40ebdb368920c511e36e98af0077ed3) ([#12767](https://github.com/yt-dlp/yt-dlp/issues/12767)) by [bashonly](https://github.com/bashonly) + - tab: [Fix playlist continuation extraction](https://github.com/yt-dlp/yt-dlp/commit/6a6d97b2cbc78f818de05cc96edcdcfd52caa259) ([#12777](https://github.com/yt-dlp/yt-dlp/issues/12777)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. 
changes +- **cleanup**: Miscellaneous: [5e457af](https://github.com/yt-dlp/yt-dlp/commit/5e457af57fae9645b1b8fa0ed689229c8fb9656b) by [bashonly](https://github.com/bashonly) + +### 2025.03.27 + +#### Core changes +- **jsinterp**: [Fix nested attributes and object extraction](https://github.com/yt-dlp/yt-dlp/commit/a8b9ff3c2a0ae25735e580173becc78545b92572) ([#12760](https://github.com/yt-dlp/yt-dlp/issues/12760)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **youtube**: [Make signature and nsig extraction more robust](https://github.com/yt-dlp/yt-dlp/commit/48be862b32648bff5b3e553e40fca4dcc6e88b28) ([#12761](https://github.com/yt-dlp/yt-dlp/issues/12761)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +### 2025.03.26 + +#### Extractor changes +- **youtube** + - [Fix signature and nsig extraction for player `4fcd6e4a`](https://github.com/yt-dlp/yt-dlp/commit/a550dfc904a02843a26369ae50dbb7c0febfb30e) ([#12748](https://github.com/yt-dlp/yt-dlp/issues/12748)) by [seproDev](https://github.com/seproDev) + - [Only cache nsig code on successful decoding](https://github.com/yt-dlp/yt-dlp/commit/ecee97b4fa90d51c48f9154c3a6d5a8ffe46cd5c) ([#12750](https://github.com/yt-dlp/yt-dlp/issues/12750)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +### 2025.03.25 + +#### Core changes +- [Fix attribute error on failed VT init](https://github.com/yt-dlp/yt-dlp/commit/b872ffec50fd50f790a5a490e006a369a28a3df3) ([#12696](https://github.com/yt-dlp/yt-dlp/issues/12696)) by [Grub4K](https://github.com/Grub4K) +- **utils**: `js_to_json`: [Make function less fatal](https://github.com/yt-dlp/yt-dlp/commit/9491b44032b330e05bd5eaa546187005d1e8538e) ([#12715](https://github.com/yt-dlp/yt-dlp/issues/12715)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- [Fix sorting of HLS audio formats by `GROUP-ID`](https://github.com/yt-dlp/yt-dlp/commit/86ab79e1a5182092321102adf6ca34195803b878) ([#12714](https://github.com/yt-dlp/yt-dlp/issues/12714)) by [bashonly](https://github.com/bashonly) +- **17live**: vod: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3396eb50dcd245b49c0f4aecd6e80ec914095d16) ([#12723](https://github.com/yt-dlp/yt-dlp/issues/12723)) by [subrat-lima](https://github.com/subrat-lima) +- **9now.com.au**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9d5e6de2e7a47226d1f72c713ad45c88ba01db68) ([#12702](https://github.com/yt-dlp/yt-dlp/issues/12702)) by [bashonly](https://github.com/bashonly) +- **chzzk**: video: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/e2dfccaf808b406d5bcb7dd04ae9ce420752dd6f) ([#12692](https://github.com/yt-dlp/yt-dlp/issues/12692)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf) +- **deezer**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/be5af3f9e91747768c2b41157851bfbe14c663f7) ([#12704](https://github.com/yt-dlp/yt-dlp/issues/12704)) by [seproDev](https://github.com/seproDev) +- **generic**: [Fix MPD base URL parsing](https://github.com/yt-dlp/yt-dlp/commit/5086d4aed6aeb3908c62f49e2d8f74cc0cb05110) ([#12718](https://github.com/yt-dlp/yt-dlp/issues/12718)) by [fireattack](https://github.com/fireattack) +- **streaks**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/801afeac91f97dc0b58cd39cc7e8c50f619dc4e1) ([#12679](https://github.com/yt-dlp/yt-dlp/issues/12679)) by [doe1080](https://github.com/doe1080) +- **tver**: [Fix 
extractor](https://github.com/yt-dlp/yt-dlp/commit/66e0bab814e4a52ef3e12d81123ad992a29df50e) ([#12659](https://github.com/yt-dlp/yt-dlp/issues/12659)) by [arabcoders](https://github.com/arabcoders), [bashonly](https://github.com/bashonly) +- **viki**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/fe4f14b8369038e7c58f7de546d76de1ce3a91ce) ([#12703](https://github.com/yt-dlp/yt-dlp/issues/12703)) by [seproDev](https://github.com/seproDev) +- **vrsquare**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b7fbb5a0a16a8e8d3e29c29e26ebed677d0d6ea3) ([#12515](https://github.com/yt-dlp/yt-dlp/issues/12515)) by [doe1080](https://github.com/doe1080) +- **youtube** + - [Fix PhantomJS nsig fallback](https://github.com/yt-dlp/yt-dlp/commit/4054a2b623bd1e277b49d2e9abc3d112a4b1c7be) ([#12728](https://github.com/yt-dlp/yt-dlp/issues/12728)) by [bashonly](https://github.com/bashonly) + - [Fix signature and nsig extraction for player `363db69b`](https://github.com/yt-dlp/yt-dlp/commit/b9c979461b244713bf42691a5bc02834e2ba4b2c) ([#12725](https://github.com/yt-dlp/yt-dlp/issues/12725)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- **Request Handler**: curl_cffi: [Support `curl_cffi` 0.10.x](https://github.com/yt-dlp/yt-dlp/commit/9bf23902ceb948b9685ce1dab575491571720fc6) ([#12670](https://github.com/yt-dlp/yt-dlp/issues/12670)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [9dde546](https://github.com/yt-dlp/yt-dlp/commit/9dde546e7ee3e1515d88ee3af08b099351455dc0) by [seproDev](https://github.com/seproDev) + +### 2025.03.21 + +#### Core changes +- [Fix external downloader availability when using `--ffmpeg-location`](https://github.com/yt-dlp/yt-dlp/commit/9f77e04c76e36e1cbbf49bc9eb385fa6ef804b67) ([#12318](https://github.com/yt-dlp/yt-dlp/issues/12318)) by [Kenshin9977](https://github.com/Kenshin9977) +- [Load plugins on demand](https://github.com/yt-dlp/yt-dlp/commit/4445f37a7a66b248dbd8376c43137e6e441f138e) ([#11305](https://github.com/yt-dlp/yt-dlp/issues/11305)) by [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) (With fixes in [c034d65](https://github.com/yt-dlp/yt-dlp/commit/c034d655487be668222ef9476a16f374584e49a7)) +- [Support emitting ConEmu progress codes](https://github.com/yt-dlp/yt-dlp/commit/f7a1f2d8132967a62b0f6d5665c6d2dde2d42c09) ([#10649](https://github.com/yt-dlp/yt-dlp/issues/10649)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **azmedien**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/26a502fc727d0e91b2db6bf4a112823bcc672e85) ([#12375](https://github.com/yt-dlp/yt-dlp/issues/12375)) by [goggle](https://github.com/goggle) +- **bilibiliplaylist**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f5fb2229e66cf59d5bf16065bc041b42a28354a0) ([#12690](https://github.com/yt-dlp/yt-dlp/issues/12690)) by [bashonly](https://github.com/bashonly) +- **bunnycdn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3a1583ca75fb523cbad0e5e174387ea7b477d175) ([#11586](https://github.com/yt-dlp/yt-dlp/issues/11586)) by [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **canalsurmas**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/01a8be4c23f186329d85f9c78db34a55f3294ac5) ([#12497](https://github.com/yt-dlp/yt-dlp/issues/12497)) by [Arc8ne](https://github.com/Arc8ne) +- **cda**: [Fix login 
support](https://github.com/yt-dlp/yt-dlp/commit/be0d819e1103195043f6743650781f0d4d343f6d) ([#12552](https://github.com/yt-dlp/yt-dlp/issues/12552)) by [rysson](https://github.com/rysson) +- **cultureunplugged**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3042afb5fe342d3a00de76704cd7de611acc350e) ([#12486](https://github.com/yt-dlp/yt-dlp/issues/12486)) by [seproDev](https://github.com/seproDev) +- **dailymotion**: [Improve embed detection](https://github.com/yt-dlp/yt-dlp/commit/ad60137c141efa5023fbc0ac8579eaefe8b3d8cc) ([#12464](https://github.com/yt-dlp/yt-dlp/issues/12464)) by [seproDev](https://github.com/seproDev) +- **gem.cbc.ca**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/eb1417786a3027b1e7290ec37ef6aaece50ebed0) ([#12414](https://github.com/yt-dlp/yt-dlp/issues/12414)) by [bashonly](https://github.com/bashonly) +- **globo**: [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/0e1697232fcbba7551f983fd1ba93bb445cbb08b) ([#12270](https://github.com/yt-dlp/yt-dlp/issues/12270)) by [pedro](https://github.com/pedro) +- **instagram** + - [Add `app_id` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/a90641c8363fa0c10800b36eb6b01ee22d3a9409) ([#12359](https://github.com/yt-dlp/yt-dlp/issues/12359)) by [chrisellsworth](https://github.com/chrisellsworth) + - [Fix extraction of older private posts](https://github.com/yt-dlp/yt-dlp/commit/a59abe0636dc49b22a67246afe35613571b86f05) ([#12451](https://github.com/yt-dlp/yt-dlp/issues/12451)) by [bashonly](https://github.com/bashonly) + - [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/480125560a3b9972d29ae0da850aba8109e6bd41) ([#12410](https://github.com/yt-dlp/yt-dlp/issues/12410)) by [bashonly](https://github.com/bashonly) + - story: [Support `--no-playlist`](https://github.com/yt-dlp/yt-dlp/commit/65c3c58c0a67463a150920203cec929045c95a24) ([#12397](https://github.com/yt-dlp/yt-dlp/issues/12397)) by [fireattack](https://github.com/fireattack) +- **jamendo**: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/89a68c4857ddbaf937ff22f12648baaf6b5af840) ([#12622](https://github.com/yt-dlp/yt-dlp/issues/12622)) by [bashonly](https://github.com/bashonly), [JChris246](https://github.com/JChris246) +- **ketnet**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/bbada3ec0779422cde34f1ce3dcf595da463b493) ([#12628](https://github.com/yt-dlp/yt-dlp/issues/12628)) by [MichaelDeBoey](https://github.com/MichaelDeBoey) +- **lbry** + - [Make m3u8 format extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/9807181cfbf87bfa732f415c30412bdbd77cbf81) ([#12463](https://github.com/yt-dlp/yt-dlp/issues/12463)) by [bashonly](https://github.com/bashonly) + - [Raise appropriate error for non-media files](https://github.com/yt-dlp/yt-dlp/commit/7126b472601814b7fd8c9de02069e8fff1764891) ([#12462](https://github.com/yt-dlp/yt-dlp/issues/12462)) by [bashonly](https://github.com/bashonly) +- **loco**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/983095485c731240aae27c950cb8c24a50827b56) ([#12667](https://github.com/yt-dlp/yt-dlp/issues/12667)) by [DTrombett](https://github.com/DTrombett) +- **magellantv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/172d5fcd778bf2605db7647ebc56b29ed18d24ac) ([#12505](https://github.com/yt-dlp/yt-dlp/issues/12505)) by [seproDev](https://github.com/seproDev) +- **mitele**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7223d29569a48a35ad132a508c115973866838d3) 
([#12689](https://github.com/yt-dlp/yt-dlp/issues/12689)) by [bashonly](https://github.com/bashonly) +- **msn**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/4815dac131d42c51e12c1d05232db0bbbf607329) ([#12513](https://github.com/yt-dlp/yt-dlp/issues/12513)) by [seproDev](https://github.com/seproDev), [thedenv](https://github.com/thedenv) +- **n1**: [Fix extraction of newer articles](https://github.com/yt-dlp/yt-dlp/commit/9d70abe4de401175cbbaaa36017806f16b2df9af) ([#12514](https://github.com/yt-dlp/yt-dlp/issues/12514)) by [u-spec-png](https://github.com/u-spec-png) +- **nbcstations**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ebac65aa9e0bf9a97c24d00f7977900d2577364b) ([#12534](https://github.com/yt-dlp/yt-dlp/issues/12534)) by [refack](https://github.com/refack) +- **niconico** + - [Fix format sorting](https://github.com/yt-dlp/yt-dlp/commit/7508e34f203e97389f1d04db92140b13401dd724) ([#12442](https://github.com/yt-dlp/yt-dlp/issues/12442)) by [xpadev-net](https://github.com/xpadev-net) + - live: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/c2e6e1d5f77f3b720a6266f2869eb750d20e5dc1) ([#12419](https://github.com/yt-dlp/yt-dlp/issues/12419)) by [bashonly](https://github.com/bashonly) +- **openrec**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/17504f253564cfad86244de2b6346d07d2300ca5) ([#12608](https://github.com/yt-dlp/yt-dlp/issues/12608)) by [fireattack](https://github.com/fireattack) +- **pinterest**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/bd0a66816934de70312eea1e71c59c13b401dc3a) ([#12538](https://github.com/yt-dlp/yt-dlp/issues/12538)) by [mikf](https://github.com/mikf) +- **playsuisse**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/6933f5670cea9c3e2fb16c1caa1eda54d13122c5) ([#12444](https://github.com/yt-dlp/yt-dlp/issues/12444)) by [bashonly](https://github.com/bashonly) +- **reddit**: [Truncate title](https://github.com/yt-dlp/yt-dlp/commit/d9a53cc1e6fd912daf500ca4f19e9ca88994dbf9) ([#12567](https://github.com/yt-dlp/yt-dlp/issues/12567)) by [seproDev](https://github.com/seproDev) +- **rtp**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/8eb9c1bf3b9908cca22ef043602aa24fb9f352c6) ([#11638](https://github.com/yt-dlp/yt-dlp/issues/11638)) by [pferreir](https://github.com/pferreir), [red-acid](https://github.com/red-acid), [seproDev](https://github.com/seproDev), [somini](https://github.com/somini), [vallovic](https://github.com/vallovic) +- **softwhiteunderbelly**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/652827d5a076c9483c36654ad2cf3fe46219baf4) ([#12281](https://github.com/yt-dlp/yt-dlp/issues/12281)) by [benfaerber](https://github.com/benfaerber) +- **soop**: [Fix timestamp extraction](https://github.com/yt-dlp/yt-dlp/commit/8305df00012ff8138a6ff95279d06b54ac607f63) ([#12609](https://github.com/yt-dlp/yt-dlp/issues/12609)) by [msikma](https://github.com/msikma) +- **soundcloud** + - [Extract tags](https://github.com/yt-dlp/yt-dlp/commit/9deed13d7cce6d3647379e50589c92de89227509) ([#12420](https://github.com/yt-dlp/yt-dlp/issues/12420)) by [bashonly](https://github.com/bashonly) + - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/6deeda5c11f34f613724fa0627879f0d607ba1b4) ([#12447](https://github.com/yt-dlp/yt-dlp/issues/12447)) by [bashonly](https://github.com/bashonly) +- **tiktok** + - [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/99ea2978757a431eeb2a265b3395ccbe4ce202cf) 
([#12445](https://github.com/yt-dlp/yt-dlp/issues/12445)) by [bashonly](https://github.com/bashonly) + - [Truncate title](https://github.com/yt-dlp/yt-dlp/commit/83b119dadb0f267f1fb66bf7ed74c097349de79e) ([#12566](https://github.com/yt-dlp/yt-dlp/issues/12566)) by [seproDev](https://github.com/seproDev) +- **tv8.it**: [Add live and playlist extractors](https://github.com/yt-dlp/yt-dlp/commit/2ee3a0aff9be2be3bea60640d3d8a0febaf0acb6) ([#12569](https://github.com/yt-dlp/yt-dlp/issues/12569)) by [DTrombett](https://github.com/DTrombett) +- **tvw**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/42b7440963866e31ff84a5b89030d1c596fa2e6e) ([#12271](https://github.com/yt-dlp/yt-dlp/issues/12271)) by [fries1234](https://github.com/fries1234) +- **twitter** + - [Fix syndication token generation](https://github.com/yt-dlp/yt-dlp/commit/b8b47547049f5ebc3dd680fc7de70ed0ca9c0d70) ([#12537](https://github.com/yt-dlp/yt-dlp/issues/12537)) by [bashonly](https://github.com/bashonly) + - [Truncate title](https://github.com/yt-dlp/yt-dlp/commit/06f6de78db2eceeabd062ab1a3023e0ff9d4df53) ([#12560](https://github.com/yt-dlp/yt-dlp/issues/12560)) by [seproDev](https://github.com/seproDev) +- **vk**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/05c8023a27dd37c49163c0498bf98e3e3c1cb4b9) ([#12510](https://github.com/yt-dlp/yt-dlp/issues/12510)) by [seproDev](https://github.com/seproDev) +- **vrtmax**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/df9ebeec00d658693252978d1ffb885e67aa6ab6) ([#12479](https://github.com/yt-dlp/yt-dlp/issues/12479)) by [bergoid](https://github.com/bergoid), [MichaelDeBoey](https://github.com/MichaelDeBoey), [seproDev](https://github.com/seproDev) +- **weibo**: [Support playlists](https://github.com/yt-dlp/yt-dlp/commit/0bb39788626002a8a67e925580227952c563c8b9) ([#12284](https://github.com/yt-dlp/yt-dlp/issues/12284)) by [4ft35t](https://github.com/4ft35t) +- **wsj**: [Support opinion URLs and impersonation](https://github.com/yt-dlp/yt-dlp/commit/7f3006eb0c0659982bb956d71b0bc806bcb0a5f2) ([#12431](https://github.com/yt-dlp/yt-dlp/issues/12431)) by [refack](https://github.com/refack) +- **youtube** + - [Fix nsig and signature extraction for player `643afba4`](https://github.com/yt-dlp/yt-dlp/commit/9b868518a15599f3d7ef5a1c730dda164c30da9b) ([#12684](https://github.com/yt-dlp/yt-dlp/issues/12684)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/3380febe9984c21c79c3147c1d390a4cf339bc4c) ([#12603](https://github.com/yt-dlp/yt-dlp/issues/12603)) by [seproDev](https://github.com/seproDev) + - [Split into package](https://github.com/yt-dlp/yt-dlp/commit/4432a9390c79253ac830702b226d2e558b636725) ([#12557](https://github.com/yt-dlp/yt-dlp/issues/12557)) by [coletdjnz](https://github.com/coletdjnz) + - [Warn on DRM formats](https://github.com/yt-dlp/yt-dlp/commit/e67d786c7cc87bd449d22e0ddef08306891c1173) ([#12593](https://github.com/yt-dlp/yt-dlp/issues/12593)) by [coletdjnz](https://github.com/coletdjnz) + - [Warn on missing formats due to SSAP](https://github.com/yt-dlp/yt-dlp/commit/79ec2fdff75c8c1bb89b550266849ad4dec48dd3) ([#12483](https://github.com/yt-dlp/yt-dlp/issues/12483)) by [coletdjnz](https://github.com/coletdjnz) + +#### Networking changes +- [Add `keep_header_casing` extension](https://github.com/yt-dlp/yt-dlp/commit/7d18fed8f1983fe6de4ddc810dfb2761ba5744ac) ([#11652](https://github.com/yt-dlp/yt-dlp/issues/11652)) by 
[coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K) +- [Always add unsupported suffix on version mismatch](https://github.com/yt-dlp/yt-dlp/commit/95f8df2f796d0048119615200758199aedcd7cf4) ([#12626](https://github.com/yt-dlp/yt-dlp/issues/12626)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [f36e4b6](https://github.com/yt-dlp/yt-dlp/commit/f36e4b6e65cb8403791aae2f520697115cb88dec) by [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **test**: [Show all differences for `expect_value` and `expect_dict`](https://github.com/yt-dlp/yt-dlp/commit/a3e0c7d3b267abdf3933b709704a28d43bb46503) ([#12334](https://github.com/yt-dlp/yt-dlp/issues/12334)) by [Grub4K](https://github.com/Grub4K) + ### 2025.02.19 #### Core changes diff --git a/README.md b/README.md index 6e27b0a348..6e2dc6243c 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ * [Post-processing Options](#post-processing-options) * [SponsorBlock Options](#sponsorblock-options) * [Extractor Options](#extractor-options) + * [Preset Aliases](#preset-aliases) * [CONFIGURATION](#configuration) * [Configuration file encoding](#configuration-file-encoding) * [Authentication with netrc](#authentication-with-netrc) @@ -348,8 +349,8 @@ ## General Options: --no-flat-playlist Fully extract the videos of a playlist (default) --live-from-start Download livestreams from the start. - Currently only supported for YouTube - (Experimental) + Currently experimental and only supported + for YouTube and Twitch --no-live-from-start Download livestreams from the current time (default) --wait-for-video MIN[-MAX] Wait for scheduled streams to become @@ -375,17 +376,23 @@ ## General Options: an alias starts with a dash "-", it is prefixed with "--". Arguments are parsed according to the Python string formatting - mini-language. E.g. --alias get-audio,-X - "-S=aext:{0},abr -x --audio-format {0}" - creates options "--get-audio" and "-X" that - takes an argument (ARG0) and expands to - "-S=aext:ARG0,abr -x --audio-format ARG0". - All defined aliases are listed in the --help + mini-language. E.g. --alias get-audio,-X "-S + aext:{0},abr -x --audio-format {0}" creates + options "--get-audio" and "-X" that takes an + argument (ARG0) and expands to "-S + aext:ARG0,abr -x --audio-format ARG0". All + defined aliases are listed in the --help output. Alias options can trigger more aliases; so be careful to avoid defining recursive options. As a safety measure, each alias may be triggered a maximum of 100 times. This option can be used multiple times + -t, --preset-alias PRESET Applies a predefined set of options. e.g. + --preset-alias mp3. The following presets + are available: mp3, aac, mp4, mkv, sleep. + See the "Preset Aliases" section at the end + for more info. This option can be used + multiple times ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. To @@ -1098,6 +1105,27 @@ ## Extractor Options: can use this option multiple times to give arguments for different extractors +## Preset Aliases: +Predefined aliases for convenience and ease of use. 
Note that future + versions of yt-dlp may add or adjust presets, but the existing preset + names will not be changed or removed + + -t mp3 -f 'ba[acodec^=mp3]/ba/b' -x --audio-format + mp3 + + -t aac -f + 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b' + -x --audio-format aac + + -t mp4 --merge-output-format mp4 --remux-video mp4 + -S vcodec:h264,lang,quality,res,fps,hdr:12,a + codec:aac + + -t mkv --merge-output-format mkv --remux-video mkv + + -t sleep --sleep-subtitles 5 --sleep-requests 0.75 + --sleep-interval 10 --max-sleep-interval 20 + # CONFIGURATION You can configure yt-dlp by placing any supported command line option in a configuration file. The configuration is loaded from the following locations: @@ -1769,9 +1797,10 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` -* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details +* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. 
Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` +* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. +* `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, and can be selected with `actual` * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total @@ -1781,7 +1810,12 @@ #### youtube * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) -* `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request) +* `po_token`: Proof of Origin (PO) Token(s) to use. Comma-separated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be any of `gvs` (Google Video Server URLs), `player` (Innertube player request) or `subs` (Subtitles) +* `pot_trace`: Enable debug logging for PO Token fetching. Either `true` or `false` (default) +* `fetch_pot`: Policy to use for fetching a PO Token from providers. One of `always` (always try to fetch a PO Token regardless of whether the client requires one for the given context), `never` (never fetch a PO Token), or `auto` (default; only fetch a PO Token if the client requires one for the given context) + +#### youtubepot-webpo +* `bind_to_visitor_id`: Whether to use the Visitor ID instead of Visitor Data for caching WebPO tokens.
Either `true` (default) or `false` #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) @@ -1798,9 +1832,6 @@ #### generic #### vikichannel * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` -#### niconico -* `segment_duration`: Segment duration in milliseconds for HLS-DMC formats. Use it at your own risk since this feature **may result in your account termination.** - #### youtubewebarchive * `check_all`: Try to check more at the cost of more requests. One or more of `thumbnails`, `captures` @@ -1866,6 +1897,9 @@ #### bilibili #### sonylivseries * `sort_order`: Episode sort order for series extraction - one of `asc` (ascending, oldest first) or `desc` (descending, newest first). Default is `asc` +#### tver +* `backend`: Backend API to use for extraction - one of `streaks` (default) or `brightcove` (deprecated) + **Note**: These options may be changed/removed in the future without concern for backward compatibility @@ -2149,7 +2183,7 @@ ### New features * **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will be now preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples)) -* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. +* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details. * **YouTube improvements**: * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`) @@ -2215,7 +2249,7 @@ ### Differences in default behavior * Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading * YouTube channel URLs download all uploads of the channel. To download only the videos in a specific tab, pass the tab's URL. If the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel. You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections * Unavailable videos are also listed for YouTube playlists. 
Use `--compat-options no-youtube-unavailable-videos` to remove this
-* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date.
+* The upload dates extracted from YouTube are in UTC.
* If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this
* Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead
* Some internal metadata such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
@@ -2234,9 +2268,10 @@ ### Differences in default behavior
* `--compat-options all`: Use all compat options (**Do NOT use this!**)
* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort`
* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort`
-* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date`
+* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization`
* `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx`
-* `--compat-options 2023`: Same as `--compat-options prefer-vp9-sort`. Use this to enable all future compat options
+* `--compat-options 2023`: Same as `--compat-options 2024,prefer-vp9-sort`
+* `--compat-options 2024`: Currently does nothing. Use this to enable all future compat options
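For completeness, a small sketch of opting into one of these compat behaviors when embedding yt-dlp in Python rather than using the CLI. The `compat_opts` key mirrors `--compat-options`; the specific option and URL below are illustrative assumptions:

```python
import yt_dlp

# Mirrors: yt-dlp --compat-options no-youtube-unavailable-videos URL
ydl_opts = {
    'compat_opts': ['no-youtube-unavailable-videos'],
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # Placeholder URL; download=False only probes metadata
    info = ydl.extract_info('https://www.youtube.com/playlist?list=PLACEHOLDER', download=False)
```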
The following compat options restore vulnerable behavior from before security patches:
diff --git a/bundle/docker/static/entrypoint.sh b/bundle/docker/static/entrypoint.sh
index 2202759742..8049e68205 100755
--- a/bundle/docker/static/entrypoint.sh
+++ b/bundle/docker/static/entrypoint.sh
@@ -2,6 +2,7 @@
 set -e
 source ~/.local/share/pipx/venvs/pyinstaller/bin/activate
+python -m devscripts.install_deps -o --include build
 python -m devscripts.install_deps --include secretstorage --include curl-cffi
 python -m devscripts.make_lazy_extractors
 python devscripts/update-version.py -c "${channel}" -r "${origin}" "${version}"
diff --git a/bundle/pyinstaller.py b/bundle/pyinstaller.py
index 4184c4bc9f..c2f6511210 100755
--- a/bundle/pyinstaller.py
+++ b/bundle/pyinstaller.py
@@ -36,6 +36,9 @@ def main():
         f'--name={name}',
         '--icon=devscripts/logo.ico',
         '--upx-exclude=vcruntime140.dll',
+        # Ref: https://github.com/yt-dlp/yt-dlp/issues/13311
+        # https://github.com/pyinstaller/pyinstaller/issues/9149
+        '--exclude-module=pkg_resources',
         '--noconfirm',
         '--additional-hooks-dir=yt_dlp/__pyinstaller',
         *opts,
diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json
index 8aa7b7e2bc..269de2c682 100644
--- a/devscripts/changelog_override.json
+++ b/devscripts/changelog_override.json
@@ -245,5 +245,14 @@
         "when": "76ac023ff02f06e8c003d104f02a03deeddebdcd",
         "short": "[ie/youtube:tab] Improve shorts title extraction (#11997)",
         "authors": ["bashonly", "d3d9"]
+    },
+    {
+        "action": "add",
+        "when": "88eb1e7a9a2720ac89d653c0d0e40292388823bb",
+        "short": "[priority] **New option `--preset-alias`/`-t` has been added**\nThis provides convenient predefined aliases for common use cases. Available presets include `mp4`, `mp3`, `mkv`, `aac`, and `sleep`. See [the README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#preset-aliases) for more details."
+ }, + { + "action": "remove", + "when": "d596824c2f8428362c072518856065070616e348" } ] diff --git a/pyproject.toml b/pyproject.toml index 2a0008a45a..3775251e10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,8 +55,7 @@ default = [ "websockets>=13.0", ] curl-cffi = [ - "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", - "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", + "curl-cffi>=0.5.10,!=0.6.*,!=0.7.*,!=0.8.*,!=0.9.*,<0.11; implementation_name=='cpython'", ] secretstorage = [ "cffi", @@ -66,7 +65,7 @@ build = [ "build", "hatchling", "pip", - "setuptools>=71.0.2", # 71.0.0 broke pyinstaller + "setuptools>=71.0.2,<81", # See https://github.com/pyinstaller/pyinstaller/issues/9149 "wheel", ] dev = [ @@ -76,14 +75,14 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.9.0", + "ruff~=0.11.0", ] test = [ "pytest~=8.1", "pytest-rerunfailures~=14.0", ] pyinstaller = [ - "pyinstaller>=6.11.1", # Windows temp cleanup fixed in 6.11.1 + "pyinstaller>=6.13.0", # Windows temp cleanup fixed in 6.13.0 ] [project.urls] @@ -387,7 +386,11 @@ select = [ exclude = "*/extractor/lazy_extractors.py,*venv*,*/test/testdata/sigs/player-*.js,.idea,.vscode" [tool.pytest.ini_options] -addopts = "-ra -v --strict-markers" +addopts = [ + "-ra", # summary: all except passed + "--verbose", + "--strict-markers", +] markers = [ "download", ] diff --git a/supportedsites.md b/supportedsites.md index 0924c88ffb..c2d7b45556 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -7,6 +7,7 @@ # Supported sites - **17live** - **17live:clip** + - **17live:vod** - **1News**: 1news.co.nz article videos - **1tv**: Первый канал - **20min** @@ -200,7 +201,7 @@ # Supported sites - **blogger.com** - **Bloomberg** - **Bluesky** - - **BokeCC** + - **BokeCC**: CC视频 - **BongaCams** - **Boosty** - **BostonGlobe** @@ -224,6 +225,7 @@ # Supported sites - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **Bundesliga** - **Bundestag** + - **BunnyCdn** - **BusinessInsider** - **BuzzFeed** - **BYUtv**: (**Currently broken**) @@ -242,8 +244,8 @@ # Supported sites - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr + - **Canalsurmas** - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** - **cbc.ca:​player:playlist** @@ -345,8 +347,6 @@ # Supported sites - **daystar:clip** - **DBTV** - **DctpTv** - - **DeezerAlbum** - - **DeezerPlaylist** - **democracynow** - **DestinationAmerica** - **DetikEmbed** @@ -393,6 +393,8 @@ # Supported sites - **dvtv**: http://video.aktualne.cz/ - **dw**: (**Currently broken**) - **dw:article**: (**Currently broken**) + - **dzen.ru**: Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen) + - **dzen.ru:channel** - **EaglePlatform** - **EbaumsWorld** - **Ebay** @@ -471,6 +473,7 @@ # Supported sites - **FoxNewsVideo** - **FoxSports** - **fptplay**: fptplay.vn + - **FrancaisFacile** - **FranceCulture** - **FranceInter** - **francetv** @@ -609,10 +612,10 @@ # Supported sites - **Inc** - **IndavideoEmbed** - **InfoQ** - - **Instagram**: [*instagram*](## "netrc machine") - - **instagram:story**: [*instagram*](## "netrc machine") - - **instagram:tag**: [*instagram*](## "netrc machine") Instagram hashtag search URLs - - **instagram:user**: [*instagram*](## "netrc machine") Instagram user profile (**Currently broken**) + - **Instagram** + - **instagram:story** + - **instagram:tag**: Instagram hashtag search URLs + - **instagram:user**: Instagram user profile (**Currently 
broken**) - **InstagramIOS**: IOS instagram:// URL - **Internazionale** - **InternetVideoArchive** @@ -632,6 +635,7 @@ # Supported sites - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV + - **Ivoox** - **IVXPlayer** - **iwara**: [*iwara*](## "netrc machine") - **iwara:playlist**: [*iwara*](## "netrc machine") @@ -644,7 +648,10 @@ # Supported sites - **jiocinema**: [*jiocinema*](## "netrc machine") - **jiocinema:series**: [*jiocinema*](## "netrc machine") - **jiosaavn:album** + - **jiosaavn:artist** - **jiosaavn:playlist** + - **jiosaavn:show** + - **jiosaavn:​show:playlist** - **jiosaavn:song** - **Joj** - **JoqrAg**: 超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR) @@ -661,7 +668,6 @@ # Supported sites - **KelbyOne**: (**Currently broken**) - **Kenh14Playlist** - **Kenh14Video** - - **Ketnet** - **khanacademy** - **khanacademy:unit** - **kick:clips** @@ -670,6 +676,7 @@ # Supported sites - **Kicker** - **KickStarter** - **Kika**: KiKA.de + - **KikaPlaylist** - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -722,6 +729,7 @@ # Supported sites - **limelight:channel** - **limelight:channel_list** - **LinkedIn**: [*linkedin*](## "netrc machine") + - **linkedin:events**: [*linkedin*](## "netrc machine") - **linkedin:learning**: [*linkedin*](## "netrc machine") - **linkedin:​learning:course**: [*linkedin*](## "netrc machine") - **Liputan6** @@ -733,9 +741,11 @@ # Supported sites - **Livestreamfails** - **Lnk** - **loc**: Library of Congress + - **Loco** - **loom** - **loom:folder** - **LoveHomePorn** + - **LRTRadio** - **LRTStream** - **LRTVOD** - **LSMLREmbed** @@ -757,7 +767,7 @@ # Supported sites - **ManotoTV**: Manoto TV (Episode) - **ManotoTVLive**: Manoto TV (Live) - **ManotoTVShow**: Manoto TV (Show) - - **ManyVids**: (**Currently broken**) + - **ManyVids** - **MaoriTV** - **Markiza**: (**Currently broken**) - **MarkizaPage**: (**Currently broken**) @@ -827,11 +837,11 @@ # Supported sites - **MotherlessUploader** - **Motorsport**: motorsport.com (**Currently broken**) - **MovieFap** - - **Moviepilot** + - **moviepilot**: Moviepilot trailer - **MoviewPlay** - **Moviezine** - **MovingImage** - - **MSN**: (**Currently broken**) + - **MSN** - **mtg**: MTG services - **mtv** - **mtv.de**: (**Currently broken**) @@ -944,7 +954,7 @@ # Supported sites - **nickelodeonru** - **niconico**: [*niconico*](## "netrc machine") ニコニコ動画 - **niconico:history**: NicoNico user history or likes. Requires cookies. 
- - **niconico:live**: ニコニコ生放送 + - **niconico:live**: [*niconico*](## "netrc machine") ニコニコ生放送 - **niconico:playlist** - **niconico:series** - **niconico:tag**: NicoNico video tag URLs @@ -1051,6 +1061,8 @@ # Supported sites - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview**: (**Currently broken**) + - **parti:livestream** + - **parti:video** - **patreon** - **patreon:campaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX 
Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) @@ -1071,8 +1083,8 @@ # Supported sites - **Photobucket** - **PiaLive** - **Piapro**: [*piapro*](## "netrc machine") - - **Picarto** - - **PicartoVod** + - **picarto** + - **picarto:vod** - **Piksel** - **Pinkbike** - **Pinterest** @@ -1225,6 +1237,7 @@ # Supported sites - **RoosterTeeth**: [*roosterteeth*](## "netrc machine") - **RoosterTeethSeries**: [*roosterteeth*](## "netrc machine") - **RottenTomatoes** + - **RoyaLive** - **Rozhlas** - **RozhlasVltava** - **RTBF**: [*rtbf*](## "netrc machine") (**Currently broken**) @@ -1245,12 +1258,10 @@ # Supported sites - **RTVCKaltura** - **RTVCPlay** - **RTVCPlayEmbed** - - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:alacarta**: RTVE a la carta and Play - **rtve.es:audio**: RTVE audio - - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - - **RTVS** - **rtvslo.si** - **rtvslo.si:show** - **RudoVideo** @@ -1305,8 +1316,8 @@ # Supported sites - **sejm** - **Sen** - **SenalColombiaLive**: (**Currently broken**) - - **SenateGov** - - **SenateISVP** + - **senate.gov** + - **senate.gov:isvp** - **SendtoNews**: (**Currently broken**) - **Servus** - **Sexu**: (**Currently broken**) @@ -1342,6 +1353,7 @@ # Supported sites - **Smotrim** - **SnapchatSpotlight** - **Snotr** + - **SoftWhiteUnderbelly**: [*softwhiteunderbelly*](## "netrc machine") - **Sohu** - **SohuV** - **SonyLIV**: [*sonyliv*](## "netrc machine") @@ -1380,7 +1392,6 @@ # Supported sites - **Spreaker** - **SpreakerShow** - **SpringboardPlatform** - - **Sprout** - **SproutVideo** - **sr:mediathek**: Saarländischer Rundfunk (**Currently broken**) - **SRGSSR** @@ -1398,12 +1409,14 @@ # Supported sites - **StoryFire** - **StoryFireSeries** - **StoryFireUser** + - **Streaks** - **Streamable** - **StreamCZ** - **StreetVoice** - **StretchInternet** - **Stripchat** - **stv:player** + - **stvr**: Slovak Television and Radio (formerly RTVS) - **Subsplash** - **subsplash:playlist** - **Substack** @@ -1536,6 +1549,8 @@ # Supported sites - **tv5unis** - **tv5unis:video** - **tv8.it** + - **tv8.it:live**: TV8 Live + - **tv8.it:playlist**: TV8 Playlist - **TVANouvelles** - **TVANouvellesArticle** - **tvaplus**: TVA+ @@ -1556,6 +1571,8 @@ # Supported sites - **tvp:​vod:series** - **TVPlayer** - **TVPlayHome** + - **tvw** + - **tvw:tvchannels** - **Tweakers** - **TwitCasting** - **TwitCastingLive** @@ -1637,11 +1654,10 @@ # Supported sites - **viewlift** - **viewlift:embed** - **Viidea** - - **viki**: [*viki*](## "netrc machine") - - **viki:channel**: [*viki*](## "netrc machine") - **vimeo**: [*vimeo*](## "netrc machine") - **vimeo:album**: [*vimeo*](## "netrc machine") - **vimeo:channel**: [*vimeo*](## "netrc machine") + - **vimeo:event**: [*vimeo*](## "netrc machine") - **vimeo:group**: [*vimeo*](## "netrc machine") - **vimeo:likes**: [*vimeo*](## "netrc machine") Vimeo user likes - **vimeo:ondemand**: [*vimeo*](## "netrc machine") @@ -1676,8 +1692,12 @@ # Supported sites - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **vqq:series** - **vqq:video** + - **vrsquare**: VR SQUARE + - **vrsquare:channel** + - **vrsquare:search** + - **vrsquare:section** - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - - **VrtNU**: [*vrtnu*](## "netrc machine") VRT MAX + - **vrtmax**: [*vrtnu*](## "netrc machine") VRT MAX (formerly VRT NU) - **VTM**: (**Currently broken**) - **VTV** - **VTVGo** @@ -1812,14 +1832,12 @@ # Supported sites - **ZattooLive**: [*zattoo*](## "netrc 
machine") - **ZattooMovies**: [*zattoo*](## "netrc machine") - **ZattooRecordings**: [*zattoo*](## "netrc machine") - - **ZDF** - - **ZDFChannel** + - **zdf** + - **zdf:channel** - **Zee5**: [*zee5*](## "netrc machine") - **zee5:series** - **ZeeNews**: (**Currently broken**) - **ZenPorn** - - **ZenYandex** - - **ZenYandexChannel** - **ZetlandDKArticle** - **Zhihu** - **zingmp3**: zingmp3.vn diff --git a/test/helper.py b/test/helper.py index 4169af799f..e4cb478e28 100644 --- a/test/helper.py +++ b/test/helper.py @@ -136,7 +136,7 @@ def _iter_differences(got, expected, field): return if op == 'startswith': - if not val.startswith(got): + if not got.startswith(val): yield field, f'should start with {val!r}, got {got!r}' return diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 54f35ef552..c6ff6209a8 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -638,6 +638,7 @@ def test_parse_m3u8_formats(self): 'img_bipbop_adv_example_fmp4', 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', [{ + # 60kbps (bitrate not provided in m3u8); sorted as worst because it's grouped with lowest bitrate video track 'format_id': 'aud1-English', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a1/prog_index.m3u8', 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', @@ -645,15 +646,9 @@ def test_parse_m3u8_formats(self): 'ext': 'mp4', 'protocol': 'm3u8_native', 'audio_ext': 'mp4', + 'source_preference': 0, }, { - 'format_id': 'aud2-English', - 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', - 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', - 'language': 'en', - 'ext': 'mp4', - 'protocol': 'm3u8_native', - 'audio_ext': 'mp4', - }, { + # 192kbps (bitrate not provided in m3u8) 'format_id': 'aud3-English', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a3/prog_index.m3u8', 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', @@ -661,6 +656,17 @@ def test_parse_m3u8_formats(self): 'ext': 'mp4', 'protocol': 'm3u8_native', 'audio_ext': 'mp4', + 'source_preference': 1, + }, { + # 384kbps (bitrate not provided in m3u8); sorted as best because it's grouped with the highest bitrate video track + 'format_id': 'aud2-English', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', + 'language': 'en', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'audio_ext': 'mp4', + 'source_preference': 2, }, { 'format_id': '530', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v2/prog_index.m3u8', diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 708a04f92d..91312e4e5f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -1435,6 +1435,27 @@ def test_load_plugins_compat(self): FakeYDL().close() assert all_plugins_loaded.value + def test_close_hooks(self): + # Should call all registered close hooks on close + close_hook_called = False + close_hook_two_called = False + + def close_hook(): + nonlocal close_hook_called 
+ close_hook_called = True + + def close_hook_two(): + nonlocal close_hook_two_called + close_hook_two_called = True + + ydl = FakeYDL() + ydl.add_close_hook(close_hook) + ydl.add_close_hook(close_hook_two) + + ydl.close() + self.assertTrue(close_hook_called, 'Close hook was not called') + self.assertTrue(close_hook_two_called, 'Close hook two was not called') + if __name__ == '__main__': unittest.main() diff --git a/test/test_cookies.py b/test/test_cookies.py index 4b9b9b5a91..f956ab1876 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -58,6 +58,14 @@ def test_get_desktop_environment(self): ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'my_custom_de', 'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), + ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE3), ({'KDE_FULL_SESSION': 1, 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), diff --git a/test/test_http_proxy.py b/test/test_http_proxy.py index 2435c878a5..e903ff8beb 100644 --- a/test/test_http_proxy.py +++ b/test/test_http_proxy.py @@ -331,10 +331,6 @@ def test_http_connect_auth(self, handler, ctx): assert proxy_info['proxy'] == server_address assert 'Proxy-Authorization' in proxy_info['headers'] - @pytest.mark.skip_handler( - 'Requests', - 'bug in urllib3 causes unclosed socket: https://github.com/urllib3/urllib3/issues/3374', - ) def test_http_connect_bad_auth(self, handler, ctx): with ctx.http_server(HTTPConnectProxyHandler, username='test', password='test') as server_address: with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://test:bad@{server_address}'}) as rh: diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index ed3ca61c4b..2e3cdc2a59 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -118,6 +118,7 @@ def test_assignments(self): self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31) self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) + self._test('function f(){var x = 2; var y = ["a", "b"]; y[x%y["length"]]="z"; return y}', ['z', 'b']) @unittest.skip('Not implemented') def test_comments(self): @@ -384,7 +385,7 @@ def test_negative(self): @unittest.skip('Not implemented') def test_packed(self): jsi = JSInterpreter('''function f(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') - self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 
5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 
9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 
9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) # noqa: SIM905 def test_join(self): test_input = list('test') @@ -403,6 +404,8 @@ def test_split(self): test_result = list('test') tests = [ 'function f(a, b){return a.split(b)}', + 'function f(a, b){return a["split"](b)}', + 'function f(a, b){let x = ["split"]; return a[x[0]](b)}', 'function f(a, b){return String.prototype.split.call(a, b)}', 'function f(a, b){return String.prototype.split.apply(a, [b])}', ] @@ -441,6 +444,9 @@ def test_slice(self): 
self._test('function f(){return "012345678".slice(-1, 1)}', '') self._test('function f(){return "012345678".slice(-3, -1)}', '67') + def test_splice(self): + self._test('function f(){var T = ["0", "1", "2"]; T["splice"](2, 1, "0")[0]; return T }', ['0', '1', '0']) + def test_js_number_to_string(self): for test, radix, expected in [ (0, None, '0'), @@ -462,6 +468,24 @@ def test_js_number_to_string(self): ]: assert js_number_to_string(test, radix) == expected + def test_extract_function(self): + jsi = JSInterpreter('function a(b) { return b + 1; }') + func = jsi.extract_function('a') + self.assertEqual(func([2]), 3) + + def test_extract_function_with_global_stack(self): + jsi = JSInterpreter('function c(d) { return d + e + f + g; }') + func = jsi.extract_function('c', {'e': 10}, {'f': 100, 'g': 1000}) + self.assertEqual(func([1]), 1111) + + def test_increment_decrement(self): + self._test('function f() { var x = 1; return ++x; }', 2) + self._test('function f() { var x = 1; return x++; }', 1) + self._test('function f() { var x = 1; x--; return x }', 0) + self._test('function f() { var y; var x = 1; x++, --x, x--, x--, y="z", "abc", x++; return --x }', -1) + self._test('function f() { var a = "test--"; return a; }', 'test--') + self._test('function f() { var b = 1; var a = "b--"; return a; }', 'b--') + if __name__ == '__main__': unittest.main() diff --git a/test/test_networking.py b/test/test_networking.py index 63914bc4ba..2f441fced2 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -39,6 +39,7 @@ from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, @@ -614,7 +615,6 @@ def test_source_address(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - # Not supported by CurlCFFI @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi') def test_gzip_trailing_garbage(self, handler): with handler() as rh: @@ -1857,6 +1857,7 @@ def test_method(self): def test_request_helpers(self): assert HEADRequest('http://example.com').method == 'HEAD' + assert PATCHRequest('http://example.com').method == 'PATCH' assert PUTRequest('http://example.com').method == 'PUT' def test_headers(self): diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index 204fe87bda..a2feacba71 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -20,7 +20,6 @@ add_accept_encoding_header, get_redirect_method, make_socks_proxy_opts, - select_proxy, ssl_load_certs, ) from yt_dlp.networking.exceptions import ( @@ -28,7 +27,7 @@ IncompleteRead, ) from yt_dlp.socks import ProxyType -from yt_dlp.utils.networking import HTTPHeaderDict +from yt_dlp.utils.networking import HTTPHeaderDict, select_proxy TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/test_pot/conftest.py b/test/test_pot/conftest.py new file mode 100644 index 0000000000..ff0667e928 --- /dev/null +++ b/test/test_pot/conftest.py @@ -0,0 +1,71 @@ +import collections + +import pytest + +from yt_dlp import YoutubeDL +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.extractor.youtube.pot._provider import IEContentProviderLogger +from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest, PoTokenContext +from yt_dlp.utils.networking import HTTPHeaderDict + + +class MockLogger(IEContentProviderLogger): + + log_level = 
IEContentProviderLogger.LogLevel.TRACE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.messages = collections.defaultdict(list) + + def trace(self, message: str): + self.messages['trace'].append(message) + + def debug(self, message: str): + self.messages['debug'].append(message) + + def info(self, message: str): + self.messages['info'].append(message) + + def warning(self, message: str, *, once=False): + self.messages['warning'].append(message) + + def error(self, message: str): + self.messages['error'].append(message) + + +@pytest.fixture +def ie() -> InfoExtractor: + ydl = YoutubeDL() + return ydl.get_info_extractor('Youtube') + + +@pytest.fixture +def logger() -> MockLogger: + return MockLogger() + + +@pytest.fixture() +def pot_request() -> PoTokenRequest: + return PoTokenRequest( + context=PoTokenContext.GVS, + innertube_context={'client': {'clientName': 'WEB'}}, + innertube_host='youtube.com', + session_index=None, + player_url=None, + is_authenticated=False, + video_webpage=None, + + visitor_data='example-visitor-data', + data_sync_id='example-data-sync-id', + video_id='example-video-id', + + request_cookiejar=YoutubeDLCookieJar(), + request_proxy=None, + request_headers=HTTPHeaderDict(), + request_timeout=None, + request_source_address=None, + request_verify_tls=True, + + bypass_cache=False, + ) diff --git a/test/test_pot/test_pot_builtin_memorycache.py b/test/test_pot/test_pot_builtin_memorycache.py new file mode 100644 index 0000000000..ea19fbe29f --- /dev/null +++ b/test/test_pot/test_pot_builtin_memorycache.py @@ -0,0 +1,117 @@ +import threading +import time +from collections import OrderedDict +import pytest +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider, BuiltinIEContentProvider +from yt_dlp.utils import bug_reports_message +from yt_dlp.extractor.youtube.pot._builtin.memory_cache import MemoryLRUPCP, memorylru_preference, initialize_global_cache +from yt_dlp.version import __version__ +from yt_dlp.extractor.youtube.pot._registry import _pot_cache_providers, _pot_memory_cache + + +class TestMemoryLRUPCS: + + def test_base_type(self): + assert issubclass(MemoryLRUPCP, IEContentProvider) + assert issubclass(MemoryLRUPCP, BuiltinIEContentProvider) + + @pytest.fixture + def pcp(self, ie, logger) -> MemoryLRUPCP: + return MemoryLRUPCP(ie, logger, {}, initialize_cache=lambda max_size: (OrderedDict(), threading.Lock(), max_size)) + + def test_is_registered(self): + assert _pot_cache_providers.value.get('MemoryLRU') == MemoryLRUPCP + + def test_initialization(self, pcp): + assert pcp.PROVIDER_NAME == 'memory' + assert pcp.PROVIDER_VERSION == __version__ + assert pcp.BUG_REPORT_MESSAGE == bug_reports_message(before='') + assert pcp.is_available() + + def test_store_and_get(self, pcp): + pcp.store('key1', 'value1', int(time.time()) + 60) + assert pcp.get('key1') == 'value1' + assert len(pcp.cache) == 1 + + def test_store_ignore_expired(self, pcp): + pcp.store('key1', 'value1', int(time.time()) - 1) + assert len(pcp.cache) == 0 + assert pcp.get('key1') is None + assert len(pcp.cache) == 0 + + def test_store_override_existing_key(self, ie, logger): + MAX_SIZE = 2 + pcp = MemoryLRUPCP(ie, logger, {}, initialize_cache=lambda max_size: (OrderedDict(), threading.Lock(), MAX_SIZE)) + pcp.store('key1', 'value1', int(time.time()) + 60) + pcp.store('key2', 'value2', int(time.time()) + 60) + assert len(pcp.cache) == 2 + pcp.store('key1', 'value2', int(time.time()) + 60) + # Ensure that the override key gets added to the end of the cache 
instead of in the same position + pcp.store('key3', 'value3', int(time.time()) + 60) + assert pcp.get('key1') == 'value2' + + def test_store_ignore_expired_existing_key(self, pcp): + pcp.store('key1', 'value2', int(time.time()) + 60) + pcp.store('key1', 'value1', int(time.time()) - 1) + assert len(pcp.cache) == 1 + assert pcp.get('key1') == 'value2' + assert len(pcp.cache) == 1 + + def test_get_key_expired(self, pcp): + pcp.store('key1', 'value1', int(time.time()) + 60) + assert pcp.get('key1') == 'value1' + assert len(pcp.cache) == 1 + pcp.cache['key1'] = ('value1', int(time.time()) - 1) + assert pcp.get('key1') is None + assert len(pcp.cache) == 0 + + def test_lru_eviction(self, ie, logger): + MAX_SIZE = 2 + provider = MemoryLRUPCP(ie, logger, {}, initialize_cache=lambda max_size: (OrderedDict(), threading.Lock(), MAX_SIZE)) + provider.store('key1', 'value1', int(time.time()) + 5) + provider.store('key2', 'value2', int(time.time()) + 5) + assert len(provider.cache) == 2 + + assert provider.get('key1') == 'value1' + + provider.store('key3', 'value3', int(time.time()) + 5) + assert len(provider.cache) == 2 + + assert provider.get('key2') is None + + provider.store('key4', 'value4', int(time.time()) + 5) + assert len(provider.cache) == 2 + + assert provider.get('key1') is None + assert provider.get('key3') == 'value3' + assert provider.get('key4') == 'value4' + + def test_delete(self, pcp): + pcp.store('key1', 'value1', int(time.time()) + 5) + assert len(pcp.cache) == 1 + assert pcp.get('key1') == 'value1' + pcp.delete('key1') + assert len(pcp.cache) == 0 + assert pcp.get('key1') is None + + def test_use_global_cache_default(self, ie, logger): + pcp = MemoryLRUPCP(ie, logger, {}) + assert pcp.max_size == _pot_memory_cache.value['max_size'] == 25 + assert pcp.cache is _pot_memory_cache.value['cache'] + assert pcp.lock is _pot_memory_cache.value['lock'] + + pcp2 = MemoryLRUPCP(ie, logger, {}) + assert pcp.max_size == pcp2.max_size == _pot_memory_cache.value['max_size'] == 25 + assert pcp.cache is pcp2.cache is _pot_memory_cache.value['cache'] + assert pcp.lock is pcp2.lock is _pot_memory_cache.value['lock'] + + def test_fail_max_size_change_global(self, ie, logger): + pcp = MemoryLRUPCP(ie, logger, {}) + assert pcp.max_size == _pot_memory_cache.value['max_size'] == 25 + with pytest.raises(ValueError, match='Cannot change max_size of initialized global memory cache'): + initialize_global_cache(50) + + assert pcp.max_size == _pot_memory_cache.value['max_size'] == 25 + + def test_memory_lru_preference(self, pcp, ie, pot_request): + assert memorylru_preference(pcp, pot_request) == 10000 diff --git a/test/test_pot/test_pot_builtin_utils.py b/test/test_pot/test_pot_builtin_utils.py new file mode 100644 index 0000000000..a95fc4e159 --- /dev/null +++ b/test/test_pot/test_pot_builtin_utils.py @@ -0,0 +1,47 @@ +import pytest +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenContext, + +) + +from yt_dlp.extractor.youtube.pot.utils import get_webpo_content_binding, ContentBindingType + + +class TestGetWebPoContentBinding: + + @pytest.mark.parametrize('client_name, context, is_authenticated, expected', [ + *[(client, context, is_authenticated, expected) for client in [ + 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER'] + for context, is_authenticated, expected in [ + (PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), + (PoTokenContext.PLAYER, False, ('example-video-id', ContentBindingType.VIDEO_ID)), + 
(PoTokenContext.SUBS, False, ('example-video-id', ContentBindingType.VIDEO_ID)), + (PoTokenContext.GVS, True, ('example-data-sync-id', ContentBindingType.DATASYNC_ID)), + ]], + ('WEB_REMIX', PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), + ('WEB_REMIX', PoTokenContext.PLAYER, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), + ('ANDROID', PoTokenContext.GVS, False, (None, None)), + ('IOS', PoTokenContext.GVS, False, (None, None)), + ]) + def test_get_webpo_content_binding(self, pot_request, client_name, context, is_authenticated, expected): + pot_request.innertube_context['client']['clientName'] = client_name + pot_request.context = context + pot_request.is_authenticated = is_authenticated + assert get_webpo_content_binding(pot_request) == expected + + def test_extract_visitor_id(self, pot_request): + pot_request.visitor_data = 'CgsxMjNhYmNYWVpfLSiA4s%2DqBg%3D%3D' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == ('123abcXYZ_-', ContentBindingType.VISITOR_ID) + + def test_invalid_visitor_id(self, pot_request): + # visitor id not alphanumeric (i.e. protobuf extraction failed) + pot_request.visitor_data = 'CggxMjM0NTY3OCiA4s-qBg%3D%3D' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) + + def test_no_visitor_id(self, pot_request): + pot_request.visitor_data = 'KIDiz6oG' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) + + def test_invalid_base64(self, pot_request): + pot_request.visitor_data = 'invalid-base64' + assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) diff --git a/test/test_pot/test_pot_builtin_webpospec.py b/test/test_pot/test_pot_builtin_webpospec.py new file mode 100644 index 0000000000..c5fb6f3820 --- /dev/null +++ b/test/test_pot/test_pot_builtin_webpospec.py @@ -0,0 +1,92 @@ +import pytest + +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider, BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot.cache import CacheProviderWritePolicy +from yt_dlp.utils import bug_reports_message +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + +) +from yt_dlp.version import __version__ + +from yt_dlp.extractor.youtube.pot._builtin.webpo_cachespec import WebPoPCSP +from yt_dlp.extractor.youtube.pot._registry import _pot_pcs_providers + + +@pytest.fixture() +def pot_request(pot_request) -> PoTokenRequest: + pot_request.visitor_data = 'CgsxMjNhYmNYWVpfLSiA4s%2DqBg%3D%3D' # visitor_id=123abcXYZ_- + return pot_request + + +class TestWebPoPCSP: + def test_base_type(self): + assert issubclass(WebPoPCSP, IEContentProvider) + assert issubclass(WebPoPCSP, BuiltinIEContentProvider) + + def test_init(self, ie, logger): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + assert pcs.PROVIDER_NAME == 'webpo' + assert pcs.PROVIDER_VERSION == __version__ + assert pcs.BUG_REPORT_MESSAGE == bug_reports_message(before='') + assert pcs.is_available() + + def test_is_registered(self): + assert _pot_pcs_providers.value.get('WebPo') == WebPoPCSP + + @pytest.mark.parametrize('client_name, context, is_authenticated', [ + ('ANDROID', PoTokenContext.GVS, False), + ('IOS', PoTokenContext.GVS, False), + ('IOS', PoTokenContext.PLAYER, False), + ]) + def test_not_supports(self, ie, logger, pot_request, client_name, 
context, is_authenticated): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + pot_request.innertube_context['client']['clientName'] = client_name + pot_request.context = context + pot_request.is_authenticated = is_authenticated + assert pcs.generate_cache_spec(pot_request) is None + + @pytest.mark.parametrize('client_name, context, is_authenticated, remote_host, source_address, request_proxy, expected', [ + *[(client, context, is_authenticated, remote_host, source_address, request_proxy, expected) for client in [ + 'WEB', 'MWEB', 'TVHTML5', 'WEB_EMBEDDED_PLAYER', 'WEB_CREATOR', 'TVHTML5_SIMPLY_EMBEDDED_PLAYER'] + for context, is_authenticated, remote_host, source_address, request_proxy, expected in [ + (PoTokenContext.GVS, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id'}), + (PoTokenContext.PLAYER, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'video_id'}), + (PoTokenContext.GVS, True, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': 'example-data-sync-id', 'cbt': 'datasync_id'}), + ]], + ('WEB_REMIX', PoTokenContext.PLAYER, False, 'example-remote-host', 'example-source-address', 'example-request-proxy', {'t': 'webpo', 'ip': 'example-remote-host', 'sa': 'example-source-address', 'px': 'example-request-proxy', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id'}), + ('WEB', PoTokenContext.GVS, False, None, None, None, {'t': 'webpo', 'cb': '123abcXYZ_-', 'cbt': 'visitor_id', 'ip': None, 'sa': None, 'px': None}), + ('TVHTML5', PoTokenContext.PLAYER, False, None, None, 'http://example.com', {'t': 'webpo', 'cb': '123abcXYZ_-', 'cbt': 'video_id', 'ip': None, 'sa': None, 'px': 'http://example.com'}), + + ]) + def test_generate_key_bindings(self, ie, logger, pot_request, client_name, context, is_authenticated, remote_host, source_address, request_proxy, expected): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + pot_request.innertube_context['client']['clientName'] = client_name + pot_request.context = context + pot_request.is_authenticated = is_authenticated + pot_request.innertube_context['client']['remoteHost'] = remote_host + pot_request.request_source_address = source_address + pot_request.request_proxy = request_proxy + pot_request.video_id = '123abcXYZ_-' # same as visitor id to test type + + assert pcs.generate_cache_spec(pot_request).key_bindings == expected + + def test_no_bind_visitor_id(self, ie, logger, pot_request): + # Should not bind to visitor id if setting is set to False + pcs = WebPoPCSP(ie=ie, logger=logger, settings={'bind_to_visitor_id': ['false']}) + pot_request.innertube_context['client']['clientName'] = 'WEB' + pot_request.context = PoTokenContext.GVS + pot_request.is_authenticated = False + assert pcs.generate_cache_spec(pot_request).key_bindings == {'t': 'webpo', 'ip': None, 'sa': None, 'px': None, 'cb': 'CgsxMjNhYmNYWVpfLSiA4s%2DqBg%3D%3D', 'cbt': 'visitor_data'} + + def test_default_ttl(self, ie, logger, pot_request): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + assert pcs.generate_cache_spec(pot_request).default_ttl == 6 * 60 * 60 # should default to 6 hours + + def 
test_write_policy(self, ie, logger, pot_request): + pcs = WebPoPCSP(ie=ie, logger=logger, settings={}) + pot_request.context = PoTokenContext.GVS + assert pcs.generate_cache_spec(pot_request).write_policy == CacheProviderWritePolicy.WRITE_ALL + pot_request.context = PoTokenContext.PLAYER + assert pcs.generate_cache_spec(pot_request).write_policy == CacheProviderWritePolicy.WRITE_FIRST diff --git a/test/test_pot/test_pot_director.py b/test/test_pot/test_pot_director.py new file mode 100644 index 0000000000..bbfdd0e98e --- /dev/null +++ b/test/test_pot/test_pot_director.py @@ -0,0 +1,1529 @@ +from __future__ import annotations +import abc +import base64 +import dataclasses +import hashlib +import json +import time +import pytest + +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider, IEContentProvider + +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + PoTokenProviderError, + PoTokenProviderRejectedRequest, +) +from yt_dlp.extractor.youtube.pot._director import ( + PoTokenCache, + validate_cache_spec, + clean_pot, + validate_response, + PoTokenRequestDirector, + provider_display_list, +) + +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + PoTokenCacheProvider, + CacheProviderWritePolicy, + PoTokenCacheProviderError, +) + + +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenResponse, + PoTokenProvider, +) + + +class BaseMockPoTokenProvider(PoTokenProvider, abc.ABC): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.available_called_times = 0 + self.request_called_times = 0 + self.close_called = False + + def is_available(self) -> bool: + self.available_called_times += 1 + return True + + def request_pot(self, *args, **kwargs): + self.request_called_times += 1 + return super().request_pot(*args, **kwargs) + + def close(self): + self.close_called = True + super().close() + + +class ExamplePTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + _SUPPORTED_CLIENTS = ('WEB',) + _SUPPORTED_CONTEXTS = (PoTokenContext.GVS, ) + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + if request.data_sync_id == 'example': + return PoTokenResponse(request.video_id) + return PoTokenResponse(EXAMPLE_PO_TOKEN) + + +def success_ptp(response: PoTokenResponse | None = None, key: str | None = None): + class SuccessPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'success' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://success.example.com/issues' + + _SUPPORTED_CLIENTS = ('WEB',) + _SUPPORTED_CONTEXTS = (PoTokenContext.GVS,) + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + return response or PoTokenResponse(EXAMPLE_PO_TOKEN) + + if key: + SuccessPTP.PROVIDER_KEY = key + return SuccessPTP + + +@pytest.fixture +def pot_provider(ie, logger): + return success_ptp()(ie=ie, logger=logger, settings={}) + + +class UnavailablePTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'unavailable' + BUG_REPORT_LOCATION = 'https://unavailable.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def is_available(self) -> bool: + super().is_available() + return False + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderError('something went wrong') + + +class UnsupportedPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'unsupported' + BUG_REPORT_LOCATION = 
'https://unsupported.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('unsupported request') + + +class ErrorPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'error' + BUG_REPORT_LOCATION = 'https://error.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + expected = request.video_id == 'expected' + raise PoTokenProviderError('an error occurred', expected=expected) + + +class UnexpectedErrorPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'unexpected_error' + BUG_REPORT_LOCATION = 'https://unexpected.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise ValueError('an unexpected error occurred') + + +class InvalidPTP(BaseMockPoTokenProvider): + PROVIDER_NAME = 'invalid' + BUG_REPORT_LOCATION = 'https://invalid.example.com/issues' + _SUPPORTED_CLIENTS = None + _SUPPORTED_CONTEXTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + if request.video_id == 'invalid_type': + return 'invalid-response' + else: + return PoTokenResponse('example-token?', expires_at='123') + + +class BaseMockCacheSpecProvider(PoTokenCacheSpecProvider, abc.ABC): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.generate_called_times = 0 + self.is_available_called_times = 0 + self.close_called = False + + def is_available(self) -> bool: + self.is_available_called_times += 1 + return super().is_available() + + def generate_cache_spec(self, request: PoTokenRequest): + self.generate_called_times += 1 + + def close(self): + self.close_called = True + super().close() + + +class ExampleCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=60, + ) + + +class UnavailableCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'unavailable' + PROVIDER_VERSION = '0.0.1' + + def is_available(self) -> bool: + super().is_available() + return False + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return None + + +class UnsupportedCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'unsupported' + PROVIDER_VERSION = '0.0.1' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return None + + +class InvalidSpecCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'invalid' + PROVIDER_VERSION = '0.0.1' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return 'invalid-spec' + + +class ErrorSpecCacheSpecProviderPCSP(BaseMockCacheSpecProvider): + + PROVIDER_NAME = 'invalid' + PROVIDER_VERSION = '0.0.1' + + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + raise ValueError('something went wrong') + + +class BaseMockCacheProvider(PoTokenCacheProvider, abc.ABC): + BUG_REPORT_MESSAGE = 'example bug report message' + + def __init__(self, *args, available=True, **kwargs): + super().__init__(*args, 
**kwargs) + self.store_calls = 0 + self.delete_calls = 0 + self.get_calls = 0 + self.available_called_times = 0 + self.available = available + + def is_available(self) -> bool: + self.available_called_times += 1 + return self.available + + def store(self, *args, **kwargs): + self.store_calls += 1 + + def delete(self, *args, **kwargs): + self.delete_calls += 1 + + def get(self, *args, **kwargs): + self.get_calls += 1 + + def close(self): + self.close_called = True + super().close() + + +class ErrorPCP(BaseMockCacheProvider): + PROVIDER_NAME = 'error' + + def store(self, *args, **kwargs): + super().store(*args, **kwargs) + raise PoTokenCacheProviderError('something went wrong') + + def get(self, *args, **kwargs): + super().get(*args, **kwargs) + raise PoTokenCacheProviderError('something went wrong') + + +class UnexpectedErrorPCP(BaseMockCacheProvider): + PROVIDER_NAME = 'unexpected_error' + + def store(self, *args, **kwargs): + super().store(*args, **kwargs) + raise ValueError('something went wrong') + + def get(self, *args, **kwargs): + super().get(*args, **kwargs) + raise ValueError('something went wrong') + + +class MockMemoryPCP(BaseMockCacheProvider): + PROVIDER_NAME = 'memory' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cache = {} + + def store(self, key, value, expires_at): + super().store(key, value, expires_at) + self.cache[key] = (value, expires_at) + + def delete(self, key): + super().delete(key) + self.cache.pop(key, None) + + def get(self, key): + super().get(key) + return self.cache.get(key, [None])[0] + + +def create_memory_pcp(ie, logger, provider_key='memory', provider_name='memory', available=True): + cache = MockMemoryPCP(ie, logger, {}, available=available) + cache.PROVIDER_KEY = provider_key + cache.PROVIDER_NAME = provider_name + return cache + + +@pytest.fixture +def memorypcp(ie, logger) -> MockMemoryPCP: + return create_memory_pcp(ie, logger) + + +@pytest.fixture +def pot_cache(ie, logger): + class MockPoTokenCache(PoTokenCache): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.get_calls = 0 + self.store_calls = 0 + self.close_called = False + + def get(self, *args, **kwargs): + self.get_calls += 1 + return super().get(*args, **kwargs) + + def store(self, *args, **kwargs): + self.store_calls += 1 + return super().store(*args, **kwargs) + + def close(self): + self.close_called = True + super().close() + + return MockPoTokenCache( + cache_providers=[MockMemoryPCP(ie, logger, {})], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie, logger, settings={})], + logger=logger, + ) + + +EXAMPLE_PO_TOKEN = base64.urlsafe_b64encode(b'example-token').decode() + + +class TestPoTokenCache: + + def test_cache_success(self, memorypcp, pot_request, ie, logger): + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + + cached_response = cache.get(pot_request) + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert cache.get(dataclasses.replace(pot_request, video_id='another-video-id')) is None + + def test_unsupported_cache_spec_no_fallback(self, memorypcp, pot_request, ie, logger): + unsupported_provider = UnsupportedCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + 
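# With no usable cache spec for the request, PoTokenCache should silently no-op: store() writes nothing, get() returns None, and nothing is logged at error level + 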
cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unsupported_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + assert cache.get(pot_request) is None + assert unsupported_provider.generate_called_times == 1 + cache.store(pot_request, response) + assert len(memorypcp.cache) == 0 + assert unsupported_provider.generate_called_times == 2 + assert cache.get(pot_request) is None + assert unsupported_provider.generate_called_times == 3 + assert len(logger.messages.get('error', [])) == 0 + + def test_unsupported_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + unsupported_provider = UnsupportedCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unsupported_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert unsupported_provider.generate_called_times == 1 + assert example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert unsupported_provider.generate_called_times == 2 + assert example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert unsupported_provider.generate_called_times == 3 + assert example_provider.generate_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert len(logger.messages.get('error', [])) == 0 + + def test_invalid_cache_spec_no_fallback(self, memorypcp, pot_request, ie, logger): + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[InvalidSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + + assert cache.get(pot_request) is None + + assert 'PoTokenCacheSpecProvider "InvalidSpecCacheSpecProvider" generate_cache_spec() returned invalid spec invalid-spec; please report this issue to the provider developer at (developer has not provided a bug report location) .' 
in logger.messages['error'] + + def test_invalid_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + + invalid_provider = InvalidSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[invalid_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert invalid_provider.generate_called_times == example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert invalid_provider.generate_called_times == example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert invalid_provider.generate_called_times == example_provider.generate_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert 'PoTokenCacheSpecProvider "InvalidSpecCacheSpecProvider" generate_cache_spec() returned invalid spec invalid-spec; please report this issue to the provider developer at (developer has not provided a bug report location) .' in logger.messages['error'] + + def test_unavailable_cache_spec_no_fallback(self, memorypcp, pot_request, ie, logger): + unavailable_provider = UnavailableCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unavailable_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert unavailable_provider.generate_called_times == 0 + + def test_unavailable_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + unavailable_provider = UnavailableCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[unavailable_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert unavailable_provider.generate_called_times == 0 + assert unavailable_provider.is_available_called_times == 1 + assert example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert unavailable_provider.generate_called_times == 0 + assert unavailable_provider.is_available_called_times == 2 + assert example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert unavailable_provider.generate_called_times == 0 + assert unavailable_provider.is_available_called_times == 3 + assert example_provider.generate_called_times == 3 + assert example_provider.is_available_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + def test_unexpected_error_cache_spec(self, memorypcp, pot_request, ie, logger): + error_provider = ErrorSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[error_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert cache.get(pot_request) 
is None + assert error_provider.generate_called_times == 3 + assert error_provider.is_available_called_times == 3 + + assert 'Error occurred with "invalid" PO Token cache spec provider: ValueError(\'something went wrong\'); please report this issue to the provider developer at (developer has not provided a bug report location) .' in logger.messages['error'] + + def test_unexpected_error_cache_spec_fallback(self, memorypcp, pot_request, ie, logger): + error_provider = ErrorSpecCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[error_provider, example_provider], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + assert cache.get(pot_request) is None + assert error_provider.generate_called_times == 1 + assert error_provider.is_available_called_times == 1 + assert example_provider.generate_called_times == 1 + + cache.store(pot_request, response) + assert error_provider.generate_called_times == 2 + assert error_provider.is_available_called_times == 2 + assert example_provider.generate_called_times == 2 + + cached_response = cache.get(pot_request) + assert error_provider.generate_called_times == 3 + assert error_provider.is_available_called_times == 3 + assert example_provider.generate_called_times == 3 + assert example_provider.is_available_called_times == 3 + assert cached_response is not None + assert cached_response.po_token == EXAMPLE_PO_TOKEN + assert cached_response.expires_at is not None + + assert 'Error occurred with "invalid" PO Token cache spec provider: ValueError(\'something went wrong\'); please report this issue to the provider developer at (developer has not provided a bug report location) .' 
in logger.messages['error'] + + def test_key_bindings_spec_provider(self, memorypcp, pot_request, ie, logger): + + class ExampleProviderPCSP(PoTokenCacheSpecProvider): + PROVIDER_NAME = 'example' + + def generate_cache_spec(self, request: PoTokenRequest): + return PoTokenCacheSpec( + key_bindings={'v': request.video_id}, + default_ttl=60, + ) + + class ExampleProviderTwoPCSP(ExampleProviderPCSP): + pass + + example_provider = ExampleProviderPCSP(ie=ie, logger=logger, settings={}) + example_provider_two = ExampleProviderTwoPCSP(ie=ie, logger=logger, settings={}) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[example_provider], + logger=logger, + ) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert len(memorypcp.cache) == 1 + assert hashlib.sha256( + f"{{'_dlp_cache': 'v1', '_p': 'ExampleProvider', 'v': '{pot_request.video_id}'}}".encode()).hexdigest() in memorypcp.cache + + # The second spec provider returns the exact same key bindings as the first one, + # however the PoTokenCache should use the provider key to differentiate between them + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[example_provider_two], + logger=logger, + ) + + assert cache.get(pot_request) is None + cache.store(pot_request, response) + assert len(memorypcp.cache) == 2 + assert hashlib.sha256( + f"{{'_dlp_cache': 'v1', '_p': 'ExampleProviderTwo', 'v': '{pot_request.video_id}'}}".encode()).hexdigest() in memorypcp.cache + + def test_cache_provider_preferences(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert len(pcp_one.cache) == 1 + assert len(pcp_two.cache) == 0 + + assert cache.get(pot_request) + assert pcp_one.get_calls == 1 + assert pcp_two.get_calls == 0 + + standard_preference_called = False + pcp_one_preference_called = False + + def standard_preference(provider, request, *_, **__): + nonlocal standard_preference_called + standard_preference_called = True + assert isinstance(provider, PoTokenCacheProvider) + assert isinstance(request, PoTokenRequest) + return 1 + + def pcp_one_preference(provider, request, *_, **__): + nonlocal pcp_one_preference_called + pcp_one_preference_called = True + assert isinstance(provider, PoTokenCacheProvider) + assert isinstance(request, PoTokenRequest) + if provider.PROVIDER_KEY == pcp_one.PROVIDER_KEY: + return -100 + return 0 + + # test that it can handle multiple preferences + cache.cache_provider_preferences.append(standard_preference) + cache.cache_provider_preferences.append(pcp_one_preference) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert cache.get(pot_request) + assert len(pcp_one.cache) == len(pcp_two.cache) == 1 + assert pcp_two.get_calls == pcp_one.get_calls == 1 + assert pcp_one.store_calls == pcp_two.store_calls == 1 + assert standard_preference_called + assert pcp_one_preference_called + + def test_secondary_cache_provider_hit(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, 
provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + # Given the lower priority provider has the cache hit, store the response in the higher priority provider + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + assert cache.get(pot_request) + + cache.cache_providers[pcp_one.PROVIDER_KEY] = pcp_one + + def pcp_one_pref(provider, *_, **__): + if provider.PROVIDER_KEY == pcp_one.PROVIDER_KEY: + return 1 + return -1 + + cache.cache_provider_preferences.append(pcp_one_pref) + + assert cache.get(pot_request) + assert pcp_one.get_calls == 1 + assert pcp_two.get_calls == 2 + # Should write back to pcp_one (now the highest priority cache provider) + assert pcp_one.store_calls == pcp_two.store_calls == 1 + assert 'Writing PO Token response to highest priority cache provider' in logger.messages['trace'] + + def test_cache_provider_no_hits(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + assert cache.get(pot_request) is None + assert pcp_one.get_calls == pcp_two.get_calls == 1 + + def test_get_invalid_po_token_response(self, pot_request, ie, logger): + # Test various scenarios where the po token response stored in the cache provider is invalid + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + valid_response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, valid_response) + assert len(pcp_one.cache) == len(pcp_two.cache) == 1 + # Overwrite the valid response with an invalid one in the cache + pcp_one.store(next(iter(pcp_one.cache.keys())), json.dumps(dataclasses.asdict(PoTokenResponse(None))), int(time.time() + 1000)) + assert cache.get(pot_request).po_token == valid_response.po_token + assert pcp_one.get_calls == pcp_two.get_calls == 1 + assert pcp_one.delete_calls == 1 # Invalid response should be deleted from cache + assert pcp_one.store_calls == 3 # Since response was fetched from second cache provider, it should be stored in the first one + assert len(pcp_one.cache) == 1 + assert 'Invalid PO Token response retrieved from cache provider "memory": {"po_token": null, "expires_at": null}; example bug report message' in logger.messages['error'] + + # Overwrite the valid response with an invalid json in the cache + pcp_one.store(next(iter(pcp_one.cache.keys())), 'invalid-json', int(time.time() + 1000)) + assert cache.get(pot_request).po_token == valid_response.po_token + assert pcp_one.get_calls == pcp_two.get_calls == 2 + assert pcp_one.delete_calls == 2 + assert pcp_one.store_calls == 5 # 3 + 1 store we made in the test + 1 store from lower priority cache provider + assert len(pcp_one.cache) == 1 + + assert 'Invalid PO Token response retrieved from cache provider "memory": invalid-json; example bug report message' in logger.messages['error'] + + # Valid json, but missing required fields 
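+ # PoTokenCache should likewise delete this entry and fall back to the valid copy held by pcp_two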
+ pcp_one.store(next(iter(pcp_one.cache.keys())), '{"unknown_param": 0}', int(time.time() + 1000)) + assert cache.get(pot_request).po_token == valid_response.po_token + assert pcp_one.get_calls == pcp_two.get_calls == 3 + assert pcp_one.delete_calls == 3 + assert pcp_one.store_calls == 7 # 5 + 1 store from test + 1 store from lower priority cache provider + assert len(pcp_one.cache) == 1 + + assert 'Invalid PO Token response retrieved from cache provider "memory": {"unknown_param": 0}; example bug report message' in logger.messages['error'] + + def test_store_invalid_po_token_response(self, pot_request, ie, logger): + # Should not store an invalid po token response + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + + cache = PoTokenCache( + cache_providers=[pcp_one], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(po_token=EXAMPLE_PO_TOKEN, expires_at=80)) + assert cache.get(pot_request) is None + assert pcp_one.store_calls == 0 + assert 'Invalid PO Token response provided to PoTokenCache.store()' in logger.messages['error'][0] + + def test_store_write_policy(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert pcp_one.store_calls == 1 + assert pcp_two.store_calls == 0 + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN), write_policy=CacheProviderWritePolicy.WRITE_ALL) + assert pcp_one.store_calls == 2 + assert pcp_two.store_calls == 1 + + def test_store_write_first_policy_cache_spec(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + class WriteFirstPCSP(BaseMockCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=60, + write_policy=CacheProviderWritePolicy.WRITE_FIRST, + ) + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[WriteFirstPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + assert pcp_one.store_calls == 1 + assert pcp_two.store_calls == 0 + + def test_store_write_all_policy_cache_spec(self, pot_request, ie, logger): + pcp_one = create_memory_pcp(ie, logger, provider_key='memory_pcp_one') + pcp_two = create_memory_pcp(ie, logger, provider_key='memory_pcp_two') + + class WriteAllPCSP(BaseMockCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=60, + write_policy=CacheProviderWritePolicy.WRITE_ALL, + ) + + cache = PoTokenCache( + cache_providers=[pcp_one, pcp_two], + cache_spec_providers=[WriteAllPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + assert pcp_one.store_calls == 1 + assert 
pcp_two.store_calls == 1 + + def test_expires_at_pot_response(self, pot_request, memorypcp, ie, logger): + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=10000000000) + cache.store(pot_request, response) + assert next(iter(memorypcp.cache.values()))[1] == 10000000000 + + def test_expires_at_default_spec(self, pot_request, memorypcp, ie, logger): + + class TtlPCSP(BaseMockCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + super().generate_cache_spec(request) + return PoTokenCacheSpec( + key_bindings={'v': request.video_id, 'e': None}, + default_ttl=10000000000, + ) + + cache = PoTokenCache( + cache_providers=[memorypcp], + cache_spec_providers=[TtlPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert next(iter(memorypcp.cache.values()))[1] >= 10000000000 + + def test_cache_provider_error_no_fallback(self, pot_request, ie, logger): + error_pcp = ErrorPCP(ie, logger, {}) + cache = PoTokenCache( + cache_providers=[error_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 1 + + assert logger.messages['warning'].count("Error from \"error\" PO Token cache provider: PoTokenCacheProviderError('something went wrong'); example bug report message") == 2 + + def test_cache_provider_error_fallback(self, pot_request, ie, logger): + error_pcp = ErrorPCP(ie, logger, {}) + memory_pcp = create_memory_pcp(ie, logger, provider_key='memory') + + cache = PoTokenCache( + cache_providers=[error_pcp, memory_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + + # 1. Store fails for error_pcp, stored in memory_pcp + # 2. Get fails for error_pcp, fetched from memory_pcp + # 3. Since fetched from lower priority, it should be stored in the highest priority cache provider + # 4. Store fails in error_pcp. 
Since write policy is WRITE_FIRST, it should not try to store in memory_pcp regardless of whether the store in error_pcp fails + + assert cache.get(pot_request) + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 2 # since highest priority, when fetched from lower priority, it should be stored in the highest priority cache provider + assert memory_pcp.get_calls == 1 + assert memory_pcp.store_calls == 1 + + assert logger.messages['warning'].count("Error from \"error\" PO Token cache provider: PoTokenCacheProviderError('something went wrong'); example bug report message") == 3 + + def test_cache_provider_unexpected_error_no_fallback(self, pot_request, ie, logger): + error_pcp = UnexpectedErrorPCP(ie, logger, {}) + cache = PoTokenCache( + cache_providers=[error_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 1 + + assert logger.messages['error'].count("Error occurred with \"unexpected_error\" PO Token cache provider: ValueError('something went wrong'); example bug report message") == 2 + + def test_cache_provider_unexpected_error_fallback(self, pot_request, ie, logger): + error_pcp = UnexpectedErrorPCP(ie, logger, {}) + memory_pcp = create_memory_pcp(ie, logger, provider_key='memory') + + cache = PoTokenCache( + cache_providers=[error_pcp, memory_pcp], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + + # 1. Store fails for error_pcp, stored in memory_pcp + # 2. Get fails for error_pcp, fetched from memory_pcp + # 3. Since fetched from lower priority, it should be stored in the highest priority cache provider + # 4. Store fails in error_pcp. 
Since write policy is WRITE_FIRST, it should not try to store in memory_pcp regardless of whether the store in error_pcp fails + + assert cache.get(pot_request) + assert error_pcp.get_calls == 1 + assert error_pcp.store_calls == 2 # since highest priority, when fetched from lower priority, it should be stored in the highest priority cache provider + assert memory_pcp.get_calls == 1 + assert memory_pcp.store_calls == 1 + + assert logger.messages['error'].count("Error occurred with \"unexpected_error\" PO Token cache provider: ValueError('something went wrong'); example bug report message") == 3 + + def test_cache_provider_unavailable_no_fallback(self, pot_request, ie, logger): + provider = create_memory_pcp(ie, logger, available=False) + + cache = PoTokenCache( + cache_providers=[provider], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is None + assert provider.get_calls == 0 + assert provider.store_calls == 0 + assert provider.available_called_times + + def test_cache_provider_unavailable_fallback(self, pot_request, ie, logger): + provider_unavailable = create_memory_pcp(ie, logger, provider_key='unavailable', provider_name='unavailable', available=False) + provider_available = create_memory_pcp(ie, logger, provider_key='available', provider_name='available') + + cache = PoTokenCache( + cache_providers=[provider_unavailable, provider_available], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response) + assert cache.get(pot_request) is not None + assert provider_unavailable.get_calls == 0 + assert provider_unavailable.store_calls == 0 + assert provider_available.get_calls == 1 + assert provider_available.store_calls == 1 + assert provider_unavailable.available_called_times + assert provider_available.available_called_times + + # should not even try to use the provider for the request + assert 'Attempting to fetch a PO Token response from "unavailable" provider' not in logger.messages['trace'] + assert 'Attempting to fetch a PO Token response from "available" provider' not in logger.messages['trace'] + + def test_available_not_called(self, ie, pot_request, logger): + # Test that the available method is not called when provider higher in the list is available + provider_unavailable = create_memory_pcp( + ie, logger, provider_key='unavailable', provider_name='unavailable', available=False) + provider_available = create_memory_pcp(ie, logger, provider_key='available', provider_name='available') + + logger.log_level = logger.LogLevel.INFO + + cache = PoTokenCache( + cache_providers=[provider_available, provider_unavailable], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response, write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert cache.get(pot_request) is not None + assert provider_unavailable.get_calls == 0 + assert provider_unavailable.store_calls == 0 + assert provider_available.get_calls == 1 + assert provider_available.store_calls == 1 + assert provider_unavailable.available_called_times == 0 + assert provider_available.available_called_times + assert 'PO Token Cache Providers: available-0.0.0 (external), unavailable-0.0.0 
(external, unavailable)' not in logger.messages.get('trace', []) + + def test_available_called_trace(self, ie, pot_request, logger): + # But if logging level is trace should call available (as part of debug logging) + provider_unavailable = create_memory_pcp( + ie, logger, provider_key='unavailable', provider_name='unavailable', available=False) + provider_available = create_memory_pcp(ie, logger, provider_key='available', provider_name='available') + + logger.log_level = logger.LogLevel.TRACE + + cache = PoTokenCache( + cache_providers=[provider_available, provider_unavailable], + cache_spec_providers=[ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={})], + logger=logger, + ) + + response = PoTokenResponse(EXAMPLE_PO_TOKEN) + cache.store(pot_request, response, write_policy=CacheProviderWritePolicy.WRITE_FIRST) + assert cache.get(pot_request) is not None + assert provider_unavailable.get_calls == 0 + assert provider_unavailable.store_calls == 0 + assert provider_available.get_calls == 1 + assert provider_available.store_calls == 1 + assert provider_unavailable.available_called_times + assert provider_available.available_called_times + assert 'PO Token Cache Providers: available-0.0.0 (external), unavailable-0.0.0 (external, unavailable)' in logger.messages.get('trace', []) + + def test_close(self, ie, pot_request, logger): + # Should call close on the cache providers and cache specs + memory_pcp = create_memory_pcp(ie, logger, provider_key='memory') + memory2_pcp = create_memory_pcp(ie, logger, provider_key='memory2') + + spec1 = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + spec2 = UnavailableCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + + cache = PoTokenCache( + cache_providers=[memory2_pcp, memory_pcp], + cache_spec_providers=[spec1, spec2], + logger=logger, + ) + + cache.close() + assert memory_pcp.close_called + assert memory2_pcp.close_called + assert spec1.close_called + assert spec2.close_called + + +class TestPoTokenRequestDirector: + + def test_request_pot_success(self, ie, pot_request, pot_cache, pot_provider, logger): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + + def test_request_and_cache(self, ie, pot_request, pot_cache, pot_provider, logger): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 1 + assert pot_cache.get_calls == 1 + assert pot_cache.store_calls == 1 + + # Second request, should be cached + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.get_calls == 2 + assert pot_cache.store_calls == 1 + assert pot_provider.request_called_times == 1 + + def test_bypass_cache(self, ie, pot_request, pot_cache, logger, pot_provider): + pot_request.bypass_cache = True + + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 1 + assert pot_cache.get_calls == 0 + assert pot_cache.store_calls == 1 + + # Second request, should not get from cache + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert 
pot_provider.request_called_times == 2 + assert pot_cache.get_calls == 0 + assert pot_cache.store_calls == 2 + + # POT is still cached, should get from cache + pot_request.bypass_cache = False + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_provider.request_called_times == 2 + assert pot_cache.get_calls == 1 + assert pot_cache.store_calls == 2 + + def test_clean_pot_generate(self, ie, pot_request, pot_cache, logger): + # Token should be cleaned before returning + base_token = base64.urlsafe_b64encode(b'token').decode() + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(base_token + '?extra=params'))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == base_token + assert provider.request_called_times == 1 + + # Confirm the cleaned version was stored in the cache + cached_token = pot_cache.get(pot_request) + assert cached_token.po_token == base_token + + def test_clean_pot_cache(self, ie, pot_request, pot_cache, logger, pot_provider): + # Token retrieved from cache should be cleaned before returning + base_token = base64.urlsafe_b64encode(b'token').decode() + pot_cache.store(pot_request, PoTokenResponse(base_token + '?extra=params')) + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == base_token + assert pot_cache.get_calls == 1 + assert pot_provider.request_called_times == 0 + + def test_cache_expires_at_none(self, ie, pot_request, pot_cache, logger, pot_provider): + # Should cache if expires_at=None in the response + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=None))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.store_calls == 1 + assert pot_cache.get(pot_request).po_token == EXAMPLE_PO_TOKEN + + def test_cache_expires_at_positive(self, ie, pot_request, pot_cache, logger, pot_provider): + # Should cache if expires_at is a positive number in the response + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=99999999999))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.store_calls == 1 + assert pot_cache.get(pot_request).po_token == EXAMPLE_PO_TOKEN + + @pytest.mark.parametrize('expires_at', [0, -1]) + def test_not_cache_expires_at(self, ie, pot_request, pot_cache, logger, pot_provider, expires_at): + # Should not cache if expires_at <= 0 in the response + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = success_ptp(PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=expires_at))(ie, logger, settings={}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert pot_cache.store_calls == 0 + assert pot_cache.get(pot_request) is None + + def test_no_providers(self, ie, pot_request, pot_cache, logger): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + response = director.get_po_token(pot_request) + assert response is None + + def 
test_try_cache_no_providers(self, ie, pot_request, pot_cache, logger): + # Should still try the cache even if no providers are configured + pot_cache.store(pot_request, PoTokenResponse(EXAMPLE_PO_TOKEN)) + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + + def test_close(self, ie, pot_request, pot_cache, pot_provider, logger): + # Should call close on the pot cache and any providers + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + + provider2 = UnavailablePTP(ie, logger, {}) + director.register_provider(pot_provider) + director.register_provider(provider2) + + director.close() + assert pot_provider.close_called + assert provider2.close_called + assert pot_cache.close_called + + def test_pot_provider_preferences(self, pot_request, pot_cache, ie, logger): + pot_request.bypass_cache = True + provider_two_pot = base64.urlsafe_b64encode(b'token2').decode() + + example_provider = success_ptp(response=PoTokenResponse(EXAMPLE_PO_TOKEN), key='exampleone')(ie, logger, settings={}) + example_provider_two = success_ptp(response=PoTokenResponse(provider_two_pot), key='exampletwo')(ie, logger, settings={}) + + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + director.register_provider(example_provider) + director.register_provider(example_provider_two) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert example_provider.request_called_times == 1 + assert example_provider_two.request_called_times == 0 + + standard_preference_called = False + example_preference_called = False + + # Test that the provider preferences are respected + def standard_preference(provider, request, *_, **__): + nonlocal standard_preference_called + standard_preference_called = True + assert isinstance(provider, PoTokenProvider) + assert isinstance(request, PoTokenRequest) + return 1 + + def example_preference(provider, request, *_, **__): + nonlocal example_preference_called + example_preference_called = True + assert isinstance(provider, PoTokenProvider) + assert isinstance(request, PoTokenRequest) + if provider.PROVIDER_KEY == example_provider.PROVIDER_KEY: + return -100 + return 0 + + # test that it can handle multiple preferences + director.register_preference(example_preference) + director.register_preference(standard_preference) + + response = director.get_po_token(pot_request) + assert response == provider_two_pot + assert example_provider.request_called_times == 1 + assert example_provider_two.request_called_times == 1 + assert standard_preference_called + assert example_preference_called + + def test_unsupported_request_no_fallback(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnsupportedPTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + + def test_unsupported_request_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one does not support the request + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnsupportedPTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert 
provider.request_called_times == 1 + assert pot_provider.request_called_times == 1 + assert 'PO Token Provider "unsupported" rejected this request, trying next available provider. Reason: unsupported request' in logger.messages['trace'] + + def test_unavailable_request_no_fallback(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 0 + assert provider.available_called_times + + def test_unavailable_request_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one is unavailable + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 0 + assert provider.available_called_times + assert pot_provider.request_called_times == 1 + assert pot_provider.available_called_times + # should not even try to use the provider for the request + assert 'Attempting to fetch a PO Token from "unavailable" provider' not in logger.messages['trace'] + assert 'Attempting to fetch a PO Token from "success" provider' in logger.messages['trace'] + + def test_available_not_called(self, ie, logger, pot_cache, pot_request, pot_provider): + # Test that the available method is not called when provider higher in the list is available + logger.log_level = logger.LogLevel.INFO + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(pot_provider) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 0 + assert provider.available_called_times == 0 + assert pot_provider.request_called_times == 1 + assert pot_provider.available_called_times == 2 + assert 'PO Token Providers: success-0.0.1 (external), unavailable-0.0.0 (external, unavailable)' not in logger.messages.get('trace', []) + + def test_available_called_trace(self, ie, logger, pot_cache, pot_request, pot_provider): + # But if logging level is trace should call available (as part of debug logging) + logger.log_level = logger.LogLevel.TRACE + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnavailablePTP(ie, logger, {}) + director.register_provider(pot_provider) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 0 + assert provider.available_called_times == 1 + assert pot_provider.request_called_times == 1 + assert pot_provider.available_called_times == 3 + assert 'PO Token Providers: success-0.0.1 (external), unavailable-0.0.0 (external, unavailable)' in logger.messages['trace'] + + def test_provider_error_no_fallback_unexpected(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = ErrorPTP(ie, logger, {}) + director.register_provider(provider) + pot_request.video_id = 'unexpected' + response = director.get_po_token(pot_request) + assert response is None + assert 
provider.request_called_times == 1 + assert "Error fetching PO Token from \"error\" provider: PoTokenProviderError('an error occurred'); please report this issue to the provider developer at https://error.example.com/issues ." in logger.messages['warning'] + + def test_provider_error_no_fallback_expected(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = ErrorPTP(ie, logger, {}) + director.register_provider(provider) + pot_request.video_id = 'expected' + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Error fetching PO Token from \"error\" provider: PoTokenProviderError('an error occurred')" in logger.messages['warning'] + + def test_provider_error_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one raises an error + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = ErrorPTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 1 + assert pot_provider.request_called_times == 1 + assert "Error fetching PO Token from \"error\" provider: PoTokenProviderError('an error occurred'); please report this issue to the provider developer at https://error.example.com/issues ." in logger.messages['warning'] + + def test_provider_unexpected_error_no_fallback(self, ie, logger, pot_cache, pot_request): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnexpectedErrorPTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Unexpected error when fetching PO Token from \"unexpected_error\" provider: ValueError('an unexpected error occurred'); please report this issue to the provider developer at https://unexpected.example.com/issues ." in logger.messages['error'] + + def test_provider_unexpected_error_fallback(self, ie, logger, pot_cache, pot_request, pot_provider): + # Should fallback to the next provider if the first one raises an unexpected error + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = UnexpectedErrorPTP(ie, logger, {}) + director.register_provider(provider) + director.register_provider(pot_provider) + + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 1 + assert pot_provider.request_called_times == 1 + assert "Unexpected error when fetching PO Token from \"unexpected_error\" provider: ValueError('an unexpected error occurred'); please report this issue to the provider developer at https://unexpected.example.com/issues ." 
in logger.messages['error'] + + def test_invalid_po_token_response_type(self, ie, logger, pot_cache, pot_request, pot_provider): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = InvalidPTP(ie, logger, {}) + director.register_provider(provider) + + pot_request.video_id = 'invalid_type' + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert 'Invalid PO Token response received from "invalid" provider: invalid-response; please report this issue to the provider developer at https://invalid.example.com/issues .' in logger.messages['error'] + + # Should fallback to next available provider + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 2 + assert pot_provider.request_called_times == 1 + + def test_invalid_po_token_response(self, ie, logger, pot_cache, pot_request, pot_provider): + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + provider = InvalidPTP(ie, logger, {}) + director.register_provider(provider) + + response = director.get_po_token(pot_request) + assert response is None + assert provider.request_called_times == 1 + assert "Invalid PO Token response received from \"invalid\" provider: PoTokenResponse(po_token='example-token?', expires_at='123'); please report this issue to the provider developer at https://invalid.example.com/issues ." in logger.messages['error'] + + # Should fallback to next available provider + director.register_provider(pot_provider) + response = director.get_po_token(pot_request) + assert response == EXAMPLE_PO_TOKEN + assert provider.request_called_times == 2 + assert pot_provider.request_called_times == 1 + + def test_copy_request_provider(self, ie, logger, pot_cache, pot_request): + + class BadProviderPTP(BaseMockPoTokenProvider): + _SUPPORTED_CONTEXTS = None + _SUPPORTED_CLIENTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + # Providers should not modify the request object, but we should guard against it + request.video_id = 'bad' + raise PoTokenProviderRejectedRequest('bad request') + + class GoodProviderPTP(BaseMockPoTokenProvider): + _SUPPORTED_CONTEXTS = None + _SUPPORTED_CLIENTS = None + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + return PoTokenResponse(base64.urlsafe_b64encode(request.video_id.encode()).decode()) + + director = PoTokenRequestDirector(logger=logger, cache=pot_cache) + + bad_provider = BadProviderPTP(ie, logger, {}) + good_provider = GoodProviderPTP(ie, logger, {}) + + director.register_provider(bad_provider) + director.register_provider(good_provider) + + pot_request.video_id = 'good' + response = director.get_po_token(pot_request) + assert response == base64.urlsafe_b64encode(b'good').decode() + assert bad_provider.request_called_times == 1 + assert good_provider.request_called_times == 1 + assert pot_request.video_id == 'good' + + +@pytest.mark.parametrize('spec, expected', [ + (None, False), + (PoTokenCacheSpec(key_bindings={'v': 'video-id'}, default_ttl=60, write_policy=None), False), # type: ignore + (PoTokenCacheSpec(key_bindings={'v': 'video-id'}, default_ttl='invalid'), False), # type: ignore + (PoTokenCacheSpec(key_bindings='invalid', default_ttl=60), False), # type: ignore + (PoTokenCacheSpec(key_bindings={2: 'video-id'}, default_ttl=60), False), # type: ignore + (PoTokenCacheSpec(key_bindings={'v': 2}, 
default_ttl=60), False), # type: ignore + (PoTokenCacheSpec(key_bindings={'v': None}, default_ttl=60), False), # type: ignore + + (PoTokenCacheSpec(key_bindings={'v': 'video_id', 'e': None}, default_ttl=60), True), + (PoTokenCacheSpec(key_bindings={'v': 'video_id'}, default_ttl=60, write_policy=CacheProviderWritePolicy.WRITE_FIRST), True), +]) +def test_validate_cache_spec(spec, expected): + assert validate_cache_spec(spec) == expected + + +@pytest.mark.parametrize('po_token', [ + 'invalid-token?', + '123', +]) +def test_clean_pot_fail(po_token): + with pytest.raises(ValueError, match='Invalid PO Token'): + clean_pot(po_token) + + +@pytest.mark.parametrize('po_token,expected', [ + ('TwAA/+8=', 'TwAA_-8='), + ('TwAA%5F%2D9VA6Q92v%5FvEQ4==?extra-param=2', 'TwAA_-9VA6Q92v_vEQ4='), +]) +def test_clean_pot(po_token, expected): + assert clean_pot(po_token) == expected + + +@pytest.mark.parametrize( + 'response, expected', + [ + (None, False), + (PoTokenResponse(None), False), + (PoTokenResponse(1), False), + (PoTokenResponse('invalid-token?'), False), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at='abc'), False), # type: ignore + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=100), False), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=time.time() + 10000.0), False), # type: ignore + (PoTokenResponse(EXAMPLE_PO_TOKEN), True), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=-1), True), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=0), True), + (PoTokenResponse(EXAMPLE_PO_TOKEN, expires_at=int(time.time()) + 10000), True), + ], +) +def test_validate_pot_response(response, expected): + assert validate_response(response) == expected + + +def test_built_in_provider(ie, logger): + class BuiltinProviderDefaultT(BuiltinIEContentProvider, suffix='T'): + def is_available(self): + return True + + class BuiltinProviderCustomNameT(BuiltinIEContentProvider, suffix='T'): + PROVIDER_NAME = 'CustomName' + + def is_available(self): + return True + + class ExternalProviderDefaultT(IEContentProvider, suffix='T'): + def is_available(self): + return True + + class ExternalProviderCustomT(IEContentProvider, suffix='T'): + PROVIDER_NAME = 'custom' + PROVIDER_VERSION = '5.4b2' + + def is_available(self): + return True + + class ExternalProviderUnavailableT(IEContentProvider, suffix='T'): + def is_available(self) -> bool: + return False + + class BuiltinProviderUnavailableT(IEContentProvider, suffix='T'): + def is_available(self) -> bool: + return False + + built_in_default = BuiltinProviderDefaultT(ie=ie, logger=logger, settings={}) + built_in_custom_name = BuiltinProviderCustomNameT(ie=ie, logger=logger, settings={}) + built_in_unavailable = BuiltinProviderUnavailableT(ie=ie, logger=logger, settings={}) + external_default = ExternalProviderDefaultT(ie=ie, logger=logger, settings={}) + external_custom = ExternalProviderCustomT(ie=ie, logger=logger, settings={}) + external_unavailable = ExternalProviderUnavailableT(ie=ie, logger=logger, settings={}) + + assert provider_display_list([]) == 'none' + assert provider_display_list([built_in_default]) == 'BuiltinProviderDefault' + assert provider_display_list([external_unavailable]) == 'ExternalProviderUnavailable-0.0.0 (external, unavailable)' + assert provider_display_list([ + built_in_default, + built_in_custom_name, + external_default, + external_custom, + external_unavailable, + built_in_unavailable], + ) == 'BuiltinProviderDefault, CustomName, ExternalProviderDefault-0.0.0 (external), custom-5.4b2 (external), ExternalProviderUnavailable-0.0.0 (external, unavailable), 
BuiltinProviderUnavailable-0.0.0 (external, unavailable)' diff --git a/test/test_pot/test_pot_framework.py b/test/test_pot/test_pot_framework.py new file mode 100644 index 0000000000..bc94653f4a --- /dev/null +++ b/test/test_pot/test_pot_framework.py @@ -0,0 +1,629 @@ +import pytest + +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.utils.networking import HTTPHeaderDict +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + ExternalRequestFeature, + +) + +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheProvider, + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + CacheProviderWritePolicy, +) + +import yt_dlp.extractor.youtube.pot.cache as cache + +from yt_dlp.networking import Request +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenResponse, + PoTokenProvider, + PoTokenProviderRejectedRequest, + provider_bug_report_message, + register_provider, + register_preference, +) + +from yt_dlp.extractor.youtube.pot._registry import _pot_providers, _ptp_preferences, _pot_pcs_providers, _pot_cache_providers, _pot_cache_provider_preferences + + +class ExamplePTP(PoTokenProvider): + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + _SUPPORTED_CLIENTS = ('WEB',) + _SUPPORTED_CONTEXTS = (PoTokenContext.GVS, ) + + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.PROXY_SCHEME_HTTP, + ExternalRequestFeature.PROXY_SCHEME_SOCKS5H, + ) + + def is_available(self) -> bool: + return True + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + return PoTokenResponse('example-token', expires_at=123) + + +class ExampleCacheProviderPCP(PoTokenCacheProvider): + + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + def is_available(self) -> bool: + return True + + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + +class ExampleCacheSpecProviderPCSP(PoTokenCacheSpecProvider): + + PROVIDER_NAME = 'example' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + def generate_cache_spec(self, request: PoTokenRequest): + return PoTokenCacheSpec( + key_bindings={'field': 'example-key'}, + default_ttl=60, + write_policy=CacheProviderWritePolicy.WRITE_FIRST, + ) + + +class TestPoTokenProvider: + + def test_base_type(self): + assert issubclass(PoTokenProvider, IEContentProvider) + + def test_create_provider_missing_fetch_method(self, ie, logger): + class MissingMethodsPTP(PoTokenProvider): + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPTP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_available_method(self, ie, logger): + class MissingMethodsPTP(PoTokenProvider): + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + with pytest.raises(TypeError): + MissingMethodsPTP(ie=ie, logger=logger, settings={}) + + def test_barebones_provider(self, ie, logger): + class BarebonesProviderPTP(PoTokenProvider): + def is_available(self) -> bool: + return True + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + provider = BarebonesProviderPTP(ie=ie, 
logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' + + def test_example_provider_success(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'example' + assert provider.PROVIDER_KEY == 'Example' + assert provider.PROVIDER_VERSION == '0.0.1' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' + assert provider.is_available() + + response = provider.request_pot(pot_request) + + assert response.po_token == 'example-token' + assert response.expires_at == 123 + + def test_provider_unsupported_context(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + pot_request.context = PoTokenContext.PLAYER + + with pytest.raises(PoTokenProviderRejectedRequest): + provider.request_pot(pot_request) + + def test_provider_unsupported_client(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + pot_request.innertube_context['client']['clientName'] = 'ANDROID' + + with pytest.raises(PoTokenProviderRejectedRequest): + provider.request_pot(pot_request) + + def test_provider_unsupported_proxy_scheme(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + pot_request.request_proxy = 'socks4://example.com' + + with pytest.raises( + PoTokenProviderRejectedRequest, + match='External requests by "example" provider do not support proxy scheme "socks4". 
Supported proxy ' + 'schemes: http, socks5h', + ): + provider.request_pot(pot_request) + + pot_request.request_proxy = 'http://example.com' + + assert provider.request_pot(pot_request) + + def test_provider_ignore_external_request_features(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = None + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_proxy = 'socks5://example.com' + assert provider.request_pot(pot_request) + pot_request.request_source_address = '0.0.0.0' + assert provider.request_pot(pot_request) + + def test_provider_unsupported_external_request_source_address(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = tuple() + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_source_address = None + assert provider.request_pot(pot_request) + + pot_request.request_source_address = '0.0.0.0' + with pytest.raises( + PoTokenProviderRejectedRequest, + match='External requests by "example" provider do not support setting source address', + ): + provider.request_pot(pot_request) + + def test_provider_supported_external_request_source_address(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.SOURCE_ADDRESS, + ) + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_source_address = None + assert provider.request_pot(pot_request) + + pot_request.request_source_address = '0.0.0.0' + assert provider.request_pot(pot_request) + + def test_provider_unsupported_external_request_tls_verification(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = tuple() + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_verify_tls = True + assert provider.request_pot(pot_request) + + pot_request.request_verify_tls = False + with pytest.raises( + PoTokenProviderRejectedRequest, + match='External requests by "example" provider do not support ignoring TLS certificate failures', + ): + provider.request_pot(pot_request) + + def test_provider_supported_external_request_tls_verification(self, ie, logger, pot_request): + class InternalPTP(ExamplePTP): + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.DISABLE_TLS_VERIFICATION, + ) + + provider = InternalPTP(ie=ie, logger=logger, settings={}) + + pot_request.request_verify_tls = True + assert provider.request_pot(pot_request) + + pot_request.request_verify_tls = False + assert provider.request_pot(pot_request) + + def test_provider_request_webpage(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + cookiejar = YoutubeDLCookieJar() + pot_request.request_headers = HTTPHeaderDict({'User-Agent': 'example-user-agent'}) + pot_request.request_proxy = 'socks5://example-proxy.com' + pot_request.request_cookiejar = cookiejar + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + ), pot_request=pot_request) + + assert sent_request.url == 'https://example.com' + assert sent_request.headers['User-Agent'] == 'example-user-agent' + assert sent_request.proxies == {'all': 'socks5://example-proxy.com'} + assert sent_request.extensions['cookiejar'] is cookiejar + assert 'Requesting webpage' in logger.messages['info'] + + def 
test_provider_request_webpage_override(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + cookiejar_request = YoutubeDLCookieJar() + pot_request.request_headers = HTTPHeaderDict({'User-Agent': 'example-user-agent'}) + pot_request.request_proxy = 'socks5://example-proxy.com' + pot_request.request_cookiejar = cookiejar_request + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + headers={'User-Agent': 'override-user-agent-override'}, + proxies={'http': 'http://example-proxy-override.com'}, + extensions={'cookiejar': YoutubeDLCookieJar()}, + ), pot_request=pot_request, note='Custom requesting webpage') + + assert sent_request.url == 'https://example.com' + assert sent_request.headers['User-Agent'] == 'override-user-agent-override' + assert sent_request.proxies == {'http': 'http://example-proxy-override.com'} + assert sent_request.extensions['cookiejar'] is not cookiejar_request + assert 'Custom requesting webpage' in logger.messages['info'] + + def test_provider_request_webpage_no_log(self, ie, logger, pot_request): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + ), note=False) + + assert sent_request.url == 'https://example.com' + assert 'info' not in logger.messages + + def test_provider_request_webpage_no_pot_request(self, ie, logger): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + + def mock_urlopen(request): + return request + + ie._downloader.urlopen = mock_urlopen + + sent_request = provider._request_webpage(Request( + 'https://example.com', + ), pot_request=None) + + assert sent_request.url == 'https://example.com' + + def test_get_config_arg(self, ie, logger): + provider = ExamplePTP(ie=ie, logger=logger, settings={'abc': ['123D'], 'xyz': ['456a', '789B']}) + + assert provider._configuration_arg('abc') == ['123d'] + assert provider._configuration_arg('abc', default=['default']) == ['123d'] + assert provider._configuration_arg('ABC', default=['default']) == ['default'] + assert provider._configuration_arg('abc', casesense=True) == ['123D'] + assert provider._configuration_arg('xyz', casesense=False) == ['456a', '789b'] + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(PoTokenProvider): + PROVIDER_NAME = 'invalid-suffix' + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + def is_available(self) -> bool: + return True + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +class TestPoTokenCacheProvider: + + def test_base_type(self): + assert issubclass(PoTokenCacheProvider, IEContentProvider) + + def test_create_provider_missing_get_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_store_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def get(self, key: str): + pass + + def delete(self, key: str): + 
pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_delete_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def get(self, key: str): + pass + + def store(self, key: str, value: str, expires_at: int): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_is_available_method(self, ie, logger): + class MissingMethodsPCP(PoTokenCacheProvider): + def get(self, key: str): + pass + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + with pytest.raises(TypeError): + MissingMethodsPCP(ie=ie, logger=logger, settings={}) + + def test_barebones_provider(self, ie, logger): + class BarebonesProviderPCP(PoTokenCacheProvider): + + def is_available(self) -> bool: + return True + + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + provider = BarebonesProviderPCP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' + + def test_create_provider_example(self, ie, logger): + provider = ExampleCacheProviderPCP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'example' + assert provider.PROVIDER_KEY == 'ExampleCacheProvider' + assert provider.PROVIDER_VERSION == '0.0.1' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' 
+ assert provider.is_available() + + def test_get_config_arg(self, ie, logger): + provider = ExampleCacheProviderPCP(ie=ie, logger=logger, settings={'abc': ['123D'], 'xyz': ['456a', '789B']}) + assert provider._configuration_arg('abc') == ['123d'] + assert provider._configuration_arg('abc', default=['default']) == ['123d'] + assert provider._configuration_arg('ABC', default=['default']) == ['default'] + assert provider._configuration_arg('abc', casesense=True) == ['123D'] + assert provider._configuration_arg('xyz', casesense=False) == ['456a', '789b'] + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(PoTokenCacheProvider): + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + def is_available(self) -> bool: + return True + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +class TestPoTokenCacheSpecProvider: + + def test_base_type(self): + assert issubclass(PoTokenCacheSpecProvider, IEContentProvider) + + def test_create_provider_missing_supports_method(self, ie, logger): + class MissingMethodsPCS(PoTokenCacheSpecProvider): + pass + + with pytest.raises(TypeError): + MissingMethodsPCS(ie=ie, logger=logger, settings={}) + + def test_create_provider_barebones(self, ie, pot_request, logger): + class BarebonesProviderPCSP(PoTokenCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + return PoTokenCacheSpec( + default_ttl=100, + key_bindings={}, + ) + + provider = BarebonesProviderPCSP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' + assert provider.is_available() + assert provider.generate_cache_spec(request=pot_request).default_ttl == 100 + assert provider.generate_cache_spec(request=pot_request).key_bindings == {} + assert provider.generate_cache_spec(request=pot_request).write_policy == CacheProviderWritePolicy.WRITE_ALL + + def test_create_provider_example(self, ie, pot_request, logger): + provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'example' + assert provider.PROVIDER_KEY == 'ExampleCacheSpecProvider' + assert provider.PROVIDER_VERSION == '0.0.1' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' 
+ assert provider.is_available() + assert provider.generate_cache_spec(pot_request) + assert provider.generate_cache_spec(pot_request).key_bindings == {'field': 'example-key'} + assert provider.generate_cache_spec(pot_request).default_ttl == 60 + assert provider.generate_cache_spec(pot_request).write_policy == CacheProviderWritePolicy.WRITE_FIRST + + def test_get_config_arg(self, ie, logger): + provider = ExampleCacheSpecProviderPCSP(ie=ie, logger=logger, settings={'abc': ['123D'], 'xyz': ['456a', '789B']}) + + assert provider._configuration_arg('abc') == ['123d'] + assert provider._configuration_arg('abc', default=['default']) == ['123d'] + assert provider._configuration_arg('ABC', default=['default']) == ['default'] + assert provider._configuration_arg('abc', casesense=True) == ['123D'] + assert provider._configuration_arg('xyz', casesense=False) == ['456a', '789b'] + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(PoTokenCacheSpecProvider): + def generate_cache_spec(self, request: PoTokenRequest): + return None + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +class TestPoTokenRequest: + def test_copy_request(self, pot_request): + copied_request = pot_request.copy() + + assert copied_request is not pot_request + assert copied_request.context == pot_request.context + assert copied_request.innertube_context == pot_request.innertube_context + assert copied_request.innertube_context is not pot_request.innertube_context + copied_request.innertube_context['client']['clientName'] = 'ANDROID' + assert pot_request.innertube_context['client']['clientName'] != 'ANDROID' + assert copied_request.innertube_host == pot_request.innertube_host + assert copied_request.session_index == pot_request.session_index + assert copied_request.player_url == pot_request.player_url + assert copied_request.is_authenticated == pot_request.is_authenticated + assert copied_request.visitor_data == pot_request.visitor_data + assert copied_request.data_sync_id == pot_request.data_sync_id + assert copied_request.video_id == pot_request.video_id + assert copied_request.request_cookiejar is pot_request.request_cookiejar + assert copied_request.request_proxy == pot_request.request_proxy + assert copied_request.request_headers == pot_request.request_headers + assert copied_request.request_headers is not pot_request.request_headers + assert copied_request.request_timeout == pot_request.request_timeout + assert copied_request.request_source_address == pot_request.request_source_address + assert copied_request.request_verify_tls == pot_request.request_verify_tls + assert copied_request.bypass_cache == pot_request.bypass_cache + + +def test_provider_bug_report_message(ie, logger): + provider = ExamplePTP(ie=ie, logger=logger, settings={}) + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at https://example.com/issues .' + + message = provider_bug_report_message(provider) + assert message == '; please report this issue to the provider developer at https://example.com/issues .' + + message_before = provider_bug_report_message(provider, before='custom message!') + assert message_before == 'custom message! Please report this issue to the provider developer at https://example.com/issues .' 
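Taken together with the registration tests that follow, the tests above exercise the whole plugin surface of the new PO Token framework. As a minimal sketch of how a third-party package might plug into it, assuming only the decorators and base classes shown in this diff (the class `MyProviderPTP`, its settings, and the preference value are hypothetical):

    from yt_dlp.extractor.youtube.pot.provider import (
        PoTokenContext,
        PoTokenProvider,
        PoTokenProviderRejectedRequest,
        PoTokenRequest,
        PoTokenResponse,
        register_preference,
        register_provider,
    )


    @register_provider  # registered under PROVIDER_KEY 'MyProvider' (the 'PTP' suffix is stripped)
    class MyProviderPTP(PoTokenProvider):
        PROVIDER_NAME = 'my-provider'  # display name; defaults to the class name without the suffix
        PROVIDER_VERSION = '1.0.0'
        BUG_REPORT_LOCATION = 'https://example.com/issues'
        _SUPPORTED_CLIENTS = ('WEB',)
        _SUPPORTED_CONTEXTS = (PoTokenContext.GVS,)

        def is_available(self) -> bool:
            return True

        def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse:
            # Raising PoTokenProviderRejectedRequest defers to the next registered provider
            if not request.video_id:
                raise PoTokenProviderRejectedRequest('video_id is required')
            return PoTokenResponse('my-fetched-token')  # placeholder token


    @register_preference(MyProviderPTP)
    def my_provider_preference(provider: PoTokenProvider, request: PoTokenRequest) -> int:
        return 50  # assumed to work like other yt-dlp preferences: higher values are tried first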
+ + +def test_register_provider(ie): + + @register_provider + class UnavailableProviderPTP(PoTokenProvider): + def is_available(self) -> bool: + return False + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + raise PoTokenProviderRejectedRequest('Not implemented') + + assert _pot_providers.value.get('UnavailableProvider') == UnavailableProviderPTP + _pot_providers.value.pop('UnavailableProvider') + + +def test_register_pot_preference(ie): + before = len(_ptp_preferences.value) + + @register_preference(ExamplePTP) + def unavailable_preference(provider: PoTokenProvider, request: PoTokenRequest): + return 1 + + assert len(_ptp_preferences.value) == before + 1 + + +def test_register_cache_provider(ie): + + @cache.register_provider + class UnavailableCacheProviderPCP(PoTokenCacheProvider): + def is_available(self) -> bool: + return False + + def get(self, key: str): + return 'example-cache' + + def store(self, key: str, value: str, expires_at: int): + pass + + def delete(self, key: str): + pass + + assert _pot_cache_providers.value.get('UnavailableCacheProvider') == UnavailableCacheProviderPCP + _pot_cache_providers.value.pop('UnavailableCacheProvider') + + +def test_register_cache_provider_spec(ie): + + @cache.register_spec + class UnavailableCacheProviderPCSP(PoTokenCacheSpecProvider): + def is_available(self) -> bool: + return False + + def generate_cache_spec(self, request: PoTokenRequest): + return None + + assert _pot_pcs_providers.value.get('UnavailableCacheProvider') == UnavailableCacheProviderPCSP + _pot_pcs_providers.value.pop('UnavailableCacheProvider') + + +def test_register_cache_provider_preference(ie): + before = len(_pot_cache_provider_preferences.value) + + @cache.register_preference(ExampleCacheProviderPCP) + def unavailable_preference(provider: PoTokenCacheProvider, request: PoTokenRequest): + return 1 + + assert len(_pot_cache_provider_preferences.value) == before + 1 + + +def test_logger_log_level(logger): + assert logger.LogLevel('INFO') == logger.LogLevel.INFO + assert logger.LogLevel('debuG') == logger.LogLevel.DEBUG + assert logger.LogLevel(10) == logger.LogLevel.DEBUG + assert logger.LogLevel('UNKNOWN') == logger.LogLevel.INFO diff --git a/test/test_subtitles.py b/test/test_subtitles.py index f3b0056179..efd69b33d9 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -23,7 +23,6 @@ TedTalkIE, ThePlatformFeedIE, ThePlatformIE, - VikiIE, VimeoIE, WallaIE, YoutubeIE, @@ -331,20 +330,6 @@ def test_subtitles_array_key(self): self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') -@is_download_test -@unittest.skip('IE broken - DRM only') -class TestVikiSubtitles(BaseTestSubtitles): - url = 'http://www.viki.com/videos/1060846v-punch-episode-18' - IE = VikiIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), {'en'}) - self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') - - @is_download_test class TestThePlatformSubtitles(BaseTestSubtitles): # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ diff --git a/test/test_utils.py b/test/test_utils.py index 65f28db363..aedb565ec1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -219,11 +219,8 @@ def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 
'N0Y__7-UOdI') + @unittest.mock.patch('sys.platform', 'win32') def test_sanitize_path(self): - with unittest.mock.patch('sys.platform', 'win32'): - self._test_sanitize_path() - - def _test_sanitize_path(self): self.assertEqual(sanitize_path('abc'), 'abc') self.assertEqual(sanitize_path('abc/def'), 'abc\\def') self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') @@ -254,10 +251,8 @@ def _test_sanitize_path(self): # Check with nt._path_normpath if available try: - import nt - - nt_path_normpath = getattr(nt, '_path_normpath', None) - except Exception: + from nt import _path_normpath as nt_path_normpath + except ImportError: nt_path_normpath = None for test, expected in [ @@ -664,6 +659,8 @@ def test_url_or_none(self): self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') + self.assertEqual(url_or_none('ws://foo.de'), 'ws://foo.de') + self.assertEqual(url_or_none('wss://foo.de'), 'wss://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) @@ -1265,6 +1262,7 @@ def test_js_to_json_edgecases(self): def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') + self.assertEqual(js_to_json('{a: `${e("")}`}'), '{"a": "\\"e\\"(\\"\\")"}') def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"') diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 7ae627f2c0..3f777aed7a 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -78,6 +78,61 @@ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', ), + ( + 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1', + ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 
'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), ] _NSIG_TESTS = [ @@ -205,6 +260,66 @@ 'https://www.youtube.com/s/player/9c6dfc4a/player_ias.vflset/en_US/base.js', 'jbu7ylIosQHyJyJV', 'uwI0ESiynAmhNg', ), + ( + 'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', + 'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ', + ), + ( + 'https://www.youtube.com/s/player/d50f54ef/player_ias_tce.vflset/en_US/base.js', + 'Ha7507LzRmH3Utygtj', 'XFTb2HoeOE5MHg', + ), + ( + 'https://www.youtube.com/s/player/074a8365/player_ias_tce.vflset/en_US/base.js', + 'Ha7507LzRmH3Utygtj', 'ufTsrE0IVYrkl8v', + ), + ( + 'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js', + 'N5uAlLqm0eg1GyHO', 'dCBQOejdq5s-ww', + ), + ( + 'https://www.youtube.com/s/player/69f581a5/tv-player-ias.vflset/tv-player-ias.js', + '-qIP447rVlTTwaZjY', 'KNcGOksBAvwqQg', + ), + ( + 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', + 'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', + ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', + 'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), + ( + 'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 
'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), + ( + 'https://www.youtube.com/s/player/59b252b9/player_ias.vflset/en_US/base.js', + 'D3XWVpYgwhLLKNK4AGX', 'aZrQ1qWJ5yv5h', + ), ] @@ -218,6 +333,8 @@ def test_youtube_extract_player_info(self): ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'), + ('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -250,46 +367,51 @@ def t_factory(name, sig_func, url_pattern): def make_tfunc(url, sig_input, expected_sig): m = url_pattern.match(url) assert m, f'{url!r} should follow URL format' - test_id = m.group('id') + test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id')) def test_func(self): - basename = f'player-{name}-{test_id}.js' + basename = f'player-{test_id}.js' fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): urllib.request.urlretrieve(url, fn) with open(fn, encoding='utf-8') as testf: jscode = testf.read() - self.assertEqual(sig_func(jscode, sig_input), expected_sig) + self.assertEqual(sig_func(jscode, sig_input, url), expected_sig) test_func.__name__ = f'test_{name}_js_{test_id}' setattr(TestSignature, test_func.__name__, test_func) return make_tfunc -def signature(jscode, sig_input): - func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) +def signature(jscode, sig_input, player_url): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url) src_sig = ( str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) return func(src_sig) -def n_sig(jscode, sig_input): +def n_sig(jscode, sig_input, player_url): ie = YoutubeIE(FakeYDL()) - funcname = ie._extract_n_function_name(jscode) + funcname = ie._extract_n_function_name(jscode, player_url=player_url) jsi = JSInterpreter(jscode) - func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname))) + func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url)) return func([sig_input]) make_sig_test = t_factory( - 'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) + 'signature', signature, + re.compile(r'''(?x) + .+(?: + /player/(?P<id>[a-zA-Z0-9_/.-]+)| + /html5player-(?:en_US-)?(?P<compat_id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)? 
+ )\.js$''')) for test_spec in _SIG_TESTS: make_sig_test(*test_spec) make_nsig_test = t_factory( - 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) + 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$')) for test_spec in _NSIG_TESTS: make_nsig_test(*test_spec) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 8790b326b7..ea6264a0d6 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -640,6 +640,7 @@ def __init__(self, params=None, auto_init=True): self._printed_messages = set() self._first_webpage_request = True self._post_hooks = [] + self._close_hooks = [] self._progress_hooks = [] self._postprocessor_hooks = [] self._download_retcode = 0 @@ -654,19 +655,21 @@ def __init__(self, params=None, auto_init=True): if not all_plugins_loaded.value: load_all_plugins() - try: - windows_enable_vt_mode() - except Exception as e: - self.write_debug(f'Failed to enable VT mode: {e}') - stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout self._out_files = Namespace( out=stdout, error=sys.stderr, screen=sys.stderr if self.params.get('quiet') else stdout, - console=next(filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None), ) + try: + windows_enable_vt_mode() + except Exception as e: + self.write_debug(f'Failed to enable VT mode: {e}') + + # hehe "immutable" namespace + self._out_files.console = next(filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None) + if self.params.get('no_color'): if self.params.get('color') is not None: self.params.setdefault('_warnings', []).append( @@ -906,6 +909,11 @@ def add_post_hook(self, ph): """Add the post hook""" self._post_hooks.append(ph) + def add_close_hook(self, ch): + """Add a close hook, called when YoutubeDL.close() is called""" + assert callable(ch), 'Close hook must be callable' + self._close_hooks.append(ch) + def add_progress_hook(self, ph): """Add the download progress hook""" self._progress_hooks.append(ph) @@ -1014,6 +1022,9 @@ def close(self): self._request_director.close() del self._request_director + for close_hook in self._close_hooks: + close_hook() + def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. @@ -4150,7 +4161,7 @@ def _get_available_impersonate_targets(self): (target, rh.RH_NAME) for rh in self._request_director.handlers.values() if isinstance(rh, ImpersonateRequestHandler) - for target in rh.supported_targets + for target in reversed(rh.supported_targets) ] def _impersonate_target_available(self, target): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 7d8f100474..714d9ad5c2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -1021,8 +1021,9 @@ def _real_main(argv=None): # List of simplified targets we know are supported, # to help users know what dependencies may be required. 
(ImpersonateTarget('chrome'), 'curl_cffi'), - (ImpersonateTarget('edge'), 'curl_cffi'), (ImpersonateTarget('safari'), 'curl_cffi'), + (ImpersonateTarget('firefox'), 'curl_cffi>=0.10'), + (ImpersonateTarget('edge'), 'curl_cffi'), ] available_targets = ydl._get_available_impersonate_targets() @@ -1038,12 +1039,12 @@ def make_row(target, handler): for known_target, known_handler in known_targets: if not any( - known_target in target and handler == known_handler + known_target in target and known_handler.startswith(handler) for target, handler in available_targets ): - rows.append([ + rows.insert(0, [ ydl._format_out(text, ydl.Styles.SUPPRESS) - for text in make_row(known_target, f'{known_handler} (not available)') + for text in make_row(known_target, f'{known_handler} (unavailable)') ]) ydl.to_screen('[info] Available impersonate targets') diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index 9908434a58..065901d68d 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -83,7 +83,7 @@ def aes_ecb_encrypt(data, key, iv=None): @returns {int[]} encrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) encrypted_data = [] for i in range(block_count): @@ -103,7 +103,7 @@ def aes_ecb_decrypt(data, key, iv=None): @returns {int[]} decrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) encrypted_data = [] for i in range(block_count): @@ -134,7 +134,7 @@ def aes_ctr_encrypt(data, key, iv): @returns {int[]} encrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) counter = iter_vector(iv) encrypted_data = [] @@ -158,7 +158,7 @@ def aes_cbc_decrypt(data, key, iv): @returns {int[]} decrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) decrypted_data = [] previous_cipher_block = iv @@ -183,7 +183,7 @@ def aes_cbc_encrypt(data, key, iv, *, padding_mode='pkcs7'): @returns {int[]} encrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) encrypted_data = [] previous_cipher_block = iv diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index fad323c901..5675445ace 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -764,11 +764,11 @@ def _get_linux_desktop_environment(env, logger): GetDesktopEnvironment """ xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) - desktop_session = env.get('DESKTOP_SESSION', None) + desktop_session = env.get('DESKTOP_SESSION', '') if xdg_current_desktop is not None: for part in map(str.strip, xdg_current_desktop.split(':')): if part == 'Unity': - if desktop_session is not None and 'gnome-fallback' in desktop_session: + if 'gnome-fallback' in desktop_session: return _LinuxDesktopEnvironment.GNOME else: return _LinuxDesktopEnvironment.UNITY @@ -797,35 +797,34 @@ def _get_linux_desktop_environment(env, logger): return _LinuxDesktopEnvironment.UKUI elif part == 'LXQt': return _LinuxDesktopEnvironment.LXQT - logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + logger.debug(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') - elif desktop_session is not None: - if desktop_session == 
'deepin': - return _LinuxDesktopEnvironment.DEEPIN - elif desktop_session in ('mate', 'gnome'): - return _LinuxDesktopEnvironment.GNOME - elif desktop_session in ('kde4', 'kde-plasma'): + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: return _LinuxDesktopEnvironment.KDE4 - elif desktop_session == 'kde': - if 'KDE_SESSION_VERSION' in env: - return _LinuxDesktopEnvironment.KDE4 - else: - return _LinuxDesktopEnvironment.KDE3 - elif 'xfce' in desktop_session or desktop_session == 'xubuntu': - return _LinuxDesktopEnvironment.XFCE - elif desktop_session == 'ukui': - return _LinuxDesktopEnvironment.UKUI else: - logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') - + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI else: - if 'GNOME_DESKTOP_SESSION_ID' in env: - return _LinuxDesktopEnvironment.GNOME - elif 'KDE_FULL_SESSION' in env: - if 'KDE_SESSION_VERSION' in env: - return _LinuxDesktopEnvironment.KDE4 - else: - return _LinuxDesktopEnvironment.KDE3 + logger.debug(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + return _LinuxDesktopEnvironment.OTHER diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 1b12bd4bed..9c34bd289a 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -30,7 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .http import HttpFD from .ism import IsmFD from .mhtml import MhtmlFD -from .niconico import NiconicoDmcFD, NiconicoLiveFD +from .niconico import NiconicoLiveFD from .rtmp import RtmpFD from .rtsp import RtspFD from .websocket import WebSocketFragmentFD @@ -50,7 +50,6 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N 'http_dash_segments_generator': DashSegmentsFD, 'ism': IsmFD, 'mhtml': MhtmlFD, - 'niconico_dmc': NiconicoDmcFD, 'niconico_live': NiconicoLiveFD, 'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, @@ -67,7 +66,6 @@ def shorten_protocol_name(proto, simplify=False): 'rtmp_ffmpeg': 'rtmpF', 'http_dash_segments': 'dash', 'http_dash_segments_generator': 'dashG', - 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } if simplify: diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 462c6e2d63..33cf15df88 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -2,60 +2,12 @@ import threading import time -from . 
import get_suitable_downloader from .common import FileDownloader from .external import FFmpegFD from ..networking import Request from ..utils import DownloadError, str_or_none, try_get -class NiconicoDmcFD(FileDownloader): - """ Downloading niconico douga from DMC with heartbeat """ - - def real_download(self, filename, info_dict): - from ..extractor.niconico import NiconicoIE - - self.to_screen(f'[{self.FD_NAME}] Downloading from DMC') - ie = NiconicoIE(self.ydl) - info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) - - fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) - - success = download_complete = False - timer = [None] - heartbeat_lock = threading.Lock() - heartbeat_url = heartbeat_info_dict['url'] - heartbeat_data = heartbeat_info_dict['data'].encode() - heartbeat_interval = heartbeat_info_dict.get('interval', 30) - - request = Request(heartbeat_url, heartbeat_data) - - def heartbeat(): - try: - self.ydl.urlopen(request).read() - except Exception: - self.to_screen(f'[{self.FD_NAME}] Heartbeat failed') - - with heartbeat_lock: - if not download_complete: - timer[0] = threading.Timer(heartbeat_interval, heartbeat) - timer[0].start() - - heartbeat_info_dict['ping']() - self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval)) - try: - heartbeat() - if type(fd).__name__ == 'HlsFD': - info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) - success = fd.real_download(filename, info_dict) - finally: - if heartbeat_lock: - with heartbeat_lock: - timer[0].cancel() - download_complete = True - return success - - class NiconicoLiveFD(FileDownloader): """ Downloads niconico live without being stopped """ @@ -85,6 +37,7 @@ def communicate_ws(reconnect): 'quality': live_quality, 'protocol': 'hls+fmp4', 'latency': live_latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index cec604a524..3a06934cdd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -300,7 +300,6 @@ BrainPOPIlIE, BrainPOPJrIE, ) -from .bravotv import BravoTVIE from .breitbart import BreitBartIE from .brightcove import ( BrightcoveLegacyIE, @@ -338,7 +337,6 @@ from .canalplus import CanalplusIE from .canalsurmas import CanalsurmasIE from .caracoltv import CaracolTvPlayIE -from .cartoonnetwork import CartoonNetworkIE from .cbc import ( CBCIE, CBCGemIE, @@ -496,10 +494,6 @@ from .daystar import DaystarClipIE from .dbtv import DBTVIE from .dctp import DctpTvIE -from .deezer import ( - DeezerAlbumIE, - DeezerPlaylistIE, -) from .democracynow import DemocracynowIE from .detik import DetikEmbedIE from .deuxm import ( @@ -687,6 +681,7 @@ ) from .foxsports import FoxSportsIE from .fptplay import FptplayIE +from .francaisfacile import FrancaisFacileIE from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, @@ -843,6 +838,7 @@ from .ichinanalive import ( IchinanaLiveClipIE, IchinanaLiveIE, + IchinanaLiveVODIE, ) from .idolplus import IdolPlusIE from .ign import ( @@ -905,6 +901,7 @@ IviIE, ) from .ivideon import IvideonIE +from .ivoox import IvooxIE from .iwara import ( IwaraIE, IwaraPlaylistIE, @@ -930,7 +927,10 @@ ) from .jiosaavn import ( JioSaavnAlbumIE, + JioSaavnArtistIE, JioSaavnPlaylistIE, + JioSaavnShowIE, + JioSaavnShowPlaylistIE, JioSaavnSongIE, ) from .joj import JojIE @@ -962,7 +962,10 @@ ) from .kicker import KickerIE from .kickstarter import 
KickStarterIE -from .kika import KikaIE +from .kika import ( + KikaIE, + KikaPlaylistIE, +) from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -1040,6 +1043,7 @@ LimelightMediaIE, ) from .linkedin import ( + LinkedInEventsIE, LinkedInIE, LinkedInLearningCourseIE, LinkedInLearningIE, @@ -1055,6 +1059,7 @@ ) from .livestreamfails import LivestreamfailsIE from .lnk import LnkIE +from .loco import LocoIE from .loom import ( LoomFolderIE, LoomIE, @@ -1062,6 +1067,7 @@ from .lovehomeporn import LoveHomePornIE from .lrt import ( LRTVODIE, + LRTRadioIE, LRTStreamIE, ) from .lsm import ( @@ -1255,6 +1261,7 @@ ) from .nbc import ( NBCIE, + BravoTVIE, NBCNewsIE, NBCOlympicsIE, NBCOlympicsStreamIE, @@ -1262,6 +1269,7 @@ NBCSportsStreamIE, NBCSportsVPlayerIE, NBCStationsIE, + SyfyIE, ) from .ndr import ( NDRIE, @@ -1494,6 +1502,10 @@ ) from .parler import ParlerIE from .parlview import ParlviewIE +from .parti import ( + PartiLivestreamIE, + PartiVideoIE, +) from .patreon import ( PatreonCampaignIE, PatreonIE, @@ -1741,6 +1753,7 @@ RoosterTeethSeriesIE, ) from .rottentomatoes import RottenTomatoesIE +from .roya import RoyaLiveIE from .rozhlas import ( MujRozhlasIE, RozhlasIE, @@ -1775,7 +1788,6 @@ from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, - RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE, ) @@ -1956,7 +1968,6 @@ SpreakerShowIE, ) from .springboardplatform import SpringboardPlatformIE -from .sprout import SproutIE from .sproutvideo import ( SproutVideoIE, VidsIoIE, @@ -1989,6 +2000,7 @@ StoryFireSeriesIE, StoryFireUserIE, ) +from .streaks import StreaksIE from .streamable import StreamableIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -2012,7 +2024,6 @@ SVTSeriesIE, ) from .swearnet import SwearnetEpisodeIE -from .syfy import SyfyIE from .syvdk import SYVDKIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE @@ -2137,6 +2148,7 @@ from .toggo import ToggoIE from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE +from .toutiao import ToutiaoIE from .toutv import TouTvIE from .toypics import ( ToypicsIE, @@ -2228,7 +2240,10 @@ TVPlayIE, ) from .tvplayer import TVPlayerIE -from .tvw import TvwIE +from .tvw import ( + TvwIE, + TvwTvChannelsIE, +) from .tweakers import TweakersIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE @@ -2352,14 +2367,11 @@ ViewLiftIE, ) from .viidea import ViideaIE -from .viki import ( - VikiChannelIE, - VikiIE, -) from .vimeo import ( VHXEmbedIE, VimeoAlbumIE, VimeoChannelIE, + VimeoEventIE, VimeoGroupsIE, VimeoIE, VimeoLikesIE, @@ -2400,10 +2412,15 @@ VoxMediaIE, VoxMediaVolumeIE, ) +from .vrsquare import ( + VrSquareChannelIE, + VrSquareIE, + VrSquareSearchIE, + VrSquareSectionIE, +) from .vrt import ( VRTIE, DagelijkseKostIE, - KetnetIE, Radio1BeIE, VrtNUIE, ) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 8c7131b10a..8f2fc4c80a 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -21,6 +21,7 @@ int_or_none, time_seconds, traverse_obj, + update_url, update_url_query, ) @@ -417,6 +418,10 @@ def _real_extract(self, url): 'is_live': is_live, 'availability': availability, }) + + if thumbnail := update_url(self._og_search_thumbnail(webpage, default=''), query=None): + info['thumbnails'] = [{'url': thumbnail}] + return info diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index f1b8779271..91c40b32ef 100644 --- a/yt_dlp/extractor/adobepass.py +++ 
b/yt_dlp/extractor/adobepass.py @@ -3,6 +3,7 @@ import re import time import urllib.parse +import uuid import xml.etree.ElementTree as etree from .common import InfoExtractor @@ -10,6 +11,7 @@ from ..utils import ( NO_DEFAULT, ExtractorError, + parse_qs, unescapeHTML, unified_timestamp, urlencode_postdata, @@ -45,6 +47,8 @@ 'name': 'Comcast XFINITY', 'username_field': 'user', 'password_field': 'passwd', + 'login_hostname': 'login.xfinity.com', + 'needs_newer_ua': True, }, 'TWC': { 'name': 'Time Warner Cable | Spectrum', @@ -74,6 +78,12 @@ 'name': 'Verizon FiOS', 'username_field': 'IDToken1', 'password_field': 'IDToken2', + 'login_hostname': 'ssoauth.verizon.com', + }, + 'Fubo': { + 'name': 'Fubo', + 'username_field': 'username', + 'password_field': 'password', }, 'Cablevision': { 'name': 'Optimum/Cablevision', @@ -1338,6 +1348,7 @@ 'name': 'Sling TV', 'username_field': 'username', 'password_field': 'password', + 'login_hostname': 'identity.sling.com', }, 'Suddenlink': { 'name': 'Suddenlink', @@ -1355,7 +1366,6 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' - _MODERN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0' _MVPD_CACHE = 'ap-mvpd' _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' @@ -1367,6 +1377,14 @@ def _download_webpage_handle(self, *args, **kwargs): return super()._download_webpage_handle( *args, **kwargs) + @staticmethod + def _get_mso_headers(mso_info): + # yt-dlp's default user-agent is usually too old for some MSO's like Comcast_SSO + # See: https://github.com/yt-dlp/yt-dlp/issues/10848 + return { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0', + } if mso_info.get('needs_newer_ua') else {} + @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): channel = etree.Element('channel') @@ -1382,7 +1400,13 @@ def _get_mvpd_resource(provider_id, title, guid, rating): resource_rating.text = rating return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>' - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource, software_statement): + mso_id = self.get_param('ap_mso') + if mso_id: + mso_info = MSO_INFO[mso_id] + else: + mso_info = {} + def xml_text(xml_str, tag): return self._search_regex( f'<{tag}>(.+?)</{tag}>', xml_str, tag) @@ -1391,15 +1415,27 @@ def is_expired(token, date_ele): token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) return token_expires and token_expires <= int(time.time()) - def post_form(form_page_res, note, data={}): + def post_form(form_page_res, note, data={}, validate_url=False): form_page, urlh = form_page_res post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') if not re.match(r'https?://', post_url): post_url = urllib.parse.urljoin(urlh.url, post_url) + if validate_url: + # This request is submitting credentials so we should validate it when possible + url_parsed = urllib.parse.urlparse(post_url) + expected_hostname = mso_info.get('login_hostname') + if expected_hostname and expected_hostname != url_parsed.hostname: + raise ExtractorError( + f'Unexpected login URL hostname; expected "{expected_hostname}" but got ' + f'"{url_parsed.hostname}". 
Aborting before submitting credentials') + if url_parsed.scheme != 'https': + self.write_debug('Upgrading login URL scheme to https') + post_url = urllib.parse.urlunparse(url_parsed._replace(scheme='https')) form_data = self._hidden_inputs(form_page) form_data.update(data) return self._download_webpage_handle( post_url, video_id, note, data=urlencode_postdata(form_data), headers={ + **self._get_mso_headers(mso_info), 'Content-Type': 'application/x-www-form-urlencoded', }) @@ -1432,40 +1468,72 @@ def extract_redirect_url(html, url=None, fatal=False): } guid = xml_text(resource, 'guid') if '<' in resource else resource - count = 0 - while count < 2: + for _ in range(2): requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None if not authn_token: - mso_id = self.get_param('ap_mso') - if mso_id: - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] - - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }, headers={ - # yt-dlp's default user-agent is usually too old for Comcast_SSO - # See: https://github.com/yt-dlp/yt-dlp/issues/10848 - 'User-Agent': self._MODERN_USER_AGENT, - } if mso_id == 'Comcast_SSO' else None) - elif not self._cookies_passed: + if not mso_id: + raise_mvpd_required() + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: raise_mvpd_required() - if not mso_id: - pass - elif mso_id == 'Comcast_SSO': + device_info, urlh = self._download_json_handle( + 'https://sp.auth.adobe.com/indiv/devices', + video_id, 'Registering device with Adobe', + data=json.dumps({'fingerprint': uuid.uuid4().hex}).encode(), + headers={'Content-Type': 'application/json; charset=UTF-8'}) + + device_id = device_info['deviceId'] + mvpd_headers['pass_sfp'] = urlh.get_header('pass_sfp') + mvpd_headers['Ap_21'] = device_id + + registration = self._download_json( + 'https://sp.auth.adobe.com/o/client/register', + video_id, 'Registering client with Adobe', + data=json.dumps({'software_statement': software_statement}).encode(), + headers={'Content-Type': 'application/json; charset=UTF-8'}) + + access_token = self._download_json( + 'https://sp.auth.adobe.com/o/client/token', video_id, + 'Obtaining access token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + 'client_id': registration['client_id'], + 'client_secret': registration['client_secret'], + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + })['access_token'] + mvpd_headers['Authorization'] = f'Bearer {access_token}' + + reg_code = self._download_json( + f'https://sp.auth.adobe.com/reggie/v1/{requestor_id}/regcode', + video_id, 'Obtaining registration code', + data=urlencode_postdata({ + 'requestor': requestor_id, + 'deviceId': device_id, + 'format': 'json', + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Authorization': f'Bearer {access_token}', + })['code'] + + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 
'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + 'reg_code': reg_code, + }, headers=self._get_mso_headers(mso_info)) + + if mso_id == 'Comcast_SSO': # Comcast page flow varies by video site and whether you # are on Comcast's network. provider_redirect_page, urlh = provider_redirect_page_res @@ -1489,8 +1557,8 @@ def extract_redirect_url(html, url=None, fatal=False): oauth_redirect_url = extract_redirect_url( provider_redirect_page, fatal=True) provider_login_page_res = self._download_webpage_handle( - oauth_redirect_url, video_id, - self._DOWNLOADING_LOGIN_PAGE) + oauth_redirect_url, video_id, self._DOWNLOADING_LOGIN_PAGE, + headers=self._get_mso_headers(mso_info)) else: provider_login_page_res = post_form( provider_redirect_page_res, @@ -1500,7 +1568,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_login_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) mvpd_confirm_page, urlh = mvpd_confirm_page_res if '<button class="submit" value="Resume">' in mvpd_confirm_page: post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1539,7 +1607,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_redirect_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) saml_login_page, urlh = saml_login_page_res if 'Please try again.' in saml_login_page: raise ExtractorError( @@ -1560,7 +1628,7 @@ def extract_redirect_url(html, url=None, fatal=False): [saml_login_page, saml_redirect_url], 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) if 'Please try again.' in saml_login_page: raise ExtractorError( 'Failed to login, incorrect User ID or Password.') @@ -1631,7 +1699,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_login_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) provider_refresh_redirect_url = extract_redirect_url( provider_association_redirect, url=urlh.url) @@ -1682,7 +1750,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_login_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) provider_refresh_redirect_url = extract_redirect_url( provider_association_redirect, url=urlh.url) @@ -1699,6 +1767,27 @@ def extract_redirect_url(html, url=None, fatal=False): query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') + elif mso_id == 'Fubo': + _, urlh = provider_redirect_page_res + + fubo_response = self._download_json( + 'https://api.fubo.tv/partners/tve/connect', video_id, + 'Authenticating with Fubo', 'Unable to authenticate with Fubo', + query=parse_qs(urlh.url), data=json.dumps({ + 'username': username, + 'password': password, + }).encode(), headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }) + + self._request_webpage( + 'https://sp.auth.adobe.com/adobe-services/oauth2', video_id, + 'Authenticating with Adobe', 'Failed to authenticate with Adobe', + query={ + 'code': fubo_response['code'], + 'state': fubo_response['state'], + }) else: # Some providers (e.g. DIRECTV NOW) have another meta refresh # based redirect that should be followed. 
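Before these MSO-specific branches run, the reworked `_extract_mvpd_auth` now performs the four-step Adobe client registration added earlier in this diff. Condensed into a standalone sketch (using `requests` for brevity rather than yt-dlp's networking stack; `SOFTWARE_STATEMENT` and `REQUESTOR_ID` are placeholders, and error handling is omitted):

    import uuid

    import requests

    ADOBE = 'https://sp.auth.adobe.com'
    SOFTWARE_STATEMENT = '...'  # per-site JWT, now passed into _extract_mvpd_auth by each caller
    REQUESTOR_ID = 'AMC'        # example requestor

    # 1) Register an anonymous device; its ID is later sent as the Ap_21 header
    device_id = requests.post(
        f'{ADOBE}/indiv/devices', json={'fingerprint': uuid.uuid4().hex}).json()['deviceId']

    # 2) Exchange the software statement for OAuth client credentials
    client = requests.post(
        f'{ADOBE}/o/client/register', json={'software_statement': SOFTWARE_STATEMENT}).json()

    # 3) A client-credentials grant yields the bearer token used on subsequent calls
    access_token = requests.post(f'{ADOBE}/o/client/token', data={
        'grant_type': 'client_credentials',
        'client_id': client['client_id'],
        'client_secret': client['client_secret'],
    }).json()['access_token']

    # 4) A registration code ties the upcoming MSO login back to this device
    reg_code = requests.post(
        f'{ADOBE}/reggie/v1/{REQUESTOR_ID}/regcode',
        data={'requestor': REQUESTOR_ID, 'deviceId': device_id, 'format': 'json'},
        headers={'Authorization': f'Bearer {access_token}'},
    ).json()['code']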
@@ -1717,7 +1806,8 @@ def extract_redirect_url(html, url=None, fatal=False): } if mso_id in ('Cablevision', 'AlticeOne'): form_data['_eventId_proceed'] = '' - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) + mvpd_confirm_page_res = post_form( + provider_login_page_res, 'Logging in', form_data, validate_url=True) if mso_id != 'Rogers': post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1727,6 +1817,7 @@ def extract_redirect_url(html, url=None, fatal=False): 'Retrieving Session', data=urlencode_postdata({ '_method': 'GET', 'requestor_id': requestor_id, + 'reg_code': reg_code, }), headers=mvpd_headers) except ExtractorError as e: if not mso_id and isinstance(e.cause, HTTPError) and e.cause.status == 401: @@ -1734,7 +1825,6 @@ def extract_redirect_url(html, url=None, fatal=False): raise if 'amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' +class AMCNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/?#]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', + 'url': 'https://www.amc.com/shows/dark-winds/videos/dark-winds-a-look-at-season-3--1072027', 'info_dict': { - 'id': '4Lq1dzOnZGt0', + 'id': '6369261343112', 'ext': 'mp4', - 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", - 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", - 'upload_date': '20201120', - 'timestamp': 1605904350, - 'uploader': 'AMCN', + 'title': 'Dark Winds: A Look at Season 3', + 'uploader_id': '6240731308001', + 'duration': 176.427, + 'thumbnail': r're:https://[^/]+\.boltdns\.net/.+/image\.jpg', + 'tags': [], + 'timestamp': 1740414792, + 'upload_date': '20250224', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': '404 Not Found', + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -52,96 +44,18 @@ class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] - _REQUESTOR_ID_MAP = { - 'amc': 'AMC', - 'bbcamerica': 'BBCA', - 'ifc': 'IFC', - 'sundancetv': 'SUNDANCE', - 'wetv': 'WETV', - } def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() - requestor_id = self._REQUESTOR_ID_MAP[site] - page_data = self._download_json( - f'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/{requestor_id.lower()}/url/{display_id}', - display_id)['data'] - properties = page_data.get('properties') or {} - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + initial_data = self._search_json( + r'window\.initialData\s*=\s*JSON\.parse\(String\.raw`', webpage, 'initial data', display_id) + video_id = traverse_obj(initial_data, ('initialData', 'properties', 'videoId', {str})) + if not video_id: # All locked videos are now DRM-protected + self.report_drm(display_id) + account_id = initial_data['config']['brightcove']['accountId'] + player_id = initial_data['config']['brightcove']['playerId'] - video_player_count 
= 0 - try: - for v in page_data['children']: - if v.get('type') == 'video-player': - release_pid = v['properties']['currentVideo']['meta']['releasePid'] - tp_path = 'M_UwQC/' + release_pid - media_url = 'https://link.theplatform.com/s/' + tp_path - video_player_count += 1 - except KeyError: - pass - if video_player_count > 1: - self.report_warning( - f'The JSON data has {video_player_count} video players. Only one will be extracted') - - # Fall back to videoPid if releasePid not found. - # TODO: Fall back to videoPid if releasePid manifest uses DRM. - if not video_player_count: - tp_path = 'M_UwQC/media/' + properties['videoPid'] - media_url = 'https://link.theplatform.com/s/' + tp_path - - theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - video_id = theplatform_metadata['pid'] - title = theplatform_metadata['title'] - rating = try_get( - theplatform_metadata, lambda x: x['ratings'][0]['rating']) - video_category = properties.get('videoCategory') - if video_category and video_category.endswith('-Auth'): - resource = self._get_mvpd_resource( - requestor_id, title, video_id, rating) - query['auth'] = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - media_url = update_url_query(media_url, query) - formats, subtitles = self._extract_theplatform_smil( - media_url, video_id) - - thumbnails = [] - thumbnail_urls = [properties.get('imageDesktop')] - if 'thumbnail' in info: - thumbnail_urls.append(info.pop('thumbnail')) - for thumbnail_url in thumbnail_urls: - if not thumbnail_url: - continue - mobj = re.search(r'(\d+)x(\d+)', thumbnail_url) - thumbnails.append({ - 'url': thumbnail_url, - 'width': int(mobj.group(1)) if mobj else None, - 'height': int(mobj.group(2)) if mobj else None, - }) - - info.update({ - 'age_limit': parse_age_limit(rating), - 'formats': formats, - 'id': video_id, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - }) - ns_keys = theplatform_metadata.get('$xmlns', {}).keys() - if ns_keys: - ns = next(iter(ns_keys)) - episode = theplatform_metadata.get(ns + '$episodeTitle') or None - episode_number = int_or_none( - theplatform_metadata.get(ns + '$episode')) - season_number = int_or_none( - theplatform_metadata.get(ns + '$season')) - series = theplatform_metadata.get(ns + '$show') or None - info.update({ - 'episode': episode, - 'episode_number': episode_number, - 'season_number': season_number, - 'series': series, - }) - return info + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE, video_id) diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 0fe95bec5c..1258a5704d 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -1,64 +1,105 @@ +import urllib.parse + from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + parse_age_limit, + url_or_none, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class AtresPlayerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P.+?)_(?P[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/(?:[^/?#]+/){4}(?P.+?)_(?P[0-9a-f]{24})' _NETRC_MACHINE = 'atresplayer' - _TESTS = [ - { - 'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/', - 
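# The AMCNetworksIE rewrite above reduces to this pattern: read the page's
# `window.initialData` blob, pull the Brightcove account/player/video IDs
# out of it, and delegate to the stock Brightcove player URL. A rough
# standalone sketch (it assumes the String.raw payload contains no stray
# backticks, a case yt-dlp's _search_json handles more carefully):
import json
import re


def brightcove_url_from_initial_data(webpage):
    raw = re.search(
        r'window\.initialData\s*=\s*JSON\.parse\(String\.raw`(.+?)`\)',
        webpage, re.DOTALL).group(1)
    data = json.loads(raw)
    video_id = data['initialData']['properties'].get('videoId')
    if not video_id:  # per the hunk above, all locked videos are DRM-protected
        raise RuntimeError('DRM-protected video')
    brightcove = data['config']['brightcove']
    return (f'https://players.brightcove.net/{brightcove["accountId"]}'
            f'/{brightcove["playerId"]}_default/index.html?videoId={video_id}')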
'info_dict': { - 'id': '5d4aa2c57ed1a88fc715a615', - 'ext': 'mp4', - 'title': 'Capítulo 7: Asuntos pendientes', - 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', - 'duration': 3413, - }, - 'skip': 'This video is only available for registered users', + _TESTS = [{ + 'url': 'https://www.atresplayer.com/lasexta/programas/el-objetivo/clips/mbappe-describe-como-entrenador-a-carlo-ancelotti-sabe-cuando-tiene-que-ser-padre-jefe-amigo-entrenador_67f2dfb2fb6ab0e4c7203849/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f2dfb2fb6ab0e4c7203849', + 'display_id': 'md5:c203f8d4e425ed115ba56a1c6e4b3e6c', + 'title': 'Mbappé describe como entrenador a Carlo Ancelotti: "Sabe cuándo tiene que ser padre, jefe, amigo, entrenador..."', + 'channel': 'laSexta', + 'duration': 31, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/06/B02DBE1E-D59B-4683-8404-1A9595D15269/1920x1080.jpg', + 'tags': ['Entrevista informativa', 'Actualidad', 'Debate informativo', 'Política', 'Economía', 'Sociedad', 'Cara a cara', 'Análisis', 'Más periodismo'], + 'series': 'El Objetivo', + 'season': 'Temporada 12', + 'timestamp': 1743970079, + 'upload_date': '20250406', }, - { - 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/programas/el-hormiguero/clips/revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero_67f836baa4a5b0e4147ca59a/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f836baa4a5b0e4147ca59a', + 'display_id': 'revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero', + 'title': 'Revive la entrevista completa a Miguel Bosé en El Hormiguero', + 'description': 'md5:c6d2b591408d45a7bc2986dfb938eb72', + 'channel': 'Antena 3', + 'duration': 2556, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/10/9076395F-F1FD-48BE-9F18-540DBA10EBAD/1920x1080.jpg', + 'tags': ['Entrevista', 'Variedades', 'Humor', 'Entretenimiento', 'Te sigo', 'Buen rollo', 'Cara a cara'], + 'series': 'El Hormiguero ', + 'season': 'Temporada 14', + 'timestamp': 1744320111, + 'upload_date': '20250410', }, - { - 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/flooxer/series/biara-proyecto-lazarus/temporada-1/capitulo-3-supervivientes_67a6038b64ceca00070f4f69/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67a6038b64ceca00070f4f69', + 'display_id': 'capitulo-3-supervivientes', + 'title': 'Capítulo 3: Supervivientes', + 'description': 'md5:65b231f20302f776c2b0dd24594599a1', + 'channel': 'Flooxer', + 'duration': 1196, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages01/2025/02/14/17CF90D3-FE67-40C5-A941-7825B3E13992/1920x1080.jpg', + 'tags': ['Juvenil', 'Terror', 'Piel de gallina', 'Te sigo', 'Un break', 'Del tirón'], + 'series': 'BIARA: Proyecto Lázarus', + 'season': 'Temporada 1', + 'season_number': 1, + 'episode': 'Episode 3', + 'episode_number': 3, + 'timestamp': 1743095191, + 'upload_date': '20250327', }, - ] + }, { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, { + 'url': 
'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }] _API_BASE = 'https://api.atresplayer.com/' def _perform_login(self, username, password): - self._request_webpage( - self._API_BASE + 'login', None, 'Downloading login page') - try: - target_url = self._download_json( - 'https://account.atresmedia.com/api/login', None, - 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=urlencode_postdata({ + self._download_webpage( + 'https://account.atresplayer.com/auth/v1/login', None, + 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'username': username, 'password': password, - }))['targetUrl'] + })) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError('Invalid username and/or password', expected=True) raise - self._request_webpage(target_url, None, 'Following Target URL') - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() + metadata_url = self._download_json( + self._API_BASE + 'client/v1/url', video_id, 'Downloading API endpoint data', + query={'href': urllib.parse.urlparse(url).path})['href'] + metadata = self._download_json(metadata_url, video_id) + try: - episode = self._download_json( - self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + video_data = self._download_json(metadata['urlVideo'], video_id, 'Downloading video data') except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 403: error = self._parse_json(e.cause.response.read(), None) @@ -67,37 +108,45 @@ def _real_extract(self, url): raise ExtractorError(error['error_description'], expected=True) raise - title = episode['titulo'] - formats = [] subtitles = {} - for source in episode.get('sources', []): - src = source.get('src') - if not src: - continue + for source in traverse_obj(video_data, ('sources', lambda _, v: url_or_none(v['src']))): + src_url = source['src'] src_type = source.get('type') - if src_type == 'application/vnd.apple.mpegurl': - formats, subtitles = self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - elif src_type == 'application/dash+xml': - formats, subtitles = self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False) - - heartbeat = episode.get('heartbeat') or {} - omniture = episode.get('omniture') or {} - get_meta = lambda x: heartbeat.get(x) or omniture.get(x) + if src_type in ('application/vnd.apple.mpegurl', 'application/hls+legacy', 'application/hls+hevc'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif src_type in ('application/dash+xml', 'application/dash+hevc'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'display_id': display_id, 'id': video_id, - 'title': title, - 'description': episode.get('descripcion'), - 'thumbnail': episode.get('imgPoster'), - 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'channel': get_meta('channel'), - 'season': get_meta('season'), - 'episode_number': int_or_none(get_meta('episodeNumber')), 'subtitles': subtitles, + **traverse_obj(video_data, { + 'title': ('titulo', {str}), + 'description': ('descripcion', {str}), + 'duration': ('duration', 
{int_or_none}), + 'thumbnail': ('imgPoster', {url_or_none}, {lambda v: f'{v}1920x1080.jpg'}), + 'age_limit': ('ageRating', {parse_age_limit}), + }), + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'title', {str}), + 'age_limit': ('ageRating', {parse_age_limit}), + 'series': ('format', 'title', {str}), + 'season': ('currentSeason', 'title', {str}), + 'season_number': ('currentSeason', 'seasonNumber', {int_or_none}), + 'episode_number': ('numberOfEpisode', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none(scale=1000)}), + 'channel': ('channel', 'title', {str}), + }), } diff --git a/yt_dlp/extractor/bandlab.py b/yt_dlp/extractor/bandlab.py index 64aa2ba70d..f110b793b5 100644 --- a/yt_dlp/extractor/bandlab.py +++ b/yt_dlp/extractor/bandlab.py @@ -86,7 +86,7 @@ def _parse_video(self, video_data, url=None): 'webpage_url': ( 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any), 'url': ('video', 'url', {url_or_none}), - 'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}), + 'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=72)}), 'description': ('caption', {str}), 'thumbnail': ('video', 'picture', 'url', {url_or_none}), 'view_count': ('video', 'counters', 'plays', {int_or_none}), @@ -120,7 +120,7 @@ class BandlabIE(BandlabBaseIE): 'duration': 54.629999999999995, 'title': 'sweet black', 'upload_date': '20231210', - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', 'genres': ['Lofi'], 'uploader': 'ender milze', 'comment_count': int, @@ -142,7 +142,7 @@ class BandlabIE(BandlabBaseIE): 'duration': 54.629999999999995, 'title': 'sweet black', 'upload_date': '20231210', - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/', 'genres': ['Lofi'], 'uploader': 'ender milze', 'comment_count': int, @@ -158,7 +158,7 @@ class BandlabIE(BandlabBaseIE): 'comment_count': int, 'genres': ['Other'], 'uploader_id': 'user8353034818103753', - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/', 'timestamp': 1709625771, 'track': 'PodcastMaerchen4b', 'duration': 468.14, @@ -178,7 +178,7 @@ class BandlabIE(BandlabBaseIE): 'id': '110343fc-148b-ea11-96d2-0003ffd1fc09', 'ext': 'm4a', 'timestamp': 1588273294, - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/', 'description': 'Final Revision.', 'title': 'Replay ( Instrumental)', 'uploader': 'David R Sparks', @@ -200,7 +200,7 @@ class BandlabIE(BandlabBaseIE): 'id': '5cdf9036-3857-ef11-991a-6045bd36e0d9', 'ext': 'mp4', 'duration': 44.705, - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/', 'comment_count': int, 'title': 'backing vocals', 'uploader_id': 'marliashya', @@ -224,7 +224,7 @@ class 
BandlabIE(BandlabBaseIE): 'view_count': int, 'track': 'Positronic Meltdown', 'duration': 318.55, - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/', 'description': 'Checkout my tracks at AOMX http://aomxsounds.com/', 'uploader_id': 'microfreaks', 'title': 'Positronic Meltdown', @@ -246,7 +246,7 @@ class BandlabIE(BandlabBaseIE): 'comment_count': int, 'uploader': 'Sorakime', 'uploader_id': 'sorakime', - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/', 'timestamp': 1691162128, 'upload_date': '20230804', 'media_type': 'track', diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 42b4e2d3c2..6508942a4f 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1596,16 +1596,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, list_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id) - if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200: - error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none})) - error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none})) + error = traverse_obj(initial_state, (('error', 'listError'), all, lambda _, v: v['code'], any)) + if error and error['code'] != 200: + error_code = error.get('trueCode') if error_code == -400 and list_id == 'watchlater': self.raise_login_required('You need to login to access your watchlater playlist') elif error_code == -403: self.raise_login_required('This is a private playlist. 
You need to login as its owner') elif error_code == 11010: raise ExtractorError('Playlist is no longer available', expected=True) - raise ExtractorError(f'Could not access playlist: {error_code} {error_message}') + raise ExtractorError(f'Could not access playlist: {error_code} {error.get("message")}') query = { 'ps': 20, diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index c83222ea5b..981eebfbb8 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -1,30 +1,32 @@ import functools +import json import re from .common import InfoExtractor from ..networking import HEADRequest +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, clean_html, - extract_attributes, + determine_ext, + format_field, get_element_by_class, - get_element_by_id, - get_element_html_by_class, get_elements_html_by_class, int_or_none, orderedSet, parse_count, parse_duration, - traverse_obj, - unified_strdate, + parse_iso8601, + url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import traverse_obj class BitChuteIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/?#]+)/(?P[^/?#&]+)' _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', @@ -34,12 +36,17 @@ class BitChuteIE(InfoExtractor): 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', 'channel': 'BitChute', 'channel_url': 'https://www.bitchute.com/channel/bitchute/', + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 'view_count': int, + 'duration': 16.0, + 'timestamp': 1483425443, }, }, { # test case: video with different channel and uploader @@ -49,13 +56,18 @@ class BitChuteIE(InfoExtractor): 'id': 'Yti_j9A-UZ4', 'ext': 'mp4', 'title': 'Israel at War | Full Measure', - 'description': 'md5:38cf7bc6f42da1a877835539111c69ef', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:e60198b89971966d6030d22b3268f08f', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'sharylattkisson', 'upload_date': '20231106', 'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/', 'channel': 'Full Measure with Sharyl Attkisson', 'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/', + 'uploader_id': '9K0kUWA9zmd9', + 'channel_id': 'NpdxoCRv3ZLb', + 'view_count': int, + 'duration': 554.0, + 'timestamp': 1699296106, }, }, { # video not downloadable in browser, but we can recover it @@ -66,25 +78,21 @@ class BitChuteIE(InfoExtractor): 'ext': 'mp4', 'filesize': 71537926, 'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control', - 'description': 'md5:228ee93bd840a24938f536aeac9cf749', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:2029c7c212ccd4b040f52bb2d036ef4e', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20181113', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', 'channel': 'BitChute', 'channel_url': 'https://www.bitchute.com/channel/bitchute/', + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 
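# The bilibili hunk above folds two separate error lookups into a single
# traverse_obj call. Its branching mechanics, shown on a made-up payload:
# try both keys, collect the hits, keep only entries carrying a truthy
# 'code', then take the first one.
from yt_dlp.utils.traversal import traverse_obj

initial_state = {'listError': {'code': -400, 'message': 'login required'}}
error = traverse_obj(
    initial_state, (('error', 'listError'), all, lambda _, v: v['code'], any))
assert error == {'code': -400, 'message': 'login required'}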
'view_count': int, + 'duration': 1701.0, + 'tags': ['bitchute'], + 'timestamp': 1542130287, }, 'params': {'check_formats': None}, - }, { - # restricted video - 'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/', - 'info_dict': { - 'id': 'WEnQU7XGcTdl', - 'ext': 'mp4', - 'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft', - }, - 'params': {'skip_download': True}, - 'skip': 'Georestricted in DE', }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'only_matching': True, @@ -96,11 +104,8 @@ class BitChuteIE(InfoExtractor): 'only_matching': True, }] _GEO_BYPASS = False - - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', - 'Referer': 'https://www.bitchute.com/', - } + _UPLOADER_URL_TMPL = 'https://www.bitchute.com/profile/%s/' + _CHANNEL_URL_TMPL = 'https://www.bitchute.com/channel/%s/' def _check_format(self, video_url, video_id): urls = orderedSet( @@ -112,7 +117,7 @@ def _check_format(self, video_url, video_id): for url in urls: try: response = self._request_webpage( - HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS) + HEADRequest(url), video_id=video_id, note=f'Checking {url}') except ExtractorError as e: self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}') continue @@ -121,54 +126,79 @@ def _check_format(self, video_url, video_id): 'filesize': int_or_none(response.headers.get('Content-Length')), } - def _raise_if_restricted(self, webpage): - page_title = clean_html(get_element_by_class('page-title', webpage)) or '' - if re.fullmatch(r'(?:Channel|Video) Restricted', page_title): - reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title - self.raise_geo_restricted(reason) - - @staticmethod - def _make_url(html): - path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href') - return urljoin('https://www.bitchute.com', path) + def _call_api(self, endpoint, data, display_id, fatal=True): + note = endpoint.rpartition('/')[2] + try: + return self._download_json( + f'https://api.bitchute.com/api/beta/{endpoint}', display_id, + f'Downloading {note} API JSON', f'Unable to download {note} API JSON', + data=json.dumps(data).encode(), + headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + errors = '. 
'.join(traverse_obj(e.cause.response.read().decode(), ( + {json.loads}, 'errors', lambda _, v: v['context'] == 'reason', 'message', {str}))) + if errors and 'location' in errors: + # Can always be fatal since the video/media call will reach this code first + self.raise_geo_restricted(errors) + if fatal: + raise + self.report_warning(e.msg) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - f'https://old.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) - - self._raise_if_restricted(webpage) - publish_date = clean_html(get_element_by_class('video-publish-date', webpage)) - entries = self._parse_html5_media_entries(url, webpage, video_id) + data = {'video_id': video_id} + media_url = self._call_api('video/media', data, video_id)['media_url'] formats = [] - for format_ in traverse_obj(entries, (0, 'formats', ...)): + if determine_ext(media_url) == 'm3u8': + formats.extend( + self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls', live=True)) + else: if self.get_param('check_formats') is not False: - format_.update(self._check_format(format_.pop('url'), video_id) or {}) - if 'url' not in format_: - continue - formats.append(format_) + if fmt := self._check_format(media_url, video_id): + formats.append(fmt) + else: + formats.append({'url': media_url}) if not formats: self.raise_no_formats( 'Video is unavailable. Please make sure this video is playable in the browser ' 'before reporting this issue.', expected=True, video_id=video_id) - details = get_element_by_class('details', webpage) or '' - uploader_html = get_element_html_by_class('creator', details) or '' - channel_html = get_element_html_by_class('name', details) or '' + video = self._call_api('video', data, video_id, fatal=False) + channel = None + if channel_id := traverse_obj(video, ('channel', 'channel_id', {str})): + channel = self._call_api('channel', {'channel_id': channel_id}, video_id, fatal=False) return { + **traverse_obj(video, { + 'title': ('video_name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'channel': ('channel', 'channel_name', {str}), + 'channel_id': ('channel', 'channel_id', {str}), + 'channel_url': ('channel', 'channel_url', {urljoin('https://www.bitchute.com/')}), + 'uploader_id': ('profile_id', {str}), + 'uploader_url': ('profile_id', {format_field(template=self._UPLOADER_URL_TMPL)}, filter), + 'timestamp': ('date_published', {parse_iso8601}), + 'duration': ('duration', {parse_duration}), + 'tags': ('hashtags', ..., {str}, filter, all, filter), + 'view_count': ('view_count', {int_or_none}), + 'is_live': ('state_id', {lambda x: x == 'live'}), + }), + **traverse_obj(channel, { + 'channel': ('channel_name', {str}), + 'channel_id': ('channel_id', {str}), + 'channel_url': ('url_slug', {format_field(template=self._CHANNEL_URL_TMPL)}, filter), + 'uploader': ('profile_name', {str}), + 'uploader_id': ('profile_id', {str}), + 'uploader_url': ('profile_id', {format_field(template=self._UPLOADER_URL_TMPL)}, filter), + }), 'id': video_id, - 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': clean_html(uploader_html), - 'uploader_url': self._make_url(uploader_html), - 'channel': clean_html(channel_html), - 'channel_url': self._make_url(channel_html), - 'upload_date': unified_strdate(self._search_regex( - r'at \d+:\d+ UTC on (.+?)\.', 
publish_date, 'upload date', fatal=False)), 'formats': formats, } @@ -190,7 +220,7 @@ class BitChuteChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', @@ -198,6 +228,9 @@ class BitChuteChannelIE(InfoExtractor): 'channel_url': 'https://www.bitchute.com/channel/bitchute/', 'duration': 16, 'view_count': int, + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 'timestamp': 1483425443, }, }, ], @@ -213,6 +246,7 @@ class BitChuteChannelIE(InfoExtractor): 'title': 'Bruce MacDonald and "The Light of Darkness"', 'description': 'md5:747724ef404eebdfc04277714f81863e', }, + 'skip': '404 Not Found', }, { 'url': 'https://old.bitchute.com/playlist/wV9Imujxasw9/', 'only_matching': True, diff --git a/yt_dlp/extractor/bluesky.py b/yt_dlp/extractor/bluesky.py index 23344ac6c5..8cb5c0d257 100644 --- a/yt_dlp/extractor/bluesky.py +++ b/yt_dlp/extractor/bluesky.py @@ -53,7 +53,7 @@ class BlueskyIE(InfoExtractor): 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', - 'title': 'Bluesky now has video! Update your app to versi...', + 'title': 'Bluesky now has video! Update your app to version 1.91 or refresh on ...', 'alt_title': 'Bluesky video feature announcement', 'description': r're:(?s)Bluesky now has video! .{239}', 'upload_date': '20240911', @@ -172,7 +172,7 @@ class BlueskyIE(InfoExtractor): 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', - 'title': 'Bluesky now has video! Update your app to versi...', + 'title': 'Bluesky now has video! Update your app to version 1.91 or refresh on ...', 'alt_title': 'Bluesky video feature announcement', 'description': r're:(?s)Bluesky now has video! 
.{239}', 'upload_date': '20240911', @@ -191,7 +191,7 @@ class BlueskyIE(InfoExtractor): 'info_dict': { 'id': '3l7rdfxhyds2f', 'ext': 'mp4', - 'uploader': 'cinnamon', + 'uploader': 'cinnamon 🐇 🏳️‍⚧️', 'uploader_id': 'cinny.bun.how', 'uploader_url': 'https://bsky.app/profile/cinny.bun.how', 'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide', @@ -255,7 +255,7 @@ class BlueskyIE(InfoExtractor): 'info_dict': { 'id': '3l77u64l7le2e', 'ext': 'mp4', - 'title': 'hearing people on twitter say that bluesky isn\'...', + 'title': "hearing people on twitter say that bluesky isn't funny yet so post t...", 'like_count': int, 'uploader_id': 'thafnine.net', 'uploader_url': 'https://bsky.app/profile/thafnine.net', @@ -387,7 +387,7 @@ def _extract_videos(self, root, video_id, embed_path='embed', record_path='recor 'age_limit': ( 'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any), 'description': (*record_path, 'text', {str}, filter), - 'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}), + 'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=72)}), }), }) return entries diff --git a/yt_dlp/extractor/bokecc.py b/yt_dlp/extractor/bokecc.py index 5fe937a6ac..42047aced1 100644 --- a/yt_dlp/extractor/bokecc.py +++ b/yt_dlp/extractor/bokecc.py @@ -24,7 +24,7 @@ def _extract_bokecc_formats(self, webpage, video_id, format_id=None): class BokeCCIE(BokeCCBaseIE): - _IE_DESC = 'CC视频' + IE_DESC = 'CC视频' _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P.*)' _TESTS = [{ diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py index d7bf58b366..a2a9082527 100644 --- a/yt_dlp/extractor/bpb.py +++ b/yt_dlp/extractor/bpb.py @@ -7,6 +7,7 @@ join_nonempty, js_to_json, mimetype2ext, + parse_resolution, unified_strdate, url_or_none, urljoin, @@ -110,24 +111,23 @@ def _parse_vue_attributes(self, name, string, video_id): return attributes - @staticmethod - def _process_source(source): + def _process_source(self, source): url = url_or_none(source['src']) if not url: return None source_type = source.get('type', '') extension = mimetype2ext(source_type) - is_video = source_type.startswith('video') - note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None + note = self._search_regex(r'[_-]([a-z]+)\.[\da-z]+(?:$|\?)', url, 'note', default=None) return { 'url': url, 'ext': extension, - 'vcodec': None if is_video else 'none', + 'vcodec': None if source_type.startswith('video') else 'none', 'quality': 10 if note == 'high' else 0, 'format_note': note, 'format_id': join_nonempty(extension, note), + **parse_resolution(source.get('label')), } def _real_extract(self, url): diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py deleted file mode 100644 index 0b2c447987..0000000000 --- a/yt_dlp/extractor/bravotv.py +++ /dev/null @@ -1,188 +0,0 @@ -from .adobepass import AdobePassIE -from ..networking import HEADRequest -from ..utils import ( - extract_attributes, - float_or_none, - get_element_html_by_class, - int_or_none, - merge_dicts, - parse_age_limit, - remove_end, - str_or_none, - traverse_obj, - unescapeHTML, - unified_timestamp, - update_url_query, - url_or_none, -) - - -class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'info_dict': { - 'id': '3923059', - 'ext': 'mp4', - 'title': 'The Top 
Chef Season 16 Winner Is...', - 'description': 'Find out who takes the title of Top Chef!', - 'upload_date': '20190314', - 'timestamp': 1552591860, - 'season_number': 16, - 'episode_number': 15, - 'series': 'Top Chef', - 'episode': 'The Top Chef Season 16 Winner Is...', - 'duration': 190.357, - 'season': 'Season 16', - 'thumbnail': r're:^https://.+\.jpg', - }, - 'params': {'skip_download': 'm3u8'}, - }, { - 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling', - 'info_dict': { - 'id': '9000234570', - 'ext': 'mp4', - 'title': 'London Calling', - 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759', - 'upload_date': '20230310', - 'timestamp': 1678410000, - 'season_number': 20, - 'episode_number': 1, - 'series': 'Top Chef', - 'episode': 'London Calling', - 'duration': 3266.03, - 'season': 'Season 20', - 'chapters': 'count:7', - 'thumbnail': r're:^https://.+\.jpg', - 'age_limit': 14, - }, - 'params': {'skip_download': 'm3u8'}, - 'skip': 'This video requires AdobePass MSO credentials', - }, { - 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night', - 'info_dict': { - 'id': '3692045', - 'ext': 'mp4', - 'title': 'Closing Night', - 'description': 'md5:3170065c5c2f19548d72a4cbc254af63', - 'upload_date': '20180401', - 'timestamp': 1522623600, - 'season_number': 1, - 'episode_number': 1, - 'series': 'In Ice Cold Blood', - 'episode': 'Closing Night', - 'duration': 2629.051, - 'season': 'Season 1', - 'chapters': 'count:6', - 'thumbnail': r're:^https://.+\.jpg', - 'age_limit': 14, - }, - 'params': {'skip_download': 'm3u8'}, - 'skip': 'This video requires AdobePass MSO credentials', - }, { - 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', - 'info_dict': { - 'id': '3974019', - 'ext': 'mp4', - 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', - 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5', - 'upload_date': '20190617', - 'timestamp': 1560790800, - 'season_number': 2, - 'episode_number': 16, - 'series': 'In Ice Cold Blood', - 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', - 'duration': 68.235, - 'season': 'Season 2', - 'thumbnail': r're:^https://.+\.jpg', - 'age_limit': 14, - }, - 'params': {'skip_download': 'm3u8'}, - }, { - 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, - }] - - def _real_extract(self, url): - site, display_id = self._match_valid_url(url).group('site', 'id') - webpage = self._download_webpage(url, display_id) - settings = self._search_json( - r']+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) - tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') - query = { - 'manifest': 'm3u', - 'formats': 'm3u,mpeg4', - } - - if tve: - account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC' - account_id = tve['data-mpx-media-account-id'] - metadata = self._parse_json( - tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML) - video_id = tve.get('data-guid') or metadata['guid'] - if tve.get('data-entitlement') == 'auth': - auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {} - site = remove_end(site, 'tv') - release_pid = tve['data-release-pid'] - resource = self._get_mvpd_resource( - tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site, - tve['data-title'], release_pid, tve.get('data-rating')) 
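# Related: the brightcove.py hunk below now requires `source_url` and
# `software_statement` in the smuggled data and errors out when either is
# missing. Those fields travel via yt-dlp's URL-smuggling helpers; a
# round-trip demonstration with placeholder values:
from yt_dlp.utils import smuggle_url, unsmuggle_url

player_url = 'https://players.brightcove.net/123/abc_default/index.html?videoId=1'
smuggled = smuggle_url(player_url, {
    'source_url': 'https://example.com/embedding-page',  # page the player was found on
    'software_statement': 'eyJhbGciOi...',  # placeholder JWT
})
url, data = unsmuggle_url(smuggled, default={})
assert url == player_url and data['source_url'] == 'https://example.com/embedding-page'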
- query.update({ - 'switch': 'HLSServiceSecure', - 'auth': self._extract_mvpd_auth( - url, release_pid, auth.get('adobePassRequestorId') or site, resource), - }) - - else: - ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {} - account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B' - account_id = ls_playlist['mpxMediaAccountId'] - video_id = ls_playlist['defaultGuid'] - metadata = traverse_obj( - ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False) - - tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}' - tp_metadata = self._download_json( - update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - - chapters = traverse_obj(tp_metadata, ('chapters', ..., { - 'start_time': ('startTime', {float_or_none(scale=1000)}), - 'end_time': ('endTime', {float_or_none(scale=1000)}), - })) - # prune pointless single chapters that span the entire duration from short videos - if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): - chapters = None - - m3u8_url = self._request_webpage(HEADRequest( - update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url - if 'mpeg_cenc' in m3u8_url: - self.report_drm(video_id) - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - 'chapters': chapters, - **merge_dicts(traverse_obj(tp_metadata, { - 'title': 'title', - 'description': 'description', - 'duration': ('duration', {float_or_none(scale=1000)}), - 'timestamp': ('pubDate', {float_or_none(scale=1000)}), - 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), - 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), - 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}), - 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}), - 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}), - }, get_all=False), traverse_obj(metadata, { - 'title': 'title', - 'description': 'description', - 'duration': ('durationInSeconds', {int_or_none}), - 'timestamp': ('airDate', {unified_timestamp}), - 'thumbnail': ('thumbnailUrl', {url_or_none}), - 'season_number': ('seasonNumber', {int_or_none}), - 'episode_number': ('episodeNumber', {int_or_none}), - 'episode': 'episodeTitle', - 'series': 'show', - })), - } diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index 3ada1fd5de..d4ac7a0c28 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -923,10 +923,18 @@ def extract_policy_key(): errors = json_data.get('errors') if errors and errors[0].get('error_subcode') == 'TVE_AUTH': custom_fields = json_data['custom_fields'] + missing_fields = ', '.join( + key for key in ('source_url', 'software_statement') if not smuggled_data.get(key)) + if missing_fields: + raise ExtractorError( + f'Missing fields in smuggled data: {missing_fields}. ' + f'This video can be only extracted from the webpage where it is embedded. 
' + f'Pass the URL of the embedding webpage instead of the Brightcove URL', expected=True) tve_token = self._extract_mvpd_auth( smuggled_data['source_url'], video_id, custom_fields['bcadobepassrequestorid'], - custom_fields['bcadobepassresourceid']) + custom_fields['bcadobepassresourceid'], + smuggled_data['software_statement']) json_data = self._download_json( api_url, video_id, headers={ 'Accept': f'application/json;pk={policy_key}', diff --git a/yt_dlp/extractor/cartoonnetwork.py b/yt_dlp/extractor/cartoonnetwork.py deleted file mode 100644 index 1749a008a2..0000000000 --- a/yt_dlp/extractor/cartoonnetwork.py +++ /dev/null @@ -1,59 +0,0 @@ -from .turner import TurnerBaseIE -from ..utils import int_or_none - - -class CartoonNetworkIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P[^/?#]+)-(?:clip|episode)\.html' - _TEST = { - 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', - 'info_dict': { - 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', - 'ext': 'mp4', - 'title': 'How to Draw Upgrade', - 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): - metadata_re = '' - if content_re: - metadata_re = r'|video_metadata\.content_' + content_re - return self._search_regex( - rf'(?:_cnglobal\.currentVideo\.{global_re}{metadata_re})\s*=\s*"({value_re})";', - webpage, name, fatal=fatal) - - media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) - title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) - - info = self._extract_ngtv_info( - media_id, {'networkId': 'cartoonnetwork'}, { - 'url': url, - 'site_name': 'CartoonNetwork', - 'auth_required': find_field('authType', 'auth type') != 'unauth', - }) - - series = find_field( - 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) - info.update({ - 'id': media_id, - 'display_id': display_id, - 'title': title, - 'description': self._html_search_meta('description', webpage), - 'series': series, - 'episode': title, - }) - - for field in ('season', 'episode'): - field_name = field + 'Number' - info[field + '_number'] = int_or_none(find_field( - field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) - - return info diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 96f25c22a8..027b37d448 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -13,16 +13,17 @@ from ..utils import ( ExtractorError, OnDemandPagedList, + determine_ext, float_or_none, int_or_none, merge_dicts, multipart_encode, parse_duration, - traverse_obj, try_call, - try_get, + url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj class CDAIE(InfoExtractor): @@ -290,34 +291,47 @@ def extract_format(page, version): if not video or 'file' not in video: self.report_warning(f'Unable to extract {version} version information') return - if video['file'].startswith('uggc'): - video['file'] = codecs.decode(video['file'], 'rot_13') - if video['file'].endswith('adc.mp4'): - video['file'] = video['file'].replace('adc.mp4', '.mp4') - elif not video['file'].startswith('http'): - video['file'] = decrypt_file(video['file']) video_quality = video.get('quality') qualities = 
video.get('qualities', {}) video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality) - info_dict['formats'].append({ - 'url': video['file'], - 'format_id': video_quality, - 'height': int_or_none(video_quality[:-1]), - }) + if video.get('file'): + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') + elif not video['file'].startswith('http'): + video['file'] = decrypt_file(video['file']) + info_dict['formats'].append({ + 'url': video['file'], + 'format_id': video_quality, + 'height': int_or_none(video_quality[:-1]), + }) for quality, cda_quality in qualities.items(): if quality == video_quality: continue data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2, 'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]} data = json.dumps(data).encode() - video_url = self._download_json( + response = self._download_json( f'https://www.cda.pl/video/{video_id}', video_id, headers={ 'Content-Type': 'application/json', 'X-Requested-With': 'XMLHttpRequest', }, data=data, note=f'Fetching {quality} url', errnote=f'Failed to fetch {quality} url', fatal=False) - if try_get(video_url, lambda x: x['result']['status']) == 'ok': - video_url = try_get(video_url, lambda x: x['result']['resp']) + if ( + traverse_obj(response, ('result', 'status')) != 'ok' + or not traverse_obj(response, ('result', 'resp', {url_or_none})) + ): + continue + video_url = response['result']['resp'] + ext = determine_ext(video_url) + if ext == 'mpd': + info_dict['formats'].extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + elif ext == 'm3u8': + info_dict['formats'].extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: info_dict['formats'].append({ 'url': video_url, 'format_id': quality, @@ -353,7 +367,7 @@ def extract_format(page, version): class CDAFolderIE(InfoExtractor): _MAX_PAGE_SIZE = 36 - _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P\w+)/folder/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P[\w-]+)/folder/(?P\d+)' _TESTS = [ { 'url': 'https://www.cda.pl/domino264/folder/31188385', @@ -378,6 +392,9 @@ class CDAFolderIE(InfoExtractor): 'title': 'TESTY KOSMETYKÓW', }, 'playlist_mincount': 139, + }, { + 'url': 'https://www.cda.pl/FILMY-SERIALE-ANIME-KRESKOWKI-BAJKI/folder/18493422', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py index aec77ac454..a5daf5ca7c 100644 --- a/yt_dlp/extractor/chzzk.py +++ b/yt_dlp/extractor/chzzk.py @@ -21,7 +21,7 @@ class CHZZKLiveIE(InfoExtractor): 'channel': '진짜도현', 'channel_id': 'c68b8ef525fb3d2fa146344d84991753', 'channel_is_verified': False, - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1705510344, 'upload_date': '20240117', 'live_status': 'is_live', @@ -98,7 +98,7 @@ class CHZZKVideoIE(InfoExtractor): 'channel': '침착맨', 'channel_id': 'bb382c2c0cc9fa7c86ab3b037fb5799c', 'channel_is_verified': False, - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'duration': 15577, 'timestamp': 1702970505.417, 'upload_date': '20231219', @@ -115,7 +115,7 @@ class CHZZKVideoIE(InfoExtractor): 'channel': '라디유radiyu', 'channel_id': '68f895c59a1043bc5019b5e08c83a5c5', 'channel_is_verified': False, - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': 
r're:https?://.+/.+\.jpg', 'duration': 95, 'timestamp': 1703102631.722, 'upload_date': '20231220', @@ -131,12 +131,30 @@ class CHZZKVideoIE(InfoExtractor): 'channel': '강지', 'channel_id': 'b5ed5db484d04faf4d150aedd362f34b', 'channel_is_verified': True, - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'duration': 4433, 'timestamp': 1703307460.214, 'upload_date': '20231223', 'view_count': int, }, + }, { + # video_status == 'NONE' but is downloadable + 'url': 'https://chzzk.naver.com/video/6325166', + 'info_dict': { + 'id': '6325166', + 'ext': 'mp4', + 'title': '와이프 숙제빼주기', + 'channel': '이 다', + 'channel_id': '0076a519f147ee9fd0959bf02f9571ca', + 'channel_is_verified': False, + 'view_count': int, + 'duration': 28167, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1742139216.86, + 'upload_date': '20250316', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): @@ -147,11 +165,7 @@ def _real_extract(self, url): live_status = 'was_live' if video_meta.get('liveOpenDate') else 'not_live' video_status = video_meta.get('vodStatus') - if video_status == 'UPLOAD': - playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - playback['media'][0]['path'], video_id, 'mp4', m3u8_id='hls') - elif video_status == 'ABR_HLS': + if video_status == 'ABR_HLS': formats, subtitles = self._extract_mpd_formats_and_subtitles( f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id, query={ @@ -161,10 +175,17 @@ def _real_extract(self, url): 'cpl': 'en_US', }) else: - self.raise_no_formats( - f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id) - formats, subtitles = [], {} - live_status = 'post_live' if live_status == 'was_live' else None + fatal = video_status == 'UPLOAD' + playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id, fatal=fatal) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(playback, ('media', 0, 'path')), video_id, 'mp4', m3u8_id='hls', fatal=fatal) + if formats and video_status != 'UPLOAD': + self.write_debug(f'Video found with status: "{video_status}"') + elif not formats: + self.raise_no_formats( + f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id) + formats, subtitles = [], {} + live_status = 'post_live' if live_status == 'was_live' else None return { 'id': video_id, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b816d788fa..d5607296df 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -78,6 +78,7 @@ parse_iso8601, parse_m3u8_attributes, parse_resolution, + qualities, sanitize_url, smuggle_url, str_or_none, @@ -1569,6 +1570,8 @@ def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT): """Yield all json ld objects in the html""" if default is not NO_DEFAULT: fatal = False + if not fatal and not isinstance(html, str): + return for mobj in re.finditer(JSON_LD_RE, html): json_ld_item = self._parse_json( mobj.group('json_ld'), video_id, fatal=fatal, @@ -2177,6 +2180,8 @@ def extract_media(x_media_line): media_url = media.get('URI') if media_url: manifest_url = format_url(media_url) + is_audio = media_type == 'AUDIO' + is_alternate = media.get('DEFAULT') == 'NO' or media.get('AUTOSELECT') == 'NO' formats.extend({ 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, @@ -2189,7 
+2194,11 @@ def extract_media(x_media_line): 'preference': preference, 'quality': quality, 'has_drm': has_drm, - 'vcodec': 'none' if media_type == 'AUDIO' else None, + 'vcodec': 'none' if is_audio else None, + # Alternate audio formats (e.g. audio description) should be deprioritized + 'source_preference': -2 if is_audio and is_alternate else None, + # Save this to assign source_preference based on associated video stream + '_audio_group_id': group_id if is_audio and not is_alternate else None, } for idx in _extract_m3u8_playlist_indices(manifest_url)) def build_stream_name(): @@ -2284,6 +2293,8 @@ def build_stream_name(): # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': + # Save this to determine quality of audio formats that only have a GROUP-ID + f['_audio_group_id'] = audio_group_id audio_group = groups.get(audio_group_id) if audio_group and audio_group[0].get('URI'): # TODO: update acodec for audio only formats with @@ -2306,6 +2317,28 @@ def build_stream_name(): formats.append(http_f) last_stream_inf = {} + + # Some audio-only formats only have a GROUP-ID without any other quality/bitrate/codec info + # Each audio GROUP-ID corresponds with one or more video formats' AUDIO attribute + # For sorting purposes, set source_preference based on the quality of the video formats they are grouped with + # See https://github.com/yt-dlp/yt-dlp/issues/11178 + audio_groups_by_quality = orderedSet(f['_audio_group_id'] for f in sorted( + traverse_obj(formats, lambda _, v: v.get('vcodec') != 'none' and v['_audio_group_id']), + key=lambda x: (x.get('tbr') or 0, x.get('width') or 0))) + audio_quality_map = { + audio_groups_by_quality[0]: 'low', + audio_groups_by_quality[-1]: 'high', + } if len(audio_groups_by_quality) > 1 else None + audio_preference = qualities(audio_groups_by_quality) + for fmt in formats: + audio_group_id = fmt.pop('_audio_group_id', None) + if not audio_quality_map or not audio_group_id or fmt.get('vcodec') != 'none': + continue + # Use source_preference since quality and preference are set by params + fmt['source_preference'] = audio_preference(audio_group_id) + fmt['format_note'] = join_nonempty( + fmt.get('format_note'), audio_quality_map.get(audio_group_id), delim=', ') + return formats, subtitles def _extract_m3u8_vod_duration( @@ -2935,8 +2968,7 @@ def location_key(location): segment_duration = None if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil( - float_or_none(period_duration, segment_duration, default=0))) + representation_ms_info['total_number'] = math.ceil(float_or_none(period_duration, segment_duration, default=0)) representation_ms_info['fragments'] = [{ media_location_key: media_template % { 'Number': segment_number, diff --git a/yt_dlp/extractor/crowdbunker.py b/yt_dlp/extractor/crowdbunker.py index bf814570fe..ca0323431c 100644 --- a/yt_dlp/extractor/crowdbunker.py +++ b/yt_dlp/extractor/crowdbunker.py @@ -5,7 +5,9 @@ int_or_none, try_get, unified_strdate, + url_or_none, ) +from ..utils.traversal import traverse_obj class CrowdBunkerIE(InfoExtractor): @@ -44,16 +46,15 @@ def _real_extract(self, url): 'url': sub_url, }) - mpd_url = try_get(video_json, lambda x: x['dashManifest']['url']) - if mpd_url: - fmts, subs = 
self._extract_mpd_formats_and_subtitles(mpd_url, video_id) + if mpd_url := traverse_obj(video_json, ('dashManifest', 'url', {url_or_none})): + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, mpd_id='dash', fatal=False) formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) - m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url']) - if m3u8_url: - fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, video_id) + self._merge_subtitles(subs, target=subtitles) + + if m3u8_url := traverse_obj(video_json, ('hlsManifest', 'url', {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, m3u8_id='hls', fatal=False) formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) + self._merge_subtitles(subs, target=subtitles) thumbnails = [{ 'url': image['url'], diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py index 537352e5f7..ffc7ca8246 100644 --- a/yt_dlp/extractor/dacast.py +++ b/yt_dlp/extractor/dacast.py @@ -9,6 +9,7 @@ ExtractorError, classproperty, float_or_none, + parse_qs, traverse_obj, url_or_none, ) @@ -91,11 +92,15 @@ def _usp_signing_secret(self): # Rotates every so often, but hardcode a fallback in case of JS change/breakage before rotation return self._search_regex( r'\bUSP_SIGNING_SECRET\s*=\s*(["\'])(?P(?:(?!\1).)+)', player_js, - 'usp signing secret', group='secret', fatal=False) or 'odnInCGqhvtyRTtIiddxtuRtawYYICZP' + 'usp signing secret', group='secret', fatal=False) or 'hGDtqMKYVeFdofrAfFmBcrsakaZELajI' def _real_extract(self, url): user_id, video_id = self._match_valid_url(url).group('user_id', 'id') - query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} + query = { + 'contentId': f'{user_id}-vod-{video_id}', + 'provider': 'universe', + **traverse_obj(url, ({parse_qs}, 'uss_token', {'signedKey': -1})), + } info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False) access = self._download_json( 'https://playback.dacast.com/content/access', video_id, diff --git a/yt_dlp/extractor/deezer.py b/yt_dlp/extractor/deezer.py deleted file mode 100644 index 2ca8be5ca0..0000000000 --- a/yt_dlp/extractor/deezer.py +++ /dev/null @@ -1,142 +0,0 @@ -import json - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - orderedSet, -) - - -class DeezerBaseInfoExtractor(InfoExtractor): - def get_data(self, url): - if not self.get_param('test'): - self.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!') - - mobj = self._match_valid_url(url) - data_id = mobj.group('id') - - webpage = self._download_webpage(url, data_id) - geoblocking_msg = self._html_search_regex( - r'
', webpage, 'geoblocking message', - default=None) - if geoblocking_msg is not None: - raise ExtractorError( - f'Deezer said: {geoblocking_msg}', expected=True) - - data_json = self._search_regex( - (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>', - r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'), - webpage, 'data JSON') - data = json.loads(data_json) - return data_id, webpage, data - - -class DeezerPlaylistIE(DeezerBaseInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?playlist/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.deezer.com/playlist/176747451', - 'info_dict': { - 'id': '176747451', - 'title': 'Best!', - 'uploader': 'anonymous', - 'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$', - }, - 'playlist_count': 29, - } - - def _real_extract(self, url): - playlist_id, webpage, data = self.get_data(url) - - playlist_title = data.get('DATA', {}).get('TITLE') - playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME') - playlist_thumbnail = self._search_regex( - r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage, - 'playlist thumbnail') - - -class DeezerAlbumIE(DeezerBaseInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?album/(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://www.deezer.com/fr/album/67505622', - 'info_dict': { - 'id': '67505622', - 'title': 'Last Week', - 'uploader': 'Home Brew', - 'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$', - }, - 'playlist_count': 7, - } - - def _real_extract(self, url): - album_id, webpage, data = self.get_data(url) - - album_title = data.get('DATA', {}).get('ALB_TITLE') - album_uploader = data.get('DATA', {}).get('ART_NAME') - album_thumbnail = self._search_regex( - r'<img id="naboo_album_image".*?src="([^"]+)"', webpage, - 'album thumbnail') diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py --- a/yt_dlp/extractor/dreisat.py +++ b/yt_dlp/extractor/dreisat.py @@ -1,10 +1,16 @@ -from .zdf import ZDFIE +from .zdf import ZDFBaseIE +from ..utils import ( + int_or_none, + merge_dicts, + parse_iso8601, +) +from ..utils.traversal import require, traverse_obj -class DreiSatIE(ZDFIE): +class DreiSatIE(ZDFBaseIE): IE_NAME = '3sat' - _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/?#]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.3sat.de/dokumentation/reise/traumziele-suedostasiens-die-philippinen-und-vietnam-102.html', 'info_dict': { @@ -12,40 +18,59 @@ class DreiSatIE(ZDFBaseIE): 'title': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', 'description': 'md5:26329ce5197775b596773b939354079d', 'duration': 2625.0, - 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~2400x1350?cb=1699870351148', + 'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~original?cb=1699870351148', 'episode': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam', 'episode_id': 'POS_cc7ff51c-98cf-4d12-b99d-f7a551de1c95', - 'timestamp': 1738593000, - 'upload_date': '20250203', + 'timestamp': 1747920900, + 'upload_date': '20250522', }, }, { - # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html - 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', - 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', 'info_dict': { - 'id': '141007_ab18_10wochensommer_film', 'ext': 'mp4', - 'title': 'Ab 18! - 10 Wochen Sommer', - 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', - 'duration': 2660, - 'timestamp': 1608604200, - 'upload_date': '20201222', + 'url': 'https://www.3sat.de/film/ab-18/ab-18---mein-fremdes-ich-100.html', + 'md5': 'f92638413a11d759bdae95c9d8ec165c', 'info_dict': { + 'id': '221128_mein_fremdes_ich2_ab18', 'ext': 'mp4', + 'title': 'Ab 18! - Mein fremdes Ich', + 'description': 'md5:cae0c0b27b7426d62ca0dda181738bf0', + 'duration': 2625.0, + 'thumbnail': 'https://www.3sat.de/assets/ab-18---mein-fremdes-ich-106~original?cb=1666081865812', + 'episode': 'Ab 18! 
- Mein fremdes Ich', + 'episode_id': 'POS_6225d1ca-a0d5-45e3-870b-e783ee6c8a3f', + 'timestamp': 1695081600, + 'upload_date': '20230919', }, - 'skip': '410 Gone', }, { - 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', + 'url': 'https://www.3sat.de/gesellschaft/37-grad-leben/aus-dem-leben-gerissen-102.html', + 'md5': 'a903eaf8d1fd635bd3317cd2ad87ec84', 'info_dict': { - 'id': '140913_sendung_schweizweit', + 'id': '250323_0903_sendung_sgl', 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'timestamp': 1410623100, - 'upload_date': '20140913', + 'title': 'Plötzlich ohne dich', + 'description': 'md5:380cc10659289dd91510ad8fa717c66b', + 'duration': 1620.0, + 'thumbnail': 'https://www.3sat.de/assets/37-grad-leben-106~original?cb=1645537156810', + 'episode': 'Plötzlich ohne dich', + 'episode_id': 'POS_faa7a93c-c0f2-4d51-823f-ce2ac3ee191b', + 'timestamp': 1743162540, + 'upload_date': '20250328', }, - 'params': { - 'skip_download': True, + }, { + # Video with chapters + 'url': 'https://www.3sat.de/kultur/buchmesse/dein-buch-das-beste-von-der-leipziger-buchmesse-2025-teil-1-100.html', + 'md5': '6b95790ce52e75f0d050adcdd2711ee6', + 'info_dict': { + 'id': '250330_dein_buch1_bum', + 'ext': 'mp4', + 'title': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1', + 'description': 'md5:bae51bfc22f15563ce3acbf97d2e8844', + 'duration': 5399.0, + 'thumbnail': 'https://www.3sat.de/assets/buchmesse-kerkeling-100~original?cb=1743329640903', + 'chapters': 'count:24', + 'episode': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1', + 'episode_id': 'POS_1ef236cc-b390-401e-acd0-4fb4b04315fb', + 'timestamp': 1743327000, + 'upload_date': '20250330', }, - 'skip': '404 Not Found', }, { # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', @@ -58,11 +83,42 @@ class DreiSatIE(ZDFBaseIE): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player = self._search_json( + r'data-zdfplayer-jsb=(["\'])', webpage, 'player JSON', video_id) + player_url = player['content'] + api_token = f'Bearer {player["apiToken"]}' - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id) + content = self._call_api(player_url, video_id, 'video metadata', api_token) - return self._extract_mobile(video_id) + video_target = content['mainVideoContent']['http://zdf.de/rels/target'] + ptmd_path = traverse_obj(video_target, ( + (('streams', 'default'), None), + ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), + {str}, any, {require('ptmd path')})) + ptmd_url = self._expand_ptmd_template(player_url, ptmd_path) + aspect_ratio = self._parse_aspect_ratio(video_target.get('aspectRatio')) + info = self._extract_ptmd(ptmd_url, video_id, api_token, aspect_ratio) + + return merge_dicts(info, { + **traverse_obj(content, { + 'title': (('title', 'teaserHeadline'), {str}, any), + 'episode': (('title', 'teaserHeadline'), {str}, any), + 'description': (('leadParagraph', 'teasertext'), {str}, any), + 'timestamp': ('editorialDate', {parse_iso8601}), + }), + **traverse_obj(video_target, { + 'duration': ('duration', {int_or_none}), + 'chapters': ('streamAnchorTag', {self._extract_chapters}), + }), + 'thumbnails': 
self._extract_thumbnails(traverse_obj(content, ('teaserImageRef', 'layouts', {dict}))), + **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { + 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), + 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), + 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), + 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), + 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode_id': ('contentId', {str}), + })), + }) diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index 552f9af12e..ceba024bc3 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -5,7 +5,6 @@ from .adobepass import AdobePassIE from .common import InfoExtractor -from .once import OnceIE from ..utils import ( determine_ext, dict_get, @@ -16,7 +15,7 @@ ) -class ESPNIE(OnceIE): +class ESPNIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -131,9 +130,7 @@ def extract_source(source_url, source_id=None): return format_urls.add(source_url) ext = determine_ext(source_url) - if OnceIE.suitable(source_url): - formats.extend(self._extract_once_formats(source_url)) - elif ext == 'smil': + if ext == 'smil': formats.extend(self._extract_smil_formats( source_url, video_id, fatal=False)) elif ext == 'f4m': @@ -332,6 +329,7 @@ class WatchESPNIE(AdobePassIE): }] _API_KEY = 'ZXNwbiZicm93c2VyJjEuMC4w.ptUt7QxsteaRruuPmGZFaJByOoqKvDP2a5YkInHrc7c' + _SOFTWARE_STATEMENT = 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIyZGJmZWM4My03OWE1LTQyNzEtYTVmZC04NTZjYTMxMjRjNjMiLCJuYmYiOjE1NDAyMTI3NjEsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTQwMjEyNzYxfQ.yaK3r4AI2uLVvsyN1GLzqzgzRlxMPtasSaiYYBV0wIstqih5tvjTmeoLmi8Xy9Kp_U7Md-bOffwiyK3srHkpUkhhwXLH2x6RPjmS1tPmhaG7-3LBcHTf2ySPvXhVf7cN4ngldawK4tdtLtsw6rF_JoZE2yaC6XbS2F51nXSFEDDnOQWIHEQRG3aYAj-38P2CLGf7g-Yfhbp5cKXeksHHQ90u3eOO4WH0EAjc9oO47h33U8KMEXxJbvjV5J8Va2G2fQSgLDZ013NBI3kQnE313qgqQh2feQILkyCENpB7g-TVBreAjOaH1fU471htSoGGYepcAXv-UDtpgitDiLy7CQ' def _call_bamgrid_api(self, path, video_id, payload=None, headers={}): if 'Authorization' not in headers: @@ -408,8 +406,8 @@ def _real_extract(self, url): # TV Provider required else: - resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None) - auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource).encode() + resource = self._get_mvpd_resource('espn1', video_data['name'], video_id, None) + auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource, self._SOFTWARE_STATEMENT).encode() asset = self._download_json( f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', diff --git a/yt_dlp/extractor/firsttv.py b/yt_dlp/extractor/firsttv.py index ac7697bb35..878732c49e 100644 --- a/yt_dlp/extractor/firsttv.py +++ b/yt_dlp/extractor/firsttv.py @@ -2,11 +2,15 @@ from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, - qualities, + join_nonempty, + mimetype2ext, + parse_qs, unified_strdate, url_or_none, ) +from ..utils.traversal import traverse_obj class FirstTVIE(InfoExtractor): @@ -15,40 +19,51 @@ class FirstTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:sport)?1tv\.ru/(?:[^/?#]+/)+(?P<id>[^/?#]+)' _TESTS = [{ - # single format - 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + # single format; has 
item.id + 'url': 'https://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', + 'md5': '8011ae8e88ff4150107ab9c5a8f5b659', 'info_dict': { 'id': '40049', 'ext': 'mp4', 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'upload_date': '20150212', 'duration': 2694, }, + 'params': {'skip_download': 'm3u8'}, }, { - # multiple formats - 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', + # multiple formats; has item.id + 'url': 'https://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', 'info_dict': { 'id': '364746', 'ext': 'mp4', 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', - 'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'upload_date': '20160407', 'duration': 179, 'formats': 'mincount:3', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00', + 'url': 'https://www.1tv.ru/news/issue/2016-12-01/14:00', 'info_dict': { 'id': '14:00', - 'title': 'Выпуск новостей в 14:00 1 декабря 2016 года. Новости. Первый канал', - 'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd', + 'title': 'Выпуск программы «Время» в 20:00 1 декабря 2016 года. Новости. Первый канал', + 'thumbnail': 'https://static.1tv.ru/uploads/photo/image/8/big/338448_big_8fc7eb236f.jpg', }, 'playlist_count': 13, + }, { + # has timestamp; has item.uid but not item.id + 'url': 'https://www.1tv.ru/shows/segodnya-vecherom/vypuski/avtory-odnogo-hita-segodnya-vecherom-vypusk-ot-03-05-2025', + 'info_dict': { + 'id': '270411', + 'ext': 'mp4', + 'title': 'Авторы одного хита. Сегодня вечером. 
Выпуск от 03.05.2025', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1746286020, + 'upload_date': '20250503', + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016', 'only_matching': True, @@ -57,96 +72,60 @@ class FirstTVIE(InfoExtractor): 'only_matching': True, }] + def _entries(self, items): + for item in items: + video_id = str(item.get('id') or item['uid']) + + formats, subtitles = [], {} + for f in traverse_obj(item, ('sources', lambda _, v: url_or_none(v['src']))): + src = f['src'] + ext = mimetype2ext(f.get('type'), default=determine_ext(src)) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + src, video_id, mpd_id='dash', fatal=False) + else: + tbr = self._search_regex(fr'_(\d{{3,}})\.{ext}', src, 'tbr', default=None) + formats.append({ + 'url': src, + 'ext': ext, + 'format_id': join_nonempty('http', ext, tbr), + 'tbr': int_or_none(tbr), + # quality metadata of http formats may be incorrect + 'quality': -10, + }) + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + yield { + **traverse_obj(item, { + 'title': ('title', {str}), + 'thumbnail': ('poster', {url_or_none}), + 'timestamp': ('dvr_begin_at', {int_or_none}), + 'upload_date': ('date_air', {unified_strdate}), + 'duration': ('duration', {int_or_none}), + }), + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - playlist_url = urllib.parse.urljoin(url, self._search_regex( + playlist_url = urllib.parse.urljoin(url, self._html_search_regex( r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'playlist url', group='url')) - parsed_url = urllib.parse.urlparse(playlist_url) - qs = urllib.parse.parse_qs(parsed_url.query) - item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') + item_ids = traverse_obj(parse_qs(playlist_url), 'video_id', 'videos_ids[]', 'news_ids[]') + items = traverse_obj( + self._download_json(playlist_url, display_id), + lambda _, v: v['uid'] and (str(v['uid']) in item_ids if item_ids else True)) - items = self._download_json(playlist_url, display_id) - - if item_ids: - items = [ - item for item in items - if item.get('uid') and str(item['uid']) in item_ids] - else: - items = [items[0]] - - entries = [] - QUALITIES = ('ld', 'sd', 'hd') - - for item in items: - title = item['title'] - quality = qualities(QUALITIES) - formats = [] - path = None - for f in item.get('mbr', []): - src = url_or_none(f.get('src')) - if not src: - continue - tbr = int_or_none(self._search_regex( - r'_(\d{3,})\.mp4', src, 'tbr', default=None)) - if not path: - path = self._search_regex( - r'//[^/]+/(.+?)_\d+\.mp4', src, - 'm3u8 path', default=None) - formats.append({ - 'url': src, - 'format_id': f.get('name'), - 'tbr': tbr, - 'source_preference': quality(f.get('name')), - # quality metadata of http formats may be incorrect - 'preference': -10, - }) - # m3u8 URL format is reverse engineered from [1] (search for - # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru) - # is taken from [2]. - # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted - # 2. 
http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834 - if not path and len(formats) == 1: - path = self._search_regex( - r'//[^/]+/(.+?$)', formats[0]['url'], - 'm3u8 path', default=None) - if path: - if len(formats) == 1: - m3u8_path = ',' - else: - tbrs = [str(t) for t in sorted(f['tbr'] for f in formats)] - m3u8_path = '_,{},{}'.format(','.join(tbrs), '.mp4') - formats.extend(self._extract_m3u8_formats( - f'http://balancer-vod.1tv.ru/{path}{m3u8_path}.urlset/master.m3u8', - display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - - thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) - duration = int_or_none(item.get('duration') or self._html_search_meta( - 'video:duration', webpage, 'video duration', fatal=False)) - upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', default=None)) - - entries.append({ - 'id': str(item.get('id') or item['uid']), - 'thumbnail': thumbnail, - 'title': title, - 'upload_date': upload_date, - 'duration': int_or_none(duration), - 'formats': formats, - }) - - title = self._html_search_regex( - (r'
<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"), - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) - description = self._html_search_regex( - r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>
', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description', default=None) - - return self.playlist_result(entries, display_id, title, description) + return self.playlist_result( + self._entries(items), display_id, self._og_search_title(webpage, default=None), + thumbnail=self._og_search_thumbnail(webpage, default=None))
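The rewritten extractor above filters playlist items with a traverse_obj predicate instead of a manual loop. A rough standalone sketch of how such a predicate behaves (sample items invented; assumes yt-dlp's traverse_obj filtering semantics):

```python
from yt_dlp.utils.traversal import traverse_obj

items = [
    {'uid': 40049, 'title': 'kept: uid is whitelisted'},
    {'uid': 0, 'title': 'dropped: falsy uid'},
    {'uid': 364746, 'title': 'dropped: uid not in item_ids'},
]
item_ids = ['40049']

# The lambda receives (key, value) pairs; returning falsy drops the item
selected = traverse_obj(items, lambda _, v: v['uid'] and (
    str(v['uid']) in item_ids if item_ids else True))
assert [it['title'] for it in selected] == ['kept: uid is whitelisted']
```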
diff --git a/yt_dlp/extractor/francaisfacile.py b/yt_dlp/extractor/francaisfacile.py new file mode 100644 index 0000000000..d3208c2828 --- /dev/null +++ b/yt_dlp/extractor/francaisfacile.py @@ -0,0 +1,87 @@ +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + float_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class FrancaisFacileIE(InfoExtractor): + _VALID_URL = r'https?://francaisfacile\.rfi\.fr/[a-z]{2}/(?:actualit%C3%A9|podcasts/[^/#?]+)/(?P<id>[^/#?]+)' + _TESTS = [{ + 'url': 'https://francaisfacile.rfi.fr/fr/actualit%C3%A9/20250305-r%C3%A9concilier-les-jeunes-avec-la-lecture-gr%C3%A2ce-aux-r%C3%A9seaux-sociaux', + 'md5': '4f33674cb205744345cc835991100afa', + 'info_dict': { + 'id': 'WBMZ58952-FLE-FR-20250305', + 'display_id': '20250305-réconcilier-les-jeunes-avec-la-lecture-grâce-aux-réseaux-sociaux', + 'title': 'Réconcilier les jeunes avec la lecture grâce aux réseaux sociaux', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/05/6b6af52a-f9ba-11ef-a1f8-005056a97652.mp3', + 'ext': 'mp3', + 'description': 'md5:b903c63d8585bd59e8cc4d5f80c4272d', + 'duration': 103.15, + 'timestamp': 1741177984, + 'upload_date': '20250305', + }, + }, { + 'url': 'https://francaisfacile.rfi.fr/fr/actualit%C3%A9/20250307-argentine-le-sac-d-un-alpiniste-retrouv%C3%A9-40-ans-apr%C3%A8s-sa-mort', + 'md5': 'b8c3a63652d4ae8e8092dda5700c1cd9', + 'info_dict': { + 'id': 'WBMZ59102-FLE-FR-20250307', + 'display_id': '20250307-argentine-le-sac-d-un-alpiniste-retrouvé-40-ans-après-sa-mort', + 'title': 'Argentine: le sac d\'un alpiniste retrouvé 40 ans après sa mort', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/07/8edf4082-fb46-11ef-8a37-005056bf762b.mp3', + 'ext': 'mp3', + 'description': 'md5:7fd088fbdf4a943bb68cf82462160dca', + 'duration': 117.74, + 'timestamp': 1741352789, + 'upload_date': '20250307', + }, + }, { + 'url': 'https://francaisfacile.rfi.fr/fr/podcasts/un-mot-une-histoire/20250317-le-mot-de-david-foenkinos-peut-%C3%AAtre', + 'md5': 'db83c2cc2589b4c24571c6b6cf14f5f1', + 'info_dict': { + 'id': 'WBMZ59441-FLE-FR-20250317', + 'display_id': '20250317-le-mot-de-david-foenkinos-peut-être', + 'title': 'Le mot de David Foenkinos: «peut-être» - Un mot, une histoire', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/17/4ca6cbbe-0315-11f0-a85b-005056a97652.mp3', + 'ext': 'mp3', + 'description': 'md5:3fe35fae035803df696bfa7af2496e49', + 'duration': 198.96, + 'timestamp': 1742210897, + 'upload_date': '20250317', + }, + }] + + def _real_extract(self, url): + display_id = urllib.parse.unquote(self._match_id(url)) + + try: # yt-dlp's default user-agents are too old and blocked by the site + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + # Retry with impersonation if hardcoded UA is insufficient + webpage = self._download_webpage(url, display_id, impersonate=True) + + data = self._search_json( + r'<script[^>]+\bdata-media-id=[^>]+\btype="application/json"[^>]*>', + webpage, 'audio data', display_id) + + return { + 'id': data['mediaId'], + 'display_id': display_id, + 'vcodec': 'none', + 'title': self._html_extract_title(webpage), + **self._search_json_ld(webpage, display_id, fatal=False), + **traverse_obj(data, { + 'title': ('title', {str}), + 'url': ('sources', ..., 'url', {url_or_none}, any), + 'duration': ('sources', ..., 'duration', {float_or_none}, any), + }), + } diff --git a/yt_dlp/extractor/gamespot.py b/yt_dlp/extractor/gamespot.py index cd3f9655d8..2799a27bac 100644 --- a/yt_dlp/extractor/gamespot.py +++ b/yt_dlp/extractor/gamespot.py @@ -1,9 +1,9 @@ import urllib.parse -from .once import OnceIE +from .common import InfoExtractor -class GameSpotIE(OnceIE): +class GameSpotIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 67c224e502..721d04e317 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -37,6 +37,7 @@ unescapeHTML, unified_timestamp, unsmuggle_url, + update_url, update_url_query, url_or_none, urlhandle_detect_ext, @@ -2213,10 +2214,21 @@ def hex_or_none(value): if is_live is not None: info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' return - headers = m3u8_format.get('http_headers') or info.get('http_headers') - duration = self._extract_m3u8_vod_duration( - m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', - errnote='Failed to download m3u8 media playlist', headers=headers) + headers = m3u8_format.get('http_headers') or info.get('http_headers') or {} + display_id = info.get('id') + urlh = self._request_webpage( + m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False, + headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False) + if urlh is False: + return + first_bytes = urlh.read(512) + if not first_bytes.startswith(b'#EXTM3U'): + return + m3u8_doc = self._webpage_read_content( + urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False) + if not m3u8_doc: + return + duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id) if not duration: info['live_status'] = 'is_live' info['duration'] = info.get('duration') or duration @@ -2526,12 +2538,13 @@ def _real_extract(self, url): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.url), + xspf_base_url=new_url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=full_response.url.rpartition('/')[0], + # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs + mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0], mpd_url=url) info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None self._extra_manifest_info(info_dict, url)
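The generic-extractor change above probes the candidate playlist before parsing it: it fetches only the first bytes and checks for the #EXTM3U magic that every valid HLS playlist must start with. A simplified standalone version of that probe (plain urllib, not yt-dlp's networking stack):

```python
import urllib.request

def looks_like_m3u8(url: str, timeout: float = 10.0) -> bool:
    # A valid HLS playlist must begin with '#EXTM3U'; 512 bytes is plenty.
    # 'identity' avoids compressed responses so the magic bytes are readable.
    req = urllib.request.Request(url, headers={'Accept-Encoding': 'identity'})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read(512).startswith(b'#EXTM3U')
    except OSError:
        return False
```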
diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index b7581d77e2..2d923cf540 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -8,7 +8,7 @@ class GetCourseRuPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _VALID_URL = r'https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+' _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL}[^\'"]*)'] _TESTS = [{ 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', @@ -20,6 +20,16 @@ class GetCourseRuPlayerIE(InfoExtractor): 'duration': 1693, }, 'skip': 'JWT expired', + }, { + 'url': 'https://cf-api-2.vhcdn.com/sign-player/?json=example', + 'info_dict': { + 'id': '435735291', + 'title': '8afd7c489952108e00f019590f3711f3', + 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/8afd7c489952108e00f019590f3711f3/preview.jpg?version=1682170973&host=vh-72', + 'duration': 777, + }, + 'skip': 'JWT expired', }] def _real_extract(self, url): @@ -168,7 +178,7 @@ def _real_extract(self, url): playlist_id = self._search_regex( r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) - title = self._og_search_title(webpage) or self._html_extract_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) return self.playlist_from_matches( re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage),
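The json query parameter in the sign-player URLs above visibly carries base64-encoded JSON (the eyJ... prefix is {" encoded). A small sketch of unpacking such a parameter; the payload here is invented, and the real extractor's handling may differ:

```python
import base64
import json
import urllib.parse

# Invented embed URL with a made-up payload, {"duration": 777} in base64
embed = 'https://cf-api-2.vhcdn.com/sign-player/?json=eyJkdXJhdGlvbiI6IDc3N30='
payload = urllib.parse.parse_qs(urllib.parse.urlparse(embed).query)['json'][0]
# Re-pad to a multiple of 4 before decoding, then parse the JSON config
decoded = json.loads(base64.urlsafe_b64decode(payload + '=' * (-len(payload) % 4)))
assert decoded == {'duration': 777}
```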
diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py index 83c1979db8..4e138a828f 100644 --- a/yt_dlp/extractor/go.py +++ b/yt_dlp/extractor/go.py @@ -7,161 +7,157 @@ int_or_none, join_nonempty, parse_age_limit, - remove_end, - remove_start, - traverse_obj, - try_get, unified_timestamp, urlencode_postdata, ) +from ..utils.traversal import traverse_obj class GoIE(AdobePassIE): _SITE_INFO = { 'abc': { 'brand': '001', - 'requestor_id': 'ABC', + 'requestor_id': 'dtci', + 'provider_id': 'ABC', + 'software_statement': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI4OTcwMjlkYS0yYjM1LTQyOWUtYWQ0NS02ZjZiZjVkZTdhOTUiLCJuYmYiOjE2MjAxNzM5NjksImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNjIwMTczOTY5fQ.SC69DVJWSL8sIe-vVUrP6xS_kzHKqwz9PdKYexs_y-f7Vin6mM-7S-W1TE_-K55O0pyf-TL4xYgvm6LIye8CckG-nZfVwNPV4huduov0jmIcxCQFeUwkHULG2IaA44wfBVUBdaHgkhPweZ2amjycO_IXtez-gBXOLbE3B7Gx9j_5ISCFtyVUblThKfoGyQv6KT6t8Vpmc4ZSKCCQp74KWFFypydb9ucego1taW_nQD06Cdf4yByLd6NaTBceMcIKbug9b9gxFm3XBgJ5q3z7KGo1Kr6XalAV5j4m-fQ91wczlTilX8FM4AljMupyRM9mA_aEADILQ4hS79q4SM0w6w', }, 'freeform': { 'brand': '002', 'requestor_id': 'ABCFamily', - }, - 'watchdisneychannel': { - 'brand': '004', - 'resource_id': 'Disney', - }, - 'watchdisneyjunior': { - 'brand': '008', - 'resource_id': 'DisneyJunior', - }, - 'watchdisneyxd': { - 'brand': '009', - 'resource_id': 'DisneyXD', + 'provider_id': 'ABCFamily', + 'software_statement': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZWM2MGYyNC0xYzRjLTQ1NzQtYjc0Zi03ZmM4N2E5YWMzMzgiLCJuYmYiOjE1ODc2NjU5MjMsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTg3NjY1OTIzfQ.flCn3dhvmvPnWmV0JV8Fm0YFyj07yPez9-n1GFEwVIm_S2wQVWbWyJhqsAyLZVFrhOMZYTqmPS3OHxGwTwXkEYn6PD7o_vIVG3oqi-Xn1m5jRt_Gazw5qEtpat6VE7bvKGSD3ZhcidOrsCk8NcYyq75u61NHDvSl81pcedJjVRVUpsqrEwmo0aVbA0C8PX3ri0mEbGvkMKvHn8E60xp-PSE-VK8SDT0plwPu_TwUszkZ6-_I8_2xcv_WBqcXFkAVg7Q-iNJXgQvmNsrpcrYuLvi6hEH4ZLtoDcXU6MhwTQAJTiHSo8x9aHX1_qFP09CzlNOFQbC2ZEJdP9SvA53SLQ', }, 'disneynow': { - 'brand': '011', + 'brand': '011', # also: '004', '008', '009' + 'requestor_id': 'DisneyChannels', + 'provider_id': 'DisneyChannels', + 'software_statement': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI1MzAzNTRiOS04NDNiLTRkNjAtYTQ3ZS0yNzk1MzlkOTIyNTciLCJuYmYiOjE1NTg5ODc0NDksImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTU4OTg3NDQ5fQ.Jud6YS6-J2h0h6po0oMheDym0qRTJQGj4kzacrz4DFuEwhcBkkykW6pF5pKuAUJy9HCZ40oDAHe2KcTlDJjCZF5tDaUEfdihakZ9cC_rG7MU-QoRne8qaB_dPDKwGuk-ZyWD8eV3zwTJmbGo8hDxYTEU81YNCxwhyc_BPDr5TYiubbmpP3_pTnXmSpuL58isJ2peSKWlX9BacuXtBY25c_QnPFKk-_EETm7IHkTpDazde1QfHWGu4s4yJpKGk8RVVujVG6h6ELlL-ZeYLilBm7iS7h1TYG1u7fJhyZRL7isaom6NvAzsvN3ngss1fLwt8decP8wzdFHrbYTdTjW8qw', 'resource_id': 'Disney', }, - 'fxnow.fxnetworks': { - 'brand': '025', + 'fxnetworks': { + 'brand': '025', # also: '020' 'requestor_id': 'dtci', + 'provider_id': 'fx', # also 'fxx', 'fxm' + 'software_statement': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIzYWRhYWZiNC02OTAxLTRlYzktOTdmNy1lYWZkZTJkODJkN2EiLCJuYmYiOjE1NjIwMjQwNzYsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTYyMDI0MDc2fQ.dhKMpZK50AObbZYrMiYPSfWtzXHUaeMP3jrIY4Cgfvh0GaEgk0Mns_zp78jypFeZgRtPVleQMQDNq2YEloRLcAGqP1aa6WVDglnK77ZWUm4IKai14Rwf3A6YBhSRoO2_lMmUGkuTf6gZY-kMIPqBYKqzTQiQl4HbniPFodIzFRiuI9QJVrkoyTGrJL4oqiX08PoFI3Z-TOti1Heu3EbFC-GveQHhlinYrzU7rbiAqLEz7FImtfBDsnXX1Y3uJDLYM3Bq4Oh0nrzTv1Fd62wNsCNErHHIbELidh1zZF0ujvt7ReuZUwAitm0UhEJ7OxNOUbEQWtae6pVNscvdvTFMpg', + }, + 'nationalgeographic': { + 'brand': '026', # also '023' + 'requestor_id': 'dtci', + 'provider_id': 'ngc', # also 'ngw' + 'software_statement': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIxMzE4YTM1Ni05Mjc4LTQ4NjEtYTFmNi1jMTIzMzg1ZWMzYzMiLCJuYmYiOjE1NjIwMjM4MjgsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTYyMDIzODI4fQ.Le-2OzF9-jrhJ7ZfWtLWk5iSHGVZoxeU1w0_fO--Heli0OwRZsRq2slSmx-oZTzxuWmAgDEiBkWSDcDK6sM25DrCLsdsJa3MBuZ-slBRtH8aq3HpNoqqLkU-vg6gRUEKMtwBUtwCu_9aKUCayYtndWv4b1DjVQeSrteOW5NNudWVYleAe0kxeNJQHo5If9SCzDudKVJktFUjhNks4QPOC_uONPkRRlL9D0fNvtOY-LRFckfcHhf5z9l1iZjeukV0YhdKnuw1wyiaWrQXBUDiBfbkCRd2DM-KnelqPxfiXCaTjGKDURRBO3pz33ebge3IFXSiU5vl4qHQ8xvunzGpFw', }, } - _VALID_URL = r'''(?x) - https?:// - (?P<sub_domain> - (?:{}\.)?go|fxnow\.fxnetworks| - (?:www\.)?(?:abc|freeform|disneynow) - )\.com/ - (?: - (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)| - (?:[^/]+/)*(?P<display_id>[^/?\#]+) - ) - '''.format(r'\.|'.join(list(_SITE_INFO.keys()))) + _URL_PATH_RE = r'(?:video|episode|movies-and-specials)/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _VALID_URL = [ + fr'https?://(?:www\.)?(?P<site>abc)\.com/{_URL_PATH_RE}', + fr'https?://(?:www\.)?(?P<site>freeform)\.com/{_URL_PATH_RE}', + fr'https?://(?:www\.)?(?P<site>disneynow)\.com/{_URL_PATH_RE}', + fr'https?://fxnow\.(?P<site>fxnetworks)\.com/{_URL_PATH_RE}', + fr'https?://(?:www\.)?(?P<site>nationalgeographic)\.com/tv/{_URL_PATH_RE}', + ] _TESTS = [{ - 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', + 'url': 'https://abc.com/episode/4192c0e6-26e5-47a8-817b-ce8272b9e440/playlist/PL551127435', 'info_dict': { - 'id': 'VDKA3807643', + 'id': 'VDKA10805898', 'ext': 'mp4', - 'title': 'The Traitor in the White House', - 'description': 'md5:05b009d2d145a1e85d25111bd37222e8', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'This content is no longer available.', - }, { - 'url': 'https://disneynow.com/shows/big-hero-6-the-series', - 'info_dict': { - 'title': 'Doraemon', - 'id': 'SH55574025', - }, - 'playlist_mincount': 51, - }, { - 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', - 'info_dict': { - 'id': 'VDKA3609139', - 'title': 'This Guilty Blood', - 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', + 'title': 'Switch the Flip', + 
'description': 'To help get Brian’s life in order, Stewie and Brian swap bodies using a machine that Stewie invents.', 'age_limit': 14, + 'duration': 1297, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'series': 'Family Guy', + 'season': 'Season 16', + 'season_number': 16, + 'episode': 'Episode 17', + 'episode_number': 17, + 'timestamp': 1746082800.0, + 'upload_date': '20250501', + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', + }, { + 'url': 'https://disneynow.com/episode/21029660-ba06-4406-adb0-a9a78f6e265e/playlist/PL553044961', + 'info_dict': { + 'id': 'VDKA39546942', + 'ext': 'mp4', + 'title': 'Zero Friends Again', + 'description': 'Relationships fray under the pressures of a difficult journey.', + 'age_limit': 0, + 'duration': 1721, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'series': 'Star Wars: Skeleton Crew', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 6', + 'episode_number': 6, + 'timestamp': 1746946800.0, + 'upload_date': '20250511', + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', + }, { + 'url': 'https://fxnow.fxnetworks.com/episode/09f4fa6f-c293-469e-aebe-32c9ca5842a7/playlist/PL554408064', + 'info_dict': { + 'id': 'VDKA38112033', + 'ext': 'mp4', + 'title': 'The Return of Jerry', + 'description': 'The vampires’ long-lost fifth roommate returns. Written by Paul Simms; directed by Kyle Newacheck.', + 'age_limit': 17, + 'duration': 1493, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'series': 'What We Do in the Shadows', + 'season': 'Season 6', + 'season_number': 6, 'episode': 'Episode 1', - 'upload_date': '20170102', - 'season': 'Season 2', - 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abcf/Shadowhunters/video/201/ae5f75608d86bf88aa4f9f4aa76ab1b7/579x325-Q100_ae5f75608d86bf88aa4f9f4aa76ab1b7.jpg', - 'duration': 2544, - 'season_number': 2, - 'series': 'Shadowhunters', 'episode_number': 1, - 'timestamp': 1483387200, - 'ext': 'mp4', - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, + 'timestamp': 1729573200.0, + 'upload_date': '20241022', }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'This video requires AdobePass MSO credentials', }, { - 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-04/12-the-knock', + 'url': 'https://www.freeform.com/episode/bda0eaf7-761a-4838-aa44-96f794000844/playlist/PL553044961', 'info_dict': { - 'id': 'VDKA26050359', - 'title': 'The Knock', - 'description': 'md5:0c2947e3ada4c31f28296db7db14aa64', - 'age_limit': 14, + 'id': 'VDKA39007340', 'ext': 'mp4', - 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abc/TheRookie/video/412/daf830d06e83b11eaf5c0a299d993ae3/1556x876-Q75_daf830d06e83b11eaf5c0a299d993ae3.jpg', - 'episode': 'Episode 12', - 'season_number': 4, - 'season': 'Season 4', - 'timestamp': 1642975200, - 'episode_number': 12, - 'upload_date': '20220123', - 'series': 'The Rookie', - 'duration': 2572, - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, + 'title': 'Angel\'s Landing', + 'description': 'md5:91bf084e785c968fab16734df7313446', + 'age_limit': 14, + 'duration': 2523, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'series': 'How I Escaped My Cult', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1740038400.0, + 'upload_date': '20250220', }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 
'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', + 'url': 'https://www.nationalgeographic.com/tv/episode/ca694661-1186-41ae-8089-82f64d69b16d/playlist/PL554408064', 'info_dict': { - 'id': 'VDKA12782841', - 'title': 'First Look: Better Things - Season 2', - 'description': 'md5:fa73584a95761c605d9d54904e35b407', + 'id': 'VDKA39492078', 'ext': 'mp4', - 'age_limit': 14, - 'upload_date': '20170825', - 'duration': 161, - 'series': 'Better Things', - 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/fx/BetterThings/video/12782841/b6b05e58264121cc2c98811318e6d507/1556x876-Q75_b6b05e58264121cc2c98811318e6d507.jpg', - 'timestamp': 1503661074, - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, + 'title': 'Heart of the Emperors', + 'description': 'md5:4fc50a2878f030bb3a7eac9124dca677', + 'age_limit': 0, + 'duration': 2775, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'series': 'Secrets of the Penguins', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + 'timestamp': 1745204400.0, + 'upload_date': '20250421', }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', + 'url': 'https://www.freeform.com/movies-and-specials/c38281fc-9f8f-47c7-8220-22394f9df2e1', 'only_matching': True, }, { - 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', - 'only_matching': True, - }, { - # brand 004 - 'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915', - 'only_matching': True, - }, { - # brand 008 - 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', - 'only_matching': True, - }, { - 'url': 'https://disneynow.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', - 'only_matching': True, - }, { - 'url': 'https://www.freeform.com/shows/cruel-summer/episode-guide/season-01/01-happy-birthday-jeanette-turner', + 'url': 'https://abc.com/video/219a454a-172c-41bf-878a-d169e6bc0bdc/playlist/PL5523098420', 'only_matching': True, }] @@ -171,58 +167,29 @@ def _extract_videos(self, brand, video_id='-1', show_id='-1'): f'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/{brand}/001/-1/{show_id}/-1/{video_id}/-1/-1.json', display_id)['video'] + def _extract_global_var(self, name, webpage, video_id): + return self._search_json( + fr'window\[["\']{re.escape(name)}["\']\]\s*=', + webpage, f'{name.strip("_")} JSON', video_id) + def _real_extract(self, url): - mobj = self._match_valid_url(url) - sub_domain = remove_start(remove_end(mobj.group('sub_domain') or '', '.go'), 'www.') - video_id, display_id = mobj.group('id', 'display_id') - site_info = self._SITE_INFO.get(sub_domain, {}) - brand = site_info.get('brand') - if not video_id or not site_info: - webpage = self._download_webpage(url, display_id or video_id) - data = self._parse_json( - self._search_regex( - r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, - 'data', default='{}'), - display_id or video_id, fatal=False) - # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot - layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) - video_id = None - if layout: - video_id = try_get( - layout, - (lambda x: x['videoid'], lambda x: x['video']['id']), - str) - if not video_id: - video_id = self._search_regex( - ( - # There 
may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # page.analytics.videoIdCode - r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)', - ), webpage, 'video id', default=video_id) - if not site_info: - brand = self._search_regex( - (r'data-brand=\s*["\']\s*(\d+)', - r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand', - default='004') - site_info = next( - si for _, si in self._SITE_INFO.items() - if si.get('brand') == brand) - if not video_id: - # show extraction works for Disney, DisneyJunior and DisneyXD - # ABC and Freeform has different layout - show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') - videos = self._extract_videos(brand, show_id=show_id) - show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) - entries = [] - for video in videos: - entries.append(self.url_result( - video['url'], 'Go', video.get('id'), video.get('title'))) - entries.reverse() - return self.playlist_result(entries, show_id, show_title) + site, display_id = self._match_valid_url(url).group('site', 'id') + webpage = self._download_webpage(url, display_id) + config = self._extract_global_var('__CONFIG__', webpage, display_id) + data = self._extract_global_var(config['globalVar'], webpage, display_id) + video_id = traverse_obj(data, ( + 'page', 'content', 'video', 'layout', (('video', 'id'), 'videoid'), {str}, any)) + if not video_id: + video_id = self._search_regex([ + # data-track-video_id="VDKA39492078" + # data-track-video_id_code="vdka39492078" + # data-video-id="'VDKA3609139'" + r'data-(?:track-)?video[_-]id(?:_code)?=["\']*((?:vdka|VDKA)\d+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\d+)'], webpage, 'video ID') + + site_info = self._SITE_INFO[site] + brand = site_info['brand'] video_data = self._extract_videos(brand, video_id)[0] video_id = video_data['id'] title = video_data['title'] @@ -238,26 +205,31 @@ def _real_extract(self, url): if ext == 'm3u8': video_type = video_data.get('type') data = { - 'video_id': video_data['id'], + 'video_id': video_id, 'video_type': video_type, 'brand': brand, 'device': '001', + 'app_name': 'webplayer-abc', } if video_data.get('accesslevel') == '1': - requestor_id = site_info.get('requestor_id', 'DisneyChannels') + provider_id = site_info['provider_id'] + software_statement = traverse_obj(data, ('app', 'config', ( + ('features', 'auth', 'softwareStatement'), + ('tvAuth', 'SOFTWARE_STATEMENTS', 'PRODUCTION'), + ), {str}, any)) or site_info['software_statement'] resource = site_info.get('resource_id') or self._get_mvpd_resource( - requestor_id, title, video_id, None) + provider_id, title, video_id, None) auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) + url, video_id, site_info['requestor_id'], resource, software_statement) data.update({ 'token': auth, 'token_type': 'ap', - 'adobe_requestor_id': requestor_id, + 'adobe_requestor_id': provider_id, }) else: self._initialize_geo_bypass({'countries': ['US']}) entitlement = self._download_json( - 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', + 'https://prod.gatekeeper.us-abc.symphony.edgedatg.go.com/vp2/ws-secure/entitlement/2020/playmanifest_secure.json', video_id, data=urlencode_postdata(data)) errors 
= entitlement.get('errors', {}).get('errors', []) if errors: @@ -267,7 +239,7 @@ def _real_extract(self, url): error['message'], countries=['US']) error_message = ', '.join([error['message'] for error in errors]) raise ExtractorError(f'{self.IE_NAME} said: {error_message}', expected=True) - asset_url += '?' + entitlement['uplynkData']['sessionKey'] + asset_url += '?' + entitlement['entitlement']['uplynkData']['sessionKey'] fmts, subs = self._extract_m3u8_formats_and_subtitles( asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False) formats.extend(fmts)
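The new _extract_global_var helper in the GoIE rewrite above pulls page-global JSON such as window['__CONFIG__'] = {...} out of the markup. A rough standalone approximation (yt-dlp's _search_json does proper brace matching; this simplified regex only handles un-nested objects):

```python
import json
import re

def extract_global_var(name: str, webpage: str):
    # Simplified: stops at the first '};', so nested objects are NOT
    # handled, unlike InfoExtractor._search_json in yt-dlp itself
    m = re.search(
        rf'window\[["\']{re.escape(name)}["\']\]\s*=\s*(\{{.*?\}});',
        webpage, re.DOTALL)
    return json.loads(m.group(1)) if m else None

page = 'window["__CONFIG__"] = {"globalVar": "__DATA__"};'
assert extract_global_var('__CONFIG__', page) == {'globalVar': '__DATA__'}
```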
diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py index d9004293ff..c3c7bb32e2 100644 --- a/yt_dlp/extractor/hse.py +++ b/yt_dlp/extractor/hse.py @@ -6,7 +6,7 @@ ) -class HSEShowBaseInfoExtractor(InfoExtractor): +class HSEShowBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] def _extract_redux_data(self, url, video_id): @@ -28,7 +28,7 @@ def _extract_formats_and_subtitles(self, sources, video_id): return formats, subtitles -class HSEShowIE(HSEShowBaseInfoExtractor): +class HSEShowIE(HSEShowBaseIE): _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.hse.de/dpl/c/tv-shows/505350', @@ -64,7 +64,7 @@ def _real_extract(self, url): } -class HSEProductIE(HSEShowBaseInfoExtractor): +class HSEProductIE(HSEShowBaseIE): _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.hse.de/dpl/p/product/408630', diff --git a/yt_dlp/extractor/ichinanalive.py b/yt_dlp/extractor/ichinanalive.py index a37cfe77bd..475d33593d 100644 --- a/yt_dlp/extractor/ichinanalive.py +++ b/yt_dlp/extractor/ichinanalive.py @@ -1,5 +1,13 @@ + from .common import InfoExtractor -from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + traverse_obj, + unified_strdate, + url_or_none, +) class IchinanaLiveIE(InfoExtractor): @@ -157,3 +165,51 @@ def _real_extract(self, url): 'description': view_data.get('caption'), 'upload_date': unified_strdate(str_or_none(view_data.get('createdAt'))), } + + +class IchinanaLiveVODIE(InfoExtractor): + IE_NAME = '17live:vod' + _VALID_URL = r'https?://(?:www\.)?17\.live/ja/vod/[^/?#]+/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://17.live/ja/vod/27323042/2cf84520-e65e-4b22-891e-1d3a00b0f068', + 'md5': '3299b930d7457b069639486998a89580', + 'info_dict': { + 'id': '2cf84520-e65e-4b22-891e-1d3a00b0f068', + 'ext': 'mp4', + 'title': 'md5:b5f8cbf497d54cc6a60eb3b480182f01', + 'uploader': 'md5:29fb12122ab94b5a8495586e7c3085a5', + 'uploader_id': '27323042', + 'channel': '🌟オールナイトニッポン アーカイブ🌟', + 'channel_id': '2b4f85f1-d61e-429d-a901-68d32bdd8645', + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)', + 'duration': 549, + 'description': 'md5:116f326579700f00eaaf5581aae1192e', + 'timestamp': 1741058645, + 'upload_date': '20250304', + }, + }, { + 'url': 'https://17.live/ja/vod/27323042/0de11bac-9bea-40b8-9eab-0239a7d88079', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://wap-api.17app.co/api/v1/vods/{video_id}', video_id) + + return traverse_obj(json_data, { + 'id': ('vodID', {str}), + 'title': ('title', {str}), + 'formats': ('vodURL', {lambda x: self._extract_m3u8_formats(x, video_id)}), + 'uploader': ('userInfo', 'displayName', {str}), + 'uploader_id': ('userInfo', 'roomID', {int}, {str_or_none}), + 'channel': ('userInfo', 'name', {str}), + 'channel_id': ('userInfo', 'userID', {str}), + 'like_count': ('likeCount', {int_or_none}), + 'view_count': ('viewCount', {int_or_none}), + 'thumbnail': ('imageURL', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'description': ('description', {str}), + 'timestamp': ('createdAt', {int_or_none}), + }) diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 9b91a454b1..bf2344f720 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -1,3 +1,4 @@ +import json import re import time @@ -6,9 +7,7 @@ ExtractorError, determine_ext, js_to_json, - parse_qs, traverse_obj, - urlencode_postdata, ) @@ -16,7 +15,6 @@ class IPrimaIE(InfoExtractor): _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False _NETRC_MACHINE = 'iprima' - _AUTH_ROOT = 'https://auth.iprima.cz' access_token = None _TESTS = [{ @@ -86,48 +84,18 @@ def _perform_login(self, username, password): if self.access_token: return - login_page = self._download_webpage( - f'{self._AUTH_ROOT}/oauth2/login', None, note='Downloading login page', - errnote='Downloading login page failed') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - '_email': username, - '_password': password}) - - profile_select_html, login_handle = self._download_webpage_handle( - f'{self._AUTH_ROOT}/oauth2/login', None, data=urlencode_postdata(login_form), - note='Logging in') - - # a profile may need to be selected first, even when there is only a single one - if '/profile-select' in login_handle.url: - profile_id = self._search_regex( - r'data-identifier\s*=\s*["\']?(\w+)', profile_select_html, 'profile id') - - login_handle = self._request_webpage( - f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None, - query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile') - - code = traverse_obj(login_handle.url, ({parse_qs}, 'code', 0)) - if not code: - raise ExtractorError('Login failed', expected=True) - - token_request_data = { - 'scope': 'openid+email+profile+phone+address+offline_access', - 'client_id': 'prima_sso', - 'grant_type': 'authorization_code', - 'code': code, - 'redirect_uri': f'{self._AUTH_ROOT}/sso/auth-check'} - token_data = self._download_json( - f'{self._AUTH_ROOT}/oauth2/token', None, - note='Downloading token', errnote='Downloading token failed', - data=urlencode_postdata(token_request_data)) + token_data = self._download_json( + 'https://ucet.iprima.cz/api/session/create', None, + note='Logging in', errnote='Failed to log in', + data=json.dumps({ 'email': username, 'password': password, 'deviceName': 'Windows Chrome', }).encode(), headers={'content-type': 'application/json'}) - self.access_token = token_data.get('access_token') - if self.access_token is None: - raise ExtractorError('Getting token failed', expected=True) + self.access_token = token_data['accessToken']['value'] + if not self.access_token: + raise ExtractorError('Failed to fetch access token') def _real_initialize(self): if not self.access_token: diff --git a/yt_dlp/extractor/ivoox.py b/yt_dlp/extractor/ivoox.py new file mode 100644 index 0000000000..36e02493a5 --- /dev/null +++ b/yt_dlp/extractor/ivoox.py @@ -0,0 +1,78 @@ +from .common import InfoExtractor +from ..utils import int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class IvooxIE(InfoExtractor): + _VALID_URL = ( + r'https?://(?:www\.)?ivoox\.com/(?:\w{2}/)?[^/?#]+_rf_(?P<id>[0-9]+)_1\.html', + r'https?://go\.ivoox\.com/rf/(?P<id>[0-9]+)', + ) + _TESTS 
= [{ + 'url': 'https://www.ivoox.com/dex-08x30-rostros-del-mal-los-asesinos-en-audios-mp3_rf_143594959_1.html', + 'md5': '993f712de5b7d552459fc66aa3726885', + 'info_dict': { + 'id': '143594959', + 'ext': 'mp3', + 'timestamp': 1742731200, + 'channel': 'DIAS EXTRAÑOS con Santiago Camacho', + 'title': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'description': 'md5:eae8b4b9740d0216d3871390b056bb08', + 'uploader': 'Santiago Camacho', + 'thumbnail': 'https://static-1.ivoox.com/audios/c/d/5/2/cd52f46783fe735000c33a803dce2554_XXL.jpg', + 'upload_date': '20250323', + 'episode': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'duration': 11837, + 'tags': ['españa', 'asesinos en serie', 'arropiero', 'historia criminal', 'mataviejas'], + }, + }, { + 'url': 'https://go.ivoox.com/rf/143594959', + 'only_matching': True, + }, { + 'url': 'https://www.ivoox.com/en/campodelgas-28-03-2025-audios-mp3_rf_144036942_1.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id, fatal=False) + + data = self._search_nuxt_data( + webpage, media_id, fatal=False, traverse=('data', 0, 'data', 'audio')) + + direct_download = self._download_json( + f'https://vcore-web.ivoox.com/v1/public/audios/{media_id}/download-url', media_id, fatal=False, + note='Fetching direct download link', headers={'Referer': url}) + + download_paths = { + *traverse_obj(direct_download, ('data', 'downloadUrl', {str}, filter, all)), + *traverse_obj(data, (('downloadUrl', 'mediaUrl'), {str}, filter)), + } + + formats = [] + for path in download_paths: + formats.append({ + 'url': urljoin('https://ivoox.com', path), + 'http_headers': {'Referer': url}, + }) + + return { + 'id': media_id, + 'formats': formats, + 'uploader': self._html_search_regex(r'data-prm-author="([^"]+)"', webpage, 'author', default=None), + 'timestamp': parse_iso8601( + self._html_search_regex(r'data-prm-pubdate="([^"]+)"', webpage, 'timestamp', default=None)), + 'channel': self._html_search_regex(r'data-prm-podname="([^"]+)"', webpage, 'channel', default=None), + 'title': self._html_search_regex(r'data-prm-title="([^"]+)"', webpage, 'title', default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + **self._search_json_ld(webpage, media_id, default={}), + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('uploadDate', {parse_iso8601(delimiter=' ')}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'name', {str}), + }), + } diff --git a/yt_dlp/extractor/jamendo.py b/yt_dlp/extractor/jamendo.py index 16540c4147..bac0e869ce 100644 --- a/yt_dlp/extractor/jamendo.py +++ b/yt_dlp/extractor/jamendo.py @@ -2,10 +2,12 @@ import random from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( clean_html, int_or_none, try_get, + urlhandle_detect_ext, ) @@ -27,7 +29,7 @@ class JamendoIE(InfoExtractor): 'ext': 'flac', # 'title': 'Maya Filipič - Stories from Emona I', 'title': 'Stories from Emona I', - 'artist': 'Maya Filipič', + 'artists': ['Maya Filipič'], 'album': 'Between two worlds', 'track': 'Stories from Emona I', 'duration': 210, @@ -93,9 +95,15 @@ def _real_extract(self, url): if not cover_url or cover_url in urls: continue urls.append(cover_url) + urlh = self._request_webpage( + 
HEADRequest(cover_url), track_id, 'Checking thumbnail extension', + errnote=False, fatal=False) + if not urlh: + continue size = int_or_none(cover_id.lstrip('size')) thumbnails.append({ 'id': cover_id, + 'ext': urlhandle_detect_ext(urlh, default='jpg'), 'url': cover_url, 'width': size, 'height': size, diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 030fe686bd..3760e9abea 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,23 +1,33 @@ import functools +import itertools import math import re from .common import InfoExtractor from ..utils import ( InAdvancePagedList, + ISO639Utils, + OnDemandPagedList, clean_html, int_or_none, + js_to_json, make_archive_id, + orderedSet, smuggle_url, + unified_strdate, + unified_timestamp, unsmuggle_url, url_basename, url_or_none, urlencode_postdata, + urljoin, + variadic, ) from ..utils.traversal import traverse_obj class JioSaavnBaseIE(InfoExtractor): + _URL_BASE_RE = r'https?://(?:www\.)?(?:jio)?saavn\.com' _API_URL = 'https://www.jiosaavn.com/api.php' _VALID_BITRATES = {'16', '32', '64', '128', '320'} @@ -30,16 +40,20 @@ def requested_bitrates(self): f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}') return requested_bitrates - def _extract_formats(self, song_data): + def _extract_formats(self, item_data): + # Show/episode JSON data has a slightly different structure than song JSON data + if media_url := traverse_obj(item_data, ('more_info', 'encrypted_media_url', {str})): + item_data.setdefault('encrypted_media_url', media_url) + for bitrate in self.requested_bitrates: media_data = self._download_json( - self._API_URL, song_data['id'], + self._API_URL, item_data['id'], f'Downloading format info for {bitrate}', fatal=False, data=urlencode_postdata({ '__call': 'song.generateAuthToken', '_format': 'json', 'bitrate': bitrate, - 'url': song_data['encrypted_media_url'], + 'url': item_data['encrypted_media_url'], })) if not traverse_obj(media_data, ('auth_url', {url_or_none})): self.report_warning(f'Unable to extract format info for {bitrate}') @@ -53,24 +67,6 @@ def _extract_formats(self, song_data): 'vcodec': 'none', } - def _extract_song(self, song_data, url=None): - info = traverse_obj(song_data, { - 'id': ('id', {str}), - 'title': ('song', {clean_html}), - 'album': ('album', {clean_html}), - 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), - 'duration': ('duration', {int_or_none}), - 'view_count': ('play_count', {int_or_none}), - 'release_year': ('year', {int_or_none}), - 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), - 'webpage_url': ('perma_url', {url_or_none}), - }) - if webpage_url := info.get('webpage_url') or url: - info['display_id'] = url_basename(webpage_url) - info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] - - return info - def _call_api(self, type_, token, note='API', params={}): return self._download_json( self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON', @@ -84,19 +80,89 @@ def _call_api(self, type_, token, note='API', params={}): **params, }) - def _yield_songs(self, playlist_data): - for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): - song_info = self._extract_song(song_data) - url = smuggle_url(song_info['webpage_url'], { - 'id': song_data['id'], - 'encrypted_media_url': song_data['encrypted_media_url'], - }) - yield self.url_result(url, JioSaavnSongIE, url_transparent=True, 
**song_info) + @staticmethod + def _extract_song(song_data, url=None): + info = traverse_obj(song_data, { + 'id': ('id', {str}), + 'title': (('song', 'title'), {clean_html}, any), + 'album': ((None, 'more_info'), 'album', {clean_html}, any), + 'duration': ((None, 'more_info'), 'duration', {int_or_none}, any), + 'channel': ((None, 'more_info'), 'label', {str}, any), + 'channel_id': ((None, 'more_info'), 'label_id', {str}, any), + 'channel_url': ((None, 'more_info'), 'label_url', {urljoin('https://www.jiosaavn.com/')}, any), + 'release_date': ((None, 'more_info'), 'release_date', {unified_strdate}, any), + 'release_year': ('year', {int_or_none}), + 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), + 'view_count': ('play_count', {int_or_none}), + 'language': ('language', {lambda x: ISO639Utils.short2long(x.casefold()) or 'und'}), + 'webpage_url': ('perma_url', {url_or_none}), + 'artists': ('more_info', 'artistMap', 'primary_artists', ..., 'name', {str}, filter, all), + }) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + + if primary_artists := traverse_obj(song_data, ('primary_artists', {lambda x: x.split(', ') if x else None})): + info['artists'].extend(primary_artists) + if featured_artists := traverse_obj(song_data, ('featured_artists', {str}, filter)): + info['artists'].extend(featured_artists.split(', ')) + info['artists'] = orderedSet(info['artists']) or None + + return info + + @staticmethod + def _extract_episode(episode_data, url=None): + info = JioSaavnBaseIE._extract_song(episode_data, url) + info.pop('_old_archive_ids', None) + info.update(traverse_obj(episode_data, { + 'description': ('more_info', 'description', {str}), + 'timestamp': ('more_info', 'release_time', {unified_timestamp}), + 'series': ('more_info', 'show_title', {str}), + 'series_id': ('more_info', 'show_id', {str}), + 'season': ('more_info', 'season_title', {str}), + 'season_number': ('more_info', 'season_no', {int_or_none}), + 'season_id': ('more_info', 'season_id', {str}), + 'episode_number': ('more_info', 'episode_number', {int_or_none}), + 'cast': ('starring', {lambda x: x.split(', ') if x else None}), + })) + return info + + def _extract_jiosaavn_result(self, url, endpoint, response_key, parse_func): + url, smuggled_data = unsmuggle_url(url) + data = traverse_obj(smuggled_data, ({ + 'id': ('id', {str}), + 'encrypted_media_url': ('encrypted_media_url', {str}), + })) + + if 'id' in data and 'encrypted_media_url' in data: + result = {'id': data['id']} + else: + # only extract metadata if this is not a url_transparent result + data = self._call_api(endpoint, self._match_id(url))[response_key][0] + result = parse_func(data, url) + + result['formats'] = list(self._extract_formats(data)) + return result + + def _yield_items(self, playlist_data, keys=None, parse_func=None): + """Subclasses using this method must set _ENTRY_IE""" + if parse_func is None: + parse_func = self._extract_song + + for item_data in traverse_obj(playlist_data, ( + *variadic(keys, (str, bytes, dict, set)), lambda _, v: v['id'] and v['perma_url'], + )): + info = parse_func(item_data) + url = smuggle_url(info['webpage_url'], traverse_obj(item_data, { + 'id': ('id', {str}), + 'encrypted_media_url': ((None, 'more_info'), 'encrypted_media_url', {str}, any), + })) + yield self.url_result(url, self._ENTRY_IE, url_transparent=True, **info) class JioSaavnSongIE(JioSaavnBaseIE): 
IE_NAME = 'jiosaavn:song' - _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'(?:/song/[^/?#]+/|/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', 'md5': '3b84396d15ed9e083c3106f1fa589c04', @@ -106,12 +172,38 @@ class JioSaavnSongIE(JioSaavnBaseIE): 'ext': 'm4a', 'title': 'Leja Re', 'album': 'Leja Re', - 'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'thumbnail': r're:https?://.+/.+\.jpg', 'duration': 205, 'view_count': int, 'release_year': 2018, 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'], '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'], + 'channel': 'T-Series', + 'language': 'hin', + 'channel_id': '34297', + 'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_', + 'release_date': '20181124', + }, + }, { + 'url': 'https://www.jiosaavn.com/song/chuttamalle/P1FfWjZkQ0Q', + 'md5': '96296c58d6ce488a417ef0728fd2d680', + 'info_dict': { + 'id': 'O94kBTtw', + 'display_id': 'P1FfWjZkQ0Q', + 'ext': 'm4a', + 'title': 'Chuttamalle', + 'album': 'Devara Part 1 - Telugu', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'duration': 222, + 'view_count': int, + 'release_year': 2024, + 'artists': 'count:3', + '_old_archive_ids': ['jiosaavnsong P1FfWjZkQ0Q'], + 'channel': 'T-Series', + 'language': 'tel', + 'channel_id': '34297', + 'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_', + 'release_date': '20240926', }, }, { 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', @@ -119,26 +211,51 @@ class JioSaavnSongIE(JioSaavnBaseIE): }] def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url) - song_data = traverse_obj(smuggled_data, ({ - 'id': ('id', {str}), - 'encrypted_media_url': ('encrypted_media_url', {str}), - })) + return self._extract_jiosaavn_result(url, 'song', 'songs', self._extract_song) - if 'id' in song_data and 'encrypted_media_url' in song_data: - result = {'id': song_data['id']} - else: - # only extract metadata if this is not a url_transparent result - song_data = self._call_api('song', self._match_id(url))['songs'][0] - result = self._extract_song(song_data, url) - result['formats'] = list(self._extract_formats(song_data)) - return result +class JioSaavnShowIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:show' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/[^/?#]+/(?P[^/?#]{11,})/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/shows/non-food-ways-to-boost-your-energy/XFMcKICOCgc_', + 'md5': '0733cd254cfe74ef88bea1eaedcf1f4f', + 'info_dict': { + 'id': 'qqzh3RKZ', + 'display_id': 'XFMcKICOCgc_', + 'ext': 'mp3', + 'title': 'Non-Food Ways To Boost Your Energy', + 'description': 'md5:26e7129644b5c6aada32b8851c3997c8', + 'episode': 'Episode 1', + 'timestamp': 1640563200, + 'series': 'Holistic Lifestyle With Neha Ranglani', + 'series_id': '52397', + 'season': 'Holistic Lifestyle With Neha Ranglani', + 'season_number': 1, + 'season_id': '61273', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'duration': 311, + 'view_count': int, + 'release_year': 2021, + 'language': 'eng', + 'channel': 'Saavn OG', + 'channel_id': '1953876', + 'episode_number': 1, + 'upload_date': '20211227', + 'release_date': '20211227', + }, + }, { + 'url': 'https://www.jiosaavn.com/shows/himesh-reshammiya/Kr8fmfSN4vo_', + 'only_matching': True, + }] + + def _real_extract(self, url): + 
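# NOTE: illustrative sketch, not part of the patch. The _VALID_URL changes in
# this file all follow one pattern: the shared JioSaavnBaseIE._URL_BASE_RE is
# concatenated with a per-class path regex that captures the video ID in a
# named group `id` (the group name that InfoExtractor._match_id expects).
#
#     import re
#
#     URL_BASE_RE = r'https?://(?:www\.)?(?:jio)?saavn\.com'
#     SONG_URL_RE = URL_BASE_RE + r'(?:/song/[^/?#]+/|/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
#
#     m = re.match(SONG_URL_RE, 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk')
#     assert m and m.group('id') == 'OQsEfQFVUXk'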
return self._extract_jiosaavn_result(url, 'episode', 'episodes', self._extract_episode) class JioSaavnAlbumIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:album' - _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/album/[^/?#]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', 'info_dict': { @@ -147,18 +264,19 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): }, 'playlist_count': 10, }] + _ENTRY_IE = JioSaavnSongIE def _real_extract(self, url): display_id = self._match_id(url) album_data = self._call_api('album', display_id) return self.playlist_result( - self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str}))) + self._yield_items(album_data, 'songs'), display_id, traverse_obj(album_data, ('title', {str}))) class JioSaavnPlaylistIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:playlist' - _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', 'info_dict': { @@ -172,15 +290,16 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE): 'id': 'DVR,pFUOwyXqIp77B1JF,A__', 'title': 'Mood Hindi', }, - 'playlist_mincount': 801, + 'playlist_mincount': 750, }, { 'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_', 'info_dict': { 'id': 'Me5RridRfDk_', 'title': 'Taaza Tunes', }, - 'playlist_mincount': 301, + 'playlist_mincount': 50, }] + _ENTRY_IE = JioSaavnSongIE _PAGE_SIZE = 50 def _fetch_page(self, token, page): @@ -189,7 +308,7 @@ def _fetch_page(self, token, page): def _entries(self, token, first_page_data, page): page_data = first_page_data if not page else self._fetch_page(token, page + 1) - yield from self._yield_songs(page_data) + yield from self._yield_items(page_data, 'songs') def _real_extract(self, url): display_id = self._match_id(url) @@ -199,3 +318,95 @@ def _real_extract(self, url): return self.playlist_result(InAdvancePagedList( functools.partial(self._entries, display_id, playlist_data), total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) + + +class JioSaavnShowPlaylistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:show:playlist' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/(?P[^#/?]+)/(?P\d+)/[^/?#]+' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/shows/talking-music/1/PjReFP-Sguk_', + 'info_dict': { + 'id': 'talking-music-1', + 'title': 'Talking Music', + }, + 'playlist_mincount': 11, + }] + _ENTRY_IE = JioSaavnShowIE + _PAGE_SIZE = 10 + + def _fetch_page(self, show_id, season_id, page): + return self._call_api('show', show_id, f'show page {page}', { + 'p': page, + '__call': 'show.getAllEpisodes', + 'show_id': show_id, + 'season_number': season_id, + 'api_version': '4', + 'sort_order': 'desc', + }) + + def _entries(self, show_id, season_id, page): + page_data = self._fetch_page(show_id, season_id, page + 1) + yield from self._yield_items(page_data, keys=None, parse_func=self._extract_episode) + + def _real_extract(self, url): + show_slug, season_id = self._match_valid_url(url).group('show', 'season') + playlist_id = f'{show_slug}-{season_id}' + webpage = self._download_webpage(url, playlist_id) + + show_info = self._search_json( + r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', + playlist_id, 
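# NOTE: illustrative sketch, not part of the patch. JioSaavnShowPlaylistIE
# (and JioSaavnArtistIE below) page through the API _PAGE_SIZE items at a
# time; yt-dlp wraps the per-page fetcher in OnDemandPagedList so a page is
# only downloaded when the requested playlist items actually need it.
# fetch_page below is a hypothetical stand-in for the _fetch_page API call.
#
#     import itertools
#
#     def fetch_page(page_num):  # hypothetical: one API request per page
#         return [] if page_num > 3 else [f'episode-{page_num}-{i}' for i in range(10)]
#
#     def entries():
#         for page_num in itertools.count(1):
#             page = fetch_page(page_num)
#             if not page:  # an empty page means we ran off the end
#                 return
#             yield from page
#
#     # Lazy: taking 20 entries downloads pages 1 and 2 only
#     assert len(list(itertools.islice(entries(), 20))) == 20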
transform_source=js_to_json)['showView'] + show_id = show_info['current_id'] + + entries = OnDemandPagedList(functools.partial(self._entries, show_id, season_id), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, traverse_obj(show_info, ('show', 'title', 'text', {str}))) + + +class JioSaavnArtistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:artist' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/artist/[^/?#]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/artist/krsna-songs/rYLBEve2z3U_', + 'info_dict': { + 'id': 'rYLBEve2z3U_', + 'title': 'KR$NA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'https://www.jiosaavn.com/artist/sanam-puri-songs/SkNEv3qRhDE_', + 'info_dict': { + 'id': 'SkNEv3qRhDE_', + 'title': 'Sanam Puri', + }, + 'playlist_mincount': 51, + }] + _ENTRY_IE = JioSaavnSongIE + _PAGE_SIZE = 50 + + def _fetch_page(self, artist_id, page): + return self._call_api('artist', artist_id, f'artist page {page + 1}', { + 'p': page, + 'n_song': self._PAGE_SIZE, + 'n_album': self._PAGE_SIZE, + 'sub_type': '', + 'includeMetaTags': '', + 'api_version': '4', + 'category': 'alphabetical', + 'sort_order': 'asc', + }) + + def _entries(self, artist_id, first_page): + for page in itertools.count(): + playlist_data = first_page if not page else self._fetch_page(artist_id, page) + if not traverse_obj(playlist_data, ('topSongs', ..., {dict})): + break + yield from self._yield_items(playlist_data, 'topSongs') + + def _real_extract(self, url): + artist_id = self._match_id(url) + first_page = self._fetch_page(artist_id, 0) + + return self.playlist_result( + self._entries(artist_id, first_page), artist_id, + traverse_obj(first_page, ('name', {str}))) diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py index 69f4a3ce03..e277564524 100644 --- a/yt_dlp/extractor/kika.py +++ b/yt_dlp/extractor/kika.py @@ -1,3 +1,5 @@ +import itertools + from .common import InfoExtractor from ..utils import ( determine_ext, @@ -124,3 +126,43 @@ def _extract_formats(self, media_info, video_id): 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), }), } + + +class KikaPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w-]+/(?P[a-z-]+\d+)' + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/logo-die-welt-und-ich-562', + 'info_dict': { + 'id': 'logo-die-welt-und-ich-562', + 'title': 'logo!', + 'description': 'md5:7b9d7f65561b82fa512f2cfb553c397d', + }, + 'playlist_count': 100, + }] + + def _entries(self, playlist_url, playlist_id): + for page in itertools.count(1): + data = self._download_json(playlist_url, playlist_id, note=f'Downloading page {page}') + for item in traverse_obj(data, ('content', lambda _, v: url_or_none(v['api']['url']))): + yield self.url_result( + item['api']['url'], ie=KikaIE, + **traverse_obj(item, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('date', {parse_iso8601}), + })) + + playlist_url = traverse_obj(data, ('links', 'next', {url_or_none})) + if not playlist_url: + break + + def _real_extract(self, url): + playlist_id = self._match_id(url) + brand_data = self._download_json( + f'https://www.kika.de/_next-api/proxy/v1/brands/{playlist_id}', playlist_id) + + return self.playlist_result( + self._entries(brand_data['videoSubchannel']['videosPageUrl'], playlist_id), + playlist_id, title=brand_data.get('title'), description=brand_data.get('description')) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index 
c8c8ae52ad..2974f4026f 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,4 +1,5 @@ import itertools +import json import re from .common import InfoExtractor @@ -9,12 +10,12 @@ int_or_none, mimetype2ext, srt_subtitles_timecode, - traverse_obj, try_get, url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import find_elements, require, traverse_obj class LinkedInBaseIE(InfoExtractor): @@ -82,7 +83,10 @@ def _get_video_id(self, video_data, course_slug, video_slug): class LinkedInIE(LinkedInBaseIE): - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)' + _VALID_URL = [ + r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P\d+)-\w{4}/?(?:[?#]|$)', + r'https?://(?:www\.)?linkedin\.com/feed/update/urn:li:activity:(?P\d+)', + ] _TESTS = [{ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', 'info_dict': { @@ -106,6 +110,9 @@ class LinkedInIE(LinkedInBaseIE): 'like_count': int, 'subtitles': 'mincount:1', }, + }, { + 'url': 'https://www.linkedin.com/feed/update/urn:li:activity:7016901149999955968/?utm_source=share&utm_medium=member_desktop', + 'only_matching': True, }] def _real_extract(self, url): @@ -271,3 +278,110 @@ def _real_extract(self, url): entries, course_slug, course_data.get('title'), course_data.get('description')) + + +class LinkedInEventsIE(LinkedInBaseIE): + IE_NAME = 'linkedin:events' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/events/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://www.linkedin.com/events/7084656651378536448/comments/', + 'info_dict': { + 'id': '7084656651378536448', + 'ext': 'mp4', + 'title': '#37 Aprende a hacer una entrevista en inglés para tu próximo trabajo remoto', + 'description': '¡Agarra para anotar que se viene tremendo evento!', + 'duration': 1765, + 'timestamp': 1689113772, + 'upload_date': '20230711', + 'release_timestamp': 1689174012, + 'release_date': '20230712', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://www.linkedin.com/events/27-02energyfreedombyenergyclub7295762520814874625/comments/', + 'info_dict': { + 'id': '27-02energyfreedombyenergyclub7295762520814874625', + 'ext': 'mp4', + 'title': '27.02 Energy Freedom by Energy Club', + 'description': 'md5:1292e6f31df998914c293787a02c3b91', + 'duration': 6420, + 'timestamp': 1739445333, + 'upload_date': '20250213', + 'release_timestamp': 1740657620, + 'release_date': '20250227', + 'live_status': 'was_live', + }, + }] + + def _real_initialize(self): + if not self._get_cookies('https://www.linkedin.com/').get('li_at'): + self.raise_login_required() + + def _real_extract(self, url): + event_id = self._match_id(url) + webpage = self._download_webpage(url, event_id) + + base_data = traverse_obj(webpage, ( + {find_elements(tag='code', attr='style', value='display: none')}, ..., {json.loads}, 'included', ...)) + meta_data = traverse_obj(base_data, ( + lambda _, v: v['$type'] == 'com.linkedin.voyager.dash.events.ProfessionalEvent', any)) or {} + + live_status = { + 'PAST': 'was_live', + 'ONGOING': 'is_live', + 'FUTURE': 'is_upcoming', + }.get(meta_data.get('lifecycleState')) + + if live_status == 'is_upcoming': + player_data = {} + if event_time := traverse_obj(meta_data, ('displayEventTime', {str})): + message = f'This live event is scheduled for {event_time}' + else: + message = 'This live event has not yet started' + self.raise_no_formats(message, expected=True, video_id=event_id) + else: + # TODO: Add support for audio-only live 
events + player_data = traverse_obj(base_data, ( + lambda _, v: v['$type'] == 'com.linkedin.videocontent.VideoPlayMetadata', + any, {require('video player data')})) + + formats, subtitles = [], {} + for prog_fmts in traverse_obj(player_data, ('progressiveStreams', ..., {dict})): + for fmt_url in traverse_obj(prog_fmts, ('streamingLocations', ..., 'url', {url_or_none})): + formats.append({ + 'url': fmt_url, + **traverse_obj(prog_fmts, { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'tbr': ('bitRate', {int_or_none(scale=1000)}), + 'filesize': ('size', {int_or_none}), + 'ext': ('mediaType', {mimetype2ext}), + }), + }) + + for m3u8_url in traverse_obj(player_data, ( + 'adaptiveStreams', lambda _, v: v['protocol'] == 'HLS', 'masterPlaylists', ..., 'url', {url_or_none}, + )): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, event_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': event_id, + 'formats': formats, + 'subtitles': subtitles, + 'live_status': live_status, + **traverse_obj(meta_data, { + 'title': ('name', {str}), + 'description': ('description', 'text', {str}), + 'timestamp': ('createdAt', {int_or_none(scale=1000)}), + # timeRange.start is available when the stream is_upcoming + 'release_timestamp': ('timeRange', 'start', {int_or_none(scale=1000)}), + }), + **traverse_obj(player_data, { + 'duration': ('duration', {int_or_none(scale=1000)}), + # liveStreamCreatedAt is only available when the stream is_live or was_live + 'release_timestamp': ('liveStreamCreatedAt', {int_or_none(scale=1000)}), + }), + } diff --git a/yt_dlp/extractor/loco.py b/yt_dlp/extractor/loco.py new file mode 100644 index 0000000000..6c9a255678 --- /dev/null +++ b/yt_dlp/extractor/loco.py @@ -0,0 +1,159 @@ +import json +import random +import time + +from .common import InfoExtractor +from ..utils import int_or_none, jwt_decode_hs256, try_call, url_or_none +from ..utils.traversal import require, traverse_obj + + +class LocoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?loco\.com/(?Pstreamers|stream)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://loco.com/streamers/teuzinfps', + 'info_dict': { + 'id': 'teuzinfps', + 'ext': 'mp4', + 'title': r're:MS BOLADAO, RESENHA & GAMEPLAY ALTO NIVEL', + 'description': 'bom e novo', + 'uploader_id': 'RLUVE3S9JU', + 'channel': 'teuzinfps', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/743701a9-98ca-41ae-9a8b-70bd5da070ad.jpg', + 'tags': ['MMORPG', 'Gameplay'], + 'series': 'Tibia', + 'timestamp': int, + 'modified_timestamp': int, + 'live_status': 'is_live', + 'upload_date': str, + 'modified_date': str, + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://loco.com/stream/c64916eb-10fb-46a9-9a19-8c4b7ed064e7', + 'md5': '45ebc8a47ee1c2240178757caf8881b5', + 'info_dict': { + 'id': 'c64916eb-10fb-46a9-9a19-8c4b7ed064e7', + 'ext': 'mp4', + 'title': 'PAULINHO LOKO NA LOCO!', + 'description': 'live on na loco', + 'uploader_id': '2MDO7Z1DPM', + 'channel': 'paulinholokobr', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 14491, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/59b5970b-23c1-4518-9e96-17ce341299fe.jpg', + 'tags': ['Gameplay'], + 'series': 'GTA 5', + 
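# NOTE: illustrative sketch, not part of the patch. In LinkedInEventsIE above,
# the event's lifecycleState is translated into yt-dlp's live_status
# vocabulary, and an upcoming event produces no formats (raise_no_formats).
# The helper name below is hypothetical; the state strings come from the patch.
#
#     def live_status_for(lifecycle_state):
#         return {
#             'PAST': 'was_live',
#             'ONGOING': 'is_live',
#             'FUTURE': 'is_upcoming',
#         }.get(lifecycle_state)  # unknown states map to None
#
#     assert live_status_for('ONGOING') == 'is_live'
#     assert live_status_for('CANCELLED') is None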
'timestamp': 1740612872, + 'modified_timestamp': 1740613037, + 'upload_date': '20250226', + 'modified_date': '20250226', + }, + }, { + # Requires video authorization + 'url': 'https://loco.com/stream/ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'md5': '0513edf85c1e65c9521f555f665387d5', + 'info_dict': { + 'id': 'ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'ext': 'mp4', + 'title': 'DUAS CONTAS DESAFIANTE, RUSH TOP 1 NO BRASIL!', + 'description': 'md5:aa77818edd6fe00dd4b6be75cba5f826', + 'uploader_id': '7Y9JNAZC3Q', + 'channel': 'ayellol', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 1229, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/f5aa678b-6d04-45d9-a89a-859af0a8028f.jpg', + 'tags': ['Gameplay', 'Carry'], + 'series': 'League of Legends', + 'timestamp': 1741182253, + 'upload_date': '20250305', + 'modified_timestamp': 1741182419, + 'modified_date': '20250305', + }, + }] + + # From _app.js + _CLIENT_ID = 'TlwKp1zmF6eKFpcisn3FyR18WkhcPkZtzwPVEEC3' + _CLIENT_SECRET = 'Kp7tYlUN7LXvtcSpwYvIitgYcLparbtsQSe5AdyyCdiEJBP53Vt9J8eB4AsLdChIpcO2BM19RA3HsGtqDJFjWmwoonvMSG3ZQmnS8x1YIM8yl82xMXZGbE3NKiqmgBVU' + + def _is_jwt_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 300 + + def _get_access_token(self, video_id): + access_token = try_call(lambda: self._get_cookies('https://loco.com')['access_token'].value) + if access_token and not self._is_jwt_expired(access_token): + return access_token + access_token = traverse_obj(self._download_json( + 'https://api.getloconow.com/v3/user/device_profile/', video_id, + 'Downloading access token', fatal=False, data=json.dumps({ + 'platform': 7, + 'client_id': self._CLIENT_ID, + 'client_secret': self._CLIENT_SECRET, + 'model': 'Mozilla', + 'os_name': 'Win32', + 'os_ver': '5.0 (Windows)', + 'app_ver': '5.0 (Windows)', + }).encode(), headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'DEVICE-ID': ''.join(random.choices('0123456789abcdef', k=32)) + 'live', + 'X-APP-LANG': 'en', + 'X-APP-LOCALE': 'en-US', + 'X-CLIENT-ID': self._CLIENT_ID, + 'X-CLIENT-SECRET': self._CLIENT_SECRET, + 'X-PLATFORM': '7', + }), 'access_token') + if access_token and not self._is_jwt_expired(access_token): + self._set_cookie('.loco.com', 'access_token', access_token) + return access_token + + def _real_extract(self, url): + video_type, video_id = self._match_valid_url(url).group('type', 'id') + webpage = self._download_webpage(url, video_id) + stream = traverse_obj(self._search_nextjs_data(webpage, video_id), ( + 'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) + + if access_token := self._get_access_token(video_id): + self._request_webpage( + 'https://drm.loco.com/v1/streams/playback/', video_id, + 'Downloading video authorization', fatal=False, headers={ + 'authorization': access_token, + }, query={ + 'stream_uid': stream['uid'], + }) + + return { + 'formats': self._extract_m3u8_formats(stream['conf']['hls'], video_id), + 'id': video_id, + 'is_live': video_type == 'streamers', + **traverse_obj(stream, { + 'title': ('title', {str}), + 'series': ('game_name', {str}), + 'uploader_id': ('user_uid', {str}), + 'channel': ('alias', {str}), + 'description': ('description', {str}), + 'concurrent_view_count': ('viewersCurrent', {int_or_none}), + 'view_count': ('total_views', {int_or_none}), + 'thumbnail': ('thumbnail_url_small', {url_or_none}), + 'like_count': ('likes', {int_or_none}), + 
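# NOTE: illustrative sketch, not part of the patch. LocoIE._is_jwt_expired
# above only needs the token's `exp` claim, so it decodes the JWT payload
# (the second dot-separated, base64url-encoded segment) without verifying the
# signature, and treats anything within 300 s of expiry as stale. A
# stdlib-only equivalent of what jwt_decode_hs256 is used for here:
#
#     import base64
#     import json
#     import time
#
#     def is_jwt_expired(token, leeway=300):
#         payload_b64 = token.split('.')[1]
#         payload_b64 += '=' * (-len(payload_b64) % 4)  # restore stripped padding
#         payload = json.loads(base64.urlsafe_b64decode(payload_b64))
#         return payload['exp'] - time.time() < leeway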
'tags': ('tags', ..., {str}), + 'timestamp': ('started_at', {int_or_none(scale=1000)}), + 'modified_timestamp': ('updated_at', {int_or_none(scale=1000)}), + 'comment_count': ('comments_count', {int_or_none}), + 'channel_follower_count': ('followers_count', {int_or_none}), + 'duration': ('duration', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py index 1a0b6da230..caff9125e0 100644 --- a/yt_dlp/extractor/lrt.py +++ b/yt_dlp/extractor/lrt.py @@ -3,7 +3,9 @@ clean_html, merge_dicts, traverse_obj, + unified_timestamp, url_or_none, + urljoin, ) @@ -80,7 +82,7 @@ class LRTVODIE(LRTBaseIE): }] def _real_extract(self, url): - path, video_id = self._match_valid_url(url).groups() + path, video_id = self._match_valid_url(url).group('path', 'id') webpage = self._download_webpage(url, video_id) media_url = self._extract_js_var(webpage, 'main_url', path) @@ -106,3 +108,44 @@ def _real_extract(self, url): } return merge_dicts(clean_info, jw_data, json_ld_data) + + +class LRTRadioIE(LRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/radioteka/irasas/(?P\d+)/(?P[^?#/]+)' + _TESTS = [{ + # m3u8 download + 'url': 'https://www.lrt.lt/radioteka/irasas/2000359728/nemarios-eiles-apie-pragarus-ir-skaistyklas-su-aiste-kiltinaviciute', + 'info_dict': { + 'id': '2000359728', + 'ext': 'm4a', + 'title': 'Nemarios eilės: apie pragarus ir skaistyklas su Aiste Kiltinavičiūte', + 'description': 'md5:5eee9a0e86a55bf547bd67596204625d', + 'timestamp': 1726143120, + 'upload_date': '20240912', + 'tags': 'count:5', + 'thumbnail': r're:https?://.+/.+\.jpe?g', + 'categories': ['Daiktiniai įrodymai'], + }, + }, { + 'url': 'https://www.lrt.lt/radioteka/irasas/2000304654/vakaras-su-knyga-svetlana-aleksijevic-cernobylio-malda-v-dalis?season=%2Fmediateka%2Faudio%2Fvakaras-su-knyga%2F2023', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, path = self._match_valid_url(url).group('id', 'path') + media = self._download_json( + 'https://www.lrt.lt/radioteka/api/media', video_id, + query={'url': f'/mediateka/irasas/{video_id}/{path}'}) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(media['playlist_item']['file'], video_id), + **traverse_obj(media, { + 'title': ('title', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'categories': ('playlist_item', 'category', {str}, filter, all, filter), + 'description': ('content', {clean_html}, {str}), + 'timestamp': ('date', {lambda x: x.replace('.', '/')}, {unified_timestamp}), + 'thumbnail': ('playlist_item', 'image', {urljoin('https://www.lrt.lt')}), + }), + } diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index 8caa8f87fe..1356169bfd 100644 --- a/yt_dlp/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py @@ -1,31 +1,38 @@ -import re - from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, - extract_attributes, int_or_none, - str_to_int, + join_nonempty, + parse_count, + parse_duration, + parse_iso8601, url_or_none, - urlencode_postdata, ) +from ..utils.traversal import traverse_obj class ManyVidsIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' _TESTS = [{ # preview video - 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', - 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'url': 'https://www.manyvids.com/Video/530341/mv-tips-tricks', + 'md5': '738dc723f7735ee9602f7ea352a6d058', 'info_dict': { - 'id': '133957', + 'id': '530341-preview', 'ext': 'mp4', - 'title': 'everthing 
about me (Preview)', - 'uploader': 'ellyxxix', + 'title': 'MV Tips & Tricks (Preview)', + 'description': r're:I will take you on a tour around .{1313}$', + 'thumbnail': r're:https://cdn5\.manyvids\.com/php_uploads/video_images/DestinyDiaz/.+\.jpg', + 'uploader': 'DestinyDiaz', 'view_count': int, 'like_count': int, + 'release_timestamp': 1508419904, + 'tags': ['AdultSchool', 'BBW', 'SFW', 'TeacherFetish'], + 'release_date': '20171019', + 'duration': 3167.0, }, + 'expected_warnings': ['Only extracting preview'], }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', @@ -34,129 +41,68 @@ class ManyVidsIE(InfoExtractor): 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', - 'description': 'md5:ec5901d41808b3746fed90face161612', + 'description': r're:Today is the day!! I am finally taking off my mask .{445}$', + 'thumbnail': r're:https://ods\.manyvids\.com/1001061960/3aa5397f2a723ec4597e344df66ab845/screenshots/.+\.jpg', 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, + 'release_date': '20181110', + 'tags': ['EyeContact', 'Interviews', 'MaskFetish', 'MouthFetish', 'Redhead'], + 'release_timestamp': 1541851200, + 'duration': 224.0, }, }] + _API_BASE = 'https://www.manyvids.com/bff/store/video' def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_json(f'{self._API_BASE}/{video_id}/private', video_id)['data'] + formats, preview_only = [], True - real_url = f'https://www.manyvids.com/video/{video_id}/gtm.js' - try: - webpage = self._download_webpage(real_url, video_id) - except Exception: - # probably useless fallback - webpage = self._download_webpage(url, video_id) - - info = self._search_regex( - r'''(]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', - webpage, 'meta details', default='') - info = extract_attributes(info) - - player = self._search_regex( - r'''(]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', - webpage, 'player details', default='') - player = extract_attributes(player) - - video_urls_and_ids = ( - (info.get('data-meta-video'), 'video'), - (player.get('data-video-transcoded'), 'transcoded'), - (player.get('data-video-filepath'), 'filepath'), - (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), - ) - - def txt_or_none(s, default=None): - return (s.strip() or default) if isinstance(s, str) else default - - uploader = txt_or_none(info.get('data-meta-author')) - - def mung_title(s): - if uploader: - s = re.sub(rf'^\s*{re.escape(uploader)}\s+[|-]', '', s) - return txt_or_none(s) - - title = ( - mung_title(info.get('data-meta-title')) - or self._html_search_regex( - (r']+class=["\']item-title[^>]+>([^<]+)', - r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) - or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True)) - - title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title - - if any(p in webpage for p in ('preview_videos', '_preview.mp4')): - title += ' (Preview)' - - mv_token = self._search_regex( - r'data-mvtoken=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'mv token', default=None, group='value') - - if mv_token: - # Sets some cookies - self._download_webpage( - 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, note='Setting format cookies', fatal=False, - data=urlencode_postdata({ - 'mvtoken': mv_token, - 'vid': video_id, - }), headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - formats = [] - for v_url, fmt in video_urls_and_ids: - v_url = 
url_or_none(v_url) - if not v_url: + for format_id, path in [ + ('preview', ['teaser', 'filepath']), + ('transcoded', ['transcodedFilepath']), + ('filepath', ['filepath']), + ]: + format_url = traverse_obj(video_data, (*path, {url_or_none})) + if not format_url: continue - if determine_ext(v_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id=format_id)) else: formats.append({ - 'url': v_url, - 'format_id': fmt, + 'url': format_url, + 'format_id': format_id, + 'preference': -10 if format_id == 'preview' else None, + 'quality': 10 if format_id == 'filepath' else None, + 'height': int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', format_url, 'height', default=None)), }) + if format_id != 'preview': + preview_only = False - self._remove_duplicate_formats(formats) + metadata = traverse_obj( + self._download_json(f'{self._API_BASE}/{video_id}', video_id, fatal=False), 'data') + title = traverse_obj(metadata, ('title', {clean_html})) - for f in formats: - if f.get('height') is None: - f['height'] = int_or_none( - self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) - if '/preview/' in f['url']: - f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) - f['preference'] = -10 - if 'transcoded' in f['format_id']: - f['preference'] = f.get('preference', -1) - 1 - - def get_likes(): - likes = self._search_regex( - rf'''(]*\bdata-id\s*=\s*(['"]){video_id}\2[^>]*>)''', - webpage, 'likes', default='') - likes = extract_attributes(likes) - return int_or_none(likes.get('data-likes')) - - def get_views(): - return str_to_int(self._html_search_regex( - r'''(?s)]*\bclass\s*=["']views-wrapper\b[^>]+>.+?]+>\s*(\d[\d,.]*)\s*''', - webpage, 'view count', default=None)) + if preview_only: + title = join_nonempty(title, '(Preview)', delim=' ') + video_id += '-preview' + self.report_warning( + f'Only extracting preview. Video may be paid or subscription only. 
{self._login_hint()}') return { 'id': video_id, 'title': title, 'formats': formats, - 'description': txt_or_none(info.get('data-meta-description')), - 'uploader': txt_or_none(info.get('data-meta-author')), - 'thumbnail': ( - url_or_none(info.get('data-meta-image')) - or url_or_none(player.get('data-video-screenshot'))), - 'view_count': get_views(), - 'like_count': get_likes(), + **traverse_obj(metadata, { + 'description': ('description', {clean_html}), + 'uploader': ('model', 'displayName', {clean_html}), + 'thumbnail': (('screenshot', 'thumbnail'), {url_or_none}, any), + 'view_count': ('views', {parse_count}), + 'like_count': ('likes', {parse_count}), + 'release_timestamp': ('launchDate', {parse_iso8601}), + 'duration': ('videoDuration', {parse_duration}), + 'tags': ('tagList', ..., 'label', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py index d64dbfe638..94c51ed0e7 100644 --- a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -102,11 +102,10 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None): item_id = item_id or '%dp' % height if item_id not in item_url: return - width = int(round(aspect_ratio * height)) container.append({ 'url': item_url, id_key: item_id, - 'width': width, + 'width': round(aspect_ratio * height), 'height': height, }) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 2575d6c5e4..9d58fa0a60 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -4,6 +4,7 @@ from ..utils import ( int_or_none, parse_iso8601, + parse_resolution, traverse_obj, unified_timestamp, url_basename, @@ -83,8 +84,8 @@ def _sub_to_dict(subtitle_list): subtitles.setdefault(sub.pop('tag', 'und'), []).append(sub) return subtitles - def _extract_ism(self, ism_url, video_id): - formats = self._extract_ism_formats(ism_url, video_id) + def _extract_ism(self, ism_url, video_id, fatal=True): + formats = self._extract_ism_formats(ism_url, video_id, fatal=fatal) for fmt in formats: if fmt['language'] != 'eng' and 'English' not in fmt['format_id']: fmt['language_preference'] = -10 @@ -218,9 +219,21 @@ class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', 'timestamp': 1676339547, 'upload_date': '20230214', - 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.+\.png', 'subtitles': 'count:14', }, + }, { + 'url': 'https://learn.microsoft.com/en-gb/shows/on-demand-instructor-led-training-series/az-900-module-1', + 'info_dict': { + 'id': '4fe10f7c-d83c-463b-ac0e-c30a8195e01b', + 'ext': 'mp4', + 'title': 'AZ-900 Cloud fundamentals (1 of 6)', + 'description': 'md5:3c2212ce865e9142f402c766441bd5c9', + 'thumbnail': r're:https://.+/.+\.jpg', + 'timestamp': 1706605184, + 'upload_date': '20240130', + }, + 'params': {'format': 'bv[protocol=https]'}, }] def _real_extract(self, url): @@ -230,9 +243,32 @@ def _real_extract(self, url): entry_id = self._html_search_meta('entryId', webpage, 'entryId', fatal=True) video_info = self._download_json( f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + + formats = [] + if ism_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoUrl', {url_or_none})): + formats.extend(self._extract_ism(ism_url, video_id, fatal=False)) + if hls_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoHLSUrl', {url_or_none})): + 
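# NOTE: illustrative sketch, not part of the patch. The MicrosoftLearnEpisodeIE
# hunk here treats every manifest/progressive URL as optional: each source is
# guarded by an assignment expression and contributes formats only when the
# API actually returned it. The dict keys mirror the patch; collect_formats
# itself is hypothetical.
#
#     def collect_formats(public_video):
#         formats = []
#         if ism_url := public_video.get('adaptiveVideoUrl'):
#             formats.append({'url': ism_url, 'format_id': 'ism'})
#         if hls_url := public_video.get('adaptiveVideoHLSUrl'):
#             formats.append({'url': hls_url, 'format_id': 'hls'})
#         for key in ('low', 'medium', 'high'):
#             if video_url := public_video.get(f'{key}QualityVideoUrl'):
#                 formats.append({'url': video_url, 'format_id': f'video-http-{key}'})
#         if audio_url := public_video.get('audioUrl'):
#             formats.append({'url': audio_url, 'format_id': 'audio-http', 'vcodec': 'none'})
#         return formats  # may be empty if the entry has no public sources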
formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + if mpd_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoDashUrl', {url_or_none})): + formats.extend(self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)) + for key in ('low', 'medium', 'high'): + if video_url := traverse_obj(video_info, ('publicVideo', f'{key}QualityVideoUrl', {url_or_none})): + formats.append({ + 'url': video_url, + 'format_id': f'video-http-{key}', + 'acodec': 'none', + **parse_resolution(video_url), + }) + if audio_url := traverse_obj(video_info, ('publicVideo', 'audioUrl', {url_or_none})): + formats.append({ + 'url': audio_url, + 'format_id': 'audio-http', + 'vcodec': 'none', + }) + return { 'id': entry_id, - 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'formats': formats, 'subtitles': self._sub_to_dict(traverse_obj(video_info, ( 'publicVideo', 'captions', lambda _, v: url_or_none(v['url']), { 'tag': ('language', {str}), diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py index 76fef337a2..55fa83b51f 100644 --- a/yt_dlp/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py @@ -1,5 +1,7 @@ from .telecinco import TelecincoBaseIE +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, int_or_none, parse_iso8601, ) @@ -79,7 +81,17 @@ class MiTeleIE(TelecincoBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + + try: # yt-dlp's default user-agents are too old and blocked by akamai + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + # Retry with impersonation if hardcoded UA is insufficient to bypass akamai + webpage = self._download_webpage(url, display_id, impersonate=True) + pre_player = self._search_json( r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=', webpage, 'Pre Player', display_id)['prePlayer'] diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index 19b7fd4e70..852670fcba 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -10,7 +10,9 @@ parse_iso8601, strip_or_none, try_get, + url_or_none, ) +from ..utils.traversal import traverse_obj class MixcloudBaseIE(InfoExtractor): @@ -37,7 +39,7 @@ class MixcloudIE(MixcloudBaseIE): 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', - 'uploader': 'Daniel Holbach', + 'uploader': 'dholbach', 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, @@ -46,10 +48,11 @@ class MixcloudIE(MixcloudBaseIE): 'uploader_url': 'https://www.mixcloud.com/dholbach/', 'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills', 'duration': 3723, - 'tags': [], + 'tags': ['liquid drum and bass', 'drum and bass'], 'comment_count': int, 'repost_count': int, 'like_count': int, + 'artists': list, }, 'params': {'skip_download': 'm3u8'}, }, { @@ -67,7 +70,7 @@ class MixcloudIE(MixcloudBaseIE): 'upload_date': '20150203', 'uploader_url': 'https://www.mixcloud.com/gillespeterson/', 'duration': 2992, - 'tags': [], + 'tags': ['jazz', 'soul', 'world music', 'funk'], 'comment_count': int, 'repost_count': int, 'like_count': int, 
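# NOTE: illustrative sketch, not part of the patch. The MiTeleIE change above
# first retries with a pinned modern User-Agent and only falls back to
# impersonation when akamai answers 403. A stdlib-only reduction of that
# "retry differently on 403" shape (the second UA stands in for real
# impersonation, and the URL would be the page being fetched):
#
#     import urllib.error
#     import urllib.request
#
#     def fetch(url):
#         uas = ('Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0',
#                'curl/8.7.1')  # hypothetical fallback identity
#         for i, ua in enumerate(uas):
#             req = urllib.request.Request(url, headers={'User-Agent': ua})
#             try:
#                 with urllib.request.urlopen(req) as resp:
#                     return resp.read()
#             except urllib.error.HTTPError as e:
#                 if e.code != 403 or i == len(uas) - 1:
#                     raise  # only a 403 justifies another attempt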
@@ -149,8 +152,6 @@ def _real_extract(self, url): elif reason: raise ExtractorError('Track is restricted', expected=True) - title = cloudcast['name'] - stream_info = cloudcast['streamInfo'] formats = [] @@ -182,47 +183,39 @@ def _real_extract(self, url): self.raise_login_required(metadata_available=True) comments = [] - for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): - node = edge.get('node') or {} + for node in traverse_obj(cloudcast, ('comments', 'edges', ..., 'node', {dict})): text = strip_or_none(node.get('comment')) if not text: continue - user = node.get('user') or {} comments.append({ - 'author': user.get('displayName'), - 'author_id': user.get('username'), 'text': text, - 'timestamp': parse_iso8601(node.get('created')), + **traverse_obj(node, { + 'author': ('user', 'displayName', {str}), + 'author_id': ('user', 'username', {str}), + 'timestamp': ('created', {parse_iso8601}), + }), }) - tags = [] - for t in cloudcast.get('tags'): - tag = try_get(t, lambda x: x['tag']['name'], str) - if not tag: - tags.append(tag) - - get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount'])) - - owner = cloudcast.get('owner') or {} - return { 'id': track_id, - 'title': title, 'formats': formats, - 'description': cloudcast.get('description'), - 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], str), - 'uploader': owner.get('displayName'), - 'timestamp': parse_iso8601(cloudcast.get('publishDate')), - 'uploader_id': owner.get('username'), - 'uploader_url': owner.get('url'), - 'duration': int_or_none(cloudcast.get('audioLength')), - 'view_count': int_or_none(cloudcast.get('plays')), - 'like_count': get_count('favorites'), - 'repost_count': get_count('reposts'), - 'comment_count': get_count('comments'), 'comments': comments, - 'tags': tags, - 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, + **traverse_obj(cloudcast, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('picture', 'url', {url_or_none}), + 'timestamp': ('publishDate', {parse_iso8601}), + 'duration': ('audioLength', {int_or_none}), + 'uploader': ('owner', 'displayName', {str}), + 'uploader_id': ('owner', 'username', {str}), + 'uploader_url': ('owner', 'url', {url_or_none}), + 'view_count': ('plays', {int_or_none}), + 'like_count': ('favorites', 'totalCount', {int_or_none}), + 'repost_count': ('reposts', 'totalCount', {int_or_none}), + 'comment_count': ('comments', 'totalCount', {int_or_none}), + 'tags': ('tags', ..., 'tag', 'name', {str}, filter, all, filter), + 'artists': ('featuringArtistList', ..., {str}, filter, all, filter), + }), } @@ -295,7 +288,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -303,7 +296,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -311,7 +304,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { 'id': 'dholbach_favorites', - 'title': 'Daniel Holbach (favorites)', + 'title': 'dholbach (favorites)', 'description': 
'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, # 'params': { @@ -337,7 +330,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'title': 'First Ear (stream)', 'description': 'we maraud for ears', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] _TITLE_KEY = 'displayName' @@ -361,7 +354,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): 'id': 'maxvibes_jazzcat-on-ness-radio', 'title': 'Ness Radio sessions', }, - 'playlist_mincount': 59, + 'playlist_mincount': 58, }] _TITLE_KEY = 'name' _DESCRIPTION_KEY = 'description' diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 935bf85615..562b93fc78 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -365,13 +365,15 @@ def _real_initialize(self): 'All videos are only available to registered users', method='password') def _set_device_id(self, username): - if not self._device_id: - self._device_id = self.cache.load( - self._NETRC_MACHINE, 'device_ids', default={}).get(username) + if self._device_id: + return + device_id_cache = self.cache.load(self._NETRC_MACHINE, 'device_ids', default={}) + self._device_id = device_id_cache.get(username) if self._device_id: return self._device_id = str(uuid.uuid4()) - self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id}) + device_id_cache[username] = self._device_id + self.cache.store(self._NETRC_MACHINE, 'device_ids', device_id_cache) def _perform_login(self, username, password): try: @@ -449,9 +451,7 @@ def _extract_formats_and_subtitles(self, broadcast, video_id): if not (m3u8_url and token): errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str}))) - if 'not entitled' in errors: - raise ExtractorError(errors, expected=True) - elif errors: # Only warn when 'blacked out' since radio formats are available + if errors: # Only warn when 'blacked out' or 'not entitled'; radio formats may be available self.report_warning(f'API returned errors for {format_id}: {errors}') else: self.report_warning(f'No formats available for {format_id} broadcast; skipping') diff --git a/yt_dlp/extractor/moviepilot.py b/yt_dlp/extractor/moviepilot.py index ed5be4fa65..3b20232263 100644 --- a/yt_dlp/extractor/moviepilot.py +++ b/yt_dlp/extractor/moviepilot.py @@ -3,8 +3,8 @@ class MoviepilotIE(InfoExtractor): - _IE_NAME = 'moviepilot' - _IE_DESC = 'Moviepilot trailer' + IE_NAME = 'moviepilot' + IE_DESC = 'Moviepilot trailer' _VALID_URL = r'https?://(?:www\.)?moviepilot\.de/movies/(?P[^/]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/nba.py b/yt_dlp/extractor/nba.py index 91ae1d14c6..c87046d8ef 100644 --- a/yt_dlp/extractor/nba.py +++ b/yt_dlp/extractor/nba.py @@ -19,7 +19,8 @@ class NBACVPBaseIE(TurnerBaseIE): def _extract_nba_cvp_info(self, path, video_id, fatal=False): return self._extract_cvp_info( - f'http://secure.nba.com/{path}', video_id, { + # XXX: The 3rd argument (None) needs to be the AdobePass software_statement + f'http://secure.nba.com/{path}', video_id, None, { 'default': { 'media_src': 'http://nba.cdn.turner.com/nba/big', }, @@ -94,6 +95,7 @@ def _extract_video(self, filter_key, filter_value): class NBAWatchEmbedIE(NBAWatchBaseIE): + _WORKING = False IE_NAME = 'nba:watch:embed' _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P\d+)' _TESTS = [{ @@ -115,6 +117,7 @@ def _real_extract(self, url): class NBAWatchIE(NBAWatchBaseIE): + _WORKING = False IE_NAME = 'nba:watch' _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ @@ -167,6 +170,7 @@ def 
_real_extract(self, url): class NBAWatchCollectionIE(NBAWatchBaseIE): + _WORKING = False IE_NAME = 'nba:watch:collection' _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P[^/?#&]+)' _TESTS = [{ @@ -336,6 +340,7 @@ def _real_extract(self, url): class NBAEmbedIE(NBABaseIE): + _WORKING = False IE_NAME = 'nba:embed' _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P[^?#&]+)' _TESTS = [{ @@ -358,6 +363,7 @@ def _real_extract(self, url): class NBAIE(NBABaseIE): + _WORKING = False IE_NAME = 'nba' _VALID_URL = NBABaseIE._VALID_URL_BASE + f'(?!{NBABaseIE._CHANNEL_PATH_REGEX})video/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ @@ -385,6 +391,7 @@ def _extract_url_results(self, team, content_id): class NBAChannelIE(NBABaseIE): + _WORKING = False IE_NAME = 'nba:channel' _VALID_URL = NBABaseIE._VALID_URL_BASE + f'(?:{NBABaseIE._CHANNEL_PATH_REGEX})/(?P[^/?#&]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index d9aded09ea..bd4862bde0 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -6,7 +6,7 @@ from .adobepass import AdobePassIE from .common import InfoExtractor -from .theplatform import ThePlatformIE, default_ns +from .theplatform import ThePlatformBaseIE, ThePlatformIE, default_ns from ..networking import HEADRequest from ..utils import ( ExtractorError, @@ -14,26 +14,130 @@ UserNotLive, clean_html, determine_ext, + extract_attributes, float_or_none, + get_element_html_by_class, int_or_none, join_nonempty, + make_archive_id, mimetype2ext, parse_age_limit, parse_duration, + parse_iso8601, remove_end, - smuggle_url, - traverse_obj, try_get, unescapeHTML, unified_timestamp, update_url_query, url_basename, + url_or_none, ) +from ..utils.traversal import require, traverse_obj -class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P(?:NBCE|n)?\d+))' +class NBCUniversalBaseIE(ThePlatformBaseIE): + _GEO_COUNTRIES = ['US'] + _GEO_BYPASS = False + _M3U8_RE = r'https?://[^/?#]+/prod/[\w-]+/(?P[^?#]+/)cmaf/mpeg_(?:cbcs|cenc)\w*/master_cmaf\w*\.m3u8' + def _download_nbcu_smil_and_extract_m3u8_url(self, tp_path, video_id, query): + smil = self._download_xml( + f'https://link.theplatform.com/s/{tp_path}', video_id, + 'Downloading SMIL manifest', 'Failed to download SMIL manifest', query={ + **query, + 'format': 'SMIL', # XXX: Do not confuse "format" with "formats" + 'manifest': 'm3u', + 'switch': 'HLSServiceSecure', # Or else we get broken mp4 http URLs instead of HLS + }, headers=self.geo_verification_headers()) + + ns = f'//{{{default_ns}}}' + if url := traverse_obj(smil, (f'{ns}video/@src', lambda _, v: determine_ext(v) == 'm3u8', any)): + return url + + exc = traverse_obj(smil, (f'{ns}param', lambda _, v: v.get('name') == 'exception', '@value', any)) + if exc == 'GeoLocationBlocked': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(traverse_obj(smil, (f'{ns}ref/@abstract', ..., any)), expected=exc == 'Expired') + + def _extract_nbcu_formats_and_subtitles(self, tp_path, video_id, query): + # formats='mpeg4' will return either a working m3u8 URL or an m3u8 template for non-DRM HLS + # formats='m3u+none,mpeg4' may return DRM HLS but w/the "folders" needed for non-DRM template + query['formats'] = 'm3u+none,mpeg4' + m3u8_url = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) + + if mobj := re.fullmatch(self._M3U8_RE, m3u8_url): + query['formats'] = 
'mpeg4' + m3u8_tmpl = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) + # Example: https://vod-lf-oneapp-prd.akamaized.net/prod/video/{folders}master_hls.m3u8 + if '{folders}' in m3u8_tmpl: + self.write_debug('Found m3u8 URL template, formatting URL path') + m3u8_url = m3u8_tmpl.format(folders=mobj.group('folders')) + + if '/mpeg_cenc' in m3u8_url or '/mpeg_cbcs' in m3u8_url: + self.report_drm(video_id) + + return self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + def _extract_nbcu_video(self, url, display_id, old_ie_key=None): + webpage = self._download_webpage(url, display_id) + settings = self._search_json( + r']+data-drupal-selector="drupal-settings-json"[^>]*>', + webpage, 'settings', display_id) + + query = {} + tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') + if tve: + account_pid = tve.get('data-mpx-media-account-pid') or tve['data-mpx-account-pid'] + account_id = tve['data-mpx-media-account-id'] + metadata = self._parse_json( + tve.get('data-normalized-video') or '', display_id, fatal=False, transform_source=unescapeHTML) + video_id = tve.get('data-guid') or metadata['guid'] + if tve.get('data-entitlement') == 'auth': + auth = settings['tve_adobe_auth'] + release_pid = tve['data-release-pid'] + resource = self._get_mvpd_resource( + tve.get('data-adobe-pass-resource-id') or auth['adobePassResourceId'], + tve['data-title'], release_pid, tve.get('data-rating')) + query['auth'] = self._extract_mvpd_auth( + url, release_pid, auth['adobePassRequestorId'], + resource, auth['adobePassSoftwareStatement']) + else: + ls_playlist = traverse_obj(settings, ( + 'ls_playlist', lambda _, v: v['defaultGuid'], any, {require('LS playlist')})) + video_id = ls_playlist['defaultGuid'] + account_pid = ls_playlist.get('mpxMediaAccountPid') or ls_playlist['mpxAccountPid'] + account_id = ls_playlist['mpxMediaAccountId'] + metadata = traverse_obj(ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, any)) or {} + + tp_path = f'{account_pid}/media/guid/{account_id}/{video_id}' + formats, subtitles = self._extract_nbcu_formats_and_subtitles(tp_path, video_id, query) + tp_metadata = self._download_theplatform_metadata(tp_path, video_id, fatal=False) + parsed_info = self._parse_theplatform_metadata(tp_metadata) + self._merge_subtitles(parsed_info['subtitles'], target=subtitles) + + return { + **parsed_info, + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('durationInSeconds', {int_or_none}), + 'timestamp': ('airDate', {parse_iso8601}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode': ('episodeTitle', {str}), + 'series': ('show', {str}), + }), + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id(old_ie_key, video_id)] if old_ie_key else None, + } + + +class NBCIE(NBCUniversalBaseIE): + _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/?#]+/video/[^/?#]+/(?P\w+))' _TESTS = [ { 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237', @@ -49,47 +153,20 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'episode_number': 86, 'season': 'Season 2', 'season_number': 2, - 'series': 'Tonight Show: Jimmy Fallon', - 'duration': 237.0, - 'chapters': 'count:1', - 'tags': 'count:4', + 
'series': 'Tonight', + 'duration': 236.504, + 'tags': 'count:2', 'thumbnail': r're:https?://.+\.jpg', 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'], 'media_type': 'Full Episode', + 'age_limit': 14, + '_old_archive_ids': ['theplatform 2848237'], }, 'params': { 'skip_download': 'm3u8', }, }, { - 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', - 'info_dict': { - 'id': '2832821', - 'ext': 'mp4', - 'title': 'Star Wars Teaser', - 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', - 'timestamp': 1417852800, - 'upload_date': '20141206', - 'uploader': 'NBCU-COM', - }, - 'skip': 'page not found', - }, - { - # HLS streams requires the 'hdnea3' cookie - 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', - 'info_dict': { - 'id': '101528f5a9e8127b107e98c5e6ce4638', - 'ext': 'mp4', - 'title': 'Goliath', - 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.', - 'timestamp': 1237100400, - 'upload_date': '20090315', - 'uploader': 'NBCU-COM', - }, - 'skip': 'page not found', - }, - { - # manifest url does not have extension 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', 'info_dict': { 'id': '3646439', @@ -99,48 +176,47 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'episode_number': 1, 'season': 'Season 75', 'season_number': 75, - 'series': 'The Golden Globe Awards', + 'series': 'Golden Globes', 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', 'uploader': 'NBCU-COM', 'upload_date': '20180107', 'timestamp': 1515312000, - 'duration': 570.0, + 'duration': 569.703, 'tags': 'count:8', 'thumbnail': r're:https?://.+\.jpg', - 'chapters': 'count:1', + 'media_type': 'Highlight', + 'age_limit': 0, + 'categories': ['Series/The Golden Globe Awards'], + '_old_archive_ids': ['theplatform 3646439'], }, 'params': { 'skip_download': 'm3u8', }, }, { - # new video_id format - 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + # Needs to be extracted from webpage instead of GraphQL + 'url': 'https://www.nbc.com/paris2024/video/ali-truwit-found-purpose-pool-after-her-life-changed/para24_sww_alitruwittodayshow_240823', 'info_dict': { - 'id': 'NBCE125189978', + 'id': 'para24_sww_alitruwittodayshow_240823', 'ext': 'mp4', - 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', - 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', - 'uploader': 'NBCU-COM', - 'series': 'Quantum Leap', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', - 'episode_number': 1, - 'duration': 170.171, - 'chapters': [], - 'timestamp': 1663956155, - 'upload_date': '20220923', - 'tags': 'count:10', - 'age_limit': 0, + 'title': 'Ali Truwit found purpose in the pool after her life changed', + 'description': 'md5:c16d7489e1516593de1cc5d3f39b9bdb', + 'uploader': 'NBCU-SPORTS', + 'duration': 311.077, 'thumbnail': r're:https?://.+\.jpg', - 'categories': ['Series/Quantum Leap 2022'], - 'media_type': 'Highlight', + 'episode': 'Ali Truwit found purpose in the pool after her life changed', + 'timestamp': 1724435902.0, + 'upload_date': '20240823', + '_old_archive_ids': ['theplatform para24_sww_alitruwittodayshow_240823'], }, 'params': { 'skip_download': 'm3u8', }, }, + { + 'url': 
'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'only_matching': True, + }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', 'only_matching': True, @@ -151,6 +227,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'only_matching': True, }, ] + _SOFTWARE_STATEMENT = 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI1Yzg2YjdkYy04NDI3LTRjNDUtOGQwZi1iNDkzYmE3MmQwYjQiLCJuYmYiOjE1Nzg3MDM2MzEsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTc4NzAzNjMxfQ.QQKIsBhAjGQTMdAqRTqhcz2Cddr4Y2hEjnSiOeKKki4nLrkDOsjQMmqeTR0hSRarraxH54wBgLvsxI7LHwKMvr7G8QpynNAxylHlQD3yhN9tFhxt4KR5wW3as02B-W2TznK9bhNWPKIyHND95Uo2Mi6rEQoq8tM9O09WPWaanE5BX_-r6Llr6dPq5F0Lpx2QOn2xYRb1T4nFxdFTNoss8GBds8OvChTiKpXMLHegLTc1OS4H_1a8tO_37jDwSdJuZ8iTyRLV4kZ2cpL6OL5JPMObD4-HQiec_dfcYgMKPiIfP9ZqdXpec2SVaCLsWEk86ZYvD97hLIQrK5rrKd1y-A' def _real_extract(self, url): permalink, video_id = self._match_valid_url(url).groups() @@ -196,62 +273,50 @@ def _real_extract(self, url): 'userId': '0', }), })['data']['bonanzaPage']['metadata'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - 'switch': 'HLSServiceSecure', - } + + if not video_data: + # Some videos are not available via GraphQL API + webpage = self._download_webpage(url, video_id) + video_data = self._search_json( + r'', webpage), + (..., {json.loads}, ..., {self._find_json}, + lambda _, v: v['payload'][video_type]['slug'] == display_id, + 'payload', any, {require('video data')})) - if not self.get_param('allow_unplayable_formats') and try_get(common_data, lambda x: x['episode']['video']['drm'], bool): + if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})): self.report_drm(display_id) - brightcove_id = try_get( - common_data, lambda x: x['episode']['video']['brightcoveId'], str) or 'ref:{}'.format(common_data['episode']['video']['referenceId']) - video_id = str_or_none(try_get(common_data, lambda x: x['episode']['video']['id'])) or brightcove_id - - title = try_get(common_data, lambda x: x['episode']['name'], str) - season_number = try_get(common_data, lambda x: x['season']['seasonNumber'], int) - episode_number = try_get(common_data, lambda x: x['episode']['episodeNumber'], int) - timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], str)) - release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], str)) - thumbnails_data = try_get(common_data, lambda x: x['episode']['image']['sizes'], dict) or {} - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_id[1:]), - } for thumbnail_id, thumbnail_url in thumbnails_data.items()] + brightcove_id = traverse_obj(common_data, ( + video_type, 'video', ( + ('brightcoveId', {str}), + ('referenceId', {str}, {lambda x: f'ref:{x}' if x else None}), + ), any, {require('brightcove ID')})) return { '_type': 'url_transparent', - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': self._GEO_COUNTRIES}), - 'id': video_id, - 'title': title, - 'description': try_get(common_data, lambda x: x['episode']['description'], str), - 'duration': float_or_none(try_get(common_data, lambda x: x['episode']['video']['duration'], float), 1000), - 'thumbnails': thumbnails, - 'ie_key': 'BrightcoveNew', - 'season_number': season_number, - 'episode_number': episode_number, - 'timestamp': timestamp, - 'release_date': release_date, + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': self.BRIGHTCOVE_URL_TEMPLATE.format(brightcove_id), + 
**traverse_obj(common_data, { + 'id': (video_type, 'video', 'id', {int}, ({str_or_none}, {value(brightcove_id)}), any), + 'title': (video_type, 'name', {str}), + 'description': (video_type, 'description', {str}), + 'duration': (video_type, 'video', 'duration', {float_or_none(scale=1000)}), + 'tags': (video_type, 'tags', ..., 'name', {str}, all, filter), + 'series': ('tvSeries', 'name', {str}), + 'season_number': ('season', 'seasonNumber', {int_or_none}), + 'episode_number': ('episode', 'episodeNumber', {int_or_none}), + 'timestamp': ('episode', 'airDate', {parse_iso8601}), + 'release_timestamp': (video_type, 'availability', {parse_iso8601}), + 'thumbnails': (video_type, 'image', 'sizes', {dict.items}, lambda _, v: url_or_none(v[1]), { + 'id': 0, + 'url': 1, + 'width': (1, {parse_resolution}, 'width'), + }), + }), } diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index a97add71a4..3472a21597 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 119.0, }, + 'skip': 'HTTP Error 500: Internal Server Error', }, { # article with audio and no video 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', @@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE): 'ext': 'mp3', 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', - 'timestamp': 1695960700, + 'timestamp': 1696008129, 'upload_date': '20230929', - 'creator': 'Stephanie Nolen, Natalija Gormalova', + 'creators': ['Stephanie Nolen', 'Natalija Gormalova'], 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 1322, }, }, { + # lede_media_block already has sourceId 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { @@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'timestamp': 1701290997, 'upload_date': '20231129', 'uploader': 'By The New York Times', - 'creator': 'Katie Rogers', + 'creators': ['Katie Rogers'], 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 97.631, }, @@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE): 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', 'upload_date': '20231202', - 'creator': 'Emily Steel, Sydney Ember', + 'creators': ['Emily Steel', 'Sydney Ember'], 'timestamp': 1701511264, }, 'playlist_count': 3, + }, { + # lede_media_block does not have sourceId + 'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html', + 'info_dict': { + 'id': 'hip-mobility-routine', + 'title': 'Tight Hips? These Moves Can Help.', + 'description': 'Sitting all day is hard on your hips. 
Try this simple routine for better mobility.', + 'creators': ['Alyssa Ages', 'Theodore Tae'], + 'timestamp': 1746003629, + 'upload_date': '20250430', + }, + 'playlist_count': 7, }, { 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'only_matching': True, @@ -256,14 +270,18 @@ def _extract_content_from_block(self, block): def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + webpage = self._download_webpage(url, page_id, impersonate=True) art_json = self._search_json( r'window\.__preloadedData\s*=', webpage, 'media details', page_id, transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article'] + content = art_json['sprinkledBody']['content'] - blocks = traverse_obj(art_json, ( - 'sprinkledBody', 'content', ..., ('ledeMedia', None), - lambda _, v: v['__typename'] in ('Video', 'Audio'))) + blocks = [] + block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio') + if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)): + lede_media_block.setdefault('sourceId', art_json.get('sourceId')) + blocks.append(lede_media_block) + blocks.extend(traverse_obj(content, (..., block_filter))) if not blocks: raise ExtractorError('Unable to extract any media blocks from webpage') @@ -273,8 +291,7 @@ def _real_extract(self, url): 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}), get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage), 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})), - 'creator': ', '.join( - traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list) + 'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})), 'thumbnails': self._extract_thumbnails(traverse_obj( art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))), } diff --git a/yt_dlp/extractor/on24.py b/yt_dlp/extractor/on24.py index 05218e9de1..1dfc25a7da 100644 --- a/yt_dlp/extractor/on24.py +++ b/yt_dlp/extractor/on24.py @@ -11,12 +11,15 @@ class On24IE(InfoExtractor): IE_NAME = 'on24' IE_DESC = 'ON24' - _VALID_URL = r'''(?x) - https?://event\.on24\.com/(?: - wcc/r/(?P\d{7})/(?P[0-9A-F]{32})| - eventRegistration/(?:console/EventConsoleApollo|EventLobbyServlet\?target=lobby30) - \.jsp\?(?:[^/#?]*&)?eventid=(?P\d{7})[^/#?]*&key=(?P[0-9A-F]{32}) - )''' + _ID_RE = r'(?P\d{7})' + _KEY_RE = r'(?P[0-9A-F]{32})' + _URL_BASE_RE = r'https?://event\.on24\.com' + _URL_QUERY_RE = rf'(?:[^#]*&)?eventid={_ID_RE}&(?:[^#]+&)?key={_KEY_RE}' + _VALID_URL = [ + rf'{_URL_BASE_RE}/wcc/r/{_ID_RE}/{_KEY_RE}', + rf'{_URL_BASE_RE}/eventRegistration/console/(?:EventConsoleApollo\.jsp|apollox/mainEvent/?)\?{_URL_QUERY_RE}', + rf'{_URL_BASE_RE}/eventRegistration/EventLobbyServlet/?\?{_URL_QUERY_RE}', + ] _TESTS = [{ 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?uimode=nextgeneration&eventid=2197467&sessionid=1&key=5DF57BE53237F36A43B478DD36277A84&contenttype=A&eventuserid=305999&playerwidth=1000&playerheight=650&caller=previewLobby&text_language_id=en&format=fhaudio&newConsole=false', @@ -34,12 +37,16 @@ class On24IE(InfoExtractor): }, { 'url': 
'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?&eventid=2639291&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=82829018E813065A122363877975752E&newConsole=true&nxChe=true&newTabCon=true&text_language_id=en&playerwidth=748&playerheight=526&eventuserid=338788762&contenttype=A&mediametricsessionid=384764716&mediametricid=3558192&usercd=369267058&mode=launch', 'only_matching': True, + }, { + 'url': 'https://event.on24.com/eventRegistration/EventLobbyServlet?target=reg20.jsp&eventid=3543176&key=BC0F6B968B67C34B50D461D40FDB3E18&groupId=3143628', + 'only_matching': True, + }, { + 'url': 'https://event.on24.com/eventRegistration/console/apollox/mainEvent?&eventid=4843671&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=4EAC9B5C564CC98FF29E619B06A2F743&newConsole=true&nxChe=true&newTabCon=true&consoleEarEventConsole=false&consoleEarCloudApi=false&text_language_id=en&playerwidth=748&playerheight=526&referrer=https%3A%2F%2Fevent.on24.com%2Finterface%2Fregistration%2Fautoreg%2Findex.html%3Fsessionid%3D1%26eventid%3D4843671%26key%3D4EAC9B5C564CC98FF29E619B06A2F743%26email%3D000a3e42-7952-4dd6-8f8a-34c38ea3cf02%2540platform%26firstname%3Ds%26lastname%3Ds%26deletecookie%3Dtrue%26event_email%3DN%26marketing_email%3DN%26std1%3D0642572014177%26std2%3D0642572014179%26std3%3D550165f7-a44e-4725-9fe6-716f89908c2b%26std4%3D0&eventuserid=745776448&contenttype=A&mediametricsessionid=640613707&mediametricid=6810717&usercd=745776448&mode=launch', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - event_id = mobj.group('id_1') or mobj.group('id_2') - event_key = mobj.group('key_1') or mobj.group('key_2') + event_id, event_key = self._match_valid_url(url).group('id', 'key') event_data = self._download_json( 'https://event.on24.com/apic/utilApp/EventConsoleCachedServlet', diff --git a/yt_dlp/extractor/once.py b/yt_dlp/extractor/once.py deleted file mode 100644 index 989f10abb1..0000000000 --- a/yt_dlp/extractor/once.py +++ /dev/null @@ -1,40 +0,0 @@ -import re - -from .common import InfoExtractor - - -class OnceIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor - _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/(?:ads/vmap/)?[^/]+/[^/]+/(?P[^/]+)/(?P[^/]+)/(?:[^/]+/)?(?P[^/]+)/content\.(?:once|m3u8|mp4)' - ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' - PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' - - def _extract_once_formats(self, url, http_formats_preference=None): - domain_id, application_id, media_item_id = re.match( - OnceIE._VALID_URL, url).groups() - formats = self._extract_m3u8_formats( - self.ADAPTIVE_URL_TEMPLATE % ( - domain_id, application_id, media_item_id), - media_item_id, 'mp4', m3u8_id='hls', fatal=False) - progressive_formats = [] - for adaptive_format in formats: - # Prevent advertisement from embedding into m3u8 playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/8893#issuecomment-199912684) - adaptive_format['url'] = re.sub( - r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url']) - rendition_id = self._search_regex( - r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', - adaptive_format['url'], 'redition id', default=None) - if rendition_id: - progressive_format = adaptive_format.copy() - progressive_format.update({ - 'url': 
self.PROGRESSIVE_URL_TEMPLATE % ( - domain_id, application_id, rendition_id, media_item_id), - 'format_id': adaptive_format['format_id'].replace( - 'hls', 'http'), - 'protocol': 'http', - 'preference': http_formats_preference, - }) - progressive_formats.append(progressive_format) - self._check_formats(progressive_formats, media_item_id) - formats.extend(progressive_formats) - return formats diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 91f1055193..9f307a53e2 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -14,8 +14,9 @@ int_or_none, parse_qs, srt_subtitles_timecode, - traverse_obj, + url_or_none, ) +from ..utils.traversal import traverse_obj class PanoptoBaseIE(InfoExtractor): @@ -345,21 +346,16 @@ def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs subtitles = {} for stream in streams or []: stream_formats = [] - http_stream_url = stream.get('StreamHttpUrl') - stream_url = stream.get('StreamUrl') - - if http_stream_url: - stream_formats.append({'url': http_stream_url}) - - if stream_url: + for stream_url in set(traverse_obj(stream, (('StreamHttpUrl', 'StreamUrl'), {url_or_none}))): media_type = stream.get('ViewerMediaFileTypeName') if media_type in ('hls', ): - m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) - stream_formats.extend(m3u8_formats) - subtitles = self._merge_subtitles(subtitles, stream_subtitles) + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, m3u8_id='hls', fatal=False) + stream_formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: stream_formats.append({ 'url': stream_url, + 'ext': media_type, }) for fmt in stream_formats: fmt.update({ diff --git a/yt_dlp/extractor/parti.py b/yt_dlp/extractor/parti.py new file mode 100644 index 0000000000..acadefc4e4 --- /dev/null +++ b/yt_dlp/extractor/parti.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import UserNotLive, int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class PartiBaseIE(InfoExtractor): + def _call_api(self, path, video_id, note=None): + return self._download_json( + f'https://api-backend.parti.com/parti_v2/profile/{path}', video_id, note) + + +class PartiVideoIE(PartiBaseIE): + IE_NAME = 'parti:video' + _VALID_URL = r'https?://(?:www\.)?parti\.com/video/(?P\d+)' + _TESTS = [{ + 'url': 'https://parti.com/video/66284', + 'info_dict': { + 'id': '66284', + 'ext': 'mp4', + 'title': 'NOW LIVE ', + 'upload_date': '20250327', + 'categories': ['Gaming'], + 'thumbnail': 'https://assets.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png', + 'channel': 'ItZTMGG', + 'timestamp': 1743044379, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'get_livestream_channel_info/recent/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + urljoin('https://watch.parti.com', data['livestream_recording']), video_id, 'mp4'), + **traverse_obj(data, { + 'title': ('event_title', {str}), + 'channel': ('user_name', {str}), + 'thumbnail': ('event_file', {url_or_none}), + 'categories': ('category_name', {str}, filter, all), + 'timestamp': ('event_start_ts', {int_or_none}), + }), + } + + +class PartiLivestreamIE(PartiBaseIE): + IE_NAME = 'parti:livestream' + _VALID_URL = r'https?://(?:www\.)?parti\.com/creator/(?P[\w]+)/(?P[\w/-]+)' + _TESTS = [{ + 'url': 
'https://parti.com/creator/parti/Capt_Robs_Adventures', + 'info_dict': { + 'id': 'Capt_Robs_Adventures', + 'ext': 'mp4', + 'title': r"re:I'm Live on Parti \d{4}-\d{2}-\d{2} \d{2}:\d{2}", + 'view_count': int, + 'thumbnail': r're:https://assets\.parti\.com/.+\.png', + 'timestamp': 1743879776, + 'upload_date': '20250405', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://parti.com/creator/discord/sazboxgaming/0', + 'only_matching': True, + }] + + def _real_extract(self, url): + service, creator_slug = self._match_valid_url(url).group('service', 'id') + + encoded_creator_slug = creator_slug.replace('/', '%23') + creator_id = self._call_api( + f'get_user_by_social_media/{service}/{encoded_creator_slug}', + creator_slug, note='Fetching user ID') + + data = self._call_api( + f'get_livestream_channel_info/{creator_id}', creator_id, + note='Fetching user profile feed')['channel_info'] + + if not traverse_obj(data, ('channel', 'is_live', {bool})): + raise UserNotLive(video_id=creator_id) + + channel_info = data['channel'] + + return { + 'id': creator_slug, + 'formats': self._extract_m3u8_formats( + channel_info['playback_url'], creator_slug, live=True, query={ + 'token': channel_info['playback_auth_token'], + 'player_version': '1.17.0', + }), + 'is_live': True, + **traverse_obj(data, { + 'title': ('livestream_event_info', 'event_name', {str}), + 'description': ('livestream_event_info', 'event_description', {str}), + 'thumbnail': ('livestream_event_info', 'livestream_preview_file', {url_or_none}), + 'timestamp': ('stream', 'start_time', {parse_iso8601}), + 'view_count': ('stream', 'viewer_count', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 7794cae6c0..2c1436cac1 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -340,8 +340,9 @@ def _real_extract(self, url): 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), })) - # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo - headers = {'referer': 'https://patreon.com/'} + # Must be all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, and Vimeo. 
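+ # (smuggle_url() carries these headers in the URL fragment as JSON and unsmuggle_url() restores them verbatim, so the receiving extractor sees exactly the key written here.)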
+ # patreon.com URLs redirect to www.patreon.com; this matters when requesting mux.com m3u8s + headers = {'referer': 'https://www.patreon.com/'} # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': @@ -352,7 +353,7 @@ def _real_extract(self, url): v_url, video_id, 'Checking Vimeo embed URL', headers=headers, fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection entries.append(self.url_result( - VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), + VimeoIE._smuggle_referrer(v_url, headers['referer']), VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) @@ -379,11 +380,13 @@ def _real_extract(self, url): 'url': post_file['url'], }) elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': - formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + post_file['url'], video_id, headers=headers) entries.append({ 'id': video_id, 'formats': formats, 'subtitles': subtitles, + 'http_headers': headers, }) can_view_post = traverse_obj(attributes, 'current_user_can_view') diff --git a/yt_dlp/extractor/phoenix.py b/yt_dlp/extractor/phoenix.py index 63c256019e..9df34f8c91 100644 --- a/yt_dlp/extractor/phoenix.py +++ b/yt_dlp/extractor/phoenix.py @@ -1,5 +1,3 @@ -import re - from .youtube import YoutubeIE from .zdf import ZDFBaseIE from ..utils import ( @@ -7,44 +5,27 @@ merge_dicts, try_get, unified_timestamp, - urljoin, ) class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/?#]+/)*[^/?#&]*-a-(?P\d+)\.html' _TESTS = [{ - # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html - 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/spitzbergen-a-893349.html', + 'md5': 'a79e86d9774d0b3f2102aff988a0bd32', 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', + 'id': '221215_phx_spitzbergen', 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613902500, - 'upload_date': '20210221', + 'title': 'Spitzbergen', + 'description': 'Film von Tilmann Bünz', + 'duration': 728.0, + 'timestamp': 1555600500, + 'upload_date': '20190418', 'uploader': 'Phoenix', - 'series': 'corona nachgehakt', - 'episode': 'Wohin führt der Protest in der Pandemie?', - }, - }, { - # Youtube embed - 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', - 'info_dict': { - 'id': 'hMQtqFYjomk', - 'ext': 'mp4', - 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', - 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', - 'duration': 3509, - 'upload_date': '20201219', - 'uploader': 'phoenix', - 'uploader_id': 'phoenix', - }, - 'params': { - 'skip_download': True, + 'thumbnail': 'https://www.phoenix.de/sixcms/media.php/21/Bergspitzen1.png', + 'series': 'Dokumentationen', + 'episode': 'Spitzbergen', }, }, { 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', @@ -90,8 +71,8 @@ def _real_extract(self, url): content_id = 
details['tracking']['nielsen']['content']['assetid'] info = self._extract_ptmd( - f'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/{content_id}', - content_id, None, url) + f'https://tmd.phoenix.de/tmd/2/android_native_6/vod/ptmd/phoenix/{content_id}', + content_id) duration = int_or_none(try_get( details, lambda x: x['tracking']['nielsen']['content']['length'])) @@ -101,20 +82,8 @@ def _real_extract(self, url): str) episode = title if details.get('contentType') == 'episode' else None - thumbnails = [] teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {} - for thumbnail_key, thumbnail_url in teaser_images.items(): - thumbnail_url = urljoin(url, thumbnail_url) - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) + thumbnails = self._extract_thumbnails(teaser_images) return merge_dicts(info, { 'id': content_id, diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index 72e89c31ed..92431fa241 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -10,7 +10,8 @@ class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' + IE_NAME = 'picarto' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[^/#?]+)/?(?:$|[?#])' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -89,7 +90,8 @@ def _real_extract(self, url): class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P[^/?#&]+)' + IE_NAME = 'picarto:vod' + _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+(?:/profile)?/videos)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', @@ -111,6 +113,18 @@ class PicartoVodIE(InfoExtractor): 'channel': 'ArtofZod', 'age_limit': 18, }, + }, { + 'url': 'https://picarto.tv/DrechuArt/profile/videos/400347', + 'md5': 'f9ea54868b1d9dec40eb554b484cc7bf', + 'info_dict': { + 'id': '400347', + 'ext': 'mp4', + 'title': 'Welcome to the Show', + 'thumbnail': r're:^https?://.*\.jpg', + 'channel': 'DrechuArt', + 'age_limit': 0, + }, + }, { 'url': 'https://picarto.tv/videopopout/Plague', 'only_matching': True, diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index 59231d8405..46e3a5b8ff 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -7,11 +7,12 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, parse_qs, - traverse_obj, update_url_query, urlencode_postdata, ) +from ..utils.traversal import traverse_obj, unpack class PlaySuisseIE(InfoExtractor): @@ -26,12 +27,12 @@ class PlaySuisseIE(InfoExtractor): { # episode in a series 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', - 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', + 'md5': 'e20d1ede6872a03b41905ca1060a1ef2', 'info_dict': { 'id': '763211', 'ext': 'mp4', 'title': 'Knochen', - 'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8', + 'description': 'md5:3bdd80e2ce20227c47aab1df2a79a519', 'duration': 3344, 'series': 'Wilder', 'season': 'Season 1', @@ -42,24 +43,33 @@ class PlaySuisseIE(InfoExtractor): }, }, { # film - 'url': 'https://www.playsuisse.ch/watch/808675', - 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', + 'url': 'https://www.playsuisse.ch/detail/2573198', + 'md5': 
'1f115bb0a5191477b1a5771643a4283d', 'info_dict': { - 'id': '808675', + 'id': '2573198', 'ext': 'mp4', - 'title': 'Der Läufer', - 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', - 'duration': 5280, + 'title': 'Azor', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'genres': ['Fiction'], + 'creators': ['Andreas Fontana'], + 'cast': ['Fabrizio Rongione', 'Stéphanie Cléau', 'Gilles Privat', 'Alexandre Trocki'], + 'location': 'France; Argentine', + 'release_year': 2021, + 'duration': 5981, 'thumbnail': 're:https://playsuisse-img.akamaized.net/', }, }, { # series (treated as a playlist) 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', 'id': '1115687', 'series': 'They all came out to Montreux', 'title': 'They all came out to Montreux', + 'description': 'md5:0fefd8c5b4468a0bb35e916887681520', + 'genres': ['Documentary'], + 'creators': ['Oliver Murray'], + 'location': 'Switzerland', + 'release_year': 2021, }, 'playlist': [{ 'info_dict': { @@ -120,6 +130,12 @@ class PlaySuisseIE(InfoExtractor): id name description + descriptionLong + year + contentTypes + directors + mainCast + productionCountries duration episodeNumber seasonNumber @@ -215,9 +231,7 @@ def _perform_login(self, username, password): if not self._ID_TOKEN: raise ExtractorError('Login failed') - def _get_media_data(self, media_id): - # NOTE In the web app, the "locale" header is used to switch between languages, - # However this doesn't seem to take effect when passing the header here. + def _get_media_data(self, media_id, locale=None): response = self._download_json( 'https://www.playsuisse.ch/api/graphql', media_id, data=json.dumps({ @@ -225,7 +239,7 @@ def _get_media_data(self, media_id): 'query': self._GRAPHQL_QUERY, 'variables': {'assetId': media_id}, }).encode(), - headers={'Content-Type': 'application/json', 'locale': 'de'}) + headers={'Content-Type': 'application/json', 'locale': locale or 'de'}) return response['data']['assetV2'] @@ -234,7 +248,7 @@ def _real_extract(self, url): self.raise_login_required(method='password') media_id = self._match_id(url) - media_data = self._get_media_data(media_id) + media_data = self._get_media_data(media_id, traverse_obj(parse_qs(url), ('locale', 0))) info = self._extract_single(media_data) if media_data.get('episodes'): info.update({ @@ -257,15 +271,22 @@ def _extract_single(self, media_data): self._merge_subtitles(subs, target=subtitles) return { - 'id': media_data['id'], - 'title': media_data.get('name'), - 'description': media_data.get('description'), 'thumbnails': thumbnails, - 'duration': int_or_none(media_data.get('duration')), 'formats': formats, 'subtitles': subtitles, - 'series': media_data.get('seriesName'), - 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, - 'episode_number': int_or_none(media_data.get('episodeNumber')), + **traverse_obj(media_data, { + 'id': ('id', {str}), + 'title': ('name', {str}), + 'description': (('descriptionLong', 'description'), {str}, any), + 'genres': ('contentTypes', ..., {str}), + 'creators': ('directors', ..., {str}), + 'cast': ('mainCast', ..., {str}), + 'location': ('productionCountries', ..., {str}, all, {unpack(join_nonempty, delim='; ')}, filter), + 'release_year': ('year', {str}, {lambda x: x[:4]}, {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'series': ('seriesName', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': 
('name', {str}, {lambda x: x if media_data['episodeNumber'] is not None else None}), + 'episode_number': ('episodeNumber', {int_or_none}), + }), } diff --git a/yt_dlp/extractor/podchaser.py b/yt_dlp/extractor/podchaser.py index 4570f0f175..6c125f9ba6 100644 --- a/yt_dlp/extractor/podchaser.py +++ b/yt_dlp/extractor/podchaser.py @@ -5,11 +5,13 @@ from ..utils import ( OnDemandPagedList, float_or_none, + int_or_none, + orderedSet, str_or_none, - str_to_int, - traverse_obj, unified_timestamp, + url_or_none, ) +from ..utils.traversal import require, traverse_obj class PodchaserIE(InfoExtractor): @@ -21,24 +23,25 @@ class PodchaserIE(InfoExtractor): 'id': '104365585', 'title': 'Ep. 285 – freeze me off', 'description': 'cam ahn', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'ext': 'mp3', - 'categories': ['Comedy'], + 'categories': ['Comedy', 'News', 'Politics', 'Arts'], 'tags': ['comedy', 'dark humor'], - 'series': 'Cum Town', + 'series': 'The Adam Friedland Show Podcast', 'duration': 3708, 'timestamp': 1636531259, 'upload_date': '20211110', 'average_rating': 4.0, + 'series_id': '36924', }, }, { 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', 'info_dict': { 'id': '28853', 'title': 'The Bone Zone', - 'description': 'Podcast by The Bone Zone', + 'description': r're:The official home of the Bone Zone podcast.+', }, - 'playlist_count': 275, + 'playlist_mincount': 275, }, { 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', 'info_dict': { @@ -51,19 +54,33 @@ class PodchaserIE(InfoExtractor): @staticmethod def _parse_episode(episode, podcast): - return { - 'id': str(episode.get('id')), - 'title': episode.get('title'), - 'description': episode.get('description'), - 'url': episode.get('audio_url'), - 'thumbnail': episode.get('image_url'), - 'duration': str_to_int(episode.get('length')), - 'timestamp': unified_timestamp(episode.get('air_date')), - 'average_rating': float_or_none(episode.get('rating')), - 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), - 'tags': traverse_obj(podcast, ('tags', ..., 'text')), - 'series': podcast.get('title'), - } + info = traverse_obj(episode, { + 'id': ('id', {int}, {str_or_none}, {require('episode ID')}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'url': ('audio_url', {url_or_none}), + 'thumbnail': ('image_url', {url_or_none}), + 'duration': ('length', {int_or_none}), + 'timestamp': ('air_date', {unified_timestamp}), + 'average_rating': ('rating', {float_or_none}), + }) + info.update(traverse_obj(podcast, { + 'series': ('title', {str}), + 'series_id': ('id', {int}, {str_or_none}), + 'categories': (('summary', None), 'categories', ..., 'text', {str}, filter, all, {orderedSet}), + 'tags': ('tags', ..., 'text', {str}), + })) + info['vcodec'] = 'none' + + if info.get('series_id'): + podcast_slug = traverse_obj(podcast, ('slug', {str})) or 'podcast' + episode_slug = traverse_obj(episode, ('slug', {str})) or 'episode' + info['webpage_url'] = '/'.join(( + 'https://www.podchaser.com/podcasts', + '-'.join((podcast_slug[:30].rstrip('-'), info['series_id'])), + '-'.join((episode_slug[:30].rstrip('-'), info['id'])))) + + return info def _call_api(self, path, *args, **kwargs): return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs) @@ -93,5 +110,5 @@ def _real_extract(self, url): OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE), 
str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description')) - episode = self._call_api(f'episodes/{episode_id}', episode_id) + episode = self._call_api(f'podcasts/{podcast_id}/episodes/{episode_id}/player_ids', episode_id) return self._parse_episode(episode, podcast) diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 6fb21e156d..9d0496bdf0 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -22,7 +22,7 @@ ) -class PolskieRadioBaseExtractor(InfoExtractor): +class PolskieRadioBaseIE(InfoExtractor): def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): media_urls = set() @@ -47,7 +47,7 @@ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): yield entry -class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): +class PolskieRadioLegacyIE(PolskieRadioBaseIE): # legacy sites IE_NAME = 'polskieradio:legacy' _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P\d+)' @@ -127,7 +127,7 @@ def _real_extract(self, url): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioIE(PolskieRadioBaseExtractor): +class PolskieRadioIE(PolskieRadioBaseIE): # new next.js sites _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P\d+)' _TESTS = [{ @@ -519,7 +519,7 @@ def _real_extract(self, url): } -class PolskieRadioPodcastBaseExtractor(InfoExtractor): +class PolskieRadioPodcastBaseIE(InfoExtractor): _API_BASE = 'https://apipodcasts.polskieradio.pl/api' def _parse_episode(self, data): @@ -539,7 +539,7 @@ def _parse_episode(self, data): } -class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): +class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseIE): IE_NAME = 'polskieradio:podcast:list' _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P\d+)' _TESTS = [{ @@ -578,7 +578,7 @@ def get_page(page_num): } -class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): +class PolskieRadioPodcastIE(PolskieRadioPodcastBaseIE): IE_NAME = 'polskieradio:podcast' _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})' _TESTS = [{ diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index efb47affc9..c489dc7312 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -321,6 +321,27 @@ class RaiPlayIE(RaiBaseIE): 'timestamp': 1348495020, 'upload_date': '20120924', }, + }, { + # checking program_info gives false positive for DRM + 'url': 'https://www.raiplay.it/video/2022/10/Ad-ogni-costo---Un-giorno-in-Pretura---Puntata-del-15102022-1dfd1295-ea38-4bac-b51e-f87e2881693b.html', + 'md5': '572c6f711b7c5f2d670ba419b4ae3b08', + 'info_dict': { + 'id': '1dfd1295-ea38-4bac-b51e-f87e2881693b', + 'ext': 'mp4', + 'title': 'Ad ogni costo - Un giorno in Pretura - Puntata del 15/10/2022', + 'alt_title': 'St 2022/23 - Un giorno in pretura - Ad ogni costo', + 'description': 'md5:4046d97b2687f74f06a8b8270ba5599f', + 'uploader': 'Rai 3', + 'duration': 3773.0, + 'thumbnail': 'https://www.raiplay.it/dl/img/2022/10/12/1665586539957_2048x2048.png', + 'creators': ['Rai 3'], + 'series': 'Un giorno in pretura', + 'season': '2022/23', + 'episode': 'Ad ogni costo', + 'timestamp': 1665507240, + 'upload_date': '20221011', + 'release_year': 2025, + }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, @@ -340,9 +361,8 @@ def _real_extract(self, url): 
media = self._download_json( f'{base}.json', video_id, 'Downloading video JSON') - if not self.get_param('allow_unplayable_formats'): - if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): - self.report_drm(video_id) + if traverse_obj(media, ('rights_management', 'rights', 'drm')): + self.report_drm(video_id) video = media['video'] relinker_info = self._extract_relinker_info(video['content_url'], video_id) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index a7a6a75dc4..be4c32e135 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -388,7 +388,8 @@ def add_thumbnail(src): }) if entries: return self.playlist_result(entries, video_id, **info) - raise ExtractorError('No media found', expected=True) + self.raise_no_formats('No media found', expected=True, video_id=video_id) + return {**info, 'id': video_id} # Check if media is hosted on reddit: reddit_video = traverse_obj(data, ( diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py index 870e33253f..cd3cd323e5 100644 --- a/yt_dlp/extractor/redgifs.py +++ b/yt_dlp/extractor/redgifs.py @@ -12,7 +12,7 @@ ) -class RedGifsBaseInfoExtractor(InfoExtractor): +class RedGifsBaseIE(InfoExtractor): _FORMATS = { 'gif': 250, 'sd': 480, @@ -113,7 +113,7 @@ def _paged_entries(self, ep, item_id, query, fields): return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE) -class RedGifsIE(RedGifsBaseInfoExtractor): +class RedGifsIE(RedGifsBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/(?:watch|ifr)/|thumbs2\.redgifs\.com/)(?P[^-/?#\.]+)' _TESTS = [{ 'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent', @@ -172,7 +172,7 @@ def _real_extract(self, url): return self._parse_gif_data(video_info['gif']) -class RedGifsSearchIE(RedGifsBaseInfoExtractor): +class RedGifsSearchIE(RedGifsBaseIE): IE_DESC = 'Redgifs search' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P[^#]+)' _PAGE_SIZE = 80 @@ -226,7 +226,7 @@ def _real_extract(self, url): entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}') -class RedGifsUserIE(RedGifsBaseInfoExtractor): +class RedGifsUserIE(RedGifsBaseIE): IE_DESC = 'Redgifs user' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P[^/?#]+)(?:\?(?P[^#]+))?' 
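    # The search/user feeds above build their playlists lazily: _paged_entries() hands
    # OnDemandPagedList a fetcher that is only invoked when a page is actually needed.
    # A minimal sketch of that pattern, kept as a comment; the endpoint and response
    # shape are illustrative assumptions, not the real RedGifs API:
    #
    #     def fetch_page(pagenum):  # pagenum is 0-indexed
    #         data = self._call_api('gifs/search', item_id,
    #                               query={'page': pagenum + 1, 'count': self._PAGE_SIZE})
    #         return [self._parse_gif_data(gif) for gif in data.get('gifs') or []]
    #
    #     entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)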
_PAGE_SIZE = 80 diff --git a/yt_dlp/extractor/roya.py b/yt_dlp/extractor/roya.py new file mode 100644 index 0000000000..e9fe304eeb --- /dev/null +++ b/yt_dlp/extractor/roya.py @@ -0,0 +1,43 @@ +from .common import InfoExtractor +from ..utils.traversal import traverse_obj + + +class RoyaLiveIE(InfoExtractor): + _VALID_URL = r'https?://roya\.tv/live-stream/(?P\d+)' + _TESTS = [{ + 'url': 'https://roya.tv/live-stream/1', + 'info_dict': { + 'id': '1', + 'title': r're:Roya TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://roya.tv/live-stream/21', + 'info_dict': { + 'id': '21', + 'title': r're:Roya News \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://roya.tv/live-stream/10000', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + stream_url = self._download_json( + f'https://ticket.roya-tv.com/api/v5/fastchannel/{media_id}', media_id)['data']['secured_url'] + + title = traverse_obj( + self._download_json('https://backend.roya.tv/api/v01/channels/schedule-pagination', media_id, fatal=False), + ('data', 0, 'channel', lambda _, v: str(v['id']) == media_id, 'title', {str}, any)) + + return { + 'id': media_id, + 'formats': self._extract_m3u8_formats(stream_url, media_id, 'mp4', m3u8_id='hls', live=True), + 'title': title, + 'is_live': True, + } diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 7e0b666ab3..2812d93059 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,35 +1,142 @@ import base64 import io import struct +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, determine_ext, float_or_none, + make_archive_id, + parse_iso8601, qualities, - remove_end, - remove_start, - try_get, + url_or_none, ) +from ..utils.traversal import subs_list_to_dict, traverse_obj -class RTVEALaCartaIE(InfoExtractor): +class RTVEBaseIE(InfoExtractor): + # Reimplementation of https://js2.rtve.es/pages/app-player/3.5.1/js/pf_video.js + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) + while True: + length_data = encrypted_data.read(4) + length = struct.unpack('!I', length_data)[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + data = bytes(filter(None, data)) + alphabet_data, _, url_data = data.partition(b'#') + quality_str, _, url_data = url_data.rpartition(b'%%') + quality_str = quality_str.decode() or '' + alphabet = RTVEBaseIE._get_alphabet(alphabet_data) + url = RTVEBaseIE._get_url(alphabet, url_data) + yield quality_str, url + encrypted_data.read(4) # CRC + + @staticmethod + def _get_url(alphabet, url_data): + url = '' + f = 0 + e = 3 + b = 1 + for char in url_data.decode('iso-8859-1'): + if f == 0: + l = int(char) * 10 + f = 1 + else: + if e == 0: + l += int(char) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + return url + + @staticmethod + def _get_alphabet(alphabet_data): + alphabet = [] + e = 0 + d = 0 + for char in alphabet_data.decode('iso-8859-1'): + if d == 0: + alphabet.append(char) + d = e = (e + 1) % 4 + else: + d -= 1 + return alphabet + + def _extract_png_formats_and_subtitles(self, video_id, media_type='videos'): + formats, subtitles = [], {} + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + for manager in ('rtveplayw', 'default'): + png = 
self._download_webpage( + f'http://www.rtve.es/ztnr/movil/thumbnail/{manager}/{media_type}/{video_id}.png', + video_id, 'Downloading url information', query={'q': 'v2'}, fatal=False) + if not png: + continue + + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, 'dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + return formats, subtitles + + def _parse_metadata(self, metadata): + return traverse_obj(metadata, { + 'title': ('title', {str.strip}), + 'alt_title': ('alt', {str.strip}), + 'description': ('description', {clean_html}), + 'timestamp': ('dateOfEmission', {parse_iso8601(delimiter=' ')}), + 'release_timestamp': ('publicationDate', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('modificationDate', {parse_iso8601(delimiter=' ')}), + 'thumbnail': (('thumbnail', 'image', 'imageSEO'), {url_or_none}, any), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'is_live': ('live', {bool}), + 'series': (('programTitle', ('programInfo', 'title')), {clean_html}, any), + }) + + +class RTVEALaCartaIE(RTVEBaseIE): IE_NAME = 'rtve.es:alacarta' - IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P\d+)' + IE_DESC = 'RTVE a la carta and Play' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/(?:m/)?(?:(?:alacarta|play)/videos|filmoteca)/(?!directo)(?:[^/?#]+/){2}(?P\d+)', + r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/?#]+/video/[^/?#]+/(?P\d+)', + ] _TESTS = [{ - 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'url': 'http://www.rtve.es/alacarta/videos/la-aventura-del-saber/aventuraentornosilla/3088905/', + 'md5': 'a964547824359a5753aef09d79fe984b', 'info_dict': { - 'id': '2491869', + 'id': '3088905', 'ext': 'mp4', - 'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia', - 'duration': 5024.566, - 'series': 'Balonmano', + 'title': 'En torno a la silla', + 'duration': 1216.981, + 'series': 'La aventura del Saber', + 'thumbnail': 'https://img2.rtve.es/v/aventuraentornosilla_3088905.png', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', @@ -38,140 +145,88 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'live_status': 'is_live', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', }, 'params': { 'skip_download': 'live stream', }, }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'md5': 'f3cf0d1902d008c48c793e736706c174', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104', - 'duration': 3222.0, + 'title': 'Episodio 104', + 'duration': 3222.8, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'Servir y proteger', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, }, { 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', 'only_matching': True, + }, { + 'url': 'https://www.rtve.es/play/videos/saber-vivir/07-07-24/16177116/', + 'md5': 'a5b24fcdfa3ff5cb7908aba53d22d4b6', + 'info_dict': { + 'id': '16177116', + 'ext': 'mp4', + 'title': 'Saber vivir - 07/07/24', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 2162.68, + 'series': 'Saber vivir', + }, + }, { + 'url': 'https://www.rtve.es/infantil/serie/agus-lui-churros-crafts/video/gusano/7048976/', + 'info_dict': { + 'id': '7048976', + 'ext': 'mp4', + 'title': 'Gusano', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 292.86, + 'series': 'Agus & Lui: Churros y Crafts', + '_old_archive_ids': ['rtveinfantil 7048976'], + }, }] - def _real_initialize(self): - user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode()).decode('utf-8') - self._manager = self._download_json( - 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info')['manager'] - - @staticmethod - def _decrypt_url(png): - encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) - while True: - length = struct.unpack('!I', encrypted_data.read(4))[0] - chunk_type = encrypted_data.read(4) - if chunk_type == b'IEND': - break - data = encrypted_data.read(length) - if chunk_type == b'tEXt': - alphabet_data, text = data.split(b'\0') - quality, url_data = text.split(b'%%') - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data.decode('iso-8859-1'): - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data.decode('iso-8859-1'): - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - yield quality.decode(), url - encrypted_data.read(4) # CRC - - def _extract_png_formats(self, video_id): - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/videos/{video_id}.png', - 
video_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, video_url in self._decrypt_url(png): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': video_url, - }) - return formats + def _get_subtitles(self, video_id): + subtitle_data = self._download_json( + f'https://api2.rtve.es/api/videos/{video_id}/subtitulos.json', video_id, + 'Downloading subtitles info') + return traverse_obj(subtitle_data, ('page', 'items', ..., { + 'id': ('lang', {str}), + 'url': ('src', {url_or_none}), + }, all, {subs_list_to_dict(lang='es')})) def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_json( + metadata = self._download_json( f'http://www.rtve.es/api/videos/{video_id}/config/alacarta_videos.json', video_id)['page']['items'][0] - if info['state'] == 'DESPU': + if metadata['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'].strip() - formats = self._extract_png_formats(video_id) + formats, subtitles = self._extract_png_formats_and_subtitles(video_id) - subtitles = None - sbt_file = info.get('sbtFile') - if sbt_file: - subtitles = self.extract_subtitles(video_id, sbt_file) + self._merge_subtitles(self.extract_subtitles(video_id), target=subtitles) - is_live = info.get('live') is True + is_infantil = urllib.parse.urlparse(url).path.startswith('/infantil/') return { 'id': video_id, - 'title': title, 'formats': formats, - 'thumbnail': info.get('image'), 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), 1000), - 'is_live': is_live, - 'series': info.get('programTitle'), + **self._parse_metadata(metadata), + '_old_archive_ids': [make_archive_id('rtveinfantil', video_id)] if is_infantil else None, } - def _get_subtitles(self, video_id, sub_file): - subs = self._download_json( - sub_file + '.json', video_id, - 'Downloading subtitles info')['page']['items'] - return dict( - (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) - for s in subs) - -class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVEAudioIE(RTVEBaseIE): IE_NAME = 'rtve.es:audio' IE_DESC = 'RTVE audio' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/(?:[^/?#]+/){2}(?P\d+)' _TESTS = [{ 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', @@ -180,9 +235,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5889192', 'ext': 'mp3', 'title': 'Códigos informáticos', - 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'alt_title': 'Códigos informáticos - Escuchar ahora', 'duration': 349.440, 'series': 'A hombros de gigantes', + 'description': 'md5:72b0d7c1ca20fd327bdfff7ac0171afb', + 'thumbnail': 'https://img2.rtve.es/a/palabra-ingeniero-codigos-informaticos-270421_5889192.png', }, }, { 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', @@ -191,9 +248,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5791165', 'ext': 'mp3', 
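            # (The formats behind these audio tests come from _extract_png_formats_and_subtitles(..., media_type='audios') above: RTVE serves a thumbnail PNG whose tEXt chunks hide the quality/URL pairs, and _decrypt_url() walks the PNG chunk by chunk to recover them.)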
'title': 'Ignatius Farray', + 'alt_title': 'En Radio 3 - Ignatius Farray - 13/02/21 - escuchar ahora', 'thumbnail': r're:https?://.+/1613243011863.jpg', 'duration': 3559.559, 'series': 'En Radio 3', + 'description': 'md5:124aa60b461e0b1724a380bad3bc4040', }, }, { 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', @@ -202,126 +261,101 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '6082623', 'ext': 'mp3', 'title': 'Capítulo 26 y último: La muerte de Victor', + 'alt_title': 'Frankenstein o el moderno Prometeo - Capítulo 26 y último: La muerte de Victor', 'thumbnail': r're:https?://.+/1632147445707.jpg', 'duration': 3174.086, 'series': 'Frankenstein o el moderno Prometeo', + 'description': 'md5:4ee6fcb82ebe2e46d267e1d1c1a8f7b5', }, }] - def _extract_png_formats(self, audio_id): - """ - This function retrieves media related png thumbnail which obfuscate - valuable information about the media. This information is decrypted - via base class _decrypt_url function providing media quality and - media url - """ - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/audios/{audio_id}.png', - audio_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, audio_url in self._decrypt_url(png): - ext = determine_ext(audio_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - audio_url, audio_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - audio_url, audio_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': audio_url, - }) - return formats - def _real_extract(self, url): audio_id = self._match_id(url) - info = self._download_json( - f'https://www.rtve.es/api/audios/{audio_id}.json', - audio_id)['page']['items'][0] + metadata = self._download_json( + f'https://www.rtve.es/api/audios/{audio_id}.json', audio_id)['page']['items'][0] + + formats, subtitles = self._extract_png_formats_and_subtitles(audio_id, media_type='audios') return { 'id': audio_id, - 'title': info['title'].strip(), - 'thumbnail': info.get('thumbnail'), - 'duration': float_or_none(info.get('duration'), 1000), - 'series': try_get(info, lambda x: x['programInfo']['title']), - 'formats': self._extract_png_formats(audio_id), + 'formats': formats, + 'subtitles': subtitles, + **self._parse_metadata(metadata), } -class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'rtve.es:infantil' - IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' - - _TESTS = [{ - 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '5747454717aedf9f9fdf212d1bcfc48d', - 'info_dict': { - 'id': '3040283', - 'ext': 'mp4', - 'title': 'Maneras de vivir', - 'thumbnail': r're:https?://.+/1426182947956\.JPG', - 'duration': 357.958, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }] - - -class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVELiveIE(RTVEBaseIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' + _VALID_URL = [ + 
r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)', + r'https?://(?:www\.)?rtve\.es/play/videos/directo/[^/?#]+/(?P[a-zA-Z0-9-]+)', + ] _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, - 'params': { - 'skip_download': 'live stream', + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'https://www.rtve.es/play/videos/directo/deportes/tdp/', + 'info_dict': { + 'id': 'tdp', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img2\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'http://www.rtve.es/play/videos/directo/canales-lineales/la-1/', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') - title = remove_start(title, 'Estoy viendo ') - vidplayer_id = self._search_regex( - (r'playerId=player([0-9]+)', - r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', - r'data-id=["\'](\d+)'), - webpage, 'internal video ID') + data_setup = self._search_json( + r']+class="[^"]*videoPlayer[^"]*"[^>]*data-setup=\'', + webpage, 'data_setup', video_id) + + formats, subtitles = self._extract_png_formats_and_subtitles(data_setup['idAsset']) return { 'id': video_id, - 'title': title, - 'formats': self._extract_png_formats(vidplayer_id), + **self._search_json_ld(webpage, video_id, fatal=False), + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'subtitles': subtitles, 'is_live': True, } class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/?#]+/[^/?#]+/(?P\d+).shtml' _TEST = { - 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'url': 'https://www.rtve.es/television/20091103/video-inedito-del-8o-programa/299020.shtml', 'info_dict': { - 'id': '3069778', + 'id': '572515', 'ext': 'mp4', - 'title': 'Documentos TV - La revolución del móvil', - 'duration': 3496.948, + 'title': 'Clase inédita', + 'duration': 335.817, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'El coro de la cárcel', }, 'params': { 'skip_download': True, @@ -332,11 +366,8 @@ def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - alacarta_url = self._search_regex( - r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', - webpage, 'alacarta url', default=None) - if alacarta_url is None: - raise ExtractorError( - 'The webpage doesn\'t contain any video', expected=True) + play_url = self._html_search_meta('contentUrl', webpage) + if play_url is None: + raise ExtractorError('The webpage doesn\'t contain any video', expected=True) - return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) + return self.url_result(play_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/yt_dlp/extractor/rtvs.py b/yt_dlp/extractor/rtvs.py index 
927da57787..fcbc88a9f9 100644 --- a/yt_dlp/extractor/rtvs.py +++ b/yt_dlp/extractor/rtvs.py @@ -9,7 +9,9 @@ class RTVSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P\d+)/?(?:[#?]|$)' + IE_NAME = 'stvr' + IE_DESC = 'Slovak Television and Radio (formerly RTVS)' + _VALID_URL = r'https?://(?:www\.)?(?:rtvs|stvr)\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P\d+)/?(?:[#?]|$)' _TESTS = [{ # radio archive 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', @@ -19,7 +21,7 @@ class RTVSIE(InfoExtractor): 'ext': 'mp3', 'title': 'Ostrov pokladov 1 časť.mp3', 'duration': 2854, - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0000/b1R8.rtvs.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0000/rtvs-00009383.png', 'display_id': '135331', }, }, { @@ -30,7 +32,7 @@ class RTVSIE(InfoExtractor): 'ext': 'mp4', 'title': 'Amaro Džives - Náš deň', 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.', - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', 'timestamp': 1428555900, 'upload_date': '20150409', 'duration': 4986, @@ -47,8 +49,11 @@ class RTVSIE(InfoExtractor): 'display_id': '307655', 'duration': 831, 'upload_date': '20211111', - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0916/robin.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0916/robin.jpg', }, + }, { + 'url': 'https://www.stvr.sk/radio/archiv/11224/414872', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 74c7e4f176..757d6994ca 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -7,7 +7,6 @@ ExtractorError, UnsupportedError, clean_html, - determine_ext, extract_attributes, format_field, get_element_by_class, @@ -36,7 +35,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20191020', 'channel_url': 'https://rumble.com/c/WMAR', 'channel': 'WMAR', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 234, 'uploader': 'WMAR', 'live_status': 'not_live', @@ -52,7 +51,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20220217', 'channel_url': 'https://rumble.com/c/CyberTechNews', 'channel': 'CTNews', - 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 901, 'uploader': 'CTNews', 'live_status': 'not_live', @@ -114,6 +113,22 @@ class RumbleEmbedIE(InfoExtractor): 'live_status': 'was_live', }, 'params': {'skip_download': True}, + }, { + 'url': 'https://rumble.com/embed/v6pezdb', + 'info_dict': { + 'id': 'v6pezdb', + 'ext': 'mp4', + 'title': '"Es war einmal ein Mädchen" – Ein filmisches Zeitzeugnis aus Leningrad 1944', + 'uploader': 'RT DE', + 'channel': 'RT DE', + 'channel_url': 'https://rumble.com/c/RTDE', + 'duration': 309, + 'thumbnail': 'https://1a-1791.com/video/fww1/dc/s8/1/n/z/2/y/nz2yy.qR4e-small-Es-war-einmal-ein-Mdchen-Ei.jpg', + 'timestamp': 1743703500, + 'upload_date': '20250403', + 'live_status': 'not_live', + }, + 'params': {'skip_download': True}, }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, @@ -168,40 +183,42 @@ def _real_extract(self, url): live_status = None formats = [] - for ext, ext_info in (video.get('ua') or {}).items(): - if isinstance(ext_info, dict): - for height, 
video_info in ext_info.items(): + for format_type, format_info in (video.get('ua') or {}).items(): + if isinstance(format_info, dict): + for height, video_info in format_info.items(): if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): video_info.setdefault('meta', {})['h'] = height - ext_info = ext_info.values() + format_info = format_info.values() - for video_info in format_info: meta = video_info.get('meta') or {} if not video_info.get('url'): continue - if ext == 'hls': + # With the default query params this returns duplicate m3u8 variants; without them it returns tar files + if format_type == 'tar': + continue + if format_type == 'hls': if meta.get('live') is True and video.get('live') == 1: live_status = 'post_live' formats.extend(self._extract_m3u8_formats( video_info['url'], video_id, ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) continue - timeline = ext == 'timeline' - if timeline: - ext = determine_ext(video_info['url']) + is_timeline = format_type == 'timeline' + is_audio = format_type == 'audio' formats.append({ - 'ext': ext, - 'acodec': 'none' if timeline else None, + 'acodec': 'none' if is_timeline else None, + 'vcodec': 'none' if is_audio else None, 'url': video_info['url'], - 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')), - 'format_note': 'Timeline' if timeline else None, - 'fps': None if timeline else video.get('fps'), + 'format_id': join_nonempty(format_type, format_field(meta, 'h', '%sp')), + 'format_note': 'Timeline' if is_timeline else None, + 'fps': None if is_timeline or is_audio else video.get('fps'), **traverse_obj(meta, { - 'tbr': 'bitrate', - 'filesize': 'size', - 'width': 'w', - 'height': 'h', - }, expected_type=lambda x: int(x) or None), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'width': ('w', {int_or_none}), + 'height': ('h', {int_or_none}), + }), }) subtitles = { diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 8d61e22fce..7edb5214ee 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -122,6 +122,15 @@ def _real_extract(self, url): if traverse_obj(media, ('partOfSeries', {dict})): media['epName'] = traverse_obj(media, ('title', {str})) + # Forced subs need a distinct language tag, or else they take priority over the full subs + fixed_subtitles = {} + for lang, subs in subtitles.items(): + for sub in subs: + fixed_lang = lang + if sub['url'].lower().endswith('_fe.vtt'): + fixed_lang += '-forced' + fixed_subtitles.setdefault(fixed_lang, []).append(sub) + return { 'id': video_id, **traverse_obj(media, { @@ -151,6 +160,6 @@ def _real_extract(self, url): }), }), 'formats': formats, - 'subtitles': subtitles, + 'subtitles': fixed_subtitles, 'uploader': 'SBSC', } diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index efcdb79d0c..dc275e58da 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -13,7 +13,7 @@ class SenateISVPIE(InfoExtractor): - _IE_NAME = 'senate.gov:isvp' + IE_NAME = 'senate.gov:isvp' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"] @@ -137,7 +137,7 @@ def _real_extract(self, url): class SenateGovIE(InfoExtractor): - _IE_NAME = 'senate.gov' + IE_NAME = 'senate.gov' _SUBDOMAIN_RE = '|'.join(map(re.escape, ( 'agriculture', 'aging', 'appropriations', 'armed-services', 'banking', 'budget', 'commerce', 'energy', 'epw', 'finance', 'foreign', 'help', diff
--git a/yt_dlp/extractor/soundcloud.py index c70940a606..3496a08ef6 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -697,7 +697,7 @@ def _real_extract(self, url): try: return self._extract_info_dict(info, full_title, token) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + if not isinstance(e.cause, HTTPError) or e.cause.status != 429: raise self.report_warning( 'You have reached the API rate limit, which is ~600 requests per ' diff --git a/yt_dlp/extractor/sprout.py b/yt_dlp/extractor/sprout.py deleted file mode 100644 index 444a6c270f..0000000000 --- a/yt_dlp/extractor/sprout.py +++ /dev/null @@ -1,61 +0,0 @@ -from .adobepass import AdobePassIE -from ..utils import ( - int_or_none, - smuggle_url, - update_url_query, -) - - -class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', - 'info_dict': { - 'id': 'bm0foJFaTKqb', - 'ext': 'mp4', - 'title': 'Robot Bike Race', - 'description': 'md5:436b1d97117cc437f54c383f4debc66d', - 'timestamp': 1606148940, - 'upload_date': '20201123', - 'uploader': 'NBCU-MPAT', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'only_matching': True, - }, { - 'url': 'https://www.universalkids.com/watch/robot-bike-race', - 'only_matching': True, - }] - _GEO_COUNTRIES = ['US'] - - def _real_extract(self, url): - display_id = self._match_id(url) - mpx_metadata = self._download_json( - # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ - 'https://www.universalkids.com/_api/videos/' + display_id, - display_id)['mpxMetadata'] - media_pid = mpx_metadata['mediaPid'] - theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if mpx_metadata.get('entitlement') == 'auth': - query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') - theplatform_url = smuggle_url( - update_url_query(theplatform_url, query), { - 'force_smil_url': True, - 'geo_countries': self._GEO_COUNTRIES, - }) - return { - '_type': 'url_transparent', - 'id': media_pid, - 'url': theplatform_url, - 'series': mpx_metadata.get('seriesName'), - 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), - 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), - 'ie_key': 'ThePlatform', - } diff --git a/yt_dlp/extractor/streaks.py b/yt_dlp/extractor/streaks.py new file mode 100644 index 0000000000..1b3718473d --- /dev/null +++ b/yt_dlp/extractor/streaks.py @@ -0,0 +1,236 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + filter_dict, + float_or_none, + join_nonempty, + mimetype2ext, + parse_iso8601, + unsmuggle_url, + update_url_query, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class StreaksBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 'https://{}.api.streaks.jp/v1/projects/{}/medias/{}{}' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['JP'] + + def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=None, ssai=False): + try: + response = self._download_json( + self._API_URL_TEMPLATE.format('playback', project_id, media_id, ''), + media_id, 'Downloading
STREAKS playback API JSON', headers={ + 'Accept': 'application/json', + 'Origin': 'https://players.streaks.jp', + **self.geo_verification_headers(), + **(headers or {}), + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}: + error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False) + message = traverse_obj(error, ('message', {str})) + code = traverse_obj(error, ('code', {str})) + if code == 'REQUEST_FAILED': + self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES) + elif code == 'MEDIA_NOT_FOUND': + raise ExtractorError(message, expected=True) + elif code or message: + raise ExtractorError(join_nonempty(code, message, delim=': ')) + raise + + streaks_id = response['id'] + live_status = { + 'clip': 'was_live', + 'file': 'not_live', + 'linear': 'is_live', + 'live': 'is_live', + }.get(response.get('type')) + + formats, subtitles = [], {} + drm_formats = False + + for source in traverse_obj(response, ('sources', lambda _, v: v['src'])): + if source.get('key_systems'): + drm_formats = True + continue + + src_url = source['src'] + is_live = live_status == 'is_live' + ext = mimetype2ext(source.get('type')) + if ext != 'm3u8': + self.report_warning(f'Unsupported stream type: {ext}') + continue + + if is_live and ssai: + session_params = traverse_obj(self._download_json( + self._API_URL_TEMPLATE.format('ssai', project_id, streaks_id, '/ssai/session'), + media_id, 'Downloading session parameters', + headers={'Content-Type': 'application/json', 'Accept': 'application/json'}, + data=json.dumps({'id': source['id']}).encode(), + ), (0, 'query', {urllib.parse.parse_qs})) + src_url = update_url_query(src_url, session_params) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, media_id, 'mp4', m3u8_id='hls', fatal=False, live=is_live, query=query) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + if not formats and drm_formats: + self.report_drm(media_id) + self._remove_duplicate_formats(formats) + + for subs in traverse_obj(response, ( + 'tracks', lambda _, v: v['kind'] in ('captions', 'subtitles') and url_or_none(v['src']), + )): + lang = traverse_obj(subs, ('srclang', {str.lower})) or 'ja' + subtitles.setdefault(lang, []).append({'url': subs['src']}) + + return { + 'id': streaks_id, + 'display_id': media_id, + 'formats': formats, + 'live_status': live_status, + 'subtitles': subtitles, + 'uploader_id': project_id, + **traverse_obj(response, { + 'title': ('name', {str}), + 'description': ('description', {str}, filter), + 'duration': ('duration', {float_or_none}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'tags': ('tags', ..., {str}), + 'thumbnails': (('poster', 'thumbnail'), 'src', {'url': {url_or_none}}), + 'timestamp': ('created_at', {parse_iso8601}), + }), + } + + +class StreaksIE(StreaksBaseIE): + _VALID_URL = [ + r'https?://players\.streaks\.jp/(?P<project_id>[\w-]+)/[\da-f]+/index\.html\?(?:[^#]+&)?m=(?P<id>(?:ref:)?[\w-]+)', + r'https?://playback\.api\.streaks\.jp/v1/projects/(?P<project_id>[\w-]+)/medias/(?P<id>(?:ref:)?[\w-]+)', + ] + _EMBED_REGEX = [rf'<iframe[^>]*\bsrc\s*=\s*["\'](?P<url>{_VALID_URL[0]})'] + _TESTS = [{ + 'url': 'https://players.streaks.jp/tipness/08155cd19dc14c12bebefb69b92eafcc/index.html?m=dbdf2df35b4d483ebaeeaeb38c594647', + 'info_dict': { + 'id': 'dbdf2df35b4d483ebaeeaeb38c594647', + 'ext': 'mp4', + 'title': '3shunenCM_edit.mp4', + 'display_id': 'dbdf2df35b4d483ebaeeaeb38c594647', + 'duration': 47.533, + 'live_status': 'not_live', + 'modified_date': '20230726',
'modified_timestamp': 1690356180, + 'timestamp': 1690355996, + 'upload_date': '20230726', + 'uploader_id': 'tipness', + }, + }, { + 'url': 'https://players.streaks.jp/ktv-web/0298e8964c164ab384c07ef6e08c444b/index.html?m=ref:mycoffeetime_250317', + 'info_dict': { + 'id': 'dccdc079e3fd41f88b0c8435e2d453ab', + 'ext': 'mp4', + 'title': 'わたしの珈琲時間_250317', + 'display_id': 'ref:mycoffeetime_250317', + 'duration': 122.99, + 'live_status': 'not_live', + 'modified_date': '20250310', + 'modified_timestamp': 1741586302, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1741585839, + 'upload_date': '20250310', + 'uploader_id': 'ktv-web', + }, + }, { + 'url': 'https://playback.api.streaks.jp/v1/projects/ktv-web/medias/b5411938e1e5435dac71edf829dd4813', + 'info_dict': { + 'id': 'b5411938e1e5435dac71edf829dd4813', + 'ext': 'mp4', + 'title': 'KANTELE_SYUSEi_0630', + 'display_id': 'b5411938e1e5435dac71edf829dd4813', + 'live_status': 'not_live', + 'modified_date': '20250122', + 'modified_timestamp': 1737522999, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1735205137, + 'upload_date': '20241226', + 'uploader_id': 'ktv-web', + }, + }, { + # TVer Olympics: website already down, but api remains accessible + 'url': 'https://playback.api.streaks.jp/v1/projects/tver-olympic/medias/ref:sp_240806_1748_dvr', + 'info_dict': { + 'id': 'c10f7345adb648cf804d7578ab93b2e3', + 'ext': 'mp4', + 'title': 'サッカー 男子 準決勝_dvr', + 'display_id': 'ref:sp_240806_1748_dvr', + 'duration': 12960.0, + 'live_status': 'was_live', + 'modified_date': '20240805', + 'modified_timestamp': 1722896263, + 'timestamp': 1722777618, + 'upload_date': '20240804', + 'uploader_id': 'tver-olympic', + }, + }, { + # TBS FREE: 24-hour stream + 'url': 'https://playback.api.streaks.jp/v1/projects/tbs/medias/ref:simul-02', + 'info_dict': { + 'id': 'c4e83a7b48f4409a96adacec674b4e22', + 'ext': 'mp4', + 'title': str, + 'display_id': 'ref:simul-02', + 'live_status': 'is_live', + 'modified_date': '20241031', + 'modified_timestamp': 1730339858, + 'timestamp': 1705466840, + 'upload_date': '20240117', + 'uploader_id': 'tbs', + }, + }, { + # DRM protected + 'url': 'https://players.streaks.jp/sp-jbc/a12d7ee0f40c49d6a0a2bff520639677/index.html?m=5f89c62f37ee4a68be8e6e3b1396c7d8', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://event.play.jp/playnext2023/', + 'info_dict': { + 'id': '2d975178293140dc8074a7fc536a7604', + 'ext': 'mp4', + 'title': 'PLAY NEXTキームービー(本番)', + 'uploader_id': 'play', + 'duration': 17.05, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1668387517, + 'upload_date': '20221114', + 'modified_timestamp': 1739411523, + 'modified_date': '20250213', + 'live_status': 'not_live', + }, + }, { + 'url': 'https://wowshop.jp/Page/special/cooking_goods/?bid=wowshop&srsltid=AfmBOor_phUNoPEE_UCPiGGSCMrJE5T2US397smvsbrSdLqUxwON0el4', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '?bid=wowshop&srsltid=AfmBOor_phUNoPEE_UCPiGGSCMrJE5T2US397smvsbrSdLqUxwON0el4', + 'title': 'ワンランク上の料理道具でとびきりの“おいしい”を食卓へ|wowshop', + 'description': 'md5:914b5cb8624fc69274c7fb7b2342958f', + 'age_limit': 0, + 'thumbnail': 'https://wowshop.jp/Page/special/cooking_goods/images/ogp.jpg', + }, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + project_id, media_id = self._match_valid_url(url).group('project_id', 'id') + + return self._extract_from_streaks_api( + project_id, media_id, headers=filter_dict({ + 'X-Streaks-Api-Key': smuggled_data.get('api_key'), + })) diff --git a/yt_dlp/extractor/svt.py 
b/yt_dlp/extractor/svt.py index b5df2e1a18..6a72f8d420 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -471,8 +471,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage) - urql_state = self._search_json( - r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) + urql_state = self._search_json(r'urqlState\s*[=:]', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} diff --git a/yt_dlp/extractor/syfy.py b/yt_dlp/extractor/syfy.py deleted file mode 100644 index a32b50080f..0000000000 --- a/yt_dlp/extractor/syfy.py +++ /dev/null @@ -1,58 +0,0 @@ -from .adobepass import AdobePassIE -from ..utils import ( - smuggle_url, - update_url_query, -) - - -class SyfyIE(AdobePassIE): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', - 'info_dict': { - 'id': '2968097', - 'ext': 'mp4', - 'title': 'The Internet Ruined My Life: Season 1 Trailer', - 'description': 'One tweet, one post, one click, can destroy everything.', - 'uploader': 'NBCU-MPAT', - 'upload_date': '20170113', - 'timestamp': 1484345640, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - 'skip': 'Redirects to main page', - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - syfy_mpx = next(iter(self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), - display_id)['syfy']['syfy_mpx'].values())) - video_id = syfy_mpx['mpxGUID'] - title = syfy_mpx['episodeTitle'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if syfy_mpx.get('entitlement') == 'auth': - resource = self._get_mvpd_resource( - 'syfy', title, video_id, - syfy_mpx.get('mpxRating', 'TV-14')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, 'syfy', resource) - - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query( - self._proto_relative_url(syfy_mpx['releaseURL']), query), - {'force_smil_url': True}), - 'title': title, - 'id': video_id, - 'display_id': display_id, - } diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index e4c31da4e2..f5900194f1 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -191,12 +191,12 @@ class TapTapAppIE(TapTapBaseIE): }] -class TapTapIntlBase(TapTapBaseIE): +class TapTapIntlBaseIE(TapTapBaseIE): _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' -class TapTapAppIntlIE(TapTapIntlBase): +class TapTapAppIntlIE(TapTapIntlBaseIE): _VALID_URL = r'https?://www\.taptap\.io/app/(?P<id>\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' _DATA_PATH = 'app' @@ -227,7 +227,7 @@ class TapTapAppIntlIE(TapTapIntlBase): }] -class TapTapPostIntlIE(TapTapIntlBase): +class TapTapPostIntlIE(TapTapIntlBaseIE): _VALID_URL = r'https?://www\.taptap\.io/post/(?P<id>\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail' _INFO_QUERY_KEY = 'id_str' diff --git a/yt_dlp/extractor/tbs.py b/yt_dlp/extractor/tbs.py index 9b9aa50d37..80534731e1 100644 ---
a/yt_dlp/extractor/tbs.py +++ b/yt_dlp/extractor/tbs.py @@ -32,6 +32,10 @@ class TBSIE(TurnerBaseIE): 'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope', 'only_matching': True, }] + _SOFTWARE_STATEMENT_MAP = { + 'tbs': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJkZTA0NTYxZS1iMTFhLTRlYTgtYTg5NC01NjI3MGM1NmM2MWIiLCJuYmYiOjE1MzcxODkzOTAsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTM3MTg5MzkwfQ.Z7ny66kaqNDdCHf9Y9KsV12LrBxrLkGGxlYe2XGm6qsw2T-k1OCKC1TMzeqiZP735292MMRAQkcJDKrMIzNbAuf9nCdIcv4kE1E2nqUnjPMBduC1bHffZp8zlllyrN2ElDwM8Vhwv_5nElLRwWGEt0Kaq6KJAMZA__WDxKWC18T-wVtsOZWXQpDqO7nByhfj2t-Z8c3TUNVsA_wHgNXlkzJCZ16F2b7yGLT5ZhLPupOScd3MXC5iPh19HSVIok22h8_F_noTmGzmMnIRQi6bWYWK2zC7TQ_MsYHfv7V6EaG5m1RKZTV6JAwwoJQF_9ByzarLV1DGwZxD9-eQdqswvg', + 'tntdrama': 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIwOTMxYTU4OS1jZjEzLTRmNjMtYTJmYy03MzhjMjE1NWU5NjEiLCJuYmYiOjE1MzcxOTA4MjcsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTM3MTkwODI3fQ.AucKvtws7oekTXi80_zX4-BlgJD9GLvlOI9FlBCjdlx7Pa3eJ0AqbogynKMiatMbnLOTMHGjd7tTiq422unmZjBz70dhePAe9BbW0dIo7oQ57vZ-VBYw_tWYRPmON61MwAbLVlqROD3n_zURs85S8TlkQx9aNx9x_riGGELjd8l05CVa_pOluNhYvuIFn6wmrASOKI1hNEblBDWh468UWP571-fe4zzi0rlYeeHd-cjvtWvOB3bQsWrUVbK4pRmqvzEH59j0vNF-ihJF9HncmUicYONe47Mib3elfMok23v4dB1_UAlQY_oawfNcynmEnJQCcqFmbHdEwTW6gMiYsA', + } def _real_extract(self, url): site, path, display_id = self._match_valid_url(url).groups() @@ -48,7 +52,7 @@ def _real_extract(self, url): drupal_settings['ngtv_token_url']).query) info = self._extract_ngtv_info( - media_id, tokenizer_query, { + media_id, tokenizer_query, self._SOFTWARE_STATEMENT_MAP[site], { 'url': url, 'site_name': site[:3].upper(), 'auth_required': video_data.get('authRequired') == '1' or is_live, diff --git a/yt_dlp/extractor/teamcoco.py b/yt_dlp/extractor/teamcoco.py index a94ff9b332..ae8fd7a442 100644 --- a/yt_dlp/extractor/teamcoco.py +++ b/yt_dlp/extractor/teamcoco.py @@ -156,6 +156,7 @@ def _real_extract(self, url): class ConanClassicIE(TeamcocoBaseIE): + _WORKING = False _VALID_URL = r'https?://(?:(?:www\.)?conanclassic|conan25\.teamcoco)\.com/(?P<id>([^/]+/)*[^/?#]+)' _TESTS = [{ 'url': 'https://conanclassic.com/video/ice-cube-kevin-hart-conan-share-lyft', @@ -263,7 +264,7 @@ def _real_extract(self, url): info.update(self._extract_ngtv_info(media_id, { 'accessToken': token, 'accessTokenType': 'jws', - })) + }, None)) # TODO: the None arg needs to be the AdobePass software_statement else: formats, subtitles = self._get_formats_and_subtitles( traverse_obj(response, ('data', 'findRecordVideoMetadata')), video_id) diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py index 9ef621446d..a34f2afd4a 100644 --- a/yt_dlp/extractor/telecinco.py +++ b/yt_dlp/extractor/telecinco.py @@ -46,7 +46,7 @@ def _parse_content(self, content, url): error_code = traverse_obj( self._webpage_read_content(error.cause.response, caronte['cerbero'], video_id, fatal=False), ({json.loads}, 'code', {int})) - if error_code == 4038: + if error_code in (4038, 40313): self.raise_geo_restricted(countries=['ES']) raise diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index b73bea18fd..0f8cdfed99 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -4,7 +4,6 @@ import time from .adobepass import AdobePassIE -from .once import OnceIE from ..networking import HEADRequest, Request from ..utils import ( ExtractorError, @@ -13,11 +12,13 @@ float_or_none, int_or_none, mimetype2ext, + parse_age_limit, parse_qs, traverse_obj, unsmuggle_url, update_url, update_url_query, + url_or_none,
urlhandle_detect_ext, xpath_with_ns, ) @@ -26,7 +27,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(OnceIE): +class ThePlatformBaseIE(AdobePassIE): _TP_TLD = 'com' def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): @@ -54,82 +55,70 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d formats = [] for _format in smil_formats: - if OnceIE.suitable(_format['url']): - formats.extend(self._extract_once_formats(_format['url'])) - else: - media_url = _format['url'] - if determine_ext(media_url) == 'm3u8': - hdnea2 = self._get_cookies(media_url).get('hdnea2') - if hdnea2: - _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) + media_url = _format['url'] + if determine_ext(media_url) == 'm3u8': + hdnea2 = self._get_cookies(media_url).get('hdnea2') + if hdnea2: + _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value}) - formats.append(_format) + formats.append(_format) return formats, subtitles - def _download_theplatform_metadata(self, path, video_id): - info_url = f'http://link.theplatform.{self._TP_TLD}/s/{path}?format=preview' - return self._download_json(info_url, video_id) + def _download_theplatform_metadata(self, path, video_id, fatal=True): + return self._download_json( + f'https://link.theplatform.{self._TP_TLD}/s/{path}', video_id, + fatal=fatal, query={'format': 'preview'}) or {} - def _parse_theplatform_metadata(self, info): - subtitles = {} - captions = info.get('captions') - if isinstance(captions, list): - for caption in captions: - lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles.setdefault(lang, []).append({ - 'ext': mimetype2ext(mime), - 'url': src, - }) + @staticmethod + def _parse_theplatform_metadata(tp_metadata): + def site_specific_filter(*fields): + return lambda k, v: v and k.endswith(tuple(f'${f}' for f in fields)) - duration = info.get('duration') - tp_chapters = info.get('chapters', []) - chapters = [] - if tp_chapters: - def _add_chapter(start_time, end_time): - start_time = float_or_none(start_time, 1000) - end_time = float_or_none(end_time, 1000) - if start_time is None or end_time is None: - return - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - }) + info = traverse_obj(tp_metadata, { + 'title': ('title', {str}), + 'episode': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('defaultThumbnailUrl', {url_or_none}), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'timestamp': ('pubDate', {float_or_none(scale=1000)}), + 'uploader': ('billingCode', {str}), + 'creators': ('author', {str}, filter, all, filter), + 'categories': ( + 'categories', lambda _, v: v.get('label') in ['category', None], + 'name', {str}, filter, all, filter), + 'tags': ('keywords', {str}, filter, {lambda x: re.split(r'[;,]\s?', x)}, filter), + 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}, any), + 'season_number': (site_specific_filter('seasonNumber'), {int_or_none}, any), + 'episode_number': (site_specific_filter('episodeNumber', 'airOrder'), {int_or_none}, any), + 'series': (site_specific_filter('show', 'seriesTitle', 'seriesShortTitle'), (None, ...), {str}, any), + 'location': (site_specific_filter('region'), {str}, any), + 'media_type': (site_specific_filter('programmingType', 'type'), {str}, any), + }) - for chapter in tp_chapters[:-1]: - _add_chapter(chapter.get('startTime'), chapter.get('endTime')) - 
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + chapters = traverse_obj(tp_metadata, ('chapters', ..., { + 'start_time': ('startTime', {float_or_none(scale=1000)}), + 'end_time': ('endTime', {float_or_none(scale=1000)}), + })) + # Ignore pointless single chapters from short videos that span the entire video's duration + if len(chapters) > 1 or traverse_obj(chapters, (0, 'end_time')): + info['chapters'] = chapters - def extract_site_specific_field(field): - # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber' - return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False) + info['subtitles'] = {} + for caption in traverse_obj(tp_metadata, ('captions', lambda _, v: url_or_none(v['src']))): + info['subtitles'].setdefault(caption.get('lang') or 'en', []).append({ + 'url': caption['src'], + 'ext': mimetype2ext(caption.get('type')), + }) - return { - 'title': info['title'], - 'subtitles': subtitles, - 'description': info['description'], - 'thumbnail': info['defaultThumbnailUrl'], - 'duration': float_or_none(duration, 1000), - 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, - 'uploader': info.get('billingCode'), - 'chapters': chapters, - 'creator': traverse_obj(info, ('author', {str})) or None, - 'categories': traverse_obj(info, ( - 'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None, - 'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})), - 'location': extract_site_specific_field('region'), - 'series': extract_site_specific_field('show') or extract_site_specific_field('seriesTitle'), - 'season_number': int_or_none(extract_site_specific_field('seasonNumber')), - 'episode_number': int_or_none(extract_site_specific_field('episodeNumber')), - 'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'), - } + return info def _extract_theplatform_metadata(self, path, video_id): info = self._download_theplatform_metadata(path, video_id) return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): +class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
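The rewritten _parse_theplatform_metadata above replaces the removed extract_site_specific_field() helper with a site_specific_filter() predicate, letting a single traverse_obj call branch over the site-prefixed keys (e.g. 'cbc$seasonNumber') that theplatform customers add to the preview metadata. A rough standalone sketch of the technique, assuming a yt-dlp checkout on the import path; the metadata dict below is invented for illustration:

from yt_dlp.utils.traversal import traverse_obj

def site_specific_filter(*fields):
    # Match any dict key ending in '$<field>', e.g. 'cbc$seasonNumber' or 'nbc$airOrder'
    return lambda k, v: v and k.endswith(tuple(f'${f}' for f in fields))

metadata = {'title': 'Example', 'cbc$seasonNumber': '3', 'cbc$episodeNumber': '7'}
# Branch over every matching key, coerce each hit to int, and keep the first result
season = traverse_obj(metadata, (site_specific_filter('seasonNumber'), {int}, any))
assert season == 3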
diff --git a/yt_dlp/extractor/toutiao.py b/yt_dlp/extractor/toutiao.py new file mode 100644 index 0000000000..b2a5aa2362 --- /dev/null +++ b/yt_dlp/extractor/toutiao.py @@ -0,0 +1,121 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_call, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj + + +class ToutiaoIE(InfoExtractor): + IE_NAME = 'toutiao' + IE_DESC = '今日头条' + + _VALID_URL = r'https?://www\.toutiao\.com/video/(?P<id>\d+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.toutiao.com/video/7505382061495176511/', + 'info_dict': { + 'id': '7505382061495176511', + 'ext': 'mp4', + 'title': '新疆多地现不明飞行物,目击者称和月亮一样亮,几秒内突然加速消失,气象部门回应', + 'comment_count': int, + 'duration': 9.753, + 'like_count': int, + 'release_date': '20250517', + 'release_timestamp': 1747483344, + 'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$', + 'uploader': '极目新闻', + 'uploader_id': 'MS4wLjABAAAAeateBb9Su8I3MJOZozmvyzWktmba5LMlliRDz1KffnM', + 'view_count': int, + }, + }, { + 'url': 'https://www.toutiao.com/video/7479446610359878153/', + 'info_dict': { + 'id': '7479446610359878153', + 'ext': 'mp4', + 'title': '小伙竟然利用两块磁铁制作成磁力减震器,简直太有创意了!', + 'comment_count': int, + 'duration': 118.374, + 'like_count': int, + 'release_date': '20250308', + 'release_timestamp': 1741444368, + 'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$', + 'uploader': '小莉创意发明', + 'uploader_id': 'MS4wLjABAAAA4f7d4mwtApALtHIiq-QM20dwXqe32NUz0DeWF7wbHKw', + 'view_count': int, + }, + }] + + def _real_initialize(self): + if self._get_cookies('https://www.toutiao.com').get('ttwid'): + return + + urlh = self._request_webpage( + 'https://ttwid.bytedance.com/ttwid/union/register/', None, + 'Fetching ttwid', 'Unable to fetch ttwid', headers={ + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'aid': 24, + 'needFid': False, + 'region': 'cn', + 'service': 'www.toutiao.com', + 'union': True, + }).encode(), + ) + + if ttwid := try_call(lambda: self._get_cookies(urlh.url)['ttwid'].value): + self._set_cookie('.toutiao.com', 'ttwid', ttwid) + return + + self.raise_login_required() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = traverse_obj(webpage, ( + {find_element(tag='script', id='RENDER_DATA')}, + {urllib.parse.unquote}, {json.loads}, 'data', 'initialVideo', + )) + + formats = [] + for video in traverse_obj(video_data, ( + 'videoPlayInfo', 'video_list', lambda _, v: v['main_url'], + )): + formats.append({ + 'url': video['main_url'], + **traverse_obj(video, ('video_meta', { + 'acodec': ('audio_profile', {str}), + 'asr': ('audio_sample_rate', {int_or_none}), + 'audio_channels': ('audio_channels', {float_or_none}, {int_or_none}), + 'ext': ('vtype', {str}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('definition', {str}), + 'fps': ('fps', {int_or_none}), + 'height': ('vheight', {int_or_none}), + 'tbr': ('real_bitrate', {float_or_none(scale=1000)}), + 'vcodec': ('codec_type', {str}), + 'width': ('vwidth', {int_or_none}), + })), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'comment_count': ('commentCount', {int_or_none}), + 'duration': ('videoPlayInfo', 'video_duration', {float_or_none}), + 'like_count': ('repinCount', {int_or_none}), + 'release_timestamp': ('publishTime', {int_or_none}), + 'thumbnail': (('poster', 'coverUrl'), {url_or_none}, any), + 'title': ('title', {str}),
'uploader': ('userInfo', 'name', {str}), + 'uploader_id': ('userInfo', 'userId', {str_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'webpage_url': ('detailUrl', {url_or_none}), + }), + } diff --git a/yt_dlp/extractor/trutv.py b/yt_dlp/extractor/trutv.py index cbfe67af25..c1d0cb0d14 100644 --- a/yt_dlp/extractor/trutv.py +++ b/yt_dlp/extractor/trutv.py @@ -20,6 +20,7 @@ class TruTVIE(TurnerBaseIE): 'skip_download': True, }, } + _SOFTWARE_STATEMENT = 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhYzQyOTkwMi0xMDYzLTQyNTQtYWJlYS1iZTY2ODM4MTVmZGIiLCJuYmYiOjE1MzcxOTA4NjgsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTM3MTkwODY4fQ.ewXl5LDMDvvx3nDXV4jCdSwUq_sOluKoOVsIjznAo6Zo4zrGe9rjlZ9DOmQKW66g6VRMexJsJ5vM1EkY8TC5-YcQw_BclK1FPGO1rH3Wf7tX_l0b1BVbSJQKIj9UgqDp_QbGcBXz24kN4So3U22mhs6di9PYyyfG68ccKL2iRprcVKWCslIHwUF-T7FaEqb0K57auilxeW1PONG2m-lIAcZ62DUwqXDWvw0CRoWI08aVVqkkhnXaSsQfLs5Ph1Pfh9Oq3g_epUm9Ss45mq6XM7gbOb5omTcKLADRKK-PJVB_JXnZnlsXbG0ttKE1cTKJ738qu7j4aipYTf-W0nKF5Q' def _real_extract(self, url): series_slug, clip_slug, video_id = self._match_valid_url(url).groups() @@ -39,7 +40,7 @@ def _real_extract(self, url): title = video_data['title'].strip() info = self._extract_ngtv_info( - media_id, {}, { + media_id, {}, self._SOFTWARE_STATEMENT, { 'url': url, 'site_name': 'truTV', 'auth_required': video_data.get('isAuthRequired'), diff --git a/yt_dlp/extractor/turner.py b/yt_dlp/extractor/turner.py index 8b79a8ba9a..4493705e99 100644 --- a/yt_dlp/extractor/turner.py +++ b/yt_dlp/extractor/turner.py @@ -22,7 +22,7 @@ class TurnerBaseIE(AdobePassIE): def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, software_statement, custom_tokenizer_query=None): secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) if not token: @@ -34,7 +34,8 @@ def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, c else: query['videoId'] = content_id if ap_data.get('auth_required'): - query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) + query['accessToken'] = self._extract_mvpd_auth( + ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name'], software_statement) auth = self._download_xml( tokenizer_src, content_id, query=query) error_msg = xpath_text(auth, 'error/msg') @@ -46,7 +47,7 @@ def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, c self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token return video_url + '?hdnea=' + token - def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): + def _extract_cvp_info(self, data_src, video_id, software_statement, path_data={}, ap_data={}, fatal=False): video_data = self._download_xml( data_src, video_id, transform_source=lambda s: fix_xml_ampersands(s).strip(), @@ -101,7 +102,7 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal= video_url = self._add_akamai_spe_token( secure_path_data['tokenizer_src'], secure_path_data['media_src'] + video_url, - content_id, ap_data) + content_id, ap_data, software_statement) elif not re.match('https?://', video_url): base_path_data = path_data.get(ext, path_data.get('default', {})) media_src = base_path_data.get('media_src') @@ -215,10 
+216,12 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal= 'is_live': is_live, } - def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + def _extract_ngtv_info(self, media_id, tokenizer_query, software_statement, ap_data=None): + if not isinstance(ap_data, dict): + ap_data = {} is_live = ap_data.get('is_live') streams_data = self._download_json( - f'http://medium.ngtv.io/media/{media_id}/tv', + f'https://medium.ngtv.io/media/{media_id}/tv', media_id)['media']['tv'] duration = None chapters = [] @@ -230,8 +233,8 @@ def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): continue if stream_data.get('playlistProtection') == 'spe': m3u8_url = self._add_akamai_spe_token( - 'http://token.ngtv.io/token/token_spe', - m3u8_url, media_id, ap_data or {}, tokenizer_query) + 'https://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data, software_statement, tokenizer_query) formats.extend(self._extract_m3u8_formats( m3u8_url, media_id, 'mp4', m3u8_id='hls', live=is_live, fatal=False)) diff --git a/yt_dlp/extractor/tv2dk.py b/yt_dlp/extractor/tv2dk.py index 9cd7606b0a..bad120f7b4 100644 --- a/yt_dlp/extractor/tv2dk.py +++ b/yt_dlp/extractor/tv2dk.py @@ -2,12 +2,13 @@ import re from .common import InfoExtractor +from .jwplatform import JWPlatformIE from ..utils import ( determine_ext, - extract_attributes, js_to_json, url_or_none, ) +from ..utils.traversal import find_element, traverse_obj class TV2DKIE(InfoExtractor): @@ -21,35 +22,46 @@ class TV2DKIE(InfoExtractor): tv2fyn| tv2east| tv2lorry| - tv2nord + tv2nord| + tv2kosmopol )\.dk/ - (:[^/]+/)* + (?:[^/?#]+/)* (?P<id>[^/?\#&]+) ''' _TESTS = [{ 'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player', 'info_dict': { - 'id': '0_52jmwa0p', + 'id': 'sPp5z21q', 'ext': 'mp4', 'title': '19:30 - 28. okt.
2019', - 'timestamp': 1572290248, + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/sPp5z21q/poster.jpg?width=720', + 'timestamp': 1572287400, 'upload_date': '20191028', - 'uploader_id': 'tvsyd', - 'duration': 1347, - 'view_count': int, }, - 'add_ie': ['Kaltura'], }, { 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', 'info_dict': { - 'id': '1_7iwll9n0', + 'id': 'oD9cyq0m', 'ext': 'mp4', - 'upload_date': '20211027', 'title': 'Gadekamp #6 - Højhuse i København', - 'uploader_id': 'tv2lorry', - 'timestamp': 1635345229, + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/oD9cyq0m/poster.jpg?width=720', + 'timestamp': 1635348600, + 'upload_date': '20211027', }, - 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tvsyd.dk/haderslev/x-factor-brodre-fulde-af-selvtillid-er-igen-hjemme-hos-mor-vores-diagnoser-har-vaeret-en-fordel', + 'info_dict': { + 'id': 'x-factor-brodre-fulde-af-selvtillid-er-igen-hjemme-hos-mor-vores-diagnoser-har-vaeret-en-fordel', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.tv2ostjylland.dk/aarhus/dom-kan-fa-alvorlige-konsekvenser', + 'info_dict': { + 'id': 'dom-kan-fa-alvorlige-konsekvenser', + }, + 'playlist_count': 3, }, { 'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi', 'only_matching': True, @@ -71,40 +83,22 @@ class TV2DKIE(InfoExtractor): }, { 'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt', 'only_matching': True, + }, { + 'url': 'https://www.tv2kosmopol.dk/metropolen/chaufforer-beordres-til-at-kore-videre-i-ulovlige-busser-med-rode-advarselslamper', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + search_space = traverse_obj(webpage, {find_element(tag='article')}) or webpage - entries = [] + player_ids = traverse_obj( + re.findall(r'x-data="(?:video_player|simple_player)\(({[^"]+})', search_space), + (..., {js_to_json}, {json.loads}, ('jwpMediaId', 'videoId'), {str})) - def add_entry(partner_id, kaltura_id): - entries.append(self.url_result( - f'kaltura:{partner_id}:{kaltura_id}', 'Kaltura', - video_id=kaltura_id)) - - for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): - video = extract_attributes(video_el) - kaltura_id = video.get('data-entryid') - if not kaltura_id: - continue - partner_id = video.get('data-partnerid') - if not partner_id: - continue - add_entry(partner_id, kaltura_id) - if not entries: - kaltura_id = self._search_regex( - (r'entry_id\s*:\s*["\']([0-9a-z_]+)', - r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') - partner_id = self._search_regex( - (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, - 'partner id') - add_entry(partner_id, kaltura_id) - if len(entries) == 1: - return entries[0] - return self.playlist_result(entries) + return self.playlist_from_matches( + player_ids, video_id, getter=lambda x: f'jwplatform:{x}', ie=JWPlatformIE) class TV2DKBornholmPlayIE(InfoExtractor): diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index f3daf89469..805150db45 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -1,31 +1,70 @@ -from .common import InfoExtractor +from .streaks import StreaksBaseIE from ..utils import ( ExtractorError, + int_or_none, join_nonempty, + make_archive_id, smuggle_url, str_or_none, strip_or_none, - traverse_obj, update_url_query, ) +from ..utils.traversal import require, traverse_obj -class TVerIE(InfoExtractor): +class 
TVerIE(StreaksBaseIE): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature)/)+(?P<id>[a-zA-Z0-9]+)' + _GEO_COUNTRIES = ['JP'] + _GEO_BYPASS = False _TESTS = [{ - 'skip': 'videos are only available for 7 days', - 'url': 'https://tver.jp/episodes/ep83nf3w4p', + # via Streaks backend + 'url': 'https://tver.jp/episodes/epc1hdugbk', 'info_dict': { - 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', - 'series': '家事ヤロウ!!!', - 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'channel': 'テレビ朝日', - 'id': 'ep83nf3w4p', + 'id': 'epc1hdugbk', 'ext': 'mp4', + 'display_id': 'ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068', + 'title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル)', + 'alt_title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル) 日テレ', + 'description': 'md5:2726f742d5e3886edeaf72fb6d740fef', + 'uploader_id': 'tver-ntv', + 'channel': '日テレ', + 'duration': 1158.024, + 'thumbnail': 'https://statics.tver.jp/images/content/thumbnail/episode/xlarge/epc1hdugbk.jpg?v=16', + 'series': '神回だけ見せます!', + 'episode': '#2 壮烈!車大騎馬戦(木曜スペシャル)', + 'episode_number': 2, + 'timestamp': 1736486036, + 'upload_date': '20250110', + 'modified_timestamp': 1736870264, + 'modified_date': '20250114', + 'live_status': 'not_live', + 'release_timestamp': 1651453200, + 'release_date': '20220502', + '_old_archive_ids': ['brightcovenew ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068'], }, - 'add_ie': ['BrightcoveNew'], + }, { + # via Brightcove backend (deprecated) + 'url': 'https://tver.jp/episodes/epc1hdugbk', + 'info_dict': { + 'id': 'ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068', + 'ext': 'mp4', + 'title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル)', + 'alt_title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル) 日テレ', + 'description': 'md5:2726f742d5e3886edeaf72fb6d740fef', + 'uploader_id': '4394098882001', + 'channel': '日テレ', + 'duration': 1158.101, + 'thumbnail': 'https://statics.tver.jp/images/content/thumbnail/episode/xlarge/epc1hdugbk.jpg?v=16', + 'tags': [], + 'series': '神回だけ見せます!', + 'episode': '#2 壮烈!車大騎馬戦(木曜スペシャル)', + 'episode_number': 2, + 'timestamp': 1651388531, + 'upload_date': '20220501', + 'release_timestamp': 1651453200, + 'release_date': '20220502', + }, + 'params': {'extractor_args': {'tver': {'backend': ['brightcove']}}}, }, { 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, @@ -38,26 +77,7 @@ class TVerIE(InfoExtractor): 'id': 'srtxft431v', 'title': '名探偵コナン', }, - 'playlist': [ - { - 'md5': '779ffd97493ed59b0a6277ea726b389e', - 'info_dict': { - 'id': 'ref:conan-1137-241005', - 'ext': 'mp4', - 'title': '名探偵コナン #1137「行列店、味変の秘密」', - 'uploader_id': '5330942432001', - 'tags': [], - 'channel': '読売テレビ', - 'series': '名探偵コナン', - 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', - 'episode': '#1137「行列店、味変の秘密」', - 'duration': 1469.077, - 'timestamp': 1728030405, - 'upload_date': '20241004', - 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', - 'thumbnail': r're:https://.+\.jpg', - }, - }], + 'playlist_mincount': 21, }, { 'url': 'https://tver.jp/series/sru35hwdd2', 'info_dict': { @@ -70,7 +90,11 @@ class TVerIE(InfoExtractor): 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - _HEADERS = {'x-tver-platform-type': 'web'} + _HEADERS = { + 'x-tver-platform-type': 'web', + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + } _PLATFORM_QUERY = {} def _real_initialize(self): @@ -103,6 +127,9 @@ def _yield_episode_ids_for_series(self, series_id):
def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') + backend = self._configuration_arg('backend', ['streaks'])[0] + if backend not in ('brightcove', 'streaks'): + raise ExtractorError(f'Invalid backend value: {backend}', expected=True) if video_type == 'series': series_info = self._call_platform_api( @@ -129,12 +156,6 @@ def _real_extract(self, url): video_info = self._download_json( f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', query={'v': version}, headers={'Referer': 'https://tver.jp/'}) - p_id = video_info['video']['accountID'] - r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) - if not r_id: - raise ExtractorError('Failed to extract reference ID for Brightcove') - if not r_id.isdigit(): - r_id = f'ref:{r_id}' episode = strip_or_none(episode_content.get('title')) series = str_or_none(episode_content.get('seriesTitle')) @@ -161,17 +182,53 @@ def _real_extract(self, url): ] ] - return { - '_type': 'url_transparent', + metadata = { 'title': title, 'series': series, 'episode': episode, # another title, which is considered the "full title" by some viewers 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': provider, - 'description': str_or_none(video_info.get('description')), 'thumbnails': thumbnails, - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), - 'ie_key': 'BrightcoveNew', + **traverse_obj(video_info, { + 'description': ('description', {str}), + 'release_timestamp': ('viewStatus', 'startAt', {int_or_none}), + 'episode_number': ('no', {int_or_none}), + }), + } + + brightcove_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID'), {str}, any)) + if brightcove_id and not brightcove_id.isdecimal(): + brightcove_id = f'ref:{brightcove_id}' + + streaks_id = traverse_obj(video_info, ('streaks', 'videoRefID', {str})) + if streaks_id and not streaks_id.startswith('ref:'): + streaks_id = f'ref:{streaks_id}' + + # Deprecated Brightcove extraction reachable w/extractor-arg or fallback; errors are expected + if backend == 'brightcove' or not streaks_id: + if backend != 'brightcove': + self.report_warning( + 'No STREAKS ID found; falling back to Brightcove extraction', video_id=video_id) + if not brightcove_id: + raise ExtractorError('Unable to extract brightcove reference ID', expected=True) + account_id = traverse_obj(video_info, ( + 'video', 'accountID', {str}, {require('brightcove account ID', expected=True)})) + return { + **metadata, + '_type': 'url_transparent', + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id), + {'geo_countries': ['JP']}), + 'ie_key': 'BrightcoveNew', + } + + return { + **self._extract_from_streaks_api(video_info['streaks']['projectID'], streaks_id, { + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + }), + **metadata, + 'id': video_id, + '_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None, } diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index da3082907e..416cbab3c5 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -513,7 +513,7 @@ def _parse_video(self, video, with_url=True): class TVPVODVideoIE(TVPVODBaseIE): IE_NAME = 'tvp:vod' - _VALID_URL = r'https?://vod\.tvp\.pl/(?P<category>[a-z\d-]+,\d+)/[a-z\d-]+(?P<id>\d+)/?(?:[?#]|$)' + _VALID_URL = r'https?://vod\.tvp\.pl/(?P<category>[a-z\d-]+,\d+)/[a-z\d-]+(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url':
'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', @@ -568,6 +568,9 @@ class TVPVODVideoIE(TVPVODBaseIE): 'live_status': 'is_live', 'thumbnail': 're:https?://.+', }, + }, { + 'url': 'https://vod.tvp.pl/informacje-i-publicystyka,205/konskie-2025-debata-przedwyborcza-odcinki,2028435/odcinek--1,S01E-1,2028419', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py index 1c060cd7a0..0ab926dbdd 100644 --- a/yt_dlp/extractor/tvw.py +++ b/yt_dlp/extractor/tvw.py @@ -1,13 +1,21 @@ import json from .common import InfoExtractor -from ..utils import clean_html, remove_end, unified_timestamp, url_or_none -from ..utils.traversal import traverse_obj +from ..utils import ( + clean_html, + extract_attributes, + parse_qs, + remove_end, + require, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj class TvwIE(InfoExtractor): + IE_NAME = 'tvw' _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)' - _TESTS = [{ 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', 'md5': '9ceb94fe2bb7fd726f74f16356825703', @@ -115,3 +123,43 @@ def _real_extract(self, url): 'is_live': ('eventStatus', {lambda x: x == 'live'}), }), } + + +class TvwTvChannelsIE(InfoExtractor): + IE_NAME = 'tvw:tvchannels' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/tvchannels/air/', + 'info_dict': { + 'id': 'air', + 'ext': 'mp4', + 'title': r're:TVW Cable Channel Live Stream', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://tvw.org/tvchannels/tvw2/', + 'info_dict': { + 'id': 'tvw2', + 'ext': 'mp4', + 'title': r're:TVW-2 Broadcast Channel', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = traverse_obj(webpage, ( + {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes}, + 'src', {parse_qs}, 'encoder', 0, {json.loads}, 'live247URI', {url_or_none}, {require('stream url')})) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True), + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'is_live': True, + } diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index bf9c6348cb..ebc2963b0f 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -1,4 +1,5 @@ import base64 +import hashlib import itertools import re @@ -14,12 +15,14 @@ parse_duration, qualities, str_to_int, - traverse_obj, try_get, unified_timestamp, + update_url_query, + url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import traverse_obj class TwitCastingIE(InfoExtractor): @@ -138,13 +141,7 @@ def _real_extract(self, url): r'data-toggle="true"[^>]+datetime="([^"]+)"', webpage, 'datetime', None)) - stream_server_data = self._download_json( - f'https://twitcasting.tv/streamserver.php?target={uploader_id}&mode=client', video_id, - 'Downloading live info', fatal=False) - is_live = any(f'data-{x}' in webpage for x in ['is-onlive="true"', 'live-type="live"', 'status="online"']) - if not traverse_obj(stream_server_data, 'llfmp4') and
is_live: - self.raise_login_required(method='cookies') base_dict = { 'title': title, @@ -165,30 +162,43 @@ def find_dmu(x): return [data_movie_url] m3u8_urls = (try_get(webpage, find_dmu, list) - or traverse_obj(video_js_data, (..., 'source', 'url')) - or ([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None)) - if not m3u8_urls: - raise ExtractorError('Failed to get m3u8 playlist') + or traverse_obj(video_js_data, (..., 'source', 'url'))) if is_live: - m3u8_url = m3u8_urls[0] - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='hls', - live=True, headers=self._M3U8_HEADERS) + stream_data = self._download_json( + 'https://twitcasting.tv/streamserver.php', + video_id, 'Downloading live info', query={ + 'target': uploader_id, + 'mode': 'client', + 'player': 'pc_web', + }) - if traverse_obj(stream_server_data, ('hls', 'source')): - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='source', - live=True, query={'mode': 'source'}, - note='Downloading source quality m3u8', - headers=self._M3U8_HEADERS, fatal=False)) + password_params = { + 'word': hashlib.md5(video_password.encode()).hexdigest(), + } if video_password else None + + formats = [] + # low: 640x360, medium: 1280x720, high: 1920x1080 + qq = qualities(['low', 'medium', 'high']) + for quality, m3u8_url in traverse_obj(stream_data, ( + 'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): + formats.append({ + 'url': update_url_query(m3u8_url, password_params), + 'format_id': f'hls-{quality}', + 'ext': 'mp4', + 'quality': qq(quality), + 'protocol': 'm3u8', + 'http_headers': self._M3U8_HEADERS, + }) if websockets: qq = qualities(['base', 'mobilesource', 'main']) - streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {} - for mode, ws_url in streams.items(): + for mode, ws_url in traverse_obj(stream_data, ( + 'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): formats.append({ - 'url': ws_url, + 'url': update_url_query(ws_url, password_params), 'format_id': f'ws-{mode}', 'ext': 'mp4', 'quality': qq(mode), @@ -197,10 +207,15 @@ def find_dmu(x): 'protocol': 'websocket_frag', }) + if not formats: + self.raise_login_required() + infodict = { 'formats': formats, '_format_sort_fields': ('source', ), } + elif not m3u8_urls: + raise ExtractorError('Failed to get m3u8 playlist') elif len(m3u8_urls) == 1: formats = self._extract_m3u8_formats( m3u8_urls[0], video_id, 'mp4', headers=self._M3U8_HEADERS) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 44b19ad135..e4f2aec465 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -14,19 +14,20 @@ dict_get, float_or_none, int_or_none, + join_nonempty, make_archive_id, parse_duration, parse_iso8601, parse_qs, qualities, str_or_none, - traverse_obj, try_get, unified_timestamp, update_url_query, url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj, value class TwitchBaseIE(InfoExtractor): @@ -42,10 +43,10 @@ class TwitchBaseIE(InfoExtractor): 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ShareClipRenderStatus': 'f130048a462a0ac86bb54d653c968c514e9ab9ca94db52368c1179e97b0f16eb', 'ChannelCollectionsContent': '447aec6a0cc1e8d0a8d7732d47eb0762c336a2294fdb009e9c9d854e49d484b9', 
'StreamMetadata': 'a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962', 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '49b5b8f268cdeb259d75b58dcb0c1a748e3b575003448a2333dc5cdafd49adad', 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', @@ -186,7 +187,7 @@ def _get_thumbnails(self, thumbnail): 'url': thumbnail, }] if thumbnail else None - def _extract_twitch_m3u8_formats(self, path, video_id, token, signature): + def _extract_twitch_m3u8_formats(self, path, video_id, token, signature, live_from_start=False): formats = self._extract_m3u8_formats( f'{self._USHER_BASE}/{path}/{video_id}.m3u8', video_id, 'mp4', query={ 'allow_source': 'true', @@ -203,7 +204,10 @@ def _extract_twitch_m3u8_formats(self, path, video_id, token, signature): for fmt in formats: if fmt.get('vcodec') and fmt['vcodec'].startswith('av01'): # mpegts does not yet have proper support for av1 - fmt['downloader_options'] = {'ffmpeg_args_out': ['-f', 'mp4']} + fmt.setdefault('downloader_options', {}).update({'ffmpeg_args_out': ['-f', 'mp4']}) + if live_from_start: + fmt.setdefault('downloader_options', {}).update({'ffmpeg_args': ['-live_start_index', '0']}) + fmt['is_from_start'] = True return formats @@ -549,7 +553,8 @@ def _real_extract(self, url): access_token = self._download_access_token(vod_id, 'video', 'id') formats = self._extract_twitch_m3u8_formats( - 'vod', vod_id, access_token['value'], access_token['signature']) + 'vod', vod_id, access_token['value'], access_token['signature'], + live_from_start=self.get_param('live_from_start')) formats.extend(self._extract_storyboard(vod_id, video.get('storyboard'), info.get('duration'))) self._prefer_source(formats) @@ -632,6 +637,10 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): _PAGE_LIMIT = 100 def _entries(self, channel_name, *args): + """ + Subclasses must define _make_variables() and _extract_entry(), + as well as set _OPERATION_NAME, _ENTRY_KIND, _EDGE_KIND, and _NODE_KIND + """ cursor = None variables_common = self._make_variables(channel_name, *args) entries_key = f'{self._ENTRY_KIND}s' @@ -671,7 +680,22 @@ def _entries(self, channel_name, *args): break -class TwitchVideosIE(TwitchPlaylistBaseIE): +class TwitchVideosBaseIE(TwitchPlaylistBaseIE): + _OPERATION_NAME = 'FilterableVideoTower_Videos' + _ENTRY_KIND = 'video' + _EDGE_KIND = 'VideoEdge' + _NODE_KIND = 'Video' + + @staticmethod + def _make_variables(channel_name, broadcast_type, sort): + return { + 'channelOwnerLogin': channel_name, + 'broadcastType': broadcast_type, + 'videoSort': sort.upper(), + } + + +class TwitchVideosIE(TwitchVideosBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P[^/]+)/(?:videos|profile)' _TESTS = [{ @@ -750,11 +774,6 @@ class TwitchVideosIE(TwitchPlaylistBaseIE): 'views': 'Popular', } - _OPERATION_NAME = 'FilterableVideoTower_Videos' - _ENTRY_KIND = 'video' - _EDGE_KIND = 'VideoEdge' - _NODE_KIND = 'Video' - @classmethod def suitable(cls, url): return (False @@ -763,14 +782,6 @@ def suitable(cls, url): TwitchVideosCollectionsIE)) else super().suitable(url)) - @staticmethod - def _make_variables(channel_name, broadcast_type, sort): - return { - 'channelOwnerLogin': channel_name, - 'broadcastType': broadcast_type, - 'videoSort': sort.upper(), - } - 
@staticmethod def _extract_entry(node): return _make_video_result(node) @@ -918,7 +929,7 @@ def _real_extract(self, url): playlist_title=f'{channel_name} - Collections') -class TwitchStreamIE(TwitchBaseIE): +class TwitchStreamIE(TwitchVideosBaseIE): IE_NAME = 'twitch:stream' _VALID_URL = r'''(?x) https?:// @@ -981,6 +992,7 @@ class TwitchStreamIE(TwitchBaseIE): 'skip_download': 'Livestream', }, }] + _PAGE_LIMIT = 1 @classmethod def suitable(cls, url): @@ -994,6 +1006,20 @@ def suitable(cls, url): TwitchClipsIE)) else super().suitable(url)) + @staticmethod + def _extract_entry(node): + if not isinstance(node, dict) or not node.get('id'): + return None + video_id = node['id'] + return { + '_type': 'url', + 'ie_key': TwitchVodIE.ie_key(), + 'id': 'v' + video_id, + 'url': f'https://www.twitch.tv/videos/{video_id}', + 'title': node.get('title'), + 'timestamp': unified_timestamp(node.get('publishedAt')) or 0, + } + def _real_extract(self, url): channel_name = self._match_id(url).lower() @@ -1028,6 +1054,16 @@ def _real_extract(self, url): if not stream: raise UserNotLive(video_id=channel_name) + timestamp = unified_timestamp(stream.get('createdAt')) + + if self.get_param('live_from_start'): + self.to_screen(f'{channel_name}: Extracting VOD to download live from start') + entry = next(self._entries(channel_name, None, 'time'), None) + if entry and entry.pop('timestamp') >= (timestamp or float('inf')): + return entry + self.report_warning( + 'Unable to extract the VOD associated with this livestream', video_id=channel_name) + access_token = self._download_access_token( channel_name, 'stream', 'channelName') @@ -1037,7 +1073,6 @@ def _real_extract(self, url): self._prefer_source(formats) view_count = stream.get('viewers') - timestamp = unified_timestamp(stream.get('createdAt')) sq_user = try_get(gql, lambda x: x[1]['data']['user'], dict) or {} uploader = sq_user.get('displayName') @@ -1083,16 +1118,44 @@ class TwitchClipsIE(TwitchBaseIE): 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': '42850523', + 'id': '396245304', 'display_id': 'FaintLightGullWholeWheat', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', + 'duration': 32, + 'view_count': int, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1465767393, 'upload_date': '20160612', - 'creator': 'EA', - 'uploader': 'stereotype_', - 'uploader_id': '43566419', + 'creators': ['EA'], + 'channel': 'EA', + 'channel_id': '25163635', + 'channel_is_verified': False, + 'channel_follower_count': int, + 'uploader': 'EA', + 'uploader_id': '25163635', + }, + }, { + 'url': 'https://www.twitch.tv/xqc/clip/CulturedAmazingKuduDatSheffy-TiZ_-ixAGYR3y2Uy', + 'md5': 'e90fe616b36e722a8cfa562547c543f0', + 'info_dict': { + 'id': '3207364882', + 'display_id': 'CulturedAmazingKuduDatSheffy-TiZ_-ixAGYR3y2Uy', + 'ext': 'mp4', + 'title': 'A day in the life of xQc', + 'duration': 60, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1742869615, + 'upload_date': '20250325', + 'creators': ['xQc'], + 'channel': 'xQc', + 'channel_id': '71092938', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'uploader': 'xQc', + 'uploader_id': '71092938', + 'categories': ['Just Chatting'], }, }, { # multiple formats @@ -1116,16 +1179,14 @@ class TwitchClipsIE(TwitchBaseIE): }] def _real_extract(self, url): - video_id = self._match_id(url) + slug = self._match_id(url) clip = self._download_gql( - video_id, [{ - 'operationName': 'VideoAccessToken_Clip', - 
'variables': { - 'slug': video_id, - }, + slug, [{ + 'operationName': 'ShareClipRenderStatus', + 'variables': {'slug': slug}, }], - 'Downloading clip access token GraphQL')[0]['data']['clip'] + 'Downloading clip GraphQL')[0]['data']['clip'] if not clip: raise ExtractorError( @@ -1135,81 +1196,71 @@ def _real_extract(self, url): 'sig': clip['playbackAccessToken']['signature'], 'token': clip['playbackAccessToken']['value'], } - - data = self._download_base_gql( - video_id, { - 'query': '''{ - clip(slug: "%s") { - broadcaster { - displayName - } - createdAt - curator { - displayName - id - } - durationSeconds - id - tiny: thumbnailURL(width: 86, height: 45) - small: thumbnailURL(width: 260, height: 147) - medium: thumbnailURL(width: 480, height: 272) - title - videoQualities { - frameRate - quality - sourceURL - } - viewCount - } -}''' % video_id}, 'Downloading clip GraphQL', fatal=False) # noqa: UP031 - - if data: - clip = try_get(data, lambda x: x['data']['clip'], dict) or clip + asset_default = traverse_obj(clip, ('assets', 0, {dict})) or {} + asset_portrait = traverse_obj(clip, ('assets', 1, {dict})) or {} formats = [] - for option in clip.get('videoQualities', []): - if not isinstance(option, dict): - continue - source = url_or_none(option.get('sourceURL')) - if not source: - continue + default_aspect_ratio = float_or_none(asset_default.get('aspectRatio')) + formats.extend(traverse_obj(asset_default, ('videoQualities', lambda _, v: url_or_none(v['sourceURL']), { + 'url': ('sourceURL', {update_url_query(query=access_query)}), + 'format_id': ('quality', {str}), + 'height': ('quality', {int_or_none}), + 'fps': ('frameRate', {float_or_none}), + 'aspect_ratio': {value(default_aspect_ratio)}, + }))) + portrait_aspect_ratio = float_or_none(asset_portrait.get('aspectRatio')) + for source in traverse_obj(asset_portrait, ('videoQualities', lambda _, v: url_or_none(v['sourceURL']))): formats.append({ - 'url': update_url_query(source, access_query), - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - 'fps': int_or_none(option.get('frameRate')), + 'url': update_url_query(source['sourceURL'], access_query), + 'format_id': join_nonempty('portrait', source.get('quality')), + 'height': int_or_none(source.get('quality')), + 'fps': float_or_none(source.get('frameRate')), + 'aspect_ratio': portrait_aspect_ratio, + 'quality': -2, }) thumbnails = [] - for thumbnail_id in ('tiny', 'small', 'medium'): - thumbnail_url = clip.get(thumbnail_id) - if not thumbnail_url: - continue - thumb = { - 'id': thumbnail_id, - 'url': thumbnail_url, - } - mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url) - if mobj: - thumb.update({ - 'height': int(mobj.group(2)), - 'width': int(mobj.group(1)), - }) - thumbnails.append(thumb) + thumb_asset_default_url = url_or_none(asset_default.get('thumbnailURL')) + if thumb_asset_default_url: + thumbnails.append({ + 'id': 'default', + 'url': thumb_asset_default_url, + 'preference': 0, + }) + if thumb_asset_portrait_url := url_or_none(asset_portrait.get('thumbnailURL')): + thumbnails.append({ + 'id': 'portrait', + 'url': thumb_asset_portrait_url, + 'preference': -1, + }) + thumb_default_url = url_or_none(clip.get('thumbnailURL')) + if thumb_default_url and thumb_default_url != thumb_asset_default_url: + thumbnails.append({ + 'id': 'small', + 'url': thumb_default_url, + 'preference': -2, + }) old_id = self._search_regex(r'%7C(\d+)(?:-\d+)?.mp4', formats[-1]['url'], 'old id', default=None) return { - 'id': clip.get('id') or video_id, + 'id': clip.get('id') or 
slug, '_old_archive_ids': [make_archive_id(self, old_id)] if old_id else None, - 'display_id': video_id, - 'title': clip.get('title'), + 'display_id': slug, 'formats': formats, - 'duration': int_or_none(clip.get('durationSeconds')), - 'view_count': int_or_none(clip.get('viewCount')), - 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, - 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], str), - 'uploader': try_get(clip, lambda x: x['curator']['displayName'], str), - 'uploader_id': try_get(clip, lambda x: x['curator']['id'], str), + **traverse_obj(clip, { + 'title': ('title', {str}), + 'duration': ('durationSeconds', {int_or_none}), + 'view_count': ('viewCount', {int_or_none}), + 'timestamp': ('createdAt', {parse_iso8601}), + 'creators': ('broadcaster', 'displayName', {str}, filter, all), + 'channel': ('broadcaster', 'displayName', {str}), + 'channel_id': ('broadcaster', 'id', {str}), + 'channel_follower_count': ('broadcaster', 'followers', 'totalCount', {int_or_none}), + 'channel_is_verified': ('broadcaster', 'isPartner', {bool}), + 'uploader': ('curator', 'displayName', {str}), + 'uploader_id': ('curator', 'id', {str}), + 'categories': ('game', 'displayName', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index b76cfae1db..65182b971b 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -20,7 +20,6 @@ remove_end, str_or_none, strip_or_none, - traverse_obj, truncate_string, try_call, try_get, @@ -29,6 +28,7 @@ url_or_none, xpath_text, ) +from ..utils.traversal import require, traverse_obj class TwitterBaseIE(InfoExtractor): @@ -1221,20 +1221,10 @@ class TwitterIE(TwitterBaseIE): }] _MEDIA_ID_RE = re.compile(r'_video/(\d+)/') - - @property - def _GRAPHQL_ENDPOINT(self): - if self.is_logged_in: - return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail' - return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' + _GRAPHQL_ENDPOINT = '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' def _graphql_to_legacy(self, data, twid): - result = traverse_obj(data, ( - 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', - lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', - 'tweet_results', 'result', ('tweet', None), {dict}, - ), default={}, get_all=False) if self.is_logged_in else traverse_obj( - data, ('tweetResult', 'result', {dict}), default={}) + result = traverse_obj(data, ('tweetResult', 'result', {dict})) or {} typename = result.get('__typename') if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None): @@ -1278,37 +1268,6 @@ def _graphql_to_legacy(self, data, twid): def _build_graphql_query(self, media_id): return { - 'variables': { - 'focalTweetId': media_id, - 'includePromotedContent': True, - 'with_rux_injections': False, - 'withBirdwatchNotes': True, - 'withCommunity': True, - 'withDownvotePerspective': False, - 'withQuickPromoteEligibilityTweetFields': True, - 'withReactionsMetadata': False, - 'withReactionsPerspective': False, - 'withSuperFollowsTweetFields': True, - 'withSuperFollowsUserFields': True, - 'withV2Timeline': True, - 'withVoice': True, - }, - 'features': { - 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': False, - 'interactive_text_enabled': True, - 'responsive_web_edit_tweet_api_enabled': True, - 'responsive_web_enhance_cards_enabled': True, - 'responsive_web_graphql_timeline_navigation_enabled': False, - 'responsive_web_text_conversations_enabled': False, - 
'responsive_web_uc_gql_enabled': True, - 'standardized_nudges_misinfo': True, - 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, - 'tweetypie_unmention_optimization_enabled': True, - 'unified_cards_ad_metadata_container_dynamic_card_content_query_enabled': True, - 'verified_phone_label_enabled': False, - 'vibe_api_enabled': True, - }, - } if self.is_logged_in else { 'variables': { 'tweetId': media_id, 'withCommunity': False, @@ -1383,7 +1342,7 @@ def _extract_status(self, twid): 'tweet_mode': 'extended', }) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + if not isinstance(e.cause, HTTPError) or e.cause.status != 429: raise self.report_warning('Rate-limit exceeded; falling back to syndication endpoint') status = self._call_syndication_api(twid) @@ -1637,8 +1596,8 @@ def _find_dimension(target): class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): IE_NAME = 'twitter:broadcast' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?P<type>broadcasts|events)/(?P<id>\w+)' _TESTS = [{ # untitled Periscope video 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', @@ -1646,6 +1605,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1yNGaQLWpejGj', 'ext': 'mp4', 'title': 'Andrea May Sahouri - Periscope Broadcast', + 'display_id': '1yNGaQLWpejGj', 'uploader': 'Andrea May Sahouri', 'uploader_id': 'andreamsahouri', 'uploader_url': 'https://twitter.com/andreamsahouri', @@ -1653,6 +1613,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20200601', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, }, { 'url': 'https://twitter.com/i/broadcasts/1ZkKzeyrPbaxv', @@ -1660,6 +1622,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1ZkKzeyrPbaxv', 'ext': 'mp4', 'title': 'Starship | SN10 | High-Altitude Flight Test', + 'display_id': '1ZkKzeyrPbaxv', 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', @@ -1667,6 +1630,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20210303', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, }, { 'url': 'https://twitter.com/i/broadcasts/1OyKAVQrgzwGb', @@ -1674,6 +1639,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1OyKAVQrgzwGb', 'ext': 'mp4', 'title': 'Starship Flight Test', + 'display_id': '1OyKAVQrgzwGb', 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', @@ -1681,21 +1647,58 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20230420', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://x.com/i/events/1910629646300762112', + 'info_dict': { + 'id': '1LyxBWDRNqyKN', + 'ext': 'mp4', + 'title': '#ガンニバル ウォッチパーティー', + 'concurrent_view_count': int, + 'display_id': '1910629646300762112', + 'live_status': 'was_live', + 'release_date': '20250423', + 'release_timestamp': 1745409000, + 'tags': ['ガンニバル'], + 'thumbnail': r're:https?://[^?#]+\.jpg\?token=', + 'timestamp': 1745403328, + 'upload_date': '20250423', + 'uploader': 'ディズニープラス公式', + 'uploader_id': 'DisneyPlusJP', + 'uploader_url': 'https://twitter.com/DisneyPlusJP', + 'view_count': 
int, }, }] def _real_extract(self, url): - broadcast_id = self._match_id(url) + broadcast_type, display_id = self._match_valid_url(url).group('type', 'id') + + if broadcast_type == 'events': + timeline = self._call_api( + f'live_event/1/{display_id}/timeline.json', display_id) + broadcast_id = traverse_obj(timeline, ( + 'twitter_objects', 'broadcasts', ..., ('id', 'broadcast_id'), + {str}, any, {require('broadcast ID')})) + else: + broadcast_id = display_id + broadcast = self._call_api( 'broadcasts/show.json', broadcast_id, {'ids': broadcast_id})['broadcasts'][broadcast_id] if not broadcast: raise ExtractorError('Broadcast no longer exists', expected=True) info = self._parse_broadcast_data(broadcast, broadcast_id) - info['title'] = broadcast.get('status') or info.get('title') - info['uploader_id'] = broadcast.get('twitter_username') or info.get('uploader_id') - info['uploader_url'] = format_field(broadcast, 'twitter_username', 'https://twitter.com/%s', default=None) + info.update({ + 'display_id': display_id, + 'title': broadcast.get('status') or info.get('title'), + 'uploader_id': broadcast.get('twitter_username') or info.get('uploader_id'), + 'uploader_url': format_field( + broadcast, 'twitter_username', 'https://twitter.com/%s', default=None), + }) if info['live_status'] == 'is_upcoming': + self.raise_no_formats('This live broadcast has not yet started', expected=True) return info media_key = broadcast['media_key'] @@ -1717,21 +1720,22 @@ class TwitterSpacesIE(TwitterBaseIE): _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/spaces/(?P<id>[0-9a-zA-Z]{13})' _TESTS = [{ - 'url': 'https://twitter.com/i/spaces/1RDxlgyvNXzJL', + 'url': 'https://twitter.com/i/spaces/1OwxWwQOPlNxQ', 'info_dict': { - 'id': '1RDxlgyvNXzJL', + 'id': '1OwxWwQOPlNxQ', 'ext': 'm4a', - 'title': 'King Carlo e la mossa Kansas City per fare il Grande Centro', - 'description': 'Twitter Space participated by annarita digiorgio, Signor Ernesto, Raffaello Colosimo, Simone M. 
Sepe', - 'uploader': r're:Lucio Di Gaetano.*?', - 'uploader_id': 'luciodigaetano', + 'title': 'Everybody in: @mtbarra & @elonmusk discuss the future of EV charging', + 'description': 'Twitter Space participated by Elon Musk', 'live_status': 'was_live', - 'timestamp': 1659877956, - 'upload_date': '20220807', - 'release_timestamp': 1659904215, - 'release_date': '20220807', + 'release_date': '20230608', + 'release_timestamp': 1686256230, + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', + 'timestamp': 1686254250, + 'upload_date': '20230608', + 'uploader': 'Mary Barra', + 'uploader_id': 'mtbarra', }, - 'skip': 'No longer available', + 'params': {'skip_download': 'm3u8'}, }, { # post_live/TimedOut but downloadable 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', @@ -1743,9 +1747,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': 'Google Cloud', 'uploader_id': 'googlecloud', 'live_status': 'post_live', + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1681409554, 'upload_date': '20230413', - 'release_timestamp': 1681839000, + 'release_timestamp': 1681839082, 'release_date': '20230418', 'protocol': 'm3u8', # ffmpeg is forced 'container': 'm4a_dash', # audio-only format fixup is applied @@ -1762,6 +1767,9 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': '息根とめる', 'uploader_id': 'tomeru_ikinone', 'live_status': 'was_live', + 'release_date': '20230601', + 'release_timestamp': 1685617200, + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1685617198, 'upload_date': '20230601', 'protocol': 'm3u8', # ffmpeg is forced @@ -1779,9 +1787,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': 'Candace Owens', 'uploader_id': 'RealCandaceO', 'live_status': 'was_live', + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1723931351, 'upload_date': '20240817', - 'release_timestamp': 1723932000, + 'release_timestamp': 1723932056, 'release_date': '20240817', 'protocol': 'm3u8_native', # not ffmpeg, detected as video space }, @@ -1861,18 +1870,21 @@ def _real_extract(self, url): return { 'id': space_id, - 'title': metadata.get('title'), 'description': f'Twitter Space participated by {participants}', - 'uploader': traverse_obj( - metadata, ('creator_results', 'result', 'legacy', 'name')), - 'uploader_id': traverse_obj( - metadata, ('creator_results', 'result', 'legacy', 'screen_name')), - 'live_status': live_status, - 'release_timestamp': try_call( - lambda: int_or_none(metadata['scheduled_start'], scale=1000)), - 'timestamp': int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, 'http_headers': headers, + 'live_status': live_status, + **traverse_obj(metadata, { + 'title': ('title', {str}), + # started_at is None when stream is_upcoming so fallback to scheduled_start for --wait-for-video + 'release_timestamp': (('started_at', 'scheduled_start'), {int_or_none(scale=1000)}, any), + 'timestamp': ('created_at', {int_or_none(scale=1000)}), + }), + **traverse_obj(metadata, ('creator_results', 'result', 'legacy', { + 'uploader': ('name', {str}), + 'uploader_id': ('screen_name', {str_or_none}), + 'thumbnail': ('profile_image_url_https', {lambda x: x.replace('_normal', '_400x400')}, {url_or_none}), + })), } diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index e8d6ae1289..31393b02a4 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -51,6 +51,8 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'(?:watch|front)\.njpwworld\.com', 
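(A note on the TwitterSpacesIE mapping above: started_at is None while a space is still upcoming, so the branch tuple ('started_at', 'scheduled_start') is tried in order and `any` keeps the first non-None result after the millisecond-to-second conversion. A runnable sketch, assuming a yt-dlp recent enough that int_or_none supports the partial application the hunk itself relies on:)

from yt_dlp.utils import int_or_none
from yt_dlp.utils.traversal import traverse_obj

metadata = {'started_at': None, 'scheduled_start': 1686256230000}
release_timestamp = traverse_obj(metadata, (
    ('started_at', 'scheduled_start'), {int_or_none(scale=1000)}, any))
assert release_timestamp == 1686256230  # scheduled_start wins when started_at is missing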
r'qub\.ca/vrai', r'(?:beta\.)?crunchyroll\.com', + r'viki\.com', + r'deezer\.com', ) _TESTS = [{ @@ -160,6 +162,12 @@ class KnownDRMIE(UnsupportedInfoExtractor): }, { 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, + }, { + 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.deezer.com/playlist/176747451', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py index 3739a37e4f..5ede30deb1 100644 --- a/yt_dlp/extractor/vice.py +++ b/yt_dlp/extractor/vice.py @@ -32,6 +32,7 @@ def _call_api(self, resource, resource_key, resource_id, locale, fields, args='' class ViceIE(ViceBaseIE, AdobePassIE): + _WORKING = False IE_NAME = 'vice' _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})'] @@ -99,6 +100,7 @@ class ViceIE(ViceBaseIE, AdobePassIE): 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', 'only_matching': True, }] + _SOFTWARE_STATEMENT = 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiIwMTVjODBlZC04ZDcxLTQ4ZGEtOTZkZi00NzU5NjIwNzJlYTQiLCJuYmYiOjE2NjgwMTM0ODQsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNjY4MDEzNDg0fQ.CjhUnTrlh-bmYnEFHyC2Y4it5Y_Zfza1x66O4-ki5gBR7JT6aUunYI_YflXomQPACriMpObkITFz4grVaDwdd8Xp9hrQ2R0SwRBdaklkdy1_j68RqSP5PnexJIa0q_ThtOwfRBd5uGcb33nMJ9Qs92W4kVXuca0Ta-i7SJyWgXUaPDlRDdgyCL3hKj5wuM7qUIwrd9A5CMm-j3dMIBCDgw7X6TwRK65eUQe6gTWqcvL2yONHHTpmIfeOTUxGwwKFr29COOTBowm0VJ6HE08xjXCShP08Neusu-JsgkjzhkEbiDE2531EKgfAki_7WCd2JUZVsAsCusv4a1maokk6NA' def _real_extract(self, url): locale, video_id = self._match_valid_url(url).groups() @@ -116,7 +118,7 @@ def _real_extract(self, url): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, rating) query['tvetoken'] = self._extract_mvpd_auth( - url, video_id, 'VICELAND', resource) + url, video_id, 'VICELAND', resource, self._SOFTWARE_STATEMENT) # signature generation algorithm is reverse engineered from signatureGenerator in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in @@ -181,6 +183,7 @@ def _real_extract(self, url): class ViceShowIE(ViceBaseIE): + _WORKING = False IE_NAME = 'vice:show' _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)' _PAGE_SIZE = 25 @@ -221,6 +224,7 @@ def _real_extract(self, url): class ViceArticleIE(ViceBaseIE): + _WORKING = False IE_NAME = 'vice:article' _VALID_URL = r'https?://(?:www\.)?vice\.com/(?P<locale>[^/]+)/article/(?:[0-9a-z]{6}/)?(?P<id>[^?#]+)' diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py deleted file mode 100644 index 75f9cdf2ff..0000000000 --- a/yt_dlp/extractor/viki.py +++ /dev/null @@ -1,346 +0,0 @@ -import hashlib -import hmac -import json -import time - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, - try_get, -) - - -class VikiBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' - _API_URL_TEMPLATE = 'https://api.viki.io%s' - - _DEVICE_ID = '112395910d' - _APP = '100005a' - _APP_VERSION = '6.11.3' - _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472' - - _GEO_BYPASS = False - _NETRC_MACHINE = 'viki' - - _token = None - - _ERRORS = { - 'geo': 'Sorry, this content is not 
available in your region.', - 'upcoming': 'Sorry, this content is not yet available.', - 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', - } - - def _stream_headers(self, timestamp, sig): - return { - 'X-Viki-manufacturer': 'vivo', - 'X-Viki-device-model': 'vivo 1606', - 'X-Viki-device-os-ver': '6.0.1', - 'X-Viki-connection-type': 'WIFI', - 'X-Viki-carrier': '', - 'X-Viki-as-id': '100005a-1625321982-3932', - 'timestamp': str(timestamp), - 'signature': str(sig), - 'x-viki-app-ver': self._APP_VERSION, - } - - def _api_query(self, path, version=4, **kwargs): - path += '?' if '?' not in path else '&' - query = f'/v{version}/{path}app={self._APP}' - if self._token: - query += f'&token={self._token}' - return query + ''.join(f'&{name}={val}' for name, val in kwargs.items()) - - def _sign_query(self, path): - timestamp = int(time.time()) - query = self._api_query(path, version=5) - sig = hmac.new( - self._APP_SECRET.encode('ascii'), f'{query}&t={timestamp}'.encode('ascii'), hashlib.sha1).hexdigest() - return timestamp, sig, self._API_URL_TEMPLATE % query - - def _call_api( - self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True): - if query is None: - timestamp, sig, url = self._sign_query(path) - else: - url = self._API_URL_TEMPLATE % self._api_query(path, version=4) - resp = self._download_json( - url, video_id, note, fatal=fatal, query=query, - data=json.dumps(data).encode() if data else None, - headers=({'x-viki-app-ver': self._APP_VERSION} if data - else self._stream_headers(timestamp, sig) if query is None - else None), expected_status=400) or {} - - self._raise_error(resp.get('error'), fatal) - return resp - - def _raise_error(self, error, fatal=True): - if error is None: - return - msg = f'{self.IE_NAME} said: {error}' - if fatal: - raise ExtractorError(msg, expected=True) - else: - self.report_warning(msg) - - def _check_errors(self, data): - for reason, status in (data.get('blocking') or {}).items(): - if status and reason in self._ERRORS: - message = self._ERRORS[reason] - if reason == 'geo': - self.raise_geo_restricted(msg=message) - elif reason == 'paywall': - if try_get(data, lambda x: x['paywallable']['tvod']): - self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)') - self.raise_login_required(message) - self._raise_error(message) - - def _perform_login(self, username, password): - self._token = self._call_api( - 'sessions.json', None, 'Logging in', fatal=False, - data={'username': username, 'password': password}).get('token') - if not self._token: - self.report_warning('Login Failed: Unable to get session token') - - @staticmethod - def dict_selection(dict_obj, preferred_key): - if preferred_key in dict_obj: - return dict_obj[preferred_key] - return (list(filter(None, dict_obj.values())) or [None])[0] - - -class VikiIE(VikiBaseIE): - IE_NAME = 'viki' - _VALID_URL = rf'{VikiBaseIE._VALID_URL_BASE}(?:videos|player)/(?P<id>[0-9]+v)' - _TESTS = [{ - 'note': 'Free non-DRM video with storyboards in MPD', - 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', - 'info_dict': { - 'id': '1175236v', - 'ext': 'mp4', - 'title': 'Choosing Spouse by Lottery - Episode 1', - 'timestamp': 1606463239, - 'age_limit': 13, - 'uploader': 'FCC', - 'upload_date': '20201127', - }, - }, { - 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', - 'info_dict': { - 'id': '1023585v', - 'ext': 'mp4', - 'title': 'Heirs - Episode 14', - 'uploader': 'SBS Contents Hub', - 
'timestamp': 1385047627, - 'upload_date': '20131121', - 'age_limit': 13, - 'duration': 3570, - 'episode_number': 14, - }, - 'skip': 'Blocked in the US', - }, { - # clip - 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', - 'info_dict': { - 'id': '1067139v', - 'ext': 'mp4', - 'title': "'The Avengers: Age of Ultron' Press Conference", - 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', - 'duration': 352, - 'timestamp': 1430380829, - 'upload_date': '20150430', - 'uploader': 'Arirang TV', - 'like_count': int, - 'age_limit': 0, - }, - 'skip': 'Sorry. There was an error loading this video', - }, { - 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', - 'info_dict': { - 'id': '1048879v', - 'ext': 'mp4', - 'title': 'Ankhon Dekhi', - 'duration': 6512, - 'timestamp': 1408532356, - 'upload_date': '20140820', - 'uploader': 'Spuul', - 'like_count': int, - 'age_limit': 13, - }, - 'skip': 'Blocked in the US', - }, { - # episode - 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '0a53dc252e6e690feccd756861495a8c', - 'info_dict': { - 'id': '44699v', - 'ext': 'mp4', - 'title': 'Boys Over Flowers - Episode 1', - 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4172, - 'timestamp': 1270496524, - 'upload_date': '20100405', - 'uploader': 'group8', - 'like_count': int, - 'age_limit': 13, - 'episode_number': 1, - }, - }, { - # youtube external - 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', - 'md5': '63f8600c1da6f01b7640eee7eca4f1da', - 'info_dict': { - 'id': '50562v', - 'ext': 'webm', - 'title': 'Poor Nastya [COMPLETE] - Episode 1', - 'description': '', - 'duration': 606, - 'timestamp': 1274949505, - 'upload_date': '20101213', - 'uploader': 'ad14065n', - 'uploader_id': 'ad14065n', - 'like_count': int, - 'age_limit': 13, - }, - 'skip': 'Page not found!', - }, { - 'url': 'http://www.viki.com/player/44699v', - 'only_matching': True, - }, { - # non-English description - 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '41faaba0de90483fb4848952af7c7d0d', - 'info_dict': { - 'id': '158036v', - 'ext': 'mp4', - 'uploader': 'I Planet Entertainment', - 'upload_date': '20111122', - 'timestamp': 1321985454, - 'description': 'md5:44b1e46619df3a072294645c770cef36', - 'title': 'Love In Magic', - 'age_limit': 13, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video = self._call_api(f'videos/{video_id}.json', video_id, 'Downloading video JSON', query={}) - self._check_errors(video) - - title = try_get(video, lambda x: x['titles']['en'], str) - episode_number = int_or_none(video.get('number')) - if not title: - title = f'Episode {episode_number}' if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} - container_title = self.dict_selection(container_titles, 'en') - title = f'{container_title} - {title}' - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail['url'], - } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')] - - resp = self._call_api( - f'playback_streams/{video_id}.json?drms=dt3&device_id={self._DEVICE_ID}', - video_id, 'Downloading video streams JSON')['main'][0] - - stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) - subtitles = dict((lang, [{ - 'ext': ext, - 'url': self._API_URL_TEMPLATE % self._api_query( - 
f'videos/{video_id}/auth_subtitles/{lang}.{ext}', stream_id=stream_id), - } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {})) - - mpd_url = resp['url'] - # 720p is hidden in another MPD which can be found in the current manifest content - mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') - mpd_url = self._search_regex( - r'(?mi)<BaseURL>(http.+.mpd)</BaseURL>', mpd_content, 'new manifest', default=mpd_url) - if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url: - # Modify the URL to get 1080p - mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') - formats = self._extract_mpd_formats(mpd_url, video_id) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': self.dict_selection(video.get('descriptions', {}), 'en'), - 'duration': int_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('created_at')), - 'uploader': video.get('author'), - 'uploader_url': video.get('author_url'), - 'like_count': int_or_none(try_get(video, lambda x: x['likes']['count'])), - 'age_limit': parse_age_limit(video.get('rating')), - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'episode_number': episode_number, - } - - -class VikiChannelIE(VikiBaseIE): - IE_NAME = 'viki:channel' - _VALID_URL = rf'{VikiBaseIE._VALID_URL_BASE}(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' - _TESTS = [{ - 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', - 'info_dict': { - 'id': '50c', - 'title': 'Boys Over Flowers', - 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', - }, - 'playlist_mincount': 51, - }, { - 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', - 'info_dict': { - 'id': '1354c', - 'title': 'Poor Nastya [COMPLETE]', - 'description': 'md5:05bf5471385aa8b21c18ad450e350525', - }, - 'playlist_count': 127, - 'skip': 'Page not found', - }, { - 'url': 'http://www.viki.com/news/24569c-showbiz-korea', - 'only_matching': True, - }, { - 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', - 'only_matching': True, - }, { - 'url': 'http://www.viki.com/artists/2141c-shinee', - 'only_matching': True, - }] - - _video_types = ('episodes', 'movies', 'clips', 'trailers') - - def _entries(self, channel_id): - params = { - 'app': self._APP, 'token': self._token, 'only_ids': 'true', - 'direction': 'asc', 'sort': 'number', 'per_page': 30, - } - video_types = self._configuration_arg('video_types') or self._video_types - for video_type in video_types: - if video_type not in self._video_types: - self.report_warning(f'Unknown video_type: {video_type}') - page_num = 0 - while True: - page_num += 1 - params['page'] = page_num - res = self._call_api( - f'containers/{channel_id}/{video_type}.json', channel_id, query=params, fatal=False, - note=f'Downloading {video_type.title()} JSON page {page_num}') - - for video_id in res.get('response') or []: - yield self.url_result(f'https://www.viki.com/videos/{video_id}', VikiIE.ie_key(), video_id) - if not res.get('more'): - break - - def _real_extract(self, url): - channel_id = self._match_id(url) - channel = self._call_api(f'containers/{channel_id}.json', channel_id, 'Downloading channel JSON') - self._check_errors(channel) - return self.playlist_result( - self._entries(channel_id), channel_id, - self.dict_selection(channel['titles'], 'en'), - self.dict_selection(channel['descriptions'], 'en')) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 8a0aaaa464..09497b699d 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -3,6 
+3,7 @@ import itertools import json import re +import time import urllib.parse from .common import InfoExtractor @@ -13,10 +14,12 @@ OnDemandPagedList, clean_html, determine_ext, + filter_dict, get_element_by_class, int_or_none, join_nonempty, js_to_json, + jwt_decode_hs256, merge_dicts, parse_filesize, parse_iso8601, @@ -39,6 +42,18 @@ class VimeoBaseInfoExtractor(InfoExtractor): _NETRC_MACHINE = 'vimeo' _LOGIN_REQUIRED = False _LOGIN_URL = 'https://vimeo.com/log_in' + _REFERER_HINT = ( + 'Cannot download embed-only video without embedding URL. Please call yt-dlp ' + 'with the URL of the page that embeds this video.') + _IOS_CLIENT_AUTH = 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw==' + _IOS_CLIENT_HEADERS = { + 'Accept': 'application/vnd.vimeo.*+json; version=3.4.10', + 'Accept-Language': 'en', + 'User-Agent': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 VimeoNetworking/5.0.0', + } + _IOS_OAUTH_CACHE_KEY = 'oauth-token-ios' + _ios_oauth_token = None + _viewer_info = None @staticmethod def _smuggle_referrer(url, referrer_url): @@ -52,8 +67,21 @@ def _unsmuggle_headers(self, url): headers['Referer'] = data['referer'] return url, data, headers + def _jwt_is_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 120 + + def _fetch_viewer_info(self, display_id=None, fatal=True): + if self._viewer_info and not self._jwt_is_expired(self._viewer_info['jwt']): + return self._viewer_info + + self._viewer_info = self._download_json( + 'https://vimeo.com/_next/viewer', display_id, 'Downloading web token info', + 'Failed to download web token info', fatal=fatal, headers={'Accept': 'application/json'}) + + return self._viewer_info + def _perform_login(self, username, password): - viewer = self._download_json('https://vimeo.com/_next/viewer', None, 'Downloading login token') + viewer = self._fetch_viewer_info() data = { 'action': 'login', 'email': username, @@ -88,13 +116,15 @@ def _get_video_password(self): expected=True) return password - def _verify_video_password(self, video_id, password, token): - url = f'https://vimeo.com/{video_id}' + def _verify_video_password(self, video_id, path=None): + video_password = self._get_video_password() + token = self._fetch_viewer_info(video_id)['xsrft'] + url = join_nonempty('https://vimeo.com', path, video_id, delim='/') try: - return self._download_webpage( + self._request_webpage( f'{url}/password', video_id, 'Submitting video password', data=json.dumps({ - 'password': password, + 'password': video_password, 'token': token, }, separators=(',', ':')).encode(), headers={ 'Accept': '*/*', @@ -106,6 +136,10 @@ def _verify_video_password(self, video_id, password, token): raise ExtractorError('Wrong password', expected=True) raise + def _extract_config_url(self, webpage, **kwargs): + return self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL', **kwargs) + def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs): vimeo_config = self._search_regex( r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', @@ -153,6 +187,7 @@ def _parse_config(self, config, video_id): sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): + # TODO: Also extract 'avc_url'? 
Investigate if there are 'hevc_url', 'av1_url'? manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -233,26 +268,48 @@ def _parse_config(self, config, video_id): 'formats': formats, 'subtitles': subtitles, 'live_status': live_status, - 'release_timestamp': traverse_obj(live_event, ('ingest', 'scheduled_start_time', {parse_iso8601})), + 'release_timestamp': traverse_obj(live_event, ('ingest', ( + ('scheduled_start_time', {parse_iso8601}), + ('start_time', {int_or_none}), + ), any)), # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps # at the same time without actual units specified. '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + def _fetch_oauth_token(self): + if not self._ios_oauth_token: + self._ios_oauth_token = self.cache.load(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY) + + if not self._ios_oauth_token: + self._ios_oauth_token = self._download_json( + 'https://api.vimeo.com/oauth/authorize/client', None, + 'Fetching OAuth token', 'Failed to fetch OAuth token', + headers={ + 'Authorization': f'Basic {self._IOS_CLIENT_AUTH}', + **self._IOS_CLIENT_HEADERS, + }, data=urlencode_postdata({ + 'grant_type': 'client_credentials', + 'scope': 'private public create edit delete interact upload purchased stats', + }, quote_via=urllib.parse.quote))['access_token'] + self.cache.store(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY, self._ios_oauth_token) + + return self._ios_oauth_token + + def _call_videos_api(self, video_id, unlisted_hash=None, **kwargs): return self._download_json( join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', + 'Authorization': f'Bearer {self._fetch_oauth_token()}', + **self._IOS_CLIENT_HEADERS, }, query={ 'fields': ','.join(( - 'config_url', 'created_time', 'description', 'download', 'license', - 'metadata.connections.comments.total', 'metadata.connections.likes.total', - 'release_time', 'stats.plays')), + 'config_url', 'embed_player_config_url', 'player_embed_url', 'download', 'play', + 'files', 'description', 'license', 'release_time', 'created_time', 'stats.plays', + 'metadata.connections.comments.total', 'metadata.connections.likes.total')), }, **kwargs) - def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + def _extract_original_format(self, url, video_id, unlisted_hash=None, api_data=None): # Original/source formats are only available when logged in if not self._get_cookies('https://vimeo.com/').get('vimeo'): return @@ -283,12 +340,8 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, 'quality': 1, } - jwt = jwt or traverse_obj(self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) - if not jwt: - return original_response = api_data or self._call_videos_api( - video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + video_id, unlisted_hash, fatal=False, expected_status=(403, 404)) for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': @@ -327,7 +380,7 @@ class VimeoIE(VimeoBaseInfoExtractor): (?: (?P<u>user)| 
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)?? + (?:(?!event/).*?/)?? (?P<q> (?: play_redirect_hls| @@ -410,6 +463,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'comment_count': int, 'like_count': int, + 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d', }, 'params': { @@ -500,15 +554,16 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'The DMCI', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', - 'timestamp': 1324343742, + 'timestamp': 1324361742, 'upload_date': '20111220', - 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + 'description': 'md5:f37b4ad0f3ded6fa16f38ecde16c3c44', 'duration': 60, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d', 'like_count': int, - 'tags': 'count:11', + 'release_timestamp': 1324361742, + 'release_date': '20111220', }, # 'params': {'format': 'Original'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -521,15 +576,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '393756517', # 'ext': 'mov', 'ext': 'mp4', - 'timestamp': 1582642091, + 'timestamp': 1582660091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', 'uploader': 'Framework Studio', - 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', 'upload_date': '20200225', 'duration': 176, 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d', 'uploader_url': 'https://vimeo.com/frameworkla', + 'comment_count': int, + 'like_count': int, + 'release_timestamp': 1582660091, + 'release_date': '20200225', }, # 'params': {'format': 'source'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -630,7 +688,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, - 'thumbnail': 'https://i.vimeocdn.com/video/default', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/default', 'duration': 10, 'like_count': int, 'uploader_url': 'https://vimeo.com/user20132939', @@ -667,6 +725,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/aliniamedia', 'release_date': '20160329', + 'view_count': int, }, 'params': {'skip_download': True}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -678,18 +737,19 @@ class VimeoIE(VimeoBaseInfoExtractor): # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', - 'description': 'md5:5967e090768a831488f6e74b7821b3c1', + 'description': 'md5:9441e6829ae94f380cc6417d982f63ac', 'uploader_id': 'fireworkchampions', 'uploader': 'Firework Champions', 'upload_date': '20150910', - 'timestamp': 1441901895, + 'timestamp': 1441916295, 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d', 'uploader_url': 'https://vimeo.com/fireworkchampions', - 'tags': 'count:6', 'duration': 229, 'view_count': int, 'like_count': int, 'comment_count': int, + 'release_timestamp': 1441916295, + 'release_date': '20150910', }, 'params': { 'skip_download': True, @@ -820,7 +880,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Raja Virdi', 'uploader_id': 'rajavirdi', 'uploader_url': 'https://vimeo.com/rajavirdi', - 'duration': 309, + 'duration': 
300, 'thumbnail': r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d', }, # 'params': {'format': 'source'}, @@ -860,12 +920,9 @@ def _verify_player_video_password(self, url, video_id, headers): return checked def _extract_from_api(self, video_id, unlisted_hash=None): - viewer = self._download_json( - 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') - for retry in (False, True): try: - video = self._call_videos_api(video_id, viewer['jwt'], unlisted_hash) + video = self._call_videos_api(video_id, unlisted_hash) break except ExtractorError as e: if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 @@ -873,15 +930,14 @@ def _extract_from_api(self, video_id, unlisted_hash=None): self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), ({json.loads}, 'invalid_parameters', ..., 'field'), )): - self._verify_video_password( - video_id, self._get_video_password(), viewer['xsrft']) + self._verify_video_password(video_id) continue raise info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, api_data=video) if source_format: info['formats'].append(source_format) @@ -904,8 +960,7 @@ def _try_album_password(self, url): r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None) if not album_id: return - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + viewer = self._fetch_viewer_info(album_id, fatal=False) if not viewer: webpage = self._download_webpage(url, album_id) viewer = self._parse_json(self._search_regex( @@ -963,9 +1018,7 @@ def _real_extract(self, url): raise errmsg = error.cause.response.read() if b'Because of its privacy settings, this video cannot be played here' in errmsg: - raise ExtractorError( - 'Cannot download embed-only video without embedding URL. 
Please call yt-dlp ' - 'with the URL of the page that embeds this video.', expected=True) + raise ExtractorError(self._REFERER_HINT, expected=True) # 403 == vimeo.com TLS fingerprint or DC IP block; 429 == player.vimeo.com TLS FP block status = error.cause.status dcip_msg = 'If you are using a data center IP or VPN/proxy, your IP may be blocked' @@ -1010,8 +1063,7 @@ def _real_extract(self, url): channel_id = self._search_regex( r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) if channel_id: - config_url = self._html_search_regex( - r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None) + config_url = self._extract_config_url(webpage, default=None) video_description = clean_html(get_element_by_class('description', webpage)) info_dict.update({ 'channel_id': channel_id, @@ -1122,7 +1174,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998', 'upload_date': '20140906', 'timestamp': 1410032453, - 'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'comment_count': int, 'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'duration': 53, @@ -1132,7 +1184,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', @@ -1149,13 +1201,14 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'duration': 121, 'comment_count': int, 'view_count': int, - 'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'like_count': int, + 'tags': 'count:5', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -1237,7 +1290,7 @@ class VimeoUserIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', 'info_dict': { - 'title': 'Nki', + 'title': 'AKAMA', 'id': 'nkistudio', }, 'playlist_mincount': 66, @@ -1303,8 +1356,7 @@ def _fetch_page(self, album_id, authorization, hashed_pass, page): def _real_extract(self, url): album_id = self._match_id(url) - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + viewer = self._fetch_viewer_info(album_id, fatal=False) if not viewer: webpage = self._download_webpage(url, album_id) viewer = self._parse_json(self._search_regex( @@ -1370,10 +1422,10 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'uploader_id': 'user170863801', 'uploader_url': 'https://vimeo.com/user170863801', 'duration': 30, - 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', }, 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Failed to parse XML'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 
'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -1423,12 +1475,8 @@ def _real_extract(self, url): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) - viewer = {} if data.get('isLocked') is True: - video_password = self._get_video_password() - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id) - self._verify_video_password(video_id, video_password, viewer['xsrft']) + self._verify_video_password(video_id) data = self._download_json(data_url, video_id) clip_data = data['clipData'] config_url = clip_data['configUrl'] @@ -1436,7 +1484,7 @@ def _real_extract(self, url): info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', - video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) + video_id, unlisted_hash=clip_data.get('unlistedHash')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) @@ -1528,20 +1576,22 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'description': 'md5:8cf69a1a435f2d763f4adf601e9c3125', 'duration': 1595, 'upload_date': '20130610', - 'timestamp': 1370893156, + 'timestamp': 1370907556, 'license': 'by', - 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'view_count': int, 'comment_count': int, 'like_count': int, - 'tags': 'count:1', + 'release_timestamp': 1370907556, + 'release_date': '20130610', }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # password-protected VimeoPro page with Vimeo player embed 'url': 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion', 'info_dict': { 'id': '764543723', 'ext': 'mp4', 'title': 'Mechanische Systeme in Perfektion: Realität erfassen, Innovation treiben', - 'thumbnail': 'https://i.vimeocdn.com/video/1543784598-a1a750494a485e601110136b9fe11e28c2131942452b3a5d30391cb3800ca8fd-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'description': 'md5:2a9d195cd1b0f6f79827107dc88c2420', 'uploader': 'CADFEM', 'uploader_id': 'cadfem', @@ -1561,6 +1611,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'videopassword': 'Conference2022', 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }] def _real_extract(self, url): @@ -1597,3 +1648,377 @@ def _real_extract(self, url): return self.url_result(vimeo_url, VimeoIE, video_id, url_transparent=True, description=description) + + +class VimeoEventIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:event' + _VALID_URL = r'''(?x) + https?://(?:www\.)?vimeo\.com/event/(?P<id>\d+)(?:/ + (?: + (?:embed/)?(?P<unlisted_hash>[\da-f]{10})| + videos/(?P<video_id>\d+) + ) + )?''' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>https?://vimeo\.com/event/\d+/embed(?:[/?][^"\']*)?)["\'][^>]*>'] + _TESTS = [{ + # stream_privacy.view: 'anybody' + 'url': 
'https://vimeo.com/event/5116195', + 'info_dict': { + 'id': '1082194134', + 'ext': 'mp4', + 'display_id': '5116195', + 'title': 'Skidmore College Commencement 2025', + 'description': 'md5:1902dd5165d21f98aa198297cc729d23', + 'uploader': 'Skidmore College', + 'uploader_id': 'user116066434', + 'uploader_url': 'https://vimeo.com/user116066434', + 'comment_count': int, + 'like_count': int, + 'duration': 9810, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1747502974, + 'upload_date': '20250517', + 'release_timestamp': 1747502998, + 'release_date': '20250517', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # stream_privacy.view: 'embed_only' + 'url': 'https://vimeo.com/event/5034253/embed', + 'info_dict': { + 'id': '1071439154', + 'ext': 'mp4', + 'display_id': '5034253', + 'title': 'Advancing Humans with AI', + 'description': r're:AI is here to stay, but how do we ensure that people flourish in a world of pervasive AI use.{322}$', + 'uploader': 'MIT Media Lab', + 'uploader_id': 'mitmedialab', + 'uploader_url': 'https://vimeo.com/mitmedialab', + 'duration': 23235, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'chapters': 'count:37', + 'release_timestamp': 1744290000, + 'release_date': '20250410', + 'live_status': 'was_live', + }, + 'params': { + 'skip_download': 'm3u8', + 'http_headers': {'Referer': 'https://www.media.mit.edu/events/aha-symposium/'}, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # Last entry on 2nd page of the 37 video playlist, but use clip_to_play_id API param shortcut + 'url': 'https://vimeo.com/event/4753126/videos/1046153257', + 'info_dict': { + 'id': '1046153257', + 'ext': 'mp4', + 'display_id': '4753126', + 'title': 'January 12, 2025 The True Vine (Pastor John Mindrup)', + 'description': 'The True Vine (Pastor \tJohn Mindrup)', + 'uploader': 'Salem United Church of Christ', + 'uploader_id': 'user230181094', + 'uploader_url': 'https://vimeo.com/user230181094', + 'comment_count': int, + 'like_count': int, + 'duration': 4962, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1736702464, + 'upload_date': '20250112', + 'release_timestamp': 1736702543, + 'release_date': '20250112', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # "24/7" livestream + 'url': 'https://vimeo.com/event/4768062', + 'info_dict': { + 'id': '1079901414', + 'ext': 'mp4', + 'display_id': '4768062', + 'title': r're:GRACELAND CAM \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': '24/7 camera at Graceland Mansion', + 'uploader': 'Elvis Presley\'s Graceland', + 'uploader_id': 'visitgraceland', + 'uploader_url': 'https://vimeo.com/visitgraceland', + 'release_timestamp': 1745975450, + 'release_date': '20250430', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'livestream'}, + }, { + # stream_privacy.view: 'unlisted' with unlisted_hash in URL path (stream_privacy.embed: 'whitelist') + 'url': 'https://vimeo.com/event/4259978/3db517c479', + 'info_dict': { + 'id': '939104114', + 'ext': 'mp4', + 'display_id': '4259978', + 'title': 'Enhancing Credibility in Your Community Science Project', + 'description': 'md5:eab953341168b9c146bc3cfe3f716070', + 'uploader': 'NOAA Research', + 'uploader_id': 'noaaresearch', + 'uploader_url': 'https://vimeo.com/noaaresearch', + 'comment_count': int, 
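(How the three named groups in VimeoEventIE._VALID_URL map onto the URL shapes these tests exercise; a quick standalone check with plain re, outside yt-dlp's matching machinery:)

import re

EVENT_URL_RE = r'''(?x)
    https?://(?:www\.)?vimeo\.com/event/(?P<id>\d+)(?:/
        (?:
            (?:embed/)?(?P<unlisted_hash>[\da-f]{10})|
            videos/(?P<video_id>\d+)
        )
    )?'''

m = re.match(EVENT_URL_RE, 'https://vimeo.com/event/4259978/3db517c479')
assert m.group('id', 'unlisted_hash', 'video_id') == ('4259978', '3db517c479', None)
m = re.match(EVENT_URL_RE, 'https://vimeo.com/event/595460/videos/498149131/')
assert m.group('id', 'unlisted_hash', 'video_id') == ('595460', None, '498149131')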
+ 'like_count': int, + 'duration': 3961, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1716408008, + 'upload_date': '20240522', + 'release_timestamp': 1716408062, + 'release_date': '20240522', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # "done" event with video_id in URL and unlisted_hash in VimeoIE URL + 'url': 'https://vimeo.com/event/595460/videos/498149131/', + 'info_dict': { + 'id': '498149131', + 'ext': 'mp4', + 'display_id': '595460', + 'title': '2021 Eighth Annual John Cardinal Foley Lecture on Social Communications', + 'description': 'Replay: https://vimeo.com/catholicphilly/review/498149131/544f26a12f', + 'uploader': 'Kearns Media Consulting LLC', + 'uploader_id': 'kearnsmediaconsulting', + 'uploader_url': 'https://vimeo.com/kearnsmediaconsulting', + 'comment_count': int, + 'like_count': int, + 'duration': 4466, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1612228466, + 'upload_date': '20210202', + 'release_timestamp': 1612228538, + 'release_date': '20210202', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # stream_privacy.view: 'password'; stream_privacy.embed: 'public' + 'url': 'https://vimeo.com/event/4940578', + 'info_dict': { + 'id': '1059263570', + 'ext': 'mp4', + 'display_id': '4940578', + 'title': 'TMAC AKC AGILITY 2-22-2025', + 'uploader': 'Paws \'N Effect', + 'uploader_id': 'pawsneffect', + 'uploader_url': 'https://vimeo.com/pawsneffect', + 'comment_count': int, + 'like_count': int, + 'duration': 33115, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'timestamp': 1740261836, + 'upload_date': '20250222', + 'release_timestamp': 1740261873, + 'release_date': '20250222', + 'live_status': 'was_live', + }, + 'params': { + 'videopassword': '22', + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + # API serves a playlist of 37 videos, but the site only streams the newest one (changes every Sunday) + 'url': 'https://vimeo.com/event/4753126', + 'only_matching': True, + }, { + # Scheduled for 2025.05.15 but never started; "unavailable"; stream_privacy.view: "anybody" + 'url': 'https://vimeo.com/event/5120811/embed', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/event/5112969/embed?muted=1', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/event/5097437/embed/interaction?muted=1', + 'only_matching': True, + }, { + 'url': 'https://vimeo.com/event/5113032/embed?autoplay=1&muted=1', + 'only_matching': True, + }, { + # Ended livestream with video_id + 'url': 'https://vimeo.com/event/595460/videos/507329569/', + 'only_matching': True, + }, { + # stream_privacy.view: 'unlisted' with unlisted_hash in URL path (stream_privacy.embed: 'public') + 'url': 'https://vimeo.com/event/4606123/embed/358d60ce2e', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + # Same result as https://vimeo.com/event/5034253/embed + 'url': 'https://www.media.mit.edu/events/aha-symposium/', + 'info_dict': { + 'id': '1071439154', + 'ext': 'mp4', + 'display_id': '5034253', + 'title': 'Advancing Humans with AI', + 'description': r're:AI is here to stay, but how do we ensure that people flourish in a world of pervasive AI use.{322}$', + 'uploader': 'MIT Media Lab', + 'uploader_id': 'mitmedialab', + 'uploader_url': 
'https://vimeo.com/mitmedialab', + 'duration': 23235, + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', + 'chapters': 'count:37', + 'release_timestamp': 1744290000, + 'release_date': '20250410', + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }] + + _EVENT_FIELDS = ( + 'title', 'uri', 'schedule', 'stream_description', 'stream_privacy.embed', 'stream_privacy.view', + 'clip_to_play.name', 'clip_to_play.uri', 'clip_to_play.config_url', 'clip_to_play.live.status', + 'clip_to_play.privacy.embed', 'clip_to_play.privacy.view', 'clip_to_play.password', + 'streamable_clip.name', 'streamable_clip.uri', 'streamable_clip.config_url', 'streamable_clip.live.status', + ) + _VIDEOS_FIELDS = ('items', 'uri', 'name', 'config_url', 'duration', 'live.status') + + def _call_events_api( + self, event_id, ep=None, unlisted_hash=None, note=None, + fields=(), referrer=None, query=None, headers=None, + ): + resource = join_nonempty('event', ep, note, 'API JSON', delim=' ') + + return self._download_json( + join_nonempty( + 'https://api.vimeo.com/live_events', + join_nonempty(event_id, unlisted_hash, delim=':'), ep, delim='/'), + event_id, f'Downloading {resource}', f'Failed to download {resource}', + query=filter_dict({ + 'fields': ','.join(fields) or [], + # Correct spelling with 4 R's is deliberate + 'referrer': referrer, + **(query or {}), + }), headers=filter_dict({ + 'Accept': 'application/json', + 'Authorization': f'jwt {self._fetch_viewer_info(event_id)["jwt"]}', + 'Referer': referrer, + **(headers or {}), + })) + + @staticmethod + def _extract_video_id_and_unlisted_hash(video): + if not traverse_obj(video, ('uri', {lambda x: x.startswith('/videos/')})): + return None, None + video_id, _, unlisted_hash = video['uri'][8:].partition(':') + return video_id, unlisted_hash or None + + def _vimeo_url_result(self, video_id, unlisted_hash=None, event_id=None): + # VimeoIE can extract more metadata and formats for was_live event videos + return self.url_result( + join_nonempty('https://vimeo.com', video_id, unlisted_hash, delim='/'), VimeoIE, + video_id, display_id=event_id, live_status='was_live', url_transparent=True) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield cls._smuggle_referrer(embed_url, url) + + def _real_extract(self, url): + url, _, headers = self._unsmuggle_headers(url) + # XXX: Keep key name in sync with _unsmuggle_headers + referrer = headers.get('Referer') + event_id, unlisted_hash, video_id = self._match_valid_url(url).group('id', 'unlisted_hash', 'video_id') + + for retry in (False, True): + try: + live_event_data = self._call_events_api( + event_id, unlisted_hash=unlisted_hash, fields=self._EVENT_FIELDS, + referrer=referrer, query={'clip_to_play_id': video_id or '0'}, + headers={'Accept': 'application/vnd.vimeo.*+json;version=3.4.9'}) + break + except ExtractorError as e: + if retry or not isinstance(e.cause, HTTPError) or e.cause.status not in (400, 403): + raise + response = traverse_obj(e.cause.response.read(), ({json.loads}, {dict})) or {} + error_code = response.get('error_code') + if error_code == 2204: + self._verify_video_password(event_id, path='event') + continue + if error_code == 3200: + raise ExtractorError(self._REFERER_HINT, expected=True) + if error_msg := response.get('error'): + raise ExtractorError(f'Vimeo says: {error_msg}', expected=True) + raise + + # stream_privacy.view can be: 
'anybody', 'embed_only', 'nobody', 'password', 'unlisted' + view_policy = live_event_data['stream_privacy']['view'] + if view_policy == 'nobody': + raise ExtractorError('This event has not been made available to anyone', expected=True) + + clip_data = traverse_obj(live_event_data, ('clip_to_play', {dict})) or {} + # live.status can be: 'streaming' (is_live), 'done' (was_live), 'unavailable' (is_upcoming OR dead) + clip_status = traverse_obj(clip_data, ('live', 'status', {str})) + start_time = traverse_obj(live_event_data, ('schedule', 'start_time', {str})) + release_timestamp = parse_iso8601(start_time) + + if clip_status == 'unavailable' and release_timestamp and release_timestamp > time.time(): + self.raise_no_formats(f'This live event is scheduled for {start_time}', expected=True) + live_status = 'is_upcoming' + config_url = None + + elif view_policy == 'embed_only': + webpage = self._download_webpage( + join_nonempty('https://vimeo.com/event', event_id, 'embed', unlisted_hash, delim='/'), + event_id, 'Downloading embed iframe webpage', impersonate=True, headers=headers) + # The _parse_config result will overwrite live_status w/ 'is_live' if livestream is active + live_status = 'was_live' + config_url = self._extract_config_url(webpage) + + else: # view_policy in ('anybody', 'password', 'unlisted') + if video_id: + clip_id, clip_hash = self._extract_video_id_and_unlisted_hash(clip_data) + if video_id == clip_id and clip_status == 'done' and (clip_hash or view_policy != 'unlisted'): + return self._vimeo_url_result(clip_id, clip_hash, event_id) + + video_filter = lambda _, v: self._extract_video_id_and_unlisted_hash(v)[0] == video_id + else: + video_filter = lambda _, v: v['live']['status'] in ('streaming', 'done') + + for page in itertools.count(1): + videos_data = self._call_events_api( + event_id, 'videos', unlisted_hash=unlisted_hash, note=f'page {page}', + fields=self._VIDEOS_FIELDS, referrer=referrer, query={'page': page}, + headers={'Accept': 'application/vnd.vimeo.*;version=3.4.1'}) + + video = traverse_obj(videos_data, ('data', video_filter, any)) + if video or not traverse_obj(videos_data, ('paging', 'next', {str})): + break + + live_status = { + 'streaming': 'is_live', + 'done': 'was_live', + }.get(traverse_obj(video, ('live', 'status', {str}))) + + if not live_status: # requested video_id is unavailable or no videos are available + raise ExtractorError('This event video is unavailable', expected=True) + elif live_status == 'was_live': + return self._vimeo_url_result(*self._extract_video_id_and_unlisted_hash(video), event_id) + config_url = video['config_url'] + + if config_url: # view_policy == 'embed_only' or live_status == 'is_live' + info = filter_dict(self._parse_config( + self._download_json(config_url, event_id, 'Downloading config JSON'), event_id)) + else: # live_status == 'is_upcoming' + info = {'id': event_id} + + if info.get('live_status') == 'post_live': + self.report_warning('This live event recently ended and some formats may not yet be available') + + return { + **traverse_obj(live_event_data, { + 'title': ('title', {str}), + 'description': ('stream_description', {str}), + }), + 'display_id': event_id, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + **info, + } diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index faf3e60b0b..c269802b37 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -300,6 +300,24 @@ class VKIE(VKBaseIE): 'upload_date': '20250130', }, }, + { + 'url': 
'https://vkvideo.ru/video-50883936_456244102', + 'info_dict': { + 'id': '-50883936_456244102', + 'ext': 'mp4', + 'title': 'Добивание Украины // Техник в коме // МОЯ ЗЛОСТЬ №140', + 'description': 'md5:a9bc46181e9ebd0fdd82cef6c0191140', + 'uploader': 'Стас Ай, Как Просто!', + 'uploader_id': '-50883936', + 'comment_count': int, + 'like_count': int, + 'duration': 4651, + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:59', + 'timestamp': 1743333869, + 'upload_date': '20250330', + }, + }, { # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment @@ -540,11 +558,11 @@ def _real_extract(self, url): 'title': ('md_title', {unescapeHTML}), 'description': ('description', {clean_html}, filter), 'thumbnail': ('jpg', {url_or_none}), - 'uploader': ('md_author', {str}), + 'uploader': ('md_author', {unescapeHTML}), 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), 'duration': ('duration', {int_or_none}), 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { - 'title': ('text', {str}), + 'title': ('text', {unescapeHTML}), 'start_time': 'time', }), }), diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index e9b0047a47..995ef21859 100644 --- a/yt_dlp/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py @@ -1,7 +1,6 @@ import urllib.parse from .common import InfoExtractor -from .once import OnceIE from ..utils import ( ExtractorError, int_or_none, @@ -10,7 +9,7 @@ ) -class VoxMediaVolumeIE(OnceIE): +class VoxMediaVolumeIE(InfoExtractor): _VALID_URL = r'https?://volume\.vox-cdn\.com/embed/(?P[0-9a-f]{9})' def _real_extract(self, url): @@ -57,7 +56,8 @@ def _real_extract(self, url): if not provider_video_id: continue if provider_video_type == 'brightcove': - info['formats'] = self._extract_once_formats(provider_video_id) + # TODO: Find embed example or confirm that Vox has stopped using Brightcove + raise ExtractorError('Vox Brightcove embeds are currently unsupported') else: info.update({ '_type': 'url_transparent', @@ -155,20 +155,6 @@ class VoxMediaIE(InfoExtractor): }, }], 'skip': 'Page no longer contain videos', - }, { - # volume embed, Brightcove Once - 'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya', - 'md5': '2dbc77b8b0bff1894c2fce16eded637d', - 'info_dict': { - 'id': '1231c973d', - 'ext': 'mp4', - 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', - 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', - 'timestamp': 1402938000, - 'upload_date': '20140616', - 'duration': 4114, - }, - 'add_ie': ['VoxMediaVolume'], }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vrsquare.py b/yt_dlp/extractor/vrsquare.py new file mode 100644 index 0000000000..9e8740b421 --- /dev/null +++ b/yt_dlp/extractor/vrsquare.py @@ -0,0 +1,185 @@ +import itertools + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + parse_duration, + parse_qs, +) +from ..utils.traversal import ( + find_element, + find_elements, + traverse_obj, +) + + +class VrSquareIE(InfoExtractor): + IE_NAME = 'vrsquare' + IE_DESC = 'VR SQUARE' + + _BASE_URL = 'https://livr.jp' + _VALID_URL = r'https?://livr\.jp/contents/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://livr.jp/contents/P470896661', + 'info_dict': { + 'id': 'P470896661', + 
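+            # The expected duration below comes from parse_duration() applied to the
+            # page's clock-style runtime text (see the traversal in _real_extract
+            # further down); e.g., assuming the page displays "25:15":
+            #   parse_duration('25:15') == 1515.0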
'ext': 'mp4', + 'title': 'そこ曲がったら、櫻坂? 7年間お疲れ様!菅井友香の卒業を祝う会!前半 2022年11月6日放送分', + 'description': 'md5:523726dc835aa8014dfe1e2b38d36cd1', + 'duration': 1515.0, + 'tags': 'count:2', + 'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg', + }, + }, { + 'url': 'https://livr.jp/contents/P589523973', + 'info_dict': { + 'id': 'P589523973', + 'ext': 'mp4', + 'title': '薄闇に仰ぐ しだれ桜の妖艶', + 'description': 'md5:a042f517b2cbb4ed6746707afec4d306', + 'duration': 1084.0, + 'tags': list, + 'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg', + }, + 'skip': 'Paid video', + }, { + 'url': 'https://livr.jp/contents/P316939908', + 'info_dict': { + 'id': 'P316939908', + 'ext': 'mp4', + 'title': '2024年5月16日(木) 「今日は誰に恋をする?」公演 小栗有以 生誕祭', + 'description': 'md5:2110bdcf947f28bd7d06ec420e51b619', + 'duration': 8559.0, + 'tags': list, + 'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg', + }, + 'skip': 'Premium channel subscribers only', + }, { + # Accessible only in the VR SQUARE app + 'url': 'https://livr.jp/contents/P126481458', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + status = self._download_json( + f'{self._BASE_URL}/webApi/contentsStatus/{video_id}', + video_id, 'Checking contents status', fatal=False) + if traverse_obj(status, 'result_code') == '40407': + self.raise_login_required('Unable to access this video') + + try: + web_api = self._download_json( + f'{self._BASE_URL}/webApi/play/url/{video_id}', video_id) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 500: + raise ExtractorError('VR SQUARE app-only videos are not supported', expected=True) + raise + + return { + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': self._html_search_meta('description', webpage), + 'formats': self._extract_m3u8_formats(traverse_obj(web_api, ( + 'urls', ..., 'url', any)), video_id, 'mp4', fatal=False), + 'thumbnail': self._html_search_meta('og:image', webpage), + **traverse_obj(webpage, { + 'duration': ({find_element(cls='layout-product-data-time')}, {parse_duration}), + 'tags': ({find_elements(cls='search-tag')}, ..., {clean_html}), + }), + } + + +class VrSquarePlaylistBaseIE(InfoExtractor): + _BASE_URL = 'https://livr.jp' + + def _fetch_vids(self, source, keys=()): + for url_path in traverse_obj(source, ( + *keys, {find_elements(cls='video', html=True)}, ..., + {extract_attributes}, 'data-url', {str}, filter), + ): + yield self.url_result( + f'{self._BASE_URL}/contents/{url_path.removeprefix("/contents/")}', VrSquareIE) + + def _entries(self, path, display_id, query=None): + for page in itertools.count(1): + ajax = self._download_json( + f'{self._BASE_URL}{path}', display_id, + f'Downloading playlist JSON page {page}', + query={'p': page, **(query or {})}) + yield from self._fetch_vids(ajax, ('contents_render_list', ...)) + if not traverse_obj(ajax, (('has_next', 'hasNext'), {bool}, any)): + break + + +class VrSquareChannelIE(VrSquarePlaylistBaseIE): + IE_NAME = 'vrsquare:channel' + + _VALID_URL = r'https?://livr\.jp/channel/(?P\w+)' + _TESTS = [{ + 'url': 'https://livr.jp/channel/H372648599', + 'info_dict': { + 'id': 'H372648599', + 'title': 'AKB48+チャンネル', + }, + 'playlist_mincount': 502, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return self.playlist_result( + self._entries(f'/ajax/channel/{playlist_id}', 
playlist_id), + playlist_id, self._html_search_meta('og:title', webpage)) + + +class VrSquareSearchIE(VrSquarePlaylistBaseIE): + IE_NAME = 'vrsquare:search' + + _VALID_URL = r'https?://livr\.jp/web-search/?\?(?:[^#]+&)?w=[^#]+' + _TESTS = [{ + 'url': 'https://livr.jp/web-search?w=%23%E5%B0%8F%E6%A0%97%E6%9C%89%E4%BB%A5', + 'info_dict': { + 'id': '#小栗有以', + }, + 'playlist_mincount': 60, + }] + + def _real_extract(self, url): + search_query = parse_qs(url)['w'][0] + + return self.playlist_result( + self._entries('/ajax/web-search', search_query, {'w': search_query}), search_query) + + +class VrSquareSectionIE(VrSquarePlaylistBaseIE): + IE_NAME = 'vrsquare:section' + + _VALID_URL = r'https?://livr\.jp/(?:category|headline)/(?P\w+)' + _TESTS = [{ + 'url': 'https://livr.jp/category/C133936275', + 'info_dict': { + 'id': 'C133936275', + 'title': 'そこ曲がったら、櫻坂?VR', + }, + 'playlist_mincount': 308, + }, { + 'url': 'https://livr.jp/headline/A296449604', + 'info_dict': { + 'id': 'A296449604', + 'title': 'AKB48 アフターVR', + }, + 'playlist_mincount': 22, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return self.playlist_result( + self._fetch_vids(webpage), playlist_id, self._html_search_meta('og:title', webpage)) diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 5def2bacf4..6e5514eefd 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -201,7 +201,35 @@ class VrtNUIE(VRTBaseIE): 'timestamp': 1740373200, 'title': 'Reeks 6 volledig vanaf 3 maart', 'upload_date': '20250224', - '_old_archive_ids': ['canvas pbs-pub-c8a78645-5d3e-468a-89ec-6f3ed5534bd5$vid-242ddfe9-18f5-4e16-ab45-09b122a19251'], + '_old_archive_ids': [ + 'canvas pbs-pub-c8a78645-5d3e-468a-89ec-6f3ed5534bd5$vid-242ddfe9-18f5-4e16-ab45-09b122a19251', + 'ketnet pbs-pub-c8a78645-5d3e-468a-89ec-6f3ed5534bd5$vid-242ddfe9-18f5-4e16-ab45-09b122a19251', + ], + }, + }, { + 'url': 'https://www.vrt.be/vrtmax/a-z/meisjes/6/meisjes-s6a5/', + 'info_dict': { + 'id': 'pbs-pub-97b541ab-e05c-43b9-9a40-445702ef7189$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + 'ext': 'mp4', + 'channel': 'ketnet', + 'description': 'md5:713793f15cbf677f66200b36b7b1ec5a', + 'display_id': 'meisjes-s6a5', + 'duration': 1336.02, + 'episode': 'Week 5', + 'episode_id': '1684157692901', + 'episode_number': 5, + 'season': '6', + 'season_id': '1684157692901', + 'season_number': 6, + 'series': 'Meisjes', + 'thumbnail': 'https://images.vrt.be/orig/2023/05/14/bf526ae0-f1d9-11ed-91d7-02b7b76bf47f.jpg', + 'timestamp': 1685251800, + 'title': 'Week 5', + 'upload_date': '20230528', + '_old_archive_ids': [ + 'canvas pbs-pub-97b541ab-e05c-43b9-9a40-445702ef7189$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + 'ketnet pbs-pub-97b541ab-e05c-43b9-9a40-445702ef7189$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + ], }, }, { 'url': 'https://www.vrt.be/vrtnu/a-z/taboe/3/taboe-s3a4/', @@ -223,7 +251,10 @@ class VrtNUIE(VRTBaseIE): 'timestamp': 1740286800, 'title': 'Mensen met het syndroom van Gilles de la Tourette', 'upload_date': '20250223', - '_old_archive_ids': ['canvas pbs-pub-f50faa3a-1778-46b6-9117-4ba85f197703$vid-547507fe-1c8b-4394-b361-21e627cbd0fd'], + '_old_archive_ids': [ + 'canvas pbs-pub-f50faa3a-1778-46b6-9117-4ba85f197703$vid-547507fe-1c8b-4394-b361-21e627cbd0fd', + 'ketnet pbs-pub-f50faa3a-1778-46b6-9117-4ba85f197703$vid-547507fe-1c8b-4394-b361-21e627cbd0fd', + ], }, }] _NETRC_MACHINE = 'vrtnu' @@ -427,66 +458,8 @@ def _real_extract(self, url): 'display_id': display_id, 'formats': 
formats, 'subtitles': subtitles, - '_old_archive_ids': [make_archive_id('Canvas', video_id)], - } - - -class KetnetIE(VRTBaseIE): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' - _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5', - 'info_dict': { - 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', - 'ext': 'mp4', - 'title': 'Meisjes', - 'episode': 'Reeks 6: Week 5', - 'season': 'Reeks 6', - 'series': 'Meisjes', - 'timestamp': 1685251800, - 'upload_date': '20230528', - }, - 'params': {'skip_download': 'm3u8'}, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - video = self._download_json( - 'https://senior-bff.ketnet.be/graphql', display_id, query={ - 'query': '''{ - video(id: "content/ketnet/nl/%s.model.json") { - description - episodeNr - imageUrl - mediaReference - programTitle - publicationDate - seasonTitle - subtitleVideodetail - titleVideodetail - } -}''' % display_id, # noqa: UP031 - })['data']['video'] - - video_id = urllib.parse.unquote(video['mediaReference']) - data = self._call_api(video_id, 'ketnet@PROD', version='v1') - formats, subtitles = self._extract_formats_and_subtitles(data, video_id) - - return { - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - '_old_archive_ids': [make_archive_id('Canvas', video_id)], - **traverse_obj(video, { - 'title': ('titleVideodetail', {str}), - 'description': ('description', {str}), - 'thumbnail': ('thumbnail', {url_or_none}), - 'timestamp': ('publicationDate', {parse_iso8601}), - 'series': ('programTitle', {str}), - 'season': ('seasonTitle', {str}), - 'episode': ('subtitleVideodetail', {str}), - 'episode_number': ('episodeNr', {int_or_none}), - }), + '_old_archive_ids': [make_archive_id('Canvas', video_id), + make_archive_id('Ketnet', video_id)], } diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py index 03bac66ac6..c1c3af800b 100644 --- a/yt_dlp/extractor/wat.py +++ b/yt_dlp/extractor/wat.py @@ -2,9 +2,11 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, try_get, unified_strdate, ) +from ..utils.traversal import traverse_obj class WatIE(InfoExtractor): @@ -70,8 +72,14 @@ def _real_extract(self, url): error_desc = video_info.get('error_desc') if error_desc: - if video_info.get('error_code') == 'GEOBLOCKED': + error_code = video_info.get('error_code') + if error_code == 'GEOBLOCKED': self.raise_geo_restricted(error_desc, video_info.get('geoList')) + elif error_code == 'DELIVERY_ERROR': + if traverse_obj(video_data, ('delivery', 'code')) == 500: + self.report_drm(video_id) + error_desc = join_nonempty( + error_desc, traverse_obj(video_data, ('delivery', 'error', {str})), delim=': ') raise ExtractorError(error_desc, expected=True) title = video_info['title'] diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 6d4bd46e25..420ac38299 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -109,7 +109,7 @@ def _parse_video_info(self, video_info): **traverse_obj(video_info, { 'display_id': ('mblogid', {str_or_none}), 'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), - {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}, filter), + {lambda x: x.replace('\n', ' ')}, {truncate_string(left=72)}, filter), 'alt_title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, filter), 'description': ('text_raw', {str}), 'duration': ('page_info', 'media_info', 'duration', {int_or_none}), @@ -213,6 
+213,7 @@ class WeiboVideoIE(WeiboBaseIE): 'ext': 'mp4', 'display_id': 'LEZDodaiW', 'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了', + 'alt_title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了', 'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM \u200b\u200b\u200b', 'duration': 76, 'timestamp': 1659344278, @@ -224,6 +225,7 @@ class WeiboVideoIE(WeiboBaseIE): 'view_count': int, 'like_count': int, 'repost_count': int, + '_old_archive_ids': ['weibomobile 4797700463137878'], }, }] diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py index 53ad1100d2..42b1189fe8 100644 --- a/yt_dlp/extractor/weverse.py +++ b/yt_dlp/extractor/weverse.py @@ -290,12 +290,14 @@ def _real_extract(self, url): elif live_status == 'is_live': video_info = self._call_api( - f'/video/v1.2/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + f'/video/v1.3/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', video_id, note='Downloading live JSON') playback = self._parse_json(video_info['lipPlayback'], video_id) m3u8_url = traverse_obj(playback, ( 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) + # Live subtitles are not downloadable, but extract to silence "ignoring subs" warning + formats, _ = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) elif live_status == 'post_live': if availability in ('premium_only', 'subscriber_only'): diff --git a/yt_dlp/extractor/wykop.py b/yt_dlp/extractor/wykop.py index 2ae0a2a5ed..08cad1fffd 100644 --- a/yt_dlp/extractor/wykop.py +++ b/yt_dlp/extractor/wykop.py @@ -11,7 +11,7 @@ ) -class WykopBaseExtractor(InfoExtractor): +class WykopBaseIE(InfoExtractor): def _get_token(self, force_refresh=False): if not force_refresh: maybe_cached = self.cache.load('wykop', 'bearer') @@ -72,7 +72,7 @@ def _common_data_extract(self, data): } -class WykopDigIE(WykopBaseExtractor): +class WykopDigIE(WykopBaseIE): IE_NAME = 'wykop:dig' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P\d+)' @@ -128,7 +128,7 @@ def _real_extract(self, url): } -class WykopDigCommentIE(WykopBaseExtractor): +class WykopDigCommentIE(WykopBaseIE): IE_NAME = 'wykop:dig:comment' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P\d+)/[^/]+/komentarz/(?P\d+)' @@ -177,7 +177,7 @@ def _real_extract(self, url): } -class WykopPostIE(WykopBaseExtractor): +class WykopPostIE(WykopBaseIE): IE_NAME = 'wykop:post' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P\d+)' @@ -228,7 +228,7 @@ def _real_extract(self, url): } -class WykopPostCommentIE(WykopBaseExtractor): +class WykopPostCommentIE(WykopBaseIE): IE_NAME = 'wykop:post:comment' _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P\d+)/[^/#]+#(?P\d+)' diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 23ed9270da..a4263579af 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -45,7 +45,7 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id=video_id) + webpage = self._download_webpage(url, video_id=video_id, headers={'Referer': url}) video_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] data = self._download_json( diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index cdd32c5e4e..09e7c98785 100644 --- a/yt_dlp/extractor/yandexvideo.py 
+++ b/yt_dlp/extractor/yandexvideo.py @@ -2,15 +2,17 @@ from .common import InfoExtractor from ..utils import ( + bug_reports_message, determine_ext, - extract_attributes, int_or_none, lowercase_escape, parse_qs, - traverse_obj, + qualities, try_get, + update_url_query, url_or_none, ) +from ..utils.traversal import traverse_obj class YandexVideoIE(InfoExtractor): @@ -186,7 +188,22 @@ def _real_extract(self, url): return self.url_result(data_json['video']['url']) -class ZenYandexIE(InfoExtractor): +class ZenYandexBaseIE(InfoExtractor): + def _fetch_ssr_data(self, url, video_id): + webpage = self._download_webpage(url, video_id) + redirect = self._search_json( + r'(?:var|let|const)\s+it\s*=', webpage, 'redirect', video_id, default={}).get('retpath') + if redirect: + video_id = self._match_id(redirect) + webpage = self._download_webpage(redirect, video_id, note='Redirecting') + return video_id, self._search_json( + r'(?:var|let|const)\s+_params\s*=\s*\(', webpage, 'metadata', video_id, + contains_pattern=r'{["\']ssrData.+}')['ssrData'] + + +class ZenYandexIE(ZenYandexBaseIE): + IE_NAME = 'dzen.ru' + IE_DESC = 'Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen)' _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P[a-z0-9-]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', @@ -216,6 +233,7 @@ class ZenYandexIE(InfoExtractor): 'timestamp': 1573465585, }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'The page does not exist', }, { 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -227,6 +245,9 @@ class ZenYandexIE(InfoExtractor): 'uploader': 'TechInsider', 'timestamp': 1611378221, 'upload_date': '20210123', + 'view_count': int, + 'duration': 243, + 'tags': ['опыт', 'эксперимент', 'огонь'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -240,6 +261,9 @@ class ZenYandexIE(InfoExtractor): 'uploader': 'TechInsider', 'upload_date': '20210123', 'timestamp': 1611378221, + 'view_count': int, + 'duration': 243, + 'tags': ['опыт', 'эксперимент', 'огонь'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -252,44 +276,56 @@ class ZenYandexIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - redirect = self._search_json(r'var it\s*=', webpage, 'redirect', id, default={}).get('retpath') - if redirect: - video_id = self._match_id(redirect) - webpage = self._download_webpage(redirect, video_id, note='Redirecting') - data_json = self._search_json( - r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') - serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state') - uploader = self._search_regex(r'(]+>)', - webpage, 'uploader', default='') - uploader_name = extract_attributes(uploader).get('aria-label') - item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str})) - video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {} + video_id, ssr_data = self._fetch_ssr_data(url, video_id) + video_data = ssr_data['videoMetaResponse'] formats, subtitles = [], {} - for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})): + quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7')) + # Deduplicate stream URLs. 
The "dzen_dash" query parameter is present in some URLs but can be omitted + stream_urls = set(traverse_obj(video_data, ( + 'video', ('id', ('streams', ...), ('mp4Streams', ..., 'url'), ('oneVideoStreams', ..., 'url')), + {url_or_none}, {update_url_query(query={'dzen_dash': []})}))) + for s_url in stream_urls: ext = determine_ext(s_url) - if ext == 'mpd': - fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash') - elif ext == 'm3u8': - fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4') + content_type = traverse_obj(parse_qs(s_url), ('ct', 0)) + if ext == 'mpd' or content_type == '6': + fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash', fatal=False) + elif ext == 'm3u8' or content_type == '8': + fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif content_type == '0': + format_type = traverse_obj(parse_qs(s_url), ('type', 0)) + formats.append({ + 'url': s_url, + 'format_id': format_type, + 'ext': 'mp4', + 'quality': quality(format_type), + }) + continue + else: + self.report_warning(f'Unsupported stream URL: {s_url}{bug_reports_message()}') + continue formats.extend(fmts) - subtitles = self._merge_subtitles(subtitles, subs) + self._merge_subtitles(subs, target=subtitles) + return { 'id': video_id, - 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, 'subtitles': subtitles, - 'duration': int_or_none(video_json.get('duration')), - 'view_count': int_or_none(video_json.get('views')), - 'timestamp': int_or_none(video_json.get('publicationDate')), - 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), - 'description': video_json.get('description') or self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), + **traverse_obj(video_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'duration': ('video', 'duration', {int_or_none}), + 'view_count': ('video', 'views', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none}), + 'tags': ('tags', ..., {str}), + 'uploader': ('source', 'title', {str}), + }), } -class ZenYandexChannelIE(InfoExtractor): +class ZenYandexChannelIE(ZenYandexBaseIE): + IE_NAME = 'dzen.ru:channel' _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru/(?!media|video)(?:id/)?(?P[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', @@ -323,8 +359,8 @@ class ZenYandexChannelIE(InfoExtractor): 'url': 'https://zen.yandex.ru/jony_me', 'info_dict': { 'id': 'jony_me', - 'description': 'md5:ce0a5cad2752ab58701b5497835b2cc5', - 'title': 'JONY ', + 'description': 'md5:7c30d11dc005faba8826feae99da3113', + 'title': 'JONY', }, 'playlist_count': 18, }, { @@ -333,9 +369,8 @@ class ZenYandexChannelIE(InfoExtractor): 'url': 'https://zen.yandex.ru/tatyanareva', 'info_dict': { 'id': 'tatyanareva', - 'description': 'md5:40a1e51f174369ec3ba9d657734ac31f', + 'description': 'md5:92e56fa730a932ca2483ba5c2186ad96', 'title': 'Татьяна Рева', - 'entries': 'maxcount:200', }, 'playlist_mincount': 46, }, { @@ -348,43 +383,31 @@ class ZenYandexChannelIE(InfoExtractor): 'playlist_mincount': 657, }] - def _entries(self, item_id, server_state_json, server_settings_json): - items = (traverse_obj(server_state_json, ('feed', 'items', ...)) - or traverse_obj(server_settings_json, ('exportData', 'items', ...))) 
- - more = (traverse_obj(server_state_json, ('links', 'more')) - or traverse_obj(server_settings_json, ('exportData', 'more', 'link'))) - + def _entries(self, feed_data, channel_id): next_page_id = None for page in itertools.count(1): - for item in items or []: - if item.get('type') != 'gif': - continue - video_id = traverse_obj(item, 'publication_id', 'publicationId') or '' - yield self.url_result(item['link'], ZenYandexIE, video_id.split(':')[-1]) + for item in traverse_obj(feed_data, ( + (None, ('items', lambda _, v: v['tab'] in ('shorts', 'longs'))), + 'items', lambda _, v: url_or_none(v['link']), + )): + yield self.url_result(item['link'], ZenYandexIE, item.get('id'), title=item.get('title')) + more = traverse_obj(feed_data, ('more', 'link', {url_or_none})) current_page_id = next_page_id next_page_id = traverse_obj(parse_qs(more), ('next_page_id', -1)) - if not all((more, items, next_page_id, next_page_id != current_page_id)): + if not all((more, next_page_id, next_page_id != current_page_id)): break - data = self._download_json(more, item_id, note=f'Downloading Page {page}') - items, more = data.get('items'), traverse_obj(data, ('more', 'link')) + feed_data = self._download_json(more, channel_id, note=f'Downloading Page {page}') def _real_extract(self, url): - item_id = self._match_id(url) - webpage = self._download_webpage(url, item_id) - redirect = self._search_json( - r'var it\s*=', webpage, 'redirect', item_id, default={}).get('retpath') - if redirect: - item_id = self._match_id(redirect) - webpage = self._download_webpage(redirect, item_id, note='Redirecting') - data = self._search_json( - r'("data"\s*:|data\s*=)', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}') - server_state_json = traverse_obj(data, lambda k, _: k.startswith('__serverState__'), get_all=False) - server_settings_json = traverse_obj(data, lambda k, _: k.startswith('__serverSettings__'), get_all=False) + channel_id = self._match_id(url) + channel_id, ssr_data = self._fetch_ssr_data(url, channel_id) + channel_data = ssr_data['exportResponse'] return self.playlist_result( - self._entries(item_id, server_state_json, server_settings_json), - item_id, traverse_obj(server_state_json, ('channel', 'source', 'title')), - traverse_obj(server_state_json, ('channel', 'source', 'description'))) + self._entries(channel_data['feedData'], channel_id), + channel_id, **traverse_obj(channel_data, ('channel', 'source', { + 'title': ('title', {str}), + 'description': ('description', {str}), + }))) diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index 8eb77aa031..b3c4b76d78 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -227,7 +227,7 @@ def extract_tag_box(regex, title): return result -class YouPornListBase(InfoExtractor): +class YouPornListBaseIE(InfoExtractor): def _get_next_url(self, url, pl_id, html): return urljoin(url, self._search_regex( r''']*?\bhref\s*=\s*("|')(?P(?:(?!\1)[^>])+)\1''', @@ -284,7 +284,7 @@ def _real_extract(self, url, html=None): playlist_id=pl_id, playlist_title=title) -class YouPornCategoryIE(YouPornListBase): +class YouPornCategoryIE(YouPornListBaseIE): IE_DESC = 'YouPorn category, with sorting, filtering and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -319,7 +319,7 @@ class YouPornCategoryIE(YouPornListBase): }] -class YouPornChannelIE(YouPornListBase): +class YouPornChannelIE(YouPornListBaseIE): IE_DESC = 'YouPorn channel, with sorting and pagination' _VALID_URL = r'''(?x) 
https?://(?:www\.)?youporn\.com/ @@ -349,7 +349,7 @@ def _get_title_from_slug(title_slug): return re.sub(r'_', ' ', title_slug).title() -class YouPornCollectionIE(YouPornListBase): +class YouPornCollectionIE(YouPornListBaseIE): IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -394,7 +394,7 @@ def _real_extract(self, url): return playlist -class YouPornTagIE(YouPornListBase): +class YouPornTagIE(YouPornListBaseIE): IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -442,7 +442,7 @@ def _real_extract(self, url): return super()._real_extract(url) -class YouPornStarIE(YouPornListBase): +class YouPornStarIE(YouPornListBaseIE): IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ @@ -493,7 +493,7 @@ def _real_extract(self, url): } -class YouPornVideosIE(YouPornListBase): +class YouPornVideosIE(YouPornListBaseIE): IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination' _VALID_URL = r'''(?x) https?://(?:www\.)?youporn\.com/ diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index ac0604a9cf..9c5bb75fe4 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -35,6 +35,7 @@ class _PoTokenContext(enum.Enum): PLAYER = 'player' GVS = 'gvs' + SUBS = 'subs' # any clients starting with _ cannot be explicitly requested by the user @@ -417,6 +418,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _NETRC_MACHINE = 'youtube' + _COOKIE_HOWTO_WIKI_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies' + def ucid_or_none(self, ucid): return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) @@ -451,17 +454,15 @@ def _preferred_lang(self): return preferred_lang def _initialize_consent(self): - cookies = self._get_cookies('https://www.youtube.com/') - if cookies.get('__Secure-3PSID'): + if self._has_auth_cookies: return - socs = cookies.get('SOCS') + socs = self._youtube_cookies.get('SOCS') if socs and not socs.value.startswith('CAA'): # not consented return self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) def _initialize_pref(self): - cookies = self._get_cookies('https://www.youtube.com/') - pref_cookie = cookies.get('PREF') + pref_cookie = self._youtube_cookies.get('PREF') pref = {} if pref_cookie: try: @@ -472,8 +473,9 @@ def _initialize_pref(self): self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) def _initialize_cookie_auth(self): - yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() - if yt_sapisid or yt_1psapisid or yt_3psapisid: + self._passed_auth_cookies = False + if self._has_auth_cookies: + self._passed_auth_cookies = True self.write_debug('Found YouTube account cookies') def _real_initialize(self): @@ -492,8 +494,7 @@ def _perform_login(self, username, password): @property def _youtube_login_hint(self): - return (f'{self._login_hint(method="cookies")}. Also see ' - 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies ' + return (f'{self._login_hint(method="cookies")}. 
Also see {self._COOKIE_HOWTO_WIKI_URL} ' 'for tips on effectively exporting YouTube cookies') def _check_login_required(self): @@ -553,12 +554,16 @@ def _make_sid_authorization(scheme, sid, origin, additional_parts): return f'{scheme} {"_".join(parts)}' + @property + def _youtube_cookies(self): + return self._get_cookies('https://www.youtube.com') + def _get_sid_cookies(self): """ Get SAPISID, 1PSAPISID, 3PSAPISID cookie values @returns sapisid, 1psapisid, 3psapisid """ - yt_cookies = self._get_cookies('https://www.youtube.com') + yt_cookies = self._youtube_cookies yt_sapisid = try_call(lambda: yt_cookies['SAPISID'].value) yt_3papisid = try_call(lambda: yt_cookies['__Secure-3PAPISID'].value) yt_1papisid = try_call(lambda: yt_cookies['__Secure-1PAPISID'].value) @@ -595,6 +600,31 @@ def _get_sid_authorization_header(self, origin='https://www.youtube.com', user_s return ' '.join(authorizations) + @property + def is_authenticated(self): + return self._has_auth_cookies + + @property + def _has_auth_cookies(self): + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + # YouTube doesn't appear to clear 3PSAPISID when rotating cookies (as of 2025-04-26) + # But LOGIN_INFO is cleared and should exist if logged in + has_login_info = 'LOGIN_INFO' in self._youtube_cookies + return bool(has_login_info and (yt_sapisid or yt_1psapisid or yt_3psapisid)) + + def _request_webpage(self, *args, **kwargs): + response = super()._request_webpage(*args, **kwargs) + + # Check that we are still logged-in and cookies have not rotated after every request + if getattr(self, '_passed_auth_cookies', None) and not self._has_auth_cookies: + self.report_warning( + 'The provided YouTube account cookies are no longer valid. ' + 'They have likely been rotated in the browser as a security measure. 
' + f'For tips on how to effectively export YouTube cookies, refer to {self._COOKIE_HOWTO_WIKI_URL} .', + only_once=False) + + return response + def _call_api(self, ep, query, video_id, fatal=True, headers=None, note='Downloading API JSON', errnote='Unable to download API page', context=None, api_key=None, api_hostname=None, default_client='web'): @@ -695,10 +725,6 @@ def _extract_visitor_data(self, *args): args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) - @functools.cached_property - def is_authenticated(self): - return bool(self._get_sid_authorization_header()) - def extract_ytcfg(self, video_id, webpage): if not webpage: return {} @@ -762,6 +788,7 @@ def _download_webpage_with_retries(self, *args, retry_fatal=False, retry_on_stat def _download_ytcfg(self, client, video_id): url = { + 'mweb': 'https://m.youtube.com', 'web': 'https://www.youtube.com', 'web_music': 'https://music.youtube.com', 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', @@ -803,12 +830,14 @@ def _extract_next_continuation_data(cls, renderer): @classmethod def _extract_continuation_ep_data(cls, continuation_ep: dict): - if isinstance(continuation_ep, dict): - continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], str) + continuation_commands = traverse_obj( + continuation_ep, ('commandExecutorCommand', 'commands', ..., {dict})) + continuation_commands.append(continuation_ep) + for command in continuation_commands: + continuation = traverse_obj(command, ('continuationCommand', 'token', {str})) if not continuation: - return - ctp = continuation_ep.get('clickTrackingParams') + continue + ctp = command.get('clickTrackingParams') return cls._build_api_continuation_query(continuation, ctp) @classmethod diff --git a/yt_dlp/extractor/youtube/_clip.py b/yt_dlp/extractor/youtube/_clip.py index 7d063700e3..1a708cd01b 100644 --- a/yt_dlp/extractor/youtube/_clip.py +++ b/yt_dlp/extractor/youtube/_clip.py @@ -37,6 +37,7 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'chapters': 'count:20', 'comment_count': int, 'heatmap': 'count:100', + 'media_type': 'clip', }, }] @@ -59,6 +60,7 @@ def _real_extract(self, url): 'url': f'https://www.youtube.com/watch?v={video_id}', 'ie_key': YoutubeIE.ie_key(), 'id': clip_id, + 'media_type': 'clip', 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility diff --git a/yt_dlp/extractor/youtube/_redirect.py b/yt_dlp/extractor/youtube/_redirect.py index 1908df124c..14e565b426 100644 --- a/yt_dlp/extractor/youtube/_redirect.py +++ b/yt_dlp/extractor/youtube/_redirect.py @@ -35,6 +35,7 @@ class YoutubeYtBeIE(YoutubeBaseInfoExtractor): 'duration': 59, 'comment_count': int, 'channel_follower_count': int, + 'media_type': 'short', }, 'params': { 'noplaylist': True, diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py index 122300e600..c018ee8cfb 100644 --- a/yt_dlp/extractor/youtube/_tab.py +++ b/yt_dlp/extractor/youtube/_tab.py @@ -524,10 +524,16 @@ def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): response = self._extract_response( item_id=f'{item_id} page {page_num}', query=continuation, headers=headers, ytcfg=ytcfg, - check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')) + check_get_keys=( + 'continuationContents', 
'onResponseReceivedActions', 'onResponseReceivedEndpoints', + # Playlist recommendations may return with no data - ignore + ('responseContext', 'serviceTrackingParams', ..., 'params', ..., lambda k, v: k == 'key' and v == 'GetRecommendedMusicPlaylists_rid'), + )) if not response: break + + continuation = None # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) or visitor_data @@ -564,7 +570,13 @@ def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) - if not video_items_renderer: + # In the case only a continuation is returned, try to follow it. + # We extract this after trying to extract non-continuation items as otherwise this + # may be prioritized over other continuations. + # see: https://github.com/yt-dlp/yt-dlp/issues/12933 + continuation = continuation or self._extract_continuation({'contents': [continuation_item]}) + + if not continuation and not video_items_renderer: break @staticmethod @@ -999,14 +1011,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, @@ -1016,18 +1028,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 94, 'info_dict': { 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. 
- Playlists', + 'title': 'Igor Kleiner - Playlists', 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', + 'uploader': 'Igor Kleiner ', 'uploader_id': '@IgorDataScience', 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], + 'tags': 'count:23', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'channel': 'Igor Kleiner Ph.D.', + 'channel': 'Igor Kleiner ', 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int, }, }, { + # TODO: fix channel_is_verified extraction 'note': 'playlists, series', 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', 'playlist_mincount': 5, @@ -1066,22 +1079,23 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', 'info_dict': { - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', + 'id': 'PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', + 'title': 'single video playlist', 'description': '', 'tags': [], 'view_count': int, - 'modified_date': '20201130', - 'channel': 'Sergey M.', - 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'availability': 'public', - 'uploader': 'Sergey M.', - 'uploader_url': 'https://www.youtube.com/@sergeym.6173', - 'uploader_id': '@sergeym.6173', + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', }, 'playlist_count': 1, }, { @@ -1171,11 +1185,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 17, }, { - 'note': 'Community tab', + 'note': 'Posts tab', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', 'info_dict': { 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', + 'title': 'lex will - Posts', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', @@ -1188,30 +1202,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 18, }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'uploader_id': '@lexwill718', - 'uploader': 'lex will', - }, - 'playlist_mincount': 12, - }, { + # TODO: fix channel_is_verified extraction 'note': 'Search tab', 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', 'playlist_mincount': 40, 'info_dict': { 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': 
'3Blue1Brown - Search - linear algebra', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'tags': ['Mathematics'], 'channel': '3Blue1Brown', @@ -1232,6 +1230,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { @@ -1294,24 +1293,25 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'playlist_mincount': 21, }, { + # TODO: fix availability extraction 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'url': 'https://www.youtube.com/playlist?list=PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'title': 'The Memes Of 2010s.....', + 'id': 'PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', 'view_count': int, - 'channel': 'Phim Siêu Nhân Nhật Bản', + 'channel': "I'm Not JiNxEd", 'tags': [], - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', - 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + 'description': 'md5:44dc3b315ba69394feaafa2f40e7b2a1', + 'channel_url': 'https://www.youtube.com/channel/UC5H5H85D1QE5-fuWWQ1hdNg', + 'channel_id': 'UC5H5H85D1QE5-fuWWQ1hdNg', 'modified_date': r're:\d{8}', 'availability': 'public', - 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban', - 'uploader_id': '@phimsieunhannhatban', - 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_url': 'https://www.youtube.com/@imnotjinxed1998', + 'uploader_id': '@imnotjinxed1998', + 'uploader': "I'm Not JiNxEd", }, - 'playlist_mincount': 200, + 'playlist_mincount': 150, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { 'note': 'Playlist with unavailable videos in page 7', @@ -1334,6 +1334,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix availability extraction 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', 'info_dict': { @@ -1384,7 +1385,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'hGkQjiJLjWQ', # This will keep changing + 'id': 'YDvsBbKfLPA', # This will keep changing 'ext': 'mp4', 'title': str, 'upload_date': r're:\d{8}', @@ -1409,6 +1410,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_id': '@SkyNews', 'uploader': 'Sky News', 'channel_is_verified': True, + 'media_type': 'livestream', + 'timestamp': int, }, 'params': { 'skip_download': True, @@ -1496,6 +1499,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', 'only_matching': True, }, { + # TODO: fix availability extraction 'note': 'VLPL, should redirect to playlist?list=PL...', 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'info_dict': { @@ -1537,6 +1541,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Destination 
channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) # Treat as a general feed + # TODO: fix extraction 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', 'info_dict': { 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', @@ -1560,21 +1565,21 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', 'info_dict': { - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', + 'id': 'PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_', + 'title': 'unlisted playlist', 'availability': 'unlisted', 'tags': [], - 'modified_date': '20220418', - 'channel': 'colethedj', + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', 'view_count': int, 'description': '', - 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader_url': 'https://www.youtube.com/@colethedj1894', - 'uploader_id': '@colethedj1894', - 'uploader': 'colethedj', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', }, 'playlist': [{ 'info_dict': { @@ -1596,6 +1601,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 1, 'params': {'extract_flat': True}, }, { + # By default, recommended is always empty. 'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', 'info_dict': { @@ -1603,7 +1609,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'recommended', 'tags': [], }, - 'playlist_mincount': 50, + 'playlist_count': 0, 'params': { 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, @@ -1628,6 +1634,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, 'skip': 'Query for sorting no longer works', }, { + # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', 'info_dict': { @@ -1654,11 +1661,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', 'only_matching': True, }, { + # TODO: fix metadata extraction 'note': 'collaborative playlist (uploader name in the form "by and x other(s)")', 'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', 'info_dict': { 'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', - 'modified_date': '20220407', + 'modified_date': '20250115', 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', 'tags': [], 'availability': 'unlisted', @@ -1692,6 +1700,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'expected_warnings': ['Preferring "ja"'], }, { # XXX: this should really check flat playlist entries, but the test suite doesn't support that + # TODO: fix availability extraction 'note': 'preferred lang set with playlist with translated video titles', 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', 'info_dict': { @@ -1714,6 +1723,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # shorts 
audio pivot for 2GtVksBMYFM. 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + # TODO: fix extraction 'info_dict': { 'id': 'sfv_audio_pivot', 'title': 'sfv_audio_pivot', @@ -1751,6 +1761,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 8, }, { # Should get three playlists for videos, shorts and streams tabs + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', 'info_dict': { 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', @@ -1758,7 +1769,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_follower_count': int, 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'description': 'md5:49809d8bf9da539bc48ed5d1f83c33f2', + 'description': 'md5:01e53f350ab8ad6fcf7c4fedb3c1b99f', 'channel': 'Polka Ch. 尾丸ポルカ', 'tags': 'count:35', 'uploader_url': 'https://www.youtube.com/@OmaruPolka', @@ -1769,14 +1780,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 3, }, { # Shorts tab with channel with handle - # TODO: fix channel description + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@NotJustBikes/shorts', 'info_dict': { 'id': 'UC0intLFzLaudFG-xAvUEO-A', 'title': 'Not Just Bikes - Shorts', 'tags': 'count:10', 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:5e82545b3a041345927a92d0585df247', + 'description': 'md5:1d9fc1bad7f13a487299d1fe1712e031', 'channel_follower_count': int, 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', 'channel': 'Not Just Bikes', @@ -1797,7 +1808,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', 'channel': '中村悠一', 'channel_follower_count': int, - 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + 'description': 'md5:e8fd705073a594f27d6d6d020da560dc', 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura', 'uploader_id': '@Yuichi-Nakamura', 'uploader': '中村悠一', @@ -1815,6 +1826,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'only_matching': True, }, { # No videos tab but has a shorts tab + # TODO: fix metadata extraction 'url': 'https://www.youtube.com/c/TKFShorts', 'info_dict': { 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', @@ -1851,6 +1863,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { # Shorts url result in shorts tab # TODO: Fix channel id extraction + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1879,6 +1892,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'params': {'extract_flat': True}, }, { # Live video status should be extracted + # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', 'info_dict': { 'id': 'UCQvWX73GQygcwXOTSf_VDVg', @@ -1907,6 +1921,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 1, }, { # Channel renderer metadata. 
Contains number of videos on the channel + # TODO: channels tab removed, change this test to use another page with channel renderer 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', @@ -1940,7 +1955,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, }], 'params': {'extract_flat': True}, + 'skip': 'channels tab removed', }, { + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@3blue1brown/about', 'info_dict': { 'id': '@3blue1brown', @@ -1950,7 +1967,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', 'uploader_url': 'https://www.youtube.com/@3blue1brown', 'uploader_id': '@3blue1brown', 'uploader': '3Blue1Brown', @@ -1976,6 +1993,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_count': 5, }, { # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + # TODO: fix channel_is_verified extraction 'url': 'https://www.youtube.com/@AHimitsu/releases', 'info_dict': { 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', @@ -2015,6 +2033,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 100, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { + # TODO: fix channel_is_verified extraction 'note': 'Tags containing spaces', 'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', 'playlist_count': 3, @@ -2035,6 +2054,24 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'challenges', 'sketches', 'scary games', 'funny games', 'rage games', 'mark fischbach'], }, + }, { + # https://github.com/yt-dlp/yt-dlp/issues/12933 + 'note': 'streams tab, some scheduled streams. 
Empty intermediate response with only continuation - must follow', + 'url': 'https://www.youtube.com/@sbcitygov/streams', + 'playlist_mincount': 150, + 'info_dict': { + 'id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'channel': 'sbcitygov', + 'channel_id': 'UCH6-qfQwlUgz9SAf05jvc_w', + 'title': 'sbcitygov - Live', + 'channel_follower_count': int, + 'description': 'md5:ca1a92059835c071e33b3db52f4a6d67', + 'uploader_id': '@sbcitygov', + 'uploader_url': 'https://www.youtube.com/@sbcitygov', + 'uploader': 'sbcitygov', + 'channel_url': 'https://www.youtube.com/channel/UCH6-qfQwlUgz9SAf05jvc_w', + 'tags': [], + }, }] @classmethod diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 5c1485a43d..3d4bdfd56d 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -23,6 +23,8 @@ _split_innertube_client, short_client_name, ) +from .pot._director import initialize_pot_director +from .pot.provider import PoTokenContext, PoTokenRequest from ..openload import PhantomJSwrapper from ...jsinterp import JSInterpreter from ...networking.exceptions import HTTPError @@ -34,6 +36,7 @@ clean_html, datetime_from_str, filesize_from_tbr, + filter_dict, float_or_none, format_field, get_first, @@ -65,9 +68,13 @@ urljoin, variadic, ) +from ...utils.networking import clean_headers, clean_proxies, select_proxy STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' +STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token' +STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context' + PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' @@ -130,7 +137,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _RETURN_TYPE = 'video' # XXX: How to handle multifeed? 
_PLAYER_INFO_RE = ( - r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', + r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) @@ -375,6 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Afrojack', 'uploader_url': 'https://www.youtube.com/@Afrojack', 'uploader_id': '@Afrojack', + 'media_type': 'video', }, 'params': { 'youtube_include_dash_manifest': True, @@ -412,10 +420,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1401991663, + 'media_type': 'video', }, }, { - 'note': 'Age-gate video with embed allowed in public site', + 'note': 'Formerly an age-gate video with embed allowed in public site', 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', 'info_dict': { 'id': 'HsUATh_Nc2U', @@ -423,8 +432,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Godzilla 2 (Official Video)', 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'upload_date': '20200408', - 'age_limit': 18, - 'availability': 'needs_auth', + 'age_limit': 0, + 'availability': 'public', 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', 'channel': 'FlyingKitty', 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', @@ -442,8 +451,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@FlyingKitty900', 'comment_count': int, 'channel_is_verified': True, + 'media_type': 'video', }, - 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video embedable only with clientScreen=EMBED', @@ -506,6 +515,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Herr Lurik', 'uploader_url': 'https://www.youtube.com/@HerrLurik', 'uploader_id': '@HerrLurik', + 'media_type': 'video', }, }, { @@ -545,6 +555,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'deadmau5', 'uploader_url': 'https://www.youtube.com/@deadmau5', 'uploader_id': '@deadmau5', + 'media_type': 'video', }, 'expected_warnings': [ 'DASH manifest missing', @@ -580,6 +591,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@Olympics', 'channel_is_verified': True, 'timestamp': 1440707674, + 'media_type': 'livestream', }, 'params': { 'skip_download': 'requires avconv', @@ -614,6 +626,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@AllenMeow', 'uploader_id': '@AllenMeow', 'timestamp': 1299776999, + 'media_type': 'video', }, }, # url_encoded_fmt_stream_map is empty string @@ -808,6 +821,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'age_limit': 0, 'channel_follower_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -867,6 +881,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@BKCHarvard', 'uploader_url': 'https://www.youtube.com/@BKCHarvard', 'timestamp': 1422422076, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -903,6 +918,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1447987198, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -967,6 +983,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'timestamp': 1484761047, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1069,6 +1086,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': 'count:11', 'live_status': 'not_live',
'channel_follower_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1123,6 +1141,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@ElevageOrVert', 'uploader_id': '@ElevageOrVert', 'timestamp': 1497343210, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1162,6 +1181,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1377976349, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1206,6 +1226,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_follower_count': int, 'uploader': 'The Cinematic Orchestra', 'comment_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1274,6 +1295,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124', 'uploader_id': '@walkaroundjapan7124', 'timestamp': 1605884416, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1370,6 +1392,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1395685455, + 'media_type': 'video', }, 'params': {'format': 'mhtml', 'skip_download': True}, }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -1400,6 +1423,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@LeonNguyen', 'heatmap': 'count:100', 'timestamp': 1641170939, + 'media_type': 'video', }, }, { # date text is premiered video, ensure upload date in UTC (published 1641172509) @@ -1433,6 +1457,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1641172509, + 'media_type': 'video', }, }, { # continuous livestream. 
@@ -1494,6 +1519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Lesmiscore', 'uploader_url': 'https://www.youtube.com/@lesmiscore', 'timestamp': 1648005313, + 'media_type': 'short', }, }, { # Prefer primary title+description language metadata by default @@ -1522,6 +1548,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@coletdjnz', 'uploader': 'cole-dlp-test-acc', 'timestamp': 1662677394, + 'media_type': 'video', }, 'params': {'skip_download': True}, }, { @@ -1550,6 +1577,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'cole-dlp-test-acc', 'timestamp': 1659073275, 'like_count': int, + 'media_type': 'video', }, 'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}}, 'expected_warnings': [r'Preferring "fr" translated fields'], @@ -1586,6 +1614,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'heatmap': 'count:100', + 'media_type': 'video', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -1686,6 +1715,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'heatmap': 'count:100', + 'media_type': 'video', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, @@ -1718,6 +1748,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_follower_count': int, 'categories': ['People & Blogs'], 'tags': [], + 'media_type': 'short', }, }, ] @@ -1753,6 +1784,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@ChristopherSykesDocumentaries', 'heatmap': 'count:100', 'timestamp': 1211825920, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1760,6 +1792,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, ] + _PLAYER_JS_VARIANT_MAP = { + 'main': 'player_ias.vflset/en_US/base.js', + 'tce': 'player_ias_tce.vflset/en_US/base.js', + 'tv': 'tv-player-ias.vflset/tv-player-ias.js', + 'tv_es6': 'tv-player-es6.vflset/tv-player-es6.js', + 'phone': 'player-plasma-ias-phone-en_US.vflset/base.js', + 'tablet': 'player-plasma-ias-tablet-en_US.vflset/base.js', + } + _INVERSE_PLAYER_JS_VARIANT_MAP = {v: k for k, v in _PLAYER_JS_VARIANT_MAP.items()} + @classmethod def suitable(cls, url): from yt_dlp.utils import parse_qs @@ -1773,6 +1815,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._code_cache = {} self._player_cache = {} + self._pot_director = None + + def _real_initialize(self): + super()._real_initialize() + self._pot_director = initialize_pot_director(self) def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live): lock = threading.Lock() @@ -1808,6 +1855,12 @@ def mpd_feed(format_id, delay): else: retry.error = f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}' continue + + # Formats from ended premieres will be missing a manifest_url + # See https://github.com/yt-dlp/yt-dlp/issues/8543 + if not f.get('manifest_url'): + break + return f['manifest_url'], f['manifest_stream_number'], is_live return None @@ -1939,11 +1992,21 @@ def _extract_player_url(self, *ytcfgs, webpage=None): get_all=False, expected_type=str) if not player_url: return - # TODO: Add proper support for the 'tce' variant players - # See https://github.com/yt-dlp/yt-dlp/issues/12398 - if '/player_ias_tce.vflset/' in player_url: - self.write_debug(f'Modifying tce player URL: {player_url}') - player_url = 
player_url.replace('/player_ias_tce.vflset/', '/player_ias.vflset/') + + requested_js_variant = self._configuration_arg('player_js_variant', [''])[0] or 'actual' + if requested_js_variant in self._PLAYER_JS_VARIANT_MAP: + player_id = self._extract_player_info(player_url) + original_url = player_url + player_url = f'/s/player/{player_id}/{self._PLAYER_JS_VARIANT_MAP[requested_js_variant]}' + if original_url != player_url: + self.write_debug( + f'Forcing "{requested_js_variant}" player JS variant for player {player_id}\n' + f' original url = {original_url}', only_once=True) + elif requested_js_variant != 'actual': + self.report_warning( + f'Invalid player JS variant name "{requested_js_variant}" requested. ' + f'Valid choices are: {", ".join(self._PLAYER_JS_VARIANT_MAP)}', only_once=True) + return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): @@ -1958,6 +2021,19 @@ def _download_player_url(self, video_id, fatal=False): if player_version: return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js' + def _player_js_cache_key(self, player_url): + player_id = self._extract_player_info(player_url) + player_path = remove_start(urllib.parse.urlparse(player_url).path, f'/s/player/{player_id}/') + variant = self._INVERSE_PLAYER_JS_VARIANT_MAP.get(player_path) or next(( + v for k, v in self._INVERSE_PLAYER_JS_VARIANT_MAP.items() + if re.fullmatch(re.escape(k).replace('en_US', r'[a-zA-Z0-9_]+'), player_path)), None) + if not variant: + self.write_debug( + f'Unable to determine player JS variant\n' + f' player = {player_url}', only_once=True) + variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js')) + return join_nonempty(player_id, variant) + def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(str(len(part)) for part in example_sig.split('.')) @@ -1973,30 +2049,29 @@ def _extract_player_info(cls, player_url): return id_m.group('id') def _load_player(self, video_id, player_url, fatal=True): - player_id = self._extract_player_info(player_url) - if player_id not in self._code_cache: + player_js_key = self._player_js_cache_key(player_url) + if player_js_key not in self._code_cache: code = self._download_webpage( player_url, video_id, fatal=fatal, - note='Downloading player ' + player_id, - errnote=f'Download of {player_url} failed') + note=f'Downloading player {player_js_key}', + errnote=f'Download of {player_js_key} failed') if code: - self._code_cache[player_id] = code - return self._code_cache.get(player_id) + self._code_cache[player_js_key] = code + return self._code_cache.get(player_js_key) def _extract_signature_function(self, video_id, player_url, example_sig): - player_id = self._extract_player_info(player_url) - # Read from filesystem cache - func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}' + func_id = join_nonempty( + self._player_js_cache_key(player_url), self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id self.write_debug(f'Extracting signature function {func_id}') - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.31'), None if not cache_spec: code = self._load_player(video_id, player_url) if code: - res = self._parse_sig_js(code) + res = self._parse_sig_js(code, player_url) test_string = ''.join(map(chr, range(len(example_sig)))) cache_spec = [ord(c) for c in res(test_string)] 
self.cache.store('youtube-sigfuncs', func_id, cache_spec) @@ -2044,7 +2119,7 @@ def _genslice(start, end, step): f' return {expr_code}\n') self.to_screen('Extracted signature function:\n' + code) - def _parse_sig_js(self, jscode): + def _parse_sig_js(self, jscode, player_url): # Examples where `sig` is funcname: # sig=function(a){a=a.split(""); ... ;return a.join("")}; # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; @@ -2068,8 +2143,9 @@ def _parse_sig_js(self, jscode): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) + initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list})) return lambda s: initial_function([s]) def _cached(self, func, *cache_id): @@ -2088,6 +2164,24 @@ def inner(*args, **kwargs): return ret return inner + def _load_player_data_from_cache(self, name, player_url): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) + + if data := self._player_cache.get(cache_id): + return data + + data = self.cache.load(*cache_id, min_ver='2025.03.31') + if data: + self._player_cache[cache_id] = data + + return data + + def _store_player_data_to_cache(self, name, player_url, data): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) + if cache_id not in self._player_cache: + self.cache.store(*cache_id, data) + self._player_cache[cache_id] = data + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" extract_sig = self._cached( @@ -2128,9 +2222,31 @@ def _decrypt_nsig(self, s, video_id, player_url): video_id=video_id, note='Executing signature code').strip() self.write_debug(f'Decrypted nsig {s} => {ret}') + # Only cache nsig func JS code to disk if successful, and only once + self._store_player_data_to_cache('nsig', player_url, func_code) return ret def _extract_n_function_name(self, jscode, player_url=None): + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) + if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('-_w8_'), any)): + funcname = self._search_regex( + r'''(?xs) + [;\n](?: + (?P<f>function\s+)| + (?:var\s+)?
+                    )(?P<funcname>[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*) + \((?P<argname>[a-zA-Z0-9_$]+)\)\s*\{ + (?:(?!\}[;\n]).)+ + \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s* + \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n] + ''' % (re.escape(varname), global_list.index(debug_str)), + jscode, 'nsig function name', group='funcname', default=None) + if funcname: + return funcname + self.write_debug(join_nonempty( + 'Initial search was unable to find nsig function name', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + # Examples (with placeholders nfunc, narray, idx): # * .get("n"))&&(b=nfunc(b) # * .get("n"))&&(b=narray[idx](b) @@ -2160,7 +2276,7 @@ if not funcname: self.report_warning(join_nonempty( 'Falling back to generic n function search', - player_url and f' player = {player_url}', delim='\n')) + player_url and f' player = {player_url}', delim='\n'), only_once=True) return self._search_regex( r'''(?xs) ;\s*(?P<name>[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) @@ -2173,14 +2289,56 @@ rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - def _fixup_n_function_code(self, argnames, code): - return argnames, re.sub( - rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(["\'])undefined\1\s*\)\s*return\s+{argnames[0]};', - ';', code) + def _interpret_player_js_global_var(self, jscode, player_url): + """Returns tuple of: variable name string, variable value list""" + extract_global_var = self._cached(self._search_regex, 'js global array', player_url) + varcode, varname, varvalue = extract_global_var( + r'''(?x) + (?P<q1>["\'])use\s+strict(?P=q1);\s* + (?P<code> + var\s+(?P<name>[a-zA-Z0-9_$]+)\s*=\s* + (?P<value> + (?P<q2>["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2) + \.split\((?P<q3>["\'])(?:(?!(?P=q3)).)+(?P=q3)\) + |\[\s*(?:(?P<q4>["\'])(?:(?!(?P=q4)).|\\.)*(?P=q4)\s*,?\s*)+\] + ) + )[;,] + ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) + if not varcode: + self.write_debug(join_nonempty( + 'No global array variable found in player JS', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return None, None + + jsi = JSInterpreter(varcode) + interpret_global_var = self._cached(jsi.interpret_expression, 'js global list', player_url) + return varname, interpret_global_var(varvalue, {}, allow_recursion=10) + + def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url): + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) + if varname and global_list: + nsig_code = f'var {varname}={json.dumps(global_list)}; {nsig_code}' + else: + varname = 'dlp_wins' + global_list = [] + + undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+' + fixed_code = re.sub( + fr'''(?x) + ;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?: + (["\'])undefined\1| + {re.escape(varname)}\[{undefined_idx}\] + )\s*\)\s*return\s+{re.escape(argnames[0])}; + ''', ';', nsig_code) + if fixed_code == nsig_code: + self.write_debug(join_nonempty( + 'No typeof statement found in nsig function code', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return argnames, fixed_code def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.02.19') + func_code =
self._load_player_data_from_cache('nsig', player_url) jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -2189,10 +2347,9 @@ def _extract_n_function_code(self, video_id, player_url): func_name = self._extract_n_function_name(jscode, player_url=player_url) - # XXX: Workaround for the `typeof` gotcha - func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name)) + # XXX: Workaround for the global array variable and lack of `typeof` implementation + func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url) - self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code def _extract_n_function_from_code(self, jsi, func_code): @@ -2217,26 +2374,35 @@ def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=F Extract signatureTimestamp (sts) Required to tell API what sig/player version is in use. """ - sts = None - if isinstance(ytcfg, dict): - sts = int_or_none(ytcfg.get('STS')) + if sts := traverse_obj(ytcfg, ('STS', {int_or_none})): + return sts + + if not player_url: + error_msg = 'Cannot extract signature timestamp without player url' + if fatal: + raise ExtractorError(error_msg) + self.report_warning(error_msg) + return None + + sts = self._load_player_data_from_cache('sts', player_url) + if sts: + return sts + + if code := self._load_player(video_id, player_url, fatal=fatal): + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, + 'JS player signature timestamp', group='sts', fatal=fatal)) + if sts: + self._store_player_data_to_cache('sts', player_url, sts) - if not sts: - # Attempt to extract from player - if player_url is None: - error_msg = 'Cannot extract signature timestamp without player_url.' - if fatal: - raise ExtractorError(error_msg) - self.report_warning(error_msg) - return - code = self._load_player(video_id, player_url, fatal=fatal) - if code: - sts = int_or_none(self._search_regex( - r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, - 'JS player signature timestamp', group='sts', fatal=fatal)) return sts def _mark_watched(self, video_id, player_responses): + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn.
- CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16)) - # # more consistent results setting it to right before the end video_length = [str(float((qs.get('len') or ['1.5'])[0]) - 1)] @@ -2701,7 +2862,8 @@ def _get_config_po_token(self, client: str, context: _PoTokenContext): continue def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, - data_sync_id=None, session_index=None, player_url=None, video_id=None, **kwargs): + data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, + required=False, **kwargs): """ Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client. @@ -2715,10 +2877,15 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, @param session_index: session index. @param player_url: player URL. @param video_id: video ID. + @param webpage: video webpage. + @param required: Whether the PO Token is required (i.e. try to fetch unless policy is "never"). @param kwargs: Additional arguments to pass down. May be more added in the future. @return: The fetched PO Token. None if it could not be fetched. """ + # TODO(future): This validation should be moved into pot framework. + # Some sort of middleware or validation provider perhaps? + # GVS WebPO Token is bound to visitor_data / Visitor ID when logged out. # Must have visitor_data for it to function. if player_url and context == _PoTokenContext.GVS and not visitor_data and not self.is_authenticated: @@ -2740,6 +2907,7 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, f'Got a GVS PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' 
f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') + self.write_debug(f'{video_id}: Retrieved a {context.value} PO Token for {client} client from config') return config_po_token # Require GVS WebPO Token if logged in for external fetching @@ -2749,7 +2917,7 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') return - return self._fetch_po_token( + po_token = self._fetch_po_token( client=client, context=context.value, ytcfg=ytcfg, @@ -2758,11 +2926,68 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, session_index=session_index, player_url=player_url, video_id=video_id, + video_webpage=webpage, + required=required, **kwargs, ) + if po_token: + self.write_debug(f'{video_id}: Retrieved a {context.value} PO Token for {client} client') + return po_token + def _fetch_po_token(self, client, **kwargs): - """(Unstable) External PO Token fetch stub""" + context = kwargs.get('context') + + # Avoid fetching PO Tokens when not required + fetch_pot_policy = self._configuration_arg('fetch_pot', [''], ie_key=YoutubeIE)[0] + if fetch_pot_policy not in ('never', 'auto', 'always'): + fetch_pot_policy = 'auto' + if ( + fetch_pot_policy == 'never' + or ( + fetch_pot_policy == 'auto' + and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] + and not kwargs.get('required', False) + ) + ): + return None + + headers = self.get_param('http_headers').copy() + proxies = self._downloader.proxies.copy() + clean_headers(headers) + clean_proxies(proxies, headers) + + innertube_host = self._select_api_hostname(None, default_client=client) + + pot_request = PoTokenRequest( + context=PoTokenContext(context), + innertube_context=traverse_obj(kwargs, ('ytcfg', 'INNERTUBE_CONTEXT')), + innertube_host=innertube_host, + internal_client_name=client, + session_index=kwargs.get('session_index'), + player_url=kwargs.get('player_url'), + video_webpage=kwargs.get('video_webpage'), + is_authenticated=self.is_authenticated, + visitor_data=kwargs.get('visitor_data'), + data_sync_id=kwargs.get('data_sync_id'), + video_id=kwargs.get('video_id'), + request_cookiejar=self._downloader.cookiejar, + + # All requests that would need to be proxied should be in the + # context of www.youtube.com or the innertube host + request_proxy=( + select_proxy('https://www.youtube.com', proxies) + or select_proxy(f'https://{innertube_host}', proxies) + ), + request_headers=headers, + request_timeout=self.get_param('socket_timeout'), + request_verify_tls=not self.get_param('nocheckcertificate'), + request_source_address=self.get_param('source_address'), + + bypass_cache=False, + ) + + return self._pot_director.get_po_token(pot_request) @staticmethod def _is_agegated(player_response): @@ -2911,6 +3136,8 @@ def append_client(*client_names): player_url = self._download_player_url(video_id) tried_iframe_fallback = True + pr = initial_pr if client == 'web' else None + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) @@ -2920,16 +3147,24 @@ def append_client(*client_names): 'video_id': video_id, 'data_sync_id': data_sync_id if self.is_authenticated else None, 'player_url': player_url if require_js_player else None, + 'webpage': webpage, 'session_index': 
self._extract_session_index(master_ytcfg, player_ytcfg), - 'ytcfg': player_ytcfg, + 'ytcfg': player_ytcfg or self._get_default_ytcfg(client), } - player_po_token = self.fetch_po_token( + # Don't need a player PO token for WEB if using player response from webpage + player_po_token = None if pr else self.fetch_po_token( context=_PoTokenContext.PLAYER, **fetch_po_token_args) gvs_po_token = self.fetch_po_token( context=_PoTokenContext.GVS, **fetch_po_token_args) + fetch_subs_po_token_func = functools.partial( + self.fetch_po_token, + context=_PoTokenContext.SUBS, + **fetch_po_token_args, + ) + required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] if ( @@ -2956,7 +3191,6 @@ def append_client(*client_names): only_once=True) deprioritize_pr = True - pr = initial_pr if client == 'web' else None try: pr = pr or self._extract_player_response( client, video_id, @@ -2974,10 +3208,13 @@ def append_client(*client_names): if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: - # Save client name for introspection later - sd = traverse_obj(pr, ('streamingData', {dict})) or {} + # Save client details for introspection later + innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT') + sd = pr.setdefault('streamingData', {}) sd[STREAMING_DATA_CLIENT_NAME] = client sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token + sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context + sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token @@ -2986,9 +3223,19 @@ def append_client(*client_names): else: prs.append(pr) + # web_embedded can work around age-gate and age-verification for some embeddable videos + if self._is_agegated(pr) and variant != 'web_embedded': + append_client(f'web_embedded.{base_client}') + # Unauthenticated users will only get web_embedded client formats if age-gated + if self._is_agegated(pr) and not self.is_authenticated: + self.to_screen( + f'{video_id}: This video is age-restricted; some formats may be missing ' + f'without authentication. {self._youtube_login_hint}', only_once=True) + # EU countries require age-verification for accounts to access age-restricted videos # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients - if self.is_authenticated and self._is_agegated(pr): + embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr) + if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): self.to_screen( f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) @@ -3029,6 +3276,25 @@ def _report_pot_format_skipped(self, video_id, client_name, proto): else: self.report_warning(msg, only_once=True) + def _report_pot_subtitles_skipped(self, video_id, client_name, msg=None): + msg = msg or ( + f'{video_id}: Some {client_name} client subtitles require a PO Token which was not provided. ' + 'They will be discarded since they are not downloadable as-is. ' + f'You can manually pass a Subtitles PO Token for this client with ' + f'--extractor-args "youtube:po_token={client_name}.subs+XXX" . 
' + f'For more information, refer to {PO_TOKEN_GUIDE_URL}') + + subs_wanted = any(( + self.get_param('writesubtitles'), + self.get_param('writeautomaticsub'), + self.get_param('listsubtitles'))) + + # Only raise a warning for non-default clients, to not confuse users. + if not subs_wanted or client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS): + self.write_debug(msg, only_once=True) + else: + self.report_warning(msg, only_once=True) + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 @@ -3115,12 +3381,16 @@ def build_fragments(f): fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) if not all((sc, fmt_url, player_url, encrypted_sig)): - self.report_warning( - f'Some {client_name} client https formats have been skipped as they are missing a url. ' - f'{"Your account" if self.is_authenticated else "The current session"} may have ' - f'the SSAP (server-side ads) experiment which interferes with yt-dlp. ' - f'Please see https://github.com/yt-dlp/yt-dlp/issues/12482 for more details.', - video_id, only_once=True) + msg = f'Some {client_name} client https formats have been skipped as they are missing a url. ' + if client_name == 'web': + msg += 'YouTube is forcing SABR streaming for this client. ' + else: + msg += ( + f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for ' + f'{"your account" if self.is_authenticated else "the current session"}. ' + ) + msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details' + self.report_warning(msg, video_id, only_once=True) continue try: fmt_url += '&{}={}'.format( @@ -3141,14 +3411,12 @@ def build_fragments(f): 'n': decrypt_nsig(query['n'][0], video_id, player_url), }) except ExtractorError as e: - phantomjs_hint = '' - if isinstance(e, JSInterpreter.Exception): - phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' - f'to workaround the issue. {PhantomJSwrapper.INSTALL_HINT}\n') if player_url: self.report_warning( - f'nsig extraction failed: Some formats may be missing\n{phantomjs_hint}' - f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) + f'nsig extraction failed: Some formats may be missing\n' + f' n = {query["n"][0]} ; player = {player_url}\n' + f' {bug_reports_message(before="")}', + video_id=video_id, only_once=True) self.write_debug(e, only_once=True) else: self.report_warning( @@ -3165,7 +3433,7 @@ def build_fragments(f): is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( - f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) + 'Some formats are possibly damaged. 
They will be deprioritized', video_id, only_once=True) po_token = fmt.get(STREAMING_DATA_INITIAL_PO_TOKEN) @@ -3209,8 +3477,8 @@ def build_fragments(f): 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, - # Strictly de-prioritize broken, damaged and 3gp formats - 'preference': -20 if require_po_token else -10 if is_damaged else -2 if itag == '17' else None, + # Strictly de-prioritize damaged and 3gp formats + 'preference': -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -3318,6 +3586,9 @@ def process_manifest_format(f, proto, client_name, itag, po_token): hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + for sub in traverse_obj(subs, (..., ..., {dict})): + # HLS subs (m3u8) do not need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( @@ -3329,6 +3600,9 @@ def process_manifest_format(f, proto, client_name, itag, po_token): if po_token: dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + for sub in traverse_obj(subs, (..., ..., {dict})): + # TODO: Investigate if DASH subs ever need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): @@ -3520,7 +3794,7 @@ def feed_entry(name): reason = self._get_text(pemr, 'reason') or get_first(playability_statuses, 'reason') subreason = clean_html(self._get_text(pemr, 'subreason') or '') if subreason: - if subreason == 'The uploader has not made this video available in your country.': + if subreason.startswith('The uploader has not made this video available in your country'): countries = get_first(microformats, 'availableCountries') if not countries: regions_allowed = search_meta('regionsAllowed') @@ -3531,6 +3805,15 @@ def feed_entry(name): if 'sign in' in reason.lower(): reason = remove_end(reason, 'This helps protect our community. Learn more') reason = f'{remove_end(reason.strip(), ".")}. {self._youtube_login_hint}' + elif get_first(playability_statuses, ('errorScreen', 'playerCaptchaViewModel', {dict})): + reason += '. YouTube is requiring a captcha challenge before playback' + elif "This content isn't available, try again later" in reason: + reason = ( + f'{remove_end(reason.strip(), ".")}. {"Your account" if self.is_authenticated else "The current session"} ' + f'has been rate-limited by YouTube for up to an hour. It is recommended to use `-t sleep` to add a delay ' + f'between video requests to avoid exceeding the rate limit. 
For more information, refer to ' + f'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#this-content-isnt-available-try-again-later' + ) self.raise_no_formats(reason, expected=True) keywords = get_first(video_details, 'keywords', expected_type=list) or [] @@ -3637,53 +3920,94 @@ def is_bad_format(fmt): 'tags': keywords, 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), 'live_status': live_status, - 'media_type': 'livestream' if get_first(video_details, 'isLiveContent') else None, + 'media_type': ( + 'livestream' if get_first(video_details, 'isLiveContent') + else 'short' if get_first(microformats, 'isShortsEligible') + else 'video'), 'release_timestamp': live_start_time, '_format_sort_fields': ( # source_preference is lower for potentially damaged formats 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), } + def get_lang_code(track): + return (remove_start(track.get('vssId') or '', '.').replace('.', '-') + or track.get('languageCode')) + + def process_language(container, base_url, lang_code, sub_name, client_name, query): + lang_subs = container.setdefault(lang_code, []) + for fmt in self._SUBTITLE_FORMATS: + query = {**query, 'fmt': fmt} + lang_subs.append({ + 'ext': fmt, + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), + 'name': sub_name, + STREAMING_DATA_CLIENT_NAME: client_name, + }) + subtitles = {} - pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) - if pctr: - def get_lang_code(track): - return (remove_start(track.get('vssId') or '', '.').replace('.', '-') - or track.get('languageCode')) + skipped_subs_clients = set() - # Converted into dicts to remove duplicates - captions = { - get_lang_code(sub): sub - for sub in traverse_obj(pctr, (..., 'captionTracks', ...))} - translation_languages = { - lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) - for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))} + # Only web/mweb clients provide translationLanguages, so include initial_pr in the traversal + translation_languages = { + lang['languageCode']: self._get_text(lang['languageName'], max_runs=1) + for lang in traverse_obj(player_responses, ( + ..., 'captions', 'playerCaptionsTracklistRenderer', 'translationLanguages', + lambda _, v: v['languageCode'] and v['languageName'])) + } + # NB: Constructing the full subtitle dictionary is slow + get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( + self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - def process_language(container, base_url, lang_code, sub_name, query): - lang_subs = container.setdefault(lang_code, []) - for fmt in self._SUBTITLE_FORMATS: - query.update({ - 'fmt': fmt, - }) - lang_subs.append({ - 'ext': fmt, - 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), - 'name': sub_name, - }) + # Filter out initial_pr which does not have streamingData (smuggled client context) + prs = traverse_obj(player_responses, ( + lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer'])) + all_captions = traverse_obj(prs, ( + ..., 'captions', 'playerCaptionsTracklistRenderer', 'captionTracks', ..., {dict})) + need_subs_langs = {get_lang_code(sub) for sub in all_captions if sub.get('kind') != 'asr'} + need_caps_langs = { + remove_start(get_lang_code(sub), 'a-') + for sub in all_captions if sub.get('kind') == 'asr'} - # NB: Constructing 
the full subtitle dictionary is slow - get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( - self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - for lang_code, caption_track in captions.items(): - base_url = caption_track.get('baseUrl') - orig_lang = parse_qs(base_url).get('lang', [None])[-1] - if not base_url: - continue + for pr in prs: + pctr = pr['captions']['playerCaptionsTracklistRenderer'] + client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME] + innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName'] + required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN] + + pot_params = {} + already_fetched_pot = False + + for caption_track in traverse_obj(pctr, ('captionTracks', lambda _, v: v['baseUrl'])): + base_url = caption_track['baseUrl'] + qs = parse_qs(base_url) + lang_code = get_lang_code(caption_track) + requires_pot = ( + # We can detect the experiment for now + any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv')) + or _PoTokenContext.SUBS in required_contexts) + + if not already_fetched_pot: + already_fetched_pot = True + if subs_po_token := fetch_subs_po_token_func(required=requires_pot): + pot_params.update({ + 'pot': subs_po_token, + 'potc': '1', + 'c': innertube_client_name, + }) + + if not pot_params and requires_pot: + skipped_subs_clients.add(client_name) + self._report_pot_subtitles_skipped(video_id, client_name) + break + + orig_lang = qs.get('lang', [None])[-1] lang_name = self._get_text(caption_track, 'name', max_runs=1) if caption_track.get('kind') != 'asr': if not lang_code: continue process_language( - subtitles, base_url, lang_code, lang_name, {}) + subtitles, base_url, lang_code, lang_name, client_name, pot_params) if not caption_track.get('isTranslatable'): continue for trans_code, trans_name in translation_languages.items(): @@ -3703,10 +4027,25 @@ def process_language(container, base_url, lang_code, sub_name, query): # Add an "-orig" label to the original language so that it can be distinguished. # The subs are returned without "-orig" as well for compatibility process_language( - automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) + automatic_captions, base_url, f'{trans_code}-orig', + f'{trans_name} (Original)', client_name, pot_params) # Setting tlang=lang returns damaged subtitles. 
- process_language(automatic_captions, base_url, trans_code, trans_name, - {} if orig_lang == orig_trans_code else {'tlang': trans_code}) + process_language( + automatic_captions, base_url, trans_code, trans_name, client_name, + pot_params if orig_lang == orig_trans_code else {'tlang': trans_code, **pot_params}) + + # Avoid duplication if we've already got everything we need + need_subs_langs.difference_update(subtitles) + need_caps_langs.difference_update(automatic_captions) + if not (need_subs_langs or need_caps_langs): + break + + if skipped_subs_clients and (need_subs_langs or need_caps_langs): + self._report_pot_subtitles_skipped(video_id, True, msg=join_nonempty( + f'{video_id}: There are missing subtitles languages because a PO token was not provided.', + need_subs_langs and f'Subtitles for these languages are missing: {", ".join(need_subs_langs)}.', + need_caps_langs and f'Automatic captions for {len(need_caps_langs)} languages are missing.', + delim=' ')) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles @@ -3759,7 +4098,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if not traverse_obj(initial_data, 'contents'): self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') initial_data = None - if not initial_data: + if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): query = {'videoId': video_id} query.update(self._get_checkok_params()) initial_data = self._extract_response( diff --git a/yt_dlp/extractor/youtube/pot/README.md b/yt_dlp/extractor/youtube/pot/README.md new file mode 100644 index 0000000000..f39e290710 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/README.md @@ -0,0 +1,309 @@ +# YoutubeIE PO Token Provider Framework + +As part of the YouTube extractor, we have a framework for providing PO Tokens programmatically. This can be used by plugins. + +Refer to the [PO Token Guide](https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide) for more information on PO Tokens. + +> [!TIP] +> If publishing a PO Token Provider plugin to GitHub, add the [yt-dlp-pot-provider](https://github.com/topics/yt-dlp-pot-provider) topic to your repository to help users find it. + + +## Public APIs + +- `yt_dlp.extractor.youtube.pot.cache` +- `yt_dlp.extractor.youtube.pot.provider` +- `yt_dlp.extractor.youtube.pot.utils` + +Everything else is internal-only and no guarantees are made about the API stability. + +> [!WARNING] +> We will try our best to maintain stability with the public APIs. +> However, due to the nature of extractors and YouTube, we may need to remove or change APIs in the future. +> If you are using these APIs outside yt-dlp plugins, please account for this by importing them safely. 
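+ One way to account for this in a plugin is to guard the imports; a minimal sketch (the no-op fallback is this example's own convention, not part of the framework):
+
+ ```python
+ try:
+     from yt_dlp.extractor.youtube.pot.provider import PoTokenProvider, register_provider
+ except ImportError:
+     # Framework APIs are unavailable or have moved; the plugin can disable itself
+     PoTokenProvider = register_provider = None
+ ```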
+ + ## PO Token Provider + + `yt_dlp.extractor.youtube.pot.provider` + + ```python + from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, + PoTokenContext, + PoTokenProvider, + PoTokenResponse, + PoTokenProviderError, + PoTokenProviderRejectedRequest, + register_provider, + register_preference, + ExternalRequestFeature, + ) + from yt_dlp.networking.common import Request + from yt_dlp.extractor.youtube.pot.utils import get_webpo_content_binding + from yt_dlp.utils import traverse_obj + from yt_dlp.networking.exceptions import RequestError + import json + + + @register_provider + class MyPoTokenProviderPTP(PoTokenProvider): # Provider class name must end with "PTP" + PROVIDER_VERSION = '0.2.1' + # Define a unique display name for the provider + PROVIDER_NAME = 'my-provider' + BUG_REPORT_LOCATION = 'https://issues.example.com/report' + + # -- Validation shortcuts. Set these to None to disable. -- + + # Innertube Client Name. + # For example, "WEB", "ANDROID", "TVHTML5". + # For a list of WebPO client names, + # see yt_dlp.extractor.youtube.pot.utils.WEBPO_CLIENTS. + # Also see yt_dlp.extractor.youtube._base.INNERTUBE_CLIENTS + # for a list of client names currently supported by the YouTube extractor. + _SUPPORTED_CLIENTS = ('WEB', 'TVHTML5') + + _SUPPORTED_CONTEXTS = ( + PoTokenContext.GVS, + ) + + # If your provider makes external requests to websites (e.g. to youtube.com) + # using another library or service (i.e., not _request_webpage), + # set the request features that are supported here. + # If only using _request_webpage to make external requests, set this to None. + _SUPPORTED_EXTERNAL_REQUEST_FEATURES = ( + ExternalRequestFeature.PROXY_SCHEME_HTTP, + ExternalRequestFeature.SOURCE_ADDRESS, + ExternalRequestFeature.DISABLE_TLS_VERIFICATION + ) + + def is_available(self) -> bool: + """ + Check if the provider is available (e.g. all required dependencies are available) + This is used to determine if the provider should be used and to provide debug information. + + IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations. + + Since this is called multiple times, we recommend caching the result. + """ + return True + + def close(self): + # Optional close hook, called when YoutubeDL is closed. + pass + + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + # ℹ️ If you need to validate the request before making the request to the external source, do it here. + # Raise yt_dlp.extractor.youtube.pot.provider.PoTokenProviderRejectedRequest if the request is not supported. + if request.is_authenticated: + raise PoTokenProviderRejectedRequest( + 'This provider does not support authenticated requests' + ) + + # ℹ️ Settings are pulled from extractor args passed to yt-dlp with the key `youtubepot-<PROVIDER_KEY>`. + # For this example, the extractor arg would be: + # `--extractor-args "youtubepot-mypotokenprovider:url=https://custom.example.com/get_pot"` + external_provider_url = self._configuration_arg( + 'url', default=['https://provider.example.com/get_pot'])[0] + + # See below for logging guidelines + self.logger.trace(f'Using external provider URL: {external_provider_url}') + + # You should use the internal HTTP client to make requests where possible, + # as it will handle cookies and other networking settings passed to yt-dlp.
+ try: + # See docstring in _request_webpage method for request tips + response = self._request_webpage( + Request(external_provider_url, data=json.dumps({ + 'content_binding': get_webpo_content_binding(request), + 'proxy': request.request_proxy, + 'headers': request.request_headers, + 'source_address': request.request_source_address, + 'verify_tls': request.request_verify_tls, + # Important: If your provider has its own caching, please respect `bypass_cache`. + # This may be used in the future to request a fresh PO Token if required. + 'do_not_cache': request.bypass_cache, + }).encode(), proxies={'all': None}), + pot_request=request, + note=( + f'Requesting {request.context.value} PO Token ' + f'for {request.internal_client_name} client from external provider'), + ) + + except RequestError as e: + # ℹ️ If there is an error, raise PoTokenProviderError. + # You can specify whether it is expected or not. If it is unexpected, + # the log will include a link to the bug report location (BUG_REPORT_LOCATION). + raise PoTokenProviderError( + 'Networking error while fetching PO Token from external provider', + expected=True + ) from e + + # Note: PO Token is expected to be base64url encoded + po_token = traverse_obj(response, 'po_token') + if not po_token: + raise PoTokenProviderError( + 'Bad PO Token Response from external provider', + expected=False + ) + + return PoTokenResponse( + po_token=po_token, + # Optional: add a custom expiration timestamp for the token. Used for caching. + # By default, yt-dlp will use the default ttl from a registered cache spec (see below). + # Set to 0 or -1 to not cache this response. + expires_at=None, + ) + + + # If there are multiple PO Token Providers that can handle the same PoTokenRequest, + # you can define a preference function to increase/decrease the priority of providers. + + @register_preference(MyPoTokenProviderPTP) + def my_provider_preference(provider: PoTokenProvider, request: PoTokenRequest) -> int: + return 50 + ``` + + ## Logging Guidelines + + - Use the `self.logger` object to log messages. + - When making HTTP requests or any other expensive operation, use `self.logger.info` to log a message to standard non-verbose output. + - This lets users know what is happening when a time-expensive operation is taking place. + - It is recommended to include the PO Token context and internal client name in the message if possible. + - For example, `self.logger.info(f'Requesting {request.context.value} PO Token for {request.internal_client_name} client from external provider')`. + - Use `self.logger.debug` to log a message to the verbose output (`--verbose`). + - For debugging information visible to users posting verbose logs. + - Try not to log too much; prefer trace logging for detailed debug messages. + - Use `self.logger.trace` to log a message to the PO Token debug output (`--extractor-args "youtube:pot_trace=true"`). + - Log as much as you like here as needed for debugging your provider. + - Avoid logging PO Tokens or any sensitive information to debug or info output. + + ## Debugging + + - Use `-v --extractor-args "youtube:pot_trace=true"` to enable PO Token debug output. + + ## Caching + + > [!WARNING] + > The following describes more advanced features that most users/developers will not need to use. + + > [!IMPORTANT] + > yt-dlp currently has a built-in LRU Memory Cache Provider and a cache spec provider for WebPO Tokens. + > You should only need to implement cache providers if you want an external cache, or a cache spec if you are handling non-WebPO Tokens.
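+ As a concrete illustration of the expiry contract above, a provider can attach its own `expires_at` when returning a token; a minimal sketch (the helper name and the 6-hour TTL are this example's own choices, not framework defaults):
+
+ ```python
+ import time
+
+ from yt_dlp.extractor.youtube.pot.provider import PoTokenResponse
+
+
+ def wrap_token(po_token: str, ttl: int = 21600) -> PoTokenResponse:
+     # Cache providers must respect expires_at, so a stale token will not be served
+     return PoTokenResponse(po_token=po_token, expires_at=int(time.time()) + ttl)
+ ```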
+
+### Cache Providers
+
+`yt_dlp.extractor.youtube.pot.cache`
+
+```python
+from yt_dlp.extractor.youtube.pot.cache import (
+    PoTokenCacheProvider,
+    register_preference,
+    register_provider
+)
+
+from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest
+
+
+@register_provider
+class MyCacheProviderPCP(PoTokenCacheProvider):  # Provider class name must end with "PCP"
+    PROVIDER_VERSION = '0.1.0'
+    # Define a unique display name for the provider
+    PROVIDER_NAME = 'my-cache-provider'
+    BUG_REPORT_LOCATION = 'https://issues.example.com/report'
+
+    def is_available(self) -> bool:
+        """
+        Check if the provider is available (e.g. all required dependencies are available)
+        This is used to determine if the provider should be used and to provide debug information.
+
+        IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations.
+
+        Since this is called multiple times, we recommend caching the result.
+        """
+        return True
+
+    def get(self, key: str) -> str | None:
+        # ℹ️ Similar to PO Token Providers, Cache Providers and Cache Spec Providers
+        # are passed down extractor args matching key youtubepot-<PROVIDER_KEY>.
+        some_setting = self._configuration_arg('some_setting', default=['default_value'])[0]
+        # `self.my_cache` is a stand-in for whatever cache backend your provider uses
+        return self.my_cache.get(key)
+
+    def store(self, key: str, value: str, expires_at: int):
+        # ⚠ expires_at MUST be respected.
+        # Cache entries should not be returned if they have expired.
+        self.my_cache.store(key, value, expires_at)
+
+    def delete(self, key: str):
+        self.my_cache.delete(key)
+
+    def close(self):
+        # Optional close hook, called when the YoutubeDL instance is closed.
+        pass
+
+# If there are multiple PO Token Cache Providers available, you can
+# define a preference function to increase/decrease the priority of providers.
+
+# IMPORTANT: Providers should be preferred in order of cache lookup speed:
+# for example, a memory cache should have a higher preference than a disk cache.
+
+# VERY IMPORTANT: yt-dlp has a built-in memory cache with a priority of 10000.
+# Your cache provider's preference should be lower than this.
+
+
+@register_preference(MyCacheProviderPCP)
+def my_cache_preference(provider: PoTokenCacheProvider, request: PoTokenRequest) -> int:
+    return 50
+```
+
+### Cache Specs
+
+`yt_dlp.extractor.youtube.pot.cache`
+
+These are used to provide information on how to cache a particular PO Token Request.
+You might have a different cache spec for different kinds of PO Tokens.
+
+```python
+from yt_dlp.extractor.youtube.pot.cache import (
+    PoTokenCacheSpec,
+    PoTokenCacheSpecProvider,
+    CacheProviderWritePolicy,
+    register_spec,
+)
+from yt_dlp.utils import traverse_obj
+from yt_dlp.extractor.youtube.pot.provider import PoTokenRequest
+
+
+@register_spec
+class MyCacheSpecProviderPCSP(PoTokenCacheSpecProvider):  # Provider class name must end with "PCSP"
+    PROVIDER_VERSION = '0.1.0'
+    # Define a unique display name for the provider
+    PROVIDER_NAME = 'mycachespec'
+    BUG_REPORT_LOCATION = 'https://issues.example.com/report'
+
+    def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None:
+        client_name = traverse_obj(request.innertube_context, ('client', 'clientName'))
+        if client_name != 'ANDROID':
+            # ℹ️ If the request is not supported by the cache spec, return None
+            return None
+
+        # Generate a cache spec for the request
+        return PoTokenCacheSpec(
+            # Key bindings to uniquely identify the request. These are used to generate a cache key.
+ key_bindings={ + 'client_name': client_name, + 'content_binding': 'unique_content_binding', + 'ip': traverse_obj(request.innertube_context, ('client', 'remoteHost')), + 'source_address': request.request_source_address, + 'proxy': request.request_proxy, + }, + # Default Cache TTL in seconds + default_ttl=21600, + + # Optional: Specify a write policy. + # WRITE_FIRST will write to the highest priority provider only, + # whereas WRITE_ALL will write to all providers. + # WRITE_FIRST may be useful if the PO Token is short-lived + # and there is no use writing to all providers. + write_policy=CacheProviderWritePolicy.WRITE_ALL, + ) +``` \ No newline at end of file diff --git a/yt_dlp/extractor/youtube/pot/__init__.py b/yt_dlp/extractor/youtube/pot/__init__.py new file mode 100644 index 0000000000..febcee0104 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/__init__.py @@ -0,0 +1,3 @@ +# Trigger import of built-in providers +from ._builtin.memory_cache import MemoryLRUPCP as _MemoryLRUPCP # noqa: F401 +from ._builtin.webpo_cachespec import WebPoPCSP as _WebPoPCSP # noqa: F401 diff --git a/yt_dlp/extractor/youtube/pot/_builtin/__init__.py b/yt_dlp/extractor/youtube/pot/_builtin/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/yt_dlp/extractor/youtube/pot/_builtin/memory_cache.py b/yt_dlp/extractor/youtube/pot/_builtin/memory_cache.py new file mode 100644 index 0000000000..9c913e8c98 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_builtin/memory_cache.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +import datetime as dt +import typing +from threading import Lock + +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot._registry import _pot_memory_cache +from yt_dlp.extractor.youtube.pot.cache import ( + PoTokenCacheProvider, + register_preference, + register_provider, +) + + +def initialize_global_cache(max_size: int): + if _pot_memory_cache.value.get('cache') is None: + _pot_memory_cache.value['cache'] = {} + _pot_memory_cache.value['lock'] = Lock() + _pot_memory_cache.value['max_size'] = max_size + + if _pot_memory_cache.value['max_size'] != max_size: + raise ValueError('Cannot change max_size of initialized global memory cache') + + return ( + _pot_memory_cache.value['cache'], + _pot_memory_cache.value['lock'], + _pot_memory_cache.value['max_size'], + ) + + +@register_provider +class MemoryLRUPCP(PoTokenCacheProvider, BuiltinIEContentProvider): + PROVIDER_NAME = 'memory' + DEFAULT_CACHE_SIZE = 25 + + def __init__( + self, + *args, + initialize_cache: typing.Callable[[int], tuple[dict[str, tuple[str, int]], Lock, int]] = initialize_global_cache, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.cache, self.lock, self.max_size = initialize_cache(self.DEFAULT_CACHE_SIZE) + + def is_available(self) -> bool: + return True + + def get(self, key: str) -> str | None: + with self.lock: + if key not in self.cache: + return None + value, expires_at = self.cache.pop(key) + if expires_at < int(dt.datetime.now(dt.timezone.utc).timestamp()): + return None + self.cache[key] = (value, expires_at) + return value + + def store(self, key: str, value: str, expires_at: int): + with self.lock: + if expires_at < int(dt.datetime.now(dt.timezone.utc).timestamp()): + return + if key in self.cache: + self.cache.pop(key) + self.cache[key] = (value, expires_at) + if len(self.cache) > self.max_size: + oldest_key = next(iter(self.cache)) + self.cache.pop(oldest_key) + + def delete(self, key: str): + with self.lock: + 
self.cache.pop(key, None) + + +@register_preference(MemoryLRUPCP) +def memorylru_preference(*_, **__): + # Memory LRU Cache SHOULD be the highest priority + return 10000 diff --git a/yt_dlp/extractor/youtube/pot/_builtin/webpo_cachespec.py b/yt_dlp/extractor/youtube/pot/_builtin/webpo_cachespec.py new file mode 100644 index 0000000000..426b815c7e --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_builtin/webpo_cachespec.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot.cache import ( + CacheProviderWritePolicy, + PoTokenCacheSpec, + PoTokenCacheSpecProvider, + register_spec, +) +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenRequest, +) +from yt_dlp.extractor.youtube.pot.utils import ContentBindingType, get_webpo_content_binding +from yt_dlp.utils import traverse_obj + + +@register_spec +class WebPoPCSP(PoTokenCacheSpecProvider, BuiltinIEContentProvider): + PROVIDER_NAME = 'webpo' + + def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None: + bind_to_visitor_id = self._configuration_arg( + 'bind_to_visitor_id', default=['true'])[0] == 'true' + + content_binding, content_binding_type = get_webpo_content_binding( + request, bind_to_visitor_id=bind_to_visitor_id) + + if not content_binding or not content_binding_type: + return None + + write_policy = CacheProviderWritePolicy.WRITE_ALL + if content_binding_type == ContentBindingType.VIDEO_ID: + write_policy = CacheProviderWritePolicy.WRITE_FIRST + + return PoTokenCacheSpec( + key_bindings={ + 't': 'webpo', + 'cb': content_binding, + 'cbt': content_binding_type.value, + 'ip': traverse_obj(request.innertube_context, ('client', 'remoteHost')), + 'sa': request.request_source_address, + 'px': request.request_proxy, + }, + # Integrity token response usually states it has a ttl of 12 hours (43200 seconds). + # We will default to 6 hours to be safe. 
+ default_ttl=21600, + write_policy=write_policy, + ) diff --git a/yt_dlp/extractor/youtube/pot/_director.py b/yt_dlp/extractor/youtube/pot/_director.py new file mode 100644 index 0000000000..aaf1d5290a --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_director.py @@ -0,0 +1,468 @@ +from __future__ import annotations + +import base64 +import binascii +import dataclasses +import datetime as dt +import hashlib +import json +import typing +import urllib.parse +from collections.abc import Iterable + +from yt_dlp.extractor.youtube.pot._provider import ( + BuiltinIEContentProvider, + IEContentProvider, + IEContentProviderLogger, +) +from yt_dlp.extractor.youtube.pot._registry import ( + _pot_cache_provider_preferences, + _pot_cache_providers, + _pot_pcs_providers, + _pot_providers, + _ptp_preferences, +) +from yt_dlp.extractor.youtube.pot.cache import ( + CacheProviderWritePolicy, + PoTokenCacheProvider, + PoTokenCacheProviderError, + PoTokenCacheSpec, + PoTokenCacheSpecProvider, +) +from yt_dlp.extractor.youtube.pot.provider import ( + PoTokenProvider, + PoTokenProviderError, + PoTokenProviderRejectedRequest, + PoTokenRequest, + PoTokenResponse, + provider_bug_report_message, +) +from yt_dlp.utils import bug_reports_message, format_field, join_nonempty + +if typing.TYPE_CHECKING: + from yt_dlp.extractor.youtube.pot.cache import CacheProviderPreference + from yt_dlp.extractor.youtube.pot.provider import Preference + + +class YoutubeIEContentProviderLogger(IEContentProviderLogger): + def __init__(self, ie, prefix, log_level: IEContentProviderLogger.LogLevel | None = None): + self.__ie = ie + self.prefix = prefix + self.log_level = log_level if log_level is not None else self.LogLevel.INFO + + def _format_msg(self, message: str): + prefixstr = format_field(self.prefix, None, '[%s] ') + return f'{prefixstr}{message}' + + def trace(self, message: str): + if self.log_level <= self.LogLevel.TRACE: + self.__ie.write_debug(self._format_msg('TRACE: ' + message)) + + def debug(self, message: str): + if self.log_level <= self.LogLevel.DEBUG: + self.__ie.write_debug(self._format_msg(message)) + + def info(self, message: str): + if self.log_level <= self.LogLevel.INFO: + self.__ie.to_screen(self._format_msg(message)) + + def warning(self, message: str, *, once=False): + if self.log_level <= self.LogLevel.WARNING: + self.__ie.report_warning(self._format_msg(message), only_once=once) + + def error(self, message: str): + if self.log_level <= self.LogLevel.ERROR: + self.__ie._downloader.report_error(self._format_msg(message), is_error=False) + + +class PoTokenCache: + + def __init__( + self, + logger: IEContentProviderLogger, + cache_providers: list[PoTokenCacheProvider], + cache_spec_providers: list[PoTokenCacheSpecProvider], + cache_provider_preferences: list[CacheProviderPreference] | None = None, + ): + self.cache_providers: dict[str, PoTokenCacheProvider] = { + provider.PROVIDER_KEY: provider for provider in (cache_providers or [])} + self.cache_provider_preferences: list[CacheProviderPreference] = cache_provider_preferences or [] + self.cache_spec_providers: dict[str, PoTokenCacheSpecProvider] = { + provider.PROVIDER_KEY: provider for provider in (cache_spec_providers or [])} + self.logger = logger + + def _get_cache_providers(self, request: PoTokenRequest) -> Iterable[PoTokenCacheProvider]: + """Sorts available cache providers by preference, given a request""" + preferences = { + provider: sum(pref(provider, request) for pref in self.cache_provider_preferences) + for provider in 
self.cache_providers.values() + } + if self.logger.log_level <= self.logger.LogLevel.TRACE: + # calling is_available() for every PO Token provider upfront may have some overhead + self.logger.trace(f'PO Token Cache Providers: {provider_display_list(self.cache_providers.values())}') + self.logger.trace('Cache Provider preferences for this request: {}'.format(', '.join( + f'{provider.PROVIDER_KEY}={pref}' for provider, pref in preferences.items()))) + + return ( + provider for provider in sorted( + self.cache_providers.values(), key=preferences.get, reverse=True) if provider.is_available()) + + def _get_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None: + for provider in self.cache_spec_providers.values(): + if not provider.is_available(): + continue + try: + spec = provider.generate_cache_spec(request) + if not spec: + continue + if not validate_cache_spec(spec): + self.logger.error( + f'PoTokenCacheSpecProvider "{provider.PROVIDER_KEY}" generate_cache_spec() ' + f'returned invalid spec {spec}{provider_bug_report_message(provider)}') + continue + spec = dataclasses.replace(spec, _provider=provider) + self.logger.trace( + f'Retrieved cache spec {spec} from cache spec provider "{provider.PROVIDER_NAME}"') + return spec + except Exception as e: + self.logger.error( + f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache spec provider: ' + f'{e!r}{provider_bug_report_message(provider)}') + continue + return None + + def _generate_key_bindings(self, spec: PoTokenCacheSpec) -> dict[str, str]: + bindings_cleaned = { + **{k: v for k, v in spec.key_bindings.items() if v is not None}, + # Allow us to invalidate caches if such need arises + '_dlp_cache': 'v1', + } + if spec._provider: + bindings_cleaned['_p'] = spec._provider.PROVIDER_KEY + self.logger.trace(f'Generated cache key bindings: {bindings_cleaned}') + return bindings_cleaned + + def _generate_key(self, bindings: dict) -> str: + binding_string = ''.join(repr(dict(sorted(bindings.items())))) + return hashlib.sha256(binding_string.encode()).hexdigest() + + def get(self, request: PoTokenRequest) -> PoTokenResponse | None: + spec = self._get_cache_spec(request) + if not spec: + self.logger.trace('No cache spec available for this request, unable to fetch from cache') + return None + + cache_key = self._generate_key(self._generate_key_bindings(spec)) + self.logger.trace(f'Attempting to access PO Token cache using key: {cache_key}') + + for idx, provider in enumerate(self._get_cache_providers(request)): + try: + self.logger.trace( + f'Attempting to fetch PO Token response from "{provider.PROVIDER_NAME}" cache provider') + cache_response = provider.get(cache_key) + if not cache_response: + continue + try: + po_token_response = PoTokenResponse(**json.loads(cache_response)) + except (TypeError, ValueError, json.JSONDecodeError): + po_token_response = None + if not validate_response(po_token_response): + self.logger.error( + f'Invalid PO Token response retrieved from cache provider "{provider.PROVIDER_NAME}": ' + f'{cache_response}{provider_bug_report_message(provider)}') + provider.delete(cache_key) + continue + self.logger.trace( + f'PO Token response retrieved from cache using "{provider.PROVIDER_NAME}" provider: ' + f'{po_token_response}') + if idx > 0: + # Write back to the highest priority cache provider, + # so we stop trying to fetch from lower priority providers + self.logger.trace('Writing PO Token response to highest priority cache provider') + self.store(request, po_token_response, 
write_policy=CacheProviderWritePolicy.WRITE_FIRST) + + return po_token_response + except PoTokenCacheProviderError as e: + self.logger.warning( + f'Error from "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}') + continue + except Exception as e: + self.logger.error( + f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider)}', + ) + continue + return None + + def store( + self, + request: PoTokenRequest, + response: PoTokenResponse, + write_policy: CacheProviderWritePolicy | None = None, + ): + spec = self._get_cache_spec(request) + if not spec: + self.logger.trace('No cache spec available for this request. Not caching.') + return + + if not validate_response(response): + self.logger.error( + f'Invalid PO Token response provided to PoTokenCache.store(): ' + f'{response}{bug_reports_message()}') + return + + cache_key = self._generate_key(self._generate_key_bindings(spec)) + self.logger.trace(f'Attempting to access PO Token cache using key: {cache_key}') + + default_expires_at = int(dt.datetime.now(dt.timezone.utc).timestamp()) + spec.default_ttl + cache_response = dataclasses.replace(response, expires_at=response.expires_at or default_expires_at) + + write_policy = write_policy or spec.write_policy + self.logger.trace(f'Using write policy: {write_policy}') + + for idx, provider in enumerate(self._get_cache_providers(request)): + try: + self.logger.trace( + f'Caching PO Token response in "{provider.PROVIDER_NAME}" cache provider ' + f'(key={cache_key}, expires_at={cache_response.expires_at})') + provider.store( + key=cache_key, + value=json.dumps(dataclasses.asdict(cache_response)), + expires_at=cache_response.expires_at) + except PoTokenCacheProviderError as e: + self.logger.warning( + f'Error from "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}') + except Exception as e: + self.logger.error( + f'Error occurred with "{provider.PROVIDER_NAME}" PO Token cache provider: ' + f'{e!r}{provider_bug_report_message(provider)}') + + # WRITE_FIRST should not write to lower priority providers in the case the highest priority provider fails + if idx == 0 and write_policy == CacheProviderWritePolicy.WRITE_FIRST: + return + + def close(self): + for provider in self.cache_providers.values(): + provider.close() + for spec_provider in self.cache_spec_providers.values(): + spec_provider.close() + + +class PoTokenRequestDirector: + + def __init__(self, logger: IEContentProviderLogger, cache: PoTokenCache): + self.providers: dict[str, PoTokenProvider] = {} + self.preferences: list[Preference] = [] + self.cache = cache + self.logger = logger + + def register_provider(self, provider: PoTokenProvider): + self.providers[provider.PROVIDER_KEY] = provider + + def register_preference(self, preference: Preference): + self.preferences.append(preference) + + def _get_providers(self, request: PoTokenRequest) -> Iterable[PoTokenProvider]: + """Sorts available providers by preference, given a request""" + preferences = { + provider: sum(pref(provider, request) for pref in self.preferences) + for provider in self.providers.values() + } + if self.logger.log_level <= self.logger.LogLevel.TRACE: + # calling is_available() for every PO Token provider upfront may have some overhead + self.logger.trace(f'PO Token Providers: {provider_display_list(self.providers.values())}') + self.logger.trace('Provider 
preferences for this request: {}'.format(', '.join( + f'{provider.PROVIDER_NAME}={pref}' for provider, pref in preferences.items()))) + + return ( + provider for provider in sorted( + self.providers.values(), key=preferences.get, reverse=True) + if provider.is_available() + ) + + def _get_po_token(self, request) -> PoTokenResponse | None: + for provider in self._get_providers(request): + try: + self.logger.trace( + f'Attempting to fetch a PO Token from "{provider.PROVIDER_NAME}" provider') + response = provider.request_pot(request.copy()) + except PoTokenProviderRejectedRequest as e: + self.logger.trace( + f'PO Token Provider "{provider.PROVIDER_NAME}" rejected this request, ' + f'trying next available provider. Reason: {e}') + continue + except PoTokenProviderError as e: + self.logger.warning( + f'Error fetching PO Token from "{provider.PROVIDER_NAME}" provider: ' + f'{e!r}{provider_bug_report_message(provider) if not e.expected else ""}') + continue + except Exception as e: + self.logger.error( + f'Unexpected error when fetching PO Token from "{provider.PROVIDER_NAME}" provider: ' + f'{e!r}{provider_bug_report_message(provider)}') + continue + + self.logger.trace(f'PO Token response from "{provider.PROVIDER_NAME}" provider: {response}') + + if not validate_response(response): + self.logger.error( + f'Invalid PO Token response received from "{provider.PROVIDER_NAME}" provider: ' + f'{response}{provider_bug_report_message(provider)}') + continue + + return response + + self.logger.trace('No PO Token providers were able to provide a valid PO Token') + return None + + def get_po_token(self, request: PoTokenRequest) -> str | None: + if not request.bypass_cache: + if pot_response := self.cache.get(request): + return clean_pot(pot_response.po_token) + + if not self.providers: + self.logger.trace('No PO Token providers registered') + return None + + pot_response = self._get_po_token(request) + if not pot_response: + return None + + pot_response.po_token = clean_pot(pot_response.po_token) + + if pot_response.expires_at is None or pot_response.expires_at > 0: + self.cache.store(request, pot_response) + else: + self.logger.trace( + f'PO Token response will not be cached (expires_at={pot_response.expires_at})') + + return pot_response.po_token + + def close(self): + for provider in self.providers.values(): + provider.close() + self.cache.close() + + +EXTRACTOR_ARG_PREFIX = 'youtubepot' + + +def initialize_pot_director(ie): + assert ie._downloader is not None, 'Downloader not set' + + enable_trace = ie._configuration_arg( + 'pot_trace', ['false'], ie_key='youtube', casesense=False)[0] == 'true' + + if enable_trace: + log_level = IEContentProviderLogger.LogLevel.TRACE + elif ie.get_param('verbose', False): + log_level = IEContentProviderLogger.LogLevel.DEBUG + else: + log_level = IEContentProviderLogger.LogLevel.INFO + + def get_provider_logger_and_settings(provider, logger_key): + logger_prefix = f'{logger_key}:{provider.PROVIDER_NAME}' + extractor_key = f'{EXTRACTOR_ARG_PREFIX}-{provider.PROVIDER_KEY.lower()}' + return ( + YoutubeIEContentProviderLogger(ie, logger_prefix, log_level=log_level), + ie.get_param('extractor_args', {}).get(extractor_key, {})) + + cache_providers = [] + for cache_provider in _pot_cache_providers.value.values(): + logger, settings = get_provider_logger_and_settings(cache_provider, 'pot:cache') + cache_providers.append(cache_provider(ie, logger, settings)) + cache_spec_providers = [] + for cache_spec_provider in _pot_pcs_providers.value.values(): + logger, settings = 
get_provider_logger_and_settings(cache_spec_provider, 'pot:cache:spec') + cache_spec_providers.append(cache_spec_provider(ie, logger, settings)) + + cache = PoTokenCache( + logger=YoutubeIEContentProviderLogger(ie, 'pot:cache', log_level=log_level), + cache_providers=cache_providers, + cache_spec_providers=cache_spec_providers, + cache_provider_preferences=list(_pot_cache_provider_preferences.value), + ) + + director = PoTokenRequestDirector( + logger=YoutubeIEContentProviderLogger(ie, 'pot', log_level=log_level), + cache=cache, + ) + + ie._downloader.add_close_hook(director.close) + + for provider in _pot_providers.value.values(): + logger, settings = get_provider_logger_and_settings(provider, 'pot') + director.register_provider(provider(ie, logger, settings)) + + for preference in _ptp_preferences.value: + director.register_preference(preference) + + if director.logger.log_level <= director.logger.LogLevel.DEBUG: + # calling is_available() for every PO Token provider upfront may have some overhead + director.logger.debug(f'PO Token Providers: {provider_display_list(director.providers.values())}') + director.logger.debug(f'PO Token Cache Providers: {provider_display_list(cache.cache_providers.values())}') + director.logger.debug(f'PO Token Cache Spec Providers: {provider_display_list(cache.cache_spec_providers.values())}') + director.logger.trace(f'Registered {len(director.preferences)} provider preferences') + director.logger.trace(f'Registered {len(cache.cache_provider_preferences)} cache provider preferences') + + return director + + +def provider_display_list(providers: Iterable[IEContentProvider]): + def provider_display_name(provider): + display_str = join_nonempty( + provider.PROVIDER_NAME, + provider.PROVIDER_VERSION if not isinstance(provider, BuiltinIEContentProvider) else None) + statuses = [] + if not isinstance(provider, BuiltinIEContentProvider): + statuses.append('external') + if not provider.is_available(): + statuses.append('unavailable') + if statuses: + display_str += f' ({", ".join(statuses)})' + return display_str + + return ', '.join(provider_display_name(provider) for provider in providers) or 'none' + + +def clean_pot(po_token: str): + # Clean and validate the PO Token. This will strip invalid characters off + # (e.g. 
additional url params the user may accidentally include) + try: + return base64.urlsafe_b64encode( + base64.urlsafe_b64decode(urllib.parse.unquote(po_token))).decode() + except (binascii.Error, ValueError): + raise ValueError('Invalid PO Token') + + +def validate_response(response: PoTokenResponse | None): + if ( + not isinstance(response, PoTokenResponse) + or not isinstance(response.po_token, str) + or not response.po_token + ): # noqa: SIM103 + return False + + try: + clean_pot(response.po_token) + except ValueError: + return False + + if not isinstance(response.expires_at, int): + return response.expires_at is None + + return response.expires_at <= 0 or response.expires_at > int(dt.datetime.now(dt.timezone.utc).timestamp()) + + +def validate_cache_spec(spec: PoTokenCacheSpec): + return ( + isinstance(spec, PoTokenCacheSpec) + and isinstance(spec.write_policy, CacheProviderWritePolicy) + and isinstance(spec.default_ttl, int) + and isinstance(spec.key_bindings, dict) + and all(isinstance(k, str) for k in spec.key_bindings) + and all(v is None or isinstance(v, str) for v in spec.key_bindings.values()) + and bool([v for v in spec.key_bindings.values() if v is not None]) + ) diff --git a/yt_dlp/extractor/youtube/pot/_provider.py b/yt_dlp/extractor/youtube/pot/_provider.py new file mode 100644 index 0000000000..af7034d227 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_provider.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import abc +import enum +import functools + +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.utils import NO_DEFAULT, bug_reports_message, classproperty, traverse_obj +from yt_dlp.version import __version__ + +# xxx: these could be generalized outside YoutubeIE eventually + + +class IEContentProviderLogger(abc.ABC): + + class LogLevel(enum.IntEnum): + TRACE = 0 + DEBUG = 10 + INFO = 20 + WARNING = 30 + ERROR = 40 + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.upper() + if value in dir(cls): + return cls[value] + + return cls.INFO + + log_level = LogLevel.INFO + + @abc.abstractmethod + def trace(self, message: str): + pass + + @abc.abstractmethod + def debug(self, message: str): + pass + + @abc.abstractmethod + def info(self, message: str): + pass + + @abc.abstractmethod + def warning(self, message: str, *, once=False): + pass + + @abc.abstractmethod + def error(self, message: str): + pass + + +class IEContentProviderError(Exception): + def __init__(self, msg=None, expected=False): + super().__init__(msg) + self.expected = expected + + +class IEContentProvider(abc.ABC): + PROVIDER_VERSION: str = '0.0.0' + BUG_REPORT_LOCATION: str = '(developer has not provided a bug report location)' + + def __init__( + self, + ie: InfoExtractor, + logger: IEContentProviderLogger, + settings: dict[str, list[str]], *_, **__, + ): + self.ie = ie + self.settings = settings or {} + self.logger = logger + super().__init__() + + @classmethod + def __init_subclass__(cls, *, suffix=None, **kwargs): + if suffix: + cls._PROVIDER_KEY_SUFFIX = suffix + return super().__init_subclass__(**kwargs) + + @classproperty + def PROVIDER_NAME(cls) -> str: + return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)] + + @classproperty + def BUG_REPORT_MESSAGE(cls): + return f'please report this issue to the provider developer at {cls.BUG_REPORT_LOCATION} .' 
+ + @classproperty + def PROVIDER_KEY(cls) -> str: + assert hasattr(cls, '_PROVIDER_KEY_SUFFIX'), 'Content Provider implementation must define a suffix for the provider key' + assert cls.__name__.endswith(cls._PROVIDER_KEY_SUFFIX), f'PoTokenProvider class names must end with "{cls._PROVIDER_KEY_SUFFIX}"' + return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)] + + @abc.abstractmethod + def is_available(self) -> bool: + """ + Check if the provider is available (e.g. all required dependencies are available) + This is used to determine if the provider should be used and to provide debug information. + + IMPORTANT: This method should not make any network requests or perform any expensive operations. + It is called multiple times. + """ + raise NotImplementedError + + def close(self): # noqa: B027 + pass + + def _configuration_arg(self, key, default=NO_DEFAULT, *, casesense=False): + """ + @returns A list of values for the setting given by "key" + or "default" if no such key is present + @param default The default value to return when the key is not present (default: []) + @param casesense When false, the values are converted to lower case + """ + val = traverse_obj(self.settings, key) + if val is None: + return [] if default is NO_DEFAULT else default + return list(val) if casesense else [x.lower() for x in val] + + +class BuiltinIEContentProvider(IEContentProvider, abc.ABC): + PROVIDER_VERSION = __version__ + BUG_REPORT_MESSAGE = bug_reports_message(before='') + + +def register_provider_generic( + provider, + base_class, + registry, +): + """Generic function to register a provider class""" + assert issubclass(provider, base_class), f'{provider} must be a subclass of {base_class.__name__}' + assert provider.PROVIDER_KEY not in registry, f'{base_class.__name__} {provider.PROVIDER_KEY} already registered' + registry[provider.PROVIDER_KEY] = provider + return provider + + +def register_preference_generic( + base_class, + registry, + *providers, +): + """Generic function to register a preference for a provider""" + assert all(issubclass(provider, base_class) for provider in providers) + + def outer(preference): + @functools.wraps(preference) + def inner(provider, *args, **kwargs): + if not providers or isinstance(provider, providers): + return preference(provider, *args, **kwargs) + return 0 + registry.add(inner) + return preference + return outer diff --git a/yt_dlp/extractor/youtube/pot/_registry.py b/yt_dlp/extractor/youtube/pot/_registry.py new file mode 100644 index 0000000000..c72a622c12 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/_registry.py @@ -0,0 +1,8 @@ +from yt_dlp.globals import Indirect + +_pot_providers = Indirect({}) +_ptp_preferences = Indirect(set()) +_pot_pcs_providers = Indirect({}) +_pot_cache_providers = Indirect({}) +_pot_cache_provider_preferences = Indirect(set()) +_pot_memory_cache = Indirect({}) diff --git a/yt_dlp/extractor/youtube/pot/cache.py b/yt_dlp/extractor/youtube/pot/cache.py new file mode 100644 index 0000000000..6d69316adc --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/cache.py @@ -0,0 +1,97 @@ +"""PUBLIC API""" + +from __future__ import annotations + +import abc +import dataclasses +import enum +import typing + +from yt_dlp.extractor.youtube.pot._provider import ( + IEContentProvider, + IEContentProviderError, + register_preference_generic, + register_provider_generic, +) +from yt_dlp.extractor.youtube.pot._registry import ( + _pot_cache_provider_preferences, + _pot_cache_providers, + _pot_pcs_providers, +) +from yt_dlp.extractor.youtube.pot.provider 
import PoTokenRequest + + +class PoTokenCacheProviderError(IEContentProviderError): + """An error occurred while fetching a PO Token""" + + +class PoTokenCacheProvider(IEContentProvider, abc.ABC, suffix='PCP'): + @abc.abstractmethod + def get(self, key: str) -> str | None: + pass + + @abc.abstractmethod + def store(self, key: str, value: str, expires_at: int): + pass + + @abc.abstractmethod + def delete(self, key: str): + pass + + +class CacheProviderWritePolicy(enum.Enum): + WRITE_ALL = enum.auto() # Write to all cache providers + WRITE_FIRST = enum.auto() # Write to only the first cache provider + + +@dataclasses.dataclass +class PoTokenCacheSpec: + key_bindings: dict[str, str | None] + default_ttl: int + write_policy: CacheProviderWritePolicy = CacheProviderWritePolicy.WRITE_ALL + + # Internal + _provider: PoTokenCacheSpecProvider | None = None + + +class PoTokenCacheSpecProvider(IEContentProvider, abc.ABC, suffix='PCSP'): + + def is_available(self) -> bool: + return True + + @abc.abstractmethod + def generate_cache_spec(self, request: PoTokenRequest) -> PoTokenCacheSpec | None: + """Generate a cache spec for the given request""" + pass + + +def register_provider(provider: type[PoTokenCacheProvider]): + """Register a PoTokenCacheProvider class""" + return register_provider_generic( + provider=provider, + base_class=PoTokenCacheProvider, + registry=_pot_cache_providers.value, + ) + + +def register_spec(provider: type[PoTokenCacheSpecProvider]): + """Register a PoTokenCacheSpecProvider class""" + return register_provider_generic( + provider=provider, + base_class=PoTokenCacheSpecProvider, + registry=_pot_pcs_providers.value, + ) + + +def register_preference( + *providers: type[PoTokenCacheProvider]) -> typing.Callable[[CacheProviderPreference], CacheProviderPreference]: + """Register a preference for a PoTokenCacheProvider""" + return register_preference_generic( + PoTokenCacheProvider, + _pot_cache_provider_preferences.value, + *providers, + ) + + +if typing.TYPE_CHECKING: + CacheProviderPreference = typing.Callable[[PoTokenCacheProvider, PoTokenRequest], int] diff --git a/yt_dlp/extractor/youtube/pot/provider.py b/yt_dlp/extractor/youtube/pot/provider.py new file mode 100644 index 0000000000..13b3b1f9bb --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/provider.py @@ -0,0 +1,281 @@ +"""PUBLIC API""" + +from __future__ import annotations + +import abc +import copy +import dataclasses +import enum +import functools +import typing +import urllib.parse + +from yt_dlp.cookies import YoutubeDLCookieJar +from yt_dlp.extractor.youtube.pot._provider import ( + IEContentProvider, + IEContentProviderError, + register_preference_generic, + register_provider_generic, +) +from yt_dlp.extractor.youtube.pot._registry import _pot_providers, _ptp_preferences +from yt_dlp.networking import Request, Response +from yt_dlp.utils import traverse_obj +from yt_dlp.utils.networking import HTTPHeaderDict + +__all__ = [ + 'ExternalRequestFeature', + 'PoTokenContext', + 'PoTokenProvider', + 'PoTokenProviderError', + 'PoTokenProviderRejectedRequest', + 'PoTokenRequest', + 'PoTokenResponse', + 'provider_bug_report_message', + 'register_preference', + 'register_provider', +] + + +class PoTokenContext(enum.Enum): + GVS = 'gvs' + PLAYER = 'player' + SUBS = 'subs' + + +@dataclasses.dataclass +class PoTokenRequest: + # YouTube parameters + context: PoTokenContext + innertube_context: InnertubeContext + innertube_host: str | None = None + session_index: str | None = None + player_url: str | None = None + is_authenticated: 
bool = False + video_webpage: str | None = None + internal_client_name: str | None = None + + # Content binding parameters + visitor_data: str | None = None + data_sync_id: str | None = None + video_id: str | None = None + + # Networking parameters + request_cookiejar: YoutubeDLCookieJar = dataclasses.field(default_factory=YoutubeDLCookieJar) + request_proxy: str | None = None + request_headers: HTTPHeaderDict = dataclasses.field(default_factory=HTTPHeaderDict) + request_timeout: float | None = None + request_source_address: str | None = None + request_verify_tls: bool = True + + # Generate a new token, do not use a cached token + # The token should still be cached for future requests + bypass_cache: bool = False + + def copy(self): + return dataclasses.replace( + self, + request_headers=HTTPHeaderDict(self.request_headers), + innertube_context=copy.deepcopy(self.innertube_context), + ) + + +@dataclasses.dataclass +class PoTokenResponse: + po_token: str + expires_at: int | None = None + + +class PoTokenProviderRejectedRequest(IEContentProviderError): + """Reject the PoTokenRequest (cannot handle the request)""" + + +class PoTokenProviderError(IEContentProviderError): + """An error occurred while fetching a PO Token""" + + +class ExternalRequestFeature(enum.Enum): + PROXY_SCHEME_HTTP = enum.auto() + PROXY_SCHEME_HTTPS = enum.auto() + PROXY_SCHEME_SOCKS4 = enum.auto() + PROXY_SCHEME_SOCKS4A = enum.auto() + PROXY_SCHEME_SOCKS5 = enum.auto() + PROXY_SCHEME_SOCKS5H = enum.auto() + SOURCE_ADDRESS = enum.auto() + DISABLE_TLS_VERIFICATION = enum.auto() + + +class PoTokenProvider(IEContentProvider, abc.ABC, suffix='PTP'): + + # Set to None to disable the check + _SUPPORTED_CONTEXTS: tuple[PoTokenContext] | None = () + + # Innertube Client Name. + # For example, "WEB", "ANDROID", "TVHTML5". + # For a list of WebPO client names, see yt_dlp.extractor.youtube.pot.utils.WEBPO_CLIENTS. + # Also see yt_dlp.extractor.youtube._base.INNERTUBE_CLIENTS + # for a list of client names currently supported by the YouTube extractor. + _SUPPORTED_CLIENTS: tuple[str] | None = () + + # If making external requests to websites (i.e. to youtube.com) + # using another library or service (i.e., not _request_webpage), + # add the request features that are supported. + # If only using _request_webpage to make external requests, set this to None. + _SUPPORTED_EXTERNAL_REQUEST_FEATURES: tuple[ExternalRequestFeature] | None = () + + def __validate_request(self, request: PoTokenRequest): + if not self.is_available(): + raise PoTokenProviderRejectedRequest(f'{self.PROVIDER_NAME} is not available') + + # Validate request using built-in settings + if ( + self._SUPPORTED_CONTEXTS is not None + and request.context not in self._SUPPORTED_CONTEXTS + ): + raise PoTokenProviderRejectedRequest( + f'PO Token Context "{request.context}" is not supported by {self.PROVIDER_NAME}') + + if self._SUPPORTED_CLIENTS is not None: + client_name = traverse_obj( + request.innertube_context, ('client', 'clientName')) + if client_name not in self._SUPPORTED_CLIENTS: + raise PoTokenProviderRejectedRequest( + f'Client "{client_name}" is not supported by {self.PROVIDER_NAME}. 
' + f'Supported clients: {", ".join(self._SUPPORTED_CLIENTS) or "none"}') + + self.__validate_external_request_features(request) + + @functools.cached_property + def _supported_proxy_schemes(self): + return { + scheme: feature + for scheme, feature in { + 'http': ExternalRequestFeature.PROXY_SCHEME_HTTP, + 'https': ExternalRequestFeature.PROXY_SCHEME_HTTPS, + 'socks4': ExternalRequestFeature.PROXY_SCHEME_SOCKS4, + 'socks4a': ExternalRequestFeature.PROXY_SCHEME_SOCKS4A, + 'socks5': ExternalRequestFeature.PROXY_SCHEME_SOCKS5, + 'socks5h': ExternalRequestFeature.PROXY_SCHEME_SOCKS5H, + }.items() + if feature in (self._SUPPORTED_EXTERNAL_REQUEST_FEATURES or []) + } + + def __validate_external_request_features(self, request: PoTokenRequest): + if self._SUPPORTED_EXTERNAL_REQUEST_FEATURES is None: + return + + if request.request_proxy: + scheme = urllib.parse.urlparse(request.request_proxy).scheme + if scheme.lower() not in self._supported_proxy_schemes: + raise PoTokenProviderRejectedRequest( + f'External requests by "{self.PROVIDER_NAME}" provider do not ' + f'support proxy scheme "{scheme}". Supported proxy schemes: ' + f'{", ".join(self._supported_proxy_schemes) or "none"}') + + if ( + request.request_source_address + and ExternalRequestFeature.SOURCE_ADDRESS not in self._SUPPORTED_EXTERNAL_REQUEST_FEATURES + ): + raise PoTokenProviderRejectedRequest( + f'External requests by "{self.PROVIDER_NAME}" provider ' + f'do not support setting source address') + + if ( + not request.request_verify_tls + and ExternalRequestFeature.DISABLE_TLS_VERIFICATION not in self._SUPPORTED_EXTERNAL_REQUEST_FEATURES + ): + raise PoTokenProviderRejectedRequest( + f'External requests by "{self.PROVIDER_NAME}" provider ' + f'do not support ignoring TLS certificate failures') + + def request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + self.__validate_request(request) + return self._real_request_pot(request) + + @abc.abstractmethod + def _real_request_pot(self, request: PoTokenRequest) -> PoTokenResponse: + """To be implemented by subclasses""" + pass + + # Helper functions + + def _request_webpage(self, request: Request, pot_request: PoTokenRequest | None = None, note=None, **kwargs) -> Response: + """Make a request using the internal HTTP Client. + Use this instead of calling requests, urllib3 or other HTTP client libraries directly! + + YouTube cookies will be automatically applied if this request is made to YouTube. + + @param request: The request to make + @param pot_request: The PoTokenRequest to use. Request parameters will be merged from it. + @param note: Custom log message to display when making the request. Set to `False` to disable logging. + + Tips: + - Disable proxy (e.g. if calling local service): Request(..., proxies={'all': None}) + - Set request timeout: Request(..., extensions={'timeout': 5.0}) + """ + req = request.copy() + + # Merge some ctx request settings into the request + # Most of these will already be used by the configured ydl instance, + # however, the YouTube extractor may override some. 
+ if pot_request is not None: + req.headers = HTTPHeaderDict(pot_request.request_headers, req.headers) + req.proxies = req.proxies or ({'all': pot_request.request_proxy} if pot_request.request_proxy else {}) + + if pot_request.request_cookiejar is not None: + req.extensions['cookiejar'] = req.extensions.get('cookiejar', pot_request.request_cookiejar) + + if note is not False: + self.logger.info(str(note) if note else 'Requesting webpage') + return self.ie._downloader.urlopen(req) + + +def register_provider(provider: type[PoTokenProvider]): + """Register a PoTokenProvider class""" + return register_provider_generic( + provider=provider, + base_class=PoTokenProvider, + registry=_pot_providers.value, + ) + + +def provider_bug_report_message(provider: IEContentProvider, before=';'): + msg = provider.BUG_REPORT_MESSAGE + + before = before.rstrip() + if not before or before.endswith(('.', '!', '?')): + msg = msg[0].title() + msg[1:] + + return f'{before} {msg}' if before else msg + + +def register_preference(*providers: type[PoTokenProvider]) -> typing.Callable[[Preference], Preference]: + """Register a preference for a PoTokenProvider""" + return register_preference_generic( + PoTokenProvider, + _ptp_preferences.value, + *providers, + ) + + +if typing.TYPE_CHECKING: + Preference = typing.Callable[[PoTokenProvider, PoTokenRequest], int] + __all__.append('Preference') + + # Barebones innertube context. There may be more fields. + class ClientInfo(typing.TypedDict, total=False): + hl: str | None + gl: str | None + remoteHost: str | None + deviceMake: str | None + deviceModel: str | None + visitorData: str | None + userAgent: str | None + clientName: str + clientVersion: str + osName: str | None + osVersion: str | None + + class InnertubeContext(typing.TypedDict, total=False): + client: ClientInfo + request: dict + user: dict diff --git a/yt_dlp/extractor/youtube/pot/utils.py b/yt_dlp/extractor/youtube/pot/utils.py new file mode 100644 index 0000000000..7a5b7d4ab3 --- /dev/null +++ b/yt_dlp/extractor/youtube/pot/utils.py @@ -0,0 +1,73 @@ +"""PUBLIC API""" + +from __future__ import annotations + +import base64 +import contextlib +import enum +import re +import urllib.parse + +from yt_dlp.extractor.youtube.pot.provider import PoTokenContext, PoTokenRequest +from yt_dlp.utils import traverse_obj + +__all__ = ['WEBPO_CLIENTS', 'ContentBindingType', 'get_webpo_content_binding'] + +WEBPO_CLIENTS = ( + 'WEB', + 'MWEB', + 'TVHTML5', + 'WEB_EMBEDDED_PLAYER', + 'WEB_CREATOR', + 'WEB_REMIX', + 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', +) + + +class ContentBindingType(enum.Enum): + VISITOR_DATA = 'visitor_data' + DATASYNC_ID = 'datasync_id' + VIDEO_ID = 'video_id' + VISITOR_ID = 'visitor_id' + + +def get_webpo_content_binding( + request: PoTokenRequest, + webpo_clients=WEBPO_CLIENTS, + bind_to_visitor_id=False, +) -> tuple[str | None, ContentBindingType | None]: + + client_name = traverse_obj(request.innertube_context, ('client', 'clientName')) + if not client_name or client_name not in webpo_clients: + return None, None + + if request.context == PoTokenContext.GVS or client_name in ('WEB_REMIX', ): + if request.is_authenticated: + return request.data_sync_id, ContentBindingType.DATASYNC_ID + else: + if bind_to_visitor_id: + visitor_id = _extract_visitor_id(request.visitor_data) + if visitor_id: + return visitor_id, ContentBindingType.VISITOR_ID + return request.visitor_data, ContentBindingType.VISITOR_DATA + + elif request.context in (PoTokenContext.PLAYER, PoTokenContext.SUBS): + return request.video_id, 
ContentBindingType.VIDEO_ID + + return None, None + + +def _extract_visitor_id(visitor_data): + if not visitor_data: + return None + + # Attempt to extract the visitor ID from the visitor_data protobuf + # xxx: ideally should use a protobuf parser + with contextlib.suppress(Exception): + visitor_id = base64.urlsafe_b64decode( + urllib.parse.unquote_plus(visitor_data))[2:13].decode() + # check that visitor id is all letters and numbers + if re.fullmatch(r'[A-Za-z0-9_-]{11}', visitor_id): + return visitor_id + + return None diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 4bef1017cc..10be582a33 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -1,331 +1,410 @@ +import itertools +import json import re +import time from .common import InfoExtractor from ..utils import ( - NO_DEFAULT, ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, join_nonempty, - merge_dicts, + make_archive_id, parse_codecs, - qualities, - traverse_obj, - try_get, + parse_iso8601, + parse_qs, + smuggle_url, unified_timestamp, - update_url_query, + unsmuggle_url, url_or_none, urljoin, + variadic, ) +from ..utils.traversal import require, traverse_obj class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') + _TOKEN_CACHE_PARAMS = ('zdf', 'api-token') + _token_cache = {} - def _download_v2_doc(self, document_id): - return self._download_json( - f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', - document_id) + def _get_api_token(self): + # As of 2025-03, this API is used by the Android app for getting tokens. + # An equivalent token could be extracted from the webpage should the API become unavailable. + # For now this allows the extractor to avoid dealing with Next.js hydration data. 
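+        # Populate the in-memory token cache from yt-dlp's persistent cache on first use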
+ if not self._token_cache: + self._token_cache.update(self.cache.load(*self._TOKEN_CACHE_PARAMS, default={})) - def _call_api(self, url, video_id, item, api_token=None, referrer=None): - headers = {} - if api_token: - headers['Api-Auth'] = f'Bearer {api_token}' - if referrer: - headers['Referer'] = referrer + if traverse_obj(self._token_cache, ('expires', {int_or_none}), default=0) < int(time.time()): + self._token_cache.update(self._download_json( + 'https://zdf-prod-futura.zdf.de/mediathekV2/token', None, + 'Downloading API token', 'Failed to download API token')) + self.cache.store(*self._TOKEN_CACHE_PARAMS, self._token_cache) + + return f'{self._token_cache["type"]} {self._token_cache["token"]}' + + def _call_api(self, url, video_id, item, api_token=None): return self._download_json( - url, video_id, f'Downloading JSON {item}', headers=headers) + url, video_id, f'Downloading {item}', f'Failed to download {item}', + headers=filter_dict({'Api-Auth': api_token})) + + def _parse_aspect_ratio(self, aspect_ratio): + if not aspect_ratio or not isinstance(aspect_ratio, str): + return None + mobj = re.match(r'(?P\d+):(?P\d+)', aspect_ratio) + return int(mobj.group('width')) / int(mobj.group('height')) if mobj else None + + def _extract_chapters(self, data): + return traverse_obj(data, (lambda _, v: v['anchorOffset'], { + 'start_time': ('anchorOffset', {float_or_none}), + 'title': ('anchorLabel', {str}), + })) or None @staticmethod def _extract_subtitles(src): + seen_urls = set() subtitles = {} - for caption in try_get(src, lambda x: x['captions'], list) or []: + for caption in src: subtitle_url = url_or_none(caption.get('uri')) - if subtitle_url: - lang = caption.get('language', 'deu') - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - }) + if not subtitle_url or subtitle_url in seen_urls: + continue + seen_urls.add(subtitle_url) + lang = caption.get('language') or 'deu' + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + }) return subtitles - def _extract_format(self, video_id, formats, format_urls, meta): - format_url = url_or_none(meta.get('url')) - if not format_url or format_url in format_urls: - return - format_urls.add(format_url) + def _expand_ptmd_template(self, api_base_url, template): + return urljoin(api_base_url, template.replace('{playerId}', 'android_native_6')) - mime_type, ext = meta.get('mimeType'), determine_ext(format_url) - if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - new_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False) - elif mime_type == 'application/f4m+xml' or ext == 'f4m': - new_formats = self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) - elif ext == 'mpd': - new_formats = self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False) - else: - f = parse_codecs(meta.get('mimeCodec')) - if not f and meta.get('type'): - data = meta['type'].split('_') - if try_get(data, lambda x: x[2]) == ext: - f = {'vcodec': data[0], 'acodec': data[1]} - f.update({ - 'url': format_url, - 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), - 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), - }) - new_formats = [f] - formats.extend(merge_dicts(f, { - 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '), - 'language': meta.get('language'), - 'language_preference': 10 if meta.get('class') == 'main' 
else -10 if meta.get('class') == 'ad' else -1, - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - }) for f in new_formats) + def _extract_ptmd(self, ptmd_urls, video_id, api_token=None, aspect_ratio=None): + content_id = None + duration = None + formats, src_captions = [], [] + seen_urls = set() - def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): - ptmd = self._call_api( - ptmd_url, video_id, 'metadata', api_token, referrer) + for ptmd_url in variadic(ptmd_urls): + ptmd_url, smuggled_data = unsmuggle_url(ptmd_url, {}) + # Is it a DGS variant? (*D*eutsche *G*ebärden*s*prache' / German Sign Language) + is_dgs = smuggled_data.get('vod_media_type') == 'DGS' + ptmd = self._call_api(ptmd_url, video_id, 'PTMD data', api_token) - content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] + basename = ( + ptmd.get('basename') + # ptmd_url examples: + # https://api.zdf.de/tmd/2/android_native_6/vod/ptmd/mediathek/250328_sendung_hsh/3 + # https://tmd.phoenix.de/tmd/2/android_native_6/vod/ptmd/phoenix/221215_phx_spitzbergen + or self._search_regex(r'/vod/ptmd/[^/?#]+/(\w+)', ptmd_url, 'content ID', default=None)) + # If this is_dgs, then it's from ZDFIE and it only uses content_id for _old_archive_ids, + # and the old version of the extractor didn't extract DGS variants, so ignore basename + if not content_id and not is_dgs: + content_id = basename - formats = [] - track_uris = set() - for p in ptmd['priorityList']: - formitaeten = p.get('formitaeten') - if not isinstance(formitaeten, list): - continue - for f in formitaeten: - f_qualities = f.get('qualities') - if not isinstance(f_qualities, list): - continue - for quality in f_qualities: - tracks = try_get(quality, lambda x: x['audio']['tracks'], list) - if not tracks: - continue - for track in tracks: - self._extract_format( - content_id, formats, track_uris, { - 'url': track.get('uri'), - 'type': f.get('type'), - 'mimeType': f.get('mimeType'), - 'quality': quality.get('quality'), - 'class': track.get('class'), - 'language': track.get('language'), + if not duration: + duration = traverse_obj(ptmd, ('attributes', 'duration', 'value', {float_or_none(scale=1000)})) + src_captions += traverse_obj(ptmd, ('captions', ..., {dict})) + + for stream in traverse_obj(ptmd, ('priorityList', ..., 'formitaeten', ..., {dict})): + for quality in traverse_obj(stream, ('qualities', ..., {dict})): + for variant in traverse_obj(quality, ('audio', 'tracks', lambda _, v: url_or_none(v['uri']))): + format_url = variant['uri'] + if format_url in seen_urls: + continue + seen_urls.add(format_url) + ext = determine_ext(format_url) + if ext == 'm3u8': + fmts = self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif ext == 'mpd': + fmts = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + else: + height = int_or_none(quality.get('highestVerticalResolution')) + width = round(aspect_ratio * height) if aspect_ratio and height else None + fmts = [{ + 'url': format_url, + **parse_codecs(quality.get('mimeCodec')), + 'height': height, + 'width': width, + 'format_id': join_nonempty('http', stream.get('type')), + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), + }] + f_class = variant.get('class') + for f in fmts: + formats.append({ + **f, + 'format_id': join_nonempty(f.get('format_id'), is_dgs and 'dgs'), + 'format_note': join_nonempty( + f_class, is_dgs and 'German Sign Language', f.get('format_note'), delim=', '), + 'language': 
variant.get('language') or f.get('language'), + 'preference': -2 if is_dgs else -1, + 'language_preference': 10 if f_class == 'main' else -10 if f_class == 'ad' else -1, }) - duration = float_or_none(try_get( - ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) - return { - 'extractor_key': ZDFIE.ie_key(), - 'id': content_id, + 'id': content_id or video_id, 'duration': duration, 'formats': formats, - 'subtitles': self._extract_subtitles(ptmd), - '_format_sort_fields': ('tbr', 'res', 'quality', 'language_preference'), + 'subtitles': self._extract_subtitles(src_captions), } - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) + def _download_graphql(self, item_id, data_desc, query=None, body=None): + assert query or body, 'One of query or body is required' - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] + return self._download_json( + 'https://api.zdf.de/graphql', item_id, + f'Downloading {data_desc}', f'Failed to download {data_desc}', + query=query, data=json.dumps(body).encode() if body else None, + headers=filter_dict({ + 'Api-Auth': self._get_api_token(), + 'Apollo-Require-Preflight': True, + 'Content-Type': 'application/json' if body else None, + })) - t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = traverse_obj(t, ( - (('streams', 'default'), None), - ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), - ), get_all=False) - if not ptmd_path: - raise ExtractorError('Could not extract ptmd_path') - - info = self._extract_ptmd( - urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) - - thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P\d+)x(?P\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) - - chapter_marks = t.get('streamAnchorTag') or [] - chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) - chapters = [{ - 'start_time': chap.get('anchorOffset'), - 'end_time': next_chap.get('anchorOffset'), - 'title': chap.get('anchorLabel'), - } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] - - return merge_dicts(info, { - 'title': title, - 'description': content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - 'chapters': chapters or None, - 'episode': title, - **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', { - 'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}), - 'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}), - 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), - 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), - 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), - 'episode_number': ('episodeNumber', {int_or_none}), - 
- - def _extract_regular(self, url, player, video_id, query=None): - player_url = player['content'] - - content = self._call_api( - update_url_query(player_url, query), - video_id, 'content', player['apiToken'], url) - - return self._extract_entry(player_url, player, content, video_id) - - def _extract_mobile(self, video_id): - video = self._download_v2_doc(video_id) - - formats = [] - formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) - document = formitaeten and video['document'] - if formitaeten: - title = document['titel'] - content_id = document['basename'] - - format_urls = set() - for f in formitaeten or []: - self._extract_format(content_id, formats, format_urls, f) - - thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - - return { - 'id': content_id, - 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( - try_get(video, lambda x: x['meta']['editorialDate'], str)), - 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), - 'formats': formats, - } + @staticmethod + def _extract_thumbnails(source): + return [{ + 'id': str(format_id), + 'url': url, + 'preference': 1 if format_id == 'original' else 0, + **traverse_obj(re.search(r'(?P<width>\d+|auto)[Xx](?P<height>\d+|auto)', str(format_id)), { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + } for format_id, url in traverse_obj(source, ({dict.items}, lambda _, v: url_or_none(v[1])))] class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _VALID_URL = [ + r'https?://(?:www\.)?zdf\.de/(?:video|play)/(?:[^/?#]+/)*(?P<id>[^/?#]+)', + # /nachrichten/ sub-site URLs and legacy redirects from before the redesign in 2025-03 + r'https?://(?:www\.)?zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#]+)\.html', + ] + IE_NAME = 'zdf'
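
A quick, illustrative check (not taken from the test suite) that the new and legacy URL layouts resolve to the same video ID:

import re

patterns = [
    r'https?://(?:www\.)?zdf\.de/(?:video|play)/(?:[^/?#]+/)*(?P<id>[^/?#]+)',
    r'https?://(?:www\.)?zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#]+)\.html',
]
for url in (
    'https://www.zdf.de/video/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100',
    'https://www.zdf.de/dokumentation/dokumentation-sonstige/sylt-deutschlands-edles-nordlicht-100.html',
):
    assert next(m['id'] for p in patterns if (m := re.match(p, url))) == 'sylt-deutschlands-edles-nordlicht-100'
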
_TESTS = [{ - # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', + # Standalone video (i.e. not part of a playlist), video URL + 'url': 'https://www.zdf.de/video/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100', 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613948400, - 'upload_date': '20210221', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { - # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html - 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', - 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + # Standalone video (i.e. not part of a playlist), play URL + 'url': 'https://www.zdf.de/play/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100', 'info_dict': { - 'id': '141007_ab18_10wochensommer_film', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'title': 'Ab 18! - 10 Wochen Sommer', - 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', - 'duration': 2660, - 'timestamp': 1608604200, - 'upload_date': '20201222', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + 'params': {'skip_download': True}, }, { - 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html', + # Standalone video (i.e.
not part of a playlist), legacy URL before website redesign in 2025-03 + 'url': 'https://www.zdf.de/dokumentation/dokumentation-sonstige/sylt-deutschlands-edles-nordlicht-100.html', 'info_dict': { - 'id': '211230_sendung_hjo', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'description': 'md5:47dff85977bde9fb8cba9e9c9b929839', - 'duration': 1890.0, - 'upload_date': '20211230', - 'chapters': list, - 'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e', - 'title': 'heute journal vom 30.12.2021', - 'timestamp': 1640897100, + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + 'params': {'skip_download': True}, }, { - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + # Video belongs to a playlist, video URL + 'url': 'https://www.zdf.de/video/dokus/die-magie-der-farben-116/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', 'info_dict': { - 'id': '151025_magie_farben2_tex', + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', 'ext': 'mp4', - 'duration': 2615.0, - 'title': 'Die Magie der Farben (2/2)', + 'title': 'Von Königspurpur bis Jeansblau', 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'timestamp': 1465021200, - 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', - 'upload_date': '20160604', - 'episode': 'Die Magie der Farben (2/2)', - 'episode_id': 'POS_954f4170-36a5-4a41-a6cf-78f1f3b1f127', - 'season': 'Staffel 1', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', 'season_number': 1, - 'series_id': 'a39900dd-cdbd-4a6a-a413-44e8c6ae18bc', - 'season_id': '5a92e619-8a0f-4410-a3d5-19c76fbebb37', + 'episode': 'Episode 2', 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + }, { + # Video belongs to a playlist, play URL + 'url': 'https://www.zdf.de/play/dokus/die-magie-der-farben-116/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Von Königspurpur bis Jeansblau', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + 'params': {'skip_download': True}, + }, { + # Video belongs to a playlist, legacy URL before website redesign in 2025-03 + 'url': 
'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Von Königspurpur bis Jeansblau', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + 'params': {'skip_download': True}, + }, { + # Video with chapters + # Also: video with sign-language variant + 'url': 'https://www.zdf.de/video/magazine/heute-journal-104/heute-journal-vom-19-12-2021-100', + 'md5': '6ada39465497a84fb98d48ffff69e7b7', + 'info_dict': { + 'id': 'heute-journal-vom-19-12-2021-100', + 'ext': 'mp4', + 'title': 'heute journal vom 19.12.2021', + 'description': 'md5:02504cf3b03777ff32fcc927d260c5dd', + 'duration': 1770.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/273e5545-16e7-4ca3-898e-52fe9e06d964?layout=1920x1080', + 'chapters': 'count:11', + 'series': 'heute journal', + 'series_id': 'heute-journal-104', + 'season': 'Season 2021', + 'season_number': 2021, + 'episode': 'Episode 370', + 'episode_number': 370, + 'timestamp': 1639946700, + 'upload_date': '20211219', + # Videos with sign language variants must not have a 'dgs' suffix on their old archive IDs. + '_old_archive_ids': ['zdf 211219_sendung_hjo'], + }, + }, { + # Video that requires fallback extraction + 'url': 'https://www.zdf.de/nachrichten/politik/deutschland/koalitionsverhandlungen-spd-cdu-csu-dobrindt-100.html', + 'md5': 'c3a78514dd993a5781aa3afe50db51e2', + 'info_dict': { + 'id': 'koalitionsverhandlungen-spd-cdu-csu-dobrindt-100', + 'ext': 'mp4', + 'title': 'Dobrindt schließt Steuererhöhungen aus', + 'description': 'md5:9a117646d7b8df6bc902eb543a9c9023', + 'duration': 325, + 'thumbnail': 'https://www.zdf.de/assets/dobrindt-csu-berlin-direkt-100~1920x1080?cb=1743357653736', + 'timestamp': 1743374520, + 'upload_date': '20250330', + '_old_archive_ids': ['zdf 250330_clip_2_bdi'], }, }, { 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', 'md5': '57af4423db0455a3975d2dc4578536bc', 'info_dict': { + 'id': 'funk-alles-ist-verzaubert-102', 'ext': 'mp4', - 'id': 'video_funk_1770473', - 'duration': 1278.0, 'title': 'Alles ist verzaubert', 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'duration': 1278.0, + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~original?cb=1663848412907', + 'series': 'DRUCK', + 'series_id': 'funk-collection-funk-11790-1590', + 'season': 'Season 7', + 'season_number': 7, + 'episode': 'Episode 1', + 'episode_number': 1, 'timestamp': 1635520560, - 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~1920x1080?cb=1663848412907', 'upload_date': '20211029', - 'episode': 'Alles ist verzaubert', + '_old_archive_ids': ['zdf video_funk_1770473'], }, + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': 'das-geld-anderer-leute-100', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 
'duration': 2581.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=1920x1080', + 'series': 'SOKO Stuttgart', + 'series_id': 'soko-stuttgart-104', + 'season': 'Season 11', + 'season_number': 11, + 'episode': 'Episode 10', + 'episode_number': 10, + 'timestamp': 1728983700, + 'upload_date': '20241015', + '_old_archive_ids': ['zdf 191205_1800_sendung_sok8'], + }, + }, { + 'url': 'https://www.zdf.de/serien/northern-lights/begegnung-auf-der-bruecke-100.html', + 'info_dict': { + 'id': 'begegnung-auf-der-bruecke-100', + 'ext': 'webm', + 'title': 'Begegnung auf der Brücke', + 'description': 'md5:e53a555da87447f7f1207f10353f8e45', + 'duration': 3083.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/c5ff1d1f-f5c8-4468-86ac-1b2f1dbecc76?layout=1920x1080', + 'series': 'Northern Lights', + 'series_id': 'northern-lights-100', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + 'timestamp': 1738546500, + 'upload_date': '20250203', + '_old_archive_ids': ['zdf 240319_2310_sendung_not'], + }, + 'params': {'skip_download': 'geo-restricted http format'}, + }, { + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'only_matching': True, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', @@ -349,155 +428,323 @@ class ZDFIE(ZDFBaseIE): 'only_matching': True, }, { 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', - 'info_dict': { - 'id': 'video_artede_083871-001-A', - 'ext': 'mp4', - 'title': 'Tödliche Flucht (1/6)', - 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', - 'duration': 3193.0, - 'timestamp': 1641355200, - 'upload_date': '20220105', - }, - 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"', - }, { - 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', - 'info_dict': { - 'id': '191205_1800_sendung_sok8', - 'ext': 'mp4', - 'title': 'Das Geld anderer Leute', - 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', - 'duration': 2581.0, - 'timestamp': 1728983700, - 'upload_date': '20241015', - 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', - 'series': 'SOKO Stuttgart', - 'series_id': 'f862ce9a-6dd1-4388-a698-22b36ac4c9e9', - 'season': 'Staffel 11', - 'season_number': 11, - 'season_id': 'ae1b4990-6d87-4970-a571-caccf1ba2879', - 'episode': 'Das Geld anderer Leute', - 'episode_number': 10, - 'episode_id': 'POS_7f367934-f2f0-45cb-9081-736781ff2d23', - }, + 'only_matching': True, }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/unser-gruener-planet-wuesten-doku-100.html', - 'info_dict': { - 'id': '220525_green_planet_makingof_1_tropen_tex', - 'ext': 'mp4', - 'title': 'Making-of Unser grüner Planet - Tropen', - 'description': 'md5:d7c6949dc7c75c73c4ad51c785fb0b79', - 'duration': 435.0, - 'timestamp': 1653811200, - 'upload_date': '20220529', - 'format_note': 'hd, main', - 'thumbnail': 
'https://www.zdf.de/assets/unser-gruener-planet-making-of-1-tropen-100~3840x2160?cb=1653493335577', - 'episode': 'Making-of Unser grüner Planet - Tropen', - }, - 'skip': 'No longer available: "Leider kein Video verfügbar"', - }, { - 'url': 'https://www.zdf.de/serien/northern-lights/begegnung-auf-der-bruecke-100.html', - 'info_dict': { - 'id': '240319_2310_sendung_not', - 'ext': 'mp4', - 'title': 'Begegnung auf der Brücke', - 'description': 'md5:e53a555da87447f7f1207f10353f8e45', - 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/c5ff1d1f-f5c8-4468-86ac-1b2f1dbecc76?layout=2400x1350', - 'upload_date': '20250203', - 'duration': 3083.0, - 'timestamp': 1738546500, - 'series_id': '1d7a1879-01ee-4468-8237-c6b4ecd633c7', - 'series': 'Northern Lights', - 'season': 'Staffel 1', - 'season_number': 1, - 'season_id': '22ac26a2-4ea2-4055-ac0b-98b755cdf718', - 'episode': 'Begegnung auf der Brücke', - 'episode_number': 1, - 'episode_id': 'POS_71049438-024b-471f-b472-4fe2e490d1fb', - }, + 'only_matching': True, }] + _GRAPHQL_QUERY = ''' +query VideoByCanonical($canonical: String!) { + videoByCanonical(canonical: $canonical) { + canonical + title + leadParagraph + editorialDate + teaser { + description + image { + list + } + } + episodeInfo { + episodeNumber + seasonNumber + } + smartCollection { + canonical + title + } + currentMedia { + nodes { + ptmdTemplate + ... on VodMedia { + duration + aspectRatio + streamAnchorTags { + nodes { + anchorOffset + anchorLabel + } + } + vodMediaType + label + } + ... on LiveMedia { + start + stop + encryption + liveMediaType + label + } + id + } + } + } +} + ''' + + def _extract_ptmd(self, *args, **kwargs): + ptmd_data = super()._extract_ptmd(*args, **kwargs) + # This was the video id before the graphql redesign, other extractors still use it as such + old_archive_id = ptmd_data.pop('id') + ptmd_data['_old_archive_ids'] = [make_archive_id(self, old_archive_id)] + return ptmd_data + + # This fallback should generally only happen for pages under `zdf.de/nachrichten`. + # They are on a separate website for which GraphQL often doesn't return results. + # The API used here is no longer in use by official clients and likely deprecated. 
+ # Long-term, news documents probably should use the API used by the mobile apps: + # https://zdf-prod-futura.zdf.de/news/documents/ (note 'news' vs 'mediathekV2') + def _extract_fallback(self, document_id): + video = self._download_json( + f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', + document_id, note='Downloading fallback metadata', + errnote='Failed to download fallback metadata') + document = video['document'] + + ptmd_url = traverse_obj(document, ( + ('streamApiUrlAndroid', ('streams', 0, 'streamApiUrlAndroid')), + {url_or_none}, any, {require('PTMD URL')})) + + thumbnails = [] + for thumbnail_key, thumbnail in traverse_obj(document, ('teaserBild', {dict.items}, ...)): + thumbnail_url = traverse_obj(thumbnail, ('url', {url_or_none})) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'thumbnails': thumbnails, + **traverse_obj(video, { + 'title': ('document', 'titel', {str}), + 'description': ('document', 'beschreibung', {str}), + 'timestamp': ( + (('document', 'date'), ('meta', 'editorialDate')), + {unified_timestamp}, any), + 'subtitles': ('document', 'captions', {self._extract_subtitles}), + }), + **self._extract_ptmd(ptmd_url, document_id, self._get_api_token()), + 'id': document_id, + } + def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_graphql(video_id, 'video metadata', body={ + 'operationName': 'VideoByCanonical', + 'query': self._GRAPHQL_QUERY, + 'variables': {'canonical': video_id}, + })['data']['videoByCanonical'] - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id, query={'profile': 'player-3'}) + if not video_data: + return self._extract_fallback(video_id) - return self._extract_mobile(video_id) + aspect_ratio = None + ptmd_urls = [] + for node in traverse_obj(video_data, ('currentMedia', 'nodes', lambda _, v: v['ptmdTemplate'])): + ptmd_url = self._expand_ptmd_template('https://api.zdf.de', node['ptmdTemplate']) + # Smuggle vod_media_type so that _extract_ptmd is aware of 'DGS' variants + if vod_media_type := node.get('vodMediaType'): + ptmd_url = smuggle_url(ptmd_url, {'vod_media_type': vod_media_type}) + ptmd_urls.append(ptmd_url) + if not aspect_ratio: + aspect_ratio = self._parse_aspect_ratio(node.get('aspectRatio')) + + return { + **traverse_obj(video_data, { + 'title': ('title', {str}), + 'description': (('leadParagraph', ('teaser', 'description')), any, {str}), + 'timestamp': ('editorialDate', {parse_iso8601}), + 'thumbnails': ('teaser', 'image', 'list', {self._extract_thumbnails}), + 'episode_number': ('episodeInfo', 'episodeNumber', {int_or_none}), + 'season_number': ('episodeInfo', 'seasonNumber', {int_or_none}), + 'series': ('smartCollection', 'title', {str}), + 'series_id': ('smartCollection', 'canonical', {str}), + 'chapters': ('currentMedia', 'nodes', 0, 'streamAnchorTags', 'nodes', {self._extract_chapters}), + }), + **self._extract_ptmd(ptmd_urls, video_id, self._get_api_token(), aspect_ratio), + 'id': video_id, + } class ZDFChannelIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#]+)' + IE_NAME = 'zdf:channel' _TESTS = [{ + # Playlist, legacy URL before website redesign in 2025-03
'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { - 'id': 'das-aktuelle-sportstudio', + 'id': 'das-aktuelle-sportstudio-220', 'title': 'das aktuelle sportstudio', + 'description': 'md5:e46c785324238a03edcf8b301c5fd5dc', }, - 'playlist_mincount': 18, + 'playlist_mincount': 25, }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e', + # Playlist, current URL + 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio-220', 'info_dict': { - 'id': 'planet-e', - 'title': 'planet e.', - 'description': 'md5:87e3b9c66a63cf1407ee443d2c4eb88e', + 'id': 'das-aktuelle-sportstudio-220', + 'title': 'das aktuelle sportstudio', + 'description': 'md5:e46c785324238a03edcf8b301c5fd5dc', }, - 'playlist_mincount': 50, + 'playlist_mincount': 25, + }, { + # Standalone video (i.e. not part of a playlist), collection URL + 'add_ie': [ZDFIE.ie_key()], + 'url': 'https://www.zdf.de/dokus/sylt---deutschlands-edles-nordlicht-movie-100', + 'info_dict': { + 'id': 'sylt-deutschlands-edles-nordlicht-100', + 'ext': 'mp4', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], + }, + 'params': {'skip_download': True}, }, { 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', 'info_dict': { - 'id': 'aktenzeichen-xy-ungeloest', + 'id': 'aktenzeichen-xy-ungeloest-110', 'title': 'Aktenzeichen XY... Ungelöst', - 'description': 'md5:623ede5819c400c6d04943fa8100e6e7', + 'description': 'md5:b79ac0d64b979e53cbe510c0ca9cb7be', }, 'playlist_mincount': 2, }, { 'url': 'https://www.zdf.de/serien/taunuskrimi/', - 'only_matching': True, + 'info_dict': { + 'id': 'taunuskrimi-100', + 'title': 'Taunuskrimi', + 'description': 'md5:ee7204e9c625c3b611d1274f9d0e3070', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://www.zdf.de/serien/taunuskrimi/?staffel=1', + 'info_dict': { + 'id': 'taunuskrimi-100-s1', + 'title': 'Taunuskrimi - Season 1', + 'description': 'md5:ee7204e9c625c3b611d1274f9d0e3070', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.zdf.de/magazine/heute-journal-104', + 'info_dict': { + 'id': 'heute-journal-104', + 'title': 'heute journal', + 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', + }, + 'playlist_mincount': 500, + }, { + 'url': 'https://www.zdf.de/magazine/heute-journal-104?staffel=2024', + 'info_dict': { + 'id': 'heute-journal-104-s2024', + 'title': 'heute journal - Season 2024', + 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', + }, + 'playlist_count': 242, }] + _PAGE_SIZE = 24 + @classmethod def suitable(cls, url): return False if ZDFIE.suitable(url) else super().suitable(url) - def _extract_entry(self, entry): - return self.url_result( - entry['sharingUrl'], ZDFIE, **traverse_obj(entry, { - 'id': ('basename', {str}), - 'title': ('titel', {str}), - 'description': ('beschreibung', {str}), - 'duration': ('length', {float_or_none}), - 'season_number': ('seasonNumber', {int_or_none}), - 'episode_number': ('episodeNumber', {int_or_none}), - })) + def _fetch_page(self, playlist_id, canonical_id, season_idx, season_number, page_number, cursor=None): + return self._download_graphql( + playlist_id, f'season {season_number} page {page_number} JSON', query={ + 'operationName': 
'seasonByCanonical', + 'variables': json.dumps(filter_dict({ + 'seasonIndex': season_idx, + 'canonical': canonical_id, + 'episodesPageSize': self._PAGE_SIZE, + 'episodesAfter': cursor, + })), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': '9412a0f4ac55dc37d46975d461ec64bfd14380d815df843a1492348f77b5c99a', + }, + }), + })['data']['smartCollectionByCanonical'] - def _entries(self, data, document_id): - for entry in traverse_obj(data, ( - 'cluster', lambda _, v: v['type'] == 'teaser', - # If 'brandId' differs, it is a 'You might also like' video. Filter these out - 'teaser', lambda _, v: v['type'] == 'video' and v['brandId'] == document_id and v['sharingUrl'], - )): - yield self._extract_entry(entry) + def _entries(self, playlist_id, canonical_id, season_numbers, requested_season_number): + for season_idx, season_number in enumerate(season_numbers): + if requested_season_number is not None and requested_season_number != season_number: + continue + + cursor = None + for page_number in itertools.count(1): + page = self._fetch_page( + playlist_id, canonical_id, season_idx, season_number, page_number, cursor) + + nodes = traverse_obj(page, ('seasons', 'nodes', ...)) + + for episode in traverse_obj(nodes, ( + ..., 'episodes', 'nodes', lambda _, v: url_or_none(v['sharingUrl']), + )): + yield self.url_result( + episode['sharingUrl'], ZDFIE, + **traverse_obj(episode, { + 'id': ('canonical', {str}), + 'title': ('teaser', 'title', {str}), + 'description': (('leadParagraph', ('teaser', 'description')), any, {str}), + 'timestamp': ('editorialDate', {parse_iso8601}), + 'episode_number': ('episodeInfo', 'episodeNumber', {int_or_none}), + 'season_number': ('episodeInfo', 'seasonNumber', {int_or_none}), + })) + + page_info = traverse_obj(nodes, (-1, 'episodes', 'pageInfo', {dict})) or {} + if not page_info.get('hasNextPage') or not page_info.get('endCursor'): + break + cursor = page_info['endCursor']
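
For readers unfamiliar with persisted GraphQL queries, a rough sketch of what _fetch_page sends (values copied from above; the Api-Auth token is supplied separately): only the sha256 hash of a query the server already knows, plus JSON-encoded variables, go over the wire.

import json
import urllib.parse

params = urllib.parse.urlencode({
    'operationName': 'seasonByCanonical',
    'variables': json.dumps({'canonical': 'das-aktuelle-sportstudio-220', 'seasonIndex': 0, 'episodesPageSize': 24}),
    'extensions': json.dumps({'persistedQuery': {
        'version': 1, 'sha256Hash': '9412a0f4ac55dc37d46975d461ec64bfd14380d815df843a1492348f77b5c99a'}}),
})
# GET https://api.zdf.de/graphql?<params> (plus the Api-Auth header) returns
# data.smartCollectionByCanonical; _entries() keeps paging while episodes.pageInfo.hasNextPage
# is true, passing pageInfo.endCursor back in as episodesAfter.
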
def _real_extract(self, url): - channel_id = self._match_id(url) - webpage = self._download_webpage(url, channel_id) - document_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<doc_id>(?:(?!\1).)+)\1', webpage, 'document id', group='doc_id') - data = self._download_v2_doc(document_id) + canonical_id = self._match_id(url) + # Make sure to get the correct ID in case of redirects + urlh = self._request_webpage(url, canonical_id) + canonical_id = self._search_regex(self._VALID_URL, urlh.url, 'channel id', group='id') + season_number = traverse_obj(parse_qs(url), ('staffel', -1, {int_or_none})) + playlist_id = join_nonempty(canonical_id, season_number and f's{season_number}') - main_video = traverse_obj(data, ( - 'cluster', lambda _, v: v['type'] == 'teaserContent', - 'teaser', lambda _, v: v['type'] == 'video' and v['basename'] and v['sharingUrl'], any)) or {} + collection_data = self._download_graphql( + playlist_id, 'smart collection data', query={ + 'operationName': 'GetSmartCollectionByCanonical', + 'variables': json.dumps({ + 'canonical': canonical_id, + 'videoPageSize': 100, # Use max page size to get episodes from all seasons + }), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': 'cb49420e133bd668ad895a8cea0e65cba6aa11ac1cacb02341ff5cf32a17cd02', + }, + }), + })['data']['smartCollectionByCanonical'] + video_data = traverse_obj(collection_data, ('video', {dict})) or {} + season_numbers = traverse_obj(collection_data, ('seasons', 'seasons', ..., 'number', {int_or_none})) - if not self._yes_playlist(channel_id, main_video.get('basename')): - return self._extract_entry(main_video) + if not self._yes_playlist( + season_numbers and playlist_id, + url_or_none(video_data.get('sharingUrl')) and video_data.get('canonical'), + ): + return self.url_result(video_data['sharingUrl'], ZDFIE, video_data['canonical']) + + if season_number is not None and season_number not in season_numbers: + raise ExtractorError(f'Season {season_number} was not found in the collection data') return self.playlist_result( - self._entries(data, document_id), channel_id, - re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', self._og_search_title(webpage) or '')[0] or None, - join_nonempty( - 'headline', 'text', delim='\n\n', - from_dict=traverse_obj(data, ('shortText', {dict}), default={})) or None) + self._entries(playlist_id, canonical_id, season_numbers, season_number), + playlist_id, join_nonempty( + traverse_obj(collection_data, ('title', {str})), + season_number and f'Season {season_number}', delim=' - '), + traverse_obj(collection_data, ('infoText', {str}))) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index ac06297154..45aeffa229 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -188,6 +188,7 @@ def js_number_to_string(val: float, radix: int = 10): _NAME_RE = r'[a-zA-Z_$][\w$]*' _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) _QUOTES = '\'"/' +_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?' class JS_Undefined:
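
Since Python's re module cannot match arbitrarily nested pairs, the new _NESTED_BRACKETS pattern hard-codes support for up to two inner levels of [...] inside an index expression. An illustrative check:

import re

_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?'
assert re.fullmatch(_NESTED_BRACKETS, 'a')
assert re.fullmatch(_NESTED_BRACKETS, 'a[b[c]]')
assert not re.fullmatch(_NESTED_BRACKETS, 'a[b[c[d]]]')  # a third level is not recognized
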
@@ -301,7 +302,7 @@ def _separate(expr, delim=',', max_split=None): OP_CHARS = '+-*/%&|^=<>!,;{}:[' if not expr: return - counters = {k: 0 for k in _MATCHING_PARENS.values()} + counters = dict.fromkeys(_MATCHING_PARENS.values(), 0) start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): @@ -589,36 +590,12 @@ def dict_item(key, val): return ret, True return ret, False - for m in re.finditer(rf'''(?x) - (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| - (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)''', expr): - var = m.group('var1') or m.group('var2') - start, end = m.span() - sign = m.group('pre_sign') or m.group('post_sign') - ret = local_vars[var] - local_vars[var] += 1 if sign[0] == '+' else -1 - if m.group('pre_sign'): - ret = local_vars[var] - expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] - - if not expr: - return None, should_return - m = re.match(fr'''(?x) - (?P<assign> - (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* + (?P<out>{_NAME_RE})(?:\[(?P<index>{_NESTED_BRACKETS})\])?\s* (?P<op>{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? =(?!=)(?P<expr>.*)$ - )|(?P<return> - (?!if|return|true|false|null|undefined|NaN)(?P<name>{_NAME_RE})$ - )|(?P<indexing> - (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ - )|(?P<attribute> - (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* - )|(?P<function> - (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ - )''', expr) - if m and m.group('assign'): + ''', expr) + if m: # We are assigning a value to a variable left_val = local_vars.get(m.group('out')) if not m.group('index'): @@ -636,7 +613,35 @@ def dict_item(key, val): m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion) return left_val[idx], should_return - elif expr.isdigit(): + for m in re.finditer(rf'''(?x) + (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| + (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)''', expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] + local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] + + if not expr: + return None, should_return + + m = re.match(fr'''(?x) + (?P<return> + (?!if|return|true|false|null|undefined|NaN)(?P<name>{_NAME_RE})$ + )|(?P<attribute> + (?P<var>{_NAME_RE})(?: + (?P<nullish>\?)?\.(?P<member>[^(]+)| + \[(?P<member2>{_NESTED_BRACKETS})\] + )\s* + )|(?P<indexing> + (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ + )|(?P<function> + (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ + )''', expr) + if expr.isdigit(): return int(expr), should_return elif expr == 'break': @@ -707,7 +712,7 @@ def eval_method(): if obj is NO_DEFAULT: if variable not in self._objects: try: - self._objects[variable] = self.extract_object(variable) + self._objects[variable] = self.extract_object(variable, local_vars) except self.Exception: if not nullish: raise @@ -847,7 +852,7 @@ def interpret_expression(self, expr, local_vars, allow_recursion): raise self.Exception('Cannot return from an expression', expr) return ret - def extract_object(self, objname): + def extract_object(self, objname, *global_stack): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( @@ -869,7 +874,8 @@ def extract_object(self, objname): for f in fields_m: argnames = f.group('args').split(',') name = remove_quotes(f.group('key')) - obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), f'F<{name}>') + obj[name] = function_with_repr( + self.build_function(argnames, f.group('code'), *global_stack), f'F<{name}>') return obj @@ -890,9 +896,9 @@ def extract_function_code(self, funcname): code, _ = self._separate_at_paren(func_m.group('code')) return [x.strip() for x in func_m.group('args').split(',')], code - def extract_function(self, funcname): + def extract_function(self, funcname, *global_stack): return function_with_repr( - self.extract_function_from_code(*self.extract_function_code(funcname)), + self.extract_function_from_code(*self.extract_function_code(funcname), *global_stack), f'F<{funcname}>') def extract_function_from_code(self, argnames, code, *global_stack): diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 1eaa0ee5fd..39158a8cc1 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -3,6 +3,7 @@ from .common import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index 0643348e7e..747879da87 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -1,11 +1,13 @@ from __future__ import annotations import io +import itertools import math import re import urllib.parse -from ._helper
import InstanceStoreMixin, select_proxy +from ._helper import InstanceStoreMixin +from ..utils.networking import select_proxy from .common import ( Features, Request, @@ -31,9 +33,9 @@ curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3])) -if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)): +if curl_cffi_version != (0, 5, 10) and not (0, 10) <= curl_cffi_version: curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' - raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported') + raise ImportError('Only curl_cffi versions 0.5.10 and 0.10.x are supported') import curl_cffi.requests from curl_cffi.const import CurlECode, CurlOpt @@ -97,7 +99,7 @@ def read(self, amt=None): return self.fp.read(amt) except curl_cffi.requests.errors.RequestsError as e: if e.code == CurlECode.PARTIAL_FILE: - content_length = int_or_none(e.response.headers.get('Content-Length')) + content_length = e.response and int_or_none(e.response.headers.get('Content-Length')) raise IncompleteRead( partial=self.fp.bytes_read, expected=content_length - self.fp.bytes_read if content_length is not None else None, @@ -105,6 +107,51 @@ def read(self, amt=None): raise TransportError(cause=e) from e +# See: https://github.com/lexiforest/curl_cffi?tab=readme-ov-file#supported-impersonate-browsers +# https://github.com/lexiforest/curl-impersonate?tab=readme-ov-file#supported-browsers +BROWSER_TARGETS: dict[tuple[int, ...], dict[str, ImpersonateTarget]] = { + (0, 5): { + 'chrome99': ImpersonateTarget('chrome', '99', 'windows', '10'), + 'chrome99_android': ImpersonateTarget('chrome', '99', 'android', '12'), + 'chrome100': ImpersonateTarget('chrome', '100', 'windows', '10'), + 'chrome101': ImpersonateTarget('chrome', '101', 'windows', '10'), + 'chrome104': ImpersonateTarget('chrome', '104', 'windows', '10'), + 'chrome107': ImpersonateTarget('chrome', '107', 'windows', '10'), + 'chrome110': ImpersonateTarget('chrome', '110', 'windows', '10'), + 'edge99': ImpersonateTarget('edge', '99', 'windows', '10'), + 'edge101': ImpersonateTarget('edge', '101', 'windows', '10'), + 'safari15_3': ImpersonateTarget('safari', '15.3', 'macos', '11'), + 'safari15_5': ImpersonateTarget('safari', '15.5', 'macos', '12'), + }, + (0, 7): { + 'chrome116': ImpersonateTarget('chrome', '116', 'windows', '10'), + 'chrome119': ImpersonateTarget('chrome', '119', 'macos', '14'), + 'chrome120': ImpersonateTarget('chrome', '120', 'macos', '14'), + 'chrome123': ImpersonateTarget('chrome', '123', 'macos', '14'), + 'chrome124': ImpersonateTarget('chrome', '124', 'macos', '14'), + 'safari17_0': ImpersonateTarget('safari', '17.0', 'macos', '14'), + 'safari17_2_ios': ImpersonateTarget('safari', '17.2', 'ios', '17.2'), + }, + (0, 9): { + 'safari15_3': ImpersonateTarget('safari', '15.3', 'macos', '14'), + 'safari15_5': ImpersonateTarget('safari', '15.5', 'macos', '14'), + 'chrome119': ImpersonateTarget('chrome', '119', 'macos', '14'), + 'chrome120': ImpersonateTarget('chrome', '120', 'macos', '14'), + 'chrome123': ImpersonateTarget('chrome', '123', 'macos', '14'), + 'chrome124': ImpersonateTarget('chrome', '124', 'macos', '14'), + 'chrome131': ImpersonateTarget('chrome', '131', 'macos', '14'), + 'chrome131_android': ImpersonateTarget('chrome', '131', 'android', '14'), + 'chrome133a': ImpersonateTarget('chrome', '133', 'macos', '15'), + 'firefox133': ImpersonateTarget('firefox', '133', 'macos', '14'), + 'safari18_0': ImpersonateTarget('safari', '18.0', 'macos', '15'), + 
'safari18_0_ios': ImpersonateTarget('safari', '18.0', 'ios', '18.0'), + }, + (0, 10): { + 'firefox135': ImpersonateTarget('firefox', '135', 'macos', '14'), + }, +} + + @register_rh class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): RH_NAME = 'curl_cffi' @@ -112,30 +159,21 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') _SUPPORTED_IMPERSONATE_TARGET_MAP = { - **({ - ImpersonateTarget('chrome', '124', 'macos', '14'): curl_cffi.requests.BrowserType.chrome124, - ImpersonateTarget('chrome', '123', 'macos', '14'): curl_cffi.requests.BrowserType.chrome123, - ImpersonateTarget('chrome', '120', 'macos', '14'): curl_cffi.requests.BrowserType.chrome120, - ImpersonateTarget('chrome', '119', 'macos', '14'): curl_cffi.requests.BrowserType.chrome119, - ImpersonateTarget('chrome', '116', 'windows', '10'): curl_cffi.requests.BrowserType.chrome116, - } if curl_cffi_version >= (0, 7, 0) else {}), - ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110, - ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107, - ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104, - ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101, - ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100, - ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99, - ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101, - ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99, - **({ - ImpersonateTarget('safari', '17.0', 'macos', '14'): curl_cffi.requests.BrowserType.safari17_0, - } if curl_cffi_version >= (0, 7, 0) else {}), - ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5, - ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3, - ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android, - **({ - ImpersonateTarget('safari', '17.2', 'ios', '17.2'): curl_cffi.requests.BrowserType.safari17_2_ios, - } if curl_cffi_version >= (0, 7, 0) else {}), + target: name if curl_cffi_version >= (0, 9) else curl_cffi.requests.BrowserType[name] + for name, target in dict(sorted(itertools.chain.from_iterable( + targets.items() + for version, targets in BROWSER_TARGETS.items() + if curl_cffi_version >= version + ), key=lambda x: ( + # deprioritize mobile targets since they give very different behavior + x[1].os not in ('ios', 'android'), + # prioritize edge < firefox < safari < chrome + ('edge', 'firefox', 'safari', 'chrome').index(x[1].client), + # prioritize newest version + float(x[1].version) if x[1].version else 0, + # group by os name + x[1].os, + ), reverse=True)).items() } def _create_instance(self, cookiejar=None): diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index b86d3606d8..ef9c8bafab 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -13,7 +13,6 @@ from .exceptions import RequestError from ..dependencies import certifi from ..socks import ProxyType, sockssocket -from ..utils import format_field, traverse_obj if typing.TYPE_CHECKING: from collections.abc import Iterable @@ -82,19 
+81,6 @@ def unquote_if_non_empty(s): } -def select_proxy(url, proxies): - """Unified proxy selector for all backends""" - url_components = urllib.parse.urlparse(url) - if 'no' in proxies: - hostport = url_components.hostname + format_field(url_components.port, None, ':%s') - if urllib.request.proxy_bypass_environment(hostport, {'no': proxies['no']}): - return - elif urllib.request.proxy_bypass(hostport): # check system settings - return - - return traverse_obj(proxies, url_components.scheme or 'http', 'all') - - def get_redirect_method(method, status): """Unified redirect method handling""" diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 5b6b264a68..d02e976b57 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -10,7 +10,7 @@ from ..dependencies import brotli, requests, urllib3 from ..utils import bug_reports_message, int_or_none, variadic -from ..utils.networking import normalize_url +from ..utils.networking import normalize_url, select_proxy if requests is None: raise ImportError('requests module is not installed') @@ -41,7 +41,6 @@ create_socks_proxy_socket, get_redirect_method, make_socks_proxy_opts, - select_proxy, ) from .common import ( Features, diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index a188b35f57..cb7a430bb3 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -26,7 +26,6 @@ create_socks_proxy_socket, get_redirect_method, make_socks_proxy_opts, - select_proxy, ) from .common import Features, RequestHandler, Response, register_rh from .exceptions import ( @@ -41,7 +40,7 @@ from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError from ..utils import update_url_query -from ..utils.networking import normalize_url +from ..utils.networking import normalize_url, select_proxy SUPPORTED_ENCODINGS = ['gzip', 'deflate'] CONTENT_DECODE_ERRORS = [zlib.error, OSError] diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index d29f8e45a9..fd8730ac7e 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -11,8 +11,8 @@ create_connection, create_socks_proxy_socket, make_socks_proxy_opts, - select_proxy, ) +from ..utils.networking import select_proxy from .common import Features, Response, register_rh from .exceptions import ( CertificateVerifyError, diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index ddceaa9a97..e33769422b 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -505,6 +505,7 @@ def copy(self): HEADRequest = functools.partial(Request, method='HEAD') +PATCHRequest = functools.partial(Request, method='PATCH') PUTRequest = functools.partial(Request, method='PUT') diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 91c2635a79..b4d3d4d668 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -150,6 +150,15 @@ def format_option_strings(option): return opts +_PRESET_ALIASES = { + 'mp3': ['-f', 'ba[acodec^=mp3]/ba/b', '-x', '--audio-format', 'mp3'], + 'aac': ['-f', 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b', '-x', '--audio-format', 'aac'], + 'mp4': ['--merge-output-format', 'mp4', '--remux-video', 'mp4', '-S', 'vcodec:h264,lang,quality,res,fps,hdr:12,acodec:aac'], + 'mkv': ['--merge-output-format', 'mkv', '--remux-video', 'mkv'], + 'sleep': ['--sleep-subtitles', '5', '--sleep-requests', '0.75', '--sleep-interval', '10', '--max-sleep-interval', '20'], +} + + class _YoutubeDLOptionParser(optparse.OptionParser): # 
optparse is deprecated since Python 3.2. So assume a stable interface even for private methods ALIAS_DEST = '_triggered_aliases' @@ -215,6 +224,25 @@ def _match_long_opt(self, opt): return e.possibilities[0] raise + def format_option_help(self, formatter=None): + assert formatter, 'Formatter can not be None' + formatted_help = super().format_option_help(formatter=formatter) + formatter.indent() + heading = formatter.format_heading('Preset Aliases') + formatter.indent() + description = formatter.format_description( + 'Predefined aliases for convenience and ease of use. Note that future versions of yt-dlp ' + 'may add or adjust presets, but the existing preset names will not be changed or removed') + result = [] + for name, args in _PRESET_ALIASES.items(): + option = optparse.Option('-t', help=shlex.join(args)) + formatter.option_strings[option] = f'-t {name}' + result.append(formatter.format_option(option)) + formatter.dedent() + formatter.dedent() + help_lines = '\n'.join(result) + return f'{formatted_help}\n{heading}{description}\n{help_lines}' + def create_parser(): def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): @@ -317,6 +345,13 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): parser.rargs[:0] = shlex.split( opts if value is None else opts.format(*map(shlex.quote, value))) + def _preset_alias_callback(option, opt_str, value, parser): + if not value: + return + if value not in _PRESET_ALIASES: + raise optparse.OptionValueError(f'Unknown preset alias: {value}') + parser.rargs[:0] = _PRESET_ALIASES[value] + general = optparse.OptionGroup(parser, 'General Options') general.add_option( '-h', '--help', dest='print_help', action='store_true', @@ -438,7 +473,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): general.add_option( '--live-from-start', action='store_true', dest='live_from_start', - help='Download livestreams from the start. Currently only supported for YouTube (Experimental)') + help='Download livestreams from the start. Currently experimental and only supported for YouTube and Twitch') general.add_option( '--no-live-from-start', action='store_false', dest='live_from_start', @@ -500,7 +535,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext', '-prefer-vp9-sort'], '2021': ['2022', 'no-certifi', 'filename-sanitization'], '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], - '2023': ['prefer-vp9-sort'], + '2023': ['2024', 'prefer-vp9-sort'], + '2024': [], }, }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' @@ -512,12 +548,21 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): help=( 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". ' 'Arguments are parsed according to the Python string formatting mini-language. ' - 'E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' + 'E.g. --alias get-audio,-X "-S aext:{0},abr -x --audio-format {0}" creates options ' '"--get-audio" and "-X" that takes an argument (ARG0) and expands to ' - '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' + '"-S aext:ARG0,abr -x --audio-format ARG0". 
All defined aliases are listed in the --help output. ' 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. ' 'This option can be used multiple times')) + general.add_option( + '-t', '--preset-alias', + metavar='PRESET', dest='_', type='str', + action='callback', callback=_preset_alias_callback, + help=( + 'Applies a predefined set of options. e.g. --preset-alias mp3. ' + f'The following presets are available: {", ".join(_PRESET_ALIASES)}. ' + 'See the "Preset Aliases" section at the end for more info. ' + 'This option can be used multiple times')) network = optparse.OptionGroup(parser, 'Network Options') network.add_option(
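
To make the preset mechanism concrete, a small usage sketch (assuming a checkout with this patch applied): `-t mp3` rewrites argv before regular parsing, so it behaves exactly as if the expanded flags had been typed out.

from yt_dlp.options import create_parser

opts, args = create_parser().parse_args(['-t', 'mp3', 'https://example.com/video'])
assert opts.format == 'ba[acodec^=mp3]/ba/b'            # from the preset's -f
assert opts.extractaudio and opts.audioformat == 'mp3'  # from -x --audio-format mp3
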
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index e59e9832bd..59a49aa578 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -743,7 +743,7 @@ def add(meta_list, info_list=None): if value not in ('', None): value = ', '.join(map(str, variadic(value))) value = value.replace('\0', '') # nul character cannot be passed in command line - metadata['common'].update({meta_f: value for meta_f in variadic(meta_list)}) + metadata['common'].update(dict.fromkeys(variadic(meta_list), value)) # Info on media metadata/metadata supported by ffmpeg: # https://wiki.multimedia.cx/index.php/FFmpeg_Metadata diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 360f5ad58c..de289cb780 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -117,7 +117,7 @@ def current_git_head(): } _NON_UPDATEABLE_REASONS = { - **{variant: None for variant in _FILE_SUFFIXES}, # Updatable + **dict.fromkeys(_FILE_SUFFIXES), # Updatable **{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release' for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()}, 'py2exe': 'py2exe is no longer supported by yt-dlp; This executable cannot be updated', @@ -202,7 +202,7 @@ class UpdateInfo: requested_version: str | None = None commit: str | None = None - binary_name: str | None = _get_binary_name() # noqa: RUF009: Always returns the same value + binary_name: str | None = _get_binary_name() # noqa: RUF009 # Always returns the same value checksum: str | None = None diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 4093c238c2..20aa341ca3 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -54,7 +54,7 @@ from ..dependencies import xattr from ..globals import IN_CLI -__name__ = __name__.rsplit('.', 1)[0] # noqa: A001 # Pretend to be the parent module +__name__ = __name__.rsplit('.', 1)[0] # noqa: A001 # Pretend to be the parent module class NO_DEFAULT: @@ -2044,7 +2044,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?|wss?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): @@ -2767,7 +2767,8 @@ def process_escape(match): def template_substitute(match): evaluated = js_to_json(match.group(1), vars, strict=strict) if evaluated[0] == '"': - return json.loads(evaluated) + with contextlib.suppress(json.JSONDecodeError): + return json.loads(evaluated) return evaluated def fix_kv(m): @@ -3247,7 +3248,7 @@ def _match_one(filter_part, dct, incomplete): op = lambda attr, value: not unnegated_op(attr, value) else: op = unnegated_op - comparison_value = m['quotedstrval'] or m['strval'] or m['intval'] + comparison_value = m['quotedstrval'] or m['strval'] if m['quote']: comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote']) actual_value = dct.get(m['key']) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index 542abace87..9fcab6456f 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -10,7 +10,8 @@ if typing.TYPE_CHECKING: T = typing.TypeVar('T') -from ._utils import NO_DEFAULT, remove_start +from ._utils import NO_DEFAULT, remove_start, format_field +from .traversal import traverse_obj def random_user_agent(): @@ -278,3 +279,16 @@ def normalize_url(url): query=escape_rfc3986(url_parsed.query), fragment=escape_rfc3986(url_parsed.fragment), ).geturl() + + +def select_proxy(url, proxies): + """Unified proxy selector for all backends""" + url_components = urllib.parse.urlparse(url) + if 'no' in proxies: + hostport = url_components.hostname + format_field(url_components.port, None, ':%s') + if urllib.request.proxy_bypass_environment(hostport, {'no': proxies['no']}): + return + elif urllib.request.proxy_bypass(hostport): # check system settings + return + + return traverse_obj(proxies, url_components.scheme or 'http', 'all') diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 7346ca49c9..c375cc6ad8 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.02.19' +__version__ = '2025.05.22' -RELEASE_GIT_HEAD = '4985a4041770eaa0016271809a1fd950dc809a55' +RELEASE_GIT_HEAD = '7977b329ed97b216e37bd402f4935f28c00eac9e' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.02.19' +_pkg_version = '2025.05.22'
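
Finally, since select_proxy() only changed modules (out of networking._helper into utils.networking), its behavior is easy to pin down with a sketch (hypothetical proxy URLs; assumes no system-wide bypass rules are configured):

from yt_dlp.utils.networking import select_proxy

proxies = {'https': 'socks5h://127.0.0.1:1080', 'all': 'http://proxy.local:3128', 'no': 'localhost'}
assert select_proxy('https://example.com', proxies) == 'socks5h://127.0.0.1:1080'  # scheme match
assert select_proxy('ftp://example.com', proxies) == 'http://proxy.local:3128'     # 'all' fallback
assert select_proxy('http://localhost/', proxies) is None                          # bypassed via 'no'
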