diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 20e5e944f..c8d3de06b 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -2,13 +2,11 @@ name: Broken site support description: Report issue with yt-dlp on a supported site labels: [triage, site-bug] body: - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. - type: checkboxes id: checklist attributes: @@ -24,9 +22,7 @@ body: required: true - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766), [the FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ), and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%3Aissue%20-label%3Aspam%20%20) for similar issues **including closed ones**. DO NOT post duplicates required: true - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required - type: input @@ -47,6 +43,8 @@ body: id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true @@ -78,11 +76,3 @@ body: render: shell validations: required: true - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
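As an aside on the new bugtracker links above: the `?q=` parameter is a pre-encoded GitHub search query. Decoding it (a quick illustration from a shell, using only the Python standard library) shows what reporters are actually being pointed at:

```sh
python3 -c "from urllib.parse import unquote; print(unquote('is%3Aissue%20-label%3Aspam%20%20'))"
# prints 'is:issue -label:spam  ' -- all issues not labeled as spam
# (the trailing '%20%20' decodes to two stray spaces, which GitHub search treats as harmless)
```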
diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 4aeff7dc6..a9564c0c2 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -2,13 +2,11 @@ name: Site support request description: Request support for a new site labels: [triage, site-request] body: - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. - type: checkboxes id: checklist attributes: @@ -24,9 +22,7 @@ body: required: true - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%3Aissue%20-label%3Aspam%20%20) for similar requests **including closed ones**. DO NOT post duplicates required: true - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and am willing to share it if required - type: input @@ -59,6 +55,8 @@ body: id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true @@ -90,11 +88,3 @@ body: render: shell validations: required: true - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. 
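The `-vU` requirement is unchanged across these templates. For anyone editing them, the expectation reads roughly as follows (a sketch; the URL is the sample video used in the templates' example output, `-v` enables verbose logging, and `-U` updates yt-dlp first so the log reflects the latest version):

```sh
# Reproduce the problem with verbose output and an up-to-date binary,
# then paste the complete log into the issue
yt-dlp -vU "https://www.youtube.com/watch?v=BaW_jenozKc"
```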
diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 2f516ebb7..6e2380fae 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -1,14 +1,12 @@ name: Site feature request -description: Request a new functionality for a supported site +description: Request new functionality for a site supported by yt-dlp labels: [triage, site-enhancement] body: - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. - type: checkboxes id: checklist attributes: @@ -22,9 +20,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%3Aissue%20-label%3Aspam%20%20) for similar requests **including closed ones**. DO NOT post duplicates required: true - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required - type: input @@ -55,6 +51,8 @@ body: id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true @@ -86,11 +84,3 @@ body: render: shell validations: required: true - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 201586e9d..6fc523be0 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -2,13 +2,11 @@ name: Core bug report description: Report a bug unrelated to any particular site or extractor labels: [triage, bug] body: - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. 
- type: checkboxes id: checklist attributes: @@ -20,13 +18,7 @@ body: required: true - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details - required: true - - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) - required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766), [the FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ), and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%3Aissue%20-label%3Aspam%20%20) for similar issues **including closed ones**. DO NOT post duplicates required: true - type: textarea id: description @@ -40,6 +32,8 @@ body: id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true @@ -71,11 +65,3 @@ body: render: shell validations: required: true - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 765de86a2..57a33bb71 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -1,14 +1,12 @@ name: Feature request -description: Request a new functionality unrelated to any particular site or extractor +description: Request a new feature unrelated to any particular site or extractor labels: [triage, enhancement] body: - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. 
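The checklists retained above still require testing against the nightly or master channel (per the linked update instructions). A sketch of the expected workflow before filing a report, assuming a standalone binary that supports `--update-to`:

```sh
# Switch to the nightly channel, then re-run the failing command verbosely
yt-dlp --update-to nightly
yt-dlp -vU "https://www.youtube.com/watch?v=BaW_jenozKc"
```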
- type: checkboxes id: checklist attributes: @@ -22,9 +20,7 @@ body: required: true - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%3Aissue%20-label%3Aspam%20%20) for similar requests **including closed ones**. DO NOT post duplicates required: true - type: textarea id: description @@ -38,6 +34,8 @@ body: id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" @@ -65,11 +63,3 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 198e21bec..28ec7cbe0 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -1,14 +1,12 @@ name: Ask question -description: Ask yt-dlp related question +description: Ask a question about using yt-dlp labels: [question] body: - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. - type: markdown attributes: value: | @@ -28,9 +26,7 @@ body: required: true - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766), [the FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ), and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%3Aissue%20-label%3Aspam%20%20) for similar questions **including closed ones**. 
DO NOT post duplicates required: true - type: textarea id: question @@ -44,6 +40,8 @@ body: id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) - label: "If using API, add `'verbose': True` to `YoutubeDL` params instead" @@ -71,11 +69,3 @@ body: [youtube] Extracting URL: https://www.youtube.com/watch?v=BaW_jenozKc render: shell - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9cdffa4b1..0131631bb 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: false contact_links: - - name: Get help from the community on Discord + - name: Get help on Discord url: https://discord.gg/H5MNcFW63r - about: Join the yt-dlp Discord for community-powered support! - - name: Matrix Bridge to the Discord server - url: https://matrix.to/#/#yt-dlp:matrix.org - about: For those who do not want to use Discord + about: Join the yt-dlp Discord server for support and discussion diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index bff28ae4e..f1a2d3090 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -18,9 +18,7 @@ body: required: true - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766), [the FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ), and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%%3Aissue%%20-label%%3Aspam%%20%%20) for similar issues **including closed ones**. 
DO NOT post duplicates required: true - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required - type: input diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index 2bffe738d..31b89b683 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -18,9 +18,7 @@ body: required: true - label: I've checked that none of provided URLs [violate any copyrights](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy) or contain any [DRM](https://en.wikipedia.org/wiki/Digital_rights_management) to the best of my knowledge required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%%3Aissue%%20-label%%3Aspam%%20%%20) for similar requests **including closed ones**. DO NOT post duplicates required: true - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and am willing to share it if required - type: input diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml index 6c3127983..421766a75 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -1,5 +1,5 @@ name: Site feature request -description: Request a new functionality for a supported site +description: Request new functionality for a site supported by yt-dlp labels: [triage, site-enhancement] body: %(no_skip)s @@ -16,9 +16,7 @@ body: required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%%3Aissue%%20-label%%3Aspam%%20%%20) for similar requests **including closed ones**. 
DO NOT post duplicates required: true - label: I've read about [sharing account credentials](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#are-you-willing-to-share-account-details-if-needed) and I'm willing to share it if required - type: input diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 5f357d96e..31a19b292 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -14,13 +14,7 @@ body: required: true - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details - required: true - - label: I've checked that all URLs and arguments with special characters are [properly quoted or escaped](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#video-url-contains-an-ampersand--and-im-getting-some-strange-output-1-2839-or-v-is-not-recognized-as-an-internal-or-external-command) - required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766), [the FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ), and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%%3Aissue%%20-label%%3Aspam%%20%%20) for similar issues **including closed ones**. DO NOT post duplicates required: true - type: textarea id: description diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml index 99107ff58..b8ab6610b 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -1,5 +1,5 @@ name: Feature request -description: Request a new functionality unrelated to any particular site or extractor +description: Request a new feature unrelated to any particular site or extractor labels: [triage, enhancement] body: %(no_skip)s @@ -16,9 +16,7 @@ body: required: true - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%%3Aissue%%20-label%%3Aspam%%20%%20) for similar requests **including closed ones**. 
DO NOT post duplicates required: true - type: textarea id: description diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml index bd742109a..062e96321 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -1,5 +1,5 @@ name: Ask question -description: Ask yt-dlp related question +description: Ask a question about using yt-dlp labels: [question] body: %(no_skip)s @@ -22,9 +22,7 @@ body: required: true - label: I've verified that I have **updated yt-dlp to nightly or master** ([update instructions](https://github.com/yt-dlp/yt-dlp#update-channels)) required: true - - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. DO NOT post duplicates - required: true - - label: I've read the [guidelines for opening an issue](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#opening-an-issue) + - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766), [the FAQ](https://github.com/yt-dlp/yt-dlp/wiki/FAQ), and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=is%%3Aissue%%20-label%%3Aspam%%20%%20) for similar questions **including closed ones**. DO NOT post duplicates required: true - type: textarea id: question diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4deee572f..4dcfcc48c 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,14 +1,17 @@ -**IMPORTANT**: PRs without the template will be CLOSED + ### Description of your *pull request* and other information - - -ADD DESCRIPTION HERE +ADD DETAILED DESCRIPTION HERE Fixes # @@ -16,24 +19,22 @@ ### Description of your *pull request* and other information
Template ### Before submitting a *pull request* make sure you have: - [ ] At least skimmed through [contributing guidelines](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#developer-instructions) including [yt-dlp coding conventions](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#yt-dlp-coding-conventions) - [ ] [Searched](https://github.com/yt-dlp/yt-dlp/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests -### In order to be accepted and merged into yt-dlp each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check all of the following options that apply: -- [ ] I am the original author of this code and I am willing to release it under [Unlicense](http://unlicense.org/) -- [ ] I am not the original author of this code but it is in public domain or released under [Unlicense](http://unlicense.org/) (provide reliable evidence) +### In order to be accepted and merged into yt-dlp each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check those that apply and remove the others: +- [ ] I am the original author of the code in this PR, and I am willing to release it under [Unlicense](http://unlicense.org/) +- [ ] I am not the original author of the code in this PR, but it is in the public domain or released under [Unlicense](http://unlicense.org/) (provide reliable evidence) -### What is the purpose of your *pull request*? +### What is the purpose of your *pull request*? Check those that apply and remove the others: - [ ] Fix or improvement to an extractor (Make sure to add/update tests) - [ ] New extractor ([Piracy websites will not be accepted](https://github.com/yt-dlp/yt-dlp/blob/master/CONTRIBUTING.md#is-the-website-primarily-used-for-piracy)) - [ ] Core bug fix/improvement diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a211ae165..4b71a621c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -192,7 +192,7 @@ jobs: with: path: ./repo - name: Virtualized Install, Prepare & Build - uses: yt-dlp/run-on-arch-action@v2 + uses: yt-dlp/run-on-arch-action@v3 with: # Ref: https://github.com/uraimo/run-on-arch-action/issues/55 env: | @@ -411,7 +411,7 @@ jobs: run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build python devscripts/install_deps.py --include curl-cffi - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.11.1-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.13.0-py3-none-any.whl" - name: Prepare run: | @@ -460,7 +460,7 @@ jobs: run: | python devscripts/install_deps.py -o --include build python devscripts/install_deps.py - python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.11.1-py3-none-any.whl" + python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.13.0-py3-none-any.whl" - name: Prepare run: | diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 170a6ac19..910c409ef 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,7 +33,7 @@ jobs: # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -47,7 +47,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -60,6 +60,6 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 9a4342a58..dd2c6f481 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -6,7 +6,7 @@ on: - devscripts/** - test/** - yt_dlp/**.py - - '!yt_dlp/extractor/*.py' + - '!yt_dlp/extractor/**.py' - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py @@ -16,7 +16,7 @@ on: - devscripts/** - test/** - yt_dlp/**.py - - '!yt_dlp/extractor/*.py' + - '!yt_dlp/extractor/**.py' - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 1a32bbfe3..8a7b24033 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -38,3 +38,5 @@ jobs: run: ruff check --output-format github . - name: Run autopep8 run: autopep8 --diff . + - name: Check file mode + run: git ls-files --format="%(objectmode) %(path)" yt_dlp/ | ( ! grep -v "^100644" ) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 7376b1801..5710f9a9e 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -736,3 +736,37 @@ NecroRomnt pjrobertson subsense test20140 +arantius +entourage8 +lfavole +mp3butcher +slipinthedove +YoshiTabletopGamer +Arc8ne +benfaerber +chrisellsworth +fries1234 +Kenshin9977 +MichaelDeBoey +msikma +pedro +pferreir +red-acid +refack +rysson +somini +thedenv +vallovic +arabcoders +mireq +mlabeeb03 +1271 +CasperMcFadden95 +Kicer86 +Kiritomo +leeblackc +meGAmeS1 +NeonMan +pj47x +troex +WouterGordts diff --git a/Changelog.md b/Changelog.md index 3232c158b..513724bf4 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,264 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.04.30 + +#### Important changes +- **New option `--preset-alias`/`-t` has been added** +This provides convenient predefined aliases for common use cases. Available presets include `mp4`, `mp3`, `mkv`, `aac`, and `sleep`. See [the README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#preset-aliases) for more details. 
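As a usage illustration for the new option (the preset names are those listed in the note above; their exact option bundles are defined in the linked README section):

```sh
# -t/--preset-alias applies a predefined bundle of options
yt-dlp -t mp3 "https://www.youtube.com/watch?v=BaW_jenozKc"   # extract audio to mp3
yt-dlp -t mkv "https://www.youtube.com/watch?v=BaW_jenozKc"   # remux video into mkv
```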
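Relatedly, the `Check file mode` step added to `quick-test.yml` earlier in this diff fails CI if any tracked file under `yt_dlp/` has a mode other than `100644`. A local equivalent, plus the usual fix for an offending file (the path here is hypothetical):

```sh
# List tracked files whose mode is not 100644; no output means the check passes
git ls-files --format="%(objectmode) %(path)" yt_dlp/ | grep -v "^100644"

# Clear a stray executable bit in the index (hypothetical path)
git update-index --chmod=-x yt_dlp/extractor/example.py
```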
+ +#### Core changes +- [Add `--preset-alias` option](https://github.com/yt-dlp/yt-dlp/commit/88eb1e7a9a2720ac89d653c0d0e40292388823bb) ([#12839](https://github.com/yt-dlp/yt-dlp/issues/12839)) by [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **utils** + - `_yield_json_ld`: [Make function less fatal](https://github.com/yt-dlp/yt-dlp/commit/45f01de00e1bc076b7f676a669736326178647b1) ([#12855](https://github.com/yt-dlp/yt-dlp/issues/12855)) by [seproDev](https://github.com/seproDev) + - `url_or_none`: [Support WebSocket URLs](https://github.com/yt-dlp/yt-dlp/commit/a473e592337edb8ca40cde52c1fcaee261c54df9) ([#12848](https://github.com/yt-dlp/yt-dlp/issues/12848)) by [doe1080](https://github.com/doe1080) + +#### Extractor changes +- **abematv**: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/f5736bb35bde62348caebf7b188668655e316deb) ([#12859](https://github.com/yt-dlp/yt-dlp/issues/12859)) by [Kiritomo](https://github.com/Kiritomo) +- **atresplayer**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/839d64325356310e6de6cd9cad28fb546619ca63) ([#11424](https://github.com/yt-dlp/yt-dlp/issues/11424)) by [meGAmeS1](https://github.com/meGAmeS1), [seproDev](https://github.com/seproDev) +- **bpb**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/80736b9c90818adee933a155079b8535bc06819f) ([#13015](https://github.com/yt-dlp/yt-dlp/issues/13015)) by [bashonly](https://github.com/bashonly) +- **cda**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/9032f981362ea0be90626fab51ec37934feded6d) ([#12975](https://github.com/yt-dlp/yt-dlp/issues/12975)) by [bashonly](https://github.com/bashonly) +- **cdafolder**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/cb271d445bc2d866c9a3404b1d8f59bcb77447df) ([#12919](https://github.com/yt-dlp/yt-dlp/issues/12919)) by [fireattack](https://github.com/fireattack), [Kicer86](https://github.com/Kicer86) +- **crowdbunker**: [Make format extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/4ebf41309d04a6e196944f1c0f5f0154cff0055a) ([#12836](https://github.com/yt-dlp/yt-dlp/issues/12836)) by [seproDev](https://github.com/seproDev) +- **dacast**: [Support tokenized URLs](https://github.com/yt-dlp/yt-dlp/commit/e7e3b7a55c456da4a5a812b4fefce4dce8e6a616) ([#12979](https://github.com/yt-dlp/yt-dlp/issues/12979)) by [bashonly](https://github.com/bashonly) +- **dzen.ru**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/a3f2b54c2535d862de6efa9cfaa6ca9a2b2f7dd6) ([#12852](https://github.com/yt-dlp/yt-dlp/issues/12852)) by [seproDev](https://github.com/seproDev) +- **generic**: [Fix MPD extraction for `file://` URLs](https://github.com/yt-dlp/yt-dlp/commit/34a061a295d156934417c67ee98070b94943006b) ([#12978](https://github.com/yt-dlp/yt-dlp/issues/12978)) by [bashonly](https://github.com/bashonly) +- **getcourseru**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/741fd809bc4d301c19b53877692ae510334a6750) ([#12943](https://github.com/yt-dlp/yt-dlp/issues/12943)) by [troex](https://github.com/troex) +- **ivoox**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/7faa18b83dcfc74a1a1e2034e6b0369c495ca645) ([#12768](https://github.com/yt-dlp/yt-dlp/issues/12768)) by [NeonMan](https://github.com/NeonMan), [seproDev](https://github.com/seproDev) +- **kika**: [Add playlist extractor](https://github.com/yt-dlp/yt-dlp/commit/3c1c75ecb8ab352f422b59af46fff2be992e4115) ([#12832](https://github.com/yt-dlp/yt-dlp/issues/12832)) by 
[1100101](https://github.com/1100101) +- **linkedin** + - [Support feed URLs](https://github.com/yt-dlp/yt-dlp/commit/73a26f9ee68610e33c0b4407b77355f2ab7afd0e) ([#12927](https://github.com/yt-dlp/yt-dlp/issues/12927)) by [seproDev](https://github.com/seproDev) + - events: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b37ff4de5baf4e4e70c6a0ec34e136a279ad20af) ([#12926](https://github.com/yt-dlp/yt-dlp/issues/12926)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **loco**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f5a37ea40e20865b976ffeeff13eeae60292eb23) ([#12934](https://github.com/yt-dlp/yt-dlp/issues/12934)) by [seproDev](https://github.com/seproDev) +- **lrtradio**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/74e90dd9b8f9c1a5c48a2515126654f4d398d687) ([#12801](https://github.com/yt-dlp/yt-dlp/issues/12801)) by [subrat-lima](https://github.com/subrat-lima) +- **manyvids**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/77aa15e98f34c4ad425aabf39dd1ee37b48f772c) ([#10907](https://github.com/yt-dlp/yt-dlp/issues/10907)) by [pj47x](https://github.com/pj47x) +- **mixcloud**: [Refactor extractor](https://github.com/yt-dlp/yt-dlp/commit/db6d1f145ad583e0220637726029f8f2fa6200a0) ([#12830](https://github.com/yt-dlp/yt-dlp/issues/12830)) by [seproDev](https://github.com/seproDev), [WouterGordts](https://github.com/WouterGordts) +- **mlbtv**: [Fix device ID caching](https://github.com/yt-dlp/yt-dlp/commit/36da6360e130197df927ee93409519ce3f4075f5) ([#12980](https://github.com/yt-dlp/yt-dlp/issues/12980)) by [bashonly](https://github.com/bashonly) +- **niconico** + - [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/25cd7c1ecbb6cbf21dd3a6e59608e4af94715ecc) ([#13008](https://github.com/yt-dlp/yt-dlp/issues/13008)) by [doe1080](https://github.com/doe1080) + - [Remove DMC formats support](https://github.com/yt-dlp/yt-dlp/commit/7d05aa99c65352feae1cd9a3ff8784b64bfe382a) ([#12916](https://github.com/yt-dlp/yt-dlp/issues/12916)) by [doe1080](https://github.com/doe1080) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1d45e30537bf83e069184a440703e4c43b2e0198) ([#12809](https://github.com/yt-dlp/yt-dlp/issues/12809)) by [Snack-X](https://github.com/Snack-X) +- **panopto**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/9d26daa04ad5108257bc5e30f7f040c7f1fe7a5a) ([#12925](https://github.com/yt-dlp/yt-dlp/issues/12925)) by [seproDev](https://github.com/seproDev) +- **parti**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/425017531fbc3369becb5a44013e26f26efabf45) ([#12769](https://github.com/yt-dlp/yt-dlp/issues/12769)) by [benfaerber](https://github.com/benfaerber) +- **raiplay**: [Fix DRM detection](https://github.com/yt-dlp/yt-dlp/commit/dce82346245e35a46fda836ca2089805d2347935) ([#12971](https://github.com/yt-dlp/yt-dlp/issues/12971)) by [DTrombett](https://github.com/DTrombett) +- **reddit**: [Support `--ignore-no-formats-error`](https://github.com/yt-dlp/yt-dlp/commit/28f04e8a5e383ff531db646190b4be45554610d6) ([#12993](https://github.com/yt-dlp/yt-dlp/issues/12993)) by [bashonly](https://github.com/bashonly) +- **royalive**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/e1847535e28788414a25546a45bebcada2f34558) ([#12817](https://github.com/yt-dlp/yt-dlp/issues/12817)) by [CasperMcFadden95](https://github.com/CasperMcFadden95) +- **rtve**: [Rework extractors](https://github.com/yt-dlp/yt-dlp/commit/f07ee91c71920ab1187a7ea756720e81aa406a9d) 
([#10388](https://github.com/yt-dlp/yt-dlp/issues/10388)) by [meGAmeS1](https://github.com/meGAmeS1), [seproDev](https://github.com/seproDev) +- **rumble**: [Improve format extraction](https://github.com/yt-dlp/yt-dlp/commit/58d0c83457b93b3c9a81eb6bc5a4c65f25e949df) ([#12838](https://github.com/yt-dlp/yt-dlp/issues/12838)) by [seproDev](https://github.com/seproDev) +- **tokfmpodcast**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/91832111a12d87499294a0f430829b8c2254c339) ([#12842](https://github.com/yt-dlp/yt-dlp/issues/12842)) by [selfisekai](https://github.com/selfisekai) +- **tv2dk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/a3e91df30a45943f40759d2c1e0b6c2ca4b2a263) ([#12945](https://github.com/yt-dlp/yt-dlp/issues/12945)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) +- **tvp**: vod: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/4e69a626cce51428bc1d66dc606a56d9498b03a5) ([#12923](https://github.com/yt-dlp/yt-dlp/issues/12923)) by [seproDev](https://github.com/seproDev) +- **tvw**: tvchannels: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ed8ad1b4d6b9d7a1426ff5192ff924f3371e4721) ([#12721](https://github.com/yt-dlp/yt-dlp/issues/12721)) by [fries1234](https://github.com/fries1234) +- **twitcasting**: [Fix livestream extraction](https://github.com/yt-dlp/yt-dlp/commit/de271a06fd6d20d4f55597ff7f90e4d913de0a52) ([#12977](https://github.com/yt-dlp/yt-dlp/issues/12977)) by [bashonly](https://github.com/bashonly) +- **twitch**: clips: [Fix uploader metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/1ae6bff564a65af41e94f1a4727892471ecdd05a) ([#13022](https://github.com/yt-dlp/yt-dlp/issues/13022)) by [1271](https://github.com/1271) +- **twitter** + - [Fix extraction when logged-in](https://github.com/yt-dlp/yt-dlp/commit/1cf39ddf3d10b6512daa7dd139e5f6c0dc548bbc) ([#13024](https://github.com/yt-dlp/yt-dlp/issues/13024)) by [bashonly](https://github.com/bashonly) + - spaces: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/70599e53b736bb75922b737e6e0d4f76e419bb20) ([#12911](https://github.com/yt-dlp/yt-dlp/issues/12911)) by [doe1080](https://github.com/doe1080) +- **vimeo**: [Extract from mobile API](https://github.com/yt-dlp/yt-dlp/commit/22ac81a0692019ac833cf282e4ef99718e9ef3fa) ([#13034](https://github.com/yt-dlp/yt-dlp/issues/13034)) by [bashonly](https://github.com/bashonly) +- **vk** + - [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/5361a7c6e2933c919716e0cb1e3116c28c40419f) ([#12821](https://github.com/yt-dlp/yt-dlp/issues/12821)) by [seproDev](https://github.com/seproDev) + - [Fix uploader extraction](https://github.com/yt-dlp/yt-dlp/commit/2381881fe58a723853350a6ab750a5efc9f10c85) ([#12985](https://github.com/yt-dlp/yt-dlp/issues/12985)) by [seproDev](https://github.com/seproDev) +- **youtube** + - [Add context to video request rate limit error](https://github.com/yt-dlp/yt-dlp/commit/26feac3dd142536ad08ad1ed731378cb88e63602) ([#12958](https://github.com/yt-dlp/yt-dlp/issues/12958)) by [coletdjnz](https://github.com/coletdjnz) + - [Add extractor arg to skip "initial_data" request](https://github.com/yt-dlp/yt-dlp/commit/ed6c6d7eefbc78fa72e4e60ad6edaa3ee2acc715) ([#12865](https://github.com/yt-dlp/yt-dlp/issues/12865)) by [leeblackc](https://github.com/leeblackc) + - [Add warning on video captcha challenge](https://github.com/yt-dlp/yt-dlp/commit/f484c51599a6cd01eb078ea7dc9bbba942967774) 
([#12939](https://github.com/yt-dlp/yt-dlp/issues/12939)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Cache signature timestamps](https://github.com/yt-dlp/yt-dlp/commit/61c9a938b390b8334ee3a879fe2d93f714e30138) ([#13047](https://github.com/yt-dlp/yt-dlp/issues/13047)) by [bashonly](https://github.com/bashonly)
+    - [Detect and warn when account cookies are rotated](https://github.com/yt-dlp/yt-dlp/commit/8cb08028f5be2acb9835ce1670b196b9b077052f) ([#13014](https://github.com/yt-dlp/yt-dlp/issues/13014)) by [coletdjnz](https://github.com/coletdjnz)
+    - [Detect player JS variants for any locale](https://github.com/yt-dlp/yt-dlp/commit/c2d6659d1069f8cff97e1fd61d1c59e949e1e63d) ([#13003](https://github.com/yt-dlp/yt-dlp/issues/13003)) by [bashonly](https://github.com/bashonly)
+    - [Do not strictly deprioritize `missing_pot` formats](https://github.com/yt-dlp/yt-dlp/commit/74fc2ae12c24eb6b4e02c6360c89bd05f3c8f740) ([#13061](https://github.com/yt-dlp/yt-dlp/issues/13061)) by [bashonly](https://github.com/bashonly)
+    - [Improve warning for SABR-only/SSAP player responses](https://github.com/yt-dlp/yt-dlp/commit/fd8394bc50301ac5e930aa65aa71ab1b8372b8ab) ([#13049](https://github.com/yt-dlp/yt-dlp/issues/13049)) by [bashonly](https://github.com/bashonly)
+    - tab: [Extract continuation from empty page](https://github.com/yt-dlp/yt-dlp/commit/72ba4879304c2082fecbb472e6cc05ee2d154a3b) ([#12938](https://github.com/yt-dlp/yt-dlp/issues/12938)) by [coletdjnz](https://github.com/coletdjnz)
+- **zdf**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/7be14109a6bd493a2e881da4f9e30adaf3e7e5d5) ([#12779](https://github.com/yt-dlp/yt-dlp/issues/12779)) by [bashonly](https://github.com/bashonly), [InvalidUsernameException](https://github.com/InvalidUsernameException)
+
+#### Downloader changes
+- **niconicodmc**: [Remove downloader](https://github.com/yt-dlp/yt-dlp/commit/8d127b18f81131453eaba05d3bb810d9b73adb75) ([#12916](https://github.com/yt-dlp/yt-dlp/issues/12916)) by [doe1080](https://github.com/doe1080)
+
+#### Networking changes
+- [Add PATCH request shortcut](https://github.com/yt-dlp/yt-dlp/commit/ceab4d5ed63a1f135a1816fe967c9d9a1ec7e6e8) ([#12884](https://github.com/yt-dlp/yt-dlp/issues/12884)) by [doe1080](https://github.com/doe1080)
+
+#### Misc. changes
+- **ci**: [Add file mode test to code check](https://github.com/yt-dlp/yt-dlp/commit/3690e91265d1d0bbeffaf6a9b8cc9baded1367bd) ([#13036](https://github.com/yt-dlp/yt-dlp/issues/13036)) by [Grub4K](https://github.com/Grub4K)
+- **cleanup**: Miscellaneous: [505b400](https://github.com/yt-dlp/yt-dlp/commit/505b400795af557bdcfd9d4fa7e9133b26ef431c) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+
+### 2025.03.31
+
+#### Core changes
+- [Add `--compat-options 2024`](https://github.com/yt-dlp/yt-dlp/commit/22e34adbd741e1c7072015debd615dc3fb71c401) ([#12789](https://github.com/yt-dlp/yt-dlp/issues/12789)) by [seproDev](https://github.com/seproDev)
+
+#### Extractor changes
+- **francaisfacile**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bb321cfdc3fd4400598ddb12a15862bc2ac8fc10) ([#12787](https://github.com/yt-dlp/yt-dlp/issues/12787)) by [mlabeeb03](https://github.com/mlabeeb03)
+- **generic**: [Validate response before checking m3u8 live status](https://github.com/yt-dlp/yt-dlp/commit/9a1ec1d36e172d252714cef712a6d091e0a0c4f2) ([#12784](https://github.com/yt-dlp/yt-dlp/issues/12784)) by [bashonly](https://github.com/bashonly)
+- **microsoftlearnepisode**: [Extract more formats](https://github.com/yt-dlp/yt-dlp/commit/d63696f23a341ee36a3237ccb5d5e14b34c2c579) ([#12799](https://github.com/yt-dlp/yt-dlp/issues/12799)) by [bashonly](https://github.com/bashonly)
+- **mlbtv**: [Fix radio-only extraction](https://github.com/yt-dlp/yt-dlp/commit/f033d86b96b36f8c5289dd7c3304f42d4d9f6ff4) ([#12792](https://github.com/yt-dlp/yt-dlp/issues/12792)) by [bashonly](https://github.com/bashonly)
+- **on24**: [Support `mainEvent` URLs](https://github.com/yt-dlp/yt-dlp/commit/e465b078ead75472fcb7b86f6ccaf2b5d3bc4c21) ([#12800](https://github.com/yt-dlp/yt-dlp/issues/12800)) by [bashonly](https://github.com/bashonly)
+- **sbs**: [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/29560359120f28adaaac67c86fa8442eb72daa0d) ([#12785](https://github.com/yt-dlp/yt-dlp/issues/12785)) by [bashonly](https://github.com/bashonly)
+- **stvr**: [Rename extractor from RTVS to STVR](https://github.com/yt-dlp/yt-dlp/commit/5fc521cbd0ce7b2410d0935369558838728e205d) ([#12788](https://github.com/yt-dlp/yt-dlp/issues/12788)) by [mireq](https://github.com/mireq)
+- **twitch**: clips: [Extract portrait formats](https://github.com/yt-dlp/yt-dlp/commit/61046c31612b30c749cbdae934b7fe26abe659d7) ([#12763](https://github.com/yt-dlp/yt-dlp/issues/12763)) by [DmitryScaletta](https://github.com/DmitryScaletta)
+- **youtube**
+    - [Add `player_js_variant` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/07f04005e40ebdb368920c511e36e98af0077ed3) ([#12767](https://github.com/yt-dlp/yt-dlp/issues/12767)) by [bashonly](https://github.com/bashonly)
+    - tab: [Fix playlist continuation extraction](https://github.com/yt-dlp/yt-dlp/commit/6a6d97b2cbc78f818de05cc96edcdcfd52caa259) ([#12777](https://github.com/yt-dlp/yt-dlp/issues/12777)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Misc. changes
+- **cleanup**: Miscellaneous: [5e457af](https://github.com/yt-dlp/yt-dlp/commit/5e457af57fae9645b1b8fa0ed689229c8fb9656b) by [bashonly](https://github.com/bashonly)
+
+### 2025.03.27
+
+#### Core changes
+- **jsinterp**: [Fix nested attributes and object extraction](https://github.com/yt-dlp/yt-dlp/commit/a8b9ff3c2a0ae25735e580173becc78545b92572) ([#12760](https://github.com/yt-dlp/yt-dlp/issues/12760)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+
+#### Extractor changes
+- **youtube**: [Make signature and nsig extraction more robust](https://github.com/yt-dlp/yt-dlp/commit/48be862b32648bff5b3e553e40fca4dcc6e88b28) ([#12761](https://github.com/yt-dlp/yt-dlp/issues/12761)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+
+### 2025.03.26
+
+#### Extractor changes
+- **youtube**
+    - [Fix signature and nsig extraction for player `4fcd6e4a`](https://github.com/yt-dlp/yt-dlp/commit/a550dfc904a02843a26369ae50dbb7c0febfb30e) ([#12748](https://github.com/yt-dlp/yt-dlp/issues/12748)) by [seproDev](https://github.com/seproDev)
+    - [Only cache nsig code on successful decoding](https://github.com/yt-dlp/yt-dlp/commit/ecee97b4fa90d51c48f9154c3a6d5a8ffe46cd5c) ([#12750](https://github.com/yt-dlp/yt-dlp/issues/12750)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev)
+
+### 2025.03.25
+
+#### Core changes
+- [Fix attribute error on failed VT init](https://github.com/yt-dlp/yt-dlp/commit/b872ffec50fd50f790a5a490e006a369a28a3df3) ([#12696](https://github.com/yt-dlp/yt-dlp/issues/12696)) by [Grub4K](https://github.com/Grub4K)
+- **utils**: `js_to_json`: [Make function less fatal](https://github.com/yt-dlp/yt-dlp/commit/9491b44032b330e05bd5eaa546187005d1e8538e) ([#12715](https://github.com/yt-dlp/yt-dlp/issues/12715)) by [seproDev](https://github.com/seproDev)
+
+#### Extractor changes
+- [Fix sorting of HLS audio formats by `GROUP-ID`](https://github.com/yt-dlp/yt-dlp/commit/86ab79e1a5182092321102adf6ca34195803b878) ([#12714](https://github.com/yt-dlp/yt-dlp/issues/12714)) by [bashonly](https://github.com/bashonly)
+- **17live**: vod: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3396eb50dcd245b49c0f4aecd6e80ec914095d16) ([#12723](https://github.com/yt-dlp/yt-dlp/issues/12723)) by [subrat-lima](https://github.com/subrat-lima)
+- **9now.com.au**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9d5e6de2e7a47226d1f72c713ad45c88ba01db68) ([#12702](https://github.com/yt-dlp/yt-dlp/issues/12702)) by [bashonly](https://github.com/bashonly)
+- **chzzk**: video: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/e2dfccaf808b406d5bcb7dd04ae9ce420752dd6f) ([#12692](https://github.com/yt-dlp/yt-dlp/issues/12692)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf)
+- **deezer**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/be5af3f9e91747768c2b41157851bfbe14c663f7) ([#12704](https://github.com/yt-dlp/yt-dlp/issues/12704)) by [seproDev](https://github.com/seproDev)
+- **generic**: [Fix MPD base URL parsing](https://github.com/yt-dlp/yt-dlp/commit/5086d4aed6aeb3908c62f49e2d8f74cc0cb05110) ([#12718](https://github.com/yt-dlp/yt-dlp/issues/12718)) by [fireattack](https://github.com/fireattack)
+- **streaks**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/801afeac91f97dc0b58cd39cc7e8c50f619dc4e1) ([#12679](https://github.com/yt-dlp/yt-dlp/issues/12679)) by [doe1080](https://github.com/doe1080)
+- **tver**: [Fix
extractor](https://github.com/yt-dlp/yt-dlp/commit/66e0bab814e4a52ef3e12d81123ad992a29df50e) ([#12659](https://github.com/yt-dlp/yt-dlp/issues/12659)) by [arabcoders](https://github.com/arabcoders), [bashonly](https://github.com/bashonly) +- **viki**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/fe4f14b8369038e7c58f7de546d76de1ce3a91ce) ([#12703](https://github.com/yt-dlp/yt-dlp/issues/12703)) by [seproDev](https://github.com/seproDev) +- **vrsquare**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b7fbb5a0a16a8e8d3e29c29e26ebed677d0d6ea3) ([#12515](https://github.com/yt-dlp/yt-dlp/issues/12515)) by [doe1080](https://github.com/doe1080) +- **youtube** + - [Fix PhantomJS nsig fallback](https://github.com/yt-dlp/yt-dlp/commit/4054a2b623bd1e277b49d2e9abc3d112a4b1c7be) ([#12728](https://github.com/yt-dlp/yt-dlp/issues/12728)) by [bashonly](https://github.com/bashonly) + - [Fix signature and nsig extraction for player `363db69b`](https://github.com/yt-dlp/yt-dlp/commit/b9c979461b244713bf42691a5bc02834e2ba4b2c) ([#12725](https://github.com/yt-dlp/yt-dlp/issues/12725)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- **Request Handler**: curl_cffi: [Support `curl_cffi` 0.10.x](https://github.com/yt-dlp/yt-dlp/commit/9bf23902ceb948b9685ce1dab575491571720fc6) ([#12670](https://github.com/yt-dlp/yt-dlp/issues/12670)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [9dde546](https://github.com/yt-dlp/yt-dlp/commit/9dde546e7ee3e1515d88ee3af08b099351455dc0) by [seproDev](https://github.com/seproDev) + +### 2025.03.21 + +#### Core changes +- [Fix external downloader availability when using `--ffmpeg-location`](https://github.com/yt-dlp/yt-dlp/commit/9f77e04c76e36e1cbbf49bc9eb385fa6ef804b67) ([#12318](https://github.com/yt-dlp/yt-dlp/issues/12318)) by [Kenshin9977](https://github.com/Kenshin9977) +- [Load plugins on demand](https://github.com/yt-dlp/yt-dlp/commit/4445f37a7a66b248dbd8376c43137e6e441f138e) ([#11305](https://github.com/yt-dlp/yt-dlp/issues/11305)) by [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) (With fixes in [c034d65](https://github.com/yt-dlp/yt-dlp/commit/c034d655487be668222ef9476a16f374584e49a7)) +- [Support emitting ConEmu progress codes](https://github.com/yt-dlp/yt-dlp/commit/f7a1f2d8132967a62b0f6d5665c6d2dde2d42c09) ([#10649](https://github.com/yt-dlp/yt-dlp/issues/10649)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- **azmedien**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/26a502fc727d0e91b2db6bf4a112823bcc672e85) ([#12375](https://github.com/yt-dlp/yt-dlp/issues/12375)) by [goggle](https://github.com/goggle) +- **bilibiliplaylist**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f5fb2229e66cf59d5bf16065bc041b42a28354a0) ([#12690](https://github.com/yt-dlp/yt-dlp/issues/12690)) by [bashonly](https://github.com/bashonly) +- **bunnycdn**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/3a1583ca75fb523cbad0e5e174387ea7b477d175) ([#11586](https://github.com/yt-dlp/yt-dlp/issues/11586)) by [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **canalsurmas**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/01a8be4c23f186329d85f9c78db34a55f3294ac5) ([#12497](https://github.com/yt-dlp/yt-dlp/issues/12497)) by [Arc8ne](https://github.com/Arc8ne) +- **cda**: [Fix login 
support](https://github.com/yt-dlp/yt-dlp/commit/be0d819e1103195043f6743650781f0d4d343f6d) ([#12552](https://github.com/yt-dlp/yt-dlp/issues/12552)) by [rysson](https://github.com/rysson) +- **cultureunplugged**: [Extend `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/3042afb5fe342d3a00de76704cd7de611acc350e) ([#12486](https://github.com/yt-dlp/yt-dlp/issues/12486)) by [seproDev](https://github.com/seproDev) +- **dailymotion**: [Improve embed detection](https://github.com/yt-dlp/yt-dlp/commit/ad60137c141efa5023fbc0ac8579eaefe8b3d8cc) ([#12464](https://github.com/yt-dlp/yt-dlp/issues/12464)) by [seproDev](https://github.com/seproDev) +- **gem.cbc.ca**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/eb1417786a3027b1e7290ec37ef6aaece50ebed0) ([#12414](https://github.com/yt-dlp/yt-dlp/issues/12414)) by [bashonly](https://github.com/bashonly) +- **globo**: [Fix subtitles extraction](https://github.com/yt-dlp/yt-dlp/commit/0e1697232fcbba7551f983fd1ba93bb445cbb08b) ([#12270](https://github.com/yt-dlp/yt-dlp/issues/12270)) by [pedro](https://github.com/pedro) +- **instagram** + - [Add `app_id` extractor-arg](https://github.com/yt-dlp/yt-dlp/commit/a90641c8363fa0c10800b36eb6b01ee22d3a9409) ([#12359](https://github.com/yt-dlp/yt-dlp/issues/12359)) by [chrisellsworth](https://github.com/chrisellsworth) + - [Fix extraction of older private posts](https://github.com/yt-dlp/yt-dlp/commit/a59abe0636dc49b22a67246afe35613571b86f05) ([#12451](https://github.com/yt-dlp/yt-dlp/issues/12451)) by [bashonly](https://github.com/bashonly) + - [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/480125560a3b9972d29ae0da850aba8109e6bd41) ([#12410](https://github.com/yt-dlp/yt-dlp/issues/12410)) by [bashonly](https://github.com/bashonly) + - story: [Support `--no-playlist`](https://github.com/yt-dlp/yt-dlp/commit/65c3c58c0a67463a150920203cec929045c95a24) ([#12397](https://github.com/yt-dlp/yt-dlp/issues/12397)) by [fireattack](https://github.com/fireattack) +- **jamendo**: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/89a68c4857ddbaf937ff22f12648baaf6b5af840) ([#12622](https://github.com/yt-dlp/yt-dlp/issues/12622)) by [bashonly](https://github.com/bashonly), [JChris246](https://github.com/JChris246) +- **ketnet**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/bbada3ec0779422cde34f1ce3dcf595da463b493) ([#12628](https://github.com/yt-dlp/yt-dlp/issues/12628)) by [MichaelDeBoey](https://github.com/MichaelDeBoey) +- **lbry** + - [Make m3u8 format extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/9807181cfbf87bfa732f415c30412bdbd77cbf81) ([#12463](https://github.com/yt-dlp/yt-dlp/issues/12463)) by [bashonly](https://github.com/bashonly) + - [Raise appropriate error for non-media files](https://github.com/yt-dlp/yt-dlp/commit/7126b472601814b7fd8c9de02069e8fff1764891) ([#12462](https://github.com/yt-dlp/yt-dlp/issues/12462)) by [bashonly](https://github.com/bashonly) +- **loco**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/983095485c731240aae27c950cb8c24a50827b56) ([#12667](https://github.com/yt-dlp/yt-dlp/issues/12667)) by [DTrombett](https://github.com/DTrombett) +- **magellantv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/172d5fcd778bf2605db7647ebc56b29ed18d24ac) ([#12505](https://github.com/yt-dlp/yt-dlp/issues/12505)) by [seproDev](https://github.com/seproDev) +- **mitele**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7223d29569a48a35ad132a508c115973866838d3) 
([#12689](https://github.com/yt-dlp/yt-dlp/issues/12689)) by [bashonly](https://github.com/bashonly) +- **msn**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/4815dac131d42c51e12c1d05232db0bbbf607329) ([#12513](https://github.com/yt-dlp/yt-dlp/issues/12513)) by [seproDev](https://github.com/seproDev), [thedenv](https://github.com/thedenv) +- **n1**: [Fix extraction of newer articles](https://github.com/yt-dlp/yt-dlp/commit/9d70abe4de401175cbbaaa36017806f16b2df9af) ([#12514](https://github.com/yt-dlp/yt-dlp/issues/12514)) by [u-spec-png](https://github.com/u-spec-png) +- **nbcstations**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ebac65aa9e0bf9a97c24d00f7977900d2577364b) ([#12534](https://github.com/yt-dlp/yt-dlp/issues/12534)) by [refack](https://github.com/refack) +- **niconico** + - [Fix format sorting](https://github.com/yt-dlp/yt-dlp/commit/7508e34f203e97389f1d04db92140b13401dd724) ([#12442](https://github.com/yt-dlp/yt-dlp/issues/12442)) by [xpadev-net](https://github.com/xpadev-net) + - live: [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/c2e6e1d5f77f3b720a6266f2869eb750d20e5dc1) ([#12419](https://github.com/yt-dlp/yt-dlp/issues/12419)) by [bashonly](https://github.com/bashonly) +- **openrec**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/17504f253564cfad86244de2b6346d07d2300ca5) ([#12608](https://github.com/yt-dlp/yt-dlp/issues/12608)) by [fireattack](https://github.com/fireattack) +- **pinterest**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/bd0a66816934de70312eea1e71c59c13b401dc3a) ([#12538](https://github.com/yt-dlp/yt-dlp/issues/12538)) by [mikf](https://github.com/mikf) +- **playsuisse**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/6933f5670cea9c3e2fb16c1caa1eda54d13122c5) ([#12444](https://github.com/yt-dlp/yt-dlp/issues/12444)) by [bashonly](https://github.com/bashonly) +- **reddit**: [Truncate title](https://github.com/yt-dlp/yt-dlp/commit/d9a53cc1e6fd912daf500ca4f19e9ca88994dbf9) ([#12567](https://github.com/yt-dlp/yt-dlp/issues/12567)) by [seproDev](https://github.com/seproDev) +- **rtp**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/8eb9c1bf3b9908cca22ef043602aa24fb9f352c6) ([#11638](https://github.com/yt-dlp/yt-dlp/issues/11638)) by [pferreir](https://github.com/pferreir), [red-acid](https://github.com/red-acid), [seproDev](https://github.com/seproDev), [somini](https://github.com/somini), [vallovic](https://github.com/vallovic) +- **softwhiteunderbelly**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/652827d5a076c9483c36654ad2cf3fe46219baf4) ([#12281](https://github.com/yt-dlp/yt-dlp/issues/12281)) by [benfaerber](https://github.com/benfaerber) +- **soop**: [Fix timestamp extraction](https://github.com/yt-dlp/yt-dlp/commit/8305df00012ff8138a6ff95279d06b54ac607f63) ([#12609](https://github.com/yt-dlp/yt-dlp/issues/12609)) by [msikma](https://github.com/msikma) +- **soundcloud** + - [Extract tags](https://github.com/yt-dlp/yt-dlp/commit/9deed13d7cce6d3647379e50589c92de89227509) ([#12420](https://github.com/yt-dlp/yt-dlp/issues/12420)) by [bashonly](https://github.com/bashonly) + - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/6deeda5c11f34f613724fa0627879f0d607ba1b4) ([#12447](https://github.com/yt-dlp/yt-dlp/issues/12447)) by [bashonly](https://github.com/bashonly) +- **tiktok** + - [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/99ea2978757a431eeb2a265b3395ccbe4ce202cf) 
([#12445](https://github.com/yt-dlp/yt-dlp/issues/12445)) by [bashonly](https://github.com/bashonly) + - [Truncate title](https://github.com/yt-dlp/yt-dlp/commit/83b119dadb0f267f1fb66bf7ed74c097349de79e) ([#12566](https://github.com/yt-dlp/yt-dlp/issues/12566)) by [seproDev](https://github.com/seproDev) +- **tv8.it**: [Add live and playlist extractors](https://github.com/yt-dlp/yt-dlp/commit/2ee3a0aff9be2be3bea60640d3d8a0febaf0acb6) ([#12569](https://github.com/yt-dlp/yt-dlp/issues/12569)) by [DTrombett](https://github.com/DTrombett) +- **tvw**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/42b7440963866e31ff84a5b89030d1c596fa2e6e) ([#12271](https://github.com/yt-dlp/yt-dlp/issues/12271)) by [fries1234](https://github.com/fries1234) +- **twitter** + - [Fix syndication token generation](https://github.com/yt-dlp/yt-dlp/commit/b8b47547049f5ebc3dd680fc7de70ed0ca9c0d70) ([#12537](https://github.com/yt-dlp/yt-dlp/issues/12537)) by [bashonly](https://github.com/bashonly) + - [Truncate title](https://github.com/yt-dlp/yt-dlp/commit/06f6de78db2eceeabd062ab1a3023e0ff9d4df53) ([#12560](https://github.com/yt-dlp/yt-dlp/issues/12560)) by [seproDev](https://github.com/seproDev) +- **vk**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/05c8023a27dd37c49163c0498bf98e3e3c1cb4b9) ([#12510](https://github.com/yt-dlp/yt-dlp/issues/12510)) by [seproDev](https://github.com/seproDev) +- **vrtmax**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/df9ebeec00d658693252978d1ffb885e67aa6ab6) ([#12479](https://github.com/yt-dlp/yt-dlp/issues/12479)) by [bergoid](https://github.com/bergoid), [MichaelDeBoey](https://github.com/MichaelDeBoey), [seproDev](https://github.com/seproDev) +- **weibo**: [Support playlists](https://github.com/yt-dlp/yt-dlp/commit/0bb39788626002a8a67e925580227952c563c8b9) ([#12284](https://github.com/yt-dlp/yt-dlp/issues/12284)) by [4ft35t](https://github.com/4ft35t) +- **wsj**: [Support opinion URLs and impersonation](https://github.com/yt-dlp/yt-dlp/commit/7f3006eb0c0659982bb956d71b0bc806bcb0a5f2) ([#12431](https://github.com/yt-dlp/yt-dlp/issues/12431)) by [refack](https://github.com/refack) +- **youtube** + - [Fix nsig and signature extraction for player `643afba4`](https://github.com/yt-dlp/yt-dlp/commit/9b868518a15599f3d7ef5a1c730dda164c30da9b) ([#12684](https://github.com/yt-dlp/yt-dlp/issues/12684)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/3380febe9984c21c79c3147c1d390a4cf339bc4c) ([#12603](https://github.com/yt-dlp/yt-dlp/issues/12603)) by [seproDev](https://github.com/seproDev) + - [Split into package](https://github.com/yt-dlp/yt-dlp/commit/4432a9390c79253ac830702b226d2e558b636725) ([#12557](https://github.com/yt-dlp/yt-dlp/issues/12557)) by [coletdjnz](https://github.com/coletdjnz) + - [Warn on DRM formats](https://github.com/yt-dlp/yt-dlp/commit/e67d786c7cc87bd449d22e0ddef08306891c1173) ([#12593](https://github.com/yt-dlp/yt-dlp/issues/12593)) by [coletdjnz](https://github.com/coletdjnz) + - [Warn on missing formats due to SSAP](https://github.com/yt-dlp/yt-dlp/commit/79ec2fdff75c8c1bb89b550266849ad4dec48dd3) ([#12483](https://github.com/yt-dlp/yt-dlp/issues/12483)) by [coletdjnz](https://github.com/coletdjnz) + +#### Networking changes +- [Add `keep_header_casing` extension](https://github.com/yt-dlp/yt-dlp/commit/7d18fed8f1983fe6de4ddc810dfb2761ba5744ac) ([#11652](https://github.com/yt-dlp/yt-dlp/issues/11652)) by 
[coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K) +- [Always add unsupported suffix on version mismatch](https://github.com/yt-dlp/yt-dlp/commit/95f8df2f796d0048119615200758199aedcd7cf4) ([#12626](https://github.com/yt-dlp/yt-dlp/issues/12626)) by [Grub4K](https://github.com/Grub4K) + +#### Misc. changes +- **cleanup**: Miscellaneous: [f36e4b6](https://github.com/yt-dlp/yt-dlp/commit/f36e4b6e65cb8403791aae2f520697115cb88dec) by [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [seproDev](https://github.com/seproDev) +- **test**: [Show all differences for `expect_value` and `expect_dict`](https://github.com/yt-dlp/yt-dlp/commit/a3e0c7d3b267abdf3933b709704a28d43bb46503) ([#12334](https://github.com/yt-dlp/yt-dlp/issues/12334)) by [Grub4K](https://github.com/Grub4K) + +### 2025.02.19 + +#### Core changes +- **jsinterp** + - [Add `js_number_to_string`](https://github.com/yt-dlp/yt-dlp/commit/0d9f061d38c3a4da61972e2adad317079f2f1c84) ([#12110](https://github.com/yt-dlp/yt-dlp/issues/12110)) by [Grub4K](https://github.com/Grub4K) + - [Improve zeroise](https://github.com/yt-dlp/yt-dlp/commit/4ca8c44a073d5aa3a3e3112c35b2b23d6ce25ac6) ([#12313](https://github.com/yt-dlp/yt-dlp/issues/12313)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **acast**: [Support shows.acast.com URLs](https://github.com/yt-dlp/yt-dlp/commit/57c717fee4bfbc9309845bbb48901b72e4b69304) ([#12223](https://github.com/yt-dlp/yt-dlp/issues/12223)) by [barsnick](https://github.com/barsnick) +- **cwtv** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/18a28514e306e822eab4f3a79c76d515bf076406) ([#12207](https://github.com/yt-dlp/yt-dlp/issues/12207)) by [arantius](https://github.com/arantius) + - movie: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/03c3d705778c07739e0034b51490877cffdc0983) ([#12227](https://github.com/yt-dlp/yt-dlp/issues/12227)) by [bashonly](https://github.com/bashonly) +- **digiview**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f53553087d3fde9dcd61d6e9f98caf09db1d8ef2) ([#9902](https://github.com/yt-dlp/yt-dlp/issues/9902)) by [lfavole](https://github.com/lfavole) +- **dropbox**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/861aeec449c8f3c062d962945b234ff0341f61f3) ([#12228](https://github.com/yt-dlp/yt-dlp/issues/12228)) by [bashonly](https://github.com/bashonly) +- **francetv** + - site + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/817483ccc68aed6049ed9c4a2ffae44ca82d2b1c) ([#12236](https://github.com/yt-dlp/yt-dlp/issues/12236)) by [bashonly](https://github.com/bashonly) + - [Fix livestream extraction](https://github.com/yt-dlp/yt-dlp/commit/1295bbedd45fa8d9bc3f7a194864ae280297848e) ([#12316](https://github.com/yt-dlp/yt-dlp/issues/12316)) by [bashonly](https://github.com/bashonly) +- **francetvinfo.fr**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5c4c2ddfaa47988b4d50c1ad4988badc0b4f30c2) ([#12402](https://github.com/yt-dlp/yt-dlp/issues/12402)) by [bashonly](https://github.com/bashonly) +- **gem.cbc.ca**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/5271ef48c6f61c145e03e18e960995d2e651d205) ([#12404](https://github.com/yt-dlp/yt-dlp/issues/12404)) by [bashonly](https://github.com/bashonly), [dirkf](https://github.com/dirkf) +- **generic**: [Extract `live_status` for DASH manifest URLs](https://github.com/yt-dlp/yt-dlp/commit/19edaa44fcd375f54e63d6227b092f5252d3e889) 
([#12256](https://github.com/yt-dlp/yt-dlp/issues/12256)) by [mp3butcher](https://github.com/mp3butcher) +- **globo**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f8d0161455f00add65585ca1a476a7b5d56f5f96) ([#11795](https://github.com/yt-dlp/yt-dlp/issues/11795)) by [slipinthedove](https://github.com/slipinthedove), [YoshiTabletopGamer](https://github.com/YoshiTabletopGamer) +- **goplay**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/d59f14a0a7a8b55e6bf468237def62b73ab4a517) ([#12237](https://github.com/yt-dlp/yt-dlp/issues/12237)) by [alard](https://github.com/alard) +- **pbs**: [Support www.thirteen.org URLs](https://github.com/yt-dlp/yt-dlp/commit/9fb8ab2ff67fb699f60cce09163a580976e90c0e) ([#11191](https://github.com/yt-dlp/yt-dlp/issues/11191)) by [rohieb](https://github.com/rohieb) +- **reddit**: [Bypass gated subreddit warning](https://github.com/yt-dlp/yt-dlp/commit/6ca23ffaa4663cb552f937f0b1e9769b66db11bd) ([#12335](https://github.com/yt-dlp/yt-dlp/issues/12335)) by [bashonly](https://github.com/bashonly) +- **twitter**: [Fix syndication token generation](https://github.com/yt-dlp/yt-dlp/commit/14cd7f3443c6da4d49edaefcc12da9dee86e243e) ([#12107](https://github.com/yt-dlp/yt-dlp/issues/12107)) by [Grub4K](https://github.com/Grub4K), [pjrobertson](https://github.com/pjrobertson) +- **youtube** + - [Retry on more critical requests](https://github.com/yt-dlp/yt-dlp/commit/d48e612609d012abbea3785be4d26d78a014abb2) ([#12339](https://github.com/yt-dlp/yt-dlp/issues/12339)) by [coletdjnz](https://github.com/coletdjnz) + - [nsig workaround for `tce` player JS](https://github.com/yt-dlp/yt-dlp/commit/ec17fb16e8d69d4e3e10fb73bf3221be8570dfee) ([#12401](https://github.com/yt-dlp/yt-dlp/issues/12401)) by [bashonly](https://github.com/bashonly) +- **zdf**: [Extract more metadata](https://github.com/yt-dlp/yt-dlp/commit/241ace4f104d50fdf7638f9203927aefcf57a1f7) ([#9565](https://github.com/yt-dlp/yt-dlp/issues/9565)) by [StefanLobbenmeier](https://github.com/StefanLobbenmeier) (With fixes in [e7882b6](https://github.com/yt-dlp/yt-dlp/commit/e7882b682b959e476d8454911655b3e9b14c79b2) by [bashonly](https://github.com/bashonly)) + +#### Downloader changes +- **hls** + - [Fix `BYTERANGE` logic](https://github.com/yt-dlp/yt-dlp/commit/10b7ff68e98f17655e31952f6e17120b2d7dda96) ([#11972](https://github.com/yt-dlp/yt-dlp/issues/11972)) by [entourage8](https://github.com/entourage8) + - [Support `--write-pages` for m3u8 media playlists](https://github.com/yt-dlp/yt-dlp/commit/be69468752ff598cacee57bb80533deab2367a5d) ([#12333](https://github.com/yt-dlp/yt-dlp/issues/12333)) by [bashonly](https://github.com/bashonly) + - [Support `hls_media_playlist_data` format field](https://github.com/yt-dlp/yt-dlp/commit/c987be0acb6872c6561f28aa28171e803393d851) ([#12322](https://github.com/yt-dlp/yt-dlp/issues/12322)) by [bashonly](https://github.com/bashonly) + +#### Misc. 
changes +- [Improve Issue/PR templates](https://github.com/yt-dlp/yt-dlp/commit/517ddf3c3f12560ab93e3d36244dc82db9f97818) ([#11499](https://github.com/yt-dlp/yt-dlp/issues/11499)) by [seproDev](https://github.com/seproDev) (With fixes in [4ecb833](https://github.com/yt-dlp/yt-dlp/commit/4ecb833472c90e078567b561fb7c089f1aa9587b) by [bashonly](https://github.com/bashonly)) +- **cleanup**: Miscellaneous: [4985a40](https://github.com/yt-dlp/yt-dlp/commit/4985a4041770eaa0016271809a1fd950dc809a55) by [dirkf](https://github.com/dirkf), [Grub4K](https://github.com/Grub4K), [StefanLobbenmeier](https://github.com/StefanLobbenmeier) +- **docs**: [Add note to `supportedsites.md`](https://github.com/yt-dlp/yt-dlp/commit/01a63629a21781458dcbd38779898e117678f5ff) ([#12382](https://github.com/yt-dlp/yt-dlp/issues/12382)) by [seproDev](https://github.com/seproDev) +- **test**: download: [Validate and sort info dict fields](https://github.com/yt-dlp/yt-dlp/commit/208163447408c78673b08c172beafe5c310fb167) ([#12299](https://github.com/yt-dlp/yt-dlp/issues/12299)) by [bashonly](https://github.com/bashonly), [pzhlkj6612](https://github.com/pzhlkj6612) + ### 2025.01.26 #### Core changes diff --git a/README.md b/README.md index 45c56434a..a87b52832 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ [![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](#installation "Installation") [![PyPI](https://img.shields.io/badge/-PyPI-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPI") [![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Collaborators.md#collaborators "Donate") -[![Matrix](https://img.shields.io/matrix/yt-dlp:matrix.org?color=brightgreen&labelColor=555555&label=&logo=element&style=for-the-badge)](https://matrix.to/#/#yt-dlp:matrix.org "Matrix") [![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)](https://discord.gg/H5MNcFW63r "Discord") [![Supported Sites](https://img.shields.io/badge/-Supported_Sites-brightgreen.svg?style=for-the-badge)](supportedsites.md "Supported Sites") [![License: Unlicense](https://img.shields.io/badge/-Unlicense-blue.svg?style=for-the-badge)](LICENSE "License") @@ -338,10 +337,11 @@ ## General Options: --plugin-dirs PATH Path to an additional directory to search for plugins. This option can be used multiple times to add multiple directories. - Note that this currently only works for - extractor plugins; postprocessor plugins can - only be loaded from the default plugin - directories + Use "default" to search the default plugin + directories (default) + --no-plugin-dirs Clear plugin directories to search, + including defaults and those provided by + previous --plugin-dirs --flat-playlist Do not extract a playlist's URL result entries; some entry metadata may be missing and downloading may be bypassed @@ -386,6 +386,12 @@ ## General Options: recursive options. As a safety measure, each alias may be triggered a maximum of 100 times. This option can be used multiple times + -t, --preset-alias PRESET Applies a predefined set of options. e.g. + --preset-alias mp3. The following presets + are available: mp3, aac, mp4, mkv, sleep. + See the "Preset Aliases" section at the end + for more info. This option can be used + multiple times ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. 
To @@ -1098,6 +1104,23 @@ ## Extractor Options: can use this option multiple times to give arguments for different extractors +## Preset Aliases: + -t mp3 -f 'ba[acodec^=mp3]/ba/b' -x --audio-format + mp3 + + -t aac -f + 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b' + -x --audio-format aac + + -t mp4 --merge-output-format mp4 --remux-video mp4 + -S vcodec:h264,lang,quality,res,fps,hdr:12,a + codec:aac + + -t mkv --merge-output-format mkv --remux-video mkv + + -t sleep --sleep-subtitles 5 --sleep-requests 0.75 + --sleep-interval 10 --max-sleep-interval 20 + # CONFIGURATION You can configure yt-dlp by placing any supported command line option in a configuration file. The configuration is loaded from the following locations: @@ -1526,7 +1549,7 @@ ## Sorting Formats - `hasvid`: Gives priority to formats that have a video stream - `hasaud`: Gives priority to formats that have an audio stream - `ie_pref`: The format preference - - `lang`: The language preference + - `lang`: The language preference as determined by the extractor (e.g. original language preferred over audio description) - `quality`: The quality of the format - `source`: The preference of the source - `proto`: Protocol used for download (`https`/`ftps` > `http`/`ftp` > `m3u8_native`/`m3u8` > `http_dash_segments`> `websocket_frag` > `mms`/`rtsp` > `f4f`/`f4m`) @@ -1769,8 +1792,8 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mweb`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as the `_creator` variants, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` -* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details +* `player_client`: Clients to extract video data from. 
The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv` and `tv_embedded`. By default, `tv,ios,web` is used, or `tv,web` is used when authenticating with cookies. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios`
+* `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details
 * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
 * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`
@@ -1782,6 +1805,7 @@ #### youtube
 * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage`
 * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID)
 * `po_token`: Proof of Origin (PO) Token(s) to use. Comma-separated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request)
+* `player_js_variant`: The player javascript variant to use for signature and nsig deciphering. The known variants are: `main`, `tce`, `tv`, `tv_es6`, `phone`, `tablet`. Only `main` is recommended as a possible workaround; the others are for debugging purposes. The default is to use what is prescribed by the site, which can be selected explicitly with `actual`
 
 #### youtubetab (YouTube playlists, channels, feeds, etc.)
 * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)
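A minimal sketch of how the extractor-args documented above can be passed when embedding yt-dlp (illustrative only, not part of the patch): the dict below mirrors what the CLI builds from `--extractor-args "youtube:player_client=default,-ios"`, and the video URL is a placeholder.

```python
import yt_dlp

ydl_opts = {
    # Keyed by lowercased extractor name; every argument value is a list of strings
    'extractor_args': {
        'youtube': {
            'player_client': ['default', '-ios'],  # default clients, minus ios
        },
    },
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # download=False extracts metadata only; placeholder URL
    info = ydl.extract_info('https://www.youtube.com/watch?v=XXXXXXXXXXX', download=False)
    print(info['id'], info['title'])
```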
@@ -1798,9 +1822,6 @@ #### generic
 #### vikichannel
 * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers`
 
-#### niconico
-* `segment_duration`: Segment duration in milliseconds for HLS-DMC formats. Use it at your own risk since this feature **may result in your account termination.**
-
 #### youtubewebarchive
 * `check_all`: Try to check more at the cost of more requests. One or more of `thumbnails`, `captures`
 
@@ -1812,6 +1833,9 @@ #### hotstar
 * `vcodec`: vcodec to ignore - one or more of `h264`, `h265`, `dvh265`
 * `dr`: dynamic range to ignore - one or more of `sdr`, `hdr10`, `dv`
 
+#### instagram
+* `app_id`: The value of the `X-IG-App-ID` header used for API requests. Default is the web app ID, `936619743392459`
+
 #### niconicochannelplus
 * `max_comments`: Maximum number of comments to extract - default is `120`
 
@@ -1863,6 +1887,9 @@ #### bilibili
 #### sonylivseries
 * `sort_order`: Episode sort order for series extraction - one of `asc` (ascending, oldest first) or `desc` (descending, newest first). Default is `asc`
 
+#### tver
+* `backend`: Backend API to use for extraction - one of `streaks` (default) or `brightcove` (deprecated)
+
 **Note**: These options may be changed/removed in the future without concern for backward compatibility
 
@@ -2146,7 +2173,7 @@ ### New features
 * **[Format Sorting](#sorting-formats)**: The default format sorting options have been changed so that higher resolution and better codecs will now be preferred instead of simply using larger bitrate. Furthermore, you can now specify the sort order using `-S`. This allows for much easier format selection than what is possible by simply using `--format` ([examples](#format-selection-examples))
-* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. Note that NicoNico livestreams are not available. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details.
+* **Merged with animelover1984/youtube-dl**: You get most of the features and improvements from [animelover1984/youtube-dl](https://github.com/animelover1984/youtube-dl) including `--write-comments`, `BiliBiliSearch`, `BilibiliChannel`, Embedding thumbnail in mp4/ogg/opus, playlist infojson etc. See [#31](https://github.com/yt-dlp/yt-dlp/pull/31) for details.
 * **YouTube improvements**:
    * Supports Clips, Stories (`ytstories:`), Search (including filters)**\***, YouTube Music Search, Channel-specific search, Search prefixes (`ytsearch:`, `ytsearchdate:`)**\***, Mixes, and Feeds (`:ytfav`, `:ytwatchlater`, `:ytsubs`, `:ythistory`, `:ytrec`, `:ytnotif`)
 
@@ -2212,7 +2239,7 @@ ### Differences in default behavior
 * Live chats (if available) are considered as subtitles. Use `--sub-langs all,-live_chat` to download all subtitles except live chat. You can also use `--compat-options no-live-chat` to prevent any live chat/danmaku from downloading
 * YouTube channel URLs download all uploads of the channel. To download only the videos in a specific tab, pass the tab's URL. If the channel does not show the requested tab, an error will be raised. Also, `/live` URLs raise an error if there are no live videos instead of silently downloading the entire channel.
You may use `--compat-options no-youtube-channel-redirect` to revert all these redirections * Unavailable videos are also listed for YouTube playlists. Use `--compat-options no-youtube-unavailable-videos` to remove this -* The upload dates extracted from YouTube are in UTC [when available](https://github.com/yt-dlp/yt-dlp/blob/89e4d86171c7b7c997c77d4714542e0383bf0db0/yt_dlp/extractor/youtube.py#L3898-L3900). Use `--compat-options no-youtube-prefer-utc-upload-date` to prefer the non-UTC upload date. +* The upload dates extracted from YouTube are in UTC. * If `ffmpeg` is used as the downloader, the downloading and merging of formats happen in a single step when possible. Use `--compat-options no-direct-merge` to revert this * Thumbnail embedding in `mp4` is done with mutagen if possible. Use `--compat-options embed-thumbnail-atomicparsley` to force the use of AtomicParsley instead * Some internal metadata such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this @@ -2231,9 +2258,10 @@ ### Differences in default behavior * `--compat-options all`: Use all compat options (**Do NOT use this!**) * `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` * `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` -* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` +* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` -* `--compat-options 2023`: Same as `--compat-options prefer-vp9-sort`. Use this to enable all future compat options +* `--compat-options 2023`: Same as `--compat-options 2024,prefer-vp9-sort` +* `--compat-options 2024`: Currently does nothing. Use this to enable all future compat options The following compat options restore vulnerable behavior from before security patches: diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 8aa7b7e2b..269de2c68 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -245,5 +245,14 @@ "when": "76ac023ff02f06e8c003d104f02a03deeddebdcd", "short": "[ie/youtube:tab] Improve shorts title extraction (#11997)", "authors": ["bashonly", "d3d9"] + }, + { + "action": "add", + "when": "88eb1e7a9a2720ac89d653c0d0e40292388823bb", + "short": "[priority] **New option `--preset-alias`/`-t` has been added**\nThis provides convenient predefined aliases for common use cases. Available presets include `mp4`, `mp3`, `mkv`, `aac`, and `sleep`. See [the README](https://github.com/yt-dlp/yt-dlp/blob/master/README.md#preset-aliases) for more details." 
+ }, + { + "action": "remove", + "when": "d596824c2f8428362c072518856065070616e348" } ] diff --git a/devscripts/make_issue_template.py b/devscripts/make_issue_template.py index 2a418ddbf..110fcc245 100644 --- a/devscripts/make_issue_template.py +++ b/devscripts/make_issue_template.py @@ -11,11 +11,13 @@ from devscripts.utils import get_filename_args, read_file, write_file -VERBOSE_TMPL = ''' +VERBOSE = ''' - type: checkboxes id: verbose attributes: label: Provide verbose output that clearly demonstrates the problem + description: | + This is mandatory unless absolutely impossible to provide. If you are unable to provide the output, please explain why. options: - label: Run **your** yt-dlp command with **-vU** flag added (`yt-dlp -vU `) required: true @@ -47,31 +49,23 @@ render: shell validations: required: true - - type: markdown - attributes: - value: | - > [!CAUTION] - > ### GitHub is experiencing a high volume of malicious spam comments. - > ### If you receive any replies asking you download a file, do NOT follow the download links! - > - > Note that this issue may be temporarily locked as an anti-spam measure after it is opened. '''.strip() NO_SKIP = ''' - - type: checkboxes + - type: markdown attributes: - label: DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE - description: Fill all fields even if you think it is irrelevant for the issue - options: - - label: I understand that I will be **blocked** if I *intentionally* remove or skip any mandatory\\* field - required: true + value: | + > [!IMPORTANT] + > Not providing the required (*) information or removing the template will result in your issue being closed and ignored. '''.strip() def main(): - fields = {'no_skip': NO_SKIP} - fields['verbose'] = VERBOSE_TMPL % fields - fields['verbose_optional'] = re.sub(r'(\n\s+validations:)?\n\s+required: true', '', fields['verbose']) + fields = { + 'no_skip': NO_SKIP, + 'verbose': VERBOSE, + 'verbose_optional': re.sub(r'(\n\s+validations:)?\n\s+required: true', '', VERBOSE), + } infile, outfile = get_filename_args(has_infile=True) write_file(outfile, read_file(infile) % fields) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index d288d8429..0ce773e82 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -10,6 +10,9 @@ from inspect import getsource from devscripts.utils import get_filename_args, read_file, write_file +from yt_dlp.extractor import import_extractors +from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor +from yt_dlp.globals import extractors NO_ATTR = object() STATIC_CLASS_PROPERTIES = [ @@ -38,8 +41,7 @@ def main(): lazy_extractors_filename = get_filename_args(default_outfile='yt_dlp/extractor/lazy_extractors.py') - from yt_dlp.extractor.extractors import _ALL_CLASSES - from yt_dlp.extractor.common import InfoExtractor, SearchInfoExtractor + import_extractors() DummyInfoExtractor = type('InfoExtractor', (InfoExtractor,), {'IE_NAME': NO_ATTR}) module_src = '\n'.join(( @@ -47,7 +49,7 @@ def main(): ' _module = None', *extra_ie_code(DummyInfoExtractor), '\nclass LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', - *build_ies(_ALL_CLASSES, (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor), + *build_ies(list(extractors.value.values()), (InfoExtractor, SearchInfoExtractor), DummyInfoExtractor), )) write_file(lazy_extractors_filename, f'{module_src}\n') @@ -73,7 +75,7 @@ def build_ies(ies, bases, attr_base): if ie in ies: names.append(ie.__name__) - yield f'\n_ALL_CLASSES = [{", 
".join(names)}]' + yield '\n_CLASS_LOOKUP = {%s}' % ', '.join(f'{name!r}: {name}' for name in names) def sort_ies(ies, ignored_bases): diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 01548ef97..145f6d47f 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -10,10 +10,21 @@ from devscripts.utils import get_filename_args, write_file from yt_dlp.extractor import list_extractor_classes +TEMPLATE = '''\ +# Supported sites + +Below is a list of all extractors that are currently included with yt-dlp. +If a site is not listed here, it might still be supported by yt-dlp's embed extraction or generic extractor. +Not all sites listed here are guaranteed to work; websites are constantly changing and sometimes this breaks yt-dlp's support for them. +The only reliable way to check if a site is supported is to try it. + +{ie_list} +''' + def main(): out = '\n'.join(ie.description() for ie in list_extractor_classes() if ie.IE_DESC is not False) - write_file(get_filename_args(), f'# Supported sites\n{out}\n') + write_file(get_filename_args(), TEMPLATE.format(ie_list=out)) if __name__ == '__main__': diff --git a/devscripts/run_tests.py b/devscripts/run_tests.py index eb614fe59..ebb3500b6 100755 --- a/devscripts/run_tests.py +++ b/devscripts/run_tests.py @@ -25,7 +25,8 @@ def parse_args(): def run_tests(*tests, pattern=None, ci=False): - run_core = 'core' in tests or (not pattern and not tests) + # XXX: hatch uses `tests` if no arguments are passed + run_core = 'core' in tests or 'tests' in tests or (not pattern and not tests) run_download = 'download' in tests pytest_args = args.pytest_args or os.getenv('HATCH_TEST_ARGS', '') diff --git a/pyproject.toml b/pyproject.toml index 5eb9a9644..7accaeeb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,8 +55,7 @@ default = [ "websockets>=13.0", ] curl-cffi = [ - "curl-cffi==0.5.10; os_name=='nt' and implementation_name=='cpython'", - "curl-cffi>=0.5.10,!=0.6.*,<0.7.2; os_name!='nt' and implementation_name=='cpython'", + "curl-cffi>=0.5.10,!=0.6.*,!=0.7.*,!=0.8.*,!=0.9.*,<0.11; implementation_name=='cpython'", ] secretstorage = [ "cffi", @@ -76,14 +75,14 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.9.0", + "ruff~=0.11.0", ] test = [ "pytest~=8.1", "pytest-rerunfailures~=14.0", ] pyinstaller = [ - "pyinstaller>=6.11.1", # Windows temp cleanup fixed in 6.11.1 + "pyinstaller>=6.13.0", # Windows temp cleanup fixed in 6.13.0 ] [project.urls] @@ -384,9 +383,14 @@ select = [ "W391", "W504", ] +exclude = "*/extractor/lazy_extractors.py,*venv*,*/test/testdata/sigs/player-*.js,.idea,.vscode" [tool.pytest.ini_options] -addopts = "-ra -v --strict-markers" +addopts = [ + "-ra", # summary: all except passed + "--verbose", + "--strict-markers", +] markers = [ "download", ] diff --git a/supportedsites.md b/supportedsites.md index 70909ef00..03bd8a7c3 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1,6 +1,13 @@ # Supported sites + +Below is a list of all extractors that are currently included with yt-dlp. +If a site is not listed here, it might still be supported by yt-dlp's embed extraction or generic extractor. +Not all sites listed here are guaranteed to work; websites are constantly changing and sometimes this breaks yt-dlp's support for them. +The only reliable way to check if a site is supported is to try it. 
+ - **17live** - **17live:clip** + - **17live:vod** - **1News**: 1news.co.nz article videos - **1tv**: Первый канал - **20min** @@ -194,7 +201,7 @@ # Supported sites - **blogger.com** - **Bloomberg** - **Bluesky** - - **BokeCC** + - **BokeCC**: CC视频 - **BongaCams** - **Boosty** - **BostonGlobe** @@ -218,6 +225,7 @@ # Supported sites - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **Bundesliga** - **Bundestag** + - **BunnyCdn** - **BusinessInsider** - **BuzzFeed** - **BYUtv**: (**Currently broken**) @@ -236,6 +244,7 @@ # Supported sites - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr + - **Canalsurmas** - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - **CartoonNetwork** - **cbc.ca** @@ -314,7 +323,8 @@ # Supported sites - **curiositystream**: [*curiositystream*](## "netrc machine") - **curiositystream:collections**: [*curiositystream*](## "netrc machine") - **curiositystream:series**: [*curiositystream*](## "netrc machine") - - **CWTV** + - **cwtv** + - **cwtv:movie** - **Cybrary**: [*cybrary*](## "netrc machine") - **CybraryCourse**: [*cybrary*](## "netrc machine") - **DacastPlaylist** @@ -338,8 +348,6 @@ # Supported sites - **daystar:clip** - **DBTV** - **DctpTv** - - **DeezerAlbum** - - **DeezerPlaylist** - **democracynow** - **DestinationAmerica** - **DetikEmbed** @@ -349,6 +357,7 @@ # Supported sites - **DigitalConcertHall**: [*digitalconcerthall*](## "netrc machine") DigitalConcertHall extractor - **DigitallySpeaking** - **Digiteka** + - **Digiview** - **DiscogsReleasePlaylist** - **DiscoveryLife** - **DiscoveryNetworksDe** @@ -385,6 +394,8 @@ # Supported sites - **dvtv**: http://video.aktualne.cz/ - **dw**: (**Currently broken**) - **dw:article**: (**Currently broken**) + - **dzen.ru**: Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen) + - **dzen.ru:channel** - **EaglePlatform** - **EbaumsWorld** - **Ebay** @@ -463,11 +474,12 @@ # Supported sites - **FoxNewsVideo** - **FoxSports** - **fptplay**: fptplay.vn + - **FrancaisFacile** - **FranceCulture** - **FranceInter** - - **FranceTV** + - **francetv** + - **francetv:site** - **francetvinfo.fr** - - **FranceTVSite** - **Freesound** - **freespeech.org** - **freetv:series** @@ -499,7 +511,7 @@ # Supported sites - **GediDigital** - **gem.cbc.ca**: [*cbcgem*](## "netrc machine") - **gem.cbc.ca:live** - - **gem.cbc.ca:playlist** + - **gem.cbc.ca:playlist**: [*cbcgem*](## "netrc machine") - **Genius** - **GeniusLyrics** - **Germanupa**: germanupa.de @@ -601,10 +613,10 @@ # Supported sites - **Inc** - **IndavideoEmbed** - **InfoQ** - - **Instagram**: [*instagram*](## "netrc machine") - - **instagram:story**: [*instagram*](## "netrc machine") - - **instagram:tag**: [*instagram*](## "netrc machine") Instagram hashtag search URLs - - **instagram:user**: [*instagram*](## "netrc machine") Instagram user profile (**Currently broken**) + - **Instagram** + - **instagram:story** + - **instagram:tag**: Instagram hashtag search URLs + - **instagram:user**: Instagram user profile (**Currently broken**) - **InstagramIOS**: IOS instagram:// URL - **Internazionale** - **InternetVideoArchive** @@ -624,6 +636,7 @@ # Supported sites - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV + - **Ivoox** - **IVXPlayer** - **iwara**: [*iwara*](## "netrc machine") - **iwara:playlist**: [*iwara*](## "netrc machine") @@ -653,7 +666,6 @@ # Supported sites - **KelbyOne**: (**Currently broken**) - **Kenh14Playlist** - **Kenh14Video** - - **Ketnet** - **khanacademy** - **khanacademy:unit** - 
**kick:clips** @@ -662,6 +674,7 @@ # Supported sites - **Kicker** - **KickStarter** - **Kika**: KiKA.de + - **KikaPlaylist** - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -714,6 +727,7 @@ # Supported sites - **limelight:channel** - **limelight:channel_list** - **LinkedIn**: [*linkedin*](## "netrc machine") + - **linkedin:events**: [*linkedin*](## "netrc machine") - **linkedin:learning**: [*linkedin*](## "netrc machine") - **linkedin:​learning:course**: [*linkedin*](## "netrc machine") - **Liputan6** @@ -725,9 +739,11 @@ # Supported sites - **Livestreamfails** - **Lnk** - **loc**: Library of Congress + - **Loco** - **loom** - **loom:folder** - **LoveHomePorn** + - **LRTRadio** - **LRTStream** - **LRTVOD** - **LSMLREmbed** @@ -749,7 +765,7 @@ # Supported sites - **ManotoTV**: Manoto TV (Episode) - **ManotoTVLive**: Manoto TV (Live) - **ManotoTVShow**: Manoto TV (Show) - - **ManyVids**: (**Currently broken**) + - **ManyVids** - **MaoriTV** - **Markiza**: (**Currently broken**) - **MarkizaPage**: (**Currently broken**) @@ -819,11 +835,11 @@ # Supported sites - **MotherlessUploader** - **Motorsport**: motorsport.com (**Currently broken**) - **MovieFap** - - **Moviepilot** + - **moviepilot**: Moviepilot trailer - **MoviewPlay** - **Moviezine** - **MovingImage** - - **MSN**: (**Currently broken**) + - **MSN** - **mtg**: MTG services - **mtv** - **mtv.de**: (**Currently broken**) @@ -936,7 +952,7 @@ # Supported sites - **nickelodeonru** - **niconico**: [*niconico*](## "netrc machine") ニコニコ動画 - **niconico:history**: NicoNico user history or likes. Requires cookies. - - **niconico:live**: ニコニコ生放送 + - **niconico:live**: [*niconico*](## "netrc machine") ニコニコ生放送 - **niconico:playlist** - **niconico:series** - **niconico:tag**: NicoNico video tag URLs @@ -1043,6 +1059,8 @@ # Supported sites - **Parler**: Posts on parler.com - **parliamentlive.tv**: UK parliament videos - **Parlview**: (**Currently broken**) + - **parti:livestream** + - **parti:video** - **patreon** - **patreon:campaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) @@ -1217,6 +1235,7 @@ # Supported sites - **RoosterTeeth**: [*roosterteeth*](## "netrc machine") - **RoosterTeethSeries**: [*roosterteeth*](## "netrc machine") - **RottenTomatoes** + - **RoyaLive** - **Rozhlas** - **RozhlasVltava** - **RTBF**: [*rtbf*](## "netrc machine") (**Currently broken**) @@ -1237,12 +1256,10 @@ # Supported sites - **RTVCKaltura** - **RTVCPlay** - **RTVCPlayEmbed** - - **rtve.es:alacarta**: RTVE a la carta + - **rtve.es:alacarta**: RTVE a la carta and Play - **rtve.es:audio**: RTVE audio - - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams - **rtve.es:television** - - **RTVS** - **rtvslo.si** - **rtvslo.si:show** - **RudoVideo** @@ -1297,8 +1314,8 @@ # Supported sites - **sejm** - **Sen** - **SenalColombiaLive**: (**Currently broken**) - - **SenateGov** - - **SenateISVP** + - **senate.gov** + - **senate.gov:isvp** - **SendtoNews**: (**Currently broken**) - **Servus** - **Sexu**: (**Currently broken**) @@ -1334,6 +1351,7 @@ # Supported sites - **Smotrim** - **SnapchatSpotlight** - **Snotr** + - **SoftWhiteUnderbelly**: [*softwhiteunderbelly*](## "netrc machine") - **Sohu** - **SohuV** - **SonyLIV**: [*sonyliv*](## "netrc machine") @@ -1390,12 +1408,14 @@ # Supported sites - **StoryFire** - **StoryFireSeries** - **StoryFireUser** + - **Streaks** - **Streamable** - **StreamCZ** - **StreetVoice** - **StretchInternet** - **Stripchat** - **stv:player** + - **stvr**: Slovak Television and Radio (formerly RTVS) - **Subsplash** - **subsplash:playlist** - **Substack** @@ -1528,6 +1548,8 @@ # Supported sites - 
**tv5unis** - **tv5unis:video** - **tv8.it** + - **tv8.it:live**: TV8 Live + - **tv8.it:playlist**: TV8 Playlist - **TVANouvelles** - **TVANouvellesArticle** - **tvaplus**: TVA+ @@ -1548,6 +1570,8 @@ # Supported sites - **tvp:​vod:series** - **TVPlayer** - **TVPlayHome** + - **tvw** + - **tvw:tvchannels** - **Tweakers** - **TwitCasting** - **TwitCastingLive** @@ -1629,8 +1653,6 @@ # Supported sites - **viewlift** - **viewlift:embed** - **Viidea** - - **viki**: [*viki*](## "netrc machine") - - **viki:channel**: [*viki*](## "netrc machine") - **vimeo**: [*vimeo*](## "netrc machine") - **vimeo:album**: [*vimeo*](## "netrc machine") - **vimeo:channel**: [*vimeo*](## "netrc machine") @@ -1668,8 +1690,12 @@ # Supported sites - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **vqq:series** - **vqq:video** + - **vrsquare**: VR SQUARE + - **vrsquare:channel** + - **vrsquare:search** + - **vrsquare:section** - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - - **VrtNU**: [*vrtnu*](## "netrc machine") VRT MAX + - **vrtmax**: [*vrtnu*](## "netrc machine") VRT MAX (formerly VRT NU) - **VTM**: (**Currently broken**) - **VTV** - **VTVGo** @@ -1804,14 +1830,12 @@ # Supported sites - **ZattooLive**: [*zattoo*](## "netrc machine") - **ZattooMovies**: [*zattoo*](## "netrc machine") - **ZattooRecordings**: [*zattoo*](## "netrc machine") - - **ZDF** - - **ZDFChannel** + - **zdf** + - **zdf:channel** - **Zee5**: [*zee5*](## "netrc machine") - **zee5:series** - **ZeeNews**: (**Currently broken**) - **ZenPorn** - - **ZenYandex** - - **ZenYandexChannel** - **ZetlandDKArticle** - **Zhihu** - **zingmp3**: zingmp3.vn diff --git a/test/helper.py b/test/helper.py index c776e70b7..e4cb478e2 100644 --- a/test/helper.py +++ b/test/helper.py @@ -101,87 +101,109 @@ def getwebpagetestcases(): md5 = lambda s: hashlib.md5(s.encode()).hexdigest() -def expect_value(self, got, expected, field): - if isinstance(expected, str) and expected.startswith('re:'): - match_str = expected[len('re:'):] - match_rex = re.compile(match_str) +def _iter_differences(got, expected, field): + if isinstance(expected, str): + op, _, val = expected.partition(':') + if op in ('mincount', 'maxcount', 'count'): + if not isinstance(got, (list, dict)): + yield field, f'expected either {list.__name__} or {dict.__name__}, got {type(got).__name__}' + return - self.assertTrue( - isinstance(got, str), - f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}') - self.assertTrue( - match_rex.match(got), - f'field {field} (value: {got!r}) should match {match_str!r}') - elif isinstance(expected, str) and expected.startswith('startswith:'): - start_str = expected[len('startswith:'):] - self.assertTrue( - isinstance(got, str), - f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}') - self.assertTrue( - got.startswith(start_str), - f'field {field} (value: {got!r}) should start with {start_str!r}') - elif isinstance(expected, str) and expected.startswith('contains:'): - contains_str = expected[len('contains:'):] - self.assertTrue( - isinstance(got, str), - f'Expected a {str.__name__} object, but got {type(got).__name__} for field {field}') - self.assertTrue( - contains_str in got, - f'field {field} (value: {got!r}) should contain {contains_str!r}') - elif isinstance(expected, type): - self.assertTrue( - isinstance(got, expected), - f'Expected type {expected!r} for field {field}, but got value {got!r} of type {type(got)!r}') - elif isinstance(expected, dict) and isinstance(got, dict): - 
        expect_dict(self, got, expected)
-    elif isinstance(expected, list) and isinstance(got, list):
-        self.assertEqual(
-            len(expected), len(got),
-            f'Expect a list of length {len(expected)}, but got a list of length {len(got)} for field {field}')
-        for index, (item_got, item_expected) in enumerate(zip(got, expected)):
-            type_got = type(item_got)
-            type_expected = type(item_expected)
-            self.assertEqual(
-                type_expected, type_got,
-                f'Type mismatch for list item at index {index} for field {field}, '
-                f'expected {type_expected!r}, got {type_got!r}')
-            expect_value(self, item_got, item_expected, field)
-    else:
-        if isinstance(expected, str) and expected.startswith('md5:'):
-            self.assertTrue(
-                isinstance(got, str),
-                f'Expected field {field} to be a unicode object, but got value {got!r} of type {type(got)!r}')
-            got = 'md5:' + md5(got)
-        elif isinstance(expected, str) and re.match(r'^(?:min|max)?count:\d+', expected):
-            self.assertTrue(
-                isinstance(got, (list, dict)),
-                f'Expected field {field} to be a list or a dict, but it is of type {type(got).__name__}')
-            op, _, expected_num = expected.partition(':')
-            expected_num = int(expected_num)
+            expected_num = int(val)
+            got_num = len(got)
             if op == 'mincount':
-                assert_func = assertGreaterEqual
-                msg_tmpl = 'Expected %d items in field %s, but only got %d'
-            elif op == 'maxcount':
-                assert_func = assertLessEqual
-                msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
-            elif op == 'count':
-                assert_func = assertEqual
-                msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
-            else:
-                assert False
-            assert_func(
-                self, len(got), expected_num,
-                msg_tmpl % (expected_num, field, len(got)))
+                if got_num < expected_num:
+                    yield field, f'expected at least {val} items, got {got_num}'
+                return
+
+            if op == 'maxcount':
+                if got_num > expected_num:
+                    yield field, f'expected at most {val} items, got {got_num}'
+                return
+
+            assert op == 'count'
+            if got_num != expected_num:
+                yield field, f'expected exactly {val} items, got {got_num}'
             return
-        self.assertEqual(
-            expected, got,
-            f'Invalid value for field {field}, expected {expected!r}, got {got!r}')
+
+        if not isinstance(got, str):
+            yield field, f'expected {str.__name__}, got {type(got).__name__}'
+            return
+
+        if op == 're':
+            if not re.match(val, got):
+                yield field, f'should match {val!r}, got {got!r}'
+            return
+
+        if op == 'startswith':
+            if not got.startswith(val):
+                yield field, f'should start with {val!r}, got {got!r}'
+            return
+
+        if op == 'contains':
+            if val not in got:
+                yield field, f'should contain {val!r}, got {got!r}'
+            return
+
+        if op == 'md5':
+            hash_val = md5(got)
+            if hash_val != val:
+                yield field, f'expected hash {val}, got {hash_val}'
+            return
+
+        if got != expected:
+            yield field, f'expected {expected!r}, got {got!r}'
+        return
+
+    if isinstance(expected, dict) and isinstance(got, dict):
+        for key, expected_val in expected.items():
+            if key not in got:
+                yield field, f'missing key: {key!r}'
+                continue
+
+            field_name = key if field is None else f'{field}.{key}'
+            yield from _iter_differences(got[key], expected_val, field_name)
+        return
+
+    if isinstance(expected, type):
+        if not isinstance(got, expected):
+            yield field, f'expected {expected.__name__}, got {type(got).__name__}'
+        return
+
+    if isinstance(expected, list) and isinstance(got, list):
+        # TODO: clever diffing algorithm lmao
+        if len(expected) != len(got):
+            yield field, f'expected length of {len(expected)}, got {len(got)}'
+            return
+
+        for index, (got_val, expected_val) in enumerate(zip(got, expected)):
+            field_name = str(index) if field is None else f'{field}.{index}'
+            yield from _iter_differences(got_val, expected_val, field_name)
+        return
+
+    if got != expected:
+        yield field, f'expected {expected!r}, got {got!r}'
+
+
+def _expect_value(message, got, expected, field):
+    mismatches = list(_iter_differences(got, expected, field))
+    if not mismatches:
+        return
+
+    fields = [field for field, _ in mismatches if field is not None]
+    return ''.join((
+        message, f' ({", ".join(fields)})' if fields else '',
+        *(f'\n\t{field}: {message}' for field, message in mismatches)))
+
+
+def expect_value(self, got, expected, field):
+    if message := _expect_value('values differ', got, expected, field):
+        self.fail(message)
 
 
 def expect_dict(self, got_dict, expected_dict):
-    for info_field, expected in expected_dict.items():
-        got = got_dict.get(info_field)
-        expect_value(self, got, expected, info_field)
+    if message := _expect_value('dictionaries differ', got_dict, expected_dict, None):
+        self.fail(message)
 
 
 def sanitize_got_info_dict(got_dict):
@@ -237,6 +259,20 @@ def sanitize(key, value):
 
 
 def expect_info_dict(self, got_dict, expected_dict):
+    ALLOWED_KEYS_SORT_ORDER = (
+        # NB: Keep in sync with the docstring of extractor/common.py
+        'id', 'ext', 'direct', 'display_id', 'title', 'alt_title', 'description', 'media_type',
+        'uploader', 'uploader_id', 'uploader_url', 'channel', 'channel_id', 'channel_url', 'channel_is_verified',
+        'channel_follower_count', 'comment_count', 'view_count', 'concurrent_view_count',
+        'like_count', 'dislike_count', 'repost_count', 'average_rating', 'age_limit', 'duration', 'thumbnail', 'heatmap',
+        'chapters', 'chapter', 'chapter_number', 'chapter_id', 'start_time', 'end_time', 'section_start', 'section_end',
+        'categories', 'tags', 'cast', 'composers', 'artists', 'album_artists', 'creators', 'genres',
+        'track', 'track_number', 'track_id', 'album', 'album_type', 'disc_number',
+        'series', 'series_id', 'season', 'season_number', 'season_id', 'episode', 'episode_number', 'episode_id',
+        'timestamp', 'upload_date', 'release_timestamp', 'release_date', 'release_year', 'modified_timestamp', 'modified_date',
+        'playable_in_embed', 'availability', 'live_status', 'location', 'license', '_old_archive_ids',
+    )
+
     expect_dict(self, got_dict, expected_dict)
     # Check for the presence of mandatory fields
     if got_dict.get('_type') not in ('playlist', 'multi_video'):
@@ -252,7 +288,13 @@ def expect_info_dict(self, got_dict, expected_dict):
 
     test_info_dict = sanitize_got_info_dict(got_dict)
 
-    missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
+    # Check for invalid/misspelled field names being returned by the extractor
+    invalid_keys = sorted(test_info_dict.keys() - ALLOWED_KEYS_SORT_ORDER)
+    self.assertFalse(invalid_keys, f'Invalid fields returned by the extractor: {", ".join(invalid_keys)}')
+
+    missing_keys = sorted(
+        test_info_dict.keys() - expected_dict.keys(),
+        key=lambda x: ALLOWED_KEYS_SORT_ORDER.index(x))
     if missing_keys:
         def _repr(v):
             if isinstance(v, str):
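For reference, a sketch of how the operator-prefixed expected values consumed by `_iter_differences` above typically look inside an extractor test definition (field values here are hypothetical placeholders, not taken from the real test suite); with this rework, `expect_dict` reports every mismatch in a single failure message instead of stopping at the first one.

```python
# Hypothetical expected-info dict demonstrating the supported operators
_TEST_INFO_DICT = {
    'id': 'abc123',                          # plain value: compared with ==
    'title': 're:^Test video',               # 're:' regex must match the got value
    'thumbnail': 'startswith:https://',      # 'startswith:' prefix check
    'uploader_url': 'contains:example.com',  # 'contains:' substring check
    'description': 'md5:0123456789abcdef0123456789abcdef',  # 'md5:' hash of the got value
    'tags': 'mincount:3',                    # 'count:'/'mincount:'/'maxcount:' length checks
    'duration': int,                         # bare type: isinstance() check
}
```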
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 54f35ef55..c6ff6209a 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -638,6 +638,7 @@ def test_parse_m3u8_formats(self):
             'img_bipbop_adv_example_fmp4',
             'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8',
             [{
+                # 60kbps (bitrate not provided in m3u8); sorted as worst because it's grouped with lowest bitrate video track
                 'format_id': 'aud1-English',
                 'url':
'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a1/prog_index.m3u8', 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', @@ -645,15 +646,9 @@ def test_parse_m3u8_formats(self): 'ext': 'mp4', 'protocol': 'm3u8_native', 'audio_ext': 'mp4', + 'source_preference': 0, }, { - 'format_id': 'aud2-English', - 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', - 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', - 'language': 'en', - 'ext': 'mp4', - 'protocol': 'm3u8_native', - 'audio_ext': 'mp4', - }, { + # 192kbps (bitrate not provided in m3u8) 'format_id': 'aud3-English', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a3/prog_index.m3u8', 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', @@ -661,6 +656,17 @@ def test_parse_m3u8_formats(self): 'ext': 'mp4', 'protocol': 'm3u8_native', 'audio_ext': 'mp4', + 'source_preference': 1, + }, { + # 384kbps (bitrate not provided in m3u8); sorted as best because it's grouped with the highest bitrate video track + 'format_id': 'aud2-English', + 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/a2/prog_index.m3u8', + 'manifest_url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/master.m3u8', + 'language': 'en', + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'audio_ext': 'mp4', + 'source_preference': 2, }, { 'format_id': '530', 'url': 'https://devstreaming-cdn.apple.com/videos/streaming/examples/img_bipbop_adv_example_fmp4/v2/prog_index.m3u8', diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 17e081bc6..708a04f92 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -6,6 +6,8 @@ import unittest from unittest.mock import patch +from yt_dlp.globals import all_plugins_loaded + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -1427,6 +1429,12 @@ def check_for_cookie_header(result): self.assertFalse(result.get('cookies'), msg='Cookies set in cookies field for wrong domain') self.assertFalse(ydl.cookiejar.get_cookie_header(fmt['url']), msg='Cookies set in cookiejar for wrong domain') + def test_load_plugins_compat(self): + # Should try to reload plugins if they haven't already been loaded + all_plugins_loaded.value = False + FakeYDL().close() + assert all_plugins_loaded.value + if __name__ == '__main__': unittest.main() diff --git a/test/test_http_proxy.py b/test/test_http_proxy.py index 2435c878a..e903ff8be 100644 --- a/test/test_http_proxy.py +++ b/test/test_http_proxy.py @@ -331,10 +331,6 @@ def test_http_connect_auth(self, handler, ctx): assert proxy_info['proxy'] == server_address assert 'Proxy-Authorization' in proxy_info['headers'] - @pytest.mark.skip_handler( - 'Requests', - 'bug in urllib3 causes unclosed socket: https://github.com/urllib3/urllib3/issues/3374', - ) def test_http_connect_bad_auth(self, handler, ctx): with ctx.http_server(HTTPConnectProxyHandler, username='test', password='test') as server_address: with handler(verify=False, proxies={ctx.REQUEST_PROTO: f'http://test:bad@{server_address}'}) as rh: diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 06840ed85..b14069ccc 100644 --- a/test/test_jsinterp.py +++ 
b/test/test_jsinterp.py @@ -9,7 +9,7 @@ import math -from yt_dlp.jsinterp import JS_Undefined, JSInterpreter +from yt_dlp.jsinterp import JS_Undefined, JSInterpreter, js_number_to_string class NaN: @@ -93,6 +93,16 @@ def test_operators(self): self._test('function f(){return 0 ?? 42;}', 0) self._test('function f(){return "life, the universe and everything" < 42;}', False) self._test('function f(){return 0 - 7 * - 6;}', 42) + self._test('function f(){return true << "5";}', 32) + self._test('function f(){return true << true;}', 2) + self._test('function f(){return "19" & "21.9";}', 17) + self._test('function f(){return "19" & false;}', 0) + self._test('function f(){return "11.0" >> "2.1";}', 2) + self._test('function f(){return 5 ^ 9;}', 12) + self._test('function f(){return 0.0 << NaN}', 0) + self._test('function f(){return null << undefined}', 0) + # TODO: Does not work due to number too large + # self._test('function f(){return 21 << 4294967297}', 42) def test_array_access(self): self._test('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}', [5, 2, 7]) @@ -108,6 +118,7 @@ def test_assignments(self): self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31) self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) + self._test('function f(){var x = 2; var y = ["a", "b"]; y[x%y["length"]]="z"; return y}', ['z', 'b']) @unittest.skip('Not implemented') def test_comments(self): @@ -374,7 +385,7 @@ def test_negative(self): @unittest.skip('Not implemented') def test_packed(self): jsi = JSInterpreter('''function f(p,a,c,k,e,d){while(c--)if(k[c])p=p.replace(new RegExp('\\b'+c.toString(a)+'\\b','g'),k[c]);return p}''') - self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 
9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon|sources|setup'''.split('|'))) + self.assertEqual(jsi.call_function('f', '''h 7=g("1j");7.7h({7g:[{33:"w://7f-7e-7d-7c.v.7b/7a/79/78/77/76.74?t=73&s=2s&e=72&f=2t&71=70.0.0.1&6z=6y&6x=6w"}],6v:"w://32.v.u/6u.31",16:"r%",15:"r%",6t:"6s",6r:"",6q:"l",6p:"l",6o:"6n",6m:\'6l\',6k:"6j",9:[{33:"/2u?b=6i&n=50&6h=w://32.v.u/6g.31",6f:"6e"}],1y:{6d:1,6c:\'#6b\',6a:\'#69\',68:"67",66:30,65:r,},"64":{63:"%62 2m%m%61%5z%5y%5x.u%5w%5v%5u.2y%22 2k%m%1o%22 5t%m%1o%22 5s%m%1o%22 2j%m%5r%22 16%m%5q%22 15%m%5p%22 5o%2z%5n%5m%2z",5l:"w://v.u/d/1k/5k.2y",5j:[]},\'5i\':{"5h":"5g"},5f:"5e",5d:"w://v.u",5c:{},5b:l,1x:[0.25,0.50,0.75,1,1.25,1.5,2]});h 1m,1n,5a;h 59=0,58=0;h 7=g("1j");h 2x=0,57=0,56=0;$.55({54:{\'53-52\':\'2i-51\'}});7.j(\'4z\',6(x){c(5>0&&x.1l>=5&&1n!=1){1n=1;$(\'q.4y\').4x(\'4w\')}});7.j(\'13\',6(x){2x=x.1l});7.j(\'2g\',6(x){2w(x)});7.j(\'4v\',6(){$(\'q.2v\').4u()});6 
2w(x){$(\'q.2v\').4t();c(1m)19;1m=1;17=0;c(4s.4r===l){17=1}$.4q(\'/2u?b=4p&2l=1k&4o=2t-4n-4m-2s-4l&4k=&4j=&4i=&17=\'+17,6(2r){$(\'#4h\').4g(2r)});$(\'.3-8-4f-4e:4d("4c")\').2h(6(e){2q();g().4b(0);g().4a(l)});6 2q(){h $14=$("").2p({1l:"49",16:"r%",15:"r%",48:0,2n:0,2o:47,46:"45(10%, 10%, 10%, 0.4)","44-43":"42"});$("<41 />").2p({16:"60%",15:"60%",2o:40,"3z-2n":"3y"}).3x({\'2m\':\'/?b=3w&2l=1k\',\'2k\':\'0\',\'2j\':\'2i\'}).2f($14);$14.2h(6(){$(3v).3u();g().2g()});$14.2f($(\'#1j\'))}g().13(0);}6 3t(){h 9=7.1b(2e);2d.2c(9);c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==2e){2d.2c(\'!!=\'+i);7.1p(i)}}}}7.j(\'3s\',6(){g().1h("/2a/3r.29","3q 10 28",6(){g().13(g().27()+10)},"2b");$("q[26=2b]").23().21(\'.3-20-1z\');g().1h("/2a/3p.29","3o 10 28",6(){h 12=g().27()-10;c(12<0)12=0;g().13(12)},"24");$("q[26=24]").23().21(\'.3-20-1z\');});6 1i(){}7.j(\'3n\',6(){1i()});7.j(\'3m\',6(){1i()});7.j("k",6(y){h 9=7.1b();c(9.n<2)19;$(\'.3-8-3l-3k\').3j(6(){$(\'#3-8-a-k\').1e(\'3-8-a-z\');$(\'.3-a-k\').p(\'o-1f\',\'11\')});7.1h("/3i/3h.3g","3f 3e",6(){$(\'.3-1w\').3d(\'3-8-1v\');$(\'.3-8-1y, .3-8-1x\').p(\'o-1g\',\'11\');c($(\'.3-1w\').3c(\'3-8-1v\')){$(\'.3-a-k\').p(\'o-1g\',\'l\');$(\'.3-a-k\').p(\'o-1f\',\'l\');$(\'.3-8-a\').1e(\'3-8-a-z\');$(\'.3-8-a:1u\').3b(\'3-8-a-z\')}3a{$(\'.3-a-k\').p(\'o-1g\',\'11\');$(\'.3-a-k\').p(\'o-1f\',\'11\');$(\'.3-8-a:1u\').1e(\'3-8-a-z\')}},"39");7.j("38",6(y){1d.37(\'1c\',y.9[y.36].1a)});c(1d.1t(\'1c\')){35("1s(1d.1t(\'1c\'));",34)}});h 18;6 1s(1q){h 9=7.1b();c(9.n>1){1r(i=0;i<9.n;i++){c(9[i].1a==1q){c(i==18){19}18=i;7.1p(i)}}}}',36,270,'|||jw|||function|player|settings|tracks|submenu||if||||jwplayer|var||on|audioTracks|true|3D|length|aria|attr|div|100|||sx|filemoon|https||event|active||false|tt|seek|dd|height|width|adb|current_audio|return|name|getAudioTracks|default_audio|localStorage|removeClass|expanded|checked|addButton|callMeMaybe|vplayer|0fxcyc2ajhp1|position|vvplay|vvad|220|setCurrentAudioTrack|audio_name|for|audio_set|getItem|last|open|controls|playbackRates|captions|rewind|icon|insertAfter||detach|ff00||button|getPosition|sec|png|player8|ff11|log|console|track_name|appendTo|play|click|no|scrolling|frameborder|file_code|src|top|zIndex|css|showCCform|data|1662367683|383371|dl|video_ad|doPlay|prevt|mp4|3E||jpg|thumbs|file|300|setTimeout|currentTrack|setItem|audioTrackChanged|dualSound|else|addClass|hasClass|toggleClass|Track|Audio|svg|dualy|images|mousedown|buttons|topbar|playAttemptFailed|beforePlay|Rewind|fr|Forward|ff|ready|set_audio_track|remove|this|upload_srt|prop|50px|margin|1000001|iframe|center|align|text|rgba|background|1000000|left|absolute|pause|setCurrentCaptions|Upload|contains|item|content|html|fviews|referer|prem|embed|3e57249ef633e0d03bf76ceb8d8a4b65|216|83|hash|view|get|TokenZir|window|hide|show|complete|slow|fadeIn|video_ad_fadein|time||cache|Cache|Content|headers|ajaxSetup|v2done|tott|vastdone2|vastdone1|vvbefore|playbackRateControls|cast|aboutlink|FileMoon|abouttext|UHD|1870|qualityLabels|sites|GNOME_POWER|link|2Fiframe|3C|allowfullscreen|22360|22640|22no|marginheight|marginwidth|2FGNOME_POWER|2F0fxcyc2ajhp1|2Fe|2Ffilemoon|2F|3A||22https|3Ciframe|code|sharing|fontOpacity|backgroundOpacity|Tahoma|fontFamily|303030|backgroundColor|FFFFFF|color|userFontScale|thumbnails|kind|0fxcyc2ajhp10000|url|get_slides|start|startparam|none|preload|html5|primary|hlshtml|androidhls|duration|uniform|stretching|0fxcyc2ajhp1_xt|image|2048|sp|6871|asn|127|srv|43200|_g3XlBcu2lmD9oDexD2NLWSmah2Nu3XcDrl93m9PwXY|m3u8||master|0fxcyc2ajhp1_x|00076|01|hls2|to|s01|delivery|storage|moon
|sources|setup'''.split('|'))) # noqa: SIM905 def test_join(self): test_input = list('test') @@ -393,6 +404,8 @@ def test_split(self): test_result = list('test') tests = [ 'function f(a, b){return a.split(b)}', + 'function f(a, b){return a["split"](b)}', + 'function f(a, b){let x = ["split"]; return a[x[0]](b)}', 'function f(a, b){return String.prototype.split.call(a, b)}', 'function f(a, b){return String.prototype.split.apply(a, [b])}', ] @@ -431,6 +444,40 @@ def test_slice(self): self._test('function f(){return "012345678".slice(-1, 1)}', '') self._test('function f(){return "012345678".slice(-3, -1)}', '67') + def test_splice(self): + self._test('function f(){var T = ["0", "1", "2"]; T["splice"](2, 1, "0")[0]; return T }', ['0', '1', '0']) + + def test_js_number_to_string(self): + for test, radix, expected in [ + (0, None, '0'), + (-0, None, '0'), + (0.0, None, '0'), + (-0.0, None, '0'), + (math.nan, None, 'NaN'), + (-math.nan, None, 'NaN'), + (math.inf, None, 'Infinity'), + (-math.inf, None, '-Infinity'), + (10 ** 21.5, 8, '526665530627250154000000'), + (6, 2, '110'), + (254, 16, 'fe'), + (-10, 2, '-1010'), + (-0xff, 2, '-11111111'), + (0.1 + 0.2, 16, '0.4cccccccccccd'), + (1234.1234, 10, '1234.1234'), + # (1000000000000000128, 10, '1000000000000000100') + ]: + assert js_number_to_string(test, radix) == expected + + def test_extract_function(self): + jsi = JSInterpreter('function a(b) { return b + 1; }') + func = jsi.extract_function('a') + self.assertEqual(func([2]), 3) + + def test_extract_function_with_global_stack(self): + jsi = JSInterpreter('function c(d) { return d + e + f + g; }') + func = jsi.extract_function('c', {'e': 10}, {'f': 100, 'g': 1000}) + self.assertEqual(func([1]), 1111) + if __name__ == '__main__': unittest.main() diff --git a/test/test_networking.py b/test/test_networking.py index d96624af1..2f441fced 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -39,6 +39,7 @@ from yt_dlp.dependencies import brotli, curl_cffi, requests, urllib3 from yt_dlp.networking import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, @@ -614,7 +615,6 @@ def test_source_address(self, handler): rh, Request(f'http://127.0.0.1:{self.http_port}/source_address')).read().decode() assert source_address == data - # Not supported by CurlCFFI @pytest.mark.skip_handler('CurlCFFI', 'not supported by curl-cffi') def test_gzip_trailing_garbage(self, handler): with handler() as rh: @@ -720,6 +720,15 @@ def test_allproxy(self, handler): rh, Request( f'http://127.0.0.1:{self.http_port}/headers', proxies={'all': 'http://10.255.255.255'})).close() + @pytest.mark.skip_handlers_if(lambda _, handler: handler not in ['Urllib', 'CurlCFFI'], 'handler does not support keep_header_casing') + def test_keep_header_casing(self, handler): + with handler() as rh: + res = validate_and_send( + rh, Request( + f'http://127.0.0.1:{self.http_port}/headers', headers={'X-test-heaDer': 'test'}, extensions={'keep_header_casing': True})).read().decode() + + assert 'X-test-heaDer: test' in res + @pytest.mark.parametrize('handler', ['Urllib', 'Requests', 'CurlCFFI'], indirect=True) class TestClientCertificate: @@ -1289,6 +1298,7 @@ class HTTPSupportedRH(ValidationRH): ({'legacy_ssl': False}, False), ({'legacy_ssl': True}, False), ({'legacy_ssl': 'notabool'}, AssertionError), + ({'keep_header_casing': True}, UnsupportedRequest), ]), ('Requests', 'http', [ ({'cookiejar': 'notacookiejar'}, AssertionError), @@ -1299,6 +1309,9 @@ class HTTPSupportedRH(ValidationRH): ({'legacy_ssl': False}, False), 
({'legacy_ssl': True}, False), ({'legacy_ssl': 'notabool'}, AssertionError), + ({'keep_header_casing': False}, False), + ({'keep_header_casing': True}, False), + ({'keep_header_casing': 'notabool'}, AssertionError), ]), ('CurlCFFI', 'http', [ ({'cookiejar': 'notacookiejar'}, AssertionError), @@ -1844,6 +1857,7 @@ def test_method(self): def test_request_helpers(self): assert HEADRequest('http://example.com').method == 'HEAD' + assert PATCHRequest('http://example.com').method == 'PATCH' assert PUTRequest('http://example.com').method == 'PUT' def test_headers(self): diff --git a/test/test_plugins.py b/test/test_plugins.py index 77545d136..195726b18 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -10,22 +10,71 @@ sys.path.append(str(TEST_DATA_DIR)) importlib.invalidate_caches() -from yt_dlp.utils import Config -from yt_dlp.plugins import PACKAGE_NAME, directories, load_plugins +from yt_dlp.plugins import ( + PACKAGE_NAME, + PluginSpec, + directories, + load_plugins, + load_all_plugins, + register_plugin_spec, +) + +from yt_dlp.globals import ( + extractors, + postprocessors, + plugin_dirs, + plugin_ies, + plugin_pps, + all_plugins_loaded, + plugin_specs, +) + + +EXTRACTOR_PLUGIN_SPEC = PluginSpec( + module_name='extractor', + suffix='IE', + destination=extractors, + plugin_destination=plugin_ies, +) + +POSTPROCESSOR_PLUGIN_SPEC = PluginSpec( + module_name='postprocessor', + suffix='PP', + destination=postprocessors, + plugin_destination=plugin_pps, +) + + +def reset_plugins(): + plugin_ies.value = {} + plugin_pps.value = {} + plugin_dirs.value = ['default'] + plugin_specs.value = {} + all_plugins_loaded.value = False + # Clearing override plugins is probably difficult + for module_name in tuple(sys.modules): + for plugin_type in ('extractor', 'postprocessor'): + if module_name.startswith(f'{PACKAGE_NAME}.{plugin_type}.'): + del sys.modules[module_name] + + importlib.invalidate_caches() class TestPlugins(unittest.TestCase): TEST_PLUGIN_DIR = TEST_DATA_DIR / PACKAGE_NAME + def setUp(self): + reset_plugins() + + def tearDown(self): + reset_plugins() + def test_directories_containing_plugins(self): self.assertIn(self.TEST_PLUGIN_DIR, map(Path, directories())) def test_extractor_classes(self): - for module_name in tuple(sys.modules): - if module_name.startswith(f'{PACKAGE_NAME}.extractor'): - del sys.modules[module_name] - plugins_ie = load_plugins('extractor', 'IE') + plugins_ie = load_plugins(EXTRACTOR_PLUGIN_SPEC) self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) self.assertIn('NormalPluginIE', plugins_ie.keys()) @@ -35,17 +84,29 @@ def test_extractor_classes(self): f'{PACKAGE_NAME}.extractor._ignore' in sys.modules, 'loaded module beginning with underscore') self.assertNotIn('IgnorePluginIE', plugins_ie.keys()) + self.assertNotIn('IgnorePluginIE', plugin_ies.value) # Don't load extractors with underscore prefix self.assertNotIn('_IgnoreUnderscorePluginIE', plugins_ie.keys()) + self.assertNotIn('_IgnoreUnderscorePluginIE', plugin_ies.value) # Don't load extractors not specified in __all__ (if supplied) self.assertNotIn('IgnoreNotInAllPluginIE', plugins_ie.keys()) + self.assertNotIn('IgnoreNotInAllPluginIE', plugin_ies.value) self.assertIn('InAllPluginIE', plugins_ie.keys()) + self.assertIn('InAllPluginIE', plugin_ies.value) + + # Don't load override extractors + self.assertNotIn('OverrideGenericIE', plugins_ie.keys()) + self.assertNotIn('OverrideGenericIE', plugin_ies.value) + self.assertNotIn('_UnderscoreOverrideGenericIE', plugins_ie.keys()) + 
self.assertNotIn('_UnderscoreOverrideGenericIE', plugin_ies.value) def test_postprocessor_classes(self): - plugins_pp = load_plugins('postprocessor', 'PP') + plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) self.assertIn('NormalPluginPP', plugins_pp.keys()) + self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + self.assertIn('NormalPluginPP', plugin_pps.value) def test_importing_zipped_module(self): zip_path = TEST_DATA_DIR / 'zipped_plugins.zip' @@ -58,10 +119,10 @@ def test_importing_zipped_module(self): package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') self.assertIn(zip_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) - plugins_ie = load_plugins('extractor', 'IE') + plugins_ie = load_plugins(EXTRACTOR_PLUGIN_SPEC) self.assertIn('ZippedPluginIE', plugins_ie.keys()) - plugins_pp = load_plugins('postprocessor', 'PP') + plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) self.assertIn('ZippedPluginPP', plugins_pp.keys()) finally: @@ -69,23 +130,116 @@ def test_importing_zipped_module(self): os.remove(zip_path) importlib.invalidate_caches() # reset the import caches - def test_plugin_dirs(self): - # Internal plugin dirs hack for CLI --plugin-dirs - # To be replaced with proper system later - custom_plugin_dir = TEST_DATA_DIR / 'plugin_packages' - Config._plugin_dirs = [str(custom_plugin_dir)] - importlib.invalidate_caches() # reset the import caches + def test_reloading_plugins(self): + reload_plugins_path = TEST_DATA_DIR / 'reload_plugins' + load_plugins(EXTRACTOR_PLUGIN_SPEC) + load_plugins(POSTPROCESSOR_PLUGIN_SPEC) + # Remove default folder and add reload_plugin path + sys.path.remove(str(TEST_DATA_DIR)) + sys.path.append(str(reload_plugins_path)) + importlib.invalidate_caches() try: - package = importlib.import_module(f'{PACKAGE_NAME}.extractor') - self.assertIn(custom_plugin_dir / 'testpackage' / PACKAGE_NAME / 'extractor', map(Path, package.__path__)) + for plugin_type in ('extractor', 'postprocessor'): + package = importlib.import_module(f'{PACKAGE_NAME}.{plugin_type}') + self.assertIn(reload_plugins_path / PACKAGE_NAME / plugin_type, map(Path, package.__path__)) - plugins_ie = load_plugins('extractor', 'IE') - self.assertIn('PackagePluginIE', plugins_ie.keys()) + plugins_ie = load_plugins(EXTRACTOR_PLUGIN_SPEC) + self.assertIn('NormalPluginIE', plugins_ie.keys()) + self.assertTrue( + plugins_ie['NormalPluginIE'].REPLACED, + msg='Reloading has not replaced original extractor plugin') + self.assertTrue( + extractors.value['NormalPluginIE'].REPLACED, + msg='Reloading has not replaced original extractor plugin globally') + + plugins_pp = load_plugins(POSTPROCESSOR_PLUGIN_SPEC) + self.assertIn('NormalPluginPP', plugins_pp.keys()) + self.assertTrue(plugins_pp['NormalPluginPP'].REPLACED, + msg='Reloading has not replaced original postprocessor plugin') + self.assertTrue( + postprocessors.value['NormalPluginPP'].REPLACED, + msg='Reloading has not replaced original postprocessor plugin globally') finally: - Config._plugin_dirs = [] - importlib.invalidate_caches() # reset the import caches + sys.path.remove(str(reload_plugins_path)) + sys.path.append(str(TEST_DATA_DIR)) + importlib.invalidate_caches() + + def test_extractor_override_plugin(self): + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + from yt_dlp.extractor.generic import GenericIE + + self.assertEqual(GenericIE.TEST_FIELD, 'override') + self.assertEqual(GenericIE.SECONDARY_TEST_FIELD, 'underscore-override') + + self.assertEqual(GenericIE.IE_NAME, 
'generic+override+underscore-override') + importlib.invalidate_caches() + # test that loading a second time doesn't wrap a second time + load_plugins(EXTRACTOR_PLUGIN_SPEC) + from yt_dlp.extractor.generic import GenericIE + self.assertEqual(GenericIE.IE_NAME, 'generic+override+underscore-override') + + def test_load_all_plugin_types(self): + + # no plugin specs registered + load_all_plugins() + + self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + + register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) + register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + load_all_plugins() + self.assertTrue(all_plugins_loaded.value) + + self.assertIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + + def test_no_plugin_dirs(self): + register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) + register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + + plugin_dirs.value = [] + load_all_plugins() + + self.assertNotIn(f'{PACKAGE_NAME}.extractor.normal', sys.modules.keys()) + self.assertNotIn(f'{PACKAGE_NAME}.postprocessor.normal', sys.modules.keys()) + + def test_set_plugin_dirs(self): + custom_plugin_dir = str(TEST_DATA_DIR / 'plugin_packages') + plugin_dirs.value = [custom_plugin_dir] + + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + self.assertIn(f'{PACKAGE_NAME}.extractor.package', sys.modules.keys()) + self.assertIn('PackagePluginIE', plugin_ies.value) + + def test_invalid_plugin_dir(self): + plugin_dirs.value = ['invalid_dir'] + with self.assertRaises(ValueError): + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + def test_append_plugin_dirs(self): + custom_plugin_dir = str(TEST_DATA_DIR / 'plugin_packages') + + self.assertEqual(plugin_dirs.value, ['default']) + plugin_dirs.value.append(custom_plugin_dir) + self.assertEqual(plugin_dirs.value, ['default', custom_plugin_dir]) + + load_plugins(EXTRACTOR_PLUGIN_SPEC) + + self.assertIn(f'{PACKAGE_NAME}.extractor.package', sys.modules.keys()) + self.assertIn('PackagePluginIE', plugin_ies.value) + + def test_get_plugin_spec(self): + register_plugin_spec(EXTRACTOR_PLUGIN_SPEC) + register_plugin_spec(POSTPROCESSOR_PLUGIN_SPEC) + + self.assertEqual(plugin_specs.value.get('extractor'), EXTRACTOR_PLUGIN_SPEC) + self.assertEqual(plugin_specs.value.get('postprocessor'), POSTPROCESSOR_PLUGIN_SPEC) + self.assertIsNone(plugin_specs.value.get('invalid')) if __name__ == '__main__': diff --git a/test/test_subtitles.py b/test/test_subtitles.py index f3b005617..efd69b33d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -23,7 +23,6 @@ TedTalkIE, ThePlatformFeedIE, ThePlatformIE, - VikiIE, VimeoIE, WallaIE, YoutubeIE, @@ -331,20 +330,6 @@ def test_subtitles_array_key(self): self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') -@is_download_test -@unittest.skip('IE broken - DRM only') -class TestVikiSubtitles(BaseTestSubtitles): - url = 'http://www.viki.com/videos/1060846v-punch-episode-18' - IE = VikiIE - - def test_allsubtitles(self): - self.DL.params['writesubtitles'] = True - self.DL.params['allsubtitles'] = True - subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), {'en'}) - self.assertEqual(md5(subtitles['en']), '53cb083a5914b2d84ef1ab67b880d18a') - - @is_download_test class TestThePlatformSubtitles(BaseTestSubtitles): # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/ diff --git a/test/test_utils.py b/test/test_utils.py index 
4f4f1daa4..0395367cb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,19 +3,20 @@ # Allow direct execution import os import sys -import unittest -import unittest.mock -import warnings -import datetime as dt sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import contextlib +import datetime as dt import io import itertools import json +import pickle import subprocess +import unittest +import unittest.mock +import warnings import xml.etree.ElementTree from yt_dlp.compat import ( @@ -218,11 +219,8 @@ def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') + @unittest.mock.patch('sys.platform', 'win32') def test_sanitize_path(self): - with unittest.mock.patch('sys.platform', 'win32'): - self._test_sanitize_path() - - def _test_sanitize_path(self): self.assertEqual(sanitize_path('abc'), 'abc') self.assertEqual(sanitize_path('abc/def'), 'abc\\def') self.assertEqual(sanitize_path('abc\\def'), 'abc\\def') @@ -253,10 +251,8 @@ def _test_sanitize_path(self): # Check with nt._path_normpath if available try: - import nt - - nt_path_normpath = getattr(nt, '_path_normpath', None) - except Exception: + from nt import _path_normpath as nt_path_normpath + except ImportError: nt_path_normpath = None for test, expected in [ @@ -668,6 +664,8 @@ def test_url_or_none(self): self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') + self.assertEqual(url_or_none('ws://foo.de'), 'ws://foo.de') + self.assertEqual(url_or_none('wss://foo.de'), 'wss://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) @@ -1269,6 +1267,7 @@ def test_js_to_json_edgecases(self): def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') + self.assertEqual(js_to_json('{a: `${e("")}`}'), '{"a": "\\"e\\"(\\"\\")"}') def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`Hello ${name}`', {'name': '"world"'}), '"Hello world"') @@ -2092,21 +2091,26 @@ def test_http_header_dict(self): headers = HTTPHeaderDict() headers['ytdl-test'] = b'0' self.assertEqual(list(headers.items()), [('Ytdl-Test', '0')]) + self.assertEqual(list(headers.sensitive().items()), [('ytdl-test', '0')]) headers['ytdl-test'] = 1 self.assertEqual(list(headers.items()), [('Ytdl-Test', '1')]) + self.assertEqual(list(headers.sensitive().items()), [('ytdl-test', '1')]) headers['Ytdl-test'] = '2' self.assertEqual(list(headers.items()), [('Ytdl-Test', '2')]) + self.assertEqual(list(headers.sensitive().items()), [('Ytdl-test', '2')]) self.assertTrue('ytDl-Test' in headers) self.assertEqual(str(headers), str(dict(headers))) self.assertEqual(repr(headers), str(dict(headers))) headers.update({'X-dlp': 'data'}) self.assertEqual(set(headers.items()), {('Ytdl-Test', '2'), ('X-Dlp', 'data')}) + self.assertEqual(set(headers.sensitive().items()), {('Ytdl-test', '2'), ('X-dlp', 'data')}) self.assertEqual(dict(headers), {'Ytdl-Test': '2', 'X-Dlp': 'data'}) self.assertEqual(len(headers), 2) self.assertEqual(headers.copy(), headers) - headers2 = HTTPHeaderDict({'X-dlp': 'data3'}, **headers, **{'X-dlp': 'data2'}) + headers2 = HTTPHeaderDict({'X-dlp': 'data3'}, headers, **{'X-dlP': 'data2'}) self.assertEqual(set(headers2.items()), {('Ytdl-Test', '2'), ('X-Dlp', 
'data2')}) + self.assertEqual(set(headers2.sensitive().items()), {('Ytdl-test', '2'), ('X-dlP', 'data2')}) self.assertEqual(len(headers2), 2) headers2.clear() self.assertEqual(len(headers2), 0) @@ -2114,16 +2118,23 @@ def test_http_header_dict(self): # ensure we prefer latter headers headers3 = HTTPHeaderDict({'Ytdl-TeSt': 1}, {'Ytdl-test': 2}) self.assertEqual(set(headers3.items()), {('Ytdl-Test', '2')}) + self.assertEqual(set(headers3.sensitive().items()), {('Ytdl-test', '2')}) del headers3['ytdl-tesT'] self.assertEqual(dict(headers3), {}) headers4 = HTTPHeaderDict({'ytdl-test': 'data;'}) self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')}) + self.assertEqual(set(headers4.sensitive().items()), {('ytdl-test', 'data;')}) # common mistake: strip whitespace from values # https://github.com/yt-dlp/yt-dlp/issues/8729 headers5 = HTTPHeaderDict({'ytdl-test': ' data; '}) self.assertEqual(set(headers5.items()), {('Ytdl-Test', 'data;')}) + self.assertEqual(set(headers5.sensitive().items()), {('ytdl-test', 'data;')}) + + # test if picklable + headers6 = HTTPHeaderDict(a=1, b=2) + self.assertEqual(pickle.loads(pickle.dumps(headers6)), headers6) def test_extract_basic_auth(self): assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None) diff --git a/test/test_websockets.py b/test/test_websockets.py index 06112cc0b..dead5fe5c 100644 --- a/test/test_websockets.py +++ b/test/test_websockets.py @@ -44,7 +44,7 @@ def websocket_handler(websocket): return websocket.send('2') elif isinstance(message, str): if message == 'headers': - return websocket.send(json.dumps(dict(websocket.request.headers))) + return websocket.send(json.dumps(dict(websocket.request.headers.raw_items()))) elif message == 'path': return websocket.send(websocket.request.path) elif message == 'source_address': @@ -266,18 +266,18 @@ def test_cookies(self, handler): with handler(cookiejar=cookiejar) as rh: ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + assert HTTPHeaderDict(json.loads(ws.recv()))['cookie'] == 'test=ytdlp' ws.close() with handler() as rh: ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert 'cookie' not in json.loads(ws.recv()) + assert 'cookie' not in HTTPHeaderDict(json.loads(ws.recv())) ws.close() ws = ws_validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': cookiejar})) ws.send('headers') - assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + assert HTTPHeaderDict(json.loads(ws.recv()))['cookie'] == 'test=ytdlp' ws.close() @pytest.mark.skip_handler('Websockets', 'Set-Cookie not supported by websockets') @@ -287,7 +287,7 @@ def test_cookie_sync_only_cookiejar(self, handler): ws_validate_and_send(rh, Request(f'{self.ws_base_url}/get_cookie', extensions={'cookiejar': YoutubeDLCookieJar()})) ws = ws_validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': YoutubeDLCookieJar()})) ws.send('headers') - assert 'cookie' not in json.loads(ws.recv()) + assert 'cookie' not in HTTPHeaderDict(json.loads(ws.recv())) ws.close() @pytest.mark.skip_handler('Websockets', 'Set-Cookie not supported by websockets') @@ -298,12 +298,12 @@ def test_cookie_sync_delete_cookie(self, handler): ws_validate_and_send(rh, Request(f'{self.ws_base_url}/get_cookie')) ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert json.loads(ws.recv())['cookie'] == 'test=ytdlp' + assert HTTPHeaderDict(json.loads(ws.recv()))['cookie'] == 'test=ytdlp' 
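+        # the test server now echoes raw_items() (original casing), so wrap the response in HTTPHeaderDict for a case-insensitive lookup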
ws.close() cookiejar.clear_session_cookies() ws = ws_validate_and_send(rh, Request(self.ws_base_url)) ws.send('headers') - assert 'cookie' not in json.loads(ws.recv()) + assert 'cookie' not in HTTPHeaderDict(json.loads(ws.recv())) ws.close() def test_source_address(self, handler): @@ -341,6 +341,14 @@ def test_request_headers(self, handler): assert headers['test3'] == 'test3' ws.close() + def test_keep_header_casing(self, handler): + with handler(headers=HTTPHeaderDict({'x-TeSt1': 'test'})) as rh: + ws = ws_validate_and_send(rh, Request(self.ws_base_url, headers={'x-TeSt2': 'test'}, extensions={'keep_header_casing': True})) + ws.send('headers') + headers = json.loads(ws.recv()) + assert 'x-TeSt1' in headers + assert 'x-TeSt2' in headers + @pytest.mark.parametrize('client_cert', ( {'client_certificate': os.path.join(MTLS_CERT_DIR, 'clientwithkey.crt')}, { diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 13436f088..0f0885366 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -78,6 +78,61 @@ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', ), + ( + 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1', + ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 
'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), ] _NSIG_TESTS = [ @@ -201,6 +256,66 @@ 'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', 'YWt1qdbe8SAfkoPHW5d', 'RrRjWQOJmBiP', ), + ( + 'https://www.youtube.com/s/player/9c6dfc4a/player_ias.vflset/en_US/base.js', + 'jbu7ylIosQHyJyJV', 'uwI0ESiynAmhNg', + ), + ( + 'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', + 'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ', + ), + ( + 'https://www.youtube.com/s/player/d50f54ef/player_ias_tce.vflset/en_US/base.js', + 'Ha7507LzRmH3Utygtj', 'XFTb2HoeOE5MHg', + ), + ( + 'https://www.youtube.com/s/player/074a8365/player_ias_tce.vflset/en_US/base.js', + 'Ha7507LzRmH3Utygtj', 'ufTsrE0IVYrkl8v', + ), + ( + 'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js', + 'N5uAlLqm0eg1GyHO', 'dCBQOejdq5s-ww', + ), + ( + 'https://www.youtube.com/s/player/69f581a5/tv-player-ias.vflset/tv-player-ias.js', + '-qIP447rVlTTwaZjY', 'KNcGOksBAvwqQg', + ), + ( + 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', + 'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', + ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', + 'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), + ( + 'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), ] @@ -214,6 +329,8 @@ def test_youtube_extract_player_info(self): 
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'), + ('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -246,46 +363,51 @@ def t_factory(name, sig_func, url_pattern): def make_tfunc(url, sig_input, expected_sig): m = url_pattern.match(url) assert m, f'{url!r} should follow URL format' - test_id = m.group('id') + test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id')) def test_func(self): - basename = f'player-{name}-{test_id}.js' + basename = f'player-{test_id}.js' fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): urllib.request.urlretrieve(url, fn) with open(fn, encoding='utf-8') as testf: jscode = testf.read() - self.assertEqual(sig_func(jscode, sig_input), expected_sig) + self.assertEqual(sig_func(jscode, sig_input, url), expected_sig) test_func.__name__ = f'test_{name}_js_{test_id}' setattr(TestSignature, test_func.__name__, test_func) return make_tfunc -def signature(jscode, sig_input): - func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) +def signature(jscode, sig_input, player_url): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url) src_sig = ( str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) return func(src_sig) -def n_sig(jscode, sig_input): +def n_sig(jscode, sig_input, player_url): ie = YoutubeIE(FakeYDL()) - funcname = ie._extract_n_function_name(jscode) + funcname = ie._extract_n_function_name(jscode, player_url=player_url) jsi = JSInterpreter(jscode) - func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname))) + func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url)) return func([sig_input]) make_sig_test = t_factory( - 'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) + 'signature', signature, + re.compile(r'''(?x) + .+(?: + /player/(?P<id>[a-zA-Z0-9_/.-]+)| + /html5player-(?:en_US-)?(?P<compat_id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)? 
+ )\.js$''')) for test_spec in _SIG_TESTS: make_sig_test(*test_spec) make_nsig_test = t_factory( - 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) + 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$')) for test_spec in _NSIG_TESTS: make_nsig_test(*test_spec) diff --git a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py index b860300d8..39020fef9 100644 --- a/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py +++ b/test/testdata/plugin_packages/testpackage/yt_dlp_plugins/extractor/package.py @@ -2,4 +2,5 @@ class PackagePluginIE(InfoExtractor): + _VALID_URL = 'package' pass diff --git a/test/testdata/reload_plugins/yt_dlp_plugins/extractor/normal.py b/test/testdata/reload_plugins/yt_dlp_plugins/extractor/normal.py new file mode 100644 index 000000000..6b927077f --- /dev/null +++ b/test/testdata/reload_plugins/yt_dlp_plugins/extractor/normal.py @@ -0,0 +1,10 @@ +from yt_dlp.extractor.common import InfoExtractor + + +class NormalPluginIE(InfoExtractor): + _VALID_URL = 'normal' + REPLACED = True + + +class _IgnoreUnderscorePluginIE(InfoExtractor): + pass diff --git a/test/testdata/reload_plugins/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/reload_plugins/yt_dlp_plugins/postprocessor/normal.py new file mode 100644 index 000000000..5e44ba2b5 --- /dev/null +++ b/test/testdata/reload_plugins/yt_dlp_plugins/postprocessor/normal.py @@ -0,0 +1,5 @@ +from yt_dlp.postprocessor.common import PostProcessor + + +class NormalPluginPP(PostProcessor): + REPLACED = True diff --git a/test/testdata/yt_dlp_plugins/extractor/ignore.py b/test/testdata/yt_dlp_plugins/extractor/ignore.py index 816a16aa2..dca111a37 100644 --- a/test/testdata/yt_dlp_plugins/extractor/ignore.py +++ b/test/testdata/yt_dlp_plugins/extractor/ignore.py @@ -6,6 +6,7 @@ class IgnoreNotInAllPluginIE(InfoExtractor): class InAllPluginIE(InfoExtractor): + _VALID_URL = 'inallpluginie' pass diff --git a/test/testdata/yt_dlp_plugins/extractor/normal.py b/test/testdata/yt_dlp_plugins/extractor/normal.py index b09009bdc..996b2936f 100644 --- a/test/testdata/yt_dlp_plugins/extractor/normal.py +++ b/test/testdata/yt_dlp_plugins/extractor/normal.py @@ -2,8 +2,10 @@ class NormalPluginIE(InfoExtractor): - pass + _VALID_URL = 'normalpluginie' + REPLACED = False class _IgnoreUnderscorePluginIE(InfoExtractor): + _VALID_URL = 'ignoreunderscorepluginie' pass diff --git a/test/testdata/yt_dlp_plugins/extractor/override.py b/test/testdata/yt_dlp_plugins/extractor/override.py new file mode 100644 index 000000000..766dc32e1 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/override.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.generic import GenericIE + + +class OverrideGenericIE(GenericIE, plugin_name='override'): + TEST_FIELD = 'override' diff --git a/test/testdata/yt_dlp_plugins/extractor/overridetwo.py b/test/testdata/yt_dlp_plugins/extractor/overridetwo.py new file mode 100644 index 000000000..826184c64 --- /dev/null +++ b/test/testdata/yt_dlp_plugins/extractor/overridetwo.py @@ -0,0 +1,5 @@ +from yt_dlp.extractor.generic import GenericIE + + +class _UnderscoreOverrideGenericIE(GenericIE, plugin_name='underscore-override'): + SECONDARY_TEST_FIELD = 'underscore-override' diff --git a/test/testdata/yt_dlp_plugins/postprocessor/normal.py b/test/testdata/yt_dlp_plugins/postprocessor/normal.py index 315b85a48..1e94d7b8b 100644 --- 
a/test/testdata/yt_dlp_plugins/postprocessor/normal.py +++ b/test/testdata/yt_dlp_plugins/postprocessor/normal.py @@ -2,4 +2,4 @@ class NormalPluginPP(PostProcessor): - pass + REPLACED = False diff --git a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py index 01542e0d8..c5140bb02 100644 --- a/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py +++ b/test/testdata/zipped_plugins/yt_dlp_plugins/extractor/zipped.py @@ -2,4 +2,5 @@ class ZippedPluginIE(InfoExtractor): + _VALID_URL = 'zippedpluginie' pass diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 989479e84..e219068cd 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -35,9 +35,18 @@ shorten_protocol_name, ) from .downloader.rtmp import rtmpdump_version -from .extractor import gen_extractor_classes, get_info_extractor +from .extractor import gen_extractor_classes, get_info_extractor, import_extractors from .extractor.common import UnsupportedURLIE from .extractor.openload import PhantomJSwrapper +from .globals import ( + IN_CLI, + LAZY_EXTRACTORS, + plugin_ies, + plugin_ies_overrides, + plugin_pps, + all_plugins_loaded, + plugin_dirs, +) from .minicurses import format_text from .networking import HEADRequest, Request, RequestDirector from .networking.common import _REQUEST_HANDLERS, _RH_PREFERENCES @@ -49,8 +58,7 @@ network_exceptions, ) from .networking.impersonate import ImpersonateRequestHandler -from .plugins import directories as plugin_directories -from .postprocessor import _PLUGIN_CLASSES as plugin_pps +from .plugins import directories as plugin_directories, load_all_plugins from .postprocessor import ( EmbedThumbnailPP, FFmpegFixupDuplicateMoovPP, @@ -162,7 +170,7 @@ write_json_file, write_string, ) -from .utils._utils import _UnsafeExtensionError, _YDLLogger +from .utils._utils import _UnsafeExtensionError, _YDLLogger, _ProgressState from .utils.networking import ( HTTPHeaderDict, clean_headers, @@ -603,7 +611,7 @@ class YoutubeDL: # NB: Keep in sync with the docstring of extractor/common.py 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', 'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', - 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', + 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'hls_media_playlist_data', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'is_dash_periods', 'request_data', 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'cookies', 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'extra_param_to_segment_url', 'extra_param_to_key_url', @@ -647,13 +655,15 @@ def __init__(self, params=None, auto_init=True): self.cache = Cache(self) self.__header_cookies = [] + # compat for API: load plugins if they have not already + if not all_plugins_loaded.value: + load_all_plugins() + stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout self._out_files = Namespace( out=stdout, error=sys.stderr, screen=sys.stderr if self.params.get('quiet') else stdout, - console=None if os.name == 'nt' else next( - filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None), ) try: @@ -661,6 +671,9 @@ def __init__(self, params=None, auto_init=True): except Exception as e: self.write_debug(f'Failed to enable VT mode: {e}') + # hehe "immutable" 
namespace + self._out_files.console = next(filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None) + if self.params.get('no_color'): if self.params.get('color') is not None: self.params.setdefault('_warnings', []).append( @@ -961,21 +974,22 @@ def to_stderr(self, message, only_once=False): self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if os.name == 'nt' or not self._out_files.console: - return + if not supports_terminal_sequences(self._out_files.console): + return False self._write_string(code, self._out_files.console) + return True - def to_console_title(self, message): - if not self.params.get('consoletitle', False): + def to_console_title(self, message=None, progress_state=None, percent=None): + if not self.params.get('consoletitle'): return - message = remove_terminal_sequences(message) - if os.name == 'nt': - if ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) - else: - self._send_console_code(f'\033]0;{message}\007') + + if message: + success = self._send_console_code(f'\033]0;{remove_terminal_sequences(message)}\007') + if not success and os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): + ctypes.windll.kernel32.SetConsoleTitleW(message) + + if isinstance(progress_state, _ProgressState): + self._send_console_code(progress_state.get_ansi_escape(percent)) def save_console_title(self): if not self.params.get('consoletitle') or self.params.get('simulate'): @@ -989,6 +1003,7 @@ def restore_console_title(self): def __enter__(self): self.save_console_title() + self.to_console_title(progress_state=_ProgressState.INDETERMINATE) return self def save_cookies(self): @@ -997,6 +1012,7 @@ def save_cookies(self): def __exit__(self, *args): self.restore_console_title() + self.to_console_title(progress_state=_ProgressState.HIDDEN) self.close() def close(self): @@ -3998,15 +4014,6 @@ def print_debug_header(self): if not self.params.get('verbose'): return - from . import _IN_CLI # Must be delayed import - - # These imports can be slow. 
So import them only as needed - from .extractor.extractors import _LAZY_LOADER - from .extractor.extractors import ( - _PLUGIN_CLASSES as plugin_ies, - _PLUGIN_OVERRIDES as plugin_ie_overrides, - ) - def get_encoding(stream): ret = str(getattr(stream, 'encoding', f'missing ({type(stream).__name__})')) additional_info = [] @@ -4045,17 +4052,18 @@ def get_encoding(stream): _make_label(ORIGIN, CHANNEL.partition('@')[2] or __version__, __version__), f'[{RELEASE_GIT_HEAD[:9]}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', - '' if _IN_CLI else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', + '' if IN_CLI.value else 'API' if klass == YoutubeDL else f'API:{self.__module__}.{klass.__qualname__}', delim=' ')) - if not _IN_CLI: + if not IN_CLI.value: write_debug(f'params: {self.params}') - if not _LAZY_LOADER: - if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - write_debug('Lazy loading extractors is forcibly disabled') - else: - write_debug('Lazy loading extractors is disabled') + import_extractors() + lazy_extractors = LAZY_EXTRACTORS.value + if lazy_extractors is None: + write_debug('Lazy loading extractors is disabled') + elif not lazy_extractors: + write_debug('Lazy loading extractors is forcibly disabled') if self.params['compat_opts']: write_debug('Compatibility options: {}'.format(', '.join(self.params['compat_opts']))) @@ -4084,24 +4092,27 @@ def get_encoding(stream): write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') - if os.environ.get('YTDLP_NO_PLUGINS'): - write_debug('Plugins are forcibly disabled') - return - for plugin_type, plugins in {'Extractor': plugin_ies, 'Post-Processor': plugin_pps}.items(): - display_list = ['{}{}'.format( - klass.__name__, '' if klass.__name__ == name else f' as {name}') - for name, klass in plugins.items()] + for plugin_type, plugins in (('Extractor', plugin_ies), ('Post-Processor', plugin_pps)): + display_list = [ + klass.__name__ if klass.__name__ == name else f'{klass.__name__} as {name}' + for name, klass in plugins.value.items()] if plugin_type == 'Extractor': display_list.extend(f'{plugins[-1].IE_NAME.partition("+")[2]} ({parent.__name__})' - for parent, plugins in plugin_ie_overrides.items()) + for parent, plugins in plugin_ies_overrides.value.items()) if not display_list: continue write_debug(f'{plugin_type} Plugins: {", ".join(sorted(display_list))}') - plugin_dirs = plugin_directories() - if plugin_dirs: - write_debug(f'Plugin directories: {plugin_dirs}') + plugin_dirs_msg = 'none' + if not plugin_dirs.value: + plugin_dirs_msg = 'none (disabled)' + else: + found_plugin_directories = plugin_directories() + if found_plugin_directories: + plugin_dirs_msg = ', '.join(found_plugin_directories) + + write_debug(f'Plugin directories: {plugin_dirs_msg}') @functools.cached_property def proxies(self): @@ -4146,7 +4157,7 @@ def _get_available_impersonate_targets(self): (target, rh.RH_NAME) for rh in self._request_director.handlers.values() if isinstance(rh, ImpersonateRequestHandler) - for target in rh.supported_targets + for target in reversed(rh.supported_targets) ] def _impersonate_target_available(self, target): diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3630f7193..34f0cdc47 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -20,7 +20,9 @@ from .extractor import list_extractor_classes from .extractor.adobepass import MSO_INFO from .networking.impersonate import 
ImpersonateTarget +from .globals import IN_CLI, plugin_dirs from .options import parseOpts +from .plugins import load_all_plugins as _load_all_plugins from .postprocessor import ( FFmpegExtractAudioPP, FFmpegMergerPP, @@ -34,7 +36,6 @@ ) from .update import Updater from .utils import ( - Config, NO_DEFAULT, POSTPROCESS_WHEN, DateRange, @@ -67,8 +68,6 @@ from .utils._utils import _UnsafeExtensionError from .YoutubeDL import YoutubeDL -_IN_CLI = False - def _exit(status=0, *args): for msg in args: @@ -296,18 +295,20 @@ def parse_sleep_func(expr): raise ValueError(f'invalid {key} retry sleep expression {expr!r}') # Bytes - def validate_bytes(name, value): + def validate_bytes(name, value, strict_positive=False): if value is None: return None numeric_limit = parse_bytes(value) - validate(numeric_limit is not None, 'rate limit', value) + validate(numeric_limit is not None, name, value) + if strict_positive: + validate_positive(name, numeric_limit, True) return numeric_limit - opts.ratelimit = validate_bytes('rate limit', opts.ratelimit) + opts.ratelimit = validate_bytes('rate limit', opts.ratelimit, True) opts.throttledratelimit = validate_bytes('throttled rate limit', opts.throttledratelimit) opts.min_filesize = validate_bytes('min filesize', opts.min_filesize) opts.max_filesize = validate_bytes('max filesize', opts.max_filesize) - opts.buffersize = validate_bytes('buffer size', opts.buffersize) + opts.buffersize = validate_bytes('buffer size', opts.buffersize, True) opts.http_chunk_size = validate_bytes('http chunk size', opts.http_chunk_size) # Output templates @@ -438,6 +439,10 @@ def metadataparser_actions(f): } # Other options + opts.plugin_dirs = opts.plugin_dirs + if opts.plugin_dirs is None: + opts.plugin_dirs = ['default'] + if opts.playlist_items is not None: try: tuple(PlaylistEntries.parse_playlist_items(opts.playlist_items)) @@ -978,11 +983,6 @@ def _real_main(argv=None): parser, opts, all_urls, ydl_opts = parse_options(argv) - # HACK: Set the plugin dirs early on - # TODO(coletdjnz): remove when plugin globals system is implemented - if opts.plugin_dirs is not None: - Config._plugin_dirs = list(map(expand_path, opts.plugin_dirs)) - # Dump user agent if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) @@ -997,6 +997,11 @@ def _real_main(argv=None): if opts.ffmpeg_location: FFmpegPostProcessor._ffmpeg_location.set(opts.ffmpeg_location) + # load all plugins into the global lookup + plugin_dirs.value = opts.plugin_dirs + if plugin_dirs.value: + _load_all_plugins() + with YoutubeDL(ydl_opts) as ydl: pre_process = opts.update_self or opts.rm_cachedir actual_use = all_urls or opts.load_info_filename @@ -1023,8 +1028,9 @@ def _real_main(argv=None): # List of simplified targets we know are supported, # to help users know what dependencies may be required. 
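             # A bare family target such as ImpersonateTarget('chrome') is meant
             # to be satisfied by any version-specific target of that family
             # that an installed handler provides.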
(ImpersonateTarget('chrome'), 'curl_cffi'), - (ImpersonateTarget('edge'), 'curl_cffi'), (ImpersonateTarget('safari'), 'curl_cffi'), + (ImpersonateTarget('firefox'), 'curl_cffi>=0.10'), + (ImpersonateTarget('edge'), 'curl_cffi'), ] available_targets = ydl._get_available_impersonate_targets() @@ -1040,12 +1046,12 @@ def make_row(target, handler): for known_target, known_handler in known_targets: if not any( - known_target in target and handler == known_handler + known_target in target and known_handler.startswith(handler) for target, handler in available_targets ): - rows.append([ + rows.insert(0, [ ydl._format_out(text, ydl.Styles.SUPPRESS) - for text in make_row(known_target, f'{known_handler} (not available)') + for text in make_row(known_target, f'{known_handler} (unavailable)') ]) ydl.to_screen('[info] Available impersonate targets') @@ -1096,8 +1102,7 @@ def make_row(target, handler): def main(argv=None): - global _IN_CLI - _IN_CLI = True + IN_CLI.value = True try: _exit(*variadic(_real_main(argv))) except (CookieLoadError, DownloadError): diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index 9908434a5..065901d68 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -83,7 +83,7 @@ def aes_ecb_encrypt(data, key, iv=None): @returns {int[]} encrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) encrypted_data = [] for i in range(block_count): @@ -103,7 +103,7 @@ def aes_ecb_decrypt(data, key, iv=None): @returns {int[]} decrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) encrypted_data = [] for i in range(block_count): @@ -134,7 +134,7 @@ def aes_ctr_encrypt(data, key, iv): @returns {int[]} encrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) counter = iter_vector(iv) encrypted_data = [] @@ -158,7 +158,7 @@ def aes_cbc_decrypt(data, key, iv): @returns {int[]} decrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) decrypted_data = [] previous_cipher_block = iv @@ -183,7 +183,7 @@ def aes_cbc_encrypt(data, key, iv, *, padding_mode='pkcs7'): @returns {int[]} encrypted data """ expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + block_count = ceil(len(data) / BLOCK_SIZE_BYTES) encrypted_data = [] previous_cipher_block = iv diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index 51a9f28f0..9c34bd289 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -30,11 +30,12 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .http import HttpFD from .ism import IsmFD from .mhtml import MhtmlFD -from .niconico import NiconicoDmcFD, NiconicoLiveFD +from .niconico import NiconicoLiveFD from .rtmp import RtmpFD from .rtsp import RtspFD from .websocket import WebSocketFragmentFD from .youtube_live_chat import YoutubeLiveChatFD +from .bunnycdn import BunnyCdnFD PROTOCOL_MAP = { 'rtmp': RtmpFD, @@ -49,12 +50,12 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N 'http_dash_segments_generator': DashSegmentsFD, 'ism': IsmFD, 'mhtml': MhtmlFD, - 'niconico_dmc': NiconicoDmcFD, 'niconico_live': NiconicoLiveFD, 
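     # keys are info_dict['protocol'] values assigned by extractors;
     # protocols missing from this map fall back to the generic HTTP downloader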
'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, 'youtube_live_chat': YoutubeLiveChatFD, 'youtube_live_chat_replay': YoutubeLiveChatFD, + 'bunnycdn': BunnyCdnFD, } @@ -65,7 +66,6 @@ def shorten_protocol_name(proto, simplify=False): 'rtmp_ffmpeg': 'rtmpF', 'http_dash_segments': 'dash', 'http_dash_segments_generator': 'dashG', - 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } if simplify: diff --git a/yt_dlp/downloader/bunnycdn.py b/yt_dlp/downloader/bunnycdn.py new file mode 100644 index 000000000..e787f698a --- /dev/null +++ b/yt_dlp/downloader/bunnycdn.py @@ -0,0 +1,50 @@ +import hashlib +import random +import threading + +from .common import FileDownloader +from . import HlsFD +from ..networking import Request +from ..networking.exceptions import network_exceptions + + +class BunnyCdnFD(FileDownloader): + """ + Downloads from BunnyCDN with required pings + Note, this is not a part of public API, and will be removed without notice. + DO NOT USE + """ + + def real_download(self, filename, info_dict): + self.to_screen(f'[{self.FD_NAME}] Downloading from BunnyCDN') + + fd = HlsFD(self.ydl, self.params) + + stop_event = threading.Event() + ping_thread = threading.Thread(target=self.ping_thread, args=(stop_event,), kwargs=info_dict['_bunnycdn_ping_data']) + ping_thread.start() + + try: + return fd.real_download(filename, info_dict) + finally: + stop_event.set() + + def ping_thread(self, stop_event, url, headers, secret, context_id): + # Site sends ping every 4 seconds, but this throttles the download. Pinging every 2 seconds seems to work. + ping_interval = 2 + # Hard coded resolution as it doesn't seem to matter + res = 1080 + paused = 'false' + current_time = 0 + + while not stop_event.wait(ping_interval): + current_time += ping_interval + + time = current_time + round(random.random(), 6) + md5_hash = hashlib.md5(f'{secret}_{context_id}_{time}_{paused}_{res}'.encode()).hexdigest() + ping_url = f'{url}?hash={md5_hash}&time={time}&paused={paused}&resolution={res}' + + try: + self.ydl.urlopen(Request(ping_url, headers=headers)).read() + except network_exceptions as e: + self.to_screen(f'[{self.FD_NAME}] Ping failed: {e}') diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index e8dcb37cc..bb9303f8a 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -31,6 +31,7 @@ timetuple_from_msec, try_call, ) +from ..utils._utils import _ProgressState class FileDownloader: @@ -333,7 +334,7 @@ def _report_progress_status(self, s, default_template): progress_dict), s.get('progress_idx') or 0) self.to_console_title(self.ydl.evaluate_outtmpl( progress_template.get('download-title') or 'yt-dlp %(progress._default_template)s', - progress_dict)) + progress_dict), _ProgressState.from_dict(s), s.get('_percent')) def _format_progress(self, *args, **kwargs): return self.ydl._format_text( @@ -357,6 +358,7 @@ def with_fields(*tups, default=''): '_speed_str': self.format_speed(speed).strip(), '_total_bytes_str': _format_bytes('total_bytes'), '_elapsed_str': self.format_seconds(s.get('elapsed')), + '_percent': 100.0, '_percent_str': self.format_percent(100), }) self._report_progress_status(s, join_nonempty( @@ -375,13 +377,15 @@ def with_fields(*tups, default=''): return self._progress_delta_time += update_delta + progress = try_call( + lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], + lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], + lambda: s['downloaded_bytes'] == 0 and 0) s.update({ '_eta_str': 
self.format_eta(s.get('eta')).strip(), '_speed_str': self.format_speed(s.get('speed')), - '_percent_str': self.format_percent(try_call( - lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], - lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], - lambda: s['downloaded_bytes'] == 0 and 0)), + '_percent': progress, + '_percent_str': self.format_percent(progress), '_total_bytes_str': _format_bytes('total_bytes'), '_total_bytes_estimate_str': _format_bytes('total_bytes_estimate'), '_downloaded_bytes_str': _format_bytes('downloaded_bytes'), diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 7f6b5b45c..ee73ac043 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -457,8 +457,6 @@ class FFmpegFD(ExternalFD): @classmethod def available(cls, path=None): - # TODO: Fix path for ffmpeg - # Fixme: This may be wrong when --ffmpeg-location is used return FFmpegPostProcessor().available def on_process_started(self, proc, stdin): diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py index da2574da7..1f36a07f5 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -16,6 +16,7 @@ update_url_query, urljoin, ) +from ..utils._utils import _request_dump_filename class HlsFD(FragmentFD): @@ -72,11 +73,23 @@ def check_results(): def real_download(self, filename, info_dict): man_url = info_dict['url'] - self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest') - urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.url - s = urlh.read().decode('utf-8', 'ignore') + s = info_dict.get('hls_media_playlist_data') + if s: + self.to_screen(f'[{self.FD_NAME}] Using m3u8 manifest from extracted info') + else: + self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest') + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.url + s_bytes = urlh.read() + if self.params.get('write_pages'): + dump_filename = _request_dump_filename( + man_url, info_dict['id'], None, + trim_length=self.params.get('trim_file_name')) + self.to_screen(f'[{self.FD_NAME}] Saving request to {dump_filename}') + with open(dump_filename, 'wb') as outf: + outf.write(s_bytes) + s = s_bytes.decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None if can_download: @@ -177,6 +190,7 @@ def is_ad_fragment_end(s): if external_aes_iv: external_aes_iv = binascii.unhexlify(remove_start(external_aes_iv, '0x').zfill(32)) byte_range = {} + byte_range_offset = 0 discontinuity_count = 0 frag_index = 0 ad_frag_next = False @@ -204,6 +218,11 @@ def is_ad_fragment_end(s): }) media_sequence += 1 + # If the byte_range is truthy, reset it after appending a fragment that uses it + if byte_range: + byte_range_offset = byte_range['end'] + byte_range = {} + elif line.startswith('#EXT-X-MAP'): if format_index and discontinuity_count != format_index: continue @@ -217,10 +236,12 @@ def is_ad_fragment_end(s): if extra_segment_query: frag_url = update_url_query(frag_url, extra_segment_query) + map_byte_range = {} + if map_info.get('BYTERANGE'): splitted_byte_range = map_info.get('BYTERANGE').split('@') - sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] - byte_range = { + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else 0 + map_byte_range = { 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), } @@ -229,7 +250,7 @@ def 
is_ad_fragment_end(s): 'frag_index': frag_index, 'url': frag_url, 'decrypt_info': decrypt_info, - 'byte_range': byte_range, + 'byte_range': map_byte_range, 'media_sequence': media_sequence, }) media_sequence += 1 @@ -257,7 +278,7 @@ def is_ad_fragment_end(s): media_sequence = int(line[22:]) elif line.startswith('#EXT-X-BYTERANGE'): splitted_byte_range = line[17:].split('@') - sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range_offset byte_range = { 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), diff --git a/yt_dlp/downloader/niconico.py b/yt_dlp/downloader/niconico.py index 462c6e2d6..33cf15df8 100644 --- a/yt_dlp/downloader/niconico.py +++ b/yt_dlp/downloader/niconico.py @@ -2,60 +2,12 @@ import threading import time -from . import get_suitable_downloader from .common import FileDownloader from .external import FFmpegFD from ..networking import Request from ..utils import DownloadError, str_or_none, try_get -class NiconicoDmcFD(FileDownloader): - """ Downloading niconico douga from DMC with heartbeat """ - - def real_download(self, filename, info_dict): - from ..extractor.niconico import NiconicoIE - - self.to_screen(f'[{self.FD_NAME}] Downloading from DMC') - ie = NiconicoIE(self.ydl) - info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) - - fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) - - success = download_complete = False - timer = [None] - heartbeat_lock = threading.Lock() - heartbeat_url = heartbeat_info_dict['url'] - heartbeat_data = heartbeat_info_dict['data'].encode() - heartbeat_interval = heartbeat_info_dict.get('interval', 30) - - request = Request(heartbeat_url, heartbeat_data) - - def heartbeat(): - try: - self.ydl.urlopen(request).read() - except Exception: - self.to_screen(f'[{self.FD_NAME}] Heartbeat failed') - - with heartbeat_lock: - if not download_complete: - timer[0] = threading.Timer(heartbeat_interval, heartbeat) - timer[0].start() - - heartbeat_info_dict['ping']() - self.to_screen('[%s] Heartbeat with %d second interval ...' 
% (self.FD_NAME, heartbeat_interval)) - try: - heartbeat() - if type(fd).__name__ == 'HlsFD': - info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) - success = fd.real_download(filename, info_dict) - finally: - if heartbeat_lock: - with heartbeat_lock: - timer[0].cancel() - download_complete = True - return success - - class NiconicoLiveFD(FileDownloader): """ Downloads niconico live without being stopped """ @@ -85,6 +37,7 @@ def communicate_ws(reconnect): 'quality': live_quality, 'protocol': 'hls+fmp4', 'latency': live_latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py index 6bfa4bd7b..a090e942d 100644 --- a/yt_dlp/extractor/__init__.py +++ b/yt_dlp/extractor/__init__.py @@ -1,16 +1,25 @@ from ..compat.compat_utils import passthrough_module +from ..globals import extractors as _extractors_context +from ..globals import plugin_ies as _plugin_ies_context +from ..plugins import PluginSpec, register_plugin_spec passthrough_module(__name__, '.extractors') del passthrough_module +register_plugin_spec(PluginSpec( + module_name='extractor', + suffix='IE', + destination=_extractors_context, + plugin_destination=_plugin_ies_context, +)) + def gen_extractor_classes(): """ Return a list of supported extractors. The order does matter; the first extractor matched is the one handling the URL. """ - from .extractors import _ALL_CLASSES - - return _ALL_CLASSES + import_extractors() + return list(_extractors_context.value.values()) def gen_extractors(): @@ -37,6 +46,9 @@ def list_extractors(age_limit=None): def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - from . import extractors + import_extractors() + return _extractors_context.value[f'{ie_name}IE'] - return getattr(extractors, f'{ie_name}IE') + +def import_extractors(): + from . 
import extractors # noqa: F401 diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c331bab78..bb1c3db16 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -312,6 +312,7 @@ ) from .bundesliga import BundesligaIE from .bundestag import BundestagIE +from .bunnycdn import BunnyCdnIE from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE @@ -335,6 +336,7 @@ from .canalalpha import CanalAlphaIE from .canalc2 import Canalc2IE from .canalplus import CanalplusIE +from .canalsurmas import CanalsurmasIE from .caracoltv import CaracolTvPlayIE from .cartoonnetwork import CartoonNetworkIE from .cbc import ( @@ -494,10 +496,6 @@ from .daystar import DaystarClipIE from .dbtv import DBTVIE from .dctp import DctpTvIE -from .deezer import ( - DeezerAlbumIE, - DeezerPlaylistIE, -) from .democracynow import DemocracynowIE from .detik import DetikEmbedIE from .deuxm import ( @@ -508,6 +506,7 @@ from .dhm import DHMIE from .digitalconcerthall import DigitalConcertHallIE from .digiteka import DigitekaIE +from .digiview import DigiviewIE from .discogs import DiscogsReleasePlaylistIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE @@ -684,6 +683,7 @@ ) from .foxsports import FoxSportsIE from .fptplay import FptplayIE +from .francaisfacile import FrancaisFacileIE from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, @@ -840,6 +840,7 @@ from .ichinanalive import ( IchinanaLiveClipIE, IchinanaLiveIE, + IchinanaLiveVODIE, ) from .idolplus import IdolPlusIE from .ign import ( @@ -902,6 +903,7 @@ IviIE, ) from .ivideon import IvideonIE +from .ivoox import IvooxIE from .iwara import ( IwaraIE, IwaraPlaylistIE, @@ -959,7 +961,10 @@ ) from .kicker import KickerIE from .kickstarter import KickStarterIE -from .kika import KikaIE +from .kika import ( + KikaIE, + KikaPlaylistIE, +) from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE from .kommunetv import KommunetvIE @@ -1037,6 +1042,7 @@ LimelightMediaIE, ) from .linkedin import ( + LinkedInEventsIE, LinkedInIE, LinkedInLearningCourseIE, LinkedInLearningIE, @@ -1052,6 +1058,7 @@ ) from .livestreamfails import LivestreamfailsIE from .lnk import LnkIE +from .loco import LocoIE from .loom import ( LoomFolderIE, LoomIE, @@ -1059,6 +1066,7 @@ from .lovehomeporn import LoveHomePornIE from .lrt import ( LRTVODIE, + LRTRadioIE, LRTStreamIE, ) from .lsm import ( @@ -1491,6 +1499,10 @@ ) from .parler import ParlerIE from .parlview import ParlviewIE +from .parti import ( + PartiLivestreamIE, + PartiVideoIE, +) from .patreon import ( PatreonCampaignIE, PatreonIE, @@ -1737,6 +1749,7 @@ RoosterTeethSeriesIE, ) from .rottentomatoes import RottenTomatoesIE +from .roya import RoyaLiveIE from .rozhlas import ( MujRozhlasIE, RozhlasIE, @@ -1771,7 +1784,6 @@ from .rtve import ( RTVEALaCartaIE, RTVEAudioIE, - RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE, ) @@ -1880,6 +1892,8 @@ SkyItVideoIE, SkyItVideoLiveIE, TV8ItIE, + TV8ItLiveIE, + TV8ItPlaylistIE, ) from .skylinewebcams import SkylineWebcamsIE from .skynewsarabia import ( @@ -1893,6 +1907,7 @@ from .smotrim import SmotrimIE from .snapchat import SnapchatSpotlightIE from .snotr import SnotrIE +from .softwhiteunderbelly import SoftWhiteUnderbellyIE from .sohu import ( SohuIE, SohuVIE, @@ -1982,6 +1997,7 @@ StoryFireSeriesIE, StoryFireUserIE, ) +from .streaks import StreaksIE from .streamable import StreamableIE from .streamcz import StreamCZIE from .streetvoice import 
StreetVoiceIE @@ -2221,6 +2237,10 @@ TVPlayIE, ) from .tvplayer import TVPlayerIE +from .tvw import ( + TvwIE, + TvwTvChannelsIE, +) from .tweakers import TweakersIE from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE @@ -2344,10 +2364,6 @@ ViewLiftIE, ) from .viidea import ViideaIE -from .viki import ( - VikiChannelIE, - VikiIE, -) from .vimeo import ( VHXEmbedIE, VimeoAlbumIE, @@ -2392,10 +2408,15 @@ VoxMediaIE, VoxMediaVolumeIE, ) +from .vrsquare import ( + VrSquareChannelIE, + VrSquareIE, + VrSquareSearchIE, + VrSquareSectionIE, +) from .vrt import ( VRTIE, DagelijkseKostIE, - KetnetIE, Radio1BeIE, VrtNUIE, ) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 8c7131b10..8f2fc4c80 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -21,6 +21,7 @@ int_or_none, time_seconds, traverse_obj, + update_url, update_url_query, ) @@ -417,6 +418,10 @@ def _real_extract(self, url): 'is_live': is_live, 'availability': availability, }) + + if thumbnail := update_url(self._og_search_thumbnail(webpage, default=''), query=None): + info['thumbnails'] = [{'url': thumbnail}] + return info diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 572d1a389..aadb4d660 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -1,3 +1,4 @@ +import datetime as dt import functools from .common import InfoExtractor @@ -10,7 +11,7 @@ filter_dict, int_or_none, orderedSet, - unified_timestamp, + parse_iso8601, url_or_none, urlencode_postdata, urljoin, @@ -87,9 +88,9 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'uploader_id': 'rlantnghks', 'uploader': '페이즈으', 'duration': 10840, - 'thumbnail': r're:https?://videoimg\.sooplive\.co/.kr/.+', + 'thumbnail': r're:https?://videoimg\.(?:sooplive\.co\.kr|afreecatv\.com)/.+', 'upload_date': '20230108', - 'timestamp': 1673218805, + 'timestamp': 1673186405, 'title': '젠지 페이즈', }, 'params': { @@ -102,7 +103,7 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'id': '20170411_BE689A0E_190960999_1_2_h', 'ext': 'mp4', 'title': '혼자사는여자집', - 'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+', + 'thumbnail': r're:https?://(?:video|st)img\.(?:sooplive\.co\.kr|afreecatv\.com)/.+', 'uploader': '♥이슬이', 'uploader_id': 'dasl8121', 'upload_date': '20170411', @@ -119,7 +120,7 @@ class AfreecaTVIE(AfreecaTVBaseIE): 'id': '20180327_27901457_202289533_1', 'ext': 'mp4', 'title': '[생]빨개요♥ (part 1)', - 'thumbnail': r're:https?://(?:video|st)img\.sooplive\.co\.kr/.+', + 'thumbnail': r're:https?://(?:video|st)img\.(?:sooplive\.co\.kr|afreecatv\.com)/.+', 'uploader': '[SA]서아', 'uploader_id': 'bjdyrksu', 'upload_date': '20180327', @@ -187,7 +188,7 @@ def _real_extract(self, url): 'formats': formats, **traverse_obj(file_element, { 'duration': ('duration', {int_or_none(scale=1000)}), - 'timestamp': ('file_start', {unified_timestamp}), + 'timestamp': ('file_start', {parse_iso8601(delimiter=' ', timezone=dt.timedelta(hours=9))}), }), }) @@ -370,7 +371,7 @@ def _real_extract(self, url): 'title': channel_info.get('TITLE') or station_info.get('station_title'), 'uploader': channel_info.get('BJNICK') or station_info.get('station_name'), 'uploader_id': broadcaster_id, - 'timestamp': unified_timestamp(station_info.get('broad_start')), + 'timestamp': parse_iso8601(station_info.get('broad_start'), delimiter=' ', timezone=dt.timedelta(hours=9)), 'formats': formats, 'is_live': True, 'http_headers': {'Referer': url}, diff --git a/yt_dlp/extractor/agora.py b/yt_dlp/extractor/agora.py index 
983558425..e040db601 100644
--- a/yt_dlp/extractor/agora.py
+++ b/yt_dlp/extractor/agora.py
@@ -146,7 +146,7 @@ class TokFMPodcastIE(InfoExtractor):
         'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych',
         'info_dict': {
             'id': '91275',
-            'ext': 'aac',
+            'ext': 'mp3',
             'title': 'md5:a9b15488009065556900169fb8061cce',
             'episode': 'md5:a9b15488009065556900169fb8061cce',
             'series': 'Analizy',
@@ -164,23 +164,20 @@ def _real_extract(self, url):
             raise ExtractorError('No such podcast', expected=True)
         metadata = metadata[0]
 
-        formats = []
-        for ext in ('aac', 'mp3'):
-            url_data = self._download_json(
-                f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}',
-                media_id, f'Downloading podcast {ext} URL')
-            # prevents inserting the mp3 (default) multiple times
-            if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']:
-                formats.append({
-                    'url': url_data['link_ssl'],
-                    'ext': ext,
-                    'vcodec': 'none',
-                    'acodec': ext,
-                })
+        mp3_url = self._download_json(
+            'https://api.podcast.radioagora.pl/api4/getSongUrl',
+            media_id, 'Downloading podcast mp3 URL', query={
+                'podcast_id': media_id,
+                'device_id': str(uuid.uuid4()),
+                'ppre': 'false',
+                'audio': 'mp3',
+            })['link_ssl']
 
         return {
             'id': media_id,
-            'formats': formats,
+            'url': mp3_url,
+            'vcodec': 'none',
+            'ext': 'mp3',
             'title': metadata.get('podcast_name'),
             'series': metadata.get('series_name'),
             'episode': metadata.get('podcast_name'),
diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py
index 0fe95bec5..1258a5704 100644
--- a/yt_dlp/extractor/atresplayer.py
+++ b/yt_dlp/extractor/atresplayer.py
@@ -1,64 +1,105 @@
+import urllib.parse
+
 from .common import InfoExtractor
 from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
+    parse_age_limit,
+    url_or_none,
     urlencode_postdata,
 )
+from ..utils.traversal import traverse_obj
 
 
 class AtresPlayerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})'
+    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/(?:[^/?#]+/){4}(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})'
    _NETRC_MACHINE = 'atresplayer'
-    _TESTS = [
-        {
-            'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/',
-            'info_dict': {
-                'id': '5d4aa2c57ed1a88fc715a615',
-                'ext': 'mp4',
-                'title': 'Capítulo 7: Asuntos pendientes',
-                'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc',
-                'duration': 3413,
-            },
-            'skip': 'This video is only available for registered users',
+    _TESTS = [{
+        'url': 'https://www.atresplayer.com/lasexta/programas/el-objetivo/clips/mbappe-describe-como-entrenador-a-carlo-ancelotti-sabe-cuando-tiene-que-ser-padre-jefe-amigo-entrenador_67f2dfb2fb6ab0e4c7203849/',
+        'info_dict': {
+            'ext': 'mp4',
+            'id': '67f2dfb2fb6ab0e4c7203849',
+            'display_id': 'md5:c203f8d4e425ed115ba56a1c6e4b3e6c',
+            'title': 'Mbappé describe como entrenador a Carlo Ancelotti: "Sabe cuándo tiene que ser padre, jefe, amigo, entrenador..."',
+            'channel': 'laSexta',
+            'duration': 31,
+            'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/06/B02DBE1E-D59B-4683-8404-1A9595D15269/1920x1080.jpg',
+            'tags': ['Entrevista informativa', 'Actualidad', 'Debate informativo', 'Política', 'Economía', 'Sociedad', 'Cara a cara', 'Análisis', 'Más periodismo'],
+            'series': 'El Objetivo',
+ 'season': 'Temporada 12', + 'timestamp': 1743970079, + 'upload_date': '20250406', }, - { - 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/programas/el-hormiguero/clips/revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero_67f836baa4a5b0e4147ca59a/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67f836baa4a5b0e4147ca59a', + 'display_id': 'revive-la-entrevista-completa-a-miguel-bose-en-el-hormiguero', + 'title': 'Revive la entrevista completa a Miguel Bosé en El Hormiguero', + 'description': 'md5:c6d2b591408d45a7bc2986dfb938eb72', + 'channel': 'Antena 3', + 'duration': 2556, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages02/2025/04/10/9076395F-F1FD-48BE-9F18-540DBA10EBAD/1920x1080.jpg', + 'tags': ['Entrevista', 'Variedades', 'Humor', 'Entretenimiento', 'Te sigo', 'Buen rollo', 'Cara a cara'], + 'series': 'El Hormiguero ', + 'season': 'Temporada 14', + 'timestamp': 1744320111, + 'upload_date': '20250410', }, - { - 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', - 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/flooxer/series/biara-proyecto-lazarus/temporada-1/capitulo-3-supervivientes_67a6038b64ceca00070f4f69/', + 'info_dict': { + 'ext': 'mp4', + 'id': '67a6038b64ceca00070f4f69', + 'display_id': 'capitulo-3-supervivientes', + 'title': 'Capítulo 3: Supervivientes', + 'description': 'md5:65b231f20302f776c2b0dd24594599a1', + 'channel': 'Flooxer', + 'duration': 1196, + 'thumbnail': 'https://imagenes.atresplayer.com/atp/clipping/cmsimages01/2025/02/14/17CF90D3-FE67-40C5-A941-7825B3E13992/1920x1080.jpg', + 'tags': ['Juvenil', 'Terror', 'Piel de gallina', 'Te sigo', 'Un break', 'Del tirón'], + 'series': 'BIARA: Proyecto Lázarus', + 'season': 'Temporada 1', + 'season_number': 1, + 'episode': 'Episode 3', + 'episode_number': 3, + 'timestamp': 1743095191, + 'upload_date': '20250327', }, - ] + }, { + 'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/', + 'only_matching': True, + }, { + 'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/', + 'only_matching': True, + }] _API_BASE = 'https://api.atresplayer.com/' def _perform_login(self, username, password): - self._request_webpage( - self._API_BASE + 'login', None, 'Downloading login page') - try: - target_url = self._download_json( - 'https://account.atresmedia.com/api/login', None, - 'Logging in', headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=urlencode_postdata({ + self._download_webpage( + 'https://account.atresplayer.com/auth/v1/login', None, + 'Logging in', 'Failed to log in', data=urlencode_postdata({ 'username': username, 'password': password, - }))['targetUrl'] + })) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError('Invalid username and/or password', expected=True) raise - self._request_webpage(target_url, None, 'Following Target URL') - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() + metadata_url = self._download_json( + self._API_BASE + 'client/v1/url', video_id, 
'Downloading API endpoint data', + query={'href': urllib.parse.urlparse(url).path})['href'] + metadata = self._download_json(metadata_url, video_id) + try: - episode = self._download_json( - self._API_BASE + 'client/v1/player/episode/' + video_id, video_id) + video_data = self._download_json(metadata['urlVideo'], video_id, 'Downloading video data') except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 403: error = self._parse_json(e.cause.response.read(), None) @@ -67,37 +108,45 @@ def _real_extract(self, url): raise ExtractorError(error['error_description'], expected=True) raise - title = episode['titulo'] - formats = [] subtitles = {} - for source in episode.get('sources', []): - src = source.get('src') - if not src: - continue + for source in traverse_obj(video_data, ('sources', lambda _, v: url_or_none(v['src']))): + src_url = source['src'] src_type = source.get('type') - if src_type == 'application/vnd.apple.mpegurl': - formats, subtitles = self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False) - elif src_type == 'application/dash+xml': - formats, subtitles = self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False) - - heartbeat = episode.get('heartbeat') or {} - omniture = episode.get('omniture') or {} - get_meta = lambda x: heartbeat.get(x) or omniture.get(x) + if src_type in ('application/vnd.apple.mpegurl', 'application/hls+legacy', 'application/hls+hevc'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif src_type in ('application/dash+xml', 'application/dash+hevc'): + fmts, subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + else: + continue + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'display_id': display_id, 'id': video_id, - 'title': title, - 'description': episode.get('descripcion'), - 'thumbnail': episode.get('imgPoster'), - 'duration': int_or_none(episode.get('duration')), 'formats': formats, - 'channel': get_meta('channel'), - 'season': get_meta('season'), - 'episode_number': int_or_none(get_meta('episodeNumber')), 'subtitles': subtitles, + **traverse_obj(video_data, { + 'title': ('titulo', {str}), + 'description': ('descripcion', {str}), + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('imgPoster', {url_or_none}, {lambda v: f'{v}1920x1080.jpg'}), + 'age_limit': ('ageRating', {parse_age_limit}), + }), + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'title', {str}), + 'age_limit': ('ageRating', {parse_age_limit}), + 'series': ('format', 'title', {str}), + 'season': ('currentSeason', 'title', {str}), + 'season_number': ('currentSeason', 'seasonNumber', {int_or_none}), + 'episode_number': ('numberOfEpisode', {int_or_none}), + 'timestamp': ('publicationDate', {int_or_none(scale=1000)}), + 'channel': ('channel', 'title', {str}), + }), } diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py index 0e3a03f03..9f6bd820c 100644 --- a/yt_dlp/extractor/azmedien.py +++ b/yt_dlp/extractor/azmedien.py @@ -1,7 +1,6 @@ -import json - from .common import InfoExtractor from .kaltura import KalturaIE +from ..utils.traversal import require, traverse_obj class AZMedienIE(InfoExtractor): @@ -9,15 +8,15 @@ class AZMedienIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:www\.|tv\.)? 
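                     # the four regional CH Media portals, all embedding the same Kaltura player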
-                    (?P<host>
+                    (?:
                         telezueri\.ch|
                         telebaern\.tv|
                         telem1\.ch|
                         tvo-online\.ch
                     )/
-                    [^/]+/
+                    [^/?#]+/
                     (?P<id>
-                        [^/]+-(?P<article_id>\d+)
+                        [^/?#]+-\d+
                     )
                     (?:
                         \#video=
@@ -47,19 +46,17 @@ class AZMedienIE(InfoExtractor):
         'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
         'only_matching': True,
     }]
-    _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be'
     _PARTNER_ID = '1719221'
 
     def _real_extract(self, url):
-        host, display_id, article_id, entry_id = self._match_valid_url(url).groups()
+        display_id, entry_id = self._match_valid_url(url).groups()
 
         if not entry_id:
-            entry_id = self._download_json(
-                self._API_TEMPL % (host, host.split('.')[0]), display_id, query={
-                    'variables': json.dumps({
-                        'contextId': 'NewsArticle:' + article_id,
-                    }),
-                })['data']['context']['mainAsset']['video']['kaltura']['kalturaId']
+            webpage = self._download_webpage(url, display_id)
+            data = self._search_json(
+                r'window\.__APOLLO_STATE__\s*=', webpage, 'video data', display_id)
+            entry_id = traverse_obj(data, (
+                lambda _, v: v['__typename'] == 'KalturaData', 'kalturaId', any, {require('kaltura id')}))
 
         return self.url_result(
             f'kaltura:{self._PARTNER_ID}:{entry_id}',
diff --git a/yt_dlp/extractor/bandlab.py b/yt_dlp/extractor/bandlab.py
index 64aa2ba70..f110b793b 100644
--- a/yt_dlp/extractor/bandlab.py
+++ b/yt_dlp/extractor/bandlab.py
@@ -86,7 +86,7 @@ def _parse_video(self, video_data, url=None):
             'webpage_url': (
                 'id', ({value(url)}, {format_field(template='https://www.bandlab.com/post/%s')}), filter, any),
             'url': ('video', 'url', {url_or_none}),
-            'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}),
+            'title': ('caption', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=72)}),
             'description': ('caption', {str}),
             'thumbnail': ('video', 'picture', 'url', {url_or_none}),
             'view_count': ('video', 'counters', 'plays', {int_or_none}),
@@ -120,7 +120,7 @@ class BandlabIE(BandlabBaseIE):
             'duration': 54.629999999999995,
             'title': 'sweet black',
             'upload_date': '20231210',
-            'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/',
+            'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/',
             'genres': ['Lofi'],
             'uploader': 'ender milze',
             'comment_count': int,
@@ -142,7 +142,7 @@ class BandlabIE(BandlabBaseIE):
             'duration': 54.629999999999995,
             'title': 'sweet black',
             'upload_date': '20231210',
-            'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/',
+            'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/fa082beb-b856-4730-9170-a57e4e32cc2c/',
             'genres': ['Lofi'],
             'uploader': 'ender milze',
             'comment_count': int,
@@ -158,7 +158,7 @@ class BandlabIE(BandlabBaseIE):
             'comment_count': int,
             'genres': ['Other'],
             'uploader_id': 'user8353034818103753',
-            'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/',
+            'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/51b18363-da23-4b9b-a29c-2933a3e561ca/',
             'timestamp': 1709625771,
             'track': 'PodcastMaerchen4b',
             'duration': 468.14,
@@ -178,7 +178,7 @@ class BandlabIE(BandlabBaseIE):
             'id': '110343fc-148b-ea11-96d2-0003ffd1fc09',
             'ext': 'm4a',
             'timestamp': 1588273294,
-            'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/',
+            'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/users/b612e533-e4f7-4542-9f50-3fcfd8dd822c/',
             'description':
'Final Revision.', 'title': 'Replay ( Instrumental)', 'uploader': 'David R Sparks', @@ -200,7 +200,7 @@ class BandlabIE(BandlabBaseIE): 'id': '5cdf9036-3857-ef11-991a-6045bd36e0d9', 'ext': 'mp4', 'duration': 44.705, - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/videos/67c6cef1-cef6-40d3-831e-a55bc1dcb972/', 'comment_count': int, 'title': 'backing vocals', 'uploader_id': 'marliashya', @@ -224,7 +224,7 @@ class BandlabIE(BandlabBaseIE): 'view_count': int, 'track': 'Positronic Meltdown', 'duration': 318.55, - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/songs/87165bc3-5439-496e-b1f7-a9f13b541ff2/', 'description': 'Checkout my tracks at AOMX http://aomxsounds.com/', 'uploader_id': 'microfreaks', 'title': 'Positronic Meltdown', @@ -246,7 +246,7 @@ class BandlabIE(BandlabBaseIE): 'comment_count': int, 'uploader': 'Sorakime', 'uploader_id': 'sorakime', - 'thumbnail': 'https://bandlabimages.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/', + 'thumbnail': 'https://bl-prod-images.azureedge.net/v1.0/users/572a351a-0f3a-4c6a-ac39-1a5defdeeb1c/', 'timestamp': 1691162128, 'upload_date': '20230804', 'media_type': 'track', diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 42b4e2d3c..6508942a4 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -1596,16 +1596,16 @@ def _real_extract(self, url): webpage = self._download_webpage(url, list_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id) - if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200: - error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none})) - error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none})) + error = traverse_obj(initial_state, (('error', 'listError'), all, lambda _, v: v['code'], any)) + if error and error['code'] != 200: + error_code = error.get('trueCode') if error_code == -400 and list_id == 'watchlater': self.raise_login_required('You need to login to access your watchlater playlist') elif error_code == -403: self.raise_login_required('This is a private playlist. 
You need to login as its owner')
             elif error_code == 11010:
                 raise ExtractorError('Playlist is no longer available', expected=True)
-            raise ExtractorError(f'Could not access playlist: {error_code} {error_message}')
+            raise ExtractorError(f'Could not access playlist: {error_code} {error.get("message")}')
 
         query = {
             'ps': 20,
diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py
index c83222ea5..981eebfbb 100644
--- a/yt_dlp/extractor/bitchute.py
+++ b/yt_dlp/extractor/bitchute.py
@@ -1,30 +1,32 @@
 import functools
+import json
 import re
 
 from .common import InfoExtractor
 from ..networking import HEADRequest
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
     clean_html,
-    extract_attributes,
+    determine_ext,
+    format_field,
     get_element_by_class,
-    get_element_by_id,
-    get_element_html_by_class,
     get_elements_html_by_class,
     int_or_none,
     orderedSet,
     parse_count,
     parse_duration,
-    traverse_obj,
-    unified_strdate,
+    parse_iso8601,
+    url_or_none,
     urlencode_postdata,
     urljoin,
 )
+from ..utils.traversal import traverse_obj
 
 
 class BitChuteIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:(?:www|old)\.)?bitchute\.com/(?:video|embed|torrent/[^/?#]+)/(?P<id>[^/?#&]+)'
     _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
     _TESTS = [{
         'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
@@ -34,12 +36,17 @@ class BitChuteIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'This is the first video on #BitChute !',
             'description': 'md5:a0337e7b1fe39e32336974af8173a034',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:https?://.+/.+\.jpg$',
             'uploader': 'BitChute',
             'upload_date': '20170103',
             'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
             'channel': 'BitChute',
             'channel_url': 'https://www.bitchute.com/channel/bitchute/',
+            'uploader_id': 'I5NgtHZn9vPj',
+            'channel_id': '1VBwRfyNcKdX',
+            'view_count': int,
+            'duration': 16.0,
+            'timestamp': 1483425443,
         },
     }, {
         # test case: video with different channel and uploader
         'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/',
@@ -49,13 +56,18 @@ class BitChuteIE(InfoExtractor):
             'id': 'Yti_j9A-UZ4',
             'ext': 'mp4',
             'title': 'Israel at War | Full Measure',
-            'description': 'md5:38cf7bc6f42da1a877835539111c69ef',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:e60198b89971966d6030d22b3268f08f',
+            'thumbnail': r're:https?://.+/.+\.jpg$',
             'uploader': 'sharylattkisson',
             'upload_date': '20231106',
             'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/',
             'channel': 'Full Measure with Sharyl Attkisson',
             'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/',
+            'uploader_id': '9K0kUWA9zmd9',
+            'channel_id': 'NpdxoCRv3ZLb',
+            'view_count': int,
+            'duration': 554.0,
+            'timestamp': 1699296106,
         },
     }, {
         # video not downloadable in browser, but we can recover it
         'url': 'https://www.bitchute.com/video/2s6B3rAWGdSP/',
@@ -66,25 +78,21 @@ class BitChuteIE(InfoExtractor):
             'ext': 'mp4',
             'filesize': 71537926,
             'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
-            'description': 'md5:228ee93bd840a24938f536aeac9cf749',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:2029c7c212ccd4b040f52bb2d036ef4e',
+            'thumbnail': r're:https?://.+/.+\.jpg$',
             'uploader': 'BitChute',
             'upload_date': '20181113',
             'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
             'channel': 'BitChute',
             'channel_url': 'https://www.bitchute.com/channel/bitchute/',
+            'uploader_id': 'I5NgtHZn9vPj',
+            'channel_id': '1VBwRfyNcKdX',
+
'view_count': int, + 'duration': 1701.0, + 'tags': ['bitchute'], + 'timestamp': 1542130287, }, 'params': {'check_formats': None}, - }, { - # restricted video - 'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/', - 'info_dict': { - 'id': 'WEnQU7XGcTdl', - 'ext': 'mp4', - 'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft', - }, - 'params': {'skip_download': True}, - 'skip': 'Georestricted in DE', }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'only_matching': True, @@ -96,11 +104,8 @@ class BitChuteIE(InfoExtractor): 'only_matching': True, }] _GEO_BYPASS = False - - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', - 'Referer': 'https://www.bitchute.com/', - } + _UPLOADER_URL_TMPL = 'https://www.bitchute.com/profile/%s/' + _CHANNEL_URL_TMPL = 'https://www.bitchute.com/channel/%s/' def _check_format(self, video_url, video_id): urls = orderedSet( @@ -112,7 +117,7 @@ def _check_format(self, video_url, video_id): for url in urls: try: response = self._request_webpage( - HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS) + HEADRequest(url), video_id=video_id, note=f'Checking {url}') except ExtractorError as e: self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}') continue @@ -121,54 +126,79 @@ def _check_format(self, video_url, video_id): 'filesize': int_or_none(response.headers.get('Content-Length')), } - def _raise_if_restricted(self, webpage): - page_title = clean_html(get_element_by_class('page-title', webpage)) or '' - if re.fullmatch(r'(?:Channel|Video) Restricted', page_title): - reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title - self.raise_geo_restricted(reason) - - @staticmethod - def _make_url(html): - path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href') - return urljoin('https://www.bitchute.com', path) + def _call_api(self, endpoint, data, display_id, fatal=True): + note = endpoint.rpartition('/')[2] + try: + return self._download_json( + f'https://api.bitchute.com/api/beta/{endpoint}', display_id, + f'Downloading {note} API JSON', f'Unable to download {note} API JSON', + data=json.dumps(data).encode(), + headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + errors = '. 
'.join(traverse_obj(e.cause.response.read().decode(), ( + {json.loads}, 'errors', lambda _, v: v['context'] == 'reason', 'message', {str}))) + if errors and 'location' in errors: + # Can always be fatal since the video/media call will reach this code first + self.raise_geo_restricted(errors) + if fatal: + raise + self.report_warning(e.msg) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - f'https://old.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) - - self._raise_if_restricted(webpage) - publish_date = clean_html(get_element_by_class('video-publish-date', webpage)) - entries = self._parse_html5_media_entries(url, webpage, video_id) + data = {'video_id': video_id} + media_url = self._call_api('video/media', data, video_id)['media_url'] formats = [] - for format_ in traverse_obj(entries, (0, 'formats', ...)): + if determine_ext(media_url) == 'm3u8': + formats.extend( + self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls', live=True)) + else: if self.get_param('check_formats') is not False: - format_.update(self._check_format(format_.pop('url'), video_id) or {}) - if 'url' not in format_: - continue - formats.append(format_) + if fmt := self._check_format(media_url, video_id): + formats.append(fmt) + else: + formats.append({'url': media_url}) if not formats: self.raise_no_formats( 'Video is unavailable. Please make sure this video is playable in the browser ' 'before reporting this issue.', expected=True, video_id=video_id) - details = get_element_by_class('details', webpage) or '' - uploader_html = get_element_html_by_class('creator', details) or '' - channel_html = get_element_html_by_class('name', details) or '' + video = self._call_api('video', data, video_id, fatal=False) + channel = None + if channel_id := traverse_obj(video, ('channel', 'channel_id', {str})): + channel = self._call_api('channel', {'channel_id': channel_id}, video_id, fatal=False) return { + **traverse_obj(video, { + 'title': ('video_name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'channel': ('channel', 'channel_name', {str}), + 'channel_id': ('channel', 'channel_id', {str}), + 'channel_url': ('channel', 'channel_url', {urljoin('https://www.bitchute.com/')}), + 'uploader_id': ('profile_id', {str}), + 'uploader_url': ('profile_id', {format_field(template=self._UPLOADER_URL_TMPL)}, filter), + 'timestamp': ('date_published', {parse_iso8601}), + 'duration': ('duration', {parse_duration}), + 'tags': ('hashtags', ..., {str}, filter, all, filter), + 'view_count': ('view_count', {int_or_none}), + 'is_live': ('state_id', {lambda x: x == 'live'}), + }), + **traverse_obj(channel, { + 'channel': ('channel_name', {str}), + 'channel_id': ('channel_id', {str}), + 'channel_url': ('url_slug', {format_field(template=self._CHANNEL_URL_TMPL)}, filter), + 'uploader': ('profile_name', {str}), + 'uploader_id': ('profile_id', {str}), + 'uploader_url': ('profile_id', {format_field(template=self._UPLOADER_URL_TMPL)}, filter), + }), 'id': video_id, - 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': clean_html(uploader_html), - 'uploader_url': self._make_url(uploader_html), - 'channel': clean_html(channel_html), - 'channel_url': self._make_url(channel_html), - 'upload_date': unified_strdate(self._search_regex( - r'at \d+:\d+ UTC on (.+?)\.', 
publish_date, 'upload date', fatal=False)), 'formats': formats, } @@ -190,7 +220,7 @@ class BitChuteChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg$', 'uploader': 'BitChute', 'upload_date': '20170103', 'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/', @@ -198,6 +228,9 @@ class BitChuteChannelIE(InfoExtractor): 'channel_url': 'https://www.bitchute.com/channel/bitchute/', 'duration': 16, 'view_count': int, + 'uploader_id': 'I5NgtHZn9vPj', + 'channel_id': '1VBwRfyNcKdX', + 'timestamp': 1483425443, }, }, ], @@ -213,6 +246,7 @@ class BitChuteChannelIE(InfoExtractor): 'title': 'Bruce MacDonald and "The Light of Darkness"', 'description': 'md5:747724ef404eebdfc04277714f81863e', }, + 'skip': '404 Not Found', }, { 'url': 'https://old.bitchute.com/playlist/wV9Imujxasw9/', 'only_matching': True, diff --git a/yt_dlp/extractor/bluesky.py b/yt_dlp/extractor/bluesky.py index 23344ac6c..8cb5c0d25 100644 --- a/yt_dlp/extractor/bluesky.py +++ b/yt_dlp/extractor/bluesky.py @@ -53,7 +53,7 @@ class BlueskyIE(InfoExtractor): 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', - 'title': 'Bluesky now has video! Update your app to versi...', + 'title': 'Bluesky now has video! Update your app to version 1.91 or refresh on ...', 'alt_title': 'Bluesky video feature announcement', 'description': r're:(?s)Bluesky now has video! .{239}', 'upload_date': '20240911', @@ -172,7 +172,7 @@ class BlueskyIE(InfoExtractor): 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', - 'title': 'Bluesky now has video! Update your app to versi...', + 'title': 'Bluesky now has video! Update your app to version 1.91 or refresh on ...', 'alt_title': 'Bluesky video feature announcement', 'description': r're:(?s)Bluesky now has video! 
.{239}',
             'upload_date': '20240911',
@@ -191,7 +191,7 @@
         'info_dict': {
             'id': '3l7rdfxhyds2f',
             'ext': 'mp4',
-            'uploader': 'cinnamon',
+            'uploader': 'cinnamon 🐇 🏳️‍⚧️',
             'uploader_id': 'cinny.bun.how',
             'uploader_url': 'https://bsky.app/profile/cinny.bun.how',
             'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide',
@@ -255,7 +255,7 @@
         'info_dict': {
             'id': '3l77u64l7le2e',
             'ext': 'mp4',
-            'title': 'hearing people on twitter say that bluesky isn\'...',
+            'title': "hearing people on twitter say that bluesky isn't funny yet so post t...",
             'like_count': int,
             'uploader_id': 'thafnine.net',
             'uploader_url': 'https://bsky.app/profile/thafnine.net',
@@ -387,7 +387,7 @@ def _extract_videos(self, root, video_id, embed_path='embed', record_path='recor
             'age_limit': (
                 'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any),
             'description': (*record_path, 'text', {str}, filter),
-            'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=50)}),
+            'title': (*record_path, 'text', {lambda x: x.replace('\n', ' ')}, {truncate_string(left=72)}),
             }),
         })
         return entries
diff --git a/yt_dlp/extractor/bokecc.py b/yt_dlp/extractor/bokecc.py
index 5fe937a6a..42047aced 100644
--- a/yt_dlp/extractor/bokecc.py
+++ b/yt_dlp/extractor/bokecc.py
@@ -24,7 +24,7 @@ def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
 
 
 class BokeCCIE(BokeCCBaseIE):
-    _IE_DESC = 'CC视频'
+    IE_DESC = 'CC视频'
     _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
 
     _TESTS = [{
diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py
index d7bf58b36..a2a908252 100644
--- a/yt_dlp/extractor/bpb.py
+++ b/yt_dlp/extractor/bpb.py
@@ -7,6 +7,7 @@
     join_nonempty,
     js_to_json,
     mimetype2ext,
+    parse_resolution,
     unified_strdate,
     url_or_none,
     urljoin,
@@ -110,24 +111,23 @@ def _parse_vue_attributes(self, name, string, video_id):
 
         return attributes
 
-    @staticmethod
-    def _process_source(source):
+    def _process_source(self, source):
         url = url_or_none(source['src'])
         if not url:
             return None
 
         source_type = source.get('type', '')
         extension = mimetype2ext(source_type)
-        is_video = source_type.startswith('video')
-        note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None
+        note = self._search_regex(r'[_-]([a-z]+)\.[\da-z]+(?:$|\?)', url, 'note', default=None)
 
         return {
             'url': url,
             'ext': extension,
-            'vcodec': None if is_video else 'none',
+            'vcodec': None if source_type.startswith('video') else 'none',
             'quality': 10 if note == 'high' else 0,
             'format_note': note,
             'format_id': join_nonempty(extension, note),
+            **parse_resolution(source.get('label')),
         }
 
     def _real_extract(self, url):
diff --git a/yt_dlp/extractor/bunnycdn.py b/yt_dlp/extractor/bunnycdn.py
new file mode 100644
index 000000000..d78753384
--- /dev/null
+++ b/yt_dlp/extractor/bunnycdn.py
@@ -0,0 +1,178 @@
+import json
+
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+    ExtractorError,
+    extract_attributes,
+    int_or_none,
+    parse_qs,
+    smuggle_url,
+    unsmuggle_url,
+    url_or_none,
+    urlhandle_detect_ext,
+)
+from ..utils.traversal import find_element, traverse_obj
+
+
+class BunnyCdnIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:iframe\.mediadelivery\.net|video\.bunnycdn\.com)/(?:embed|play)/(?P<library_id>\d+)/(?P<id>[\da-f-]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+src=[\'"](?P<url>{_VALID_URL}[^\'"]*)[\'"]']
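`_extract_embed_urls` below tucks the embedding page's URL into each discovered embed URL so that `_real_extract` can replay it as a Referer. A minimal sketch of that round-trip using yt-dlp's `smuggle_url`/`unsmuggle_url` helpers (the referring page here is purely illustrative):

```python
from yt_dlp.utils import smuggle_url, unsmuggle_url

# The extra data rides along in the URL fragment as JSON...
embed = 'https://iframe.mediadelivery.net/embed/113933/e73edec1-e381-4c8b-ae73-717a140e0924'
smuggled = smuggle_url(embed, {'Referer': 'https://example.com/watch'})

# ...and is recovered unchanged on the extraction side
clean_url, data = unsmuggle_url(smuggled, {})
assert clean_url == embed
assert data == {'Referer': 'https://example.com/watch'}
```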
+    _TESTS = [{
+        'url': 'https://iframe.mediadelivery.net/embed/113933/e73edec1-e381-4c8b-ae73-717a140e0924',
+        'info_dict': {
+            'id': 'e73edec1-e381-4c8b-ae73-717a140e0924',
+            'ext': 'mp4',
+            'title': 'mistress morgana (3).mp4',
+            'description': '',
+            'timestamp': 1693251673,
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/e73edec1-e381-4c8b-ae73-717a140e0924/thumbnail\.jpg',
+            'duration': 7.0,
+            'upload_date': '20230828',
+        },
+        'params': {'skip_download': True},
+    }, {
+        'url': 'https://iframe.mediadelivery.net/play/136145/32e34c4b-0d72-437c-9abb-05e67657da34',
+        'info_dict': {
+            'id': '32e34c4b-0d72-437c-9abb-05e67657da34',
+            'ext': 'mp4',
+            'timestamp': 1691145748,
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/32e34c4b-0d72-437c-9abb-05e67657da34/thumbnail_9172dc16\.jpg',
+            'duration': 106.0,
+            'description': 'md5:981a3e899a5c78352b21ed8b2f1efd81',
+            'upload_date': '20230804',
+            'title': 'Sanela ist Teil der #arbeitsmarktkraft',
+        },
+        'params': {'skip_download': True},
+    }, {
+        # Stream requires activation and pings
+        'url': 'https://iframe.mediadelivery.net/embed/200867/2e8545ec-509d-4571-b855-4cf0235ccd75',
+        'info_dict': {
+            'id': '2e8545ec-509d-4571-b855-4cf0235ccd75',
+            'ext': 'mp4',
+            'timestamp': 1708497752,
+            'title': 'netflix part 1',
+            'duration': 3959.0,
+            'description': '',
+            'upload_date': '20240221',
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/2e8545ec-509d-4571-b855-4cf0235ccd75/thumbnail\.jpg',
+        },
+        'params': {'skip_download': True},
+    }]
+    _WEBPAGE_TESTS = [{
+        # Stream requires Referer
+        'url': 'https://conword.io/',
+        'info_dict': {
+            'id': '3a5d863e-9cd6-447e-b6ef-e289af50b349',
+            'ext': 'mp4',
+            'title': 'Conword bei der Stadt Köln und Stadt Dortmund',
+            'description': '',
+            'upload_date': '20231031',
+            'duration': 31.0,
+            'thumbnail': 'https://video.watchuh.com/3a5d863e-9cd6-447e-b6ef-e289af50b349/thumbnail.jpg',
+            'timestamp': 1698783879,
+        },
+        'params': {'skip_download': True},
+    }, {
+        # URL requires token and expires
+        'url': 'https://www.stockphotos.com/video/moscow-subway-the-train-is-arriving-at-the-park-kultury-station-10017830',
+        'info_dict': {
+            'id': '0b02fa20-4e8c-4140-8f87-f64d820a3386',
+            'ext': 'mp4',
+            'thumbnail': r're:^https?://.*\.b-cdn\.net/0b02fa20-4e8c-4140-8f87-f64d820a3386/thumbnail\.jpg',
+            'title': 'Moscow subway. The train is arriving at the Park Kultury station.',
+            'upload_date': '20240531',
+            'duration': 18.0,
+            'timestamp': 1717152269,
+            'description': '',
+        },
+        'params': {'skip_download': True},
+    }]
+
+    @classmethod
+    def _extract_embed_urls(cls, url, webpage):
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield smuggle_url(embed_url, {'Referer': url})
+
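The `_bunnycdn_ping_data` assembled in `_real_extract` below is consumed by the `BunnyCdnFD` downloader introduced earlier in this patch, which pings the CDN while `HlsFD` fetches fragments. A minimal sketch of how those fields become a ping request, mirroring the hash construction in `BunnyCdnFD.ping_thread` (the helper name is ours, not part of the patch):

```python
import hashlib

def build_ping_url(url, secret, context_id, time, paused='false', resolution=1080):
    # The CDN validates an MD5 over secret, context id, timestamp,
    # pause state and resolution, joined with underscores in this order
    md5_hash = hashlib.md5(
        f'{secret}_{context_id}_{time}_{paused}_{resolution}'.encode()).hexdigest()
    return f'{url}?hash={md5_hash}&time={time}&paused={paused}&resolution={resolution}'
```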
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        video_id, library_id = self._match_valid_url(url).group('id', 'library_id')
+        webpage = self._download_webpage(
+            f'https://iframe.mediadelivery.net/embed/{library_id}/{video_id}', video_id,
+            headers=traverse_obj(smuggled_data, {'Referer': 'Referer'}),
+            query=traverse_obj(parse_qs(url), {'token': 'token', 'expires': 'expires'}))
+
+        if (html_title := self._html_extract_title(webpage, default=None)) == '403':
+            raise ExtractorError(
+                'This video is inaccessible. Setting a Referer header '
+                'might be required to access the video', expected=True)
+        elif html_title == '404':
+            raise ExtractorError('This video does not exist', expected=True)
+
+        headers = {'Referer': url}
+
+        info = traverse_obj(self._parse_html5_media_entries(url, webpage, video_id, _headers=headers), 0) or {}
+        formats = info.get('formats') or []
+        subtitles = info.get('subtitles') or {}
+
+        original_url = self._search_regex(
+            r'(?:var|const|let)\s+originalUrl\s*=\s*["\']([^"\']+)["\']', webpage, 'original url', default=None)
+        if url_or_none(original_url):
+            urlh = self._request_webpage(
+                HEADRequest(original_url), video_id=video_id, note='Checking original',
+                headers=headers, fatal=False, expected_status=(403, 404))
+            if urlh and urlh.status == 200:
+                formats.append({
+                    'url': original_url,
+                    'format_id': 'source',
+                    'quality': 1,
+                    'http_headers': headers,
+                    'ext': urlhandle_detect_ext(urlh, default='mp4'),
+                    'filesize': int_or_none(urlh.get_header('Content-Length')),
+                })
+
+        # MediaCage Streams require activation and pings
+        src_url = self._search_regex(
+            r'\.setAttribute\([\'"]src[\'"],\s*[\'"]([^\'"]+)[\'"]\)', webpage, 'src url', default=None)
+        activation_url = self._search_regex(
+            r'loadUrl\([\'"]([^\'"]+/activate)[\'"]', webpage, 'activation url', default=None)
+        ping_url = self._search_regex(
+            r'loadUrl\([\'"]([^\'"]+/ping)[\'"]', webpage, 'ping url', default=None)
+        secret = traverse_obj(parse_qs(src_url), ('secret', 0))
+        context_id = traverse_obj(parse_qs(src_url), ('contextId', 0))
+        ping_data = {}
+        if src_url and activation_url and ping_url and secret and context_id:
+            self._download_webpage(
+                activation_url, video_id, headers=headers, note='Downloading activation data')
+
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                src_url, video_id, 'mp4', headers=headers, m3u8_id='hls', fatal=False)
+            for fmt in fmts:
+                fmt.update({
+                    'protocol': 'bunnycdn',
+                    'http_headers': headers,
+                })
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
+
+            ping_data = {
+                '_bunnycdn_ping_data': {
+                    'url': ping_url,
+                    'headers': headers,
+                    'secret': secret,
+                    'context_id': context_id,
+                },
+            }
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(webpage, ({find_element(id='main-video', html=True)}, {extract_attributes}, {
+                'title': ('data-plyr-config', {json.loads}, 'title', {str}),
+                'thumbnail': ('data-poster', {url_or_none}),
+            })),
+            **ping_data,
+            **self._search_json_ld(webpage, video_id, fatal=False),
+        }
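The parentheses around the walrus assignment in the title check above are load-bearing: `:=` binds more loosely than `==`, so without them `html_title` would capture the boolean result of the comparison rather than the page title, and the `elif html_title == '404'` branch could never match. A two-line demonstration:

title = 'Some Page'
bad = (html_title := title == '403')   # html_title is now False, a bool
good = (html_title := title) == '403'  # html_title keeps the string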
diff --git a/yt_dlp/extractor/canalsurmas.py b/yt_dlp/extractor/canalsurmas.py
new file mode 100644
index 000000000..210973a0b
--- /dev/null
+++ b/yt_dlp/extractor/canalsurmas.py
@@ -0,0 +1,84 @@
+import json
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    jwt_decode_hs256,
+    parse_iso8601,
+    url_or_none,
+    variadic,
+)
+from ..utils.traversal import traverse_obj
+
+
+class CanalsurmasIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?canalsurmas\.es/videos/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.canalsurmas.es/videos/44006-el-gran-queo-1-lora-del-rio-sevilla-20072014',
+        'md5': '861f86fdc1221175e15523047d0087ef',
+        'info_dict': {
+            'id': '44006',
+            'ext': 'mp4',
+            'title': 'Lora del Río (Sevilla)',
+            'description': 'md5:3d9ee40a9b1b26ed8259e6b71ed27b8b',
+            'thumbnail': 'https://cdn2.rtva.interactvty.com/content_cards/00f3e8f67b0a4f3b90a4a14618a48b0d.jpg',
+            'timestamp': 1648123182,
+            'upload_date': '20220324',
+        },
+    }]
+    _API_BASE = 'https://api-rtva.interactvty.com'
+    _access_token = None
+
+    @staticmethod
+    def _is_jwt_expired(token):
+        return jwt_decode_hs256(token)['exp'] - time.time() < 300
+
+    def _call_api(self, endpoint, video_id, fields=None):
+        if not self._access_token or self._is_jwt_expired(self._access_token):
+            self._access_token = self._download_json(
+                f'{self._API_BASE}/jwt/token/', None,
+                'Downloading access token', 'Failed to download access token',
+                headers={'Content-Type': 'application/json'},
+                data=json.dumps({
+                    'username': 'canalsur_demo',
+                    'password': 'dsUBXUcI',
+                }).encode())['access']
+
+        return self._download_json(
+            f'{self._API_BASE}/api/2.0/contents/{endpoint}/{video_id}/', video_id,
+            f'Downloading {endpoint} API JSON', f'Failed to download {endpoint} API JSON',
+            headers={'Authorization': f'jwtok {self._access_token}'},
+            query={'optional_fields': ','.join(variadic(fields))} if fields else None)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_info = self._call_api('content', video_id, fields=[
+            'description', 'image', 'duration', 'created_at', 'tags',
+        ])
+        stream_info = self._call_api('content_resources', video_id, 'media_url')
+
+        formats, subtitles = [], {}
+        for stream_url in traverse_obj(stream_info, ('results', ..., 'media_url', {url_or_none})):
+            if determine_ext(stream_url) == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    stream_url, video_id, m3u8_id='hls', fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                formats.append({'url': stream_url})
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(video_info, {
+                'title': ('name', {str.strip}),
+                'description': ('description', {str}),
+                'thumbnail': ('image', {url_or_none}),
+                'duration': ('duration', {float_or_none}),
+                'timestamp': ('created_at', {parse_iso8601}),
+                'tags': ('tags', ..., {str}),
+            }),
+        }
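The extractors in this patch share the same JWT freshness test: decode the (unverified) payload and treat the token as stale five minutes before its `exp` claim, so a token is never used right at its expiry boundary. Reduced to a standalone helper:

import time

from yt_dlp.utils import jwt_decode_hs256

def is_jwt_expired(token, leeway=300):
    # jwt_decode_hs256 only base64-decodes the payload; no signature check
    return jwt_decode_hs256(token)['exp'] - time.time() < leeway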
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index c0cf3da3d..319771655 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -1,29 +1,32 @@
-import base64
 import functools
-import json
 import re
 import time
 import urllib.parse
 
 from .common import InfoExtractor
 from ..networking import HEADRequest
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     float_or_none,
     int_or_none,
     js_to_json,
+    jwt_decode_hs256,
     mimetype2ext,
     orderedSet,
+    parse_age_limit,
     parse_iso8601,
     replace_extension,
     smuggle_url,
     strip_or_none,
-    traverse_obj,
     try_get,
+    unified_timestamp,
     update_url,
     url_basename,
     url_or_none,
+    urlencode_postdata,
 )
+from ..utils.traversal import require, traverse_obj, trim_str
 
 
 class CBCIE(InfoExtractor):
@@ -516,9 +519,43 @@ def entries():
 
         return self.playlist_result(entries(), playlist_id)
 
 
-class CBCGemIE(InfoExtractor):
+class CBCGemBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'cbcgem'
+    _GEO_COUNTRIES = ['CA']
+
+    def _call_show_api(self, item_id, display_id=None):
+        return self._download_json(
+            f'https://services.radio-canada.ca/ott/catalog/v2/gem/show/{item_id}',
+            display_id or item_id, query={'device': 'web'})
+
+    def _extract_item_info(self, item_info):
+        episode_number = None
+        title = traverse_obj(item_info, ('title', {str}))
+        if title and (mobj := re.match(r'(?P<episode>\d+)\. (?P<title>.+)', title)):
+            episode_number = int_or_none(mobj.group('episode'))
+            title = mobj.group('title')
+
+        return {
+            'episode_number': episode_number,
+            **traverse_obj(item_info, {
+                'id': ('url', {str}),
+                'episode_id': ('url', {str}),
+                'description': ('description', {str}),
+                'thumbnail': ('images', 'card', 'url', {url_or_none}, {update_url(query=None)}),
+                'episode_number': ('episodeNumber', {int_or_none}),
+                'duration': ('metadata', 'duration', {int_or_none}),
+                'release_timestamp': ('metadata', 'airDate', {unified_timestamp}),
+                'timestamp': ('metadata', 'availabilityDate', {unified_timestamp}),
+                'age_limit': ('metadata', 'rating', {trim_str(start='C')}, {parse_age_limit}),
+            }),
+            'episode': title,
+            'title': title,
+        }
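The `_extract_item_info` helper above splits Gem's combined `'<number>. <title>'` strings into an episode number and a clean title; a quick illustration with a made-up input:

import re

mobj = re.match(r'(?P<episode>\d+)\. (?P<title>.+)', '1. The Cup Runneth Over')
print(int(mobj.group('episode')), '|', mobj.group('title'))  # -> 1 | The Cup Runneth Over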
+
+
+class CBCGemIE(CBCGemBaseIE):
     IE_NAME = 'gem.cbc.ca'
-    _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
+    _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s(?P<season>[0-9]+)[a-z][0-9]+)'
     _TESTS = [{
         # This is a normal, public, TV show video
         'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
@@ -529,7 +566,7 @@ class CBCGemIE(InfoExtractor):
             'description': 'md5:929868d20021c924020641769eb3e7f1',
             'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg',
             'duration': 1324,
-            'categories': ['comedy'],
+            'genres': ['Comédie et humour'],
             'series': 'Schitt\'s Creek',
             'season': 'Season 6',
             'season_number': 6,
@@ -537,9 +574,10 @@ class CBCGemIE(InfoExtractor):
             'episode_number': 1,
             'episode_id': 'schitts-creek/s06e01',
             'upload_date': '20210618',
-            'timestamp': 1623988800,
+            'timestamp': 1623974400,
             'release_date': '20200107',
-            'release_timestamp': 1578427200,
+            'release_timestamp': 1578355200,
+            'age_limit': 14,
         },
         'params': {'format': 'bv'},
     }, {
@@ -557,12 +595,13 @@ class CBCGemIE(InfoExtractor):
             'episode_number': 1,
             'episode': 'The Cup Runneth Over',
             'episode_id': 'schitts-creek/s01e01',
-            'duration': 1309,
-            'categories': ['comedy'],
+            'duration': 1308,
+            'genres': ['Comédie et humour'],
             'upload_date': '20210617',
-            'timestamp': 1623902400,
-            'release_date': '20151124',
-            'release_timestamp': 1448323200,
+            'timestamp': 1623888000,
+            'release_date': '20151123',
+            'release_timestamp': 1448236800,
+            'age_limit': 14,
         },
         'params': {'format': 'bv'},
     }, {
@@ -570,82 +609,107 @@ class CBCGemIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    _GEO_COUNTRIES = ['CA']
-    _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
-    _NETRC_MACHINE = 'cbcgem'
+    _CLIENT_ID = 'fc05b0ee-3865-4400-a3cc-3da82c330c23'
+    _refresh_token = None
+    _access_token = None
     _claims_token = None
 
-    def _new_claims_token(self, email, password):
-        data = json.dumps({
-            'email': email,
-            'password': password,
-        }).encode()
-        headers = {'content-type': 'application/json'}
-        query = {'apikey': self._TOKEN_API_KEY}
-        resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login',
-                                   None, data=data, headers=headers, query=query)
-        access_token = resp['access_token']
+    @functools.cached_property
+    def _ropc_settings(self):
+        return self._download_json(
+            'https://services.radio-canada.ca/ott/catalog/v1/gem/settings', None,
+            'Downloading site settings', query={'device': 'web'})['identityManagement']['ropc']
 
-        query = {
-            'access_token': access_token,
-            'apikey': self._TOKEN_API_KEY,
-            'jwtapp': 'jwt',
-        }
-        resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token',
-                                   None, headers=headers, query=query)
-        sig = resp['signature']
+    def _is_jwt_expired(self, token):
+        return jwt_decode_hs256(token)['exp'] - time.time() < 300
 
-        data = json.dumps({'jwt': sig}).encode()
-        headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
-        resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
-                                   None, data=data, headers=headers, expected_status=426)
-        cbc_access_token = resp['accessToken']
+    def _call_oauth_api(self, oauth_data, note='Refreshing access token'):
+        response = self._download_json(
+            self._ropc_settings['url'], None, note, data=urlencode_postdata({
+                'client_id': self._CLIENT_ID,
+                **oauth_data,
+                'scope': self._ropc_settings['scopes'],
+            }))
+        self._refresh_token = response['refresh_token']
+        self._access_token = response['access_token']
+        self.cache.store(self._NETRC_MACHINE, 'token_data', [self._refresh_token, self._access_token])
 
-        headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
-        resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
-                                   None, headers=headers, expected_status=426)
-        return resp['claimsToken']
+    def _perform_login(self, username, password):
+        if not self._refresh_token:
+            self._refresh_token, self._access_token = self.cache.load(
+                self._NETRC_MACHINE, 'token_data', default=[None, None])
 
-    def _get_claims_token_expiry(self):
-        # Token is a JWT
-        # JWT is decoded here and 'exp' field is extracted
-        # It is a Unix timestamp for when the token expires
-        b64_data = self._claims_token.split('.')[1]
-        data = base64.urlsafe_b64decode(b64_data + '==')
-        return json.loads(data)['exp']
+        if self._refresh_token and self._access_token:
+            self.write_debug('Using cached refresh token')
+            if not self._claims_token:
+                self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
+            return
 
-    def claims_token_expired(self):
-        exp = self._get_claims_token_expiry()
-        # It will expire in less than 10 seconds, or has already expired
-        return exp - time.time() < 10
+        try:
+            self._call_oauth_api({
+                'grant_type': 'password',
+                'username': username,
+                'password': password,
+            }, note='Logging in')
+        except ExtractorError as e:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 400:
+                raise ExtractorError('Invalid username and/or password', expected=True)
+            raise
 
-    def claims_token_valid(self):
-        return self._claims_token is not None and not self.claims_token_expired()
+    def _fetch_access_token(self):
+        if self._is_jwt_expired(self._access_token):
+            try:
+                self._call_oauth_api({
+                    'grant_type': 'refresh_token',
+                    'refresh_token': self._refresh_token,
+                })
+            except ExtractorError:
+                self._refresh_token, self._access_token = None, None
+                self.cache.store(self._NETRC_MACHINE, 'token_data', [None, None])
+                self.report_warning('Refresh token has been invalidated; retrying with credentials')
+                self._perform_login(*self._get_login_info())
 
-    def _get_claims_token(self, email, password):
-        if not self.claims_token_valid():
-            self._claims_token = self._new_claims_token(email, password)
+        return self._access_token
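The login flow above is a standard OAuth2 resource-owner-password (ROPC) exchange followed by refresh-token renewals; both POST the same token endpoint taken from the site's settings JSON. A sketch of the two payload shapes, with placeholder values:

password_grant = {  # first login
    'client_id': 'fc05b0ee-3865-4400-a3cc-3da82c330c23',
    'scope': '<scopes from the ropc settings>',
    'grant_type': 'password',
    'username': 'user@example.com',
    'password': 'hunter2',
}
refresh_grant = {  # silent renewal once the access token nears expiry
    'client_id': 'fc05b0ee-3865-4400-a3cc-3da82c330c23',
    'scope': '<scopes from the ropc settings>',
    'grant_type': 'refresh_token',
    'refresh_token': '<cached refresh token>',
}
# Either response carries fresh 'access_token' and 'refresh_token' fields,
# which _call_oauth_api persists via self.cache for later runs.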
+
+    def _fetch_claims_token(self):
+        if not self._get_login_info()[0]:
+            return None
+
+        if not self._claims_token or self._is_jwt_expired(self._claims_token):
+            self._claims_token = self._download_json(
+                'https://services.radio-canada.ca/ott/subscription/v2/gem/Subscriber/profile',
+                None, 'Downloading claims token', query={'device': 'web'},
+                headers={'Authorization': f'Bearer {self._fetch_access_token()}'})['claimsToken']
             self.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token)
+        else:
+            self.write_debug('Using cached claims token')
+
+        return self._claims_token
 
-    def _real_initialize(self):
-        if self.claims_token_valid():
-            return
-        self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token')
-
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        video_info = self._download_json(
-            f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}',
-            video_id, expected_status=426)
+        video_id, season_number = self._match_valid_url(url).group('id', 'season')
+        video_info = self._call_show_api(video_id)
+        item_info = traverse_obj(video_info, (
+            'content', ..., 'lineups', ..., 'items',
+            lambda _, v: v['url'] == video_id, any, {require('item info')}))
 
-        email, password = self._get_login_info()
-        if email and password:
-            claims_token = self._get_claims_token(email, password)
-            headers = {'x-claims-token': claims_token}
-        else:
-            headers = {}
-        m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
+        headers = {}
+        if claims_token := self._fetch_claims_token():
+            headers['x-claims-token'] = claims_token
+
+        m3u8_info = self._download_json(
+            'https://services.radio-canada.ca/media/validation/v2/',
+            video_id, headers=headers, query={
+                'appCode': 'gem',
+                'connectionType': 'hd',
+                'deviceType': 'ipad',
+                'multibitrate': 'true',
+                'output': 'json',
+                'tech': 'hls',
+                'manifestVersion': '2',
+                'manifestType': 'desktop',
+                'idMedia': item_info['idMedia'],
+            })
 
         if m3u8_info.get('errorCode') == 1:
             self.raise_geo_restricted(countries=['CA'])
@@ -671,26 +735,20 @@ def _real_extract(self, url):
                 fmt['preference'] = -2
 
         return {
+            'season_number': int_or_none(season_number),
+            **traverse_obj(video_info, {
+                'series': ('title', {str}),
+                'season_number': ('structuredMetadata', 'partofSeason', 'seasonNumber', {int_or_none}),
+                'genres': ('structuredMetadata', 'genre', ..., {str}),
+            }),
+            **self._extract_item_info(item_info),
             'id': video_id,
             'episode_id': video_id,
             'formats': formats,
-            **traverse_obj(video_info, {
-                'title': ('title', {str}),
-                'episode': ('title', {str}),
-                'description': ('description', {str}),
-                'thumbnail': ('image', {url_or_none}),
-                'series': ('series', {str}),
-                'season_number': ('season', {int_or_none}),
-                'episode_number': ('episode', {int_or_none}),
-                'duration': ('duration', {int_or_none}),
-                'categories': ('category', {str}, all),
-                'release_timestamp': ('airDate', {int_or_none(scale=1000)}),
-                'timestamp': ('availableDate', {int_or_none(scale=1000)}),
-            }),
         }
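The lineup search above leans on two traversal features: a predicate lambda that matches the item by URL, and `require()`, which turns a missed lookup into an `ExtractorError` instead of a silent `None`. With toy data:

from yt_dlp.utils.traversal import require, traverse_obj

show = {'content': [{'lineups': [{'items': [{'url': 'schitts-creek/s06e01', 'idMedia': 821}]}]}]}
item = traverse_obj(show, (
    'content', ..., 'lineups', ..., 'items',
    lambda _, v: v['url'] == 'schitts-creek/s06e01', any, {require('item info')}))
print(item['idMedia'])  # -> 821; a miss would raise ExtractorError instead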
 
 
-class CBCGemPlaylistIE(InfoExtractor):
+class CBCGemPlaylistIE(CBCGemBaseIE):
     IE_NAME = 'gem.cbc.ca:playlist'
     _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
     _TESTS = [{
@@ -700,70 +758,35 @@ class CBCGemPlaylistIE(InfoExtractor):
         'info_dict': {
             'id': 'schitts-creek/s06',
             'title': 'Season 6',
-            'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
             'series': 'Schitt\'s Creek',
             'season_number': 6,
             'season': 'Season 6',
-            'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75',
         },
     }, {
         'url': 'https://gem.cbc.ca/schitts-creek/s06',
         'only_matching': True,
     }]
-    _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
+
+    def _entries(self, season_info):
+        for episode in traverse_obj(season_info, ('items', lambda _, v: v['url'])):
+            yield self.url_result(
+                f'https://gem.cbc.ca/media/{episode["url"]}', CBCGemIE,
+                **self._extract_item_info(episode))
 
     def _real_extract(self, url):
-        match = self._match_valid_url(url)
-        season_id = match.group('id')
-        show = match.group('show')
-        show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426)
-        season = int(match.group('season'))
+        season_id, show, season = self._match_valid_url(url).group('id', 'show', 'season')
+        show_info = self._call_show_api(show, display_id=season_id)
+        season_info = traverse_obj(show_info, (
+            'content', ..., 'lineups',
+            lambda _, v: v['seasonNumber'] == int(season), any, {require('season info')}))
 
-        season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None)
-
-        if season_info is None:
-            raise ExtractorError(f'Couldn\'t find season {season} of {show}')
-
-        episodes = []
-        for episode in season_info['assets']:
-            episodes.append({
-                '_type': 'url_transparent',
-                'ie_key': 'CBCGem',
-                'url': 'https://gem.cbc.ca/media/' + episode['id'],
-                'id': episode['id'],
-                'title': episode.get('title'),
-                'description': episode.get('description'),
-                'thumbnail': episode.get('image'),
-                'series': episode.get('series'),
-                'season_number': episode.get('season'),
-                'season': season_info['title'],
-                'season_id': season_info.get('id'),
-                'episode_number': episode.get('episode'),
-                'episode': episode.get('title'),
-                'episode_id': episode['id'],
-                'duration': episode.get('duration'),
-                'categories': [episode.get('category')],
-            })
-
-        thumbnail = None
-        tn_uri = season_info.get('image')
-        # the-national was observed to use a "data:image/png;base64"
-        # URI for their 'image' value. The image was 1x1, and is
-        # probably just a placeholder, so it is ignored.
-        if tn_uri is not None and not tn_uri.startswith('data:'):
-            thumbnail = tn_uri
-
-        return {
-            '_type': 'playlist',
-            'entries': episodes,
-            'id': season_id,
-            'title': season_info['title'],
-            'description': season_info.get('description'),
-            'thumbnail': thumbnail,
-            'series': show_info.get('title'),
-            'season_number': season_info.get('season'),
-            'season': season_info['title'],
-        }
+        return self.playlist_result(
+            self._entries(season_info), season_id,
+            **traverse_obj(season_info, {
+                'title': ('title', {str}),
+                'season': ('title', {str}),
+                'season_number': ('seasonNumber', {int_or_none}),
+            }), series=traverse_obj(show_info, ('title', {str})))
 
 
 class CBCGemLiveIE(InfoExtractor):
diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py
index b2738e492..027b37d44 100644
--- a/yt_dlp/extractor/cda.py
+++ b/yt_dlp/extractor/cda.py
@@ -13,16 +13,17 @@
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
+    determine_ext,
     float_or_none,
     int_or_none,
     merge_dicts,
     multipart_encode,
     parse_duration,
-    traverse_obj,
     try_call,
-    try_get,
+    url_or_none,
     urljoin,
 )
+from ..utils.traversal import traverse_obj
 
 
 class CDAIE(InfoExtractor):
@@ -121,10 +122,7 @@ def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
         }, **kwargs)
 
     def _perform_login(self, username, password):
-        app_version = random.choice((
-            '1.2.88 build 15306',
-            '1.2.174 build 18469',
-        ))
+        app_version = '1.2.255 build 21541'
         android_version = random.randrange(8, 14)
         phone_model = random.choice((
             # x-kom.pl top selling Android smartphones, as of 2022-12-26
@@ -190,7 +188,7 @@ def _api_extract(self, video_id):
         meta = self._download_json(
             f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
 
-        uploader = traverse_obj(meta, 'author', 'login')
+        uploader = traverse_obj(meta, ('author', 'login', {str}))
 
         formats = [{
             'url': quality['file'],
@@ -293,34 +291,47 @@ def extract_format(page, version):
             if not video or 'file' not in video:
                 self.report_warning(f'Unable to extract {version} version information')
                 return
-            if video['file'].startswith('uggc'):
-                video['file'] = codecs.decode(video['file'], 'rot_13')
-                if video['file'].endswith('adc.mp4'):
-                    video['file'] = video['file'].replace('adc.mp4', '.mp4')
-            elif not video['file'].startswith('http'):
-                video['file'] = decrypt_file(video['file'])
             video_quality = video.get('quality')
             qualities = video.get('qualities', {})
             video_quality = next((k for k, v in qualities.items() if v == video_quality), video_quality)
-            info_dict['formats'].append({
-                'url': video['file'],
-                'format_id': video_quality,
-                'height': int_or_none(video_quality[:-1]),
-            })
+            if video.get('file'):
+                if video['file'].startswith('uggc'):
+                    video['file'] = codecs.decode(video['file'], 'rot_13')
+                    if video['file'].endswith('adc.mp4'):
+                        video['file'] = video['file'].replace('adc.mp4', '.mp4')
+                elif not video['file'].startswith('http'):
+                    video['file'] = decrypt_file(video['file'])
+                info_dict['formats'].append({
+                    'url': video['file'],
+                    'format_id': video_quality,
+                    'height': int_or_none(video_quality[:-1]),
+                })
             for quality, cda_quality in qualities.items():
                 if quality == video_quality:
                     continue
                 data = {'jsonrpc': '2.0', 'method': 'videoGetLink', 'id': 2,
                         'params': [video_id, cda_quality, video.get('ts'), video.get('hash2'), {}]}
                 data = json.dumps(data).encode()
-                video_url = self._download_json(
+                response = self._download_json(
                     f'https://www.cda.pl/video/{video_id}', video_id, headers={
                         'Content-Type': 'application/json',
                         'X-Requested-With': 'XMLHttpRequest',
                     }, data=data, note=f'Fetching {quality} url',
                     errnote=f'Failed to fetch {quality} url', fatal=False)
-                if try_get(video_url, lambda x: x['result']['status']) == 'ok':
-                    video_url = try_get(video_url, lambda x: x['result']['resp'])
+                if (
+                    traverse_obj(response, ('result', 'status')) != 'ok'
+                    or not traverse_obj(response, ('result', 'resp', {url_or_none}))
+                ):
+                    continue
+
+                video_url = response['result']['resp']
+                ext = determine_ext(video_url)
+                if ext == 'mpd':
+                    info_dict['formats'].extend(self._extract_mpd_formats(
+                        video_url, video_id, mpd_id='dash', fatal=False))
+                elif ext == 'm3u8':
+                    info_dict['formats'].extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                else:
                     info_dict['formats'].append({
                         'url': video_url,
                         'format_id': quality,
@@ -356,7 +367,7 @@ def extract_format(page, version):
 
 class CDAFolderIE(InfoExtractor):
     _MAX_PAGE_SIZE = 36
-    _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>\w+)/folder/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?cda\.pl/(?P<channel>[\w-]+)/folder/(?P<id>\d+)'
     _TESTS = [
         {
             'url': 'https://www.cda.pl/domino264/folder/31188385',
@@ -381,6 +392,9 @@ class CDAFolderIE(InfoExtractor):
             'title': 'TESTY KOSMETYKÓW',
         },
         'playlist_mincount': 139,
+    }, {
+        'url': 'https://www.cda.pl/FILMY-SERIALE-ANIME-KRESKOWKI-BAJKI/folder/18493422',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
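The `'uggc'` prefix test above is ROT13 for `'http'`: CDA obfuscates some file URLs by rotating the letters, which the stdlib codec undoes. The second URL below is a made-up sample:

import codecs

print(codecs.decode('uggc', 'rot_13'))  # -> 'http'
print(codecs.decode('uggcf://rknzcyr.pbz/ivqrb.zc4', 'rot_13'))
# -> 'https://example.com/video.mp4' (digits and punctuation pass through)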
diff --git a/yt_dlp/extractor/chzzk.py b/yt_dlp/extractor/chzzk.py
index aec77ac45..a5daf5ca7 100644
--- a/yt_dlp/extractor/chzzk.py
+++ b/yt_dlp/extractor/chzzk.py
@@ -21,7 +21,7 @@ class CHZZKLiveIE(InfoExtractor):
             'channel': '진짜도현',
             'channel_id': 'c68b8ef525fb3d2fa146344d84991753',
             'channel_is_verified': False,
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
             'timestamp': 1705510344,
             'upload_date': '20240117',
             'live_status': 'is_live',
@@ -98,7 +98,7 @@ class CHZZKVideoIE(InfoExtractor):
             'channel': '침착맨',
             'channel_id': 'bb382c2c0cc9fa7c86ab3b037fb5799c',
             'channel_is_verified': False,
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
             'duration': 15577,
             'timestamp': 1702970505.417,
             'upload_date': '20231219',
@@ -115,7 +115,7 @@ class CHZZKVideoIE(InfoExtractor):
             'channel': '라디유radiyu',
             'channel_id': '68f895c59a1043bc5019b5e08c83a5c5',
             'channel_is_verified': False,
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
             'duration': 95,
             'timestamp': 1703102631.722,
             'upload_date': '20231220',
@@ -131,12 +131,30 @@ class CHZZKVideoIE(InfoExtractor):
             'channel': '강지',
             'channel_id': 'b5ed5db484d04faf4d150aedd362f34b',
             'channel_is_verified': True,
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
             'duration': 4433,
             'timestamp': 1703307460.214,
             'upload_date': '20231223',
             'view_count': int,
         },
+    }, {
+        # video_status == 'NONE' but is downloadable
+        'url': 'https://chzzk.naver.com/video/6325166',
+        'info_dict': {
+            'id': '6325166',
+            'ext': 'mp4',
+            'title': '와이프 숙제빼주기',
+            'channel': '이 다',
+            'channel_id': '0076a519f147ee9fd0959bf02f9571ca',
+            'channel_is_verified': False,
+            'view_count': int,
+            'duration': 28167,
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1742139216.86,
+            'upload_date': '20250316',
+            'live_status': 'was_live',
+        },
+        'params': {'skip_download': 'm3u8'},
     }]
 
     def _real_extract(self, url):
@@ -147,11 +165,7 @@ def _real_extract(self, url):
         live_status = 'was_live' if video_meta.get('liveOpenDate') else 'not_live'
         video_status = video_meta.get('vodStatus')
 
-        if video_status == 'UPLOAD':
-            playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id)
-            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-                playback['media'][0]['path'], video_id, 'mp4', m3u8_id='hls')
-        elif video_status == 'ABR_HLS':
+        if video_status == 'ABR_HLS':
             formats, subtitles = self._extract_mpd_formats_and_subtitles(
                 f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}',
                 video_id, query={
@@ -161,10 +175,17 @@ def _real_extract(self, url):
                     'cpl': 'en_US',
                 })
         else:
-            self.raise_no_formats(
-                f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id)
-            formats, subtitles = [], {}
-            live_status = 'post_live' if live_status == 'was_live' else None
+            fatal = video_status == 'UPLOAD'
+            playback = self._parse_json(video_meta['liveRewindPlaybackJson'], video_id, fatal=fatal)
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+                traverse_obj(playback, ('media', 0, 'path')), video_id, 'mp4', m3u8_id='hls', fatal=fatal)
+            if formats and video_status != 'UPLOAD':
+                self.write_debug(f'Video found with status: "{video_status}"')
+            elif not formats:
+                self.raise_no_formats(
+                    f'Unknown video status detected: "{video_status}"', expected=True, video_id=video_id)
+                formats, subtitles = [], {}
+                live_status = 'post_live' if live_status == 'was_live' else None
 
         return {
             'id': video_id,
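The reworked fallback above downgrades unknown `vodStatus` values from a hard error to a best-effort attempt; the key enabler is that `traverse_obj` tolerates the `None` that `_parse_json(..., fatal=False)` returns on bad data:

from yt_dlp.utils.traversal import traverse_obj

playback = None  # what _parse_json(..., fatal=False) yields on missing/bad JSON
print(traverse_obj(playback, ('media', 0, 'path')))  # -> None, instead of a TypeError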
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index fcd9293da..00f466829 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -2,7 +2,6 @@
 import collections
 import functools
 import getpass
-import hashlib
 import http.client
 import http.cookiejar
 import http.cookies
@@ -30,6 +29,7 @@
 from ..cookies import LenientSimpleCookie
 from ..downloader.f4m import get_base_url, remove_encrypted_media
 from ..downloader.hls import HlsFD
+from ..globals import plugin_ies_overrides
 from ..networking import HEADRequest, Request
 from ..networking.exceptions import (
     HTTPError,
@@ -78,7 +78,7 @@
     parse_iso8601,
     parse_m3u8_attributes,
     parse_resolution,
-    sanitize_filename,
+    qualities,
     sanitize_url,
     smuggle_url,
     str_or_none,
@@ -100,6 +100,7 @@
     xpath_text,
     xpath_with_ns,
 )
+from ..utils._utils import _request_dump_filename
 
 
 class InfoExtractor:
@@ -201,6 +202,11 @@ class InfoExtractor:
                                      fragment_base_url
                     * "duration" (optional, int or float)
                     * "filesize" (optional, int)
+                    * hls_media_playlist_data
+                                 The M3U8 media playlist data as a string.
+                                 Only use if the data must be modified during extraction and
+                                 the native HLS downloader should bypass requesting the URL.
+                                 Does not apply if ffmpeg is used as external downloader
                     * is_from_start  Is a live format that can be downloaded
                                     from the start. Boolean
                     * preference  Order number of this format. If this field is
@@ -1017,23 +1023,6 @@ def __check_blocked(self, content):
                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                 expected=True)
 
-    def _request_dump_filename(self, url, video_id, data=None):
-        if data is not None:
-            data = hashlib.md5(data).hexdigest()
-        basen = join_nonempty(video_id, data, url, delim='_')
-        trim_length = self.get_param('trim_file_name') or 240
-        if len(basen) > trim_length:
-            h = '___' + hashlib.md5(basen.encode()).hexdigest()
-            basen = basen[:trim_length - len(h)] + h
-        filename = sanitize_filename(f'{basen}.dump', restricted=True)
-        # Working around MAX_PATH limitation on Windows (see
-        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
-        if os.name == 'nt':
-            absfilepath = os.path.abspath(filename)
-            if len(absfilepath) > 259:
-                filename = fR'\\?\{absfilepath}'
-        return filename
-
     def __decode_webpage(self, webpage_bytes, encoding, headers):
         if not encoding:
             encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
@@ -1062,7 +1051,9 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
         if self.get_param('write_pages'):
             if isinstance(url_or_request, Request):
                 data = self._create_request(url_or_request, data).data
-            filename = self._request_dump_filename(urlh.url, video_id, data)
+            filename = _request_dump_filename(
+                urlh.url, video_id, data,
+                trim_length=self.get_param('trim_file_name'))
             self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
@@ -1123,7 +1114,9 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                              impersonate=None, require_impersonation=False):
         if self.get_param('load_pages'):
             url_or_request = self._create_request(url_or_request, data, headers, query)
-            filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
+            filename = _request_dump_filename(
+                url_or_request.url, video_id, url_or_request.data,
+                trim_length=self.get_param('trim_file_name'))
             self.to_screen(f'Loading request from {filename}')
             try:
                 with open(filename, 'rb') as dumpf:
@@ -1577,6 +1570,8 @@ def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
         """Yield all json ld objects in the html"""
         if default is not NO_DEFAULT:
             fatal = False
+        if not fatal and not isinstance(html, str):
+            return
         for mobj in re.finditer(JSON_LD_RE, html):
             json_ld_item = self._parse_json(
                 mobj.group('json_ld'), video_id, fatal=fatal,
@@ -2185,6 +2180,8 @@ def extract_media(x_media_line):
             media_url = media.get('URI')
             if media_url:
                 manifest_url = format_url(media_url)
+                is_audio = media_type == 'AUDIO'
+                is_alternate = media.get('DEFAULT') == 'NO' or media.get('AUTOSELECT') == 'NO'
                 formats.extend({
                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                     'format_note': name,
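A hypothetical consumer of the `hls_media_playlist_data` field documented above: an extractor that has to patch the media playlist can hand the modified text straight to the native HLS downloader so the original URL is never re-requested. A sketch inside some `_real_extract` (the rewrite shown is illustrative):

playlist = self._download_webpage(m3u8_url, video_id, 'Downloading media playlist')
playlist = playlist.replace('live=1', 'live=0')  # hypothetical fix-up
formats = [{
    'url': m3u8_url,
    'format_id': 'hls',
    'protocol': 'm3u8_native',
    'hls_media_playlist_data': playlist,  # downloader uses this, skips the URL
}]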
@@ -2197,7 +2194,11 @@ def extract_media(x_media_line):
                     'preference': preference,
                     'quality': quality,
                     'has_drm': has_drm,
-                    'vcodec': 'none' if media_type == 'AUDIO' else None,
+                    'vcodec': 'none' if is_audio else None,
+                    # Alternate audio formats (e.g. audio description) should be deprioritized
+                    'source_preference': -2 if is_audio and is_alternate else None,
+                    # Save this to assign source_preference based on associated video stream
+                    '_audio_group_id': group_id if is_audio and not is_alternate else None,
                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
 
         def build_stream_name():
@@ -2292,6 +2293,8 @@ def build_stream_name():
                 # ignore references to rendition groups and treat them
                 # as complete formats.
                 if audio_group_id and codecs and f.get('vcodec') != 'none':
+                    # Save this to determine quality of audio formats that only have a GROUP-ID
+                    f['_audio_group_id'] = audio_group_id
                     audio_group = groups.get(audio_group_id)
                     if audio_group and audio_group[0].get('URI'):
                         # TODO: update acodec for audio only formats with
@@ -2314,6 +2317,28 @@ def build_stream_name():
                     formats.append(http_f)
 
                 last_stream_inf = {}
+
+        # Some audio-only formats only have a GROUP-ID without any other quality/bitrate/codec info
+        # Each audio GROUP-ID corresponds with one or more video formats' AUDIO attribute
+        # For sorting purposes, set source_preference based on the quality of the video formats they are grouped with
+        # See https://github.com/yt-dlp/yt-dlp/issues/11178
+        audio_groups_by_quality = orderedSet(f['_audio_group_id'] for f in sorted(
+            traverse_obj(formats, lambda _, v: v.get('vcodec') != 'none' and v['_audio_group_id']),
+            key=lambda x: (x.get('tbr') or 0, x.get('width') or 0)))
+        audio_quality_map = {
+            audio_groups_by_quality[0]: 'low',
+            audio_groups_by_quality[-1]: 'high',
+        } if len(audio_groups_by_quality) > 1 else None
+        audio_preference = qualities(audio_groups_by_quality)
+        for fmt in formats:
+            audio_group_id = fmt.pop('_audio_group_id', None)
+            if not audio_quality_map or not audio_group_id or fmt.get('vcodec') != 'none':
+                continue
+            # Use source_preference since quality and preference are set by params
+            fmt['source_preference'] = audio_preference(audio_group_id)
+            fmt['format_note'] = join_nonempty(
+                fmt.get('format_note'), audio_quality_map.get(audio_group_id), delim=', ')
+
         return formats, subtitles
 
     def _extract_m3u8_vod_duration(
@@ -2949,8 +2974,7 @@ def location_key(location):
             segment_duration = None
             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
-                representation_ms_info['total_number'] = int(math.ceil(
-                    float_or_none(period_duration, segment_duration, default=0)))
+                representation_ms_info['total_number'] = math.ceil(float_or_none(period_duration, segment_duration, default=0))
             representation_ms_info['fragments'] = [{
                 media_location_key: media_template % {
                     'Number': segment_number,
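`qualities()` above builds the ranking callable used for `source_preference`: it maps each known group ID to its index in the ordered list (and -1 for anything unknown), so audio groups tied to better video streams sort higher:

from yt_dlp.utils import qualities

audio_preference = qualities(['aud_low', 'aud_high'])  # sample group IDs
print(audio_preference('aud_low'))   # -> 0
print(audio_preference('aud_high'))  # -> 1
print(audio_preference('other'))     # -> -1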
@@ -3977,14 +4001,18 @@ def _extract_url(cls, webpage):  # TODO: Remove
     def __init_subclass__(cls, *, plugin_name=None, **kwargs):
         if plugin_name:
             mro = inspect.getmro(cls)
-            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
-            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
-            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
+            next_mro_class = super_class = mro[mro.index(cls) + 1]
+
             while getattr(super_class, '__wrapped__', None):
                 super_class = super_class.__wrapped__
-            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
-            _PLUGIN_OVERRIDES[super_class].append(cls)
+
+            if not any(override.PLUGIN_NAME == plugin_name for override in plugin_ies_overrides.value[super_class]):
+                cls.__wrapped__ = next_mro_class
+                cls.PLUGIN_NAME, cls.ie_key = plugin_name, next_mro_class.ie_key
+                cls.IE_NAME = f'{next_mro_class.IE_NAME}+{plugin_name}'
+
+                setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
+                plugin_ies_overrides.value[super_class].append(cls)
 
         return super().__init_subclass__(**kwargs)
 
@@ -4040,6 +4068,3 @@ class UnsupportedURLIE(InfoExtractor):
 
     def _real_extract(self, url):
         raise UnsupportedError(url)
-
-
-_PLUGIN_OVERRIDES = collections.defaultdict(list)
diff --git a/yt_dlp/extractor/crowdbunker.py b/yt_dlp/extractor/crowdbunker.py
index bf814570f..ca0323431 100644
--- a/yt_dlp/extractor/crowdbunker.py
+++ b/yt_dlp/extractor/crowdbunker.py
@@ -5,7 +5,9 @@
     int_or_none,
     try_get,
     unified_strdate,
+    url_or_none,
 )
+from ..utils.traversal import traverse_obj
 
 
 class CrowdBunkerIE(InfoExtractor):
@@ -44,16 +46,15 @@ def _real_extract(self, url):
                 'url': sub_url,
             })
 
-        mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
-        if mpd_url:
-            fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id)
+        if mpd_url := traverse_obj(video_json, ('dashManifest', 'url', {url_or_none})):
+            fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, mpd_id='dash', fatal=False)
             formats.extend(fmts)
-            subtitles = self._merge_subtitles(subtitles, subs)
-        m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
-        if m3u8_url:
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, video_id)
+            self._merge_subtitles(subs, target=subtitles)
+
+        if m3u8_url := traverse_obj(video_json, ('hlsManifest', 'url', {url_or_none})):
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, m3u8_id='hls', fatal=False)
             formats.extend(fmts)
-            subtitles = self._merge_subtitles(subtitles, subs)
+            self._merge_subtitles(subs, target=subtitles)
 
         thumbnails = [{
             'url': image['url'],
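The `target=` change above merges each manifest's subtitles into one running dict instead of rebinding the local name. Roughly the effect of `_merge_subtitles(subs, target=subtitles)`, approximated with plain dicts:

subtitles = {'fr': [{'url': 'https://example.com/a.vtt'}]}
subs = {'fr': [{'url': 'https://example.com/b.vtt'}], 'en': [{'url': 'https://example.com/c.vtt'}]}
for lang, tracks in subs.items():  # approximation of the helper's merge
    subtitles.setdefault(lang, []).extend(t for t in tracks if t not in subtitles[lang])
print(sorted(subtitles))  # -> ['en', 'fr'], with both French tracks kept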
diff --git a/yt_dlp/extractor/cultureunplugged.py b/yt_dlp/extractor/cultureunplugged.py
index c7ccd2747..fa2c2c08d 100644
--- a/yt_dlp/extractor/cultureunplugged.py
+++ b/yt_dlp/extractor/cultureunplugged.py
@@ -3,7 +3,7 @@
 
 
 class CultureUnpluggedIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?'
+    _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/(?:documentary/watch-online/)?play/(?P<id>\d+)(?:/(?P<display_id>[^/#?]+))?'
     _TESTS = [{
         'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West',
         'md5': 'ac6c093b089f7d05e79934dcb3d228fc',
@@ -12,12 +12,25 @@ class CultureUnpluggedIE(InfoExtractor):
             'display_id': 'The-Next--Best-West',
             'ext': 'mp4',
             'title': 'The Next, Best West',
-            'description': 'md5:0423cd00833dea1519cf014e9d0903b1',
+            'description': 'md5:770033a3b7c2946a3bcfb7f1c6fb7045',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'creator': 'Coldstream Creative',
+            'creators': ['Coldstream Creative'],
             'duration': 2203,
             'view_count': int,
         },
+    }, {
+        'url': 'https://www.cultureunplugged.com/play/2833/Koi-Sunta-Hai--Journeys-with-Kumar---Kabir--Someone-is-Listening-',
+        'md5': 'dc2014bc470dfccba389a1c934fa29fa',
+        'info_dict': {
+            'id': '2833',
+            'display_id': 'Koi-Sunta-Hai--Journeys-with-Kumar---Kabir--Someone-is-Listening-',
+            'ext': 'mp4',
+            'title': 'Koi Sunta Hai: Journeys with Kumar & Kabir (Someone is Listening)',
+            'description': 'md5:fa94ac934927c98660362b8285b2cda5',
+            'view_count': int,
+            'thumbnail': 'https://s3.amazonaws.com/cdn.cultureunplugged.com/thumbnails_16_9/lg/2833.jpg',
+            'creators': ['Srishti'],
+        },
     }, {
         'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662',
         'only_matching': True,
diff --git a/yt_dlp/extractor/dacast.py b/yt_dlp/extractor/dacast.py
index 537352e5f..ffc7ca824 100644
--- a/yt_dlp/extractor/dacast.py
+++ b/yt_dlp/extractor/dacast.py
@@ -9,6 +9,7 @@
     ExtractorError,
     classproperty,
     float_or_none,
+    parse_qs,
     traverse_obj,
     url_or_none,
 )
@@ -91,11 +92,15 @@ def _usp_signing_secret(self):
         # Rotates every so often, but hardcode a fallback in case of JS change/breakage before rotation
         return self._search_regex(
             r'\bUSP_SIGNING_SECRET\s*=\s*(["\'])(?P<secret>(?:(?!\1).)+)', player_js,
-            'usp signing secret', group='secret', fatal=False) or 'odnInCGqhvtyRTtIiddxtuRtawYYICZP'
+            'usp signing secret', group='secret', fatal=False) or 'hGDtqMKYVeFdofrAfFmBcrsakaZELajI'
 
     def _real_extract(self, url):
         user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
-        query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'}
+        query = {
+            'contentId': f'{user_id}-vod-{video_id}',
+            'provider': 'universe',
+            **traverse_obj(url, ({parse_qs}, 'uss_token', {'signedKey': -1})),
+        }
         info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False)
         access = self._download_json(
             'https://playback.dacast.com/content/access', video_id,
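The `uss_token` forwarding above shows two lesser-known `traverse_obj` path elements: a `{callable}` set applies a function to the current value, and a dict builds a sub-dict whose `None`-valued keys are dropped, so unsigned URLs contribute nothing. With a hypothetical URL:

from yt_dlp.utils import parse_qs
from yt_dlp.utils.traversal import traverse_obj

url = 'https://iframe.dacast.com/vod/abc/123?uss_token=first&uss_token=last'
print(traverse_obj(url, ({parse_qs}, 'uss_token', {'signedKey': -1})))
# -> {'signedKey': 'last'}; without the parameter the result is just {}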
diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py
index 423c11c57..a81f0a26d 100644
--- a/yt_dlp/extractor/dailymotion.py
+++ b/yt_dlp/extractor/dailymotion.py
@@ -100,7 +100,7 @@ def _call_api(self, object_type, xid, object_fields, note, filter_extra=None):
 
 class DailymotionIE(DailymotionBaseInfoExtractor):
     _VALID_URL = r'''(?ix)
-                    https?://
+                    (?:https?:)?//
                     (?:
                         dai\.ly/|
                         (?:
@@ -116,7 +116,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                     (?P<id>[^/?_&#]+)(?:[\w-]*\?playlist=(?P<playlist_id>x[0-9a-z]+))?
     '''
     IE_NAME = 'dailymotion'
-    _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
+    _EMBED_REGEX = [rf'(?ix)<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)["\'](?P<url>{_VALID_URL[5:]})']
     _TESTS = [{
         'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
         'md5': '074b95bdee76b9e3654137aee9c79dfe',
@@ -308,6 +308,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             'description': 'Que lindura',
             'tags': [],
         },
+    }, {
+        # //geo.dailymotion.com/player/xysxq.html?video=k2Y4Mjp7krAF9iCuINM
+        'url': 'https://lcp.fr/programmes/avant-la-catastrophe-la-naissance-de-la-dictature-nazie-1933-1936-346819',
+        'info_dict': {
+            'id': 'k2Y4Mjp7krAF9iCuINM',
+            'ext': 'mp4',
+            'title': 'Avant la catastrophe la naissance de la dictature nazie 1933 -1936',
+            'description': 'md5:7b620d5e26edbe45f27bbddc1c0257c1',
+            'uploader': 'LCP Assemblée nationale',
+            'uploader_id': 'xbz33d',
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 0,
+            'duration': 3220,
+            'thumbnail': 'https://s1.dmcdn.net/v/Xvumk1djJBUZfjj2a/x1080',
+            'tags': [],
+            'timestamp': 1739919947,
+            'upload_date': '20250218',
+        },
     }]
     _GEO_BYPASS = False
     _COMMON_MEDIA_FIELDS = '''description
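The new `_EMBED_REGEX` above reuses `_VALID_URL` by slicing off its first five characters, which are exactly the inline-flag group `(?ix)`; global flags must appear at the start of a pattern, so the embedded copy is stripped and the wrapper declares its own. A minimal recreation with a simplified stand-in pattern:

import re

_VALID_URL = r'''(?ix)
    (?:https?:)?//dai\.ly/(?P<id>[\w-]+)'''  # simplified stand-in
_EMBED_REGEX = rf'(?ix)<iframe[^>]+?src=["\'](?P<url>{_VALID_URL[5:]})'
re.compile(_EMBED_REGEX)  # compiles cleanly; flags appear only once, up front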
diff --git a/yt_dlp/extractor/deezer.py b/yt_dlp/extractor/deezer.py
deleted file mode 100644
index 2ca8be5ca..000000000
--- a/yt_dlp/extractor/deezer.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import json
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    orderedSet,
-)
-
-
-class DeezerBaseInfoExtractor(InfoExtractor):
-    def get_data(self, url):
-        if not self.get_param('test'):
-            self.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!')
-
-        mobj = self._match_valid_url(url)
-        data_id = mobj.group('id')
-
-        webpage = self._download_webpage(url, data_id)
-        geoblocking_msg = self._html_search_regex(
-            r'<p class="soon-txt">(.*?)</p>', webpage, 'geoblocking message',
-            default=None)
-        if geoblocking_msg is not None:
-            raise ExtractorError(
-                f'Deezer said: {geoblocking_msg}', expected=True)
-
-        data_json = self._search_regex(
-            (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>',
-             r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'),
-            webpage, 'data JSON')
-        data = json.loads(data_json)
-        return data_id, webpage, data
-
-
-class DeezerPlaylistIE(DeezerBaseInfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?playlist/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.deezer.com/playlist/176747451',
-        'info_dict': {
-            'id': '176747451',
-            'title': 'Best!',
-            'uploader': 'anonymous',
-            'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$',
-        },
-        'playlist_count': 29,
-    }
-
-    def _real_extract(self, url):
-        playlist_id, webpage, data = self.get_data(url)
-
-        playlist_title = data.get('DATA', {}).get('TITLE')
-        playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME')
-        playlist_thumbnail = self._search_regex(
-            r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage,
-            'playlist thumbnail')
-
-        entries = []
-        for s in data.get('SONGS', {}).get('data'):
-            formats = [{
-                'format_id': 'preview',
-                'url': s.get('MEDIA', [{}])[0].get('HREF'),
-                'preference': -100,  # Only the first 30 seconds
-                'ext': 'mp3',
-            }]
-            artists = ', '.join(
-                orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS')))
-            entries.append({
-                'id': s.get('SNG_ID'),
-                'duration': int_or_none(s.get('DURATION')),
-                'title': '{} - {}'.format(artists, s.get('SNG_TITLE')),
-                'uploader': s.get('ART_NAME'),
-                'uploader_id': s.get('ART_ID'),
-                'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0,
-                'formats': formats,
-            })
-
-        return {
-            '_type': 'playlist',
-            'id': playlist_id,
-            'title': playlist_title,
-            'uploader': playlist_uploader,
-            'thumbnail': playlist_thumbnail,
-            'entries': entries,
-        }
-
-
-class DeezerAlbumIE(DeezerBaseInfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?deezer\.com/(../)?album/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'https://www.deezer.com/fr/album/67505622',
-        'info_dict': {
-            'id': '67505622',
-            'title': 'Last Week',
-            'uploader': 'Home Brew',
-            'thumbnail': r're:^https?://(e-)?cdns-images\.dzcdn\.net/images/cover/.*\.jpg$',
-        },
-        'playlist_count': 7,
-    }
-
-    def _real_extract(self, url):
-        album_id, webpage, data = self.get_data(url)
-
-        album_title = data.get('DATA', {}).get('ALB_TITLE')
-        album_uploader = data.get('DATA', {}).get('ART_NAME')
-        album_thumbnail = self._search_regex(
-            r'<img id="naboo_album_image".*?src="([^"]+)"', webpage,
-            'album thumbnail')
-
-        entries = []
-        for s in data.get('SONGS', {}).get('data'):
-            formats = [{
-                'format_id': 'preview',
-                'url': s.get('MEDIA', [{}])[0].get('HREF'),
-                'preference': -100,  # Only the first 30 seconds
-                'ext': 'mp3',
-            }]
-            artists = ', '.join(
-                orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS')))
-            entries.append({
-                'id': s.get('SNG_ID'),
-                'duration': int_or_none(s.get('DURATION')),
-                'title': '{} - {}'.format(artists, s.get('SNG_TITLE')),
-                'uploader': s.get('ART_NAME'),
-                'uploader_id': s.get('ART_ID'),
-                'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0,
-                'formats': formats,
-                'track': s.get('SNG_TITLE'),
-                'track_number': int_or_none(s.get('TRACK_NUMBER')),
-                'track_id': s.get('SNG_ID'),
-                'artist': album_uploader,
-                'album': album_title,
-                'album_artist': album_uploader,
-            })
-
-        return {
-            '_type': 'playlist',
-            'id': album_id,
-            'title': album_title,
-            'uploader': album_uploader,
-            'thumbnail': album_thumbnail,
-            'entries': entries,
-        }
diff --git a/yt_dlp/extractor/digiview.py b/yt_dlp/extractor/digiview.py
new file mode 100644
index 000000000..f7f23864d
--- /dev/null
+++ b/yt_dlp/extractor/digiview.py
@@ -0,0 +1,130 @@
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import clean_html, int_or_none, traverse_obj, url_or_none, urlencode_postdata
+
+
+class DigiviewIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ladigitale\.dev/digiview/#/v/(?P<id>[0-9a-f]+)'
+    _TESTS = [{
+        # normal video
+        'url': 'https://ladigitale.dev/digiview/#/v/67a8e50aee2ec',
+        'info_dict': {
+            'id': '67a8e50aee2ec',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny 60fps 4K - Official Blender Foundation Short Film',
+            'thumbnail': 'https://i.ytimg.com/vi/aqz-KE-bpKQ/hqdefault.jpg',
+            'upload_date': '20141110',
+            'playable_in_embed': True,
+            'duration': 635,
+            'view_count': int,
+            'comment_count': int,
+            'channel': 'Blender',
+            'license': 'Creative Commons Attribution license (reuse allowed)',
+            'like_count': int,
+            'tags': 'count:8',
+            'live_status': 'not_live',
+            'channel_id': 'UCSMOQeBJ2RAnuFungnQOxLg',
+            'channel_follower_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCSMOQeBJ2RAnuFungnQOxLg',
+            'uploader_id': '@BlenderOfficial',
+            'description': 'md5:8f3ed18a53a1bb36cbb3b70a15782fd0',
+            'categories': ['Film & Animation'],
+            'channel_is_verified': True,
+            'heatmap': 'count:100',
+            'section_end': 635,
+            'uploader': 'Blender',
+            'timestamp': 1415628355,
+            'uploader_url': 'https://www.youtube.com/@BlenderOfficial',
+            'age_limit': 0,
+            'section_start': 0,
+            'availability': 'public',
+        },
+    }, {
+        # cut video
+        'url': 'https://ladigitale.dev/digiview/#/v/67a8e51d0dd58',
+        'info_dict': {
+            'id': '67a8e51d0dd58',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny 60fps 4K - Official Blender Foundation Short Film',
+            'thumbnail': 'https://i.ytimg.com/vi/aqz-KE-bpKQ/hqdefault.jpg',
+            'upload_date': '20141110',
+            'playable_in_embed': True,
+            'duration': 5,
+            'view_count': int,
+            'comment_count': int,
+            'channel': 'Blender',
+            'license': 'Creative Commons Attribution license (reuse allowed)',
+            'like_count': int,
+            'tags': 'count:8',
+            'live_status': 'not_live',
+            'channel_id': 'UCSMOQeBJ2RAnuFungnQOxLg',
+            'channel_follower_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCSMOQeBJ2RAnuFungnQOxLg',
+            'uploader_id': '@BlenderOfficial',
+            'description': 'md5:8f3ed18a53a1bb36cbb3b70a15782fd0',
+            'categories': ['Film & Animation'],
+            'channel_is_verified': True,
+            'heatmap': 'count:100',
+            'section_end': 10,
+            'uploader': 'Blender',
+            'timestamp': 1415628355,
+            'uploader_url': 'https://www.youtube.com/@BlenderOfficial',
+            'age_limit': 0,
+            'section_start': 5,
+            'availability': 'public',
+        },
+    }, {
+        # changed title
+        'url': 'https://ladigitale.dev/digiview/#/v/67a8ea5644d7a',
+        'info_dict': {
+            'id': '67a8ea5644d7a',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny (with title changed)',
+            'thumbnail': 'https://i.ytimg.com/vi/aqz-KE-bpKQ/hqdefault.jpg',
+            'upload_date': '20141110',
+            'playable_in_embed': True,
+            'duration': 5,
+            'view_count': int,
+            'comment_count': int,
+            'channel': 'Blender',
+            'license': 'Creative Commons Attribution license (reuse allowed)',
+            'like_count': int,
+            'tags': 'count:8',
+            'live_status': 'not_live',
+            'channel_id': 'UCSMOQeBJ2RAnuFungnQOxLg',
+            'channel_follower_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCSMOQeBJ2RAnuFungnQOxLg',
+            'uploader_id': '@BlenderOfficial',
+            'description': 'md5:8f3ed18a53a1bb36cbb3b70a15782fd0',
+            'categories': ['Film & Animation'],
+            'channel_is_verified': True,
+            'heatmap': 'count:100',
+            'section_end': 15,
+            'uploader': 'Blender',
+            'timestamp': 1415628355,
+            'uploader_url': 'https://www.youtube.com/@BlenderOfficial',
+            'age_limit': 0,
+            'section_start': 10,
+            'availability': 'public',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://ladigitale.dev/digiview/inc/recuperer_video.php', video_id,
+            data=urlencode_postdata({'id': video_id}))
+
+        clip_id = video_data['videoId']
+        return self.url_result(
+            f'https://www.youtube.com/watch?v={clip_id}',
+            YoutubeIE, video_id, url_transparent=True,
+            **traverse_obj(video_data, {
+                'section_start': ('debut', {int_or_none}),
+                'section_end': ('fin', {int_or_none}),
+                'description': ('description', {clean_html}, filter),
+                'title': ('titre', {str}),
+                'thumbnail': ('vignette', {url_or_none}),
+                'view_count': ('vues', {int_or_none}),
+            }),
+        )
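Two details of the hand-off above: `url_transparent` lets the metadata passed here override whatever `YoutubeIE` extracts, and the trailing `filter` in the description path discards falsy values so an empty string becomes a missing field rather than `''`:

from yt_dlp.utils.traversal import traverse_obj

print(traverse_obj({'description': ''}, ('description', {str}, filter)))    # -> None
print(traverse_obj({'description': 'ok'}, ('description', {str}, filter)))  # -> 'ok'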
diff --git a/yt_dlp/extractor/dreisat.py b/yt_dlp/extractor/dreisat.py
index 4b0a269b9..edd66e46c 100644
--- a/yt_dlp/extractor/dreisat.py
+++ b/yt_dlp/extractor/dreisat.py
@@ -1,34 +1,75 @@
-from .zdf import ZDFIE
+from .zdf import ZDFBaseIE
+from ..utils import (
+    int_or_none,
+    merge_dicts,
+    parse_iso8601,
+)
+from ..utils.traversal import require, traverse_obj
 
 
-class DreiSatIE(ZDFIE):  # XXX: Do not subclass from concrete IE
+class DreiSatIE(ZDFBaseIE):
     IE_NAME = '3sat'
-    _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/?#]+/)*(?P<id>[^/?#&]+)\.html'
     _TESTS = [{
-        # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html
-        'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html',
-        'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
+        'url': 'https://www.3sat.de/dokumentation/reise/traumziele-suedostasiens-die-philippinen-und-vietnam-102.html',
         'info_dict': {
-            'id': '141007_ab18_10wochensommer_film',
+            'id': '231124_traumziele_philippinen_und_vietnam_dokreise',
             'ext': 'mp4',
-            'title': 'Ab 18! - 10 Wochen Sommer',
-            'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
-            'duration': 2660,
-            'timestamp': 1608604200,
-            'upload_date': '20201222',
+            'title': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam',
+            'description': 'md5:26329ce5197775b596773b939354079d',
+            'duration': 2625.0,
+            'thumbnail': 'https://www.3sat.de/assets/traumziele-suedostasiens-die-philippinen-und-vietnam-100~original?cb=1699870351148',
+            'episode': 'Traumziele Südostasiens (1/2): Die Philippinen und Vietnam',
+            'episode_id': 'POS_cc7ff51c-98cf-4d12-b99d-f7a551de1c95',
+            'timestamp': 1747920900,
+            'upload_date': '20250522',
         },
     }, {
-        'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html',
+        'url': 'https://www.3sat.de/film/ab-18/ab-18---mein-fremdes-ich-100.html',
+        'md5': 'f92638413a11d759bdae95c9d8ec165c',
         'info_dict': {
-            'id': '140913_sendung_schweizweit',
+            'id': '221128_mein_fremdes_ich2_ab18',
             'ext': 'mp4',
-            'title': 'Waidmannsheil',
-            'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
-            'timestamp': 1410623100,
-            'upload_date': '20140913',
+            'title': 'Ab 18! - Mein fremdes Ich',
+            'description': 'md5:cae0c0b27b7426d62ca0dda181738bf0',
+            'duration': 2625.0,
+            'thumbnail': 'https://www.3sat.de/assets/ab-18---mein-fremdes-ich-106~original?cb=1666081865812',
+            'episode': 'Ab 18! - Mein fremdes Ich',
+            'episode_id': 'POS_6225d1ca-a0d5-45e3-870b-e783ee6c8a3f',
+            'timestamp': 1695081600,
+            'upload_date': '20230919',
         },
-        'params': {
-            'skip_download': True,
-        },
+    }, {
+        'url': 'https://www.3sat.de/gesellschaft/37-grad-leben/aus-dem-leben-gerissen-102.html',
+        'md5': 'a903eaf8d1fd635bd3317cd2ad87ec84',
+        'info_dict': {
+            'id': '250323_0903_sendung_sgl',
+            'ext': 'mp4',
+            'title': 'Plötzlich ohne dich',
+            'description': 'md5:380cc10659289dd91510ad8fa717c66b',
+            'duration': 1620.0,
+            'thumbnail': 'https://www.3sat.de/assets/37-grad-leben-106~original?cb=1645537156810',
+            'episode': 'Plötzlich ohne dich',
+            'episode_id': 'POS_faa7a93c-c0f2-4d51-823f-ce2ac3ee191b',
+            'timestamp': 1743162540,
+            'upload_date': '20250328',
+        },
+    }, {
+        # Video with chapters
+        'url': 'https://www.3sat.de/kultur/buchmesse/dein-buch-das-beste-von-der-leipziger-buchmesse-2025-teil-1-100.html',
+        'md5': '6b95790ce52e75f0d050adcdd2711ee6',
+        'info_dict': {
+            'id': '250330_dein_buch1_bum',
+            'ext': 'mp4',
+            'title': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1',
+            'description': 'md5:bae51bfc22f15563ce3acbf97d2e8844',
+            'duration': 5399.0,
+            'thumbnail': 'https://www.3sat.de/assets/buchmesse-kerkeling-100~original?cb=1743329640903',
+            'chapters': 'count:24',
+            'episode': 'dein buch - Das Beste von der Leipziger Buchmesse 2025 - Teil 1',
+            'episode_id': 'POS_1ef236cc-b390-401e-acd0-4fb4b04315fb',
+            'timestamp': 1743327000,
+            'upload_date': '20250330',
+        },
     }, {
         # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html
@@ -39,3 +80,45 @@ class DreiSatIE(ZDFIE):  # XXX: Do not subclass from concrete IE
         'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html',
         'only_matching': True,
     }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        player = self._search_json(
+            r'data-zdfplayer-jsb=(["\'])', webpage, 'player JSON', video_id)
+        player_url = player['content']
+        api_token = f'Bearer {player["apiToken"]}'
+
+        content = self._call_api(player_url, video_id, 'video metadata', api_token)
+
+        video_target = content['mainVideoContent']['http://zdf.de/rels/target']
+        ptmd_path = traverse_obj(video_target, (
+            (('streams', 'default'), None),
+            ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'),
+            {str}, any, {require('ptmd path')}))
+        ptmd_url = self._expand_ptmd_template(player_url, ptmd_path)
+        aspect_ratio = self._parse_aspect_ratio(video_target.get('aspectRatio'))
+        info = self._extract_ptmd(ptmd_url, video_id, api_token, aspect_ratio)
+
+        return merge_dicts(info, {
+            **traverse_obj(content, {
+                'title': (('title', 'teaserHeadline'), {str}, any),
+                'episode': (('title', 'teaserHeadline'), {str}, any),
+                'description': (('leadParagraph', 'teasertext'), {str}, any),
+                'timestamp': ('editorialDate', {parse_iso8601}),
+            }),
+            **traverse_obj(video_target, {
+                'duration': ('duration', {int_or_none}),
+                'chapters': ('streamAnchorTag', {self._extract_chapters}),
+            }),
+            'thumbnails': self._extract_thumbnails(traverse_obj(content, ('teaserImageRef', 'layouts', {dict}))),
+            **traverse_obj(content, ('programmeItem', 0, 'http://zdf.de/rels/target', {
+                'series_id': ('http://zdf.de/rels/cmdm/series', 'seriesUuid', {str}),
+                'series': ('http://zdf.de/rels/cmdm/series', 'seriesTitle', {str}),
+                'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}),
+                'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}),
+                'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}),
+                'episode_number': ('episodeNumber', {int_or_none}),
+                'episode_id': ('contentId', {str}),
+            })),
+        })
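The metadata block above leans on branching paths: a tuple of keys tries each alternative, `{str}` discards non-string hits, and `any` collapses the branch into its first usable value. With toy data:

from yt_dlp.utils.traversal import traverse_obj

content = {'teaserHeadline': 'Plötzlich ohne dich'}
print(traverse_obj(content, (('title', 'teaserHeadline'), {str}, any)))
# -> 'Plötzlich ohne dich' (falls back since 'title' is absent)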
'seriesTitle', {str}), + 'season': ('http://zdf.de/rels/cmdm/season', 'seasonTitle', {str}), + 'season_number': ('http://zdf.de/rels/cmdm/season', 'seasonNumber', {int_or_none}), + 'season_id': ('http://zdf.de/rels/cmdm/season', 'seasonUuid', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode_id': ('contentId', {str}), + })), + }) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index baa69d242..18a3737d7 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1,28 +1,37 @@ -import contextlib +import inspect import os -from ..plugins import load_plugins +from ..globals import LAZY_EXTRACTORS +from ..globals import extractors as _extractors_context -# NB: Must be before other imports so that plugins can be correctly injected -_PLUGIN_CLASSES = load_plugins('extractor', 'IE') +_CLASS_LOOKUP = None +if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + LAZY_EXTRACTORS.value = False +else: + try: + from .lazy_extractors import _CLASS_LOOKUP + LAZY_EXTRACTORS.value = True + except ImportError: + LAZY_EXTRACTORS.value = None -_LAZY_LOADER = False -if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - with contextlib.suppress(ImportError): - from .lazy_extractors import * # noqa: F403 - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True +if not _CLASS_LOOKUP: + from . import _extractors -if not _LAZY_LOADER: - from ._extractors import * # noqa: F403 - _ALL_CLASSES = [ # noqa: F811 - klass - for name, klass in globals().items() + _CLASS_LOOKUP = { + name: value + for name, value in inspect.getmembers(_extractors) if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) # noqa: F405 + } + _CLASS_LOOKUP['GenericIE'] = _extractors.GenericIE -globals().update(_PLUGIN_CLASSES) -_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() +# We want to append to the main lookup +_current = _extractors_context.value +for name, ie in _CLASS_LOOKUP.items(): + _current.setdefault(name, ie) -from .common import _PLUGIN_OVERRIDES # noqa: F401 + +def __getattr__(name): + value = _CLASS_LOOKUP.get(name) + if not value: + raise AttributeError(f'module {__name__} has no attribute {name}') + return value diff --git a/yt_dlp/extractor/francaisfacile.py b/yt_dlp/extractor/francaisfacile.py new file mode 100644 index 000000000..d3208c282 --- /dev/null +++ b/yt_dlp/extractor/francaisfacile.py @@ -0,0 +1,87 @@ +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + float_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class FrancaisFacileIE(InfoExtractor): + _VALID_URL = r'https?://francaisfacile\.rfi\.fr/[a-z]{2}/(?:actualit%C3%A9|podcasts/[^/#?]+)/(?P<id>[^/#?]+)' + _TESTS = [{ + 'url': 'https://francaisfacile.rfi.fr/fr/actualit%C3%A9/20250305-r%C3%A9concilier-les-jeunes-avec-la-lecture-gr%C3%A2ce-aux-r%C3%A9seaux-sociaux', + 'md5': '4f33674cb205744345cc835991100afa', + 'info_dict': { + 'id': 'WBMZ58952-FLE-FR-20250305', + 'display_id': '20250305-réconcilier-les-jeunes-avec-la-lecture-grâce-aux-réseaux-sociaux', + 'title': 'Réconcilier les jeunes avec la lecture grâce aux réseaux sociaux', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/05/6b6af52a-f9ba-11ef-a1f8-005056a97652.mp3', + 'ext': 'mp3', + 'description': 'md5:b903c63d8585bd59e8cc4d5f80c4272d', + 'duration': 103.15, + 'timestamp': 1741177984, + 'upload_date': '20250305', + }, + }, { + 'url': 
'https://francaisfacile.rfi.fr/fr/actualit%C3%A9/20250307-argentine-le-sac-d-un-alpiniste-retrouv%C3%A9-40-ans-apr%C3%A8s-sa-mort', + 'md5': 'b8c3a63652d4ae8e8092dda5700c1cd9', + 'info_dict': { + 'id': 'WBMZ59102-FLE-FR-20250307', + 'display_id': '20250307-argentine-le-sac-d-un-alpiniste-retrouvé-40-ans-après-sa-mort', + 'title': 'Argentine: le sac d\'un alpiniste retrouvé 40 ans après sa mort', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/07/8edf4082-fb46-11ef-8a37-005056bf762b.mp3', + 'ext': 'mp3', + 'description': 'md5:7fd088fbdf4a943bb68cf82462160dca', + 'duration': 117.74, + 'timestamp': 1741352789, + 'upload_date': '20250307', + }, + }, { + 'url': 'https://francaisfacile.rfi.fr/fr/podcasts/un-mot-une-histoire/20250317-le-mot-de-david-foenkinos-peut-%C3%AAtre', + 'md5': 'db83c2cc2589b4c24571c6b6cf14f5f1', + 'info_dict': { + 'id': 'WBMZ59441-FLE-FR-20250317', + 'display_id': '20250317-le-mot-de-david-foenkinos-peut-être', + 'title': 'Le mot de David Foenkinos: «peut-être» - Un mot, une histoire', + 'url': 'https://aod-fle.akamaized.net/fle/sounds/fr/2025/03/17/4ca6cbbe-0315-11f0-a85b-005056a97652.mp3', + 'ext': 'mp3', + 'description': 'md5:3fe35fae035803df696bfa7af2496e49', + 'duration': 198.96, + 'timestamp': 1742210897, + 'upload_date': '20250317', + }, + }] + + def _real_extract(self, url): + display_id = urllib.parse.unquote(self._match_id(url)) + + try: # yt-dlp's default user-agents are too old and blocked by the site + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + # Retry with impersonation if hardcoded UA is insufficient + webpage = self._download_webpage(url, display_id, impersonate=True) + + data = self._search_json( + r'<script[^>]+\bdata-media-id=[^>]+\btype="application/json"[^>]*>', + webpage, 'audio data', display_id) + + return { + 'id': data['mediaId'], + 'display_id': display_id, + 'vcodec': 'none', + 'title': self._html_extract_title(webpage), + **self._search_json_ld(webpage, display_id, fatal=False), + **traverse_obj(data, { + 'title': ('title', {str}), + 'url': ('sources', ..., 'url', {url_or_none}, any), + 'duration': ('sources', ..., 'duration', {float_or_none}, any), + }), + } diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 01b18d8da..5c9f8e36d 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -9,6 +9,7 @@ ExtractorError, clean_html, determine_ext, + extract_attributes, filter_dict, format_field, int_or_none, @@ -18,7 +19,7 @@ unsmuggle_url, url_or_none, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import find_element, traverse_obj class FranceTVBaseInfoExtractor(InfoExtractor): @@ -358,7 +359,8 @@ def _real_extract(self, url): # For livestreams we need the id of the stream instead of the currently airing episode id video_id = traverse_obj(nextjs_data, ( ..., ..., 'children', ..., 'children', ..., 'children', ..., 'children', ..., ..., - 'children', ..., ..., 'children', ..., ..., 'children', ..., 'options', 'id', {str}, any)) + 'children', ..., ..., 'children', ..., ..., 'children', (..., (..., ...)), + 'options', 'id', {str}, any)) else: video_id = traverse_obj(nextjs_data, ( ..., ..., ..., 'children', @@ -459,11 +461,16 @@ def _real_extract(self, url): self.url_result(dailymotion_url, DailymotionIE.ie_key()) for dailymotion_url in 
dailymotion_urls]) - video_id = self._search_regex( - (r'player\.load[^;]+src:\s*["\']([^"\']+)', - r'id-video=([^@]+@[^"]+)', - r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), - webpage, 'video id') + video_id = ( + traverse_obj(webpage, ( + {find_element(tag='button', attr='data-cy', value='francetv-player-wrapper', html=True)}, + {extract_attributes}, 'id')) + or self._search_regex( + (r'player\.load[^;]+src:\s*["\']([^"\']+)', + r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + webpage, 'video id') + ) return self._make_url_result(video_id, url=url) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 320a47772..721d04e31 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -37,6 +37,7 @@ unescapeHTML, unified_timestamp, unsmuggle_url, + update_url, update_url_query, url_or_none, urlhandle_detect_ext, @@ -293,6 +294,19 @@ class GenericIE(InfoExtractor): 'timestamp': 1378272859.0, }, }, + # Live DASH MPD + { + 'url': 'https://livesim2.dashif.org/livesim2/ato_10/testpic_2s/Manifest.mpd', + 'info_dict': { + 'id': 'Manifest', + 'ext': 'mp4', + 'title': r're:Manifest \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': 'livestream', + }, + }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', @@ -2200,10 +2214,21 @@ def hex_or_none(value): if is_live is not None: info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' return - headers = m3u8_format.get('http_headers') or info.get('http_headers') - duration = self._extract_m3u8_vod_duration( - m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', - errnote='Failed to download m3u8 media playlist', headers=headers) + headers = m3u8_format.get('http_headers') or info.get('http_headers') or {} + display_id = info.get('id') + urlh = self._request_webpage( + m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False, + headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False) + if urlh is False: + return + first_bytes = urlh.read(512) + if not first_bytes.startswith(b'#EXTM3U'): + return + m3u8_doc = self._webpage_read_content( + urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False) + if not m3u8_doc: + return + duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id) if not duration: info['live_status'] = 'is_live' info['duration'] = info.get('duration') or duration @@ -2436,10 +2461,9 @@ def _real_extract(self, url): subtitles = {} if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) - elif format_id.endswith(('mpd', 'dash+xml')) or ext == 'mpd': - formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) + # Don't check for DASH/mpd here, do it later w/ first_bytes. 
Same number of requests either way else: formats = [{ 'format_id': format_id, @@ -2514,13 +2538,15 @@ def _real_extract(self, url): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.url), + xspf_base_url=new_url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=full_response.url.rpartition('/')[0], + # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs + mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0], mpd_url=url) + info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict diff --git a/yt_dlp/extractor/getcourseru.py b/yt_dlp/extractor/getcourseru.py index b7581d77e..2d923cf54 100644 --- a/yt_dlp/extractor/getcourseru.py +++ b/yt_dlp/extractor/getcourseru.py @@ -8,7 +8,7 @@ class GetCourseRuPlayerIE(InfoExtractor): - _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+' + _VALID_URL = r'https?://(?:player02\.getcourse\.ru|cf-api-2\.vhcdn\.com)/sign-player/?\?(?:[^#]+&)?json=[^#&]+' _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL}[^\'"]*)'] _TESTS = [{ 'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag', @@ -20,6 +20,16 @@ class GetCourseRuPlayerIE(InfoExtractor): 'duration': 1693, }, 'skip': 'JWT expired', + }, { + 'url': 'https://cf-api-2.vhcdn.com/sign-player/?json=example', + 'info_dict': { + 'id': '435735291', + 'title': '8afd7c489952108e00f019590f3711f3', + 'ext': 'mp4', + 'thumbnail': 'https://preview-htz.vhcdn.com/preview/8afd7c489952108e00f019590f3711f3/preview.jpg?version=1682170973&host=vh-72', + 'duration': 777, + }, + 'skip': 'JWT expired', }] def _real_extract(self, url): @@ -168,7 +178,7 @@ def _real_extract(self, url): playlist_id = self._search_regex( r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', webpage, 'playlist id', default=display_id) - title = self._og_search_title(webpage) or self._html_extract_title(webpage) + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) return self.playlist_from_matches( re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], webpage), diff --git a/yt_dlp/extractor/gigya.py b/yt_dlp/extractor/gigya.py deleted file mode 100644 index cc18ee67c..000000000 --- a/yt_dlp/extractor/gigya.py +++ /dev/null @@ -1,19 +0,0 @@ -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - urlencode_postdata, -) - - -class GigyaBaseIE(InfoExtractor): - def _gigya_login(self, auth_data): - auth_info = self._download_json( - 'https://accounts.eu1.gigya.com/accounts.login', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(auth_data)) - - error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') - if error_message: - raise ExtractorError( - f'Unable to login: {error_message}', expected=True) - return auth_info diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py 
index 7acbd2820..893202285 100644 --- a/yt_dlp/extractor/globo.py +++ b/yt_dlp/extractor/globo.py @@ -69,8 +69,13 @@ class GloboIE(InfoExtractor): 'info_dict': { 'id': '8013907', 'ext': 'mp4', - 'title': 'Capítulo de 14⧸08⧸1989', + 'title': 'Capítulo de 14/08/1989', + 'episode': 'Episode 1', 'episode_number': 1, + 'uploader': 'Tieta', + 'uploader_id': '11895', + 'duration': 2858.389, + 'subtitles': 'count:1', }, 'params': { 'skip_download': True, @@ -82,7 +87,12 @@ class GloboIE(InfoExtractor): 'id': '12824146', 'ext': 'mp4', 'title': 'Acordo de damas', + 'episode': 'Episode 1', 'episode_number': 1, + 'uploader': 'Rensga Hits!', + 'uploader_id': '20481', + 'duration': 1953.994, + 'season': 'Season 2', 'season_number': 2, }, 'params': { @@ -136,9 +146,10 @@ def _real_extract(self, url): else: formats, subtitles = self._extract_m3u8_formats_and_subtitles( main_source['url'], video_id, 'mp4', m3u8_id='hls') - self._merge_subtitles(traverse_obj(main_source, ('text', ..., { - 'url': ('subtitle', 'srt', 'url', {url_or_none}), - }, all, {subs_list_to_dict(lang='en')})), target=subtitles) + + self._merge_subtitles(traverse_obj(main_source, ('text', ..., ('caption', 'subtitle'), { + 'url': ('srt', 'url', {url_or_none}), + }, all, {subs_list_to_dict(lang='pt-BR')})), target=subtitles) return { 'id': video_id, diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py index d9004293f..c3c7bb32e 100644 --- a/yt_dlp/extractor/hse.py +++ b/yt_dlp/extractor/hse.py @@ -6,7 +6,7 @@ ) -class HSEShowBaseInfoExtractor(InfoExtractor): +class HSEShowBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] def _extract_redux_data(self, url, video_id): @@ -28,7 +28,7 @@ def _extract_formats_and_subtitles(self, sources, video_id): return formats, subtitles -class HSEShowIE(HSEShowBaseInfoExtractor): +class HSEShowIE(HSEShowBaseIE): _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.hse.de/dpl/c/tv-shows/505350', @@ -64,7 +64,7 @@ def _real_extract(self, url): } -class HSEProductIE(HSEShowBaseInfoExtractor): +class HSEProductIE(HSEShowBaseIE): _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.hse.de/dpl/p/product/408630', diff --git a/yt_dlp/extractor/ichinanalive.py b/yt_dlp/extractor/ichinanalive.py index a37cfe77b..475d33593 100644 --- a/yt_dlp/extractor/ichinanalive.py +++ b/yt_dlp/extractor/ichinanalive.py @@ -1,5 +1,13 @@ + from .common import InfoExtractor -from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + traverse_obj, + unified_strdate, + url_or_none, +) class IchinanaLiveIE(InfoExtractor): @@ -157,3 +165,51 @@ def _real_extract(self, url): 'description': view_data.get('caption'), 'upload_date': unified_strdate(str_or_none(view_data.get('createdAt'))), } + + +class IchinanaLiveVODIE(InfoExtractor): + IE_NAME = '17live:vod' + _VALID_URL = r'https?://(?:www\.)?17\.live/ja/vod/[^/?#]+/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://17.live/ja/vod/27323042/2cf84520-e65e-4b22-891e-1d3a00b0f068', + 'md5': '3299b930d7457b069639486998a89580', + 'info_dict': { + 'id': '2cf84520-e65e-4b22-891e-1d3a00b0f068', + 'ext': 'mp4', + 'title': 'md5:b5f8cbf497d54cc6a60eb3b480182f01', + 'uploader': 'md5:29fb12122ab94b5a8495586e7c3085a5', + 'uploader_id': '27323042', + 'channel': '🌟オールナイトニッポン アーカイブ🌟', + 'channel_id': '2b4f85f1-d61e-429d-a901-68d32bdd8645', + 'like_count': int, + 'view_count': int, + 'thumbnail': 
r're:https?://.+/.+\.(?:jpe?g|png)', + 'duration': 549, + 'description': 'md5:116f326579700f00eaaf5581aae1192e', + 'timestamp': 1741058645, + 'upload_date': '20250304', + }, + }, { + 'url': 'https://17.live/ja/vod/27323042/0de11bac-9bea-40b8-9eab-0239a7d88079', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://wap-api.17app.co/api/v1/vods/{video_id}', video_id) + + return traverse_obj(json_data, { + 'id': ('vodID', {str}), + 'title': ('title', {str}), + 'formats': ('vodURL', {lambda x: self._extract_m3u8_formats(x, video_id)}), + 'uploader': ('userInfo', 'displayName', {str}), + 'uploader_id': ('userInfo', 'roomID', {int}, {str_or_none}), + 'channel': ('userInfo', 'name', {str}), + 'channel_id': ('userInfo', 'userID', {str}), + 'like_count': ('likeCount', {int_or_none}), + 'view_count': ('viewCount', {int_or_none}), + 'thumbnail': ('imageURL', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'description': ('description', {str}), + 'timestamp': ('createdAt', {int_or_none}), + }) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 55086d0b2..98f70c267 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -2,12 +2,12 @@ import itertools import json import re -import time from .common import InfoExtractor from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, + bug_reports_message, decode_base_n, encode_base_n, filter_dict, @@ -15,12 +15,12 @@ format_field, get_element_by_attribute, int_or_none, + join_nonempty, lowercase_escape, str_or_none, str_to_int, traverse_obj, url_or_none, - urlencode_postdata, ) _ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' @@ -28,63 +28,30 @@ def _pk_to_id(media_id): """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" - return encode_base_n(int(media_id.split('_')[0]), table=_ENCODING_CHARS) + pk = int(str(media_id).split('_')[0]) + return encode_base_n(pk, table=_ENCODING_CHARS) def _id_to_pk(shortcode): - """Covert a shortcode to a numeric value""" - return decode_base_n(shortcode[:11], table=_ENCODING_CHARS) + """Convert a shortcode to a numeric value""" + if len(shortcode) > 28: + shortcode = shortcode[:-28] + return decode_base_n(shortcode, table=_ENCODING_CHARS) class InstagramBaseIE(InfoExtractor): - _NETRC_MACHINE = 'instagram' - _IS_LOGGED_IN = False - _API_BASE_URL = 'https://i.instagram.com/api/v1' _LOGIN_URL = 'https://www.instagram.com/accounts/login' - _API_HEADERS = { - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'Origin': 'https://www.instagram.com', - 'Accept': '*/*', - } - def _perform_login(self, username, password): - if self._IS_LOGGED_IN: - return - - login_webpage = self._download_webpage( - self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage') - - shared_data = self._parse_json(self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None) - - login = self._download_json( - f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={ - **self._API_HEADERS, - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': 
f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) - - if not login.get('authenticated'): - if login.get('message'): - raise ExtractorError(f'Unable to login: {login["message"]}') - elif login.get('user'): - raise ExtractorError('Unable to login: Sorry, your password was incorrect. Please double-check your password.', expected=True) - elif login.get('user') is False: - raise ExtractorError('Unable to login: The username you entered doesn\'t belong to an account. Please check your username and try again.', expected=True) - raise ExtractorError('Unable to login') - InstagramBaseIE._IS_LOGGED_IN = True + @property + def _api_headers(self): + return { + 'X-IG-App-ID': self._configuration_arg('app_id', ['936619743392459'], ie_key=InstagramIE)[0], + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + } def _get_count(self, media, kind, *keys): return traverse_obj( @@ -209,7 +176,7 @@ def _extract_product(self, product_info): def _get_comments(self, video_id): comments_info = self._download_json( f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id, - fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {} + fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._api_headers) or {} comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments') for comment_dict in comment_data or []: @@ -402,14 +369,14 @@ def _real_extract(self, url): info = traverse_obj(self._download_json( f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, fatal=False, errnote='Video info extraction failed', - note='Downloading video info', headers=self._API_HEADERS), ('items', 0)) + note='Downloading video info', headers=self._api_headers), ('items', 0)) if info: media.update(info) return self._extract_product(media) api_check = self._download_json( f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', - video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} + video_id, headers=self._api_headers, fatal=False, note='Setting up session', errnote=False) or {} csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken') if not csrf_token: @@ -429,7 +396,7 @@ def _real_extract(self, url): general_info = self._download_json( 'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False, headers={ - **self._API_HEADERS, + **self._api_headers, 'X-CSRFToken': csrf_token or '', 'X-Requested-With': 'XMLHttpRequest', 'Referer': url, @@ -437,7 +404,6 @@ def _real_extract(self, url): 'doc_id': '8845758582119845', 'variables': json.dumps(variables, separators=(',', ':')), }) - media.update(traverse_obj(general_info, ('data', 'xdt_shortcode_media')) or {}) if not general_info: self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) @@ -466,6 +432,26 @@ def _real_extract(self, url): media.update(traverse_obj( additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) + else: + xdt_shortcode_media = traverse_obj(general_info, ('data', 'xdt_shortcode_media', {dict})) or {} + if not xdt_shortcode_media: + error = 
join_nonempty('title', 'description', delim=': ', from_dict=api_check) + if 'Restricted Video' in error: + self.raise_login_required(error) + elif error: + raise ExtractorError(error, expected=True) + elif len(video_id) > 28: + # It's a private post (video_id == shortcode + 28 extra characters) + # Only raise after getting empty response; sometimes "long"-shortcode posts are public + self.raise_login_required( + 'This content is only available for registered users who follow this account') + raise ExtractorError( + 'Instagram sent an empty media response. Check if this post is accessible in your ' + f'browser without being logged-in. If it is not, then u{self._login_hint()[1:]}. ' + 'Otherwise, if the post is accessible in browser without being logged-in' + f'{bug_reports_message(before=",")}', expected=True) + media.update(xdt_shortcode_media) + username = traverse_obj(media, ('owner', 'username')) or self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) @@ -485,8 +471,7 @@ def _real_extract(self, url): return self.playlist_result( self._extract_nodes(nodes, True), video_id, format_field(username, None, 'Post by %s'), description) - - video_url = self._og_search_video_url(webpage, secure=False) + raise ExtractorError('There is no video in this post', expected=True) formats = [{ 'url': video_url, @@ -689,7 +674,7 @@ def _query_vars_for(data): class InstagramStoryIE(InstagramBaseIE): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/]+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/stories/(?P<user>[^/?#]+)(?:/(?P<id>\d+))?' IE_NAME = 'instagram:story' _TESTS = [{ @@ -699,25 +684,38 @@ class InstagramStoryIE(InstagramBaseIE): 'title': 'Rare', }, 'playlist_mincount': 50, + }, { + 'url': 'https://www.instagram.com/stories/fruits_zipper/3570766765028588805/', + 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/stories/fruits_zipper', + 'only_matching': True, }] def _real_extract(self, url): - username, story_id = self._match_valid_url(url).groups() - story_info = self._download_webpage(url, story_id) - user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) + username, story_id = self._match_valid_url(url).group('user', 'id') + if username == 'highlights' and not story_id: # story id is only mandatory for highlights + raise ExtractorError('Input URL is missing a highlight ID', expected=True) + display_id = story_id or username + story_info = self._download_webpage(url, display_id) + user_info = self._search_json(r'"user":', story_info, 'user info', display_id, fatal=False) if not user_info: self.raise_login_required('This content is unreachable') user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str) - story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' - if not story_info_url: # user id is only mandatory for non-highlights - raise ExtractorError('Unable to extract user id') + if username == 'highlights': + story_info_url = f'highlight:{story_id}' + else: + if not user_id: # user id is only mandatory for non-highlights + raise ExtractorError('Unable to extract user id') + story_info_url = user_id videos = traverse_obj(self._download_json( f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', - story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') + display_id, errnote=False, fatal=False, headers=self._api_headers), 'reels') if not videos: self.raise_login_required('You need to log in 
to access this content') + user_info = traverse_obj(videos, (user_id, 'user', {dict})) or {} full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name')) story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) @@ -727,6 +725,7 @@ def _real_extract(self, url): highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items')) info_data = [] for highlight in highlights: + highlight.setdefault('user', {}).update(user_info) highlight_data = self._extract_product(highlight) if highlight_data.get('formats'): info_data.append({ @@ -734,4 +733,7 @@ def _real_extract(self, url): 'uploader_id': user_id, **filter_dict(highlight_data), }) + if username != 'highlights' and story_id and not self._yes_playlist(username, story_id): + return traverse_obj(info_data, (lambda _, v: v['id'] == _pk_to_id(story_id), any)) + return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) diff --git a/yt_dlp/extractor/ivoox.py b/yt_dlp/extractor/ivoox.py new file mode 100644 index 000000000..36e02493a --- /dev/null +++ b/yt_dlp/extractor/ivoox.py @@ -0,0 +1,78 @@ +from .common import InfoExtractor +from ..utils import int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class IvooxIE(InfoExtractor): + _VALID_URL = ( + r'https?://(?:www\.)?ivoox\.com/(?:\w{2}/)?[^/?#]+_rf_(?P<id>[0-9]+)_1\.html', + r'https?://go\.ivoox\.com/rf/(?P<id>[0-9]+)', + ) + _TESTS = [{ + 'url': 'https://www.ivoox.com/dex-08x30-rostros-del-mal-los-asesinos-en-audios-mp3_rf_143594959_1.html', + 'md5': '993f712de5b7d552459fc66aa3726885', + 'info_dict': { + 'id': '143594959', + 'ext': 'mp3', + 'timestamp': 1742731200, + 'channel': 'DIAS EXTRAÑOS con Santiago Camacho', + 'title': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'description': 'md5:eae8b4b9740d0216d3871390b056bb08', + 'uploader': 'Santiago Camacho', + 'thumbnail': 'https://static-1.ivoox.com/audios/c/d/5/2/cd52f46783fe735000c33a803dce2554_XXL.jpg', + 'upload_date': '20250323', + 'episode': 'DEx 08x30 Rostros del mal: Los asesinos en serie que aterrorizaron España', + 'duration': 11837, + 'tags': ['españa', 'asesinos en serie', 'arropiero', 'historia criminal', 'mataviejas'], + }, + }, { + 'url': 'https://go.ivoox.com/rf/143594959', + 'only_matching': True, + }, { + 'url': 'https://www.ivoox.com/en/campodelgas-28-03-2025-audios-mp3_rf_144036942_1.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id, fatal=False) + + data = self._search_nuxt_data( + webpage, media_id, fatal=False, traverse=('data', 0, 'data', 'audio')) + + direct_download = self._download_json( + f'https://vcore-web.ivoox.com/v1/public/audios/{media_id}/download-url', media_id, fatal=False, + note='Fetching direct download link', headers={'Referer': url}) + + download_paths = { + *traverse_obj(direct_download, ('data', 'downloadUrl', {str}, filter, all)), + *traverse_obj(data, (('downloadUrl', 'mediaUrl'), {str}, filter)), + } + + formats = [] + for path in download_paths: + formats.append({ + 'url': urljoin('https://ivoox.com', path), + 'http_headers': {'Referer': url}, + }) + + return { + 'id': media_id, + 'formats': formats, + 'uploader': self._html_search_regex(r'data-prm-author="([^"]+)"', webpage, 'author', default=None), + 'timestamp': parse_iso8601( + self._html_search_regex(r'data-prm-pubdate="([^"]+)"', webpage, 
'timestamp', default=None)), + 'channel': self._html_search_regex(r'data-prm-podname="([^"]+)"', webpage, 'channel', default=None), + 'title': self._html_search_regex(r'data-prm-title="([^"]+)"', webpage, 'title', default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + **self._search_json_ld(webpage, media_id, default={}), + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('uploadDate', {parse_iso8601(delimiter=' ')}), + 'duration': ('duration', {int_or_none}), + 'tags': ('tags', ..., 'name', {str}), + }), + } diff --git a/yt_dlp/extractor/jamendo.py b/yt_dlp/extractor/jamendo.py index 16540c414..bac0e869c 100644 --- a/yt_dlp/extractor/jamendo.py +++ b/yt_dlp/extractor/jamendo.py @@ -2,10 +2,12 @@ import random from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( clean_html, int_or_none, try_get, + urlhandle_detect_ext, ) @@ -27,7 +29,7 @@ class JamendoIE(InfoExtractor): 'ext': 'flac', # 'title': 'Maya Filipič - Stories from Emona I', 'title': 'Stories from Emona I', - 'artist': 'Maya Filipič', + 'artists': ['Maya Filipič'], 'album': 'Between two worlds', 'track': 'Stories from Emona I', 'duration': 210, @@ -93,9 +95,15 @@ def _real_extract(self, url): if not cover_url or cover_url in urls: continue urls.append(cover_url) + urlh = self._request_webpage( + HEADRequest(cover_url), track_id, 'Checking thumbnail extension', + errnote=False, fatal=False) + if not urlh: + continue size = int_or_none(cover_id.lstrip('size')) thumbnails.append({ 'id': cover_id, + 'ext': urlhandle_detect_ext(urlh, default='jpg'), 'url': cover_url, 'width': size, 'height': size, diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py index 69f4a3ce0..e27756452 100644 --- a/yt_dlp/extractor/kika.py +++ b/yt_dlp/extractor/kika.py @@ -1,3 +1,5 @@ +import itertools + from .common import InfoExtractor from ..utils import ( determine_ext, @@ -124,3 +126,43 @@ def _extract_formats(self, media_info, video_id): 'vbr': ('bitrateVideo', {int_or_none}, {lambda x: None if x == -1 else x}), }), } + + +class KikaPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kika\.de/[\w-]+/(?P<id>[a-z-]+\d+)' + + _TESTS = [{ + 'url': 'https://www.kika.de/logo/logo-die-welt-und-ich-562', + 'info_dict': { + 'id': 'logo-die-welt-und-ich-562', + 'title': 'logo!', + 'description': 'md5:7b9d7f65561b82fa512f2cfb553c397d', + }, + 'playlist_count': 100, + }] + + def _entries(self, playlist_url, playlist_id): + for page in itertools.count(1): + data = self._download_json(playlist_url, playlist_id, note=f'Downloading page {page}') + for item in traverse_obj(data, ('content', lambda _, v: url_or_none(v['api']['url']))): + yield self.url_result( + item['api']['url'], ie=KikaIE, + **traverse_obj(item, { + 'id': ('id', {str}), + 'title': ('title', {str}), + 'duration': ('duration', {int_or_none}), + 'timestamp': ('date', {parse_iso8601}), + })) + + playlist_url = traverse_obj(data, ('links', 'next', {url_or_none})) + if not playlist_url: + break + + def _real_extract(self, url): + playlist_id = self._match_id(url) + brand_data = self._download_json( + f'https://www.kika.de/_next-api/proxy/v1/brands/{playlist_id}', playlist_id) + + return self.playlist_result( + self._entries(brand_data['videoSubchannel']['videosPageUrl'], playlist_id), + playlist_id, title=brand_data.get('title'), 
description=brand_data.get('description')) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 7b22f90e9..e6eef13e5 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -26,6 +26,7 @@ class LBRYBaseIE(InfoExtractor): _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' _OPT_CLAIM_ID = f'[^$@:/?#&]+(?:[:#]{_CLAIM_ID_REGEX})?' _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + _UNSUPPORTED_STREAM_TYPES = ['binary'] _PAGE_SIZE = 50 def _call_api_proxy(self, method, display_id, params, resource): @@ -336,12 +337,15 @@ def _real_extract(self, url): 'vcodec': 'none' if stream_type == 'audio' else None, }) + final_url = None # HEAD request returns redirect response to m3u8 URL if available - final_url = self._request_webpage( + urlh = self._request_webpage( HEADRequest(streaming_url), display_id, headers=headers, - note='Downloading streaming redirect url info').url + note='Downloading streaming redirect url info', fatal=False) + if urlh: + final_url = urlh.url - elif result.get('value_type') == 'stream': + elif result.get('value_type') == 'stream' and stream_type not in self._UNSUPPORTED_STREAM_TYPES: claim_id, is_live = result['signing_channel']['claim_id'], True live_data = self._download_json( 'https://api.odysee.live/livestream/is_live', claim_id, diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index c8c8ae52a..2974f4026 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,4 +1,5 @@ import itertools +import json import re from .common import InfoExtractor @@ -9,12 +10,12 @@ int_or_none, mimetype2ext, srt_subtitles_timecode, - traverse_obj, try_get, url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import find_elements, require, traverse_obj class LinkedInBaseIE(InfoExtractor): @@ -82,7 +83,10 @@ def _get_video_id(self, video_data, course_slug, video_slug): class LinkedInIE(LinkedInBaseIE): - _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P<id>\d+)-\w{4}/?(?:[?#]|$)' + _VALID_URL = [ + r'https?://(?:www\.)?linkedin\.com/posts/[^/?#]+-(?P<id>\d+)-\w{4}/?(?:[?#]|$)', + r'https?://(?:www\.)?linkedin\.com/feed/update/urn:li:activity:(?P<id>\d+)', + ] _TESTS = [{ 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', 'info_dict': { @@ -106,6 +110,9 @@ class LinkedInIE(LinkedInBaseIE): 'like_count': int, 'subtitles': 'mincount:1', }, + }, { + 'url': 'https://www.linkedin.com/feed/update/urn:li:activity:7016901149999955968/?utm_source=share&utm_medium=member_desktop', + 'only_matching': True, }] def _real_extract(self, url): @@ -271,3 +278,110 @@ def _real_extract(self, url): entries, course_slug, course_data.get('title'), course_data.get('description')) + + +class LinkedInEventsIE(LinkedInBaseIE): + IE_NAME = 'linkedin:events' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/events/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.linkedin.com/events/7084656651378536448/comments/', + 'info_dict': { + 'id': '7084656651378536448', + 'ext': 'mp4', + 'title': '#37 Aprende a hacer una entrevista en inglés para tu próximo trabajo remoto', + 'description': '¡Agarra para anotar que se viene tremendo evento!', + 'duration': 1765, + 'timestamp': 1689113772, + 'upload_date': '20230711', + 'release_timestamp': 1689174012, + 'release_date': '20230712', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://www.linkedin.com/events/27-02energyfreedombyenergyclub7295762520814874625/comments/', + 'info_dict': { + 'id': 
'27-02energyfreedombyenergyclub7295762520814874625', + 'ext': 'mp4', + 'title': '27.02 Energy Freedom by Energy Club', + 'description': 'md5:1292e6f31df998914c293787a02c3b91', + 'duration': 6420, + 'timestamp': 1739445333, + 'upload_date': '20250213', + 'release_timestamp': 1740657620, + 'release_date': '20250227', + 'live_status': 'was_live', + }, + }] + + def _real_initialize(self): + if not self._get_cookies('https://www.linkedin.com/').get('li_at'): + self.raise_login_required() + + def _real_extract(self, url): + event_id = self._match_id(url) + webpage = self._download_webpage(url, event_id) + + base_data = traverse_obj(webpage, ( + {find_elements(tag='code', attr='style', value='display: none')}, ..., {json.loads}, 'included', ...)) + meta_data = traverse_obj(base_data, ( + lambda _, v: v['$type'] == 'com.linkedin.voyager.dash.events.ProfessionalEvent', any)) or {} + + live_status = { + 'PAST': 'was_live', + 'ONGOING': 'is_live', + 'FUTURE': 'is_upcoming', + }.get(meta_data.get('lifecycleState')) + + if live_status == 'is_upcoming': + player_data = {} + if event_time := traverse_obj(meta_data, ('displayEventTime', {str})): + message = f'This live event is scheduled for {event_time}' + else: + message = 'This live event has not yet started' + self.raise_no_formats(message, expected=True, video_id=event_id) + else: + # TODO: Add support for audio-only live events + player_data = traverse_obj(base_data, ( + lambda _, v: v['$type'] == 'com.linkedin.videocontent.VideoPlayMetadata', + any, {require('video player data')})) + + formats, subtitles = [], {} + for prog_fmts in traverse_obj(player_data, ('progressiveStreams', ..., {dict})): + for fmt_url in traverse_obj(prog_fmts, ('streamingLocations', ..., 'url', {url_or_none})): + formats.append({ + 'url': fmt_url, + **traverse_obj(prog_fmts, { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'tbr': ('bitRate', {int_or_none(scale=1000)}), + 'filesize': ('size', {int_or_none}), + 'ext': ('mediaType', {mimetype2ext}), + }), + }) + + for m3u8_url in traverse_obj(player_data, ( + 'adaptiveStreams', lambda _, v: v['protocol'] == 'HLS', 'masterPlaylists', ..., 'url', {url_or_none}, + )): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, event_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': event_id, + 'formats': formats, + 'subtitles': subtitles, + 'live_status': live_status, + **traverse_obj(meta_data, { + 'title': ('name', {str}), + 'description': ('description', 'text', {str}), + 'timestamp': ('createdAt', {int_or_none(scale=1000)}), + # timeRange.start is available when the stream is_upcoming + 'release_timestamp': ('timeRange', 'start', {int_or_none(scale=1000)}), + }), + **traverse_obj(player_data, { + 'duration': ('duration', {int_or_none(scale=1000)}), + # liveStreamCreatedAt is only available when the stream is_live or was_live + 'release_timestamp': ('liveStreamCreatedAt', {int_or_none(scale=1000)}), + }), + } diff --git a/yt_dlp/extractor/loco.py b/yt_dlp/extractor/loco.py new file mode 100644 index 000000000..6c9a25567 --- /dev/null +++ b/yt_dlp/extractor/loco.py @@ -0,0 +1,159 @@ +import json +import random +import time + +from .common import InfoExtractor +from ..utils import int_or_none, jwt_decode_hs256, try_call, url_or_none +from ..utils.traversal import require, traverse_obj + + +class LocoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?loco\.com/(?P<type>streamers|stream)/(?P<id>[^/?#]+)' + 
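# <type> distinguishes live channel pages (streamers/<slug>) from VOD permalinks (stream/<uuid>); it drives is_live below +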
_TESTS = [{ + 'url': 'https://loco.com/streamers/teuzinfps', + 'info_dict': { + 'id': 'teuzinfps', + 'ext': 'mp4', + 'title': r're:MS BOLADAO, RESENHA & GAMEPLAY ALTO NIVEL', + 'description': 'bom e novo', + 'uploader_id': 'RLUVE3S9JU', + 'channel': 'teuzinfps', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/743701a9-98ca-41ae-9a8b-70bd5da070ad.jpg', + 'tags': ['MMORPG', 'Gameplay'], + 'series': 'Tibia', + 'timestamp': int, + 'modified_timestamp': int, + 'live_status': 'is_live', + 'upload_date': str, + 'modified_date': str, + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://loco.com/stream/c64916eb-10fb-46a9-9a19-8c4b7ed064e7', + 'md5': '45ebc8a47ee1c2240178757caf8881b5', + 'info_dict': { + 'id': 'c64916eb-10fb-46a9-9a19-8c4b7ed064e7', + 'ext': 'mp4', + 'title': 'PAULINHO LOKO NA LOCO!', + 'description': 'live on na loco', + 'uploader_id': '2MDO7Z1DPM', + 'channel': 'paulinholokobr', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 14491, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/59b5970b-23c1-4518-9e96-17ce341299fe.jpg', + 'tags': ['Gameplay'], + 'series': 'GTA 5', + 'timestamp': 1740612872, + 'modified_timestamp': 1740613037, + 'upload_date': '20250226', + 'modified_date': '20250226', + }, + }, { + # Requires video authorization + 'url': 'https://loco.com/stream/ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'md5': '0513edf85c1e65c9521f555f665387d5', + 'info_dict': { + 'id': 'ac854641-ae0f-497c-a8ea-4195f6d8cc53', + 'ext': 'mp4', + 'title': 'DUAS CONTAS DESAFIANTE, RUSH TOP 1 NO BRASIL!', + 'description': 'md5:aa77818edd6fe00dd4b6be75cba5f826', + 'uploader_id': '7Y9JNAZC3Q', + 'channel': 'ayellol', + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'duration': 1229, + 'thumbnail': 'https://static.ivory.getloconow.com/default_thumb/f5aa678b-6d04-45d9-a89a-859af0a8028f.jpg', + 'tags': ['Gameplay', 'Carry'], + 'series': 'League of Legends', + 'timestamp': 1741182253, + 'upload_date': '20250305', + 'modified_timestamp': 1741182419, + 'modified_date': '20250305', + }, + }] + + # From _app.js + _CLIENT_ID = 'TlwKp1zmF6eKFpcisn3FyR18WkhcPkZtzwPVEEC3' + _CLIENT_SECRET = 'Kp7tYlUN7LXvtcSpwYvIitgYcLparbtsQSe5AdyyCdiEJBP53Vt9J8eB4AsLdChIpcO2BM19RA3HsGtqDJFjWmwoonvMSG3ZQmnS8x1YIM8yl82xMXZGbE3NKiqmgBVU' + + def _is_jwt_expired(self, token): + return jwt_decode_hs256(token)['exp'] - time.time() < 300 + + def _get_access_token(self, video_id): + access_token = try_call(lambda: self._get_cookies('https://loco.com')['access_token'].value) + if access_token and not self._is_jwt_expired(access_token): + return access_token + access_token = traverse_obj(self._download_json( + 'https://api.getloconow.com/v3/user/device_profile/', video_id, + 'Downloading access token', fatal=False, data=json.dumps({ + 'platform': 7, + 'client_id': self._CLIENT_ID, + 'client_secret': self._CLIENT_SECRET, + 'model': 'Mozilla', + 'os_name': 'Win32', + 'os_ver': '5.0 (Windows)', + 'app_ver': '5.0 (Windows)', + }).encode(), headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'DEVICE-ID': ''.join(random.choices('0123456789abcdef', k=32)) + 'live', + 'X-APP-LANG': 'en', + 'X-APP-LOCALE': 'en-US', + 'X-CLIENT-ID': self._CLIENT_ID, + 
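# the same client credentials are sent both in the JSON body and as X-CLIENT-* headers +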
'X-CLIENT-SECRET': self._CLIENT_SECRET, + 'X-PLATFORM': '7', + }), 'access_token') + if access_token and not self._is_jwt_expired(access_token): + self._set_cookie('.loco.com', 'access_token', access_token) + return access_token + + def _real_extract(self, url): + video_type, video_id = self._match_valid_url(url).group('type', 'id') + webpage = self._download_webpage(url, video_id) + stream = traverse_obj(self._search_nextjs_data(webpage, video_id), ( + 'props', 'pageProps', ('liveStreamData', 'stream', 'liveStream'), {dict}, any, {require('stream info')})) + + if access_token := self._get_access_token(video_id): + self._request_webpage( + 'https://drm.loco.com/v1/streams/playback/', video_id, + 'Downloading video authorization', fatal=False, headers={ + 'authorization': access_token, + }, query={ + 'stream_uid': stream['uid'], + }) + + return { + 'formats': self._extract_m3u8_formats(stream['conf']['hls'], video_id), + 'id': video_id, + 'is_live': video_type == 'streamers', + **traverse_obj(stream, { + 'title': ('title', {str}), + 'series': ('game_name', {str}), + 'uploader_id': ('user_uid', {str}), + 'channel': ('alias', {str}), + 'description': ('description', {str}), + 'concurrent_view_count': ('viewersCurrent', {int_or_none}), + 'view_count': ('total_views', {int_or_none}), + 'thumbnail': ('thumbnail_url_small', {url_or_none}), + 'like_count': ('likes', {int_or_none}), + 'tags': ('tags', ..., {str}), + 'timestamp': ('started_at', {int_or_none(scale=1000)}), + 'modified_timestamp': ('updated_at', {int_or_none(scale=1000)}), + 'comment_count': ('comments_count', {int_or_none}), + 'channel_follower_count': ('followers_count', {int_or_none}), + 'duration': ('duration', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/lrt.py b/yt_dlp/extractor/lrt.py index 1a0b6da23..e50194f88 100644 --- a/yt_dlp/extractor/lrt.py +++ b/yt_dlp/extractor/lrt.py @@ -2,8 +2,11 @@ from ..utils import ( clean_html, merge_dicts, + str_or_none, traverse_obj, + unified_timestamp, url_or_none, + urljoin, ) @@ -80,7 +83,7 @@ class LRTVODIE(LRTBaseIE): }] def _real_extract(self, url): - path, video_id = self._match_valid_url(url).groups() + path, video_id = self._match_valid_url(url).group('path', 'id') webpage = self._download_webpage(url, video_id) media_url = self._extract_js_var(webpage, 'main_url', path) @@ -106,3 +109,42 @@ def _real_extract(self, url): } return merge_dicts(clean_info, jw_data, json_ld_data) + + +class LRTRadioIE(LRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/radioteka/irasas/(?P<id>\d+)/(?P<path>[^?#/]+)' + _TESTS = [{ + # m3u8 download + 'url': 'https://www.lrt.lt/radioteka/irasas/2000359728/nemarios-eiles-apie-pragarus-ir-skaistyklas-su-aiste-kiltinaviciute', + 'info_dict': { + 'id': '2000359728', + 'ext': 'm4a', + 'title': 'Nemarios eilės: apie pragarus ir skaistyklas su Aiste Kiltinavičiūte', + 'description': 'md5:5eee9a0e86a55bf547bd67596204625d', + 'timestamp': 1726143120, + 'upload_date': '20240912', + 'tags': 'count:5', + 'thumbnail': r're:https?://.+/.+\.jpe?g', + 'categories': ['Daiktiniai įrodymai'], + }, + }, { + 'url': 'https://www.lrt.lt/radioteka/irasas/2000304654/vakaras-su-knyga-svetlana-aleksijevic-cernobylio-malda-v-dalis?season=%2Fmediateka%2Faudio%2Fvakaras-su-knyga%2F2023', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, path = self._match_valid_url(url).group('id', 'path') + media = self._download_json( + 'https://www.lrt.lt/radioteka/api/media', video_id, + query={'url': f'/mediateka/irasas/{video_id}/{path}'}) + + return 
traverse_obj(media, { + 'id': ('id', {int}, {str_or_none}), + 'title': ('title', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'categories': ('playlist_item', 'category', {str}, filter, all, filter), + 'description': ('content', {clean_html}, {str}), + 'timestamp': ('date', {lambda x: x.replace('.', '/')}, {unified_timestamp}), + 'thumbnail': ('playlist_item', 'image', {urljoin('https://www.lrt.lt')}), + 'formats': ('playlist_item', 'file', {lambda x: self._extract_m3u8_formats(x, video_id)}), + }) diff --git a/yt_dlp/extractor/magellantv.py b/yt_dlp/extractor/magellantv.py index 6f2524ba2..e7ae709cf 100644 --- a/yt_dlp/extractor/magellantv.py +++ b/yt_dlp/extractor/magellantv.py @@ -1,35 +1,36 @@ from .common import InfoExtractor -from ..utils import parse_age_limit, parse_duration, traverse_obj +from ..utils import parse_age_limit, parse_duration, url_or_none +from ..utils.traversal import traverse_obj class MagellanTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?magellantv\.com/(?:watch|video)/(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'https://www.magellantv.com/watch/my-dads-on-death-row?type=v', + 'url': 'https://www.magellantv.com/watch/incas-the-new-story?type=v', 'info_dict': { - 'id': 'my-dads-on-death-row', + 'id': 'incas-the-new-story', 'ext': 'mp4', - 'title': 'My Dad\'s On Death Row', - 'description': 'md5:33ba23b9f0651fc4537ed19b1d5b0d7a', - 'duration': 3780.0, + 'title': 'Incas: The New Story', + 'description': 'md5:936c7f6d711c02dfb9db22a067b586fe', 'age_limit': 14, - 'tags': ['Justice', 'Reality', 'United States', 'True Crime'], + 'duration': 3060.0, + 'tags': ['Ancient History', 'Archaeology', 'Anthropology'], }, 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://www.magellantv.com/video/james-bulger-the-new-revelations', + 'url': 'https://www.magellantv.com/video/tortured-to-death-murdering-the-nanny', 'info_dict': { - 'id': 'james-bulger-the-new-revelations', + 'id': 'tortured-to-death-murdering-the-nanny', 'ext': 'mp4', - 'title': 'James Bulger: The New Revelations', - 'description': 'md5:7b97922038bad1d0fe8d0470d8a189f2', + 'title': 'Tortured to Death: Murdering the Nanny', + 'description': 'md5:d87033594fa218af2b1a8b49f52511e5', + 'age_limit': 14, 'duration': 2640.0, - 'age_limit': 0, - 'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'], + 'tags': ['True Crime', 'Murder'], }, 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://www.magellantv.com/watch/celebration-nation', + 'url': 'https://www.magellantv.com/watch/celebration-nation?type=s', 'info_dict': { 'id': 'celebration-nation', 'ext': 'mp4', @@ -43,10 +44,19 @@ class MagellanTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = traverse_obj(self._search_nextjs_data(webpage, video_id), ( - 'props', 'pageProps', 'reactContext', - (('video', 'detail'), ('series', 'currentEpisode')), {dict}), get_all=False) - formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id) + context = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext'] + data = traverse_obj(context, ((('video', 'detail'), ('series', 'currentEpisode')), {dict}, any)) + + formats, subtitles = [], {} + for m3u8_url in set(traverse_obj(data, ((('manifests', ..., 'hls'), 'jwp_video_url'), {url_or_none}))): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, 
target=subtitles) + if not formats and (error := traverse_obj(context, ('errorDetailPage', 'errorMessage', {str}))): + if 'available in your country' in error: + self.raise_geo_restricted(msg=error) + self.raise_no_formats(f'{self.IE_NAME} said: {error}', expected=True) return { 'id': video_id, diff --git a/yt_dlp/extractor/manyvids.py b/yt_dlp/extractor/manyvids.py index 8caa8f87f..1356169bf 100644 --- a/yt_dlp/extractor/manyvids.py +++ b/yt_dlp/extractor/manyvids.py @@ -1,31 +1,38 @@ -import re - from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, - extract_attributes, int_or_none, - str_to_int, + join_nonempty, + parse_count, + parse_duration, + parse_iso8601, url_or_none, - urlencode_postdata, ) +from ..utils.traversal import traverse_obj class ManyVidsIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)' _TESTS = [{ # preview video - 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', - 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'url': 'https://www.manyvids.com/Video/530341/mv-tips-tricks', + 'md5': '738dc723f7735ee9602f7ea352a6d058', 'info_dict': { - 'id': '133957', + 'id': '530341-preview', 'ext': 'mp4', - 'title': 'everthing about me (Preview)', - 'uploader': 'ellyxxix', + 'title': 'MV Tips & Tricks (Preview)', + 'description': r're:I will take you on a tour around .{1313}$', + 'thumbnail': r're:https://cdn5\.manyvids\.com/php_uploads/video_images/DestinyDiaz/.+\.jpg', + 'uploader': 'DestinyDiaz', 'view_count': int, 'like_count': int, + 'release_timestamp': 1508419904, + 'tags': ['AdultSchool', 'BBW', 'SFW', 'TeacherFetish'], + 'release_date': '20171019', + 'duration': 3167.0, }, + 'expected_warnings': ['Only extracting preview'], }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', @@ -34,129 +41,68 @@ class ManyVidsIE(InfoExtractor): 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', - 'description': 'md5:ec5901d41808b3746fed90face161612', + 'description': r're:Today is the day!! 
I am finally taking off my mask .{445}$', + 'thumbnail': r're:https://ods\.manyvids\.com/1001061960/3aa5397f2a723ec4597e344df66ab845/screenshots/.+\.jpg', 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, + 'release_date': '20181110', + 'tags': ['EyeContact', 'Interviews', 'MaskFetish', 'MouthFetish', 'Redhead'], + 'release_timestamp': 1541851200, + 'duration': 224.0, }, }] + _API_BASE = 'https://www.manyvids.com/bff/store/video' def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_json(f'{self._API_BASE}/{video_id}/private', video_id)['data'] + formats, preview_only = [], True - real_url = f'https://www.manyvids.com/video/{video_id}/gtm.js' - try: - webpage = self._download_webpage(real_url, video_id) - except Exception: - # probably useless fallback - webpage = self._download_webpage(url, video_id) - - info = self._search_regex( - r'''(<div\b[^>]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', - webpage, 'meta details', default='') - info = extract_attributes(info) - - player = self._search_regex( - r'''(<div\b[^>]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', - webpage, 'player details', default='') - player = extract_attributes(player) - - video_urls_and_ids = ( - (info.get('data-meta-video'), 'video'), - (player.get('data-video-transcoded'), 'transcoded'), - (player.get('data-video-filepath'), 'filepath'), - (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), - ) - - def txt_or_none(s, default=None): - return (s.strip() or default) if isinstance(s, str) else default - - uploader = txt_or_none(info.get('data-meta-author')) - - def mung_title(s): - if uploader: - s = re.sub(rf'^\s*{re.escape(uploader)}\s+[|-]', '', s) - return txt_or_none(s) - - title = ( - mung_title(info.get('data-meta-title')) - or self._html_search_regex( - (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)', - r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) - or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True)) - - title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title - - if any(p in webpage for p in ('preview_videos', '_preview.mp4')): - title += ' (Preview)' - - mv_token = self._search_regex( - r'data-mvtoken=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'mv token', default=None, group='value') - - if mv_token: - # Sets some cookies - self._download_webpage( - 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, note='Setting format cookies', fatal=False, - data=urlencode_postdata({ - 'mvtoken': mv_token, - 'vid': video_id, - }), headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - }) - - formats = [] - for v_url, fmt in video_urls_and_ids: - v_url = url_or_none(v_url) - if not v_url: + for format_id, path in [ + ('preview', ['teaser', 'filepath']), + ('transcoded', ['transcodedFilepath']), + ('filepath', ['filepath']), + ]: + format_url = traverse_obj(video_data, (*path, {url_or_none})) + if not format_url: continue - if determine_ext(v_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id=format_id)) else: formats.append({ - 'url': v_url, - 'format_id': fmt, + 'url': format_url, + 'format_id': format_id, + 'preference': -10 if format_id == 'preview' else None, + 'quality': 10 if format_id == 'filepath' else None, + 
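# rendition height appears to be embedded between underscores in the CDN filename (e.g. ..._720_...) +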
'height': int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', format_url, 'height', default=None)), }) + if format_id != 'preview': + preview_only = False - self._remove_duplicate_formats(formats) + metadata = traverse_obj( + self._download_json(f'{self._API_BASE}/{video_id}', video_id, fatal=False), 'data') + title = traverse_obj(metadata, ('title', {clean_html})) - for f in formats: - if f.get('height') is None: - f['height'] = int_or_none( - self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) - if '/preview/' in f['url']: - f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) - f['preference'] = -10 - if 'transcoded' in f['format_id']: - f['preference'] = f.get('preference', -1) - 1 - - def get_likes(): - likes = self._search_regex( - rf'''(<a\b[^>]*\bdata-id\s*=\s*(['"]){video_id}\2[^>]*>)''', - webpage, 'likes', default='') - likes = extract_attributes(likes) - return int_or_none(likes.get('data-likes')) - - def get_views(): - return str_to_int(self._html_search_regex( - r'''(?s)<span\b[^>]*\bclass\s*=["']views-wrapper\b[^>]+>.+?<span\b[^>]+>\s*(\d[\d,.]*)\s*</span>''', - webpage, 'view count', default=None)) + if preview_only: + title = join_nonempty(title, '(Preview)', delim=' ') + video_id += '-preview' + self.report_warning( + f'Only extracting preview. Video may be paid or subscription only. {self._login_hint()}') return { 'id': video_id, 'title': title, 'formats': formats, - 'description': txt_or_none(info.get('data-meta-description')), - 'uploader': txt_or_none(info.get('data-meta-author')), - 'thumbnail': ( - url_or_none(info.get('data-meta-image')) - or url_or_none(player.get('data-video-screenshot'))), - 'view_count': get_views(), - 'like_count': get_likes(), + **traverse_obj(metadata, { + 'description': ('description', {clean_html}), + 'uploader': ('model', 'displayName', {clean_html}), + 'thumbnail': (('screenshot', 'thumbnail'), {url_or_none}, any), + 'view_count': ('views', {parse_count}), + 'like_count': ('likes', {parse_count}), + 'release_timestamp': ('launchDate', {parse_iso8601}), + 'duration': ('videoDuration', {parse_duration}), + 'tags': ('tagList', ..., 'label', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/medaltv.py b/yt_dlp/extractor/medaltv.py index d64dbfe63..94c51ed0e 100644 --- a/yt_dlp/extractor/medaltv.py +++ b/yt_dlp/extractor/medaltv.py @@ -102,11 +102,10 @@ def add_item(container, item_url, height, id_key='format_id', item_id=None): item_id = item_id or '%dp' % height if item_id not in item_url: return - width = int(round(aspect_ratio * height)) container.append({ 'url': item_url, id_key: item_id, - 'width': width, + 'width': round(aspect_ratio * height), 'height': height, }) diff --git a/yt_dlp/extractor/microsoftembed.py b/yt_dlp/extractor/microsoftembed.py index 2575d6c5e..9d58fa0a6 100644 --- a/yt_dlp/extractor/microsoftembed.py +++ b/yt_dlp/extractor/microsoftembed.py @@ -4,6 +4,7 @@ from ..utils import ( int_or_none, parse_iso8601, + parse_resolution, traverse_obj, unified_timestamp, url_basename, @@ -83,8 +84,8 @@ def _sub_to_dict(subtitle_list): subtitles.setdefault(sub.pop('tag', 'und'), []).append(sub) return subtitles - def _extract_ism(self, ism_url, video_id): - formats = self._extract_ism_formats(ism_url, video_id) + def _extract_ism(self, ism_url, video_id, fatal=True): + formats = self._extract_ism_formats(ism_url, video_id, fatal=fatal) for fmt in formats: if fmt['language'] != 'eng' and 'English' not in fmt['format_id']: fmt['language_preference'] = -10 @@ 
-218,9 +219,21 @@ class MicrosoftLearnEpisodeIE(MicrosoftMediusBaseIE): 'description': 'md5:7bbbfb593d21c2cf2babc3715ade6b88', 'timestamp': 1676339547, 'upload_date': '20230214', - 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.*\.png', + 'thumbnail': r're:https://learn\.microsoft\.com/video/media/.+\.png', 'subtitles': 'count:14', }, + }, { + 'url': 'https://learn.microsoft.com/en-gb/shows/on-demand-instructor-led-training-series/az-900-module-1', + 'info_dict': { + 'id': '4fe10f7c-d83c-463b-ac0e-c30a8195e01b', + 'ext': 'mp4', + 'title': 'AZ-900 Cloud fundamentals (1 of 6)', + 'description': 'md5:3c2212ce865e9142f402c766441bd5c9', + 'thumbnail': r're:https://.+/.+\.jpg', + 'timestamp': 1706605184, + 'upload_date': '20240130', + }, + 'params': {'format': 'bv[protocol=https]'}, }] def _real_extract(self, url): @@ -230,9 +243,32 @@ def _real_extract(self, url): entry_id = self._html_search_meta('entryId', webpage, 'entryId', fatal=True) video_info = self._download_json( f'https://learn.microsoft.com/api/video/public/v1/entries/{entry_id}', video_id) + + formats = [] + if ism_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoUrl', {url_or_none})): + formats.extend(self._extract_ism(ism_url, video_id, fatal=False)) + if hls_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoHLSUrl', {url_or_none})): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + if mpd_url := traverse_obj(video_info, ('publicVideo', 'adaptiveVideoDashUrl', {url_or_none})): + formats.extend(self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)) + for key in ('low', 'medium', 'high'): + if video_url := traverse_obj(video_info, ('publicVideo', f'{key}QualityVideoUrl', {url_or_none})): + formats.append({ + 'url': video_url, + 'format_id': f'video-http-{key}', + 'acodec': 'none', + **parse_resolution(video_url), + }) + if audio_url := traverse_obj(video_info, ('publicVideo', 'audioUrl', {url_or_none})): + formats.append({ + 'url': audio_url, + 'format_id': 'audio-http', + 'vcodec': 'none', + }) + return { 'id': entry_id, - 'formats': self._extract_ism(video_info['publicVideo']['adaptiveVideoUrl'], video_id), + 'formats': formats, 'subtitles': self._sub_to_dict(traverse_obj(video_info, ( 'publicVideo', 'captions', lambda _, v: url_or_none(v['url']), { 'tag': ('language', {str}), diff --git a/yt_dlp/extractor/mitele.py b/yt_dlp/extractor/mitele.py index 76fef337a..55fa83b51 100644 --- a/yt_dlp/extractor/mitele.py +++ b/yt_dlp/extractor/mitele.py @@ -1,5 +1,7 @@ from .telecinco import TelecincoBaseIE +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, int_or_none, parse_iso8601, ) @@ -79,7 +81,17 @@ class MiTeleIE(TelecincoBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + + try: # yt-dlp's default user-agents are too old and blocked by akamai + webpage = self._download_webpage(url, display_id, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0', + }) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or e.cause.status != 403: + raise + # Retry with impersonation if hardcoded UA is insufficient to bypass akamai + webpage = self._download_webpage(url, display_id, impersonate=True) + pre_player = self._search_json( r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=', webpage, 'Pre Player', display_id)['prePlayer'] diff --git a/yt_dlp/extractor/mixcloud.py 
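Review note on the MiTele hunk above: a pinned modern Firefox UA is tried first, and impersonation is kept as the fallback for a residual Akamai 403. A sketch of that two-step pattern, with `download` standing in for `self._download_webpage`:

```py
from yt_dlp.networking.exceptions import HTTPError
from yt_dlp.utils import ExtractorError

def fetch(download, url, display_id):
    try:  # a current desktop UA is usually enough to satisfy akamai
        return download(url, display_id, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:136.0) Gecko/20100101 Firefox/136.0',
        })
    except ExtractorError as e:
        if not isinstance(e.cause, HTTPError) or e.cause.status != 403:
            raise  # only a 403 warrants the retry
        return download(url, display_id, impersonate=True)
```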
b/yt_dlp/extractor/mixcloud.py index 19b7fd4e7..852670fcb 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -10,7 +10,9 @@ parse_iso8601, strip_or_none, try_get, + url_or_none, ) +from ..utils.traversal import traverse_obj class MixcloudBaseIE(InfoExtractor): @@ -37,7 +39,7 @@ class MixcloudIE(MixcloudBaseIE): 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', - 'uploader': 'Daniel Holbach', + 'uploader': 'dholbach', 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, @@ -46,10 +48,11 @@ class MixcloudIE(MixcloudBaseIE): 'uploader_url': 'https://www.mixcloud.com/dholbach/', 'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills', 'duration': 3723, - 'tags': [], + 'tags': ['liquid drum and bass', 'drum and bass'], 'comment_count': int, 'repost_count': int, 'like_count': int, + 'artists': list, }, 'params': {'skip_download': 'm3u8'}, }, { @@ -67,7 +70,7 @@ class MixcloudIE(MixcloudBaseIE): 'upload_date': '20150203', 'uploader_url': 'https://www.mixcloud.com/gillespeterson/', 'duration': 2992, - 'tags': [], + 'tags': ['jazz', 'soul', 'world music', 'funk'], 'comment_count': int, 'repost_count': int, 'like_count': int, @@ -149,8 +152,6 @@ def _real_extract(self, url): elif reason: raise ExtractorError('Track is restricted', expected=True) - title = cloudcast['name'] - stream_info = cloudcast['streamInfo'] formats = [] @@ -182,47 +183,39 @@ def _real_extract(self, url): self.raise_login_required(metadata_available=True) comments = [] - for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): - node = edge.get('node') or {} + for node in traverse_obj(cloudcast, ('comments', 'edges', ..., 'node', {dict})): text = strip_or_none(node.get('comment')) if not text: continue - user = node.get('user') or {} comments.append({ - 'author': user.get('displayName'), - 'author_id': user.get('username'), 'text': text, - 'timestamp': parse_iso8601(node.get('created')), + **traverse_obj(node, { + 'author': ('user', 'displayName', {str}), + 'author_id': ('user', 'username', {str}), + 'timestamp': ('created', {parse_iso8601}), + }), }) - tags = [] - for t in cloudcast.get('tags'): - tag = try_get(t, lambda x: x['tag']['name'], str) - if not tag: - tags.append(tag) - - get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount'])) - - owner = cloudcast.get('owner') or {} - return { 'id': track_id, - 'title': title, 'formats': formats, - 'description': cloudcast.get('description'), - 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], str), - 'uploader': owner.get('displayName'), - 'timestamp': parse_iso8601(cloudcast.get('publishDate')), - 'uploader_id': owner.get('username'), - 'uploader_url': owner.get('url'), - 'duration': int_or_none(cloudcast.get('audioLength')), - 'view_count': int_or_none(cloudcast.get('plays')), - 'like_count': get_count('favorites'), - 'repost_count': get_count('reposts'), - 'comment_count': get_count('comments'), 'comments': comments, - 'tags': tags, - 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None, + **traverse_obj(cloudcast, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'thumbnail': ('picture', 'url', {url_or_none}), + 'timestamp': ('publishDate', {parse_iso8601}), + 'duration': ('audioLength', {int_or_none}), + 'uploader': ('owner', 'displayName', {str}), + 
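Review note: the removed Mixcloud tag loop above had an inverted condition (`if not tag: tags.append(tag)`), so it could only ever collect falsy values; that is why the old test expectations said `'tags': []`. The declarative replacement handles filtering and collection in one path:

```py
from yt_dlp.utils.traversal import traverse_obj

cloudcast = {'tags': [{'tag': {'name': 'jazz'}}, {'tag': {'name': ''}}, {}]}
# filter drops empty names, all collects the branches into a list, and the
# trailing filter maps an empty list to None so the field is omitted
assert traverse_obj(
    cloudcast, ('tags', ..., 'tag', 'name', {str}, filter, all, filter)) == ['jazz']
```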
'uploader_id': ('owner', 'username', {str}), + 'uploader_url': ('owner', 'url', {url_or_none}), + 'view_count': ('plays', {int_or_none}), + 'like_count': ('favorites', 'totalCount', {int_or_none}), + 'repost_count': ('reposts', 'totalCount', {int_or_none}), + 'comment_count': ('comments', 'totalCount', {int_or_none}), + 'tags': ('tags', ..., 'tag', 'name', {str}, filter, all, filter), + 'artists': ('featuringArtistList', ..., {str}, filter, all, filter), + }), } @@ -295,7 +288,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -303,7 +296,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { 'id': 'dholbach_uploads', - 'title': 'Daniel Holbach (uploads)', + 'title': 'dholbach (uploads)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, @@ -311,7 +304,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { 'id': 'dholbach_favorites', - 'title': 'Daniel Holbach (favorites)', + 'title': 'dholbach (favorites)', 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, # 'params': { @@ -337,7 +330,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'title': 'First Ear (stream)', 'description': 'we maraud for ears', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] _TITLE_KEY = 'displayName' @@ -361,7 +354,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): 'id': 'maxvibes_jazzcat-on-ness-radio', 'title': 'Ness Radio sessions', }, - 'playlist_mincount': 59, + 'playlist_mincount': 58, }] _TITLE_KEY = 'name' _DESCRIPTION_KEY = 'description' diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 935bf8561..562b93fc7 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -365,13 +365,15 @@ def _real_initialize(self): 'All videos are only available to registered users', method='password') def _set_device_id(self, username): - if not self._device_id: - self._device_id = self.cache.load( - self._NETRC_MACHINE, 'device_ids', default={}).get(username) + if self._device_id: + return + device_id_cache = self.cache.load(self._NETRC_MACHINE, 'device_ids', default={}) + self._device_id = device_id_cache.get(username) if self._device_id: return self._device_id = str(uuid.uuid4()) - self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id}) + device_id_cache[username] = self._device_id + self.cache.store(self._NETRC_MACHINE, 'device_ids', device_id_cache) def _perform_login(self, username, password): try: @@ -449,9 +451,7 @@ def _extract_formats_and_subtitles(self, broadcast, video_id): if not (m3u8_url and token): errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str}))) - if 'not entitled' in errors: - raise ExtractorError(errors, expected=True) - elif errors: # Only warn when 'blacked out' since radio formats are available + if errors: # Only warn when 'blacked out' or 'not entitled'; radio formats may be available self.report_warning(f'API returned errors for {format_id}: {errors}') else: self.report_warning(f'No formats available for {format_id} broadcast; skipping') diff --git a/yt_dlp/extractor/moviepilot.py b/yt_dlp/extractor/moviepilot.py index ed5be4fa6..3b2023226 100644 --- a/yt_dlp/extractor/moviepilot.py +++ 
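Review note: the MLB `_set_device_id` fix above stops clobbering the cache; the old code stored a fresh single-entry dict on every login, dropping all other users' device IDs, while the new code mutates and re-stores the loaded mapping. The difference in miniature, with a plain dict standing in for `self.cache`:

```py
import uuid

cache = {'alice': 'id-a'}  # simulated on-disk device_ids mapping

def set_device_id(username):
    if device_id := cache.get(username):
        return device_id
    device_id = str(uuid.uuid4())
    cache[username] = device_id  # old behaviour: cache = {username: device_id}
    return device_id

set_device_id('bob')
assert 'alice' in cache  # the earlier entry survives
```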
b/yt_dlp/extractor/moviepilot.py @@ -3,8 +3,8 @@ class MoviepilotIE(InfoExtractor): - _IE_NAME = 'moviepilot' - _IE_DESC = 'Moviepilot trailer' + IE_NAME = 'moviepilot' + IE_DESC = 'Moviepilot trailer' _VALID_URL = r'https?://(?:www\.)?moviepilot\.de/movies/(?P<id>[^/]+)' _TESTS = [{ diff --git a/yt_dlp/extractor/msn.py b/yt_dlp/extractor/msn.py index dd864952c..6ede7c5cf 100644 --- a/yt_dlp/extractor/msn.py +++ b/yt_dlp/extractor/msn.py @@ -1,167 +1,215 @@ -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, determine_ext, int_or_none, - unescapeHTML, + parse_iso8601, + url_or_none, ) +from ..utils.traversal import traverse_obj class MSNIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?P<locale>[a-z]{2}-[a-z]{2})/(?:[^/?#]+/)+(?P<display_id>[^/?#]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' _TESTS = [{ - 'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d', - 'md5': '087548191d273c5c55d05028f8d2cbcd', + 'url': 'https://www.msn.com/en-gb/video/news/president-macron-interrupts-trump-over-ukraine-funding/vi-AA1zMcD7', 'info_dict': { - 'id': 'BBPxU6d', - 'display_id': '7-ways-to-get-rid-of-chest-congestion', + 'id': 'AA1zMcD7', 'ext': 'mp4', - 'title': 'Seven ways to get rid of chest congestion', - 'description': '7 Ways to Get Rid of Chest Congestion', - 'duration': 88, - 'uploader': 'Health', - 'uploader_id': 'BBPrMqa', + 'display_id': 'president-macron-interrupts-trump-over-ukraine-funding', + 'title': 'President Macron interrupts Trump over Ukraine funding', + 'description': 'md5:5fd3857ac25849e7a56cb25fbe1a2a8b', + 'uploader': 'k! 
News UK', + 'uploader_id': 'BB1hz5Rj', + 'duration': 59, + 'thumbnail': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/AA1zMagX.img', + 'tags': 'count:14', + 'timestamp': 1740510914, + 'upload_date': '20250225', + 'release_timestamp': 1740513600, + 'release_date': '20250225', + 'modified_timestamp': 1741413241, + 'modified_date': '20250308', }, }, { - # Article, multiple Dailymotion Embeds - 'url': 'https://www.msn.com/en-in/money/sports/hottest-football-wags-greatest-footballers-turned-managers-and-more/ar-BBpc7Nl', + 'url': 'https://www.msn.com/en-gb/video/watch/films-success-saved-adam-pearsons-acting-career/vi-AA1znZGE?ocid=hpmsn', 'info_dict': { - 'id': 'BBpc7Nl', + 'id': 'AA1znZGE', + 'ext': 'mp4', + 'display_id': 'films-success-saved-adam-pearsons-acting-career', + 'title': "Films' success saved Adam Pearson's acting career", + 'description': 'md5:98c05f7bd9ab4f9c423400f62f2d3da5', + 'uploader': 'Sky News', + 'uploader_id': 'AA2eki', + 'duration': 52, + 'thumbnail': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/AA1zo7nU.img', + 'timestamp': 1739993965, + 'upload_date': '20250219', + 'release_timestamp': 1739977753, + 'release_date': '20250219', + 'modified_timestamp': 1742076259, + 'modified_date': '20250315', }, - 'playlist_mincount': 4, }, { - 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', - 'only_matching': True, - }, { - 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', - 'only_matching': True, - }, { - # geo restricted - 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', - 'only_matching': True, - }, { - 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', - 'only_matching': True, - }, { - # Vidible(AOL) Embed - 'url': 'https://www.msn.com/en-us/money/other/jupiter-is-about-to-come-so-close-you-can-see-its-moons-with-binoculars/vi-AACqsHR', - 'only_matching': True, + 'url': 'https://www.msn.com/en-us/entertainment/news/rock-frontman-replacements-you-might-not-know-happened/vi-AA1yLVcD', + 'info_dict': { + 'id': 'AA1yLVcD', + 'ext': 'mp4', + 'display_id': 'rock-frontman-replacements-you-might-not-know-happened', + 'title': 'Rock Frontman Replacements You Might Not Know Happened', + 'description': 'md5:451a125496ff0c9f6816055bb1808da9', + 'uploader': 'Grunge (Video)', + 'uploader_id': 'BB1oveoV', + 'duration': 596, + 'thumbnail': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/AA1yM4OJ.img', + 'timestamp': 1739223456, + 'upload_date': '20250210', + 'release_timestamp': 1739219731, + 'release_date': '20250210', + 'modified_timestamp': 1741427272, + 'modified_date': '20250308', + }, }, { # Dailymotion Embed - 'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L', - 'only_matching': True, + 'url': 'https://www.msn.com/de-de/nachrichten/other/the-first-descendant-gameplay-trailer-zu-serena-der-neuen-gefl%C3%BCgelten-nachfahrin/vi-AA1B1d06', + 'info_dict': { + 'id': 'x9g6oli', + 'ext': 'mp4', + 'title': 'The First Descendant: Gameplay-Trailer zu Serena, der neuen geflügelten Nachfahrin', + 'description': '', + 'uploader': 'MeinMMO', + 'uploader_id': 'x2mvqi4', + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'duration': 60, + 'thumbnail': 
'https://s1.dmcdn.net/v/Y3fO61drj56vPB9SS/x1080', + 'tags': ['MeinMMO', 'The First Descendant'], + 'timestamp': 1742124877, + 'upload_date': '20250316', + }, }, { - # YouTube Embed - 'url': 'https://www.msn.com/en-in/money/news/meet-vikram-%E2%80%94-chandrayaan-2s-lander/vi-AAGUr0v', - 'only_matching': True, + # Youtube Embed + 'url': 'https://www.msn.com/en-gb/video/webcontent/web-content/vi-AA1ybFaJ', + 'info_dict': { + 'id': 'kQSChWu95nE', + 'ext': 'mp4', + 'title': '7 Daily Habits to Nurture Your Personal Growth', + 'description': 'md5:6f233c68341b74dee30c8c121924e827', + 'uploader': 'TopThink', + 'uploader_id': '@TopThink', + 'uploader_url': 'https://www.youtube.com/@TopThink', + 'channel': 'TopThink', + 'channel_id': 'UCMlGmHokrQRp-RaNO7aq4Uw', + 'channel_url': 'https://www.youtube.com/channel/UCMlGmHokrQRp-RaNO7aq4Uw', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'duration': 705, + 'thumbnail': 'https://i.ytimg.com/vi/kQSChWu95nE/maxresdefault.jpg', + 'categories': ['Howto & Style'], + 'tags': ['topthink', 'top think', 'personal growth'], + 'timestamp': 1722711620, + 'upload_date': '20240803', + 'playable_in_embed': True, + 'availability': 'public', + 'live_status': 'not_live', + }, }, { - # NBCSports Embed - 'url': 'https://www.msn.com/en-us/money/football_nfl/week-13-preview-redskins-vs-panthers/vi-BBXsCDb', - 'only_matching': True, + # Article with social embed + 'url': 'https://www.msn.com/en-in/news/techandscience/watch-earth-sets-and-rises-behind-moon-in-breathtaking-blue-ghost-video/ar-AA1zKoAc', + 'info_dict': { + 'id': 'AA1zKoAc', + 'title': 'Watch: Earth sets and rises behind Moon in breathtaking Blue Ghost video', + 'description': 'md5:0ad51cfa77e42e7f0c46cf98a619dbbf', + 'uploader': 'India Today', + 'uploader_id': 'AAyFWG', + 'tags': 'count:11', + 'timestamp': 1740485034, + 'upload_date': '20250225', + 'release_timestamp': 1740484875, + 'release_date': '20250225', + 'modified_timestamp': 1740488561, + 'modified_date': '20250225', + }, + 'playlist_count': 1, }] def _real_extract(self, url): - display_id, page_id = self._match_valid_url(url).groups() + locale, display_id, page_id = self._match_valid_url(url).group('locale', 'display_id', 'id') - webpage = self._download_webpage(url, display_id) + json_data = self._download_json( + f'https://assets.msn.com/content/view/v2/Detail/{locale}/{page_id}', page_id) - entries = [] - for _, metadata in re.findall(r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', webpage): - video = self._parse_json(unescapeHTML(metadata), display_id) - - provider_id = video.get('providerId') - player_name = video.get('playerName') - if player_name and provider_id: - entry = None - if player_name == 'AOL': - if provider_id.startswith('http'): - provider_id = self._search_regex( - r'https?://delivery\.vidible\.tv/video/redirect/([0-9a-f]{24})', - provider_id, 'vidible id') - entry = self.url_result( - 'aol-video:' + provider_id, 'Aol', provider_id) - elif player_name == 'Dailymotion': - entry = self.url_result( - 'https://www.dailymotion.com/video/' + provider_id, - 'Dailymotion', provider_id) - elif player_name == 'YouTube': - entry = self.url_result( - provider_id, 'Youtube', provider_id) - elif player_name == 'NBCSports': - entry = self.url_result( - 'http://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/' + provider_id, - 'NBCSportsVPlayer', provider_id) - if entry: - entries.append(entry) - continue - - video_id = video['uuid'] - 
title = video['title'] + common_metadata = traverse_obj(json_data, { + 'title': ('title', {str}), + 'description': (('abstract', ('body', {clean_html})), {str}, filter, any), + 'timestamp': ('createdDateTime', {parse_iso8601}), + 'release_timestamp': ('publishedDateTime', {parse_iso8601}), + 'modified_timestamp': ('updatedDateTime', {parse_iso8601}), + 'thumbnail': ('thumbnail', 'image', 'url', {url_or_none}), + 'duration': ('videoMetadata', 'playTime', {int_or_none}), + 'tags': ('keywords', ..., {str}), + 'uploader': ('provider', 'name', {str}), + 'uploader_id': ('provider', 'id', {str}), + }) + page_type = json_data['type'] + source_url = traverse_obj(json_data, ('sourceHref', {url_or_none})) + if page_type == 'video': + if traverse_obj(json_data, ('thirdPartyVideoPlayer', 'enabled')) and source_url: + return self.url_result(source_url) formats = [] - for file_ in video.get('videoFiles', []): - format_url = file_.get('url') - if not format_url: - continue - if 'format=m3u8-aapl' in format_url: - # m3u8_native should not be used here until - # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed - formats.extend(self._extract_m3u8_formats( - format_url, display_id, 'mp4', - m3u8_id='hls', fatal=False)) - elif 'format=mpd-time-csf' in format_url: - formats.extend(self._extract_mpd_formats( - format_url, display_id, 'dash', fatal=False)) - elif '.ism' in format_url: - if format_url.endswith('.ism'): - format_url += '/manifest' - formats.extend(self._extract_ism_formats( - format_url, display_id, 'mss', fatal=False)) - else: - format_id = file_.get('formatCode') - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'format_id': format_id, - 'width': int_or_none(file_.get('width')), - 'height': int_or_none(file_.get('height')), - 'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)), - 'quality': 1 if format_id == '1001' else None, - }) - subtitles = {} - for file_ in video.get('files', []): - format_url = file_.get('url') - format_code = file_.get('formatCode') - if not format_url or not format_code: - continue - if str(format_code) == '3100': - subtitles.setdefault(file_.get('culture', 'en'), []).append({ - 'ext': determine_ext(format_url, 'ttml'), - 'url': format_url, - }) + for file in traverse_obj(json_data, ('videoMetadata', 'externalVideoFiles', lambda _, v: url_or_none(v['url']))): + file_url = file['url'] + ext = determine_ext(file_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + file_url, page_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + file_url, page_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append( + traverse_obj(file, { + 'url': 'url', + 'format_id': ('format', {str}), + 'filesize': ('fileSize', {int_or_none}), + 'height': ('height', {int_or_none}), + 'width': ('width', {int_or_none}), + })) + for caption in traverse_obj(json_data, ('videoMetadata', 'closedCaptions', lambda _, v: url_or_none(v['href']))): + lang = caption.get('locale') or 'en-us' + subtitles.setdefault(lang, []).append({ + 'url': caption['href'], + 'ext': 'ttml', + }) - entries.append({ - 'id': video_id, + return { + 'id': page_id, 'display_id': display_id, - 'title': title, - 'description': video.get('description'), - 'thumbnail': video.get('headlineImage', {}).get('url'), - 'duration': 
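Review note: in the MSN `common_metadata` mapping above, the description path `(('abstract', ('body', {clean_html})), {str}, filter, any)` tries the plain `abstract` first and falls back to a cleaned `body`, keeping the first non-empty string. A toy run:

```py
from yt_dlp.utils import clean_html
from yt_dlp.utils.traversal import traverse_obj

json_data = {'abstract': '', 'body': '<p>Earth sets behind the Moon</p>'}
# the empty abstract is dropped by filter; any keeps the first survivor
assert traverse_obj(json_data, (
    ('abstract', ('body', {clean_html})), {str}, filter, any),
) == 'Earth sets behind the Moon'
```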
int_or_none(video.get('durationSecs')), - 'uploader': video.get('sourceFriendly'), - 'uploader_id': video.get('providerId'), - 'creator': video.get('creator'), - 'subtitles': subtitles, 'formats': formats, - }) + 'subtitles': subtitles, + **common_metadata, + } + elif page_type == 'webcontent': + if not source_url: + raise ExtractorError('Could not find source URL') + return self.url_result(source_url) + elif page_type == 'article': + entries = [] + for embed_url in traverse_obj(json_data, ('socialEmbeds', ..., 'postUrl', {url_or_none})): + entries.append(self.url_result(embed_url)) - if not entries: - error = unescapeHTML(self._search_regex( - r'data-error=(["\'])(?P<error>.+?)\1', - webpage, 'error', group='error')) - raise ExtractorError(f'{self.IE_NAME} said: {error}', expected=True) + return self.playlist_result(entries, page_id, **common_metadata) - return self.playlist_result(entries, page_id) + raise ExtractorError(f'Unsupported page type: {page_type}') diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py index bbb327e75..e0e49161b 100644 --- a/yt_dlp/extractor/n1.py +++ b/yt_dlp/extractor/n1.py @@ -4,7 +4,9 @@ from ..utils import ( extract_attributes, unified_timestamp, + url_or_none, ) +from ..utils.traversal import traverse_obj class N1InfoAssetIE(InfoExtractor): @@ -35,9 +37,9 @@ class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)' _TESTS = [{ - # Youtube embedded + # YouTube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', - 'md5': '01ddb6646d0fd9c4c7d990aa77fe1c5a', + 'md5': '987ce6fd72acfecc453281e066b87973', 'info_dict': { 'id': 'L5Hd4hQVUpk', 'ext': 'mp4', @@ -45,7 +47,26 @@ class N1InfoIIE(InfoExtractor): 'title': 'Ozmo i USO21, ep. 
13: Novak Đoković – Danil Medvedev | Ključevi Poraza, Budućnost | SPORT KLUB TENIS', 'description': 'md5:467f330af1effedd2e290f10dc31bb8e', 'uploader': 'Sport Klub', - 'uploader_id': 'sportklub', + 'uploader_id': '@sportklub', + 'uploader_url': 'https://www.youtube.com/@sportklub', + 'channel': 'Sport Klub', + 'channel_id': 'UChpzBje9Ro6CComXe3BgNaw', + 'channel_url': 'https://www.youtube.com/channel/UChpzBje9Ro6CComXe3BgNaw', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'duration': 1049, + 'thumbnail': 'https://i.ytimg.com/vi/L5Hd4hQVUpk/maxresdefault.jpg', + 'chapters': 'count:9', + 'categories': ['Sports'], + 'tags': 'count:10', + 'timestamp': 1631522787, + 'playable_in_embed': True, + 'availability': 'public', + 'live_status': 'not_live', }, }, { 'url': 'https://rs.n1info.com/vesti/djilas-los-plan-za-metro-nece-resiti-nijedan-saobracajni-problem/', @@ -55,6 +76,7 @@ class N1InfoIIE(InfoExtractor): 'title': 'Đilas: Predlog izgradnje metroa besmislen; SNS odbacuje navode', 'upload_date': '20210924', 'timestamp': 1632481347, + 'thumbnail': 'http://n1info.rs/wp-content/themes/ucnewsportal-n1/dist/assets/images/placeholder-image-video.jpg', }, 'params': { 'skip_download': True, @@ -67,6 +89,7 @@ class N1InfoIIE(InfoExtractor): 'title': 'Zadnji dnevi na kopališču Ilirija: “Ilirija ni umrla, ubili so jo”', 'timestamp': 1632567630, 'upload_date': '20210925', + 'thumbnail': 'https://n1info.si/wp-content/uploads/2021/09/06/1630945843-tomaz3.png', }, 'params': { 'skip_download': True, @@ -81,6 +104,14 @@ class N1InfoIIE(InfoExtractor): 'upload_date': '20210924', 'timestamp': 1632448649.0, 'uploader': 'YouLotWhatDontStop', + 'display_id': 'pu9wbx', + 'channel_id': 'serbia', + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'duration': 134, + 'thumbnail': 'https://external-preview.redd.it/5nmmawSeGx60miQM3Iq-ueC9oyCLTLjjqX-qqY8uRsc.png?format=pjpg&auto=webp&s=2f973400b04d23f871b608b178e47fc01f9b8f1d', }, 'params': { 'skip_download': True, @@ -93,6 +124,7 @@ class N1InfoIIE(InfoExtractor): 'title': 'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)', 'upload_date': '20211102', 'timestamp': 1635861677, + 'thumbnail': 'https://nova.rs/wp-content/uploads/2021/11/02/1635860298-TNJG_Ana_Brnabic_i_Zaklina_Tatalovic_100_dana_Vlade_GP.jpg', }, }, { 'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/', @@ -104,6 +136,16 @@ class N1InfoIIE(InfoExtractor): 'timestamp': 1687290536, 'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg', }, + }, { + 'url': 'https://n1info.rs/vesti/vuciceva-turneja-po-srbiji-najavljuje-kontrarevoluciju-preti-svom-narodu-vredja-novinare/', + 'info_dict': { + 'id': '2025974', + 'ext': 'mp4', + 'title': 'Vučićeva turneja po Srbiji: Najavljuje kontrarevoluciju, preti svom narodu, vređa novinare', + 'thumbnail': 'https://cdn-uc.brid.tv/live/partners/26827/snapshot/2025974_fhd_67c4a23280a81_1740939826.jpg', + 'timestamp': 1740939936, + 'upload_date': '20250302', + }, }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, @@ -115,11 +157,11 @@ def _real_extract(self, url): title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) - plugin_data = 
self._html_search_meta('BridPlugin', webpage) + plugin_data = re.findall(r'\$bp\("(?:Brid|TargetVideo)_\d+",\s(.+)\);', webpage) entries = [] if plugin_data: site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id') - for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage): + for video_data in plugin_data: video_id = self._parse_json(video_data, title)['video'] entries.append({ 'id': video_id, @@ -140,7 +182,7 @@ def _real_extract(self, url): 'url': video_data.get('data-url'), 'id': video_data.get('id'), 'title': title, - 'thumbnail': video_data.get('data-thumbnail'), + 'thumbnail': traverse_obj(video_data, (('data-thumbnail', 'data-default_thumbnail'), {url_or_none}, any)), 'timestamp': timestamp, 'ie_key': 'N1InfoAsset', }) @@ -152,7 +194,7 @@ def _real_extract(self, url): if url.startswith('https://www.youtube.com'): entries.append(self.url_result(url, ie='Youtube')) elif url.startswith('https://www.redditmedia.com'): - entries.append(self.url_result(url, ie='RedditR')) + entries.append(self.url_result(url, ie='Reddit')) return { '_type': 'playlist', diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 8f6fb22b1..d9aded09e 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -736,7 +736,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) nbc_data = self._search_json( - r'<script>\s*var\s+nbc\s*=', webpage, 'NBC JSON data', video_id) + r'(?:<script>\s*var\s+nbc\s*=|Object\.assign\(nbc,)', webpage, 'NBC JSON data', video_id) pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 2f04de9e2..52ba6c417 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -13,30 +13,89 @@ ExtractorError, OnDemandPagedList, clean_html, + determine_ext, float_or_none, int_or_none, - join_nonempty, parse_duration, parse_iso8601, + parse_qs, parse_resolution, qualities, remove_start, str_or_none, - traverse_obj, try_get, unescapeHTML, + unified_timestamp, update_url_query, + url_basename, url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import find_element, traverse_obj -class NiconicoIE(InfoExtractor): +class NiconicoBaseIE(InfoExtractor): + _GEO_BYPASS = False + _GEO_COUNTRIES = ['JP'] + _LOGIN_BASE = 'https://account.nicovideo.jp' + _NETRC_MACHINE = 'niconico' + + @property + def is_logged_in(self): + return bool(self._get_cookies('https://www.nicovideo.jp').get('user_session')) + + def _raise_login_error(self, message, expected=True): + raise ExtractorError(f'Unable to login: {message}', expected=expected) + + def _perform_login(self, username, password): + if self.is_logged_in: + return + + self._request_webpage( + f'{self._LOGIN_BASE}/login', None, 'Requesting session cookies') + webpage = self._download_webpage( + f'{self._LOGIN_BASE}/login/redirector', None, + 'Logging in', 'Unable to log in', headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': f'{self._LOGIN_BASE}/login', + }, data=urlencode_postdata({ + 'mail_tel': username, + 'password': password, + })) + + if self.is_logged_in: + return + elif err_msg := traverse_obj(webpage, ( + {find_element(cls='notice error')}, {find_element(cls='notice__text')}, {clean_html}, + )): + self._raise_login_error(err_msg or 'Invalid username or password') + elif 'oneTimePw' in webpage: + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', webpage, 'post url', 
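Review note: the N1 change above scrapes the `$bp(...)` player bootstrap calls directly and now accepts both `Brid_*` and `TargetVideo_*` ids. A toy run of the new regex against a fabricated page snippet:

```py
import re

webpage = '<script>$bp("TargetVideo_40", {"video": 2025974, "site": 26827});</script>'
matches = re.findall(r'\$bp\("(?:Brid|TargetVideo)_\d+",\s(.+)\);', webpage)
assert matches == ['{"video": 2025974, "site": 26827}']  # fed to _parse_json
```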
group='url') + mfa, urlh = self._download_webpage_handle( + urljoin(self._LOGIN_BASE, post_url), None, + 'Performing MFA', 'Unable to complete MFA', headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }, data=urlencode_postdata({ + 'otp': self._get_tfa_info('6 digit number shown on app'), + })) + if self.is_logged_in: + return + elif 'error-code' in parse_qs(urlh.url): + err_msg = traverse_obj(mfa, ({find_element(cls='pageMainMsg')}, {clean_html})) + self._raise_login_error(err_msg or 'MFA session expired') + elif 'formError' in mfa: + err_msg = traverse_obj(mfa, ( + {find_element(cls='formError')}, {find_element(tag='div')}, {clean_html})) + self._raise_login_error(err_msg or 'MFA challenge failed') + + self._raise_login_error('Unexpected login error', expected=False) + + +class NiconicoIE(NiconicoBaseIE): IE_NAME = 'niconico' IE_DESC = 'ニコニコ動画' - _GEO_COUNTRIES = ['JP'] - _GEO_BYPASS = False _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', @@ -176,229 +235,6 @@ class NiconicoIE(InfoExtractor): }] _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)' - _NETRC_MACHINE = 'niconico' - _API_HEADERS = { - 'X-Frontend-ID': '6', - 'X-Frontend-Version': '0', - 'X-Niconico-Language': 'en-us', - 'Referer': 'https://www.nicovideo.jp/', - 'Origin': 'https://www.nicovideo.jp', - } - - def _perform_login(self, username, password): - login_ok = True - login_form_strs = { - 'mail_tel': username, - 'password': password, - } - self._request_webpage( - 'https://account.nicovideo.jp/login', None, - note='Acquiring Login session') - page = self._download_webpage( - 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(login_form_strs), - headers={ - 'Referer': 'https://account.nicovideo.jp/login', - 'Content-Type': 'application/x-www-form-urlencoded', - }) - if 'oneTimePw' in page: - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, 'post url', group='url') - page = self._download_webpage( - urljoin('https://account.nicovideo.jp', post_url), None, - note='Performing MFA', errnote='Unable to complete MFA', - data=urlencode_postdata({ - 'otp': self._get_tfa_info('6 digits code'), - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - if 'oneTimePw' in page or 'formError' in page: - err_msg = self._html_search_regex( - r'formError["\']+>(.*?)</div>', page, 'form_error', - default='There\'s an error but the message can\'t be parsed.', - flags=re.DOTALL) - self.report_warning(f'Unable to log in: MFA challenge failed, "{err_msg}"') - return False - login_ok = 'class="notice error"' not in page - if not login_ok: - self.report_warning('Unable to log in: bad username or password') - return login_ok - - def _get_heartbeat_info(self, info_dict): - video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') - dmc_protocol = info_dict['expected_protocol'] - - api_data = ( - info_dict.get('_api_data') - or self._parse_json( - self._html_search_regex( - 'data-api-data="([^"]+)"', - self._download_webpage('https://www.nicovideo.jp/watch/' + video_id, video_id), - 'API data', default='{}'), - video_id)) - - session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session']) - session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) - - def ping(): - tracking_id = traverse_obj(api_data, ('media', 
'delivery', 'trackingId')) - if tracking_id: - tracking_url = update_url_query('https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', {'t': tracking_id}) - watch_request_response = self._download_json( - tracking_url, video_id, - note='Acquiring permission for downloading video', fatal=False, - headers=self._API_HEADERS) - if traverse_obj(watch_request_response, ('meta', 'status')) != 200: - self.report_warning('Failed to acquire permission for playing video. Video download may fail.') - - yesno = lambda x: 'yes' if x else 'no' - - if dmc_protocol == 'http': - protocol = 'http' - protocol_parameters = { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), - 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), - }, - } - elif dmc_protocol == 'hls': - protocol = 'm3u8' - segment_duration = try_get(self._configuration_arg('segment_duration'), lambda x: int(x[0])) or 6000 - parsed_token = self._parse_json(session_api_data['token'], video_id) - encryption = traverse_obj(api_data, ('media', 'delivery', 'encryption')) - protocol_parameters = { - 'hls_parameters': { - 'segment_duration': segment_duration, - 'transfer_preset': '', - 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), - 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), - }, - } - if 'hls_encryption' in parsed_token and encryption: - protocol_parameters['hls_parameters']['encryption'] = { - parsed_token['hls_encryption']: { - 'encrypted_key': encryption['encryptedKey'], - 'key_uri': encryption['keyUri'], - }, - } - else: - protocol = 'm3u8_native' - else: - raise ExtractorError(f'Unsupported DMC protocol: {dmc_protocol}') - - session_response = self._download_json( - session_api_endpoint['url'], video_id, - query={'_format': 'json'}, - headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for {}'.format(info_dict['format_id']), - data=json.dumps({ - 'session': { - 'client_info': { - 'player_id': session_api_data.get('playerId'), - }, - 'content_auth': { - 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]), - 'content_key_timeout': session_api_data.get('contentKeyTimeout'), - 'service_id': 'nicovideo', - 'service_user_id': session_api_data.get('serviceUserId'), - }, - 'content_id': session_api_data.get('contentId'), - 'content_src_id_sets': [{ - 'content_src_ids': [{ - 'src_id_to_mux': { - 'audio_src_ids': [audio_src_id], - 'video_src_ids': [video_src_id], - }, - }], - }], - 'content_type': 'movie', - 'content_uri': '', - 'keep_method': { - 'heartbeat': { - 'lifetime': session_api_data.get('heartbeatLifetime'), - }, - }, - 'priority': session_api_data['priority'], - 'protocol': { - 'name': 'http', - 'parameters': { - 'http_parameters': { - 'parameters': protocol_parameters, - }, - }, - }, - 'recipe_id': session_api_data.get('recipeId'), - 'session_operation_auth': { - 'session_operation_auth_by_signature': { - 'signature': session_api_data.get('signature'), - 'token': session_api_data.get('token'), - }, - }, - 'timing_constraint': 'unlimited', - }, - }).encode()) - - info_dict['url'] = session_response['data']['session']['content_uri'] - info_dict['protocol'] = protocol - - # get heartbeat info - heartbeat_info_dict = { - 'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT', - 'data': json.dumps(session_response['data']), - # interval, convert milliseconds to seconds, then halve to make a buffer. 
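Review note on the removed heartbeat interval just below: `scale=3000` folds the unit conversion and the safety margin into one division; despite the inline comment saying "halve", the divisor is 1000 × 3, so pings fire at a third of the lease:

```py
from yt_dlp.utils import float_or_none

# hypothetical 120000 ms lease from the session API
assert float_or_none(120000, scale=3000) == 40.0  # ping every 40 s
```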
- 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000), - 'ping': ping, - } - - return info_dict, heartbeat_info_dict - - def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dmc_protocol): - - if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): - return None - - format_id = '-'.join( - [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol]) - - vid_qual_label = traverse_obj(video_quality, ('metadata', 'label')) - - return { - 'url': 'niconico_dmc:{}/{}/{}'.format(video_id, video_quality['id'], audio_quality['id']), - 'format_id': format_id, - 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '), - 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'acodec': 'aac', - 'vcodec': 'h264', - **traverse_obj(audio_quality, ('metadata', { - 'abr': ('bitrate', {float_or_none(scale=1000)}), - 'asr': ('samplingRate', {int_or_none}), - })), - **traverse_obj(video_quality, ('metadata', { - 'vbr': ('bitrate', {float_or_none(scale=1000)}), - 'height': ('resolution', 'height', {int_or_none}), - 'width': ('resolution', 'width', {int_or_none}), - })), - 'quality': -2 if 'low' in video_quality['id'] else None, - 'protocol': 'niconico_dmc', - 'expected_protocol': dmc_protocol, # XXX: This is not a documented field - 'http_headers': { - 'Origin': 'https://www.nicovideo.jp', - 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, - }, - } - - def _yield_dmc_formats(self, api_data, video_id): - dmc_data = traverse_obj(api_data, ('media', 'delivery', 'movie')) - audios = traverse_obj(dmc_data, ('audios', ..., {dict})) - videos = traverse_obj(dmc_data, ('videos', ..., {dict})) - protocols = traverse_obj(dmc_data, ('session', 'protocols', ..., {str})) - if not all((audios, videos, protocols)): - return - - for audio_quality, video_quality, protocol in itertools.product(audios, videos, protocols): - if fmt := self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol): - yield fmt def _yield_dms_formats(self, api_data, video_id): fmt_filter = lambda _, v: v['isAvailable'] and v['id'] @@ -430,6 +266,7 @@ def _yield_dms_formats(self, api_data, video_id): 'format_id': ('id', {str}), 'abr': ('bitRate', {float_or_none(scale=1000)}), 'asr': ('samplingRate', {int_or_none}), + 'quality': ('qualityLevel', {int_or_none}), }), get_all=False), 'acodec': 'aac', } @@ -441,7 +278,9 @@ def _yield_dms_formats(self, api_data, video_id): min_abr = min(traverse_obj(audios, (..., 'bitRate', {float_or_none})), default=0) / 1000 for video_fmt in video_fmts: video_fmt['tbr'] -= min_abr - video_fmt['format_id'] = f'video-{video_fmt["tbr"]:.0f}' + video_fmt['format_id'] = url_basename(video_fmt['url']).rpartition('.')[0] + video_fmt['quality'] = traverse_obj(videos, ( + lambda _, v: v['id'] == video_fmt['format_id'], 'qualityLevel', {int_or_none}, any)) or -1 yield video_fmt def _real_extract(self, url): @@ -478,8 +317,8 @@ def _real_extract(self, url): 'needs_premium': ('isPremium', {bool}), 'needs_subscription': ('isAdmission', {bool}), })) or {'needs_auth': True})) - formats = [*self._yield_dmc_formats(api_data, video_id), - *self._yield_dms_formats(api_data, video_id)] + + formats = list(self._yield_dms_formats(api_data, video_id)) if not formats: fail_msg = clean_html(self._html_search_regex( r'<p[^>]+\bclass="fail-message"[^>]*>(?P<msg>.+?)</p>', @@ -914,7 +753,7 @@ def _real_extract(self, url): return 
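Review note: in `_yield_dms_formats` above, the video `format_id` is now taken from the manifest file name rather than recomputed from the bitrate, which keeps it stable for the `qualityLevel` lookup. The derivation, with a made-up DMS URL:

```py
from yt_dlp.utils import url_basename

url = 'https://delivery.example.nicovideo.jp/hls/video-h264-1080p.m3u8?sig=abc'
# url_basename works on the URL path, so the query string is ignored
assert url_basename(url).rpartition('.')[0] == 'video-h264-1080p'
```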
self.playlist_result(self._entries(list_id), list_id) -class NiconicoLiveIE(InfoExtractor): +class NiconicoLiveIE(NiconicoBaseIE): IE_NAME = 'niconico:live' IE_DESC = 'ニコニコ生放送' _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)' @@ -979,6 +818,7 @@ def _real_extract(self, url): 'quality': 'abr', 'protocol': 'hls+fmp4', 'latency': latency, + 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, 'room': { @@ -999,6 +839,7 @@ def _real_extract(self, url): if data.get('type') == 'stream': m3u8_url = data['data']['uri'] qualities = data['data']['availableQualities'] + cookies = data['data']['cookies'] break elif data.get('type') == 'disconnect': self.write_debug(recv) @@ -1033,9 +874,15 @@ def _real_extract(self, url): thumbnails.append({ 'id': f'{name}_{width}x{height}', 'url': img_url, + 'ext': traverse_obj(parse_qs(img_url), ('image', 0, {determine_ext(default_ext='jpg')})), **res, }) + for cookie in cookies: + self._set_cookie( + cookie['domain'], cookie['name'], cookie['value'], + expire_time=unified_timestamp(cookie['expires']), path=cookie['path'], secure=cookie['secure']) + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) for fmt, q in zip(formats, reversed(qualities[1:])): fmt.update({ diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py index f17531e62..7b0cb77a7 100644 --- a/yt_dlp/extractor/ninenow.py +++ b/yt_dlp/extractor/ninenow.py @@ -1,34 +1,46 @@ +import json +import re + +from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..utils import ( - ExtractorError, float_or_none, int_or_none, - smuggle_url, + parse_iso8601, + parse_resolution, str_or_none, - try_get, - unified_strdate, - unified_timestamp, + url_or_none, ) +from ..utils.traversal import require, traverse_obj, value class NineNowIE(InfoExtractor): IE_NAME = '9now.com.au' - _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)' - _GEO_COUNTRIES = ['AU'] + _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/?#]+/){2}(?P<id>(?P<type>clip|episode)-[^/?#]+)' + _GEO_BYPASS = False _TESTS = [{ # clip - 'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc', - 'md5': '17cf47d63ec9323e562c9957a968b565', + 'url': 'https://www.9now.com.au/today/season-2025/clip-cm8hw9h5z00080hquqa5hszq7', 'info_dict': { - 'id': '16801', + 'id': '6370295582112', 'ext': 'mp4', - 'title': 'St. 
Kilda\'s Joey Montagna on the potential for a player\'s strike', - 'description': 'Is a boycott of the NAB Cup "on the table"?', + 'title': 'Would Karl Stefanovic be able to land a plane?', + 'description': 'The Today host\'s skills are put to the test with the latest simulation tech.', 'uploader_id': '4460760524001', - 'upload_date': '20160713', - 'timestamp': 1468421266, + 'duration': 197.376, + 'tags': ['flights', 'technology', 'Karl Stefanovic'], + 'season': 'Season 2025', + 'season_number': 2025, + 'series': 'TODAY', + 'timestamp': 1742507988, + 'upload_date': '20250320', + 'release_timestamp': 1742507983, + 'release_date': '20250320', + 'thumbnail': r're:https?://.+/1920x0/.+\.jpg', + }, + 'params': { + 'skip_download': 'HLS/DASH fragments and mp4 URLs are geo-restricted; only available in AU', }, - 'skip': 'Only available in Australia', }, { # episode 'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19', @@ -41,7 +53,7 @@ class NineNowIE(InfoExtractor): # episode of series 'url': 'https://www.9now.com.au/lego-masters/season-3/episode-3', 'info_dict': { - 'id': '6249614030001', + 'id': '6308830406112', 'title': 'Episode 3', 'ext': 'mp4', 'season_number': 3, @@ -50,72 +62,87 @@ class NineNowIE(InfoExtractor): 'uploader_id': '4460760524001', 'timestamp': 1619002200, 'upload_date': '20210421', + 'duration': 3574.085, + 'thumbnail': r're:https?://.+/1920x0/.+\.jpg', + 'tags': ['episode'], + 'series': 'Lego Masters', + 'season': 'Season 3', + 'episode': 'Episode 3', + 'release_timestamp': 1619002200, + 'release_date': '20210421', }, - 'expected_warnings': ['Ignoring subtitle tracks'], 'params': { - 'skip_download': True, + 'skip_download': 'HLS/DASH fragments and mp4 URLs are geo-restricted; only available in AU', + }, + }, { + 'url': 'https://www.9now.com.au/married-at-first-sight/season-12/episode-1', + 'info_dict': { + 'id': '6367798770112', + 'ext': 'mp4', + 'title': 'Episode 1', + 'description': r're:The cultural sensation of Married At First Sight returns with our first weddings! 
.{90}$', + 'uploader_id': '4460760524001', + 'duration': 5415.079, + 'thumbnail': r're:https?://.+/1920x0/.+\.png', + 'tags': ['episode'], + 'season': 'Season 12', + 'season_number': 12, + 'episode': 'Episode 1', + 'episode_number': 1, + 'series': 'Married at First Sight', + 'timestamp': 1737973800, + 'upload_date': '20250127', + 'release_timestamp': 1737973800, + 'release_date': '20250127', + }, + 'params': { + 'skip_download': 'HLS/DASH fragments and mp4 URLs are geo-restricted; only available in AU', }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}' + + # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv and yt_dlp.extractor.goplay + def _find_json(self, s): + return self._search_json( + r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None) def _real_extract(self, url): - display_id = self._match_id(url) + display_id, video_type = self._match_valid_url(url).group('id', 'type') webpage = self._download_webpage(url, display_id) - page_data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.*?});', webpage, - 'page data', default='{}'), display_id, fatal=False) - if not page_data: - page_data = self._parse_json(self._parse_json(self._search_regex( - r'window\.__data\s*=\s*JSON\.parse\s*\(\s*(".+?")\s*\)\s*;', - webpage, 'page data'), display_id), display_id) - for kind in ('episode', 'clip'): - current_key = page_data.get(kind, {}).get( - f'current{kind.capitalize()}Key') - if not current_key: - continue - cache = page_data.get(kind, {}).get(f'{kind}Cache', {}) - if not cache: - continue - common_data = { - 'episode': (cache.get(current_key) or next(iter(cache.values())))[kind], - 'season': (cache.get(current_key) or next(iter(cache.values()))).get('season', None), - } - break - else: - raise ExtractorError('Unable to find video data') + common_data = traverse_obj( + re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage), + (..., {json.loads}, ..., {self._find_json}, + lambda _, v: v['payload'][video_type]['slug'] == display_id, + 'payload', any, {require('video data')})) - if not self.get_param('allow_unplayable_formats') and try_get(common_data, lambda x: x['episode']['video']['drm'], bool): + if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})): self.report_drm(display_id) - brightcove_id = try_get( - common_data, lambda x: x['episode']['video']['brightcoveId'], str) or 'ref:{}'.format(common_data['episode']['video']['referenceId']) - video_id = str_or_none(try_get(common_data, lambda x: x['episode']['video']['id'])) or brightcove_id - - title = try_get(common_data, lambda x: x['episode']['name'], str) - season_number = try_get(common_data, lambda x: x['season']['seasonNumber'], int) - episode_number = try_get(common_data, lambda x: x['episode']['episodeNumber'], int) - timestamp = unified_timestamp(try_get(common_data, lambda x: x['episode']['airDate'], str)) - release_date = unified_strdate(try_get(common_data, lambda x: x['episode']['availability'], str)) - thumbnails_data = try_get(common_data, lambda x: x['episode']['image']['sizes'], dict) or {} - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_id[1:]), - } for thumbnail_id, thumbnail_url in thumbnails_data.items()] + brightcove_id = traverse_obj(common_data, ( + video_type, 'video', ( + 
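Review note: the 9Now rewrite above parses Next.js v15 flight data in two stages: `re.findall` collects every `self.__next_f.push([...])` array, `json.loads` decodes each, and `_find_json` then digs the `payload` object out of the string members. The first stage on a fabricated chunk:

```py
import json
import re

webpage = '<script>self.__next_f.push([1, "chunk"])</script>'  # hypothetical flight chunk
chunks = re.findall(
    r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage)
assert [json.loads(c) for c in chunks] == [[1, 'chunk']]
```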
('brightcoveId', {str}), + ('referenceId', {str}, {lambda x: f'ref:{x}' if x else None}), + ), any, {require('brightcove ID')})) return { '_type': 'url_transparent', - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': self._GEO_COUNTRIES}), - 'id': video_id, - 'title': title, - 'description': try_get(common_data, lambda x: x['episode']['description'], str), - 'duration': float_or_none(try_get(common_data, lambda x: x['episode']['video']['duration'], float), 1000), - 'thumbnails': thumbnails, - 'ie_key': 'BrightcoveNew', - 'season_number': season_number, - 'episode_number': episode_number, - 'timestamp': timestamp, - 'release_date': release_date, + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': self.BRIGHTCOVE_URL_TEMPLATE.format(brightcove_id), + **traverse_obj(common_data, { + 'id': (video_type, 'video', 'id', {int}, ({str_or_none}, {value(brightcove_id)}), any), + 'title': (video_type, 'name', {str}), + 'description': (video_type, 'description', {str}), + 'duration': (video_type, 'video', 'duration', {float_or_none(scale=1000)}), + 'tags': (video_type, 'tags', ..., 'name', {str}, all, filter), + 'series': ('tvSeries', 'name', {str}), + 'season_number': ('season', 'seasonNumber', {int_or_none}), + 'episode_number': ('episode', 'episodeNumber', {int_or_none}), + 'timestamp': ('episode', 'airDate', {parse_iso8601}), + 'release_timestamp': (video_type, 'availability', {parse_iso8601}), + 'thumbnails': (video_type, 'image', 'sizes', {dict.items}, lambda _, v: url_or_none(v[1]), { + 'id': 0, + 'url': 1, + 'width': (1, {parse_resolution}, 'width'), + }), + }), } diff --git a/yt_dlp/extractor/on24.py b/yt_dlp/extractor/on24.py index 05218e9de..1dfc25a7d 100644 --- a/yt_dlp/extractor/on24.py +++ b/yt_dlp/extractor/on24.py @@ -11,12 +11,15 @@ class On24IE(InfoExtractor): IE_NAME = 'on24' IE_DESC = 'ON24' - _VALID_URL = r'''(?x) - https?://event\.on24\.com/(?: - wcc/r/(?P<id_1>\d{7})/(?P<key_1>[0-9A-F]{32})| - eventRegistration/(?:console/EventConsoleApollo|EventLobbyServlet\?target=lobby30) - \.jsp\?(?:[^/#?]*&)?eventid=(?P<id_2>\d{7})[^/#?]*&key=(?P<key_2>[0-9A-F]{32}) - )''' + _ID_RE = r'(?P<id>\d{7})' + _KEY_RE = r'(?P<key>[0-9A-F]{32})' + _URL_BASE_RE = r'https?://event\.on24\.com' + _URL_QUERY_RE = rf'(?:[^#]*&)?eventid={_ID_RE}&(?:[^#]+&)?key={_KEY_RE}' + _VALID_URL = [ + rf'{_URL_BASE_RE}/wcc/r/{_ID_RE}/{_KEY_RE}', + rf'{_URL_BASE_RE}/eventRegistration/console/(?:EventConsoleApollo\.jsp|apollox/mainEvent/?)\?{_URL_QUERY_RE}', + rf'{_URL_BASE_RE}/eventRegistration/EventLobbyServlet/?\?{_URL_QUERY_RE}', + ] _TESTS = [{ 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?uimode=nextgeneration&eventid=2197467&sessionid=1&key=5DF57BE53237F36A43B478DD36277A84&contenttype=A&eventuserid=305999&playerwidth=1000&playerheight=650&caller=previewLobby&text_language_id=en&format=fhaudio&newConsole=false', @@ -34,12 +37,16 @@ class On24IE(InfoExtractor): }, { 'url': 'https://event.on24.com/eventRegistration/console/EventConsoleApollo.jsp?&eventid=2639291&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=82829018E813065A122363877975752E&newConsole=true&nxChe=true&newTabCon=true&text_language_id=en&playerwidth=748&playerheight=526&eventuserid=338788762&contenttype=A&mediametricsessionid=384764716&mediametricid=3558192&usercd=369267058&mode=launch', 'only_matching': True, + }, { + 'url': 
'https://event.on24.com/eventRegistration/EventLobbyServlet?target=reg20.jsp&eventid=3543176&key=BC0F6B968B67C34B50D461D40FDB3E18&groupId=3143628', + 'only_matching': True, + }, { + 'url': 'https://event.on24.com/eventRegistration/console/apollox/mainEvent?&eventid=4843671&sessionid=1&username=&partnerref=&format=fhvideo1&mobile=&flashsupportedmobiledevice=&helpcenter=&key=4EAC9B5C564CC98FF29E619B06A2F743&newConsole=true&nxChe=true&newTabCon=true&consoleEarEventConsole=false&consoleEarCloudApi=false&text_language_id=en&playerwidth=748&playerheight=526&referrer=https%3A%2F%2Fevent.on24.com%2Finterface%2Fregistration%2Fautoreg%2Findex.html%3Fsessionid%3D1%26eventid%3D4843671%26key%3D4EAC9B5C564CC98FF29E619B06A2F743%26email%3D000a3e42-7952-4dd6-8f8a-34c38ea3cf02%2540platform%26firstname%3Ds%26lastname%3Ds%26deletecookie%3Dtrue%26event_email%3DN%26marketing_email%3DN%26std1%3D0642572014177%26std2%3D0642572014179%26std3%3D550165f7-a44e-4725-9fe6-716f89908c2b%26std4%3D0&eventuserid=745776448&contenttype=A&mediametricsessionid=640613707&mediametricid=6810717&usercd=745776448&mode=launch', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - event_id = mobj.group('id_1') or mobj.group('id_2') - event_key = mobj.group('key_1') or mobj.group('key_2') + event_id, event_key = self._match_valid_url(url).group('id', 'key') event_data = self._download_json( 'https://event.on24.com/apic/utilApp/EventConsoleCachedServlet', diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index b4f1c7d85..4c19ab684 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -67,7 +67,7 @@ def _extract_movie(self, webpage, video_id, name, is_live): class OpenRecIE(OpenRecBaseIE): IE_NAME = 'openrec' - _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.openrec.tv/live/2p8v31qe4zy', 'only_matching': True, @@ -85,7 +85,7 @@ def _real_extract(self, url): class OpenRecCaptureIE(OpenRecBaseIE): IE_NAME = 'openrec:capture' - _VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.openrec.tv/capture/l9nk2x4gn14', 'only_matching': True, @@ -129,7 +129,7 @@ def _real_extract(self, url): class OpenRecMovieIE(OpenRecBaseIE): IE_NAME = 'openrec:movie' - _VALID_URL = r'https?://(?:www\.)?openrec\.tv/movie/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?openrec\.tv/movie/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.openrec.tv/movie/nqz5xl5km8v', 'info_dict': { @@ -141,6 +141,9 @@ class OpenRecMovieIE(OpenRecBaseIE): 'uploader_id': 'taiki_to_kazuhiro', 'timestamp': 1638856800, }, + }, { + 'url': 'https://www.openrec.tv/movie/2p8vvex548y?playlist_id=98brq96vvsgn2nd', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 91f105519..9f307a53e 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -14,8 +14,9 @@ int_or_none, parse_qs, srt_subtitles_timecode, - traverse_obj, + url_or_none, ) +from ..utils.traversal import traverse_obj class PanoptoBaseIE(InfoExtractor): @@ -345,21 +346,16 @@ def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs subtitles = {} for stream in streams or []: stream_formats = [] - http_stream_url = stream.get('StreamHttpUrl') - stream_url = stream.get('StreamUrl') - - 
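
For context on the traversal idiom this Panopto hunk switches to: branching over several keys with a transform drops missing or invalid values, and set() deduplicates identical stream URLs. A minimal standalone sketch (the sample dict is hypothetical; assumes yt-dlp is importable):

    from yt_dlp.utils import url_or_none
    from yt_dlp.utils.traversal import traverse_obj

    # Both keys point at the same URL here, so set() collapses them to one
    # entry; missing keys and non-URL values are dropped along the way
    stream = {'StreamHttpUrl': 'https://host/clip.mp4', 'StreamUrl': 'https://host/clip.mp4'}
    urls = set(traverse_obj(stream, (('StreamHttpUrl', 'StreamUrl'), {url_or_none})))
    assert urls == {'https://host/clip.mp4'}
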
if http_stream_url: - stream_formats.append({'url': http_stream_url}) - - if stream_url: + for stream_url in set(traverse_obj(stream, (('StreamHttpUrl', 'StreamUrl'), {url_or_none}))): media_type = stream.get('ViewerMediaFileTypeName') if media_type in ('hls', ): - m3u8_formats, stream_subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, video_id) - stream_formats.extend(m3u8_formats) - subtitles = self._merge_subtitles(subtitles, stream_subtitles) + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, m3u8_id='hls', fatal=False) + stream_formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: stream_formats.append({ 'url': stream_url, + 'ext': media_type, }) for fmt in stream_formats: fmt.update({ diff --git a/yt_dlp/extractor/parti.py b/yt_dlp/extractor/parti.py new file mode 100644 index 000000000..acadefc4e --- /dev/null +++ b/yt_dlp/extractor/parti.py @@ -0,0 +1,101 @@ +from .common import InfoExtractor +from ..utils import UserNotLive, int_or_none, parse_iso8601, url_or_none, urljoin +from ..utils.traversal import traverse_obj + + +class PartiBaseIE(InfoExtractor): + def _call_api(self, path, video_id, note=None): + return self._download_json( + f'https://api-backend.parti.com/parti_v2/profile/{path}', video_id, note) + + +class PartiVideoIE(PartiBaseIE): + IE_NAME = 'parti:video' + _VALID_URL = r'https?://(?:www\.)?parti\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://parti.com/video/66284', + 'info_dict': { + 'id': '66284', + 'ext': 'mp4', + 'title': 'NOW LIVE ', + 'upload_date': '20250327', + 'categories': ['Gaming'], + 'thumbnail': 'https://assets.parti.com/351424_eb9e5250-2821-484a-9c5f-ca99aa666c87.png', + 'channel': 'ItZTMGG', + 'timestamp': 1743044379, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'get_livestream_channel_info/recent/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + urljoin('https://watch.parti.com', data['livestream_recording']), video_id, 'mp4'), + **traverse_obj(data, { + 'title': ('event_title', {str}), + 'channel': ('user_name', {str}), + 'thumbnail': ('event_file', {url_or_none}), + 'categories': ('category_name', {str}, filter, all), + 'timestamp': ('event_start_ts', {int_or_none}), + }), + } + + +class PartiLivestreamIE(PartiBaseIE): + IE_NAME = 'parti:livestream' + _VALID_URL = r'https?://(?:www\.)?parti\.com/creator/(?P<service>[\w]+)/(?P<id>[\w/-]+)' + _TESTS = [{ + 'url': 'https://parti.com/creator/parti/Capt_Robs_Adventures', + 'info_dict': { + 'id': 'Capt_Robs_Adventures', + 'ext': 'mp4', + 'title': r"re:I'm Live on Parti \d{4}-\d{2}-\d{2} \d{2}:\d{2}", + 'view_count': int, + 'thumbnail': r're:https://assets\.parti\.com/.+\.png', + 'timestamp': 1743879776, + 'upload_date': '20250405', + 'live_status': 'is_live', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://parti.com/creator/discord/sazboxgaming/0', + 'only_matching': True, + }] + + def _real_extract(self, url): + service, creator_slug = self._match_valid_url(url).group('service', 'id') + + encoded_creator_slug = creator_slug.replace('/', '%23') + creator_id = self._call_api( + f'get_user_by_social_media/{service}/{encoded_creator_slug}', + creator_slug, note='Fetching user ID') + + data = self._call_api( + f'get_livestream_channel_info/{creator_id}', creator_id, + note='Fetching user profile feed')['channel_info'] + + if not traverse_obj(data, ('channel', 'is_live', 
{bool})): + raise UserNotLive(video_id=creator_id) + + channel_info = data['channel'] + + return { + 'id': creator_slug, + 'formats': self._extract_m3u8_formats( + channel_info['playback_url'], creator_slug, live=True, query={ + 'token': channel_info['playback_auth_token'], + 'player_version': '1.17.0', + }), + 'is_live': True, + **traverse_obj(data, { + 'title': ('livestream_event_info', 'event_name', {str}), + 'description': ('livestream_event_info', 'event_description', {str}), + 'thumbnail': ('livestream_event_info', 'livestream_preview_file', {url_or_none}), + 'timestamp': ('stream', 'start_time', {parse_iso8601}), + 'view_count': ('stream', 'viewer_count', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 2f839a2e9..53b199415 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -501,7 +501,7 @@ def _extract_webpage(self, url): r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ - r'\bclass="passportcoveplayer"[^>]+\bdata-media="(\d+)', # https://www.thirteen.org/programs/the-woodwrights-shop/who-wrote-the-book-of-sloyd-fggvvq/ + r'\sclass="passportcoveplayer"[^>]*\sdata-media="(\d+)', # https://www.thirteen.org/programs/the-woodwrights-shop/who-wrote-the-book-of-sloyd-fggvvq/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ diff --git a/yt_dlp/extractor/phoenix.py b/yt_dlp/extractor/phoenix.py index 63c256019..9df34f8c9 100644 --- a/yt_dlp/extractor/phoenix.py +++ b/yt_dlp/extractor/phoenix.py @@ -1,5 +1,3 @@ -import re - from .youtube import YoutubeIE from .zdf import ZDFBaseIE from ..utils import ( @@ -7,44 +5,27 @@ merge_dicts, try_get, unified_timestamp, - urljoin, ) class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/?#]+/)*[^/?#&]*-a-(?P<id>\d+)\.html' _TESTS = [{ - # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html - 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/spitzbergen-a-893349.html', + 'md5': 'a79e86d9774d0b3f2102aff988a0bd32', 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', + 'id': '221215_phx_spitzbergen', 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613902500, - 'upload_date': '20210221', + 'title': 'Spitzbergen', + 'description': 'Film von Tilmann Bünz', + 'duration': 728.0, + 'timestamp': 1555600500, + 'upload_date': '20190418', 'uploader': 'Phoenix', - 'series': 'corona nachgehakt', - 'episode': 'Wohin führt der Protest in der Pandemie?', - }, - }, { - # Youtube embed - 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', - 'info_dict': { - 'id': 'hMQtqFYjomk', - 'ext': 'mp4', - 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', - 
'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', - 'duration': 3509, - 'upload_date': '20201219', - 'uploader': 'phoenix', - 'uploader_id': 'phoenix', - }, - 'params': { - 'skip_download': True, + 'thumbnail': 'https://www.phoenix.de/sixcms/media.php/21/Bergspitzen1.png', + 'series': 'Dokumentationen', + 'episode': 'Spitzbergen', }, }, { 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', @@ -90,8 +71,8 @@ def _real_extract(self, url): content_id = details['tracking']['nielsen']['content']['assetid'] info = self._extract_ptmd( - f'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/{content_id}', - content_id, None, url) + f'https://tmd.phoenix.de/tmd/2/android_native_6/vod/ptmd/phoenix/{content_id}', + content_id) duration = int_or_none(try_get( details, lambda x: x['tracking']['nielsen']['content']['length'])) @@ -101,20 +82,8 @@ def _real_extract(self, url): str) episode = title if details.get('contentType') == 'episode' else None - thumbnails = [] teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {} - for thumbnail_key, thumbnail_url in teaser_images.items(): - thumbnail_url = urljoin(url, thumbnail_url) - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) + thumbnails = self._extract_thumbnails(teaser_images) return merge_dicts(info, { 'id': content_id, diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index f0b38893b..b2fe7494b 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -23,9 +23,9 @@ class PinterestBaseIE(InfoExtractor): def _call_api(self, resource, video_id, options): return self._download_json( f'https://www.pinterest.com/resource/{resource}Resource/get/', - video_id, f'Download {resource} JSON metadata', query={ - 'data': json.dumps({'options': options}), - })['resource_response'] + video_id, f'Download {resource} JSON metadata', + query={'data': json.dumps({'options': options})}, + headers={'X-Pinterest-PWS-Handler': 'www/[username].js'})['resource_response'] def _extract_video(self, data, extract_formats=True): video_id = data['id'] diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index 905f8fc2f..59231d840 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -1,4 +1,7 @@ +import base64 +import hashlib import json +import uuid from .common import InfoExtractor from ..utils import ( @@ -142,39 +145,73 @@ class PlaySuisseIE(InfoExtractor): id url }''' - _LOGIN_BASE_URL = 'https://login.srgssr.ch/srgssrlogin.onmicrosoft.com' - _LOGIN_PATH = 'B2C_1A__SignInV2' + _CLIENT_ID = '1e33f1bf-8bf3-45e4-bbd9-c9ad934b5fca' + _LOGIN_BASE = 'https://account.srgssr.ch' _ID_TOKEN = None def _perform_login(self, username, password): - login_page = self._download_webpage( - 'https://www.playsuisse.ch/api/sso/login', None, note='Downloading login page', - query={'x': 'x', 'locale': 'de', 'redirectUrl': 'https://www.playsuisse.ch/'}) - settings = self._search_json(r'var\s+SETTINGS\s*=', login_page, 'settings', None) + code_verifier = uuid.uuid4().hex + uuid.uuid4().hex + uuid.uuid4().hex + code_challenge = base64.urlsafe_b64encode( + hashlib.sha256(code_verifier.encode()).digest()).decode().rstrip('=') - csrf_token = settings['csrf'] - query = {'tx': settings['transId'], 'p': self._LOGIN_PATH} + request_id = 
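
The rewritten PlaySuisse login flow repeatedly recovers parameters from redirect URLs via parse_qs, as in the requestId lookup here. A small sketch of that idiom (hypothetical URL; yt-dlp's parse_qs wraps urllib.parse.parse_qs over a full URL):

    from yt_dlp.utils import parse_qs

    # parse_qs accepts a full URL and returns the parsed query dict,
    # so ['requestId'][0] picks the first value of that parameter
    redirect_url = 'https://account.srgssr.ch/login?requestId=abc123&view_type=login'
    assert parse_qs(redirect_url)['requestId'][0] == 'abc123'
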
parse_qs(self._request_webpage( + f'{self._LOGIN_BASE}/authz-srv/authz', None, 'Requesting session ID', query={ + 'client_id': self._CLIENT_ID, + 'redirect_uri': 'https://www.playsuisse.ch/auth', + 'scope': 'email profile openid offline_access', + 'response_type': 'code', + 'code_challenge': code_challenge, + 'code_challenge_method': 'S256', + 'view_type': 'login', + }).url)['requestId'][0] - status = traverse_obj(self._download_json( - f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/SelfAsserted', None, 'Logging in', - query=query, headers={'X-CSRF-TOKEN': csrf_token}, data=urlencode_postdata({ - 'request_type': 'RESPONSE', - 'signInName': username, - 'password': password, - }), expected_status=400), ('status', {int_or_none})) - if status == 400: - raise ExtractorError('Invalid username or password', expected=True) + try: + exchange_id = self._download_json( + f'{self._LOGIN_BASE}/verification-srv/v2/authenticate/initiate/password', None, + 'Submitting username', headers={'content-type': 'application/json'}, data=json.dumps({ + 'usage_type': 'INITIAL_AUTHENTICATION', + 'request_id': request_id, + 'medium_id': 'PASSWORD', + 'type': 'password', + 'identifier': username, + }).encode())['data']['exchange_id']['exchange_id'] + except ExtractorError: + raise ExtractorError('Invalid username', expected=True) - urlh = self._request_webpage( - f'{self._LOGIN_BASE_URL}/{self._LOGIN_PATH}/api/CombinedSigninAndSignup/confirmed', - None, 'Downloading ID token', query={ - 'rememberMe': 'false', - 'csrf_token': csrf_token, - **query, - 'diags': '', - }) + try: + login_data = self._download_json( + f'{self._LOGIN_BASE}/verification-srv/v2/authenticate/authenticate/password', None, + 'Submitting password', headers={'content-type': 'application/json'}, data=json.dumps({ + 'requestId': request_id, + 'exchange_id': exchange_id, + 'type': 'password', + 'password': password, + }).encode())['data'] + except ExtractorError: + raise ExtractorError('Invalid password', expected=True) + + authorization_code = parse_qs(self._request_webpage( + f'{self._LOGIN_BASE}/login-srv/verification/login', None, 'Logging in', + data=urlencode_postdata({ + 'requestId': request_id, + 'exchange_id': login_data['exchange_id']['exchange_id'], + 'verificationType': 'password', + 'sub': login_data['sub'], + 'status_id': login_data['status_id'], + 'rememberMe': True, + 'lat': '', + 'lon': '', + })).url)['code'][0] + + self._ID_TOKEN = self._download_json( + f'{self._LOGIN_BASE}/proxy/token', None, 'Downloading token', data=b'', query={ + 'client_id': self._CLIENT_ID, + 'redirect_uri': 'https://www.playsuisse.ch/auth', + 'code': authorization_code, + 'code_verifier': code_verifier, + 'grant_type': 'authorization_code', + })['id_token'] - self._ID_TOKEN = traverse_obj(parse_qs(urlh.url), ('id_token', 0)) if not self._ID_TOKEN: raise ExtractorError('Login failed') diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 6fb21e156..9d0496bdf 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -22,7 +22,7 @@ ) -class PolskieRadioBaseExtractor(InfoExtractor): +class PolskieRadioBaseIE(InfoExtractor): def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): media_urls = set() @@ -47,7 +47,7 @@ def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): yield entry -class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): +class PolskieRadioLegacyIE(PolskieRadioBaseIE): # legacy sites IE_NAME = 'polskieradio:legacy' _VALID_URL = 
r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)' @@ -127,7 +127,7 @@ def _real_extract(self, url): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioIE(PolskieRadioBaseExtractor): +class PolskieRadioIE(PolskieRadioBaseIE): # new next.js sites _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)' _TESTS = [{ @@ -519,7 +519,7 @@ def _real_extract(self, url): } -class PolskieRadioPodcastBaseExtractor(InfoExtractor): +class PolskieRadioPodcastBaseIE(InfoExtractor): _API_BASE = 'https://apipodcasts.polskieradio.pl/api' def _parse_episode(self, data): @@ -539,7 +539,7 @@ def _parse_episode(self, data): } -class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): +class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseIE): IE_NAME = 'polskieradio:podcast:list' _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)' _TESTS = [{ @@ -578,7 +578,7 @@ def get_page(page_num): } -class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): +class PolskieRadioPodcastIE(PolskieRadioPodcastBaseIE): IE_NAME = 'polskieradio:podcast' _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})' _TESTS = [{ diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index efb47affc..c489dc731 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -321,6 +321,27 @@ class RaiPlayIE(RaiBaseIE): 'timestamp': 1348495020, 'upload_date': '20120924', }, + }, { + # checking program_info gives false positive for DRM + 'url': 'https://www.raiplay.it/video/2022/10/Ad-ogni-costo---Un-giorno-in-Pretura---Puntata-del-15102022-1dfd1295-ea38-4bac-b51e-f87e2881693b.html', + 'md5': '572c6f711b7c5f2d670ba419b4ae3b08', + 'info_dict': { + 'id': '1dfd1295-ea38-4bac-b51e-f87e2881693b', + 'ext': 'mp4', + 'title': 'Ad ogni costo - Un giorno in Pretura - Puntata del 15/10/2022', + 'alt_title': 'St 2022/23 - Un giorno in pretura - Ad ogni costo', + 'description': 'md5:4046d97b2687f74f06a8b8270ba5599f', + 'uploader': 'Rai 3', + 'duration': 3773.0, + 'thumbnail': 'https://www.raiplay.it/dl/img/2022/10/12/1665586539957_2048x2048.png', + 'creators': ['Rai 3'], + 'series': 'Un giorno in pretura', + 'season': '2022/23', + 'episode': 'Ad ogni costo', + 'timestamp': 1665507240, + 'upload_date': '20221011', + 'release_year': 2025, + }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, @@ -340,9 +361,8 @@ def _real_extract(self, url): media = self._download_json( f'{base}.json', video_id, 'Downloading video JSON') - if not self.get_param('allow_unplayable_formats'): - if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): - self.report_drm(video_id) + if traverse_obj(media, ('rights_management', 'rights', 'drm')): + self.report_drm(video_id) video = media['video'] relinker_info = self._extract_relinker_info(video['content_url'], video_id) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 7325e547b..be4c32e13 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -8,6 +8,7 @@ int_or_none, parse_qs, traverse_obj, + truncate_string, try_get, unescapeHTML, update_url_query, @@ -26,6 +27,7 @@ class RedditIE(InfoExtractor): 'ext': 'mp4', 'display_id': '6rrwyj', 'title': 'That small heart attack.', + 'alt_title': 'That small heart attack.', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 
'thumbnails': 'count:4', 'timestamp': 1501941939, @@ -49,7 +51,8 @@ class RedditIE(InfoExtractor): 'id': 'gyh95hiqc0b11', 'ext': 'mp4', 'display_id': '90bu6w', - 'title': 'Heat index was 110 degrees so we offered him a cold drink. He went for a full body soak instead', + 'title': 'Heat index was 110 degrees so we offered him a cold drink. He went fo...', + 'alt_title': 'Heat index was 110 degrees so we offered him a cold drink. He went for a full body soak instead', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'thumbnails': 'count:7', 'timestamp': 1532051078, @@ -69,7 +72,8 @@ class RedditIE(InfoExtractor): 'id': 'zasobba6wp071', 'ext': 'mp4', 'display_id': 'nip71r', - 'title': 'I plan to make more stickers and prints! Check them out on my Etsy! Or get them through my Patreon. Links below.', + 'title': 'I plan to make more stickers and prints! Check them out on my Etsy! O...', + 'alt_title': 'I plan to make more stickers and prints! Check them out on my Etsy! Or get them through my Patreon. Links below.', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'thumbnails': 'count:5', 'timestamp': 1621709093, @@ -91,7 +95,17 @@ class RedditIE(InfoExtractor): 'playlist_count': 2, 'info_dict': { 'id': 'wzqkxp', - 'title': 'md5:72d3d19402aa11eff5bd32fc96369b37', + 'title': '[Finale] Kamen Rider Revice Episode 50 "Family to the End, Until the ...', + 'alt_title': '[Finale] Kamen Rider Revice Episode 50 "Family to the End, Until the Day We Meet Again" Discussion', + 'description': 'md5:5b7deb328062b164b15704c5fd67c335', + 'uploader': 'TheTwelveYearOld', + 'channel_id': 'KamenRider', + 'comment_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'timestamp': 1661676059.0, + 'upload_date': '20220828', }, }, { # crossposted reddit-hosted media @@ -102,6 +116,7 @@ class RedditIE(InfoExtractor): 'ext': 'mp4', 'display_id': 'zjjw82', 'title': 'Cringe', + 'alt_title': 'Cringe', 'uploader': 'Otaku-senpai69420', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'upload_date': '20221212', @@ -122,6 +137,7 @@ class RedditIE(InfoExtractor): 'ext': 'mp4', 'display_id': '124pp33', 'title': 'Harmless prank of some old friends', + 'alt_title': 'Harmless prank of some old friends', 'uploader': 'Dudezila', 'channel_id': 'ContagiousLaughter', 'duration': 17, @@ -142,6 +158,7 @@ class RedditIE(InfoExtractor): 'ext': 'mp4', 'display_id': '12fujy3', 'title': 'Based Hasan?', + 'alt_title': 'Based Hasan?', 'uploader': 'KingNigelXLII', 'channel_id': 'GenZedong', 'duration': 16, @@ -161,6 +178,7 @@ class RedditIE(InfoExtractor): 'ext': 'mp4', 'display_id': '1cl9h0u', 'title': 'The insurance claim will be interesting', + 'alt_title': 'The insurance claim will be interesting', 'uploader': 'darrenpauli', 'channel_id': 'Unexpected', 'duration': 53, @@ -183,6 +201,7 @@ class RedditIE(InfoExtractor): 'ext': 'mp4', 'display_id': '1cxwzso', 'title': 'Tottenham [1] - 0 Newcastle United - James Maddison 31\'', + 'alt_title': 'Tottenham [1] - 0 Newcastle United - James Maddison 31\'', 'uploader': 'Woodstovia', 'channel_id': 'soccer', 'duration': 30, @@ -198,6 +217,26 @@ class RedditIE(InfoExtractor): 'skip_download': True, 'writesubtitles': True, }, + }, { + # "gated" subreddit post + 'url': 'https://old.reddit.com/r/ketamine/comments/degtjo/when_the_k_hits/', + 'info_dict': { + 'id': 'gqsbxts133r31', + 'ext': 'mp4', + 'display_id': 'degtjo', + 'title': 'When the K hits', + 'alt_title': 'When the K hits', + 'uploader': '[deleted]', + 'channel_id': 'ketamine', + 'comment_count': int, + 'like_count': int, + 
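
These updated Reddit tests encode the new title/alt_title split: title is now capped via truncate_string while alt_title keeps the full text. A sketch of the helper's behavior on one of the titles above:

    from yt_dlp.utils import truncate_string

    full = 'Heat index was 110 degrees so we offered him a cold drink. He went for a full body soak instead'
    # Overlong titles are cut down to 72 characters, the overflow replaced by '...'
    short = truncate_string(full, left=72)
    assert short.endswith('...') and len(short) == 72
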
'dislike_count': int, + 'age_limit': 18, + 'duration': 34, + 'thumbnail': r're:https?://.+/.+\.(?:jpg|png)', + 'timestamp': 1570438713.0, + 'upload_date': '20191007', + }, }, { 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, @@ -245,6 +284,15 @@ def _perform_login(self, username, password): elif not traverse_obj(login, ('json', 'data', 'cookie', {str})): raise ExtractorError('Unable to login, no cookie was returned') + def _real_initialize(self): + # Set cookie to opt-in to age-restricted subreddits + self._set_cookie('reddit.com', 'over18', '1') + # Set cookie to opt-in to "gated" subreddits + options = traverse_obj(self._get_cookies('https://www.reddit.com/'), ( + '_options', 'value', {urllib.parse.unquote}, {json.loads}, {dict})) or {} + options['pref_gated_sr_optin'] = True + self._set_cookie('reddit.com', '_options', urllib.parse.quote(json.dumps(options))) + def _get_subtitles(self, video_id): # Fallback if there were no subtitles provided by DASH or HLS manifests caption_url = f'https://v.redd.it/{video_id}/wh_ben_en.vtt' @@ -276,14 +324,6 @@ def _real_extract(self, url): data = data[0]['data']['children'][0]['data'] video_url = data['url'] - over_18 = data.get('over_18') - if over_18 is True: - age_limit = 18 - elif over_18 is False: - age_limit = 0 - else: - age_limit = None - thumbnails = [] def add_thumbnail(src): @@ -309,15 +349,19 @@ def add_thumbnail(src): add_thumbnail(resolution) info = { - 'title': data.get('title'), 'thumbnails': thumbnails, - 'timestamp': float_or_none(data.get('created_utc')), - 'uploader': data.get('author'), - 'channel_id': data.get('subreddit'), - 'like_count': int_or_none(data.get('ups')), - 'dislike_count': int_or_none(data.get('downs')), - 'comment_count': int_or_none(data.get('num_comments')), - 'age_limit': age_limit, + 'age_limit': {True: 18, False: 0}.get(data.get('over_18')), + **traverse_obj(data, { + 'title': ('title', {truncate_string(left=72)}), + 'alt_title': ('title', {str}), + 'description': ('selftext', {str}, filter), + 'timestamp': ('created_utc', {float_or_none}), + 'uploader': ('author', {str}), + 'channel_id': ('subreddit', {str}), + 'like_count': ('ups', {int_or_none}), + 'dislike_count': ('downs', {int_or_none}), + 'comment_count': ('num_comments', {int_or_none}), + }), } parsed_url = urllib.parse.urlparse(video_url) @@ -343,8 +387,9 @@ def add_thumbnail(src): **info, }) if entries: - return self.playlist_result(entries, video_id, info.get('title')) - raise ExtractorError('No media found', expected=True) + return self.playlist_result(entries, video_id, **info) + self.raise_no_formats('No media found', expected=True, video_id=video_id) + return {**info, 'id': video_id} # Check if media is hosted on reddit: reddit_video = traverse_obj(data, ( diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py index 870e33253..cd3cd323e 100644 --- a/yt_dlp/extractor/redgifs.py +++ b/yt_dlp/extractor/redgifs.py @@ -12,7 +12,7 @@ ) -class RedGifsBaseInfoExtractor(InfoExtractor): +class RedGifsBaseIE(InfoExtractor): _FORMATS = { 'gif': 250, 'sd': 480, @@ -113,7 +113,7 @@ def _paged_entries(self, ep, item_id, query, fields): return page_fetcher(page) if page else OnDemandPagedList(page_fetcher, self._PAGE_SIZE) -class RedGifsIE(RedGifsBaseInfoExtractor): +class RedGifsIE(RedGifsBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/(?:watch|ifr)/|thumbs2\.redgifs\.com/)(?P<id>[^-/?#\.]+)' _TESTS = [{ 'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent', @@ -172,7 +172,7 @@ def 
_real_extract(self, url): return self._parse_gif_data(video_info['gif']) -class RedGifsSearchIE(RedGifsBaseInfoExtractor): +class RedGifsSearchIE(RedGifsBaseIE): IE_DESC = 'Redgifs search' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P<query>[^#]+)' _PAGE_SIZE = 80 @@ -226,7 +226,7 @@ def _real_extract(self, url): entries, query_str, tags, f'RedGifs search for {tags}, ordered by {order}') -class RedGifsUserIE(RedGifsBaseInfoExtractor): +class RedGifsUserIE(RedGifsBaseIE): IE_DESC = 'Redgifs user' _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?' _PAGE_SIZE = 80 diff --git a/yt_dlp/extractor/roya.py b/yt_dlp/extractor/roya.py new file mode 100644 index 000000000..e9fe304ee --- /dev/null +++ b/yt_dlp/extractor/roya.py @@ -0,0 +1,43 @@ +from .common import InfoExtractor +from ..utils.traversal import traverse_obj + + +class RoyaLiveIE(InfoExtractor): + _VALID_URL = r'https?://roya\.tv/live-stream/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://roya.tv/live-stream/1', + 'info_dict': { + 'id': '1', + 'title': r're:Roya TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://roya.tv/live-stream/21', + 'info_dict': { + 'id': '21', + 'title': r're:Roya News \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://roya.tv/live-stream/10000', + 'only_matching': True, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + stream_url = self._download_json( + f'https://ticket.roya-tv.com/api/v5/fastchannel/{media_id}', media_id)['data']['secured_url'] + + title = traverse_obj( + self._download_json('https://backend.roya.tv/api/v01/channels/schedule-pagination', media_id, fatal=False), + ('data', 0, 'channel', lambda _, v: str(v['id']) == media_id, 'title', {str}, any)) + + return { + 'id': media_id, + 'formats': self._extract_m3u8_formats(stream_url, media_id, 'mp4', m3u8_id='hls', live=True), + 'title': title, + 'is_live': True, + } diff --git a/yt_dlp/extractor/rtp.py b/yt_dlp/extractor/rtp.py index 26aec2e4c..03e985940 100644 --- a/yt_dlp/extractor/rtp.py +++ b/yt_dlp/extractor/rtp.py @@ -3,12 +3,20 @@ import re import urllib.parse -from .common import InfoExtractor -from ..utils import js_to_json +from .common import InfoExtractor, Request +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, + parse_duration, + parse_iso8601, + url_or_none, +) +from ..utils.traversal import traverse_obj class RTPIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:[^/#?]+/)?p(?P<program_id>\d+)/(?P<id>e\d+)' _TESTS = [{ 'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'md5': 'e736ce0c665e459ddb818546220b4ef8', @@ -16,99 +24,173 @@ class RTPIE(InfoExtractor): 'id': 'e174042', 'ext': 'mp3', 'title': 'Paixões Cruzadas', - 'description': 'As paixões musicais de António Cartaxo e António Macedo', + 'description': 'md5:af979e58ba0ab73f78435fc943fdb070', 'thumbnail': r're:^https?://.*\.jpg', + 'series': 'Paixões Cruzadas', + 'duration': 2950.0, + 'modified_timestamp': 1553693464, + 'modified_date': '20190327', + 'timestamp': 1417219200, + 'upload_date': '20141129', }, }, { 'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril', - 'md5': '9a81ed53f2b2197cfa7ed455b12f8ade', + 'md5': '5b4859940e3adef61247a77dfb76046a', 'info_dict': { 'id': 'e757904', 
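
The roya.py title lookup above shows the dict-matching traversal pattern used throughout this patch: a lambda filters list items, a type in braces acts as an isinstance filter, and `any` collapses the branch to the first match. A standalone sketch with made-up schedule data:

    from yt_dlp.utils.traversal import traverse_obj

    media_id = '21'
    schedule = {'data': [{'channel': [{'id': 21, 'title': 'Roya News'}]}]}
    # The lambda keeps only matching list items, {str} keeps str values,
    # and `any` returns the first surviving value instead of a list
    title = traverse_obj(schedule, (
        'data', 0, 'channel', lambda _, v: str(v['id']) == media_id, 'title', {str}, any))
    assert title == 'Roya News'
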
'ext': 'mp4', - 'title': '25 Curiosidades, 25 de Abril', - 'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', + 'title': 'Estudar ou não estudar', + 'description': 'md5:3bfd7eb8bebfd5711a08df69c9c14c35', 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1711958401, + 'duration': 146.0, + 'upload_date': '20240401', + 'modified_timestamp': 1712242991, + 'series': '25 Curiosidades, 25 de Abril', + 'episode_number': 2, + 'episode': 'Estudar ou não estudar', + 'modified_date': '20240404', }, }, { - 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', - 'only_matching': True, - }, { - 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', - 'only_matching': True, - }, { - 'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', - 'only_matching': True, + # Episode not accessible through API + 'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/e500050/portugues-1-ano', + 'md5': '57660c0b46db9f22118c52cbd65975e4', + 'info_dict': { + 'id': 'e500050', + 'ext': 'mp4', + 'title': 'Português - 1.º ano', + 'duration': 1669.0, + 'description': 'md5:be68925c81269f8c6886589f25fe83ea', + 'upload_date': '20201020', + 'timestamp': 1603180799, + 'thumbnail': 'https://cdn-images.rtp.pt/EPG/imagens/39482_59449_64850.png?v=3&w=860', + }, }] + _USER_AGENT = 'rtpplay/2.0.66 (pt.rtp.rtpplay; build:2066; iOS 15.8.3) Alamofire/5.9.1' + _AUTH_TOKEN = None + + def _fetch_auth_token(self): + if self._AUTH_TOKEN: + return self._AUTH_TOKEN + self._AUTH_TOKEN = traverse_obj(self._download_json(Request( + 'https://rtpplayapi.rtp.pt/play/api/2/token-manager', + headers={ + 'Accept': '*/*', + 'rtp-play-auth': 'RTPPLAY_MOBILE_IOS', + 'rtp-play-auth-hash': 'fac9c328b2f27e26e03d7f8942d66c05b3e59371e16c2a079f5c83cc801bd3ee', + 'rtp-play-auth-timestamp': '2145973229682', + 'User-Agent': self._USER_AGENT, + }, extensions={'keep_header_casing': True}), None, + note='Fetching guest auth token', errnote='Could not fetch guest auth token', + fatal=False), ('token', 'token', {str})) + return self._AUTH_TOKEN + + @staticmethod + def _cleanup_media_url(url): + if urllib.parse.urlparse(url).netloc == 'streaming-ondemand.rtp.pt': + return None + return url.replace('/drm-fps/', '/hls/').replace('/drm-dash/', '/dash/') + + def _extract_formats(self, media_urls, episode_id): + formats = [] + subtitles = {} + for media_url in set(traverse_obj(media_urls, (..., {url_or_none}, {self._cleanup_media_url}))): + ext = determine_ext(media_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + media_url, episode_id, m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + media_url, episode_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_url, + 'format_id': 'http', + }) + return formats, subtitles + + def _extract_from_api(self, program_id, episode_id): + auth_token = self._fetch_auth_token() + if not auth_token: + return + episode_data = traverse_obj(self._download_json( + f'https://www.rtp.pt/play/api/1/get-episode/{program_id}/{episode_id[1:]}', episode_id, + query={'include_assets': 'true', 'include_webparams': 'true'}, + headers={ + 'Accept': '*/*', + 'Authorization': f'Bearer {auth_token}', + 'User-Agent': self._USER_AGENT, + }, fatal=False), 'result', {dict}) + if not 
episode_data: + return + asset_urls = traverse_obj(episode_data, ('assets', 0, 'asset_url', {dict})) + media_urls = traverse_obj(asset_urls, ( + ((('hls', 'dash'), 'stream_url'), ('multibitrate', ('url_hls', 'url_dash'))),)) + formats, subtitles = self._extract_formats(media_urls, episode_id) + + for sub_data in traverse_obj(asset_urls, ('subtitles', 'vtt_list', lambda _, v: url_or_none(v['file']))): + subtitles.setdefault(sub_data.get('code') or 'pt', []).append({ + 'url': sub_data['file'], + 'name': sub_data.get('language'), + }) + + return { + 'id': episode_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': traverse_obj(episode_data, ('assets', 0, 'asset_thumbnail', {url_or_none})), + **traverse_obj(episode_data, ('episode', { + 'title': (('episode_title', 'program_title'), {str}, filter, any), + 'alt_title': ('episode_subtitle', {str}, filter), + 'description': (('episode_description', 'episode_summary'), {str}, filter, any), + 'timestamp': ('episode_air_date', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('episode_lastchanged', {parse_iso8601(delimiter=' ')}), + 'duration': ('episode_duration_complete', {parse_duration}), + 'episode': ('episode_title', {str}, filter), + 'episode_number': ('episode_number', {int_or_none}), + 'season': ('program_season', {str}, filter), + 'series': ('program_title', {str}, filter), + })), + } + _RX_OBFUSCATION = re.compile(r'''(?xs) atob\s*\(\s*decodeURIComponent\s*\(\s* (\[[0-9A-Za-z%,'"]*\]) \s*\.\s*join\(\s*(?:""|'')\s*\)\s*\)\s*\) ''') - def __unobfuscate(self, data, *, video_id): - if data.startswith('{'): - data = self._RX_OBFUSCATION.sub( - lambda m: json.dumps( - base64.b64decode(urllib.parse.unquote( - ''.join(self._parse_json(m.group(1), video_id)), - )).decode('iso-8859-1')), - data) - return js_to_json(data) + def __unobfuscate(self, data): + return self._RX_OBFUSCATION.sub( + lambda m: json.dumps( + base64.b64decode(urllib.parse.unquote( + ''.join(json.loads(m.group(1))), + )).decode('iso-8859-1')), + data) - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._html_search_meta( - 'twitter:title', webpage, display_name='title', fatal=True) - - f, config = self._search_regex( - r'''(?sx) - (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)? 
- var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) - ''', webpage, - 'player config', group=('f', 'config')) - - config = self._parse_json( - config, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) - f = config['file'] if not f else self._parse_json( - f, video_id, - lambda data: self.__unobfuscate(data, video_id=video_id)) + def _extract_from_html(self, url, episode_id): + webpage = self._download_webpage(url, episode_id) formats = [] - if isinstance(f, dict): - f_hls = f.get('hls') - if f_hls is not None: - formats.extend(self._extract_m3u8_formats( - f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')) - - f_dash = f.get('dash') - if f_dash is not None: - formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash')) - else: - formats.append({ - 'format_id': 'f', - 'url': f, - 'vcodec': 'none' if config.get('mediaType') == 'audio' else None, - }) - subtitles = {} - - vtt = config.get('vtt') - if vtt is not None: - for lcode, lname, url in vtt: - subtitles.setdefault(lcode, []).append({ - 'name': lname, - 'url': url, - }) + media_urls = traverse_obj(re.findall(r'(?:var\s+f\s*=|RTPPlayer\({[^}]+file:)\s*({[^}]+}|"[^"]+")', webpage), ( + -1, (({self.__unobfuscate}, {js_to_json}, {json.loads}, {dict.values}, ...), {json.loads}))) + formats, subtitles = self._extract_formats(media_urls, episode_id) return { - 'id': video_id, - 'title': title, + 'id': episode_id, 'formats': formats, - 'description': self._html_search_meta(['description', 'twitter:description'], webpage), - 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), 'subtitles': subtitles, + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage, default=None), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None), + **self._search_json_ld(webpage, episode_id, default={}), + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None), } + + def _real_extract(self, url): + program_id, episode_id = self._match_valid_url(url).group('program_id', 'id') + return self._extract_from_api(program_id, episode_id) or self._extract_from_html(url, episode_id) diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index 7e0b666ab..2812d9305 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -1,35 +1,142 @@ import base64 import io import struct +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, determine_ext, float_or_none, + make_archive_id, + parse_iso8601, qualities, - remove_end, - remove_start, - try_get, + url_or_none, ) +from ..utils.traversal import subs_list_to_dict, traverse_obj -class RTVEALaCartaIE(InfoExtractor): +class RTVEBaseIE(InfoExtractor): + # Reimplementation of https://js2.rtve.es/pages/app-player/3.5.1/js/pf_video.js + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) + while True: + length_data = encrypted_data.read(4) + length = struct.unpack('!I', length_data)[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + data = bytes(filter(None, data)) + alphabet_data, _, url_data = data.partition(b'#') + quality_str, _, url_data = url_data.rpartition(b'%%') + quality_str = quality_str.decode() or '' + alphabet = RTVEBaseIE._get_alphabet(alphabet_data) + url = RTVEBaseIE._get_url(alphabet, url_data) + yield quality_str, url + 
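
The _decrypt_url generator being added here is, at its core, a PNG chunk walker: RTVE hides the stream URLs inside tEXt chunks of a thumbnail image. A self-contained sketch of just the container walk (iter_png_chunks is a hypothetical helper name, independent of RTVE's alphabet obfuscation):

    import io
    import struct

    def iter_png_chunks(png_bytes):
        # After the 8-byte signature, a PNG is a series of chunks laid out as
        # 4-byte big-endian length + 4-byte type + data + 4-byte CRC
        buf = io.BytesIO(png_bytes[8:])
        while True:
            length = struct.unpack('!I', buf.read(4))[0]
            chunk_type = buf.read(4)
            if chunk_type == b'IEND':
                break
            yield chunk_type, buf.read(length)
            buf.read(4)  # skip the CRC
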
encrypted_data.read(4) # CRC + + @staticmethod + def _get_url(alphabet, url_data): + url = '' + f = 0 + e = 3 + b = 1 + for char in url_data.decode('iso-8859-1'): + if f == 0: + l = int(char) * 10 + f = 1 + else: + if e == 0: + l += int(char) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + return url + + @staticmethod + def _get_alphabet(alphabet_data): + alphabet = [] + e = 0 + d = 0 + for char in alphabet_data.decode('iso-8859-1'): + if d == 0: + alphabet.append(char) + d = e = (e + 1) % 4 + else: + d -= 1 + return alphabet + + def _extract_png_formats_and_subtitles(self, video_id, media_type='videos'): + formats, subtitles = [], {} + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + for manager in ('rtveplayw', 'default'): + png = self._download_webpage( + f'http://www.rtve.es/ztnr/movil/thumbnail/{manager}/{media_type}/{video_id}.png', + video_id, 'Downloading url information', query={'q': 'v2'}, fatal=False) + if not png: + continue + + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif ext == 'mpd': + fmts, subs = self._extract_mpd_formats_and_subtitles( + video_url, video_id, 'dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + return formats, subtitles + + def _parse_metadata(self, metadata): + return traverse_obj(metadata, { + 'title': ('title', {str.strip}), + 'alt_title': ('alt', {str.strip}), + 'description': ('description', {clean_html}), + 'timestamp': ('dateOfEmission', {parse_iso8601(delimiter=' ')}), + 'release_timestamp': ('publicationDate', {parse_iso8601(delimiter=' ')}), + 'modified_timestamp': ('modificationDate', {parse_iso8601(delimiter=' ')}), + 'thumbnail': (('thumbnail', 'image', 'imageSEO'), {url_or_none}, any), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'is_live': ('live', {bool}), + 'series': (('programTitle', ('programInfo', 'title')), {clean_html}, any), + }) + + +class RTVEALaCartaIE(RTVEBaseIE): IE_NAME = 'rtve.es:alacarta' - IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)' + IE_DESC = 'RTVE a la carta and Play' + _VALID_URL = [ + r'https?://(?:www\.)?rtve\.es/(?:m/)?(?:(?:alacarta|play)/videos|filmoteca)/(?!directo)(?:[^/?#]+/){2}(?P<id>\d+)', + r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/?#]+/video/[^/?#]+/(?P<id>\d+)', + ] _TESTS = [{ - 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', + 'url': 'http://www.rtve.es/alacarta/videos/la-aventura-del-saber/aventuraentornosilla/3088905/', + 'md5': 'a964547824359a5753aef09d79fe984b', 'info_dict': { - 'id': '2491869', + 'id': '3088905', 'ext': 'mp4', - 'title': 'Balonmano - Swiss Cup masculina. 
Final: España-Suecia', - 'duration': 5024.566, - 'series': 'Balonmano', + 'title': 'En torno a la silla', + 'duration': 1216.981, + 'series': 'La aventura del Saber', + 'thumbnail': 'https://img2.rtve.es/v/aventuraentornosilla_3088905.png', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', @@ -38,140 +145,88 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'live_status': 'is_live', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', }, 'params': { 'skip_download': 'live stream', }, }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'd850f3c8731ea53952ebab489cf81cbf', + 'md5': 'f3cf0d1902d008c48c793e736706c174', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104', - 'duration': 3222.0, + 'title': 'Episodio 104', + 'duration': 3222.8, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'Servir y proteger', }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, }, { 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/', 'only_matching': True, + }, { + 'url': 'https://www.rtve.es/play/videos/saber-vivir/07-07-24/16177116/', + 'md5': 'a5b24fcdfa3ff5cb7908aba53d22d4b6', + 'info_dict': { + 'id': '16177116', + 'ext': 'mp4', + 'title': 'Saber vivir - 07/07/24', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 2162.68, + 'series': 'Saber vivir', + }, + }, { + 'url': 'https://www.rtve.es/infantil/serie/agus-lui-churros-crafts/video/gusano/7048976/', + 'info_dict': { + 'id': '7048976', + 'ext': 'mp4', + 'title': 'Gusano', + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'duration': 292.86, + 'series': 'Agus & Lui: Churros y Crafts', + '_old_archive_ids': ['rtveinfantil 7048976'], + }, }] - def _real_initialize(self): - user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode()).decode('utf-8') - self._manager = self._download_json( - 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info')['manager'] - - @staticmethod - def _decrypt_url(png): - encrypted_data = io.BytesIO(base64.b64decode(png)[8:]) - while True: - length = struct.unpack('!I', encrypted_data.read(4))[0] - chunk_type = encrypted_data.read(4) - if chunk_type == b'IEND': - break - data = encrypted_data.read(length) - if chunk_type == b'tEXt': - alphabet_data, text = data.split(b'\0') - quality, url_data = text.split(b'%%') - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data.decode('iso-8859-1'): - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data.decode('iso-8859-1'): - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - yield quality.decode(), url - encrypted_data.read(4) # CRC - - def _extract_png_formats(self, video_id): - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/videos/{video_id}.png', - 
video_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, video_url in self._decrypt_url(png): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': video_url, - }) - return formats + def _get_subtitles(self, video_id): + subtitle_data = self._download_json( + f'https://api2.rtve.es/api/videos/{video_id}/subtitulos.json', video_id, + 'Downloading subtitles info') + return traverse_obj(subtitle_data, ('page', 'items', ..., { + 'id': ('lang', {str}), + 'url': ('src', {url_or_none}), + }, all, {subs_list_to_dict(lang='es')})) def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_json( + metadata = self._download_json( f'http://www.rtve.es/api/videos/{video_id}/config/alacarta_videos.json', video_id)['page']['items'][0] - if info['state'] == 'DESPU': + if metadata['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'].strip() - formats = self._extract_png_formats(video_id) + formats, subtitles = self._extract_png_formats_and_subtitles(video_id) - subtitles = None - sbt_file = info.get('sbtFile') - if sbt_file: - subtitles = self.extract_subtitles(video_id, sbt_file) + self._merge_subtitles(self.extract_subtitles(video_id), target=subtitles) - is_live = info.get('live') is True + is_infantil = urllib.parse.urlparse(url).path.startswith('/infantil/') return { 'id': video_id, - 'title': title, 'formats': formats, - 'thumbnail': info.get('image'), 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), 1000), - 'is_live': is_live, - 'series': info.get('programTitle'), + **self._parse_metadata(metadata), + '_old_archive_ids': [make_archive_id('rtveinfantil', video_id)] if is_infantil else None, } - def _get_subtitles(self, video_id, sub_file): - subs = self._download_json( - sub_file + '.json', video_id, - 'Downloading subtitles info')['page']['items'] - return dict( - (s['lang'], [{'ext': 'vtt', 'url': s['src']}]) - for s in subs) - -class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVEAudioIE(RTVEBaseIE): IE_NAME = 'rtve.es:audio' IE_DESC = 'RTVE audio' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/(?:[^/?#]+/){2}(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.rtve.es/alacarta/audios/a-hombros-de-gigantes/palabra-ingeniero-codigos-informaticos-27-04-21/5889192/', @@ -180,9 +235,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5889192', 'ext': 'mp3', 'title': 'Códigos informáticos', - 'thumbnail': r're:https?://.+/1598856591583.jpg', + 'alt_title': 'Códigos informáticos - Escuchar ahora', 'duration': 349.440, 'series': 'A hombros de gigantes', + 'description': 'md5:72b0d7c1ca20fd327bdfff7ac0171afb', + 'thumbnail': 'https://img2.rtve.es/a/palabra-ingeniero-codigos-informaticos-270421_5889192.png', }, }, { 'url': 'https://www.rtve.es/play/audios/en-radio-3/ignatius-farray/5791165/', @@ -191,9 +248,11 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '5791165', 'ext': 
'mp3', 'title': 'Ignatius Farray', + 'alt_title': 'En Radio 3 - Ignatius Farray - 13/02/21 - escuchar ahora', 'thumbnail': r're:https?://.+/1613243011863.jpg', 'duration': 3559.559, 'series': 'En Radio 3', + 'description': 'md5:124aa60b461e0b1724a380bad3bc4040', }, }, { 'url': 'https://www.rtve.es/play/audios/frankenstein-o-el-moderno-prometeo/capitulo-26-ultimo-muerte-victor-juan-jose-plans-mary-shelley/6082623/', @@ -202,126 +261,101 @@ class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE 'id': '6082623', 'ext': 'mp3', 'title': 'Capítulo 26 y último: La muerte de Victor', + 'alt_title': 'Frankenstein o el moderno Prometeo - Capítulo 26 y último: La muerte de Victor', 'thumbnail': r're:https?://.+/1632147445707.jpg', 'duration': 3174.086, 'series': 'Frankenstein o el moderno Prometeo', + 'description': 'md5:4ee6fcb82ebe2e46d267e1d1c1a8f7b5', }, }] - def _extract_png_formats(self, audio_id): - """ - This function retrieves media related png thumbnail which obfuscate - valuable information about the media. This information is decrypted - via base class _decrypt_url function providing media quality and - media url - """ - png = self._download_webpage( - f'http://www.rtve.es/ztnr/movil/thumbnail/{self._manager}/audios/{audio_id}.png', - audio_id, 'Downloading url information', query={'q': 'v2'}) - q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) - formats = [] - for quality, audio_url in self._decrypt_url(png): - ext = determine_ext(audio_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - audio_url, audio_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - audio_url, audio_id, 'dash', fatal=False)) - else: - formats.append({ - 'format_id': quality, - 'quality': q(quality), - 'url': audio_url, - }) - return formats - def _real_extract(self, url): audio_id = self._match_id(url) - info = self._download_json( - f'https://www.rtve.es/api/audios/{audio_id}.json', - audio_id)['page']['items'][0] + metadata = self._download_json( + f'https://www.rtve.es/api/audios/{audio_id}.json', audio_id)['page']['items'][0] + + formats, subtitles = self._extract_png_formats_and_subtitles(audio_id, media_type='audios') return { 'id': audio_id, - 'title': info['title'].strip(), - 'thumbnail': info.get('thumbnail'), - 'duration': float_or_none(info.get('duration'), 1000), - 'series': try_get(info, lambda x: x['programInfo']['title']), - 'formats': self._extract_png_formats(audio_id), + 'formats': formats, + 'subtitles': subtitles, + **self._parse_metadata(metadata), } -class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE - IE_NAME = 'rtve.es:infantil' - IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/' - - _TESTS = [{ - 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '5747454717aedf9f9fdf212d1bcfc48d', - 'info_dict': { - 'id': '3040283', - 'ext': 'mp4', - 'title': 'Maneras de vivir', - 'thumbnail': r're:https?://.+/1426182947956\.JPG', - 'duration': 357.958, - }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], - }] - - -class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE +class RTVELiveIE(RTVEBaseIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = [ + 
r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)', + r'https?://(?:www\.)?rtve\.es/play/videos/directo/[^/?#]+/(?P<id>[a-zA-Z0-9-]+)', + ] _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, - 'params': { - 'skip_download': 'live stream', + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'https://www.rtve.es/play/videos/directo/deportes/tdp/', + 'info_dict': { + 'id': 'tdp', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': str, + 'description': str, + 'thumbnail': r're:https://img2\d\.rtve\.es/resources/thumbslive/\d+\.jpg', + 'timestamp': int, + 'upload_date': str, }, + 'params': {'skip_download': 'live stream'}, + }, { + 'url': 'http://www.rtve.es/play/videos/directo/canales-lineales/la-1/', + 'only_matching': True, }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') - title = remove_start(title, 'Estoy viendo ') - vidplayer_id = self._search_regex( - (r'playerId=player([0-9]+)', - r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', - r'data-id=["\'](\d+)'), - webpage, 'internal video ID') + data_setup = self._search_json( + r'<div[^>]+class="[^"]*videoPlayer[^"]*"[^>]*data-setup=\'', + webpage, 'data_setup', video_id) + + formats, subtitles = self._extract_png_formats_and_subtitles(data_setup['idAsset']) return { 'id': video_id, - 'title': title, - 'formats': self._extract_png_formats(vidplayer_id), + **self._search_json_ld(webpage, video_id, fatal=False), + 'title': self._html_extract_title(webpage), + 'formats': formats, + 'subtitles': subtitles, 'is_live': True, } class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/?#]+/[^/?#]+/(?P<id>\d+).shtml' _TEST = { - 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'url': 'https://www.rtve.es/television/20091103/video-inedito-del-8o-programa/299020.shtml', 'info_dict': { - 'id': '3069778', + 'id': '572515', 'ext': 'mp4', - 'title': 'Documentos TV - La revolución del móvil', - 'duration': 3496.948, + 'title': 'Clase inédita', + 'duration': 335.817, + 'thumbnail': r're:https://img2\.rtve\.es/v/.*\.png', + 'series': 'El coro de la cárcel', }, 'params': { 'skip_download': True, @@ -332,11 +366,8 @@ def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - alacarta_url = self._search_regex( - r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', - webpage, 'alacarta url', default=None) - if alacarta_url is None: - raise ExtractorError( - 'The webpage doesn\'t contain any video', expected=True) + play_url = self._html_search_meta('contentUrl', webpage) + if play_url is None: + raise ExtractorError('The webpage doesn\'t contain any video', expected=True) - return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) + return self.url_result(play_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/yt_dlp/extractor/rtvs.py 
b/yt_dlp/extractor/rtvs.py index 927da5778..fcbc88a9f 100644 --- a/yt_dlp/extractor/rtvs.py +++ b/yt_dlp/extractor/rtvs.py @@ -9,7 +9,9 @@ class RTVSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P<id>\d+)/?(?:[#?]|$)' + IE_NAME = 'stvr' + IE_DESC = 'Slovak Television and Radio (formerly RTVS)' + _VALID_URL = r'https?://(?:www\.)?(?:rtvs|stvr)\.sk/(?:radio|televizia)/archiv(?:/\d+)?/(?P<id>\d+)/?(?:[#?]|$)' _TESTS = [{ # radio archive 'url': 'http://www.rtvs.sk/radio/archiv/11224/414872', @@ -19,7 +21,7 @@ class RTVSIE(InfoExtractor): 'ext': 'mp3', 'title': 'Ostrov pokladov 1 časť.mp3', 'duration': 2854, - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0000/b1R8.rtvs.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0000/rtvs-00009383.png', 'display_id': '135331', }, }, { @@ -30,7 +32,7 @@ class RTVSIE(InfoExtractor): 'ext': 'mp4', 'title': 'Amaro Džives - Náš deň', 'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.', - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0031/L7Qm.amaro_dzives_png.jpg', 'timestamp': 1428555900, 'upload_date': '20150409', 'duration': 4986, @@ -47,8 +49,11 @@ class RTVSIE(InfoExtractor): 'display_id': '307655', 'duration': 831, 'upload_date': '20211111', - 'thumbnail': 'https://www.rtvs.sk/media/a501/image/file/2/0916/robin.jpg', + 'thumbnail': 'https://www.stvr.sk/media/a501/image/file/2/0916/robin.jpg', }, + }, { + 'url': 'https://www.stvr.sk/radio/archiv/11224/414872', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 74c7e4f17..757d6994c 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -7,7 +7,6 @@ ExtractorError, UnsupportedError, clean_html, - determine_ext, extract_attributes, format_field, get_element_by_class, @@ -36,7 +35,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20191020', 'channel_url': 'https://rumble.com/c/WMAR', 'channel': 'WMAR', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 234, 'uploader': 'WMAR', 'live_status': 'not_live', @@ -52,7 +51,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20220217', 'channel_url': 'https://rumble.com/c/CyberTechNews', 'channel': 'CTNews', - 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', + 'thumbnail': r're:https://.+\.jpg', 'duration': 901, 'uploader': 'CTNews', 'live_status': 'not_live', @@ -114,6 +113,22 @@ class RumbleEmbedIE(InfoExtractor): 'live_status': 'was_live', }, 'params': {'skip_download': True}, + }, { + 'url': 'https://rumble.com/embed/v6pezdb', + 'info_dict': { + 'id': 'v6pezdb', + 'ext': 'mp4', + 'title': '"Es war einmal ein Mädchen" – Ein filmisches Zeitzeugnis aus Leningrad 1944', + 'uploader': 'RT DE', + 'channel': 'RT DE', + 'channel_url': 'https://rumble.com/c/RTDE', + 'duration': 309, + 'thumbnail': 'https://1a-1791.com/video/fww1/dc/s8/1/n/z/2/y/nz2yy.qR4e-small-Es-war-einmal-ein-Mdchen-Ei.jpg', + 'timestamp': 1743703500, + 'upload_date': '20250403', + 'live_status': 'not_live', + }, + 'params': {'skip_download': True}, }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, @@ -168,40 +183,42 @@ def _real_extract(self, url): live_status = None formats = [] - for ext, ext_info in (video.get('ua') or {}).items(): - if 
isinstance(ext_info, dict): - for height, video_info in ext_info.items(): + for format_type, format_info in (video.get('ua') or {}).items(): + if isinstance(format_info, dict): + for height, video_info in format_info.items(): if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): video_info.setdefault('meta', {})['h'] = height - ext_info = ext_info.values() + format_info = format_info.values() - for video_info in ext_info: + for video_info in format_info: meta = video_info.get('meta') or {} if not video_info.get('url'): continue - if ext == 'hls': + # With the default query params, 'tar' returns m3u8 variants that duplicate the HLS formats; without them, it returns tar archives + if format_type == 'tar': + continue + if format_type == 'hls': if meta.get('live') is True and video.get('live') == 1: live_status = 'post_live' formats.extend(self._extract_m3u8_formats( video_info['url'], video_id, ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) continue - timeline = ext == 'timeline' - if timeline: - ext = determine_ext(video_info['url']) + is_timeline = format_type == 'timeline' + is_audio = format_type == 'audio' formats.append({ - 'ext': ext, - 'acodec': 'none' if timeline else None, + 'acodec': 'none' if is_timeline else None, + 'vcodec': 'none' if is_audio else None, 'url': video_info['url'], - 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')), - 'format_note': 'Timeline' if timeline else None, - 'fps': None if timeline else video.get('fps'), + 'format_id': join_nonempty(format_type, format_field(meta, 'h', '%sp')), + 'format_note': 'Timeline' if is_timeline else None, + 'fps': None if is_timeline or is_audio else video.get('fps'), **traverse_obj(meta, { - 'tbr': 'bitrate', - 'filesize': 'size', - 'width': 'w', - 'height': 'h', - }, expected_type=lambda x: int(x) or None), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'width': ('w', {int_or_none}), + 'height': ('h', {int_or_none}), + }), }) subtitles = { diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 8d61e22fc..7edb5214e 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -122,6 +122,15 @@ def _real_extract(self, url): if traverse_obj(media, ('partOfSeries', {dict})): media['epName'] = traverse_obj(media, ('title', {str})) + # Forced subs need a distinct language code, or else they take priority over the full subs + fixed_subtitles = {} + for lang, subs in subtitles.items(): + for sub in subs: + fixed_lang = lang + if sub['url'].lower().endswith('_fe.vtt'): + fixed_lang += '-forced' + fixed_subtitles.setdefault(fixed_lang, []).append(sub) + return { 'id': video_id, **traverse_obj(media, { @@ -151,6 +160,6 @@ def _real_extract(self, url): }), }), 'formats': formats, - 'subtitles': subtitles, + 'subtitles': fixed_subtitles, 'uploader': 'SBSC', } diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index efcdb79d0..dc275e58d 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -13,7 +13,7 @@ class SenateISVPIE(InfoExtractor): - _IE_NAME = 'senate.gov:isvp' + IE_NAME = 'senate.gov:isvp' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"] @@ -137,7 +137,7 @@ def _real_extract(self, url): class SenateGovIE(InfoExtractor): - _IE_NAME = 'senate.gov' + IE_NAME = 'senate.gov' _SUBDOMAIN_RE = '|'.join(map(re.escape, ( 'agriculture', 'aging', 'appropriations', 'armed-services', 'banking', 'budget', 'commerce',
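The senategov hunk above fixes a dead attribute: the framework reads IE_NAME, deriving a default from the class name when it is absent, so the underscore-prefixed _IE_NAME was silently ignored. A minimal sketch of the assumed lookup behaviour (the real IE_NAME is a classproperty on InfoExtractor):

class FakeInfoExtractor:
    @classmethod
    def ie_name(cls):
        # Assumed simplification: prefer an explicit IE_NAME, else derive from the class name
        return getattr(cls, 'IE_NAME', None) or cls.__name__[:-2]

class BrokenIE(FakeInfoExtractor):
    _IE_NAME = 'senate.gov'  # dead attribute: nothing ever reads it

class FixedIE(FakeInfoExtractor):
    IE_NAME = 'senate.gov'  # picked up for log prefixes and --list-extractors

assert BrokenIE.ie_name() == 'Broken'
assert FixedIE.ie_name() == 'senate.gov'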
'energy', 'epw', 'finance', 'foreign', 'help', diff --git a/yt_dlp/extractor/skyit.py b/yt_dlp/extractor/skyit.py index 6e2973232..0013d2621 100644 --- a/yt_dlp/extractor/skyit.py +++ b/yt_dlp/extractor/skyit.py @@ -2,16 +2,18 @@ from .common import InfoExtractor from ..utils import ( + clean_html, dict_get, int_or_none, parse_duration, unified_timestamp, + url_or_none, + urljoin, ) +from ..utils.traversal import traverse_obj -class SkyItPlayerIE(InfoExtractor): - IE_NAME = 'player.sky.it' - _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' +class SkyItBaseIE(InfoExtractor): _GEO_BYPASS = False _DOMAIN = 'sky' _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' @@ -33,7 +35,6 @@ def _player_url_result(self, video_id): SkyItPlayerIE.ie_key(), video_id) def _parse_video(self, video, video_id): - title = video['title'] is_live = video.get('type') == 'live' hls_url = video.get(('streaming' if is_live else 'hls') + '_url') if not hls_url and video.get('geoblock' if is_live else 'geob'): @@ -43,7 +44,7 @@ def _parse_video(self, video, video_id): return { 'id': video_id, - 'title': title, + 'title': video.get('title'), 'formats': formats, 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), 'description': video.get('short_desc') or None, @@ -52,6 +53,11 @@ def _parse_video(self, video, video_id): 'is_live': is_live, } + +class SkyItPlayerIE(SkyItBaseIE): + IE_NAME = 'player.sky.it' + _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' + def _real_extract(self, url): video_id = self._match_id(url) domain = urllib.parse.parse_qs(urllib.parse.urlparse( @@ -67,7 +73,7 @@ def _real_extract(self, url): return self._parse_video(video, video_id) -class SkyItVideoIE(SkyItPlayerIE): # XXX: Do not subclass from concrete IE +class SkyItVideoIE(SkyItBaseIE): IE_NAME = 'video.sky.it' _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' _TESTS = [{ @@ -96,7 +102,7 @@ def _real_extract(self, url): return self._player_url_result(video_id) -class SkyItVideoLiveIE(SkyItPlayerIE): # XXX: Do not subclass from concrete IE +class SkyItVideoLiveIE(SkyItBaseIE): IE_NAME = 'video.sky.it:live' _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' _TEST = { @@ -124,7 +130,7 @@ def _real_extract(self, url): return self._parse_video(livestream, asset_id) -class SkyItIE(SkyItPlayerIE): # XXX: Do not subclass from concrete IE +class SkyItIE(SkyItBaseIE): IE_NAME = 'sky.it' _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ @@ -223,3 +229,80 @@ class TV8ItIE(SkyItVideoIE): # XXX: Do not subclass from concrete IE 'params': {'skip_download': 'm3u8'}, }] _DOMAIN = 'mtv8' + + +class TV8ItLiveIE(SkyItBaseIE): + IE_NAME = 'tv8.it:live' + IE_DESC = 'TV8 Live' + _VALID_URL = r'https?://(?:www\.)?tv8\.it/streaming' + _TESTS = [{ + 'url': 'https://tv8.it/streaming', + 'info_dict': { + 'id': 'tv8', + 'ext': 'mp4', + 'title': str, + 'description': str, + 'is_live': True, + 'live_status': 'is_live', + }, + }] + + def _real_extract(self, url): + video_id = 'tv8' + livestream = self._download_json( + 'https://apid.sky.it/vdp/v1/getLivestream', video_id, + 'Downloading manifest JSON', query={'id': '7'}) + metadata = self._download_json('https://tv8.it/api/getStreaming', video_id, fatal=False) + + return { + **self._parse_video(livestream, video_id), + **traverse_obj(metadata, ('info', { + 
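The sky.it hunks replace the "# XXX: Do not subclass from concrete IE" pattern with a proper abstract base: shared parsing moves into SkyItBaseIE, which defines no _VALID_URL and is not registered, so it can never claim URLs itself. A minimal sketch of the pattern, with illustrative class and URL names:

from yt_dlp.extractor.common import InfoExtractor

class FooBaseIE(InfoExtractor):
    # No _VALID_URL and not registered, so the base never matches URLs on its own
    def _parse_video(self, video, video_id):
        return {'id': video_id, 'title': video.get('title')}

class FooPlayerIE(FooBaseIE):
    _VALID_URL = r'https?://player\.foo\.example/(?P<id>\d+)'

class FooVideoIE(FooBaseIE):  # previously this would have subclassed FooPlayerIE
    _VALID_URL = r'https?://video\.foo\.example/(?P<id>\d+)'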
'title': ('title', 'text', {str}), + 'description': ('description', 'html', {clean_html}), + })), + } + + +class TV8ItPlaylistIE(InfoExtractor): + IE_NAME = 'tv8.it:playlist' + IE_DESC = 'TV8 Playlist' + _VALID_URL = r'https?://(?:www\.)?tv8\.it/(?!video)[^/#?]+/(?P<id>[^/#?]+)' + _TESTS = [{ + 'url': 'https://tv8.it/intrattenimento/tv8-gialappas-night', + 'playlist_mincount': 32, + 'info_dict': { + 'id': 'tv8-gialappas-night', + 'title': 'Tv8 Gialappa\'s Night', + 'description': 'md5:c876039d487d9cf40229b768872718ed', + 'thumbnail': r're:https://static\.sky\.it/.+\.(png|jpe?g|webp)', + }, + }, { + 'url': 'https://tv8.it/sport/uefa-europa-league', + 'playlist_mincount': 11, + 'info_dict': { + 'id': 'uefa-europa-league', + 'title': 'UEFA Europa League', + 'description': 'md5:9ab1832b7a8b1705b1f590e13a36bc6a', + 'thumbnail': r're:https://static\.sky\.it/.+\.(png|jpe?g|webp)', + }, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + data = self._search_nextjs_data(webpage, playlist_id)['props']['pageProps']['data'] + entries = [self.url_result( + urljoin('https://tv8.it', card['href']), ie=TV8ItIE, + **traverse_obj(card, { + 'description': ('extraData', 'videoDesc', {str}), + 'id': ('extraData', 'asset_id', {str}), + 'thumbnail': ('image', 'src', {url_or_none}), + 'title': ('title', 'typography', 'text', {str}), + })) + for card in traverse_obj(data, ('lastContent', 'cards', lambda _, v: v['href']))] + + return self.playlist_result(entries, playlist_id, **traverse_obj(data, ('card', 'desktop', { + 'description': ('description', 'html', {clean_html}), + 'thumbnail': ('image', 'src', {url_or_none}), + 'title': ('title', 'text', {str}), + }))) diff --git a/yt_dlp/extractor/softwhiteunderbelly.py b/yt_dlp/extractor/softwhiteunderbelly.py new file mode 100644 index 000000000..ce1b21405 --- /dev/null +++ b/yt_dlp/extractor/softwhiteunderbelly.py @@ -0,0 +1,87 @@ +from .common import InfoExtractor +from .vimeo import VHXEmbedIE +from ..utils import ( + ExtractorError, + clean_html, + update_url, + urlencode_postdata, +) +from ..utils.traversal import find_element, traverse_obj + + +class SoftWhiteUnderbellyIE(InfoExtractor): + _LOGIN_URL = 'https://www.softwhiteunderbelly.com/login' + _NETRC_MACHINE = 'softwhiteunderbelly' + _VALID_URL = r'https?://(?:www\.)?softwhiteunderbelly\.com/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.softwhiteunderbelly.com/videos/kenneth-final1', + 'note': 'A single Soft White Underbelly Episode', + 'md5': '8e79f29ec1f1bda6da2e0b998fcbebb8', + 'info_dict': { + 'id': '3201266', + 'ext': 'mp4', + 'display_id': 'kenneth-final1', + 'title': 'Appalachian Man interview-Kenneth', + 'description': 'Soft White Underbelly interview and portrait of Kenneth, an Appalachian man in Clay County, Kentucky.', + 'thumbnail': 'https://vhx.imgix.net/softwhiteunderbelly/assets/249f6db0-2b39-49a4-979b-f8dad4681825.jpg', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader': 'OTT Videos', + 'uploader_id': 'user80538407', + 'duration': 512, + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }, { + 'url': 'https://www.softwhiteunderbelly.com/videos/tj-2-final-2160p', + 'note': 'A single Soft White Underbelly Episode', + 'md5': '286bd8851b4824c62afb369e6f307036', + 'info_dict': { + 'id': '3506029', + 'ext': 'mp4', + 'display_id': 'tj-2-final-2160p', + 'title': 'Fentanyl Addict interview-TJ (follow up)', + 'description': 'Soft White Underbelly follow up interview and portrait 
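TV8ItPlaylistIE reads its playlist from the page's embedded Next.js state via _search_nextjs_data. Roughly, that helper amounts to the following (markup hypothetical and heavily trimmed):

import json
import re

html = '<script id="__NEXT_DATA__" type="application/json">{"props": {"pageProps": {"data": {"cards": []}}}}</script>'
next_data = json.loads(re.search(
    r'<script id="__NEXT_DATA__"[^>]*>([^<]+)</script>', html).group(1))
assert next_data['props']['pageProps']['data'] == {'cards': []}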
of TJ, a fentanyl addict on Skid Row.', + 'thumbnail': 'https://vhx.imgix.net/softwhiteunderbelly/assets/c883d531-5da0-4faf-a2e2-8eba97e5adfc.jpg', + 'duration': 817, + 'uploader': 'OTT Videos', + 'uploader_url': 'https://vimeo.com/user80538407', + 'uploader_id': 'user80538407', + }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], + }] + + def _perform_login(self, username, password): + signin_page = self._download_webpage(self._LOGIN_URL, None, 'Fetching authenticity token') + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata({ + 'email': username, + 'password': password, + 'authenticity_token': self._html_search_regex( + r'name=["\']authenticity_token["\']\s+value=["\']([^"\']+)', signin_page, 'authenticity_token'), + 'utf8': True, + }), + ) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + if '<div id="watch-unauthorized"' in webpage: + if self._get_cookies('https://www.softwhiteunderbelly.com').get('_session'): + raise ExtractorError('This account is not subscribed to this content', expected=True) + self.raise_login_required() + + embed_url, embed_id = self._html_search_regex( + r'embed_url:\s*["\'](?P<url>https?://embed\.vhx\.tv/videos/(?P<id>\d+)[^"\']*)', + webpage, 'embed url', group=('url', 'id')) + + return { + '_type': 'url_transparent', + 'ie_key': VHXEmbedIE.ie_key(), + 'url': VHXEmbedIE._smuggle_referrer(embed_url, 'https://www.softwhiteunderbelly.com'), + 'id': embed_id, + 'display_id': display_id, + 'title': traverse_obj(webpage, ({find_element(id='watch-info')}, {find_element(cls='video-title')}, {clean_html})), + 'description': self._html_search_meta('description', webpage, default=None), + 'thumbnail': update_url(self._og_search_thumbnail(webpage) or '', query=None) or None, + } diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index eafa306f2..c70940a60 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -52,7 +52,8 @@ class SoundcloudBaseIE(InfoExtractor): _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' _HEADERS = {} - _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + _IMAGE_REPL_RE = r'-[0-9a-z]+\.(?P<ext>jpg|png)' + _TAGS_RE = re.compile(r'"([^"]+)"|([^ ]+)') _ARTWORK_MAP = { 'mini': 16, @@ -331,12 +332,14 @@ def invalid_url(url): thumbnails = [] artwork_url = info.get('artwork_url') thumbnail = artwork_url or user.get('avatar_url') - if isinstance(thumbnail, str): - if re.search(self._IMAGE_REPL_RE, thumbnail): + if url_or_none(thumbnail): + if mobj := re.search(self._IMAGE_REPL_RE, thumbnail): for image_id, size in self._ARTWORK_MAP.items(): + # Soundcloud serves JPEG regardless of URL's ext *except* for "original" thumb + ext = mobj.group('ext') if image_id == 'original' else 'jpg' i = { 'id': image_id, - 'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.jpg', thumbnail), + 'url': re.sub(self._IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail), } if image_id == 'tiny' and not artwork_url: size = 18 @@ -372,6 +375,7 @@ def extract_count(key): 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), 'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)), + 'tags': traverse_obj(info, ('tag_list', {self._TAGS_RE.findall}, ..., ..., filter)), 'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)), 'formats': formats if not extract_flat else None, } @@ -425,6 +429,7 @@ class 
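The new _TAGS_RE copes with SoundCloud's tag_list format, where multi-word tags are double-quoted and single-word tags are bare: findall yields one (quoted, unquoted) tuple per tag, and the (..., ..., filter) traversal flattens the tuples and drops the empty halves. The same logic in plain Python, with a raw value assumed to match the private-link test further down:

import re

TAGS_RE = re.compile(r'"([^"]+)"|([^ ]+)')
tag_list = 'Orchestral Emotional "Uplifting Trance" Trance "Ori Uplift" UpOnly'  # assumed raw value
tags = [group for match in TAGS_RE.findall(tag_list) for group in match if group]
assert tags == ['Orchestral', 'Emotional', 'Uplifting Trance', 'Trance', 'Ori Uplift', 'UpOnly']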
SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', 'uploader_url': 'https://soundcloud.com/ethmusic', + 'tags': 'count:14', }, }, # geo-restricted @@ -440,7 +445,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_id': '9615865', 'timestamp': 1337635207, 'upload_date': '20120521', - 'duration': 227.155, + 'duration': 227.103, 'license': 'all-rights-reserved', 'view_count': int, 'like_count': int, @@ -450,6 +455,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', 'genres': ['Alternative'], 'artists': ['The Royal Concept'], + 'tags': [], }, }, # private link @@ -475,6 +481,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/jaimemf', 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', 'genres': ['youtubedl'], + 'tags': [], }, }, # private link (alt format) @@ -500,15 +507,16 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/jaimemf', 'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png', 'genres': ['youtubedl'], + 'tags': [], }, }, # downloadable song { 'url': 'https://soundcloud.com/the80m/the-following', - 'md5': '9ffcddb08c87d74fb5808a3c183a1d04', + 'md5': 'ecb87d7705d5f53e6c02a63760573c75', # wav: '9ffcddb08c87d74fb5808a3c183a1d04' 'info_dict': { 'id': '343609555', - 'ext': 'wav', + 'ext': 'opus', # wav original available with auth 'title': 'The Following', 'track': 'The Following', 'description': '', @@ -526,15 +534,18 @@ class SoundcloudIE(SoundcloudBaseIE): 'view_count': int, 'genres': ['Dance & EDM'], 'artists': ['80M'], + 'tags': ['80M', 'EDM', 'Dance', 'Music'], }, + 'expected_warnings': ['Original download format is only available for registered users'], }, # private link, downloadable format + # tags with spaces (e.g. "Uplifting Trance", "Ori Uplift") { 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', - 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'md5': '2e1530d0e9986a833a67cb34fc90ece0', # wav: '64a60b16e617d41d0bef032b7f55441e' 'info_dict': { 'id': '340344461', - 'ext': 'wav', + 'ext': 'opus', # wav original available with auth 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'track': 'Uplifting Only 238 [No Talking] (incl. 
Alex Feed Guestmix) (Aug 31, 2017) [wav]', 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', @@ -552,7 +563,9 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/oriuplift', 'genres': ['Trance'], 'artists': ['Ori Uplift'], + 'tags': ['Orchestral', 'Emotional', 'Uplifting Trance', 'Trance', 'Ori Uplift', 'UpOnly'], }, + 'expected_warnings': ['Original download format is only available for registered users'], }, # no album art, use avatar pic for thumbnail { @@ -577,6 +590,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'uploader_url': 'https://soundcloud.com/garyvee', 'artists': ['MadReal'], + 'tags': [], }, 'params': { 'skip_download': True, @@ -604,8 +618,47 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'genres': ['Piano'], 'uploader_url': 'https://soundcloud.com/giovannisarani', + 'tags': 'count:10', }, }, + # .png "original" artwork, 160kbps m4a HLS format + { + 'url': 'https://soundcloud.com/skorxh/audio-dealer', + 'info_dict': { + 'id': '2011421339', + 'ext': 'm4a', + 'title': 'audio dealer', + 'description': '', + 'uploader': '$KORCH', + 'uploader_id': '150292288', + 'uploader_url': 'https://soundcloud.com/skorxh', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'duration': 213.469, + 'tags': [], + 'artists': ['$KORXH'], + 'track': 'audio dealer', + 'timestamp': 1737143201, + 'upload_date': '20250117', + 'license': 'all-rights-reserved', + 'thumbnail': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-original.png', + 'thumbnails': [ + {'id': 'mini', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-mini.jpg'}, + {'id': 'tiny', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-tiny.jpg'}, + {'id': 'small', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-small.jpg'}, + {'id': 'badge', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-badge.jpg'}, + {'id': 't67x67', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-t67x67.jpg'}, + {'id': 'large', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-large.jpg'}, + {'id': 't300x300', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-t300x300.jpg'}, + {'id': 'crop', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-crop.jpg'}, + {'id': 't500x500', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-t500x500.jpg'}, + {'id': 'original', 'url': 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-original.png'}, + ], + }, + 'params': {'skip_download': 'm3u8', 'format': 'hls_aac_160k'}, + }, { # AAC HQ format available (account with active subscription needed) 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index 773ddd344..d35214aa8 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -1,5 +1,6 @@ +from .bunnycdn import BunnyCdnIE from .common import InfoExtractor -from ..utils import try_get, unified_timestamp +from ..utils import make_archive_id, try_get, unified_timestamp class SovietsClosetBaseIE(InfoExtractor): @@ -43,7 +44,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'url': 'https://sovietscloset.com/video/1337', 'md5': 'bd012b04b261725510ca5383074cdd55', 'info_dict': { - 'id': '1337', + 'id': '2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67', 'ext': 'mp4', 'title': 'The Witcher #13', 'thumbnail': 
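The thumbnails list in the .png test above is what the artwork loop generates: each size ID is substituted into the URL with a .jpg extension, except 'original', which keeps the upload's real extension. A standalone sketch of that substitution:

import re

IMAGE_REPL_RE = r'-[0-9a-z]+\.(?P<ext>jpg|png)'
thumbnail = 'https://i1.sndcdn.com/artworks-a1wKGMYNreDLTMrT-fGjRiw-original.png'
orig_ext = re.search(IMAGE_REPL_RE, thumbnail).group('ext')
for image_id in ('mini', 'large', 't500x500', 'original'):
    # JPEG is served for every resized variant; only "original" keeps its true extension
    ext = orig_ext if image_id == 'original' else 'jpg'
    print(re.sub(IMAGE_REPL_RE, f'-{image_id}.{ext}', thumbnail))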
r're:^https?://.*\.b-cdn\.net/2f0cfbf4-3588-43a9-a7d6-7c9ea3755e67/thumbnail\.jpg$', @@ -55,20 +56,23 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'upload_date': '20170413', 'uploader_id': 'SovietWomble', 'uploader_url': 'https://www.twitch.tv/SovietWomble', - 'duration': 7007, + 'duration': 7008, 'was_live': True, 'availability': 'public', 'series': 'The Witcher', 'season': 'Misc', 'episode_number': 13, 'episode': 'Episode 13', + 'creators': ['SovietWomble'], + 'description': '', + '_old_archive_ids': ['sovietscloset 1337'], }, }, { 'url': 'https://sovietscloset.com/video/1105', 'md5': '89fa928f183893cb65a0b7be846d8a90', 'info_dict': { - 'id': '1105', + 'id': 'c0e5e76f-3a93-40b4-bf01-12343c2eec5d', 'ext': 'mp4', 'title': 'Arma 3 - Zeus Games #5', 'uploader': 'SovietWomble', @@ -80,39 +84,20 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'upload_date': '20160420', 'uploader_id': 'SovietWomble', 'uploader_url': 'https://www.twitch.tv/SovietWomble', - 'duration': 8804, + 'duration': 8805, 'was_live': True, 'availability': 'public', 'series': 'Arma 3', 'season': 'Zeus Games', 'episode_number': 5, 'episode': 'Episode 5', + 'creators': ['SovietWomble'], + 'description': '', + '_old_archive_ids': ['sovietscloset 1105'], }, }, ] - def _extract_bunnycdn_iframe(self, video_id, bunnycdn_id): - iframe = self._download_webpage( - f'https://iframe.mediadelivery.net/embed/5105/{bunnycdn_id}', - video_id, note='Downloading BunnyCDN iframe', headers=self.MEDIADELIVERY_REFERER) - - m3u8_url = self._search_regex(r'(https?://.*?\.m3u8)', iframe, 'm3u8 url') - thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url') - - m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER) - - if not m3u8_formats: - duration = None - else: - duration = self._extract_m3u8_vod_duration( - m3u8_formats[0]['url'], video_id, headers=self.MEDIADELIVERY_REFERER) - - return { - 'formats': m3u8_formats, - 'thumbnail': thumbnail_url, - 'duration': duration, - } - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -122,13 +107,13 @@ def _real_extract(self, url): stream = self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream'] - return { + return self.url_result( + f'https://iframe.mediadelivery.net/embed/5105/{stream["bunnyId"]}', ie=BunnyCdnIE, url_transparent=True, **self.video_meta( video_id=video_id, game_name=stream['game']['name'], category_name=try_get(stream, lambda x: x['subcategory']['name'], str), episode_number=stream.get('number'), stream_date=stream.get('date')), - **self._extract_bunnycdn_iframe(video_id, stream['bunnyId']), - } + _old_archive_ids=[make_archive_id(self, video_id)]) class SovietsClosetPlaylistIE(SovietsClosetBaseIE): diff --git a/yt_dlp/extractor/streaks.py b/yt_dlp/extractor/streaks.py new file mode 100644 index 000000000..1b3718473 --- /dev/null +++ b/yt_dlp/extractor/streaks.py @@ -0,0 +1,236 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + filter_dict, + float_or_none, + join_nonempty, + mimetype2ext, + parse_iso8601, + unsmuggle_url, + update_url_query, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class StreaksBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 'https://{}.api.streaks.jp/v1/projects/{}/medias/{}{}' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['JP'] + + def 
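Because the sovietscloset IDs change from the site's numeric IDs to BunnyCDN GUIDs, the _old_archive_ids entries keep existing --download-archive files working. make_archive_id produces exactly the token format the archive stores:

from yt_dlp.utils import make_archive_id

# Produces the '<lowercased ie_key> <video_id>' token stored by --download-archive,
# matching the 'sovietscloset 1337' entry in the test above
assert make_archive_id('SovietsCloset', '1337') == 'sovietscloset 1337'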
_extract_from_streaks_api(self, project_id, media_id, headers=None, query=None, ssai=False): + try: + response = self._download_json( + self._API_URL_TEMPLATE.format('playback', project_id, media_id, ''), + media_id, 'Downloading STREAKS playback API JSON', headers={ + 'Accept': 'application/json', + 'Origin': 'https://players.streaks.jp', + **self.geo_verification_headers(), + **(headers or {}), + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}: + error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False) + message = traverse_obj(error, ('message', {str})) + code = traverse_obj(error, ('code', {str})) + if code == 'REQUEST_FAILED': + self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES) + elif code == 'MEDIA_NOT_FOUND': + raise ExtractorError(message, expected=True) + elif code or message: + raise ExtractorError(join_nonempty(code, message, delim=': ')) + raise + + streaks_id = response['id'] + live_status = { + 'clip': 'was_live', + 'file': 'not_live', + 'linear': 'is_live', + 'live': 'is_live', + }.get(response.get('type')) + + formats, subtitles = [], {} + drm_formats = False + + for source in traverse_obj(response, ('sources', lambda _, v: v['src'])): + if source.get('key_systems'): + drm_formats = True + continue + + src_url = source['src'] + is_live = live_status == 'is_live' + ext = mimetype2ext(source.get('type')) + if ext != 'm3u8': + self.report_warning(f'Unsupported stream type: {ext}') + continue + + if is_live and ssai: + session_params = traverse_obj(self._download_json( + self._API_URL_TEMPLATE.format('ssai', project_id, streaks_id, '/ssai/session'), + media_id, 'Downloading session parameters', + headers={'Content-Type': 'application/json', 'Accept': 'application/json'}, + data=json.dumps({'id': source['id']}).encode(), + ), (0, 'query', {urllib.parse.parse_qs})) + src_url = update_url_query(src_url, session_params) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, media_id, 'mp4', m3u8_id='hls', fatal=False, live=is_live, query=query) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + if not formats and drm_formats: + self.report_drm(media_id) + self._remove_duplicate_formats(formats) + + for subs in traverse_obj(response, ( + 'tracks', lambda _, v: v['kind'] in ('captions', 'subtitles') and url_or_none(v['src']), + )): + lang = traverse_obj(subs, ('srclang', {str.lower})) or 'ja' + subtitles.setdefault(lang, []).append({'url': subs['src']}) + + return { + 'id': streaks_id, + 'display_id': media_id, + 'formats': formats, + 'live_status': live_status, + 'subtitles': subtitles, + 'uploader_id': project_id, + **traverse_obj(response, { + 'title': ('name', {str}), + 'description': ('description', {str}, filter), + 'duration': ('duration', {float_or_none}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'tags': ('tags', ..., {str}), + 'thumbnails': (('poster', 'thumbnail'), 'src', {'url': {url_or_none}}), + 'timestamp': ('created_at', {parse_iso8601}), + }), + } + + +class StreaksIE(StreaksBaseIE): + _VALID_URL = [ + r'https?://players\.streaks\.jp/(?P<project_id>[\w-]+)/[\da-f]+/index\.html\?(?:[^#]+&)?m=(?P<id>(?:ref:)?[\w-]+)', + r'https?://playback\.api\.streaks\.jp/v1/projects/(?P<project_id>[\w-]+)/medias/(?P<id>(?:ref:)?[\w-]+)', + ] + _EMBED_REGEX = [rf'<iframe\s+[^>]*\bsrc\s*=\s*["\'](?P<url>{_VALID_URL[0]})'] + _TESTS = [{ + 'url': 
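For SSAI live streams, the code above fetches per-session query parameters and merges them into the manifest URL. The shape of that merge, with a hypothetical session payload and manifest host:

from urllib.parse import parse_qs

from yt_dlp.utils import update_url_query

session_query = parse_qs('ssai_session=abc123&region=jp')  # assumed response payload
src_url = update_url_query('https://movie-s.streaks.jp/live/index.m3u8', session_query)
print(src_url)  # https://movie-s.streaks.jp/live/index.m3u8?ssai_session=abc123&region=jp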
'https://players.streaks.jp/tipness/08155cd19dc14c12bebefb69b92eafcc/index.html?m=dbdf2df35b4d483ebaeeaeb38c594647', + 'info_dict': { + 'id': 'dbdf2df35b4d483ebaeeaeb38c594647', + 'ext': 'mp4', + 'title': '3shunenCM_edit.mp4', + 'display_id': 'dbdf2df35b4d483ebaeeaeb38c594647', + 'duration': 47.533, + 'live_status': 'not_live', + 'modified_date': '20230726', + 'modified_timestamp': 1690356180, + 'timestamp': 1690355996, + 'upload_date': '20230726', + 'uploader_id': 'tipness', + }, + }, { + 'url': 'https://players.streaks.jp/ktv-web/0298e8964c164ab384c07ef6e08c444b/index.html?m=ref:mycoffeetime_250317', + 'info_dict': { + 'id': 'dccdc079e3fd41f88b0c8435e2d453ab', + 'ext': 'mp4', + 'title': 'わたしの珈琲時間_250317', + 'display_id': 'ref:mycoffeetime_250317', + 'duration': 122.99, + 'live_status': 'not_live', + 'modified_date': '20250310', + 'modified_timestamp': 1741586302, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1741585839, + 'upload_date': '20250310', + 'uploader_id': 'ktv-web', + }, + }, { + 'url': 'https://playback.api.streaks.jp/v1/projects/ktv-web/medias/b5411938e1e5435dac71edf829dd4813', + 'info_dict': { + 'id': 'b5411938e1e5435dac71edf829dd4813', + 'ext': 'mp4', + 'title': 'KANTELE_SYUSEi_0630', + 'display_id': 'b5411938e1e5435dac71edf829dd4813', + 'live_status': 'not_live', + 'modified_date': '20250122', + 'modified_timestamp': 1737522999, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1735205137, + 'upload_date': '20241226', + 'uploader_id': 'ktv-web', + }, + }, { + # TVer Olympics: website already down, but api remains accessible + 'url': 'https://playback.api.streaks.jp/v1/projects/tver-olympic/medias/ref:sp_240806_1748_dvr', + 'info_dict': { + 'id': 'c10f7345adb648cf804d7578ab93b2e3', + 'ext': 'mp4', + 'title': 'サッカー 男子 準決勝_dvr', + 'display_id': 'ref:sp_240806_1748_dvr', + 'duration': 12960.0, + 'live_status': 'was_live', + 'modified_date': '20240805', + 'modified_timestamp': 1722896263, + 'timestamp': 1722777618, + 'upload_date': '20240804', + 'uploader_id': 'tver-olympic', + }, + }, { + # TBS FREE: 24-hour stream + 'url': 'https://playback.api.streaks.jp/v1/projects/tbs/medias/ref:simul-02', + 'info_dict': { + 'id': 'c4e83a7b48f4409a96adacec674b4e22', + 'ext': 'mp4', + 'title': str, + 'display_id': 'ref:simul-02', + 'live_status': 'is_live', + 'modified_date': '20241031', + 'modified_timestamp': 1730339858, + 'timestamp': 1705466840, + 'upload_date': '20240117', + 'uploader_id': 'tbs', + }, + }, { + # DRM protected + 'url': 'https://players.streaks.jp/sp-jbc/a12d7ee0f40c49d6a0a2bff520639677/index.html?m=5f89c62f37ee4a68be8e6e3b1396c7d8', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://event.play.jp/playnext2023/', + 'info_dict': { + 'id': '2d975178293140dc8074a7fc536a7604', + 'ext': 'mp4', + 'title': 'PLAY NEXTキームービー(本番)', + 'uploader_id': 'play', + 'duration': 17.05, + 'thumbnail': r're:https?://.+\.jpg', + 'timestamp': 1668387517, + 'upload_date': '20221114', + 'modified_timestamp': 1739411523, + 'modified_date': '20250213', + 'live_status': 'not_live', + }, + }, { + 'url': 'https://wowshop.jp/Page/special/cooking_goods/?bid=wowshop&srsltid=AfmBOor_phUNoPEE_UCPiGGSCMrJE5T2US397smvsbrSdLqUxwON0el4', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '?bid=wowshop&srsltid=AfmBOor_phUNoPEE_UCPiGGSCMrJE5T2US397smvsbrSdLqUxwON0el4', + 'title': 'ワンランク上の料理道具でとびきりの“おいしい”を食卓へ|wowshop', + 'description': 'md5:914b5cb8624fc69274c7fb7b2342958f', + 'age_limit': 0, + 'thumbnail': 'https://wowshop.jp/Page/special/cooking_goods/images/ogp.jpg', + }, + }] + 
+ def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + project_id, media_id = self._match_valid_url(url).group('project_id', 'id') + + return self._extract_from_streaks_api( + project_id, media_id, headers=filter_dict({ + 'X-Streaks-Api-Key': smuggled_data.get('api_key'), + })) diff --git a/yt_dlp/extractor/taptap.py b/yt_dlp/extractor/taptap.py index e4c31da4e..f5900194f 100644 --- a/yt_dlp/extractor/taptap.py +++ b/yt_dlp/extractor/taptap.py @@ -191,12 +191,12 @@ class TapTapAppIE(TapTapBaseIE): }] -class TapTapIntlBase(TapTapBaseIE): +class TapTapIntlBaseIE(TapTapBaseIE): _X_UA = 'V=1&PN=WebAppIntl2&LANG=zh_TW&VN_CODE=115&VN=0.1.0&LOC=CN&PLT=PC&DS=Android&UID={uuid}&CURR=&DT=PC&OS=Windows&OSV=NT%208.0.0' _VIDEO_API = 'https://www.taptap.io/webapiv2/video-resource/v1/multi-get' -class TapTapAppIntlIE(TapTapIntlBase): +class TapTapAppIntlIE(TapTapIntlBaseIE): _VALID_URL = r'https?://www\.taptap\.io/app/(?P<id>\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/i/app/v5/detail' _DATA_PATH = 'app' @@ -227,7 +227,7 @@ class TapTapAppIntlIE(TapTapIntlBase): }] -class TapTapPostIntlIE(TapTapIntlBase): +class TapTapPostIntlIE(TapTapIntlBaseIE): _VALID_URL = r'https?://www\.taptap\.io/post/(?P<id>\d+)' _INFO_API = 'https://www.taptap.io/webapiv2/creation/post/v1/detail' _INFO_QUERY_KEY = 'id_str' diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py index 9ef621446..a34f2afd4 100644 --- a/yt_dlp/extractor/telecinco.py +++ b/yt_dlp/extractor/telecinco.py @@ -46,7 +46,7 @@ def _parse_content(self, content, url): error_code = traverse_obj( self._webpage_read_content(error.cause.response, caronte['cerbero'], video_id, fatal=False), ({json.loads}, 'code', {int})) - if error_code == 4038: + if error_code in (4038, 40313): self.raise_geo_restricted(countries=['ES']) raise diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9e53b3407..d9280cec1 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -26,6 +26,7 @@ srt_subtitles_timecode, str_or_none, traverse_obj, + truncate_string, try_call, try_get, url_or_none, @@ -249,6 +250,12 @@ def _extract_web_data_and_status(self, url, video_id, fatal=True): elif fatal: raise ExtractorError('Unable to extract webpage video data') + if not traverse_obj(video_data, ('video', {dict})) and traverse_obj(video_data, ('isContentClassified', {bool})): + message = 'This post may not be comfortable for some audiences. Log in for access' + if fatal: + self.raise_login_required(message) + self.report_warning(f'{message}. 
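StreaksIE receives its optional API key through URL smuggling: a caller attaches extra data to the URL fragment, and _real_extract unpacks it again. The round trip in isolation (media URL and key value hypothetical):

from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url(
    'https://playback.api.streaks.jp/v1/projects/demo/medias/ref:demo',
    {'api_key': 'xxxxxxxx'})
clean_url, data = unsmuggle_url(url, {})
assert data == {'api_key': 'xxxxxxxx'}
assert '#__youtubedl_smuggle' in url  # the extra data rides along in the fragment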
{self._login_hint()}', video_id=video_id) + return video_data, status def _get_subtitles(self, aweme_detail, aweme_id, user_name): @@ -438,7 +445,7 @@ def extract_addr(addr, add_meta={}): return { 'id': aweme_id, **traverse_obj(aweme_detail, { - 'title': ('desc', {str}), + 'title': ('desc', {truncate_string(left=72)}), 'description': ('desc', {str}), 'timestamp': ('create_time', {int_or_none}), }), @@ -589,7 +596,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_fl 'duration': ('duration', {int_or_none}), })), **traverse_obj(aweme_detail, { - 'title': ('desc', {str}), + 'title': ('desc', {truncate_string(left=72)}), 'description': ('desc', {str}), # audio-only slideshows have a video duration of 0 and an actual audio duration 'duration': ('video', 'duration', {int_or_none}, filter), @@ -650,7 +657,7 @@ class TikTokIE(TikTokBaseIE): 'info_dict': { 'id': '6742501081818877190', 'ext': 'mp4', - 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94', + 'title': 'Tag 1 Friend reverse this Video and look what happens 🤩😱 @skyandtami ...', 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94', 'duration': 27, 'height': 1024, @@ -854,7 +861,7 @@ class TikTokIE(TikTokBaseIE): 'info_dict': { 'id': '7253412088251534594', 'ext': 'm4a', - 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ', + 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #р...', 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ', 'uploader': 'hara_yoimiya', 'uploader_id': '6582536342634676230', @@ -895,8 +902,12 @@ def _real_extract(self, url): if video_data and status == 0: return self._parse_aweme_video_web(video_data, url, video_id) - elif status == 10216: - raise ExtractorError('This video is private', expected=True) + elif status in (10216, 10222): + # 10216: private post; 10222: private account + self.raise_login_required( + 'You do not have permission to view this post. Log into an account that has access') + elif status == 10204: + raise ExtractorError('Your IP address is blocked from accessing this post', expected=True) raise ExtractorError(f'Video not available, status code {status}', video_id=video_id) diff --git a/yt_dlp/extractor/tv2dk.py b/yt_dlp/extractor/tv2dk.py index 9cd7606b0..bad120f7b 100644 --- a/yt_dlp/extractor/tv2dk.py +++ b/yt_dlp/extractor/tv2dk.py @@ -2,12 +2,13 @@ import re from .common import InfoExtractor +from .jwplatform import JWPlatformIE from ..utils import ( determine_ext, - extract_attributes, js_to_json, url_or_none, ) +from ..utils.traversal import find_element, traverse_obj class TV2DKIE(InfoExtractor): @@ -21,35 +22,46 @@ class TV2DKIE(InfoExtractor): tv2fyn| tv2east| tv2lorry| - tv2nord + tv2nord| + tv2kosmopol )\.dk/ - (:[^/]+/)* + (?:[^/?#]+/)* (?P<id>[^/?\#&]+) ''' _TESTS = [{ 'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player', 'info_dict': { - 'id': '0_52jmwa0p', + 'id': 'sPp5z21q', 'ext': 'mp4', 'title': '19:30 - 28. okt. 
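The TikTok title change caps 'desc'-derived titles at 72 characters, which is why the expected titles above now end in '...'. truncate_string supports partial application, so {truncate_string(left=72)} inside a traversal becomes a one-argument truncator:

from yt_dlp.utils import truncate_string

title = truncate_string('x' * 100, left=72)
assert len(title) == 72 and title.endswith('...')  # the ellipsis counts towards the budget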
2019', - 'timestamp': 1572290248, + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/sPp5z21q/poster.jpg?width=720', + 'timestamp': 1572287400, 'upload_date': '20191028', - 'uploader_id': 'tvsyd', - 'duration': 1347, - 'view_count': int, }, - 'add_ie': ['Kaltura'], }, { 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', 'info_dict': { - 'id': '1_7iwll9n0', + 'id': 'oD9cyq0m', 'ext': 'mp4', - 'upload_date': '20211027', 'title': 'Gadekamp #6 - Højhuse i København', - 'uploader_id': 'tv2lorry', - 'timestamp': 1635345229, + 'description': '', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/oD9cyq0m/poster.jpg?width=720', + 'timestamp': 1635348600, + 'upload_date': '20211027', }, - 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tvsyd.dk/haderslev/x-factor-brodre-fulde-af-selvtillid-er-igen-hjemme-hos-mor-vores-diagnoser-har-vaeret-en-fordel', + 'info_dict': { + 'id': 'x-factor-brodre-fulde-af-selvtillid-er-igen-hjemme-hos-mor-vores-diagnoser-har-vaeret-en-fordel', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.tv2ostjylland.dk/aarhus/dom-kan-fa-alvorlige-konsekvenser', + 'info_dict': { + 'id': 'dom-kan-fa-alvorlige-konsekvenser', + }, + 'playlist_count': 3, }, { 'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi', 'only_matching': True, @@ -71,40 +83,22 @@ class TV2DKIE(InfoExtractor): }, { 'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt', 'only_matching': True, + }, { + 'url': 'https://www.tv2kosmopol.dk/metropolen/chaufforer-beordres-til-at-kore-videre-i-ulovlige-busser-med-rode-advarselslamper', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + search_space = traverse_obj(webpage, {find_element(tag='article')}) or webpage - entries = [] + player_ids = traverse_obj( + re.findall(r'x-data="(?:video_player|simple_player)\(({[^"]+})', search_space), + (..., {js_to_json}, {json.loads}, ('jwpMediaId', 'videoId'), {str})) - def add_entry(partner_id, kaltura_id): - entries.append(self.url_result( - f'kaltura:{partner_id}:{kaltura_id}', 'Kaltura', - video_id=kaltura_id)) - - for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): - video = extract_attributes(video_el) - kaltura_id = video.get('data-entryid') - if not kaltura_id: - continue - partner_id = video.get('data-partnerid') - if not partner_id: - continue - add_entry(partner_id, kaltura_id) - if not entries: - kaltura_id = self._search_regex( - (r'entry_id\s*:\s*["\']([0-9a-z_]+)', - r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') - partner_id = self._search_regex( - (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, - 'partner id') - add_entry(partner_id, kaltura_id) - if len(entries) == 1: - return entries[0] - return self.playlist_result(entries) + return self.playlist_from_matches( + player_ids, video_id, getter=lambda x: f'jwplatform:{x}', ie=JWPlatformIE) class TV2DKBornholmPlayIE(InfoExtractor): diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index f3daf8946..805150db4 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -1,31 +1,70 @@ -from .common import InfoExtractor +from .streaks import StreaksBaseIE from ..utils import ( ExtractorError, + int_or_none, join_nonempty, + make_archive_id, smuggle_url, str_or_none, strip_or_none, - traverse_obj, update_url_query, ) +from ..utils.traversal import require, traverse_obj -class TVerIE(InfoExtractor): +class 
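The new tv2dk flow scrapes Alpine.js-style x-data attributes, converts the relaxed JS object literal with js_to_json, and hands each jwpMediaId/videoId to JWPlatformIE. The parsing step in isolation, with hypothetical markup:

import json
import re

from yt_dlp.utils import js_to_json

html = '<div x-data="video_player({jwpMediaId: \'sPp5z21q\'})"></div>'  # hypothetical markup
raw = re.search(r'x-data="(?:video_player|simple_player)\(({[^"]+})', html).group(1)
assert json.loads(js_to_json(raw))['jwpMediaId'] == 'sPp5z21q'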
TVerIE(StreaksBaseIE): _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature)/)+(?P<id>[a-zA-Z0-9]+)' + _GEO_COUNTRIES = ['JP'] + _GEO_BYPASS = False _TESTS = [{ - 'skip': 'videos are only available for 7 days', - 'url': 'https://tver.jp/episodes/ep83nf3w4p', + # via Streaks backend + 'url': 'https://tver.jp/episodes/epc1hdugbk', 'info_dict': { - 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', - 'series': '家事ヤロウ!!!', - 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', - 'channel': 'テレビ朝日', - 'id': 'ep83nf3w4p', + 'id': 'epc1hdugbk', 'ext': 'mp4', + 'display_id': 'ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068', + 'title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル)', + 'alt_title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル) 日テレ', + 'description': 'md5:2726f742d5e3886edeaf72fb6d740fef', + 'uploader_id': 'tver-ntv', + 'channel': '日テレ', + 'duration': 1158.024, + 'thumbnail': 'https://statics.tver.jp/images/content/thumbnail/episode/xlarge/epc1hdugbk.jpg?v=16', + 'series': '神回だけ見せます!', + 'episode': '#2 壮烈!車大騎馬戦(木曜スペシャル)', + 'episode_number': 2, + 'timestamp': 1736486036, + 'upload_date': '20250110', + 'modified_timestamp': 1736870264, + 'modified_date': '20250114', + 'live_status': 'not_live', + 'release_timestamp': 1651453200, + 'release_date': '20220502', + '_old_archive_ids': ['brightcovenew ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068'], }, - 'add_ie': ['BrightcoveNew'], + }, { + # via Brightcove backend (deprecated) + 'url': 'https://tver.jp/episodes/epc1hdugbk', + 'info_dict': { + 'id': 'ref:baeebeac-a2a6-4dbf-9eb3-c40d59b40068', + 'ext': 'mp4', + 'title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル)', + 'alt_title': '神回だけ見せます! #2 壮烈!車大騎馬戦(木曜スペシャル) 日テレ', + 'description': 'md5:2726f742d5e3886edeaf72fb6d740fef', + 'uploader_id': '4394098882001', + 'channel': '日テレ', + 'duration': 1158.101, + 'thumbnail': 'https://statics.tver.jp/images/content/thumbnail/episode/xlarge/epc1hdugbk.jpg?v=16', + 'tags': [], + 'series': '神回だけ見せます!', + 'episode': '#2 壮烈!車大騎馬戦(木曜スペシャル)', + 'episode_number': 2, + 'timestamp': 1651388531, + 'upload_date': '20220501', + 'release_timestamp': 1651453200, + 'release_date': '20220502', + }, + 'params': {'extractor_args': {'tver': {'backend': ['brightcove']}}}, }, { 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, @@ -38,26 +77,7 @@ class TVerIE(InfoExtractor): 'id': 'srtxft431v', 'title': '名探偵コナン', }, - 'playlist': [ - { - 'md5': '779ffd97493ed59b0a6277ea726b389e', - 'info_dict': { - 'id': 'ref:conan-1137-241005', - 'ext': 'mp4', - 'title': '名探偵コナン #1137「行列店、味変の秘密」', - 'uploader_id': '5330942432001', - 'tags': [], - 'channel': '読売テレビ', - 'series': '名探偵コナン', - 'description': 'md5:601fccc1d2430d942a2c8068c4b33eb5', - 'episode': '#1137「行列店、味変の秘密」', - 'duration': 1469.077, - 'timestamp': 1728030405, - 'upload_date': '20241004', - 'alt_title': '名探偵コナン #1137「行列店、味変の秘密」 読売テレビ 10月5日(土)放送分', - 'thumbnail': r're:https://.+\.jpg', - }, - }], + 'playlist_mincount': 21, }, { 'url': 'https://tver.jp/series/sru35hwdd2', 'info_dict': { @@ -70,7 +90,11 @@ class TVerIE(InfoExtractor): 'only_matching': True, }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' - _HEADERS = {'x-tver-platform-type': 'web'} + _HEADERS = { + 'x-tver-platform-type': 'web', + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + } _PLATFORM_QUERY = {} def _real_initialize(self): @@ -103,6 +127,9 @@ def _yield_episode_ids_for_series(self, 
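The second test above exercises the deprecated Brightcove path through an extractor-arg, which is what _configuration_arg('backend', ['streaks']) reads at runtime. On the command line that is --extractor-args "tver:backend=brightcove"; via the Python API:

import yt_dlp

ydl_opts = {'extractor_args': {'tver': {'backend': ['brightcove']}}}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info('https://tver.jp/episodes/epc1hdugbk', download=False)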
series_id): def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') + backend = self._configuration_arg('backend', ['streaks'])[0] + if backend not in ('brightcove', 'streaks'): + raise ExtractorError(f'Invalid backend value: {backend}', expected=True) if video_type == 'series': series_info = self._call_platform_api( @@ -129,12 +156,6 @@ def _real_extract(self, url): video_info = self._download_json( f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, 'Downloading video info', query={'v': version}, headers={'Referer': 'https://tver.jp/'}) - p_id = video_info['video']['accountID'] - r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) - if not r_id: - raise ExtractorError('Failed to extract reference ID for Brightcove') - if not r_id.isdigit(): - r_id = f'ref:{r_id}' episode = strip_or_none(episode_content.get('title')) series = str_or_none(episode_content.get('seriesTitle')) @@ -161,17 +182,53 @@ def _real_extract(self, url): ] ] - return { - '_type': 'url_transparent', + metadata = { 'title': title, 'series': series, 'episode': episode, # another title, which some viewers consider the "full title" 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), 'channel': provider, - 'description': str_or_none(video_info.get('description')), 'thumbnails': thumbnails, - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), - 'ie_key': 'BrightcoveNew', + **traverse_obj(video_info, { + 'description': ('description', {str}), + 'release_timestamp': ('viewStatus', 'startAt', {int_or_none}), + 'episode_number': ('no', {int_or_none}), + }), + } + + brightcove_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID'), {str}, any)) + if brightcove_id and not brightcove_id.isdecimal(): + brightcove_id = f'ref:{brightcove_id}' + + streaks_id = traverse_obj(video_info, ('streaks', 'videoRefID', {str})) + if streaks_id and not streaks_id.startswith('ref:'): + streaks_id = f'ref:{streaks_id}' + + # The deprecated Brightcove extraction is reachable via extractor-arg or as a fallback; errors are expected + if backend == 'brightcove' or not streaks_id: + if backend != 'brightcove': + self.report_warning( + 'No STREAKS ID found; falling back to Brightcove extraction', video_id=video_id) + if not brightcove_id: + raise ExtractorError('Unable to extract brightcove reference ID', expected=True) + account_id = traverse_obj(video_info, ( + 'video', 'accountID', {str}, {require('brightcove account ID', expected=True)})) + return { + **metadata, + '_type': 'url_transparent', + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id), + {'geo_countries': ['JP']}), + 'ie_key': 'BrightcoveNew', + } + + return { + **self._extract_from_streaks_api(video_info['streaks']['projectID'], streaks_id, { + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + }), + **metadata, + 'id': video_id, + '_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None, } diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index da3082907..416cbab3c 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -513,7 +513,7 @@ def _parse_video(self, video, with_url=True): class TVPVODVideoIE(TVPVODBaseIE): IE_NAME = 'tvp:vod' - _VALID_URL =
r'https?://vod\.tvp\.pl/(?P<category>[a-z\d-]+,\d+)/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek--?\d+,S-?\d+E-?\d+)?,(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', @@ -568,6 +568,9 @@ class TVPVODVideoIE(TVPVODBaseIE): 'live_status': 'is_live', 'thumbnail': 're:https?://.+', }, + }, { + 'url': 'https://vod.tvp.pl/informacje-i-publicystyka,205/konskie-2025-debata-przedwyborcza-odcinki,2028435/odcinek--1,S01E-1,2028419', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/tvw.py b/yt_dlp/extractor/tvw.py new file mode 100644 index 000000000..0ab926dbd --- /dev/null +++ b/yt_dlp/extractor/tvw.py @@ -0,0 +1,165 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + parse_qs, + remove_end, + require, + unified_timestamp, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj + + +class TvwIE(InfoExtractor): + IE_NAME = 'tvw' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/video/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/video/billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211/', + 'md5': '9ceb94fe2bb7fd726f74f16356825703', + 'info_dict': { + 'id': '2024011211', + 'ext': 'mp4', + 'title': 'Billy Frank Jr. Statue Maquette Unveiling Ceremony', + 'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$', + 'description': 'md5:58a8150017d985b4f377e11ee8f6f36e', + 'timestamp': 1704902400, + 'upload_date': '20240110', + 'location': 'Legislative Building', + 'display_id': 'billy-frank-jr-statue-maquette-unveiling-ceremony-2024011211', + 'categories': ['General Interest'], + }, + }, { + 'url': 'https://tvw.org/video/ebeys-landing-state-park-2024081007/', + 'md5': '71e87dae3deafd65d75ff3137b9a32fc', + 'info_dict': { + 'id': '2024081007', + 'ext': 'mp4', + 'title': 'Ebey\'s Landing State Park', + 'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$', + 'description': 'md5:50c5bd73bde32fa6286a008dbc853386', + 'timestamp': 1724310900, + 'upload_date': '20240822', + 'location': 'Ebey’s Landing State Park', + 'display_id': 'ebeys-landing-state-park-2024081007', + 'categories': ['Washington State Parks'], + }, + }, { + 'url': 'https://tvw.org/video/home-warranties-workgroup-2', + 'md5': 'f678789bf94d07da89809f213cf37150', + 'info_dict': { + 'id': '1999121000', + 'ext': 'mp4', + 'title': 'Home Warranties Workgroup', + 'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$', + 'description': 'md5:861396cc523c9641d0dce690bc5c35f3', + 'timestamp': 946389600, + 'upload_date': '19991228', + 'display_id': 'home-warranties-workgroup-2', + 'categories': ['Legislative'], + }, + }, { + 'url': 'https://tvw.org/video/washington-to-washington-a-new-space-race-2022041111/?eventID=2022041111', + 'md5': '6f5551090b351aba10c0d08a881b4f30', + 'info_dict': { + 'id': '2022041111', + 'ext': 'mp4', + 'title': 'Washington to Washington - A New Space Race', + 'thumbnail': r're:^https?://.*\.(?:jpe?g|png)$', + 'description': 'md5:f65a24eec56107afbcebb3aa5cd26341', + 'timestamp': 1650394800, + 'upload_date': '20220419', + 'location': 'Hayner Media Center', + 'display_id': 'washington-to-washington-a-new-space-race-2022041111', + 'categories': ['Washington to Washington', 'General Interest'], + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + client_id = self._html_search_meta('clientID', webpage, fatal=True) + video_id = 
self._html_search_meta('eventID', webpage, fatal=True) + + video_data = self._download_json( + 'https://api.v3.invintus.com/v2/Event/getDetailed', video_id, + headers={ + 'authorization': 'embedder', + 'wsc-api-key': '7WhiEBzijpritypp8bqcU7pfU9uicDR', + }, + data=json.dumps({ + 'clientID': client_id, + 'eventID': video_id, + 'showStreams': True, + }).encode())['data'] + + formats = [] + subtitles = {} + for stream_url in traverse_obj(video_data, ('streamingURIs', ..., {url_or_none})): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + if caption_url := traverse_obj(video_data, ('captionPath', {url_or_none})): + subtitles.setdefault('en', []).append({'url': caption_url, 'ext': 'vtt'}) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'description': self._og_search_description(webpage, default=None), + **traverse_obj(video_data, { + 'title': ('title', {str}), + 'description': ('description', {clean_html}), + 'categories': ('categories', ..., {str}), + 'thumbnail': ('videoThumbnail', {url_or_none}), + 'timestamp': ('startDateTime', {unified_timestamp}), + 'location': ('locationName', {str}), + 'is_live': ('eventStatus', {lambda x: x == 'live'}), + }), + } + + +class TvwTvChannelsIE(InfoExtractor): + IE_NAME = 'tvw:tvchannels' + _VALID_URL = r'https?://(?:www\.)?tvw\.org/tvchannels/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://tvw.org/tvchannels/air/', + 'info_dict': { + 'id': 'air', + 'ext': 'mp4', + 'title': r're:TVW Cable Channel Live Stream', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }, { + 'url': 'https://tvw.org/tvchannels/tvw2/', + 'info_dict': { + 'id': 'tvw2', + 'ext': 'mp4', + 'title': r're:TVW-2 Broadcast Channel', + 'thumbnail': r're:https?://.+/.+\.(?:jpe?g|png)$', + 'live_status': 'is_live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8_url = traverse_obj(webpage, ( + {find_element(id='invintus-persistent-stream-frame', html=True)}, {extract_attributes}, + 'src', {parse_qs}, 'encoder', 0, {json.loads}, 'live247URI', {url_or_none}, {require('stream url')})) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True), + 'title': remove_end(self._og_search_title(webpage, default=None), ' - TVW'), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'is_live': True, + } diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index bf9c6348c..0a7f95c21 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -14,12 +14,13 @@ parse_duration, qualities, str_to_int, - traverse_obj, try_get, unified_timestamp, + url_or_none, urlencode_postdata, urljoin, ) +from ..utils.traversal import traverse_obj class TwitCastingIE(InfoExtractor): @@ -138,13 +139,7 @@ def _real_extract(self, url): r'data-toggle="true"[^>]+datetime="([^"]+)"', webpage, 'datetime', None)) - stream_server_data = self._download_json( - f'https://twitcasting.tv/streamserver.php?target={uploader_id}&mode=client', video_id, - 'Downloading live info', fatal=False) - is_live = any(f'data-{x}' in webpage for x in ['is-onlive="true"', 'live-type="live"', 'status="online"']) - if not 
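TvwTvChannelsIE digs the live manifest out of the persistent-stream iframe: the src URL carries an 'encoder' query parameter whose value is JSON holding live247URI. The same chain in plain stdlib calls (src shown unencoded, host and URL hypothetical):

import json
from urllib.parse import parse_qs, urlparse

src = 'https://player.example.com/frame?encoder={"live247URI": "https://example.com/live.m3u8"}'
encoder = json.loads(parse_qs(urlparse(src).query)['encoder'][0])
assert encoder['live247URI'] == 'https://example.com/live.m3u8'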
traverse_obj(stream_server_data, 'llfmp4') and is_live: - self.raise_login_required(method='cookies') base_dict = { 'title': title, @@ -165,28 +160,37 @@ def find_dmu(x): return [data_movie_url] m3u8_urls = (try_get(webpage, find_dmu, list) - or traverse_obj(video_js_data, (..., 'source', 'url')) - or ([f'https://twitcasting.tv/{uploader_id}/metastream.m3u8'] if is_live else None)) - if not m3u8_urls: - raise ExtractorError('Failed to get m3u8 playlist') + or traverse_obj(video_js_data, (..., 'source', 'url'))) if is_live: - m3u8_url = m3u8_urls[0] - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='hls', - live=True, headers=self._M3U8_HEADERS) + stream_data = self._download_json( + 'https://twitcasting.tv/streamserver.php', + video_id, 'Downloading live info', query={ + 'target': uploader_id, + 'mode': 'client', + 'player': 'pc_web', + }) - if traverse_obj(stream_server_data, ('hls', 'source')): - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', m3u8_id='source', - live=True, query={'mode': 'source'}, - note='Downloading source quality m3u8', - headers=self._M3U8_HEADERS, fatal=False)) + formats = [] + # low: 640x360, medium: 1280x720, high: 1920x1080 + qq = qualities(['low', 'medium', 'high']) + for quality, m3u8_url in traverse_obj(stream_data, ( + 'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): + formats.append({ + 'url': m3u8_url, + 'format_id': f'hls-{quality}', + 'ext': 'mp4', + 'quality': qq(quality), + 'protocol': 'm3u8', + 'http_headers': self._M3U8_HEADERS, + }) if websockets: qq = qualities(['base', 'mobilesource', 'main']) - streams = traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {} - for mode, ws_url in streams.items(): + for mode, ws_url in traverse_obj(stream_data, ( + 'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), + )): formats.append({ 'url': ws_url, 'format_id': f'ws-{mode}', @@ -197,10 +201,15 @@ def find_dmu(x): 'protocol': 'websocket_frag', }) + if not formats: + self.raise_login_required() + infodict = { 'formats': formats, '_format_sort_fields': ('source', ), } + elif not m3u8_urls: + raise ExtractorError('Failed to get m3u8 playlist') elif len(m3u8_urls) == 1: formats = self._extract_m3u8_formats( m3u8_urls[0], video_id, 'mp4', headers=self._M3U8_HEADERS) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index 44b19ad13..4f4c59627 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -14,19 +14,20 @@ dict_get, float_or_none, int_or_none, + join_nonempty, make_archive_id, parse_duration, parse_iso8601, parse_qs, qualities, str_or_none, - traverse_obj, try_get, unified_timestamp, update_url_query, url_or_none, urljoin, ) +from ..utils.traversal import traverse_obj, value class TwitchBaseIE(InfoExtractor): @@ -42,10 +43,10 @@ class TwitchBaseIE(InfoExtractor): 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ShareClipRenderStatus': 'f130048a462a0ac86bb54d653c968c514e9ab9ca94db52368c1179e97b0f16eb', 'ChannelCollectionsContent': '447aec6a0cc1e8d0a8d7732d47eb0762c336a2294fdb009e9c9d854e49d484b9', 'StreamMetadata': 'a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962', 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 
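# Editor's note: the `qualities` helper used in TwitCastingIE above (for both
# the HLS variants and the websocket modes) builds a ranking function from an
# ordered list: later entries rank higher, unknown names rank below everything.
from yt_dlp.utils import qualities

qq = qualities(['low', 'medium', 'high'])
assert [qq(name) for name in ('low', 'medium', 'high')] == [0, 1, 2]
assert qq('unknown') == -1  # unlisted quality names sort last
# so 'quality': qq('high') marks the 1920x1080 variant as the preferred format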
'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '49b5b8f268cdeb259d75b58dcb0c1a748e3b575003448a2333dc5cdafd49adad', 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', @@ -1083,16 +1084,44 @@ class TwitchClipsIE(TwitchBaseIE): 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { - 'id': '42850523', + 'id': '396245304', 'display_id': 'FaintLightGullWholeWheat', 'ext': 'mp4', 'title': 'EA Play 2016 Live from the Novo Theatre', + 'duration': 32, + 'view_count': int, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1465767393, 'upload_date': '20160612', - 'creator': 'EA', - 'uploader': 'stereotype_', - 'uploader_id': '43566419', + 'creators': ['EA'], + 'channel': 'EA', + 'channel_id': '25163635', + 'channel_is_verified': False, + 'channel_follower_count': int, + 'uploader': 'EA', + 'uploader_id': '25163635', + }, + }, { + 'url': 'https://www.twitch.tv/xqc/clip/CulturedAmazingKuduDatSheffy-TiZ_-ixAGYR3y2Uy', + 'md5': 'e90fe616b36e722a8cfa562547c543f0', + 'info_dict': { + 'id': '3207364882', + 'display_id': 'CulturedAmazingKuduDatSheffy-TiZ_-ixAGYR3y2Uy', + 'ext': 'mp4', + 'title': 'A day in the life of xQc', + 'duration': 60, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1742869615, + 'upload_date': '20250325', + 'creators': ['xQc'], + 'channel': 'xQc', + 'channel_id': '71092938', + 'channel_is_verified': True, + 'channel_follower_count': int, + 'uploader': 'xQc', + 'uploader_id': '71092938', + 'categories': ['Just Chatting'], }, }, { # multiple formats @@ -1116,16 +1145,14 @@ class TwitchClipsIE(TwitchBaseIE): }] def _real_extract(self, url): - video_id = self._match_id(url) + slug = self._match_id(url) clip = self._download_gql( - video_id, [{ - 'operationName': 'VideoAccessToken_Clip', - 'variables': { - 'slug': video_id, - }, + slug, [{ + 'operationName': 'ShareClipRenderStatus', + 'variables': {'slug': slug}, }], - 'Downloading clip access token GraphQL')[0]['data']['clip'] + 'Downloading clip GraphQL')[0]['data']['clip'] if not clip: raise ExtractorError( @@ -1135,81 +1162,71 @@ def _real_extract(self, url): 'sig': clip['playbackAccessToken']['signature'], 'token': clip['playbackAccessToken']['value'], } - - data = self._download_base_gql( - video_id, { - 'query': '''{ - clip(slug: "%s") { - broadcaster { - displayName - } - createdAt - curator { - displayName - id - } - durationSeconds - id - tiny: thumbnailURL(width: 86, height: 45) - small: thumbnailURL(width: 260, height: 147) - medium: thumbnailURL(width: 480, height: 272) - title - videoQualities { - frameRate - quality - sourceURL - } - viewCount - } -}''' % video_id}, 'Downloading clip GraphQL', fatal=False) # noqa: UP031 - - if data: - clip = try_get(data, lambda x: x['data']['clip'], dict) or clip + asset_default = traverse_obj(clip, ('assets', 0, {dict})) or {} + asset_portrait = traverse_obj(clip, ('assets', 1, {dict})) or {} formats = [] - for option in clip.get('videoQualities', []): - if not isinstance(option, dict): - continue - source = url_or_none(option.get('sourceURL')) - if not source: - continue + default_aspect_ratio = float_or_none(asset_default.get('aspectRatio')) + formats.extend(traverse_obj(asset_default, ('videoQualities', lambda _, v: url_or_none(v['sourceURL']), { + 'url': ('sourceURL', 
{update_url_query(query=access_query)}), + 'format_id': ('quality', {str}), + 'height': ('quality', {int_or_none}), + 'fps': ('frameRate', {float_or_none}), + 'aspect_ratio': {value(default_aspect_ratio)}, + }))) + portrait_aspect_ratio = float_or_none(asset_portrait.get('aspectRatio')) + for source in traverse_obj(asset_portrait, ('videoQualities', lambda _, v: url_or_none(v['sourceURL']))): formats.append({ - 'url': update_url_query(source, access_query), - 'format_id': option.get('quality'), - 'height': int_or_none(option.get('quality')), - 'fps': int_or_none(option.get('frameRate')), + 'url': update_url_query(source['sourceURL'], access_query), + 'format_id': join_nonempty('portrait', source.get('quality')), + 'height': int_or_none(source.get('quality')), + 'fps': float_or_none(source.get('frameRate')), + 'aspect_ratio': portrait_aspect_ratio, + 'quality': -2, }) thumbnails = [] - for thumbnail_id in ('tiny', 'small', 'medium'): - thumbnail_url = clip.get(thumbnail_id) - if not thumbnail_url: - continue - thumb = { - 'id': thumbnail_id, - 'url': thumbnail_url, - } - mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url) - if mobj: - thumb.update({ - 'height': int(mobj.group(2)), - 'width': int(mobj.group(1)), - }) - thumbnails.append(thumb) + thumb_asset_default_url = url_or_none(asset_default.get('thumbnailURL')) + if thumb_asset_default_url: + thumbnails.append({ + 'id': 'default', + 'url': thumb_asset_default_url, + 'preference': 0, + }) + if thumb_asset_portrait_url := url_or_none(asset_portrait.get('thumbnailURL')): + thumbnails.append({ + 'id': 'portrait', + 'url': thumb_asset_portrait_url, + 'preference': -1, + }) + thumb_default_url = url_or_none(clip.get('thumbnailURL')) + if thumb_default_url and thumb_default_url != thumb_asset_default_url: + thumbnails.append({ + 'id': 'small', + 'url': thumb_default_url, + 'preference': -2, + }) old_id = self._search_regex(r'%7C(\d+)(?:-\d+)?.mp4', formats[-1]['url'], 'old id', default=None) return { - 'id': clip.get('id') or video_id, + 'id': clip.get('id') or slug, '_old_archive_ids': [make_archive_id(self, old_id)] if old_id else None, - 'display_id': video_id, - 'title': clip.get('title'), + 'display_id': slug, 'formats': formats, - 'duration': int_or_none(clip.get('durationSeconds')), - 'view_count': int_or_none(clip.get('viewCount')), - 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, - 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], str), - 'uploader': try_get(clip, lambda x: x['curator']['displayName'], str), - 'uploader_id': try_get(clip, lambda x: x['curator']['id'], str), + **traverse_obj(clip, { + 'title': ('title', {str}), + 'duration': ('durationSeconds', {int_or_none}), + 'view_count': ('viewCount', {int_or_none}), + 'timestamp': ('createdAt', {parse_iso8601}), + 'creators': ('broadcaster', 'displayName', {str}, filter, all), + 'channel': ('broadcaster', 'displayName', {str}), + 'channel_id': ('broadcaster', 'id', {str}), + 'channel_follower_count': ('broadcaster', 'followers', 'totalCount', {int_or_none}), + 'channel_is_verified': ('broadcaster', 'isPartner', {bool}), + 'uploader': ('curator', 'displayName', {str}), + 'uploader_id': ('curator', 'id', {str}), + 'categories': ('game', 'displayName', {str}, filter, all, filter), + }), } diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index c05b5bf9c..5eee3e726 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,11 +1,12 @@ import functools import json -import random +import math 
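# Editor's note: the `{update_url_query(query=access_query)}` step in the
# Twitch clip traversal above relies on yt-dlp utility functions supporting
# partial application: called with only keyword arguments, update_url_query
# returns a one-argument transform that traverse_obj then applies to each
# sourceURL. Standalone illustration; the sig/token values are placeholders:
from yt_dlp.utils import update_url_query

add_auth = update_url_query(query={'sig': 'SIG', 'token': 'TOKEN'})
print(add_auth('https://clips.example/clip.mp4'))
# -> https://clips.example/clip.mp4?sig=SIG&token=TOKEN (assuming no existing query)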
import re import urllib.parse from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE +from ..jsinterp import js_number_to_string from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, @@ -20,6 +21,7 @@ str_or_none, strip_or_none, traverse_obj, + truncate_string, try_call, try_get, unified_timestamp, @@ -357,6 +359,7 @@ class TwitterCardIE(InfoExtractor): 'display_id': '560070183650213889', 'uploader_url': 'https://twitter.com/Twitter', }, + 'skip': 'This content is no longer available.', }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', @@ -364,7 +367,7 @@ class TwitterCardIE(InfoExtractor): 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", + 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASA...", 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA", 'uploader': 'NASA', 'uploader_id': 'NASA', @@ -376,12 +379,14 @@ class TwitterCardIE(InfoExtractor): 'like_count': int, 'repost_count': int, 'tags': ['PlutoFlyby'], + 'channel_id': '11348282', + '_old_archive_ids': ['twitter 623160978427936768'], }, 'params': {'format': '[protocol=https]'}, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', - 'md5': 'b6d9683dd3f48e340ded81c0e917ad46', + 'md5': 'fb08fbd69595cbd8818f0b2f2a94474d', 'info_dict': { 'id': 'dq4Oj5quskI', 'ext': 'mp4', @@ -389,12 +394,12 @@ class TwitterCardIE(InfoExtractor): 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', 'uploader': 'OMG! UBUNTU!', - 'uploader_id': 'omgubuntu', + 'uploader_id': '@omgubuntu', 'channel_url': 'https://www.youtube.com/channel/UCIiSwcm9xiFb3Y4wjzR41eQ', 'channel_id': 'UCIiSwcm9xiFb3Y4wjzR41eQ', 'channel_follower_count': int, 'chapters': 'count:8', - 'uploader_url': 'http://www.youtube.com/user/omgubuntu', + 'uploader_url': 'https://www.youtube.com/@omgubuntu', 'duration': 138, 'categories': ['Film & Animation'], 'age_limit': 0, @@ -406,6 +411,9 @@ class TwitterCardIE(InfoExtractor): 'tags': 'count:12', 'channel': 'OMG! UBUNTU!', 'playable_in_embed': True, + 'heatmap': 'count:100', + 'timestamp': 1318500227, + 'live_status': 'not_live', }, 'add_ie': ['Youtube'], }, @@ -547,13 +555,14 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 0, '_old_archive_ids': ['twitter 700207533655363584'], }, + 'skip': 'Tweet has been deleted', }, { 'url': 'https://twitter.com/captainamerica/status/719944021058060289', 'info_dict': { 'id': '717462543795523584', 'display_id': '719944021058060289', 'ext': 'mp4', - 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', + 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theat...', 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI', 'channel_id': '701615052', 'uploader_id': 'CaptainAmerica', @@ -590,7 +599,7 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '852077943283097602', 'ext': 'mp4', - 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', + 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعا...', 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. 
النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', 'channel_id': '2526757026', 'uploader': 'عالم الأخبار', @@ -614,7 +623,7 @@ class TwitterIE(TwitterBaseIE): 'id': '910030238373089285', 'display_id': '910031516746514432', 'ext': 'mp4', - 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', + 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terr...', 'thumbnail': r're:^https?://.*\.jpg', 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo', 'channel_id': '2319432498', @@ -706,7 +715,7 @@ class TwitterIE(TwitterBaseIE): 'id': '1349774757969989634', 'display_id': '1349794411333394432', 'ext': 'mp4', - 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'title': "Brooklyn Nets - WATCH: Sean Marks' full media session after our acquisition of 8-time...", 'thumbnail': r're:^https?://.*\.jpg', 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', 'channel_id': '18552281', @@ -732,7 +741,7 @@ class TwitterIE(TwitterBaseIE): 'id': '1577855447914409984', 'display_id': '1577855540407197696', 'ext': 'mp4', - 'title': 'md5:466a3a8b049b5f5a13164ce915484b51', + 'title': 'Oshtru - gm ✨️ now I can post image and video. nice update.', 'description': 'md5:b9c3699335447391d11753ab21c70a74', 'upload_date': '20221006', 'channel_id': '143077138', @@ -754,10 +763,10 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima Reload - Test', + 'title': 'Ultima - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', 'channel_id': '168922496', - 'uploader': 'Ultima Reload', + 'uploader': 'Ultima', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ -776,7 +785,7 @@ class TwitterIE(TwitterBaseIE): 'id': '1575559336759263233', 'display_id': '1575560063510810624', 'ext': 'mp4', - 'title': 'md5:eec26382babd0f7c18f041db8ae1c9c9', + 'title': 'Max Olson - Absolutely heartbreaking footage captured by our surge probe of catas...', 'thumbnail': r're:^https?://.*\.jpg', 'description': 'md5:95aea692fda36a12081b9629b02daa92', 'channel_id': '1094109584', @@ -900,18 +909,18 @@ class TwitterIE(TwitterBaseIE): 'playlist_mincount': 2, 'info_dict': { 'id': '1600649710662213632', - 'title': 'md5:be05989b0722e114103ed3851a0ffae2', + 'title': "Jocelyn Laidlaw - How Kirstie Alley's tragic death inspired me to share more about my c...", 'timestamp': 1670459604.0, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'comment_count': int, - 'uploader_id': 'CTVJLaidlaw', + 'uploader_id': 'JocelynVLaidlaw', 'channel_id': '80082014', 'repost_count': int, 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], 'upload_date': '20221208', 'age_limit': 0, 'uploader': 'Jocelyn Laidlaw', - 'uploader_url': 'https://twitter.com/CTVJLaidlaw', + 'uploader_url': 'https://twitter.com/JocelynVLaidlaw', 'like_count': int, }, }, { @@ -920,17 +929,17 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '1600649511827013632', 'ext': 'mp4', - 'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1', + 'title': "Jocelyn Laidlaw - How Kirstie Alley's tragic death inspired me to share more about my c... 
#1", 'thumbnail': r're:^https?://.+\.jpg', 'timestamp': 1670459604.0, 'channel_id': '80082014', - 'uploader_id': 'CTVJLaidlaw', + 'uploader_id': 'JocelynVLaidlaw', 'uploader': 'Jocelyn Laidlaw', 'repost_count': int, 'comment_count': int, 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'], 'duration': 102.226, - 'uploader_url': 'https://twitter.com/CTVJLaidlaw', + 'uploader_url': 'https://twitter.com/JocelynVLaidlaw', 'display_id': '1600649710662213632', 'like_count': int, 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', @@ -989,6 +998,7 @@ class TwitterIE(TwitterBaseIE): '_old_archive_ids': ['twitter 1599108751385972737'], }, 'params': {'noplaylist': True}, + 'skip': 'Tweet is limited', }, { 'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625', 'info_dict': { @@ -1000,10 +1010,10 @@ class TwitterIE(TwitterBaseIE): 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, - 'uploader': 'Mün', + 'uploader': 'Boy Called Mün', 'repost_count': int, 'upload_date': '20221206', - 'title': 'Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'title': 'Boy Called Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', 'comment_count': int, 'like_count': int, 'tags': [], @@ -1041,7 +1051,7 @@ class TwitterIE(TwitterBaseIE): 'id': '1694928337846538240', 'ext': 'mp4', 'display_id': '1695424220702888009', - 'title': 'md5:e8daa9527bc2b947121395494f786d9d', + 'title': 'Benny Johnson - Donald Trump driving through the urban, poor neighborhoods of Atlanta...', 'description': 'md5:004f2d37fd58737724ec75bc7e679938', 'channel_id': '15212187', 'uploader': 'Benny Johnson', @@ -1065,7 +1075,7 @@ class TwitterIE(TwitterBaseIE): 'id': '1694928337846538240', 'ext': 'mp4', 'display_id': '1695424220702888009', - 'title': 'md5:e8daa9527bc2b947121395494f786d9d', + 'title': 'Benny Johnson - Donald Trump driving through the urban, poor neighborhoods of Atlanta...', 'description': 'md5:004f2d37fd58737724ec75bc7e679938', 'channel_id': '15212187', 'uploader': 'Benny Johnson', @@ -1100,6 +1110,7 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], + 'skip': 'Broadcast no longer exists', }, { # Animated gif and quote tweet video 'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950', @@ -1128,7 +1139,7 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '1724883339285544960', 'ext': 'mp4', - 'title': 'md5:cc56716f9ed0b368de2ba54c478e493c', + 'title': 'Robert F. 
Kennedy Jr - A beautifully crafted short film by Mikki Willis about my independent...', 'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164', 'display_id': '1724884212803834154', 'channel_id': '337808606', @@ -1149,7 +1160,7 @@ class TwitterIE(TwitterBaseIE): }, { # x.com 'url': 'https://x.com/historyinmemes/status/1790637656616943991', - 'md5': 'daca3952ba0defe2cfafb1276d4c1ea5', + 'md5': '4549eda363fecfe37439c455923cba2c', 'info_dict': { 'id': '1790637589910654976', 'ext': 'mp4', @@ -1210,20 +1221,10 @@ class TwitterIE(TwitterBaseIE): }] _MEDIA_ID_RE = re.compile(r'_video/(\d+)/') - - @property - def _GRAPHQL_ENDPOINT(self): - if self.is_logged_in: - return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail' - return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' + _GRAPHQL_ENDPOINT = '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' def _graphql_to_legacy(self, data, twid): - result = traverse_obj(data, ( - 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', - lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', - 'tweet_results', 'result', ('tweet', None), {dict}, - ), default={}, get_all=False) if self.is_logged_in else traverse_obj( - data, ('tweetResult', 'result', {dict}), default={}) + result = traverse_obj(data, ('tweetResult', 'result', {dict})) or {} typename = result.get('__typename') if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None): @@ -1267,37 +1268,6 @@ def _graphql_to_legacy(self, data, twid): def _build_graphql_query(self, media_id): return { - 'variables': { - 'focalTweetId': media_id, - 'includePromotedContent': True, - 'with_rux_injections': False, - 'withBirdwatchNotes': True, - 'withCommunity': True, - 'withDownvotePerspective': False, - 'withQuickPromoteEligibilityTweetFields': True, - 'withReactionsMetadata': False, - 'withReactionsPerspective': False, - 'withSuperFollowsTweetFields': True, - 'withSuperFollowsUserFields': True, - 'withV2Timeline': True, - 'withVoice': True, - }, - 'features': { - 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': False, - 'interactive_text_enabled': True, - 'responsive_web_edit_tweet_api_enabled': True, - 'responsive_web_enhance_cards_enabled': True, - 'responsive_web_graphql_timeline_navigation_enabled': False, - 'responsive_web_text_conversations_enabled': False, - 'responsive_web_uc_gql_enabled': True, - 'standardized_nudges_misinfo': True, - 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, - 'tweetypie_unmention_optimization_enabled': True, - 'unified_cards_ad_metadata_container_dynamic_card_content_query_enabled': True, - 'verified_phone_label_enabled': False, - 'vibe_api_enabled': True, - }, - } if self.is_logged_in else { 'variables': { 'tweetId': media_id, 'withCommunity': False, @@ -1330,6 +1300,11 @@ def _build_graphql_query(self, media_id): }, } + def _generate_syndication_token(self, twid): + # ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '') + translation = str.maketrans(dict.fromkeys('0.')) + return js_number_to_string((int(twid) / 1e15) * math.pi, 36).translate(translation) + def _call_syndication_api(self, twid): self.report_warning( 'Not all metadata or media is available via syndication endpoint', twid, only_once=True) @@ -1337,8 +1312,7 @@ def _call_syndication_api(self, twid): 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', headers={'User-Agent': 'Googlebot'}, query={ 'id': twid, - # TODO: token = ((Number(twid) / 1e15) * 
Math.PI).toString(36).replace(/(0+|\.)/g, '') - 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)), + 'token': self._generate_syndication_token(twid), }) if not status: raise ExtractorError('Syndication endpoint returned empty JSON response') @@ -1385,7 +1359,7 @@ def _real_extract(self, url): title = description = traverse_obj( status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - title = re.sub(r'\s+(https?://[^ ]+)', '', title) + title = truncate_string(re.sub(r'\s+(https?://[^ ]+)', '', title), left=72) user = status.get('user') or {} uploader = user.get('name') if uploader: @@ -1702,21 +1676,22 @@ class TwitterSpacesIE(TwitterBaseIE): _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/spaces/(?P<id>[0-9a-zA-Z]{13})' _TESTS = [{ - 'url': 'https://twitter.com/i/spaces/1RDxlgyvNXzJL', + 'url': 'https://twitter.com/i/spaces/1OwxWwQOPlNxQ', 'info_dict': { - 'id': '1RDxlgyvNXzJL', + 'id': '1OwxWwQOPlNxQ', 'ext': 'm4a', - 'title': 'King Carlo e la mossa Kansas City per fare il Grande Centro', - 'description': 'Twitter Space participated by annarita digiorgio, Signor Ernesto, Raffaello Colosimo, Simone M. Sepe', - 'uploader': r're:Lucio Di Gaetano.*?', - 'uploader_id': 'luciodigaetano', + 'title': 'Everybody in: @mtbarra & @elonmusk discuss the future of EV charging', + 'description': 'Twitter Space participated by Elon Musk', 'live_status': 'was_live', - 'timestamp': 1659877956, - 'upload_date': '20220807', - 'release_timestamp': 1659904215, - 'release_date': '20220807', + 'release_date': '20230608', + 'release_timestamp': 1686256230, + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', + 'timestamp': 1686254250, + 'upload_date': '20230608', + 'uploader': 'Mary Barra', + 'uploader_id': 'mtbarra', }, - 'skip': 'No longer available', + 'params': {'skip_download': 'm3u8'}, }, { # post_live/TimedOut but downloadable 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', @@ -1728,9 +1703,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': 'Google Cloud', 'uploader_id': 'googlecloud', 'live_status': 'post_live', + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1681409554, 'upload_date': '20230413', - 'release_timestamp': 1681839000, + 'release_timestamp': 1681839082, 'release_date': '20230418', 'protocol': 'm3u8', # ffmpeg is forced 'container': 'm4a_dash', # audio-only format fixup is applied @@ -1747,6 +1723,9 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': '息根とめる', 'uploader_id': 'tomeru_ikinone', 'live_status': 'was_live', + 'release_date': '20230601', + 'release_timestamp': 1685617200, + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1685617198, 'upload_date': '20230601', 'protocol': 'm3u8', # ffmpeg is forced @@ -1764,9 +1743,10 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': 'Candace Owens', 'uploader_id': 'RealCandaceO', 'live_status': 'was_live', + 'thumbnail': r're:https?://pbs\.twimg\.com/profile_images/.+', 'timestamp': 1723931351, 'upload_date': '20240817', - 'release_timestamp': 1723932000, + 'release_timestamp': 1723932056, 'release_date': '20240817', 'protocol': 'm3u8_native', # not ffmpeg, detected as video space }, @@ -1846,18 +1826,21 @@ def _real_extract(self, url): return { 'id': space_id, - 'title': metadata.get('title'), 'description': f'Twitter Space participated by {participants}', - 'uploader': traverse_obj( - metadata, ('creator_results', 'result', 'legacy', 'name')), - 
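# Editor's note: worked example of the syndication token formula above.
# js_number_to_string mirrors JavaScript's Number.prototype.toString(radix),
# so `_generate_syndication_token` should reproduce the JS expression exactly.
# The tweet ID is borrowed from a test case above; the printed value is only
# illustrative.
import math

from yt_dlp.jsinterp import js_number_to_string

twid = 1577855540407197696
token = js_number_to_string((twid / 1e15) * math.pi, 36).translate(
    str.maketrans(dict.fromkeys('0.')))
print(token)  # a short base-36 string with every '0' and '.' stripped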
'uploader_id': traverse_obj( - metadata, ('creator_results', 'result', 'legacy', 'screen_name')), - 'live_status': live_status, - 'release_timestamp': try_call( - lambda: int_or_none(metadata['scheduled_start'], scale=1000)), - 'timestamp': int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, 'http_headers': headers, + 'live_status': live_status, + **traverse_obj(metadata, { + 'title': ('title', {str}), + # started_at is None when stream is_upcoming so fallback to scheduled_start for --wait-for-video + 'release_timestamp': (('started_at', 'scheduled_start'), {int_or_none(scale=1000)}, any), + 'timestamp': ('created_at', {int_or_none(scale=1000)}), + }), + **traverse_obj(metadata, ('creator_results', 'result', 'legacy', { + 'uploader': ('name', {str}), + 'uploader_id': ('screen_name', {str_or_none}), + 'thumbnail': ('profile_image_url_https', {lambda x: x.replace('_normal', '_400x400')}, {url_or_none}), + })), } diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index e8d6ae128..31393b02a 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -51,6 +51,8 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'(?:watch|front)\.njpwworld\.com', r'qub\.ca/vrai', r'(?:beta\.)?crunchyroll\.com', + r'viki\.com', + r'deezer\.com', ) _TESTS = [{ @@ -160,6 +162,12 @@ class KnownDRMIE(UnsupportedInfoExtractor): }, { 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, + }, { + 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.deezer.com/playlist/176747451', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py deleted file mode 100644 index 75f9cdf2f..000000000 --- a/yt_dlp/extractor/viki.py +++ /dev/null @@ -1,346 +0,0 @@ -import hashlib -import hmac -import json -import time - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, - try_get, -) - - -class VikiBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' - _API_URL_TEMPLATE = 'https://api.viki.io%s' - - _DEVICE_ID = '112395910d' - _APP = '100005a' - _APP_VERSION = '6.11.3' - _APP_SECRET = 'd96704b180208dbb2efa30fe44c48bd8690441af9f567ba8fd710a72badc85198f7472' - - _GEO_BYPASS = False - _NETRC_MACHINE = 'viki' - - _token = None - - _ERRORS = { - 'geo': 'Sorry, this content is not available in your region.', - 'upcoming': 'Sorry, this content is not yet available.', - 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', - } - - def _stream_headers(self, timestamp, sig): - return { - 'X-Viki-manufacturer': 'vivo', - 'X-Viki-device-model': 'vivo 1606', - 'X-Viki-device-os-ver': '6.0.1', - 'X-Viki-connection-type': 'WIFI', - 'X-Viki-carrier': '', - 'X-Viki-as-id': '100005a-1625321982-3932', - 'timestamp': str(timestamp), - 'signature': str(sig), - 'x-viki-app-ver': self._APP_VERSION, - } - - def _api_query(self, path, version=4, **kwargs): - path += '?' if '?' 
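# Editor's note: the `{int_or_none(scale=1000)}` steps in the TwitterSpacesIE
# metadata traversal above are another case of partial application;
# int_or_none divides by `scale`, turning the millisecond timestamps in the
# Spaces metadata into Unix seconds:
from yt_dlp.utils import int_or_none

to_seconds = int_or_none(scale=1000)
assert to_seconds(1686254250000) == 1686254250
assert to_seconds(None) is None  # missing fields stay None, as the traversal expects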
not in path else '&' - query = f'/v{version}/{path}app={self._APP}' - if self._token: - query += f'&token={self._token}' - return query + ''.join(f'&{name}={val}' for name, val in kwargs.items()) - - def _sign_query(self, path): - timestamp = int(time.time()) - query = self._api_query(path, version=5) - sig = hmac.new( - self._APP_SECRET.encode('ascii'), f'{query}&t={timestamp}'.encode('ascii'), hashlib.sha1).hexdigest() - return timestamp, sig, self._API_URL_TEMPLATE % query - - def _call_api( - self, path, video_id, note='Downloading JSON metadata', data=None, query=None, fatal=True): - if query is None: - timestamp, sig, url = self._sign_query(path) - else: - url = self._API_URL_TEMPLATE % self._api_query(path, version=4) - resp = self._download_json( - url, video_id, note, fatal=fatal, query=query, - data=json.dumps(data).encode() if data else None, - headers=({'x-viki-app-ver': self._APP_VERSION} if data - else self._stream_headers(timestamp, sig) if query is None - else None), expected_status=400) or {} - - self._raise_error(resp.get('error'), fatal) - return resp - - def _raise_error(self, error, fatal=True): - if error is None: - return - msg = f'{self.IE_NAME} said: {error}' - if fatal: - raise ExtractorError(msg, expected=True) - else: - self.report_warning(msg) - - def _check_errors(self, data): - for reason, status in (data.get('blocking') or {}).items(): - if status and reason in self._ERRORS: - message = self._ERRORS[reason] - if reason == 'geo': - self.raise_geo_restricted(msg=message) - elif reason == 'paywall': - if try_get(data, lambda x: x['paywallable']['tvod']): - self._raise_error('This video is for rent only or TVOD (Transactional Video On demand)') - self.raise_login_required(message) - self._raise_error(message) - - def _perform_login(self, username, password): - self._token = self._call_api( - 'sessions.json', None, 'Logging in', fatal=False, - data={'username': username, 'password': password}).get('token') - if not self._token: - self.report_warning('Login Failed: Unable to get session token') - - @staticmethod - def dict_selection(dict_obj, preferred_key): - if preferred_key in dict_obj: - return dict_obj[preferred_key] - return (list(filter(None, dict_obj.values())) or [None])[0] - - -class VikiIE(VikiBaseIE): - IE_NAME = 'viki' - _VALID_URL = rf'{VikiBaseIE._VALID_URL_BASE}(?:videos|player)/(?P<id>[0-9]+v)' - _TESTS = [{ - 'note': 'Free non-DRM video with storyboards in MPD', - 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', - 'info_dict': { - 'id': '1175236v', - 'ext': 'mp4', - 'title': 'Choosing Spouse by Lottery - Episode 1', - 'timestamp': 1606463239, - 'age_limit': 13, - 'uploader': 'FCC', - 'upload_date': '20201127', - }, - }, { - 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', - 'info_dict': { - 'id': '1023585v', - 'ext': 'mp4', - 'title': 'Heirs - Episode 14', - 'uploader': 'SBS Contents Hub', - 'timestamp': 1385047627, - 'upload_date': '20131121', - 'age_limit': 13, - 'duration': 3570, - 'episode_number': 14, - }, - 'skip': 'Blocked in the US', - }, { - # clip - 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', - 'info_dict': { - 'id': '1067139v', - 'ext': 'mp4', - 'title': "'The Avengers: Age of Ultron' Press Conference", - 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', - 'duration': 352, - 'timestamp': 1430380829, - 'upload_date': '20150430', - 'uploader': 'Arirang TV', - 'like_count': int, - 'age_limit': 0, 
- }, - 'skip': 'Sorry. There was an error loading this video', - }, { - 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', - 'info_dict': { - 'id': '1048879v', - 'ext': 'mp4', - 'title': 'Ankhon Dekhi', - 'duration': 6512, - 'timestamp': 1408532356, - 'upload_date': '20140820', - 'uploader': 'Spuul', - 'like_count': int, - 'age_limit': 13, - }, - 'skip': 'Blocked in the US', - }, { - # episode - 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '0a53dc252e6e690feccd756861495a8c', - 'info_dict': { - 'id': '44699v', - 'ext': 'mp4', - 'title': 'Boys Over Flowers - Episode 1', - 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4172, - 'timestamp': 1270496524, - 'upload_date': '20100405', - 'uploader': 'group8', - 'like_count': int, - 'age_limit': 13, - 'episode_number': 1, - }, - }, { - # youtube external - 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', - 'md5': '63f8600c1da6f01b7640eee7eca4f1da', - 'info_dict': { - 'id': '50562v', - 'ext': 'webm', - 'title': 'Poor Nastya [COMPLETE] - Episode 1', - 'description': '', - 'duration': 606, - 'timestamp': 1274949505, - 'upload_date': '20101213', - 'uploader': 'ad14065n', - 'uploader_id': 'ad14065n', - 'like_count': int, - 'age_limit': 13, - }, - 'skip': 'Page not found!', - }, { - 'url': 'http://www.viki.com/player/44699v', - 'only_matching': True, - }, { - # non-English description - 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '41faaba0de90483fb4848952af7c7d0d', - 'info_dict': { - 'id': '158036v', - 'ext': 'mp4', - 'uploader': 'I Planet Entertainment', - 'upload_date': '20111122', - 'timestamp': 1321985454, - 'description': 'md5:44b1e46619df3a072294645c770cef36', - 'title': 'Love In Magic', - 'age_limit': 13, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video = self._call_api(f'videos/{video_id}.json', video_id, 'Downloading video JSON', query={}) - self._check_errors(video) - - title = try_get(video, lambda x: x['titles']['en'], str) - episode_number = int_or_none(video.get('number')) - if not title: - title = f'Episode {episode_number}' if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} - container_title = self.dict_selection(container_titles, 'en') - title = f'{container_title} - {title}' - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail['url'], - } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')] - - resp = self._call_api( - f'playback_streams/{video_id}.json?drms=dt3&device_id={self._DEVICE_ID}', - video_id, 'Downloading video streams JSON')['main'][0] - - stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) - subtitles = dict((lang, [{ - 'ext': ext, - 'url': self._API_URL_TEMPLATE % self._api_query( - f'videos/{video_id}/auth_subtitles/{lang}.{ext}', stream_id=stream_id), - } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {})) - - mpd_url = resp['url'] - # 720p is hidden in another MPD which can be found in the current manifest content - mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') - mpd_url = self._search_regex( - r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url) - if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url: - # Modify the URL to get 1080p - mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') - formats = 
self._extract_mpd_formats(mpd_url, video_id) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': self.dict_selection(video.get('descriptions', {}), 'en'), - 'duration': int_or_none(video.get('duration')), - 'timestamp': parse_iso8601(video.get('created_at')), - 'uploader': video.get('author'), - 'uploader_url': video.get('author_url'), - 'like_count': int_or_none(try_get(video, lambda x: x['likes']['count'])), - 'age_limit': parse_age_limit(video.get('rating')), - 'thumbnails': thumbnails, - 'subtitles': subtitles, - 'episode_number': episode_number, - } - - -class VikiChannelIE(VikiBaseIE): - IE_NAME = 'viki:channel' - _VALID_URL = rf'{VikiBaseIE._VALID_URL_BASE}(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' - _TESTS = [{ - 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', - 'info_dict': { - 'id': '50c', - 'title': 'Boys Over Flowers', - 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', - }, - 'playlist_mincount': 51, - }, { - 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', - 'info_dict': { - 'id': '1354c', - 'title': 'Poor Nastya [COMPLETE]', - 'description': 'md5:05bf5471385aa8b21c18ad450e350525', - }, - 'playlist_count': 127, - 'skip': 'Page not found', - }, { - 'url': 'http://www.viki.com/news/24569c-showbiz-korea', - 'only_matching': True, - }, { - 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', - 'only_matching': True, - }, { - 'url': 'http://www.viki.com/artists/2141c-shinee', - 'only_matching': True, - }] - - _video_types = ('episodes', 'movies', 'clips', 'trailers') - - def _entries(self, channel_id): - params = { - 'app': self._APP, 'token': self._token, 'only_ids': 'true', - 'direction': 'asc', 'sort': 'number', 'per_page': 30, - } - video_types = self._configuration_arg('video_types') or self._video_types - for video_type in video_types: - if video_type not in self._video_types: - self.report_warning(f'Unknown video_type: {video_type}') - page_num = 0 - while True: - page_num += 1 - params['page'] = page_num - res = self._call_api( - f'containers/{channel_id}/{video_type}.json', channel_id, query=params, fatal=False, - note=f'Downloading {video_type.title()} JSON page {page_num}') - - for video_id in res.get('response') or []: - yield self.url_result(f'https://www.viki.com/videos/{video_id}', VikiIE.ie_key(), video_id) - if not res.get('more'): - break - - def _real_extract(self, url): - channel_id = self._match_id(url) - channel = self._call_api(f'containers/{channel_id}.json', channel_id, 'Downloading channel JSON') - self._check_errors(channel) - return self.playlist_result( - self._entries(channel_id), channel_id, - self.dict_selection(channel['titles'], 'en'), - self.dict_selection(channel['descriptions'], 'en')) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 8a0aaaa46..fb9af7acf 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -39,6 +39,14 @@ class VimeoBaseInfoExtractor(InfoExtractor): _NETRC_MACHINE = 'vimeo' _LOGIN_REQUIRED = False _LOGIN_URL = 'https://vimeo.com/log_in' + _IOS_CLIENT_AUTH = 'MTMxNzViY2Y0NDE0YTQ5YzhjZTc0YmU0NjVjNDQxYzNkYWVjOWRlOTpHKzRvMmgzVUh4UkxjdU5FRW80cDNDbDhDWGR5dVJLNUJZZ055dHBHTTB4V1VzaG41bEx1a2hiN0NWYWNUcldSSW53dzRUdFRYZlJEZmFoTTArOTBUZkJHS3R4V2llYU04Qnl1bERSWWxUdXRidjNqR2J4SHFpVmtFSUcyRktuQw==' + _IOS_CLIENT_HEADERS = { + 'Accept': 'application/vnd.vimeo.*+json; version=3.4.10', + 'Accept-Language': 'en', + 'User-Agent': 'Vimeo/11.10.0 (com.vimeo; build:250424.164813.0; iOS 18.4.1) Alamofire/5.9.0 
VimeoNetworking/5.0.0', + } + _IOS_OAUTH_CACHE_KEY = 'oauth-token-ios' + _ios_oauth_token = None @staticmethod def _smuggle_referrer(url, referrer_url): @@ -88,13 +96,16 @@ def _get_video_password(self): expected=True) return password - def _verify_video_password(self, video_id, password, token): + def _verify_video_password(self, video_id): + video_password = self._get_video_password() + token = self._download_json( + 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info')['xsrft'] url = f'https://vimeo.com/{video_id}' try: - return self._download_webpage( + self._request_webpage( f'{url}/password', video_id, 'Submitting video password', data=json.dumps({ - 'password': password, + 'password': video_password, 'token': token, }, separators=(',', ':')).encode(), headers={ 'Accept': '*/*', @@ -239,20 +250,39 @@ def _parse_config(self, config, video_id): '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } - def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None, **kwargs): + def _fetch_oauth_token(self): + if not self._ios_oauth_token: + self._ios_oauth_token = self.cache.load(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY) + + if not self._ios_oauth_token: + self._ios_oauth_token = self._download_json( + 'https://api.vimeo.com/oauth/authorize/client', None, + 'Fetching OAuth token', 'Failed to fetch OAuth token', + headers={ + 'Authorization': f'Basic {self._IOS_CLIENT_AUTH}', + **self._IOS_CLIENT_HEADERS, + }, data=urlencode_postdata({ + 'grant_type': 'client_credentials', + 'scope': 'private public create edit delete interact upload purchased stats', + }, quote_via=urllib.parse.quote))['access_token'] + self.cache.store(self._NETRC_MACHINE, self._IOS_OAUTH_CACHE_KEY, self._ios_oauth_token) + + return self._ios_oauth_token + + def _call_videos_api(self, video_id, unlisted_hash=None, **kwargs): return self._download_json( join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), video_id, 'Downloading API JSON', headers={ - 'Authorization': f'jwt {jwt_token}', - 'Accept': 'application/json', + 'Authorization': f'Bearer {self._fetch_oauth_token()}', + **self._IOS_CLIENT_HEADERS, }, query={ 'fields': ','.join(( - 'config_url', 'created_time', 'description', 'download', 'license', - 'metadata.connections.comments.total', 'metadata.connections.likes.total', - 'release_time', 'stats.plays')), + 'config_url', 'embed_player_config_url', 'player_embed_url', 'download', 'play', + 'files', 'description', 'license', 'release_time', 'created_time', 'stats.plays', + 'metadata.connections.comments.total', 'metadata.connections.likes.total')), }, **kwargs) - def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, api_data=None): + def _extract_original_format(self, url, video_id, unlisted_hash=None, api_data=None): # Original/source formats are only available when logged in if not self._get_cookies('https://vimeo.com/').get('vimeo'): return @@ -283,12 +313,8 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None, jwt=None, 'quality': 1, } - jwt = jwt or traverse_obj(self._download_json( - 'https://vimeo.com/_rv/viewer', video_id, 'Downloading jwt token', fatal=False), ('jwt', {str})) - if not jwt: - return original_response = api_data or self._call_videos_api( - video_id, jwt, unlisted_hash, fatal=False, expected_status=(403, 404)) + video_id, unlisted_hash, fatal=False, expected_status=(403, 404)) for download_data in traverse_obj(original_response, ('download', ..., {dict})): download_url = 
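# Editor's note: `_fetch_oauth_token` above follows yt-dlp's usual token-cache
# pattern: try the persisted value, fetch once on a miss, then store it. A
# minimal sketch of that shape, with a plain dict standing in for self.cache
# and a stub in place of the real /oauth/authorize/client request:
cache = {}

def fetch_token(fetch=lambda: 'fresh-token'):
    token = cache.get(('vimeo', 'oauth-token-ios'))
    if not token:
        token = fetch()  # stands in for the OAuth request shown above
        cache[('vimeo', 'oauth-token-ios')] = token
    return token

assert fetch_token() == 'fresh-token'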
download_data.get('link') if not download_url or download_data.get('quality') != 'source': @@ -410,6 +436,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 10, 'comment_count': int, 'like_count': int, + 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d', }, 'params': { @@ -500,15 +527,16 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'The DMCI', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', - 'timestamp': 1324343742, + 'timestamp': 1324361742, 'upload_date': '20111220', - 'description': 'md5:ae23671e82d05415868f7ad1aec21147', + 'description': 'md5:f37b4ad0f3ded6fa16f38ecde16c3c44', 'duration': 60, 'comment_count': int, 'view_count': int, 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d', 'like_count': int, - 'tags': 'count:11', + 'release_timestamp': 1324361742, + 'release_date': '20111220', }, # 'params': {'format': 'Original'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -521,15 +549,18 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '393756517', # 'ext': 'mov', 'ext': 'mp4', - 'timestamp': 1582642091, + 'timestamp': 1582660091, 'uploader_id': 'frameworkla', 'title': 'Straight To Hell - Sabrina: Netflix', 'uploader': 'Framework Studio', - 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', 'upload_date': '20200225', 'duration': 176, 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d', 'uploader_url': 'https://vimeo.com/frameworkla', + 'comment_count': int, + 'like_count': int, + 'release_timestamp': 1582660091, + 'release_date': '20200225', }, # 'params': {'format': 'source'}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -630,7 +661,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': str, # FIXME: Dynamic SEO spam description 'upload_date': '20150209', 'timestamp': 1423518307, - 'thumbnail': 'https://i.vimeocdn.com/video/default', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/default', 'duration': 10, 'like_count': int, 'uploader_url': 'https://vimeo.com/user20132939', @@ -667,6 +698,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': int, 'uploader_url': 'https://vimeo.com/aliniamedia', 'release_date': '20160329', + 'view_count': int, }, 'params': {'skip_download': True}, 'expected_warnings': ['Failed to parse XML: not well-formed'], @@ -678,18 +710,19 @@ class VimeoIE(VimeoBaseInfoExtractor): # 'ext': 'm4v', 'ext': 'mp4', 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', - 'description': 'md5:5967e090768a831488f6e74b7821b3c1', + 'description': 'md5:9441e6829ae94f380cc6417d982f63ac', 'uploader_id': 'fireworkchampions', 'uploader': 'Firework Champions', 'upload_date': '20150910', - 'timestamp': 1441901895, + 'timestamp': 1441916295, 'thumbnail': 'https://i.vimeocdn.com/video/534715882-6ff8e4660cbf2fea68282876d8d44f318825dfe572cc4016e73b3266eac8ae3a-d', 'uploader_url': 'https://vimeo.com/fireworkchampions', - 'tags': 'count:6', 'duration': 229, 'view_count': int, 'like_count': int, 'comment_count': int, + 'release_timestamp': 1441916295, + 'release_date': '20150910', }, 'params': { 'skip_download': True, @@ -820,7 +853,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Raja Virdi', 'uploader_id': 'rajavirdi', 'uploader_url': 'https://vimeo.com/rajavirdi', - 'duration': 309, + 'duration': 300, 'thumbnail': 
r're:https://i\.vimeocdn\.com/video/1716727772-[\da-f]+-d', }, # 'params': {'format': 'source'}, @@ -860,12 +893,9 @@ def _verify_player_video_password(self, url, video_id, headers): return checked def _extract_from_api(self, video_id, unlisted_hash=None): - viewer = self._download_json( - 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') - for retry in (False, True): try: - video = self._call_videos_api(video_id, viewer['jwt'], unlisted_hash) + video = self._call_videos_api(video_id, unlisted_hash) break except ExtractorError as e: if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 @@ -873,15 +903,14 @@ def _extract_from_api(self, video_id, unlisted_hash=None): self._webpage_read_content(e.cause.response, e.cause.response.url, video_id, fatal=False), ({json.loads}, 'invalid_parameters', ..., 'field'), )): - self._verify_video_password( - video_id, self._get_video_password(), viewer['xsrft']) + self._verify_video_password(video_id) continue raise info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) source_format = self._extract_original_format( - f'https://vimeo.com/{video_id}', video_id, unlisted_hash, jwt=viewer['jwt'], api_data=video) + f'https://vimeo.com/{video_id}', video_id, unlisted_hash, api_data=video) if source_format: info['formats'].append(source_format) @@ -1122,7 +1151,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998', 'upload_date': '20140906', 'timestamp': 1410032453, - 'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'comment_count': int, 'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'duration': 53, @@ -1132,7 +1161,7 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', @@ -1149,13 +1178,14 @@ class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE 'duration': 121, 'comment_count': int, 'view_count': int, - 'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'like_count': int, + 'tags': 'count:5', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -1237,7 +1267,7 @@ class VimeoUserIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', 'info_dict': { - 'title': 'Nki', + 'title': 'AKAMA', 'id': 'nkistudio', }, 'playlist_mincount': 66, @@ -1370,10 +1400,10 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'uploader_id': 'user170863801', 'uploader_url': 'https://vimeo.com/user170863801', 'duration': 30, - 'thumbnail': 'https://i.vimeocdn.com/video/1912612821-09a43bd2e75c203d503aed89de7534f28fc4474a48f59c51999716931a246af5-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', }, 'params': {'skip_download': 'm3u8'}, - 
'expected_warnings': ['Failed to parse XML'], + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -1423,12 +1453,8 @@ def _real_extract(self, url): user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' data = self._download_json(data_url, video_id) - viewer = {} if data.get('isLocked') is True: - video_password = self._get_video_password() - viewer = self._download_json( - 'https://vimeo.com/_rv/viewer', video_id) - self._verify_video_password(video_id, video_password, viewer['xsrft']) + self._verify_video_password(video_id) data = self._download_json(data_url, video_id) clip_data = data['clipData'] config_url = clip_data['configUrl'] @@ -1436,7 +1462,7 @@ def _real_extract(self, url): info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', - video_id, unlisted_hash=clip_data.get('unlistedHash'), jwt=viewer.get('jwt')) + video_id, unlisted_hash=clip_data.get('unlistedHash')) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) @@ -1528,20 +1554,22 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'description': 'md5:8cf69a1a435f2d763f4adf601e9c3125', 'duration': 1595, 'upload_date': '20130610', - 'timestamp': 1370893156, + 'timestamp': 1370907556, 'license': 'by', - 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'view_count': int, 'comment_count': int, 'like_count': int, - 'tags': 'count:1', + 'release_timestamp': 1370907556, + 'release_date': '20130610', }, 'params': { 'format': 'best[protocol=https]', }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }, { # password-protected VimeoPro page with Vimeo player embed 'url': 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion', @@ -1549,7 +1577,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'id': '764543723', 'ext': 'mp4', 'title': 'Mechanische Systeme in Perfektion: Realität erfassen, Innovation treiben', - 'thumbnail': 'https://i.vimeocdn.com/video/1543784598-a1a750494a485e601110136b9fe11e28c2131942452b3a5d30391cb3800ca8fd-d_1280', + 'thumbnail': r're:https://i\.vimeocdn\.com/video/\d+-[\da-f]+-d', 'description': 'md5:2a9d195cd1b0f6f79827107dc88c2420', 'uploader': 'CADFEM', 'uploader_id': 'cadfem', @@ -1561,6 +1589,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): 'videopassword': 'Conference2022', 'skip_download': True, }, + 'expected_warnings': ['Failed to parse XML: not well-formed'], }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index 4b36e41ff..c269802b3 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -116,6 +116,7 @@ class VKIE(VKBaseIE): 'id': '-77521_162222515', 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', + 'description': 'Видео из официальной группы Noize MC\nhttp://vk.com/noizemc', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'uploader_id': 
'39545378', 'duration': 195, @@ -165,6 +166,7 @@ class VKIE(VKBaseIE): 'id': '-93049196_456239755', 'ext': 'mp4', 'title': '8 серия (озвучка)', + 'description': 'Видео из официальной группы Noize MC\nhttp://vk.com/noizemc', 'duration': 8383, 'comment_count': int, 'uploader': 'Dizi2021', @@ -240,6 +242,7 @@ class VKIE(VKBaseIE): 'upload_date': '20221005', 'uploader': 'Шальная Императрица', 'uploader_id': '-74006511', + 'description': 'md5:f9315f7786fa0e84e75e4f824a48b056', }, }, { @@ -278,6 +281,43 @@ class VKIE(VKBaseIE): }, 'skip': 'No formats found', }, + { + 'note': 'video has chapters', + 'url': 'https://vkvideo.ru/video-18403220_456239696', + 'info_dict': { + 'id': '-18403220_456239696', + 'ext': 'mp4', + 'title': 'Трамп отменяет гранты // DeepSeek - Революция в ИИ // Илон Маск читер', + 'description': 'md5:b112ea9de53683b6d03d29076f62eec2', + 'uploader': 'Руслан Усачев', + 'uploader_id': '-18403220', + 'comment_count': int, + 'like_count': int, + 'duration': 1983, + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:21', + 'timestamp': 1738252883, + 'upload_date': '20250130', + }, + }, + { + 'url': 'https://vkvideo.ru/video-50883936_456244102', + 'info_dict': { + 'id': '-50883936_456244102', + 'ext': 'mp4', + 'title': 'Добивание Украины // Техник в коме // МОЯ ЗЛОСТЬ №140', + 'description': 'md5:a9bc46181e9ebd0fdd82cef6c0191140', + 'uploader': 'Стас Ай, Как Просто!', + 'uploader_id': '-50883936', + 'comment_count': int, + 'like_count': int, + 'duration': 4651, + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:59', + 'timestamp': 1743333869, + 'upload_date': '20250330', + }, + }, { # live stream, hls and rtmp links, most likely already finished live # stream by the time you are reading this comment @@ -449,7 +489,6 @@ def _real_extract(self, url): return self.url_result(opts_url) data = player['params'][0] - title = unescapeHTML(data['md_title']) # 2 = live # 3 = post live (finished live) @@ -507,17 +546,29 @@ def _real_extract(self, url): return { 'id': video_id, 'formats': formats, - 'title': title, - 'thumbnail': data.get('jpg'), - 'uploader': data.get('md_author'), - 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')), - 'duration': int_or_none(data.get('duration') or mv_data.get('duration')), + 'subtitles': subtitles, + **traverse_obj(mv_data, { + 'title': ('title', {unescapeHTML}), + 'description': ('desc', {clean_html}, filter), + 'duration': ('duration', {int_or_none}), + 'like_count': ('likes', {int_or_none}), + 'comment_count': ('commcount', {int_or_none}), + }), + **traverse_obj(data, { + 'title': ('md_title', {unescapeHTML}), + 'description': ('description', {clean_html}, filter), + 'thumbnail': ('jpg', {url_or_none}), + 'uploader': ('md_author', {unescapeHTML}), + 'uploader_id': (('author_id', 'authorId'), {str_or_none}, any), + 'duration': ('duration', {int_or_none}), + 'chapters': ('time_codes', lambda _, v: isinstance(v['time'], int), { + 'title': ('text', {unescapeHTML}), + 'start_time': 'time', + }), + }), 'timestamp': timestamp, 'view_count': view_count, - 'like_count': int_or_none(mv_data.get('likes')), - 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, - 'subtitles': subtitles, '_format_sort_fields': ('res', 'source'), } diff --git a/yt_dlp/extractor/vrsquare.py b/yt_dlp/extractor/vrsquare.py new file mode 100644 index 000000000..9e8740b42 --- /dev/null +++ b/yt_dlp/extractor/vrsquare.py @@ -0,0 +1,185 @@ +import itertools + +from .common import InfoExtractor +from ..networking.exceptions import 
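# Editor's note: rough illustration of the chapters mapping added to VKIE
# above. Each time_codes entry whose `time` is an integer becomes a chapter;
# the sample payload is invented for the demo, not a real VK response.
from yt_dlp.utils import unescapeHTML
from yt_dlp.utils.traversal import traverse_obj

data = {'time_codes': [
    {'time': 0, 'text': 'Intro'},
    {'time': None, 'text': 'dropped'},  # non-int `time` fails the filter
    {'time': 95, 'text': 'Q &amp; A'},
]}
chapters = traverse_obj(data, ('time_codes', lambda _, v: isinstance(v['time'], int), {
    'title': ('text', {unescapeHTML}),
    'start_time': 'time',
}))
assert chapters == [
    {'title': 'Intro', 'start_time': 0},
    {'title': 'Q & A', 'start_time': 95},
]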
diff --git a/yt_dlp/extractor/vrsquare.py b/yt_dlp/extractor/vrsquare.py
new file mode 100644
index 000000000..9e8740b42
--- /dev/null
+++ b/yt_dlp/extractor/vrsquare.py
@@ -0,0 +1,185 @@
+import itertools
+
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    extract_attributes,
+    parse_duration,
+    parse_qs,
+)
+from ..utils.traversal import (
+    find_element,
+    find_elements,
+    traverse_obj,
+)
+
+
+class VrSquareIE(InfoExtractor):
+    IE_NAME = 'vrsquare'
+    IE_DESC = 'VR SQUARE'
+
+    _BASE_URL = 'https://livr.jp'
+    _VALID_URL = r'https?://livr\.jp/contents/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://livr.jp/contents/P470896661',
+        'info_dict': {
+            'id': 'P470896661',
+            'ext': 'mp4',
+            'title': 'そこ曲がったら、櫻坂? 7年間お疲れ様!菅井友香の卒業を祝う会!前半 2022年11月6日放送分',
+            'description': 'md5:523726dc835aa8014dfe1e2b38d36cd1',
+            'duration': 1515.0,
+            'tags': 'count:2',
+            'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg',
+        },
+    }, {
+        'url': 'https://livr.jp/contents/P589523973',
+        'info_dict': {
+            'id': 'P589523973',
+            'ext': 'mp4',
+            'title': '薄闇に仰ぐ しだれ桜の妖艶',
+            'description': 'md5:a042f517b2cbb4ed6746707afec4d306',
+            'duration': 1084.0,
+            'tags': list,
+            'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg',
+        },
+        'skip': 'Paid video',
+    }, {
+        'url': 'https://livr.jp/contents/P316939908',
+        'info_dict': {
+            'id': 'P316939908',
+            'ext': 'mp4',
+            'title': '2024年5月16日(木) 「今日は誰に恋をする?」公演 小栗有以 生誕祭',
+            'description': 'md5:2110bdcf947f28bd7d06ec420e51b619',
+            'duration': 8559.0,
+            'tags': list,
+            'thumbnail': r're:https?://media\.livr\.jp/vod/img/.+\.jpg',
+        },
+        'skip': 'Premium channel subscribers only',
+    }, {
+        # Accessible only in the VR SQUARE app
+        'url': 'https://livr.jp/contents/P126481458',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        status = self._download_json(
+            f'{self._BASE_URL}/webApi/contentsStatus/{video_id}',
+            video_id, 'Checking contents status', fatal=False)
+        if traverse_obj(status, 'result_code') == '40407':
+            self.raise_login_required('Unable to access this video')
+
+        try:
+            web_api = self._download_json(
+                f'{self._BASE_URL}/webApi/play/url/{video_id}', video_id)
+        except ExtractorError as e:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 500:
+                raise ExtractorError('VR SQUARE app-only videos are not supported', expected=True)
+            raise
+
+        return {
+            'id': video_id,
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
+            'description': self._html_search_meta('description', webpage),
+            'formats': self._extract_m3u8_formats(traverse_obj(web_api, (
+                'urls', ..., 'url', any)), video_id, 'mp4', fatal=False),
+            'thumbnail': self._html_search_meta('og:image', webpage),
+            **traverse_obj(webpage, {
+                'duration': ({find_element(cls='layout-product-data-time')}, {parse_duration}),
+                'tags': ({find_elements(cls='search-tag')}, ..., {clean_html}),
+            }),
+        }
+
+
+class VrSquarePlaylistBaseIE(InfoExtractor):
+    _BASE_URL = 'https://livr.jp'
+
+    def _fetch_vids(self, source, keys=()):
+        for url_path in traverse_obj(source, (
+                *keys, {find_elements(cls='video', html=True)}, ...,
+                {extract_attributes}, 'data-url', {str}, filter),
+        ):
+            yield self.url_result(
+                f'{self._BASE_URL}/contents/{url_path.removeprefix("/contents/")}', VrSquareIE)
+
+    def _entries(self, path, display_id, query=None):
+        for page in itertools.count(1):
+            ajax = self._download_json(
+                f'{self._BASE_URL}{path}', display_id,
+                f'Downloading playlist JSON page {page}',
+                query={'p': page, **(query or {})})
+            yield from self._fetch_vids(ajax, ('contents_render_list', ...))
+            if not traverse_obj(ajax, (('has_next', 'hasNext'), {bool}, any)):
+                break
+
+
+class VrSquareChannelIE(VrSquarePlaylistBaseIE):
+    IE_NAME = 'vrsquare:channel'
+
+    _VALID_URL = r'https?://livr\.jp/channel/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://livr.jp/channel/H372648599',
+        'info_dict': {
+            'id': 'H372648599',
+            'title': 'AKB48+チャンネル',
+        },
+        'playlist_mincount': 502,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        return self.playlist_result(
+            self._entries(f'/ajax/channel/{playlist_id}', playlist_id),
+            playlist_id, self._html_search_meta('og:title', webpage))
+
+
+class VrSquareSearchIE(VrSquarePlaylistBaseIE):
+    IE_NAME = 'vrsquare:search'
+
+    _VALID_URL = r'https?://livr\.jp/web-search/?\?(?:[^#]+&)?w=[^#]+'
+    _TESTS = [{
+        'url': 'https://livr.jp/web-search?w=%23%E5%B0%8F%E6%A0%97%E6%9C%89%E4%BB%A5',
+        'info_dict': {
+            'id': '#小栗有以',
+        },
+        'playlist_mincount': 60,
+    }]
+
+    def _real_extract(self, url):
+        search_query = parse_qs(url)['w'][0]
+
+        return self.playlist_result(
+            self._entries('/ajax/web-search', search_query, {'w': search_query}), search_query)
+
+
+class VrSquareSectionIE(VrSquarePlaylistBaseIE):
+    IE_NAME = 'vrsquare:section'
+
+    _VALID_URL = r'https?://livr\.jp/(?:category|headline)/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://livr.jp/category/C133936275',
+        'info_dict': {
+            'id': 'C133936275',
+            'title': 'そこ曲がったら、櫻坂?VR',
+        },
+        'playlist_mincount': 308,
+    }, {
+        'url': 'https://livr.jp/headline/A296449604',
+        'info_dict': {
+            'id': 'A296449604',
+            'title': 'AKB48 アフターVR',
+        },
+        'playlist_mincount': 22,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        return self.playlist_result(
+            self._fetch_vids(webpage), playlist_id, self._html_search_meta('og:title', webpage))
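As an aside, the `_entries` paginator in `VrSquarePlaylistBaseIE` above follows a pattern common to yt-dlp playlist extractors. A minimal standalone sketch of the same loop, where `fetch_page` is a hypothetical stand-in for the AJAX call:

    import itertools

    def paginate(fetch_page):
        # fetch_page(page) is assumed to return one JSON page,
        # e.g. {'items': [...], 'has_next': True}
        for page in itertools.count(1):
            data = fetch_page(page)
            yield from data.get('items', [])
            if not data.get('has_next'):
                break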
diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py
index 9345ca962..6e5514eef 100644
--- a/yt_dlp/extractor/vrt.py
+++ b/yt_dlp/extractor/vrt.py
@@ -2,31 +2,33 @@
 import time
 import urllib.parse
 
-from .gigya import GigyaBaseIE
+from .common import InfoExtractor
 from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     clean_html,
     extract_attributes,
+    filter_dict,
     float_or_none,
     get_element_by_class,
     get_element_html_by_class,
     int_or_none,
-    join_nonempty,
+    jwt_decode_hs256,
     jwt_encode_hs256,
     make_archive_id,
     merge_dicts,
     parse_age_limit,
+    parse_duration,
     parse_iso8601,
     str_or_none,
     strip_or_none,
     traverse_obj,
+    try_call,
     url_or_none,
-    urlencode_postdata,
 )
 
 
-class VRTBaseIE(GigyaBaseIE):
+class VRTBaseIE(InfoExtractor):
     _GEO_BYPASS = False
     _PLAYER_INFO = {
         'platform': 'desktop',
@@ -37,11 +39,11 @@ class VRTBaseIE(GigyaBaseIE):
         'device': 'undefined (undefined)',
         'os': {
             'name': 'Windows',
-            'version': 'x86_64',
+            'version': '10',
         },
         'player': {
             'name': 'VRT web player',
-            'version': '2.7.4-prod-2023-04-19T06:05:45',
+            'version': '5.1.1-prod-2025-02-14T08:44:16"',
         },
     }
     # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js
@@ -90,20 +92,21 @@ def _extract_formats_and_subtitles(self, data, video_id):
 
     def _call_api(self, video_id, client='null', id_token=None, version='v2'):
         player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO}
         player_token = self._download_json(
-            'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens',
-            video_id, 'Downloading player token', headers={
+            f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/tokens',
+            video_id, 'Downloading player token', 'Failed to download player token', headers={
                 **self.geo_verification_headers(),
                 'Content-Type': 'application/json',
             }, data=json.dumps({
-                'identityToken': id_token or {},
+                'identityToken': id_token or '',
                 'playerInfo': jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={
                     'kid': self._JWT_KEY_ID,
                 }).decode(),
             }, separators=(',', ':')).encode())['vrtPlayerToken']
 
         return self._download_json(
-            f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}',
-            video_id, 'Downloading API JSON', query={
+            # The URL below redirects to https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}
+            f'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/{version}/videos/{video_id}',
+            video_id, 'Downloading API JSON', 'Failed to download API JSON', query={
                 'vrtPlayerToken': player_token,
                 'client': client,
             }, expected_status=400)
@@ -177,215 +180,286 @@ def _real_extract(self, url):
 
 
 class VrtNUIE(VRTBaseIE):
-    IE_DESC = 'VRT MAX'
-    _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
+    IE_NAME = 'vrtmax'
+    IE_DESC = 'VRT MAX (formerly VRT NU)'
+    _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?:vrtnu|vrtmax)/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
     _TESTS = [{
-        # CONTENT_IS_AGE_RESTRICTED
-        'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/',
+        'url': 'https://www.vrt.be/vrtmax/a-z/ket---doc/trailer/ket---doc-trailer-s6/',
         'info_dict': {
-            'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f',
+            'id': 'pbs-pub-c8a78645-5d3e-468a-89ec-6f3ed5534bd5$vid-242ddfe9-18f5-4e16-ab45-09b122a19251',
             'ext': 'mp4',
-            'title': 'Tom Waes',
-            'description': 'Satirisch actualiteitenmagazine met Ella Leyers. Tom Waes is te gast.',
-            'timestamp': 1673905125,
-            'release_timestamp': 1673905125,
-            'series': 'De ideale wereld',
-            'season_id': '1672830988794',
-            'episode': 'Aflevering 1',
-            'episode_number': 1,
-            'episode_id': '1672830988861',
-            'display_id': 'de-ideale-wereld-d20230116',
-            'channel': 'VRT',
-            'duration': 1939.0,
-            'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg',
-            'release_date': '20230116',
-            'upload_date': '20230116',
-            'age_limit': 12,
+            'channel': 'ketnet',
+            'description': 'Neem een kijkje in de bijzondere wereld van deze Ketnetters.',
+            'display_id': 'ket---doc-trailer-s6',
+            'duration': 30.0,
+            'episode': 'Reeks 6 volledig vanaf 3 maart',
+            'episode_id': '1739450401467',
+            'season': 'Trailer',
+            'season_id': '1739450401467',
+            'series': 'Ket & Doc',
+            'thumbnail': 'https://images.vrt.be/orig/2025/02/21/63f07122-5bbd-4ca1-b42e-8565c6cd95df.jpg',
+            'timestamp': 1740373200,
+            'title': 'Reeks 6 volledig vanaf 3 maart',
+            'upload_date': '20250224',
+            '_old_archive_ids': [
+                'canvas pbs-pub-c8a78645-5d3e-468a-89ec-6f3ed5534bd5$vid-242ddfe9-18f5-4e16-ab45-09b122a19251',
+                'ketnet pbs-pub-c8a78645-5d3e-468a-89ec-6f3ed5534bd5$vid-242ddfe9-18f5-4e16-ab45-09b122a19251',
+            ],
         },
     }, {
-        'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/',
+        'url': 'https://www.vrt.be/vrtmax/a-z/meisjes/6/meisjes-s6a5/',
         'info_dict': {
-            'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee',
+            'id': 'pbs-pub-97b541ab-e05c-43b9-9a40-445702ef7189$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e',
             'ext': 'mp4',
-            'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'',
-            'description': 'md5:197424726c61384b4e5c519f16c0cf02',
-            'timestamp': 1652940000,
-            'release_timestamp': 1652940000,
-            'series': 'Buurman, wat doet u nu?',
-            'season': 'Seizoen 6',
+            'channel': 'ketnet',
+            'description': 'md5:713793f15cbf677f66200b36b7b1ec5a',
+            'display_id': 'meisjes-s6a5',
+            'duration': 1336.02,
+            'episode': 'Week 5',
+            'episode_id': '1684157692901',
+            'episode_number': 5,
+            'season': '6',
+            'season_id': '1684157692901',
             'season_number': 6,
-            'season_id': '1652344200907',
-            'episode': 'Aflevering 0',
-            'episode_number': 0,
-            'episode_id': '1652951873524',
-            'display_id': 'buurman--wat-doet-u-nu--s6-trailer',
-            'channel': 'VRT',
-            'duration': 33.13,
-            'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg',
-            'release_date': '20220519',
-            'upload_date': '20220519',
+            'series': 'Meisjes',
+            'thumbnail': 'https://images.vrt.be/orig/2023/05/14/bf526ae0-f1d9-11ed-91d7-02b7b76bf47f.jpg',
+            'timestamp': 1685251800,
+            'title': 'Week 5',
+            'upload_date': '20230528',
+            '_old_archive_ids': [
+                'canvas pbs-pub-97b541ab-e05c-43b9-9a40-445702ef7189$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e',
+                'ketnet pbs-pub-97b541ab-e05c-43b9-9a40-445702ef7189$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e',
+            ],
+        },
+    }, {
+        'url': 'https://www.vrt.be/vrtnu/a-z/taboe/3/taboe-s3a4/',
+        'info_dict': {
+            'id': 'pbs-pub-f50faa3a-1778-46b6-9117-4ba85f197703$vid-547507fe-1c8b-4394-b361-21e627cbd0fd',
+            'ext': 'mp4',
+            'channel': 'een',
+            'description': 'md5:bf61345a95eca9393a95de4a7a54b5c6',
+            'display_id': 'taboe-s3a4',
+            'duration': 2882.02,
+            'episode': 'Mensen met het syndroom van Gilles de la Tourette',
+            'episode_id': '1739055911734',
+            'episode_number': 4,
+            'season': '3',
+            'season_id': '1739055911734',
+            'season_number': 3,
+            'series': 'Taboe',
+            'thumbnail': 'https://images.vrt.be/orig/2025/02/19/8198496c-d1ae-4bca-9a48-761cf3ea3ff2.jpg',
+            'timestamp': 1740286800,
+            'title': 'Mensen met het syndroom van Gilles de la Tourette',
+            'upload_date': '20250223',
+            '_old_archive_ids': [
+                'canvas pbs-pub-f50faa3a-1778-46b6-9117-4ba85f197703$vid-547507fe-1c8b-4394-b361-21e627cbd0fd',
+                'ketnet pbs-pub-f50faa3a-1778-46b6-9117-4ba85f197703$vid-547507fe-1c8b-4394-b361-21e627cbd0fd',
+            ],
         },
-        'params': {'skip_download': 'm3u8'},
     }]
     _NETRC_MACHINE = 'vrtnu'
-    _authenticated = False
+
+    _TOKEN_COOKIE_DOMAIN = '.www.vrt.be'
+    _ACCESS_TOKEN_COOKIE_NAME = 'vrtnu-site_profile_at'
+    _REFRESH_TOKEN_COOKIE_NAME = 'vrtnu-site_profile_rt'
+    _VIDEO_TOKEN_COOKIE_NAME = 'vrtnu-site_profile_vt'
+    _VIDEO_PAGE_QUERY = '''
+    query VideoPage($pageId: ID!) {
+        page(id: $pageId) {
+            ... on EpisodePage {
+                episode {
+                    ageRaw
+                    description
+                    durationRaw
+                    episodeNumberRaw
+                    id
+                    name
+                    onTimeRaw
+                    program {
+                        title
+                    }
+                    season {
+                        id
+                        titleRaw
+                    }
+                    title
+                    brand
+                }
+                ldjson
+                player {
+                    image {
+                        templateUrl
+                    }
+                    modes {
+                        streamId
+                    }
+                }
+            }
+        }
+    }
+    '''
+
+    def _fetch_tokens(self):
+        has_credentials = self._get_login_info()[0]
+        access_token = self._get_vrt_cookie(self._ACCESS_TOKEN_COOKIE_NAME)
+        video_token = self._get_vrt_cookie(self._VIDEO_TOKEN_COOKIE_NAME)
+
+        if (access_token and not self._is_jwt_token_expired(access_token)
+                and video_token and not self._is_jwt_token_expired(video_token)):
+            return access_token, video_token
+
+        if has_credentials:
+            access_token, video_token = self.cache.load(self._NETRC_MACHINE, 'token_data', default=(None, None))
+
+            if (access_token and not self._is_jwt_token_expired(access_token)
+                    and video_token and not self._is_jwt_token_expired(video_token)):
+                self.write_debug('Restored tokens from cache')
+                self._set_cookie(self._TOKEN_COOKIE_DOMAIN, self._ACCESS_TOKEN_COOKIE_NAME, access_token)
+                self._set_cookie(self._TOKEN_COOKIE_DOMAIN, self._VIDEO_TOKEN_COOKIE_NAME, video_token)
+                return access_token, video_token
+
+        if not self._get_vrt_cookie(self._REFRESH_TOKEN_COOKIE_NAME):
+            return None, None
+
+        self._request_webpage(
+            'https://www.vrt.be/vrtmax/sso/refresh', None,
+            note='Refreshing tokens', errnote='Failed to refresh tokens', fatal=False)
+
+        access_token = self._get_vrt_cookie(self._ACCESS_TOKEN_COOKIE_NAME)
+        video_token = self._get_vrt_cookie(self._VIDEO_TOKEN_COOKIE_NAME)
+
+        if not access_token or not video_token:
+            self.cache.store(self._NETRC_MACHINE, 'refresh_token', None)
+            self.cookiejar.clear(self._TOKEN_COOKIE_DOMAIN, '/vrtmax/sso', self._REFRESH_TOKEN_COOKIE_NAME)
+            msg = 'Refreshing of tokens failed'
+            if not has_credentials:
+                self.report_warning(msg)
+                return None, None
+            self.report_warning(f'{msg}. Re-logging in')
+            return self._perform_login(*self._get_login_info())
+
+        if has_credentials:
+            self.cache.store(self._NETRC_MACHINE, 'token_data', (access_token, video_token))
+
+        return access_token, video_token
+
+    def _get_vrt_cookie(self, cookie_name):
+        # Refresh token cookie is scoped to /vrtmax/sso, others are scoped to /
+        return try_call(lambda: self._get_cookies('https://www.vrt.be/vrtmax/sso')[cookie_name].value)
+
+    @staticmethod
+    def _is_jwt_token_expired(token):
+        return jwt_decode_hs256(token)['exp'] - time.time() < 300
 
     def _perform_login(self, username, password):
-        auth_info = self._gigya_login({
-            'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy',
-            'targetEnv': 'jssdk',
-            'loginID': username,
-            'password': password,
-            'authMode': 'cookie',
-        })
+        refresh_token = self._get_vrt_cookie(self._REFRESH_TOKEN_COOKIE_NAME)
+        if refresh_token and not self._is_jwt_token_expired(refresh_token):
+            self.write_debug('Using refresh token from logged-in cookies; skipping login with credentials')
+            return
 
-        if auth_info.get('errorDetails'):
-            raise ExtractorError(f'Unable to login. VrtNU said: {auth_info["errorDetails"]}', expected=True)
+        refresh_token = self.cache.load(self._NETRC_MACHINE, 'refresh_token', default=None)
+        if refresh_token and not self._is_jwt_token_expired(refresh_token):
+            self.write_debug('Restored refresh token from cache')
+            self._set_cookie(self._TOKEN_COOKIE_DOMAIN, self._REFRESH_TOKEN_COOKIE_NAME, refresh_token, path='/vrtmax/sso')
+            return
 
-        # Sometimes authentication fails for no good reason, retry
-        for retry in self.RetryManager():
-            if retry.attempt > 1:
-                self._sleep(1, None)
-            try:
-                self._request_webpage(
-                    'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token',
-                    errnote='Could not get XSRF Token', query={
-                        'provider': 'site',
-                        'destination': 'https://www.vrt.be/vrtnu/',
-                    })
-                self._request_webpage(
-                    'https://login.vrt.be/perform_login', None,
-                    note='Performing login', errnote='Login failed',
-                    query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({
-                        'UID': auth_info['UID'],
-                        'UIDSignature': auth_info['UIDSignature'],
-                        'signatureTimestamp': auth_info['signatureTimestamp'],
-                        '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value,
-                    }))
-            except ExtractorError as e:
-                if isinstance(e.cause, HTTPError) and e.cause.status == 401:
-                    retry.error = e
-                    continue
-                raise
+        self._request_webpage(
+            'https://www.vrt.be/vrtmax/sso/login', None,
+            note='Getting session cookies', errnote='Failed to get session cookies')
 
-        self._authenticated = True
+        login_data = self._download_json(
+            'https://login.vrt.be/perform_login', None, data=json.dumps({
+                'clientId': 'vrtnu-site',
+                'loginID': username,
+                'password': password,
+            }).encode(), headers={
+                'Content-Type': 'application/json',
+                'Oidcxsrf': self._get_cookies('https://login.vrt.be')['OIDCXSRF'].value,
+            }, note='Logging in', errnote='Login failed', expected_status=403)
+        if login_data.get('errorCode'):
+            raise ExtractorError(f'Login failed: {login_data.get("errorMessage")}', expected=True)
+
+        self._request_webpage(
+            login_data['redirectUrl'], None,
+            note='Getting access token', errnote='Failed to get access token')
+
+        access_token = self._get_vrt_cookie(self._ACCESS_TOKEN_COOKIE_NAME)
+        video_token = self._get_vrt_cookie(self._VIDEO_TOKEN_COOKIE_NAME)
+        refresh_token = self._get_vrt_cookie(self._REFRESH_TOKEN_COOKIE_NAME)
+
+        if not all((access_token, video_token, refresh_token)):
+            raise ExtractorError('Unable to extract token cookie values')
+
+        self.cache.store(self._NETRC_MACHINE, 'token_data', (access_token, video_token))
+        self.cache.store(self._NETRC_MACHINE, 'refresh_token', refresh_token)
+
+        return access_token, video_token
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-        parsed_url = urllib.parse.urlparse(url)
-        details = self._download_json(
-            f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json',
-            display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details']
+        access_token, video_token = self._fetch_tokens()
 
-        watch_info = traverse_obj(details, (
-            'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {}
-        video_id = join_nonempty(
-            'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info)
-        if '$' not in video_id:
-            raise ExtractorError('Unable to extract video ID')
+        metadata = self._download_json(
+            f'https://www.vrt.be/vrtnu-api/graphql{"" if access_token else "/public"}/v1',
+            display_id, 'Downloading asset JSON', 'Unable to download asset JSON',
+            data=json.dumps({
+                'operationName': 'VideoPage',
+                'query': self._VIDEO_PAGE_QUERY,
+                'variables': {'pageId': urllib.parse.urlparse(url).path},
+            }).encode(),
+            headers=filter_dict({
+                'Authorization': f'Bearer {access_token}' if access_token else None,
+                'Content-Type': 'application/json',
+                'x-vrt-client-name': 'WEB',
+                'x-vrt-client-version': '1.5.9',
+                'x-vrt-zone': 'default',
+            }))['data']['page']
 
-        vrtnutoken = self._download_json(
-            'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken',
-            errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None
+        video_id = metadata['player']['modes'][0]['streamId']
 
-        video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken)
+        try:
+            streaming_info = self._call_api(video_id, 'vrtnu-web@PROD', id_token=video_token)
+        except ExtractorError as e:
+            if not video_token and isinstance(e.cause, HTTPError) and e.cause.status == 404:
+                self.raise_login_required()
+            raise
 
-        if 'title' not in video_info:
-            code = video_info.get('code')
-            if code in ('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'):
-                self.raise_login_required(code, method='password')
-            elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'):
+        formats, subtitles = self._extract_formats_and_subtitles(streaming_info, video_id)
+
+        code = traverse_obj(streaming_info, ('code', {str}))
+        if not formats and code:
+            if code in ('CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS', 'CONTENT_AVAILABLE_ONLY_IN_BE', 'CONTENT_UNAVAILABLE_VIA_PROXY'):
                 self.raise_geo_restricted(countries=['BE'])
-            elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS':
-                if not self._authenticated:
-                    self.raise_login_required(code, method='password')
-                self.raise_geo_restricted(countries=['BE'])
-            raise ExtractorError(code, expected=True)
-
-        formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id)
+            elif code in ('CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS', 'CONTENT_IS_AGE_RESTRICTED', 'CONTENT_REQUIRES_AUTHENTICATION'):
+                self.raise_login_required()
+            else:
+                self.raise_no_formats(f'Unable to extract formats: {code}')
 
         return {
-            **traverse_obj(details, {
-                'title': 'title',
-                'description': ('description', {clean_html}),
-                'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}),
-                'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}),
-                'series': ('data', 'program', 'title'),
-                'season': ('data', 'season', 'title', 'value'),
-                'season_number': ('data', 'season', 'title', 'raw', {int_or_none}),
-                'season_id': ('data', 'season', 'id', {str_or_none}),
-                'episode': ('data', 'episode', 'number', 'value', {str_or_none}),
-                'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}),
-                'episode_id': ('data', 'episode', 'id', {str_or_none}),
-                'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}),
-            }),
+            'duration': float_or_none(streaming_info.get('duration'), 1000),
+            'thumbnail': url_or_none(streaming_info.get('posterImageUrl')),
+            **self._json_ld(traverse_obj(metadata, ('ldjson', ..., {json.loads})), video_id, fatal=False),
+            **traverse_obj(metadata, ('episode', {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'timestamp': ('onTimeRaw', {parse_iso8601}),
+                'series': ('program', 'title', {str}),
+                'season': ('season', 'titleRaw', {str}),
+                'season_number': ('season', 'titleRaw', {int_or_none}),
+                'season_id': ('id', {str_or_none}),
+                'episode': ('title', {str}),
+                'episode_number': ('episodeNumberRaw', {int_or_none}),
+                'episode_id': ('id', {str_or_none}),
+                'age_limit': ('ageRaw', {parse_age_limit}),
+                'channel': ('brand', {str}),
+                'duration': ('durationRaw', {parse_duration}),
+            })),
             'id': video_id,
             'display_id': display_id,
-            'channel': 'VRT',
-            'formats': formats,
-            'duration': float_or_none(video_info.get('duration'), 1000),
-            'thumbnail': url_or_none(video_info.get('posterImageUrl')),
-            'subtitles': subtitles,
-            '_old_archive_ids': [make_archive_id('Canvas', video_id)],
-        }
-
-
-class KetnetIE(VRTBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)'
-    _TESTS = [{
-        'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5',
-        'info_dict': {
-            'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e',
-            'ext': 'mp4',
-            'title': 'Meisjes',
-            'episode': 'Reeks 6: Week 5',
-            'season': 'Reeks 6',
-            'series': 'Meisjes',
-            'timestamp': 1685251800,
-            'upload_date': '20230528',
-        },
-        'params': {'skip_download': 'm3u8'},
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        video = self._download_json(
-            'https://senior-bff.ketnet.be/graphql', display_id, query={
-                'query': '''{
-  video(id: "content/ketnet/nl/%s.model.json") {
-    description
-    episodeNr
-    imageUrl
-    mediaReference
-    programTitle
-    publicationDate
-    seasonTitle
-    subtitleVideodetail
-    titleVideodetail
-  }
-}''' % display_id,  # noqa: UP031
-            })['data']['video']
-
-        video_id = urllib.parse.unquote(video['mediaReference'])
-        data = self._call_api(video_id, 'ketnet@PROD', version='v1')
-        formats, subtitles = self._extract_formats_and_subtitles(data, video_id)
-
-        return {
-            'id': video_id,
-            'formats': formats,
-            'subtitles': subtitles,
-            '_old_archive_ids': [make_archive_id('Canvas', video_id)],
-            **traverse_obj(video, {
-                'title': ('titleVideodetail', {str}),
-                'description': ('description', {str}),
-                'thumbnail': ('thumbnail', {url_or_none}),
-                'timestamp': ('publicationDate', {parse_iso8601}),
-                'series': ('programTitle', {str}),
-                'season': ('seasonTitle', {str}),
-                'episode': ('subtitleVideodetail', {str}),
-                'episode_number': ('episodeNr', {int_or_none}),
-            }),
+            'formats': formats,
+            'subtitles': subtitles,
+            '_old_archive_ids': [make_archive_id('Canvas', video_id),
+                                 make_archive_id('Ketnet', video_id)],
         }
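A side note on the token handling above: `_is_jwt_token_expired` treats a token as expired five minutes early, so a request never races the real expiry. A minimal standalone sketch of the same check (not part of the patch; `jwt_payload` is a hypothetical helper mirroring what `yt_dlp.utils.jwt_decode_hs256` does internally):

    import base64
    import json
    import time

    def jwt_payload(token):
        # Decode the JWT payload segment without verifying the signature
        payload_b64 = token.split('.')[1]
        payload_b64 += '=' * (-len(payload_b64) % 4)  # restore stripped base64 padding
        return json.loads(base64.urlsafe_b64decode(payload_b64))

    def is_expired(token, margin=300):
        # Expired, or within `margin` seconds of expiring
        return jwt_payload(token)['exp'] - time.time() < margin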
diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py
index 6e57446e9..420ac3829 100644
--- a/yt_dlp/extractor/weibo.py
+++ b/yt_dlp/extractor/weibo.py
@@ -12,6 +12,7 @@
     str_or_none,
     strip_jsonp,
     traverse_obj,
+    truncate_string,
     url_or_none,
     urlencode_postdata,
     urljoin,
@@ -96,7 +97,8 @@ def _extract_formats(self, video_info):
                 })
         return formats
 
-    def _parse_video_info(self, video_info, video_id=None):
+    def _parse_video_info(self, video_info):
+        video_id = traverse_obj(video_info, (('id', 'id_str', 'mid'), {str_or_none}, any))
         return {
             'id': video_id,
             'extractor_key': WeiboIE.ie_key(),
@@ -105,9 +107,10 @@ def _parse_video_info(self, video_info, video_id=None):
             'http_headers': {'Referer': 'https://weibo.com/'},
             '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
             **traverse_obj(video_info, {
-                'id': (('id', 'id_str', 'mid'), {str_or_none}),
                 'display_id': ('mblogid', {str_or_none}),
-                'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, filter),
+                'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'),
+                          {lambda x: x.replace('\n', ' ')}, {truncate_string(left=72)}, filter),
+                'alt_title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, filter),
                 'description': ('text_raw', {str}),
                 'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
                 'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
@@ -129,9 +132,11 @@ class WeiboIE(WeiboBaseIE):
         'url': 'https://weibo.com/7827771738/N4xlMvjhI',
         'info_dict': {
             'id': '4910815147462302',
+            '_old_archive_ids': ['weibomobile 4910815147462302'],
             'ext': 'mp4',
             'display_id': 'N4xlMvjhI',
             'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
+            'alt_title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
             'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
             'duration': 918,
             'timestamp': 1686312819,
@@ -149,9 +154,11 @@ class WeiboIE(WeiboBaseIE):
         'url': 'https://m.weibo.cn/status/4189191225395228',
         'info_dict': {
             'id': '4189191225395228',
+            '_old_archive_ids': ['weibomobile 4189191225395228'],
             'ext': 'mp4',
             'display_id': 'FBqgOmDxO',
             'title': '柴犬柴犬的秒拍视频',
+            'alt_title': '柴犬柴犬的秒拍视频',
             'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
             'duration': 53,
             'timestamp': 1514264429,
@@ -166,34 +173,35 @@ class WeiboIE(WeiboBaseIE):
         },
     }, {
         'url': 'https://m.weibo.cn/detail/4189191225395228',
-        'info_dict': {
-            'id': '4189191225395228',
-            'ext': 'mp4',
-            'display_id': 'FBqgOmDxO',
-            'title': '柴犬柴犬的秒拍视频',
-            'description': '午睡当然是要甜甜蜜蜜的啦![坏笑] Instagram:shibainu.gaku http://t.cn/RHbmjzW ',
-            'duration': 53,
-            'timestamp': 1514264429,
-            'upload_date': '20171226',
-            'thumbnail': r're:https://.*\.jpg',
-            'uploader': '柴犬柴犬',
-            'uploader_id': '5926682210',
-            'uploader_url': 'https://weibo.com/u/5926682210',
-            'view_count': int,
-            'like_count': int,
-            'repost_count': int,
-        },
+        'only_matching': True,
     }, {
         'url': 'https://weibo.com/0/4224132150961381',
         'note': 'no playback_list example',
         'only_matching': True,
+    }, {
+        'url': 'https://m.weibo.cn/detail/5120561132606436',
+        'info_dict': {
+            'id': '5120561132606436',
+        },
+        'playlist_count': 9,
    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        return self._parse_video_info(self._weibo_download_json(
-            f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
+        meta = self._weibo_download_json(f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id)
+        mix_media_info = traverse_obj(meta, ('mix_media_info', 'items', ...))
+        if not mix_media_info:
+            return self._parse_video_info(meta)
+
+        return self.playlist_result(self._entries(mix_media_info), video_id)
+
+    def _entries(self, mix_media_info):
+        for media_info in traverse_obj(mix_media_info, lambda _, v: v['type'] != 'pic'):
+            yield self._parse_video_info(traverse_obj(media_info, {
+                'id': ('data', 'object_id'),
+                'page_info': {'media_info': ('data', 'media_info', {dict})},
+            }))
 
 
 class WeiboVideoIE(WeiboBaseIE):
@@ -205,6 +213,7 @@ class WeiboVideoIE(WeiboBaseIE):
             'ext': 'mp4',
             'display_id': 'LEZDodaiW',
             'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了',
+            'alt_title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了',
             'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM \u200b\u200b\u200b',
             'duration': 76,
             'timestamp': 1659344278,
@@ -216,6 +225,7 @@ class WeiboVideoIE(WeiboBaseIE):
             'view_count': int,
             'like_count': int,
             'repost_count': int,
+            '_old_archive_ids': ['weibomobile 4797700463137878'],
         },
     }]
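For readers unfamiliar with the traversal used in `_parse_video_info` above: a tuple of keys branches over alternatives, and the `any` terminal returns the first non-None result. A small illustration (the `post` dict is a hypothetical payload shaped like the Weibo API response):

    from yt_dlp.utils import str_or_none
    from yt_dlp.utils.traversal import traverse_obj

    post = {'id': None, 'id_str': '4189191225395228', 'mid': 4189191225395228}
    # Tries 'id', 'id_str' and 'mid' in order, coerces each with str_or_none,
    # then `any` picks the first usable value
    video_id = traverse_obj(post, (('id', 'id_str', 'mid'), {str_or_none}, any))
    assert video_id == '4189191225395228'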
diff --git a/yt_dlp/extractor/weverse.py b/yt_dlp/extractor/weverse.py
index 53ad1100d..42b1189fe 100644
--- a/yt_dlp/extractor/weverse.py
+++ b/yt_dlp/extractor/weverse.py
@@ -290,12 +290,14 @@ def _real_extract(self, url):
 
         elif live_status == 'is_live':
             video_info = self._call_api(
-                f'/video/v1.2/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2',
+                f'/video/v1.3/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2',
                 video_id, note='Downloading live JSON')
             playback = self._parse_json(video_info['lipPlayback'], video_id)
             m3u8_url = traverse_obj(playback, (
                 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False)
-            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True)
+            # Live subtitles are not downloadable, but extract to silence "ignoring subs" warning
+            formats, _ = self._extract_m3u8_formats_and_subtitles(
+                m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True)
 
         elif live_status == 'post_live':
             if availability in ('premium_only', 'subscriber_only'):
diff --git a/yt_dlp/extractor/wsj.py b/yt_dlp/extractor/wsj.py
index b6b656f7d..7cf46141c 100644
--- a/yt_dlp/extractor/wsj.py
+++ b/yt_dlp/extractor/wsj.py
@@ -100,8 +100,8 @@ def _real_extract(self, url):
 
 
 class WSJArticleIE(InfoExtractor):
-    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
-    _TEST = {
+    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/(?:articles|opinion)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
         'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
         'info_dict': {
             'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
@@ -110,11 +110,20 @@ class WSJArticleIE(InfoExtractor):
             'uploader_id': 'ralcaraz',
             'title': 'Bao Bao the Panda Leaves for China',
         },
-    }
+    }, {
+        'url': 'https://www.wsj.com/opinion/hamas-hostages-caskets-bibas-family-israel-gaza-29da083b',
+        'info_dict': {
+            'id': 'CE68D629-8DB8-4CD3-B30A-92112C102054',
+            'ext': 'mp4',
+            'upload_date': '20241007',
+            'uploader_id': 'Tinnes, David',
+            'title': 'WSJ Opinion: "Get the Jew": The Crown Heights Riot Revisited',
+        },
+    }]
 
     def _real_extract(self, url):
         article_id = self._match_id(url)
-        webpage = self._download_webpage(url, article_id)
+        webpage = self._download_webpage(url, article_id, impersonate=True)
         video_id = self._search_regex(
             r'(?:id=["\']video|video-|iframe\.html\?guid=|data-src=["\'])([a-fA-F0-9-]{36})',
             webpage, 'video id')
diff --git a/yt_dlp/extractor/wykop.py b/yt_dlp/extractor/wykop.py
index 2ae0a2a5e..08cad1fff 100644
--- a/yt_dlp/extractor/wykop.py
+++ b/yt_dlp/extractor/wykop.py
@@ -11,7 +11,7 @@
 )
 
 
-class WykopBaseExtractor(InfoExtractor):
+class WykopBaseIE(InfoExtractor):
     def _get_token(self, force_refresh=False):
         if not force_refresh:
             maybe_cached = self.cache.load('wykop', 'bearer')
@@ -72,7 +72,7 @@ def _common_data_extract(self, data):
         }
 
 
-class WykopDigIE(WykopBaseExtractor):
+class WykopDigIE(WykopBaseIE):
     IE_NAME = 'wykop:dig'
     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'
 
@@ -128,7 +128,7 @@ def _real_extract(self, url):
         }
 
 
-class WykopDigCommentIE(WykopBaseExtractor):
+class WykopDigCommentIE(WykopBaseIE):
     IE_NAME = 'wykop:dig:comment'
     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'
 
@@ -177,7 +177,7 @@ def _real_extract(self, url):
         }
 
 
-class WykopPostIE(WykopBaseExtractor):
+class WykopPostIE(WykopBaseIE):
     IE_NAME = 'wykop:post'
     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'
 
@@ -228,7 +228,7 @@ def _real_extract(self, url):
         }
 
 
-class WykopPostCommentIE(WykopBaseExtractor):
+class WykopPostCommentIE(WykopBaseIE):
     IE_NAME = 'wykop:post:comment'
     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'
diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py
index cdd32c5e4..09e7c9878 100644
--- a/yt_dlp/extractor/yandexvideo.py
+++ b/yt_dlp/extractor/yandexvideo.py
@@ -2,15 +2,17 @@
 
 from .common import InfoExtractor
 from ..utils import (
+    bug_reports_message,
     determine_ext,
-    extract_attributes,
     int_or_none,
     lowercase_escape,
     parse_qs,
-    traverse_obj,
+    qualities,
     try_get,
+    update_url_query,
     url_or_none,
 )
+from ..utils.traversal import traverse_obj
 
 
 class YandexVideoIE(InfoExtractor):
@@ -186,7 +188,22 @@ def _real_extract(self, url):
         return self.url_result(data_json['video']['url'])
 
 
-class ZenYandexIE(InfoExtractor):
+class ZenYandexBaseIE(InfoExtractor):
+    def _fetch_ssr_data(self, url, video_id):
+        webpage = self._download_webpage(url, video_id)
+        redirect = self._search_json(
+            r'(?:var|let|const)\s+it\s*=', webpage, 'redirect', video_id, default={}).get('retpath')
+        if redirect:
+            video_id = self._match_id(redirect)
+            webpage = self._download_webpage(redirect, video_id, note='Redirecting')
+        return video_id, self._search_json(
+            r'(?:var|let|const)\s+_params\s*=\s*\(', webpage, 'metadata', video_id,
+            contains_pattern=r'{["\']ssrData.+}')['ssrData']
+
+
+class ZenYandexIE(ZenYandexBaseIE):
+    IE_NAME = 'dzen.ru'
+    IE_DESC = 'Дзен (dzen) formerly Яндекс.Дзен (Yandex Zen)'
     _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)'
     _TESTS = [{
         'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7',
@@ -216,6 +233,7 @@ class ZenYandexIE(InfoExtractor):
             'timestamp': 1573465585,
         },
         'params': {'skip_download': 'm3u8'},
+        'skip': 'The page does not exist',
     }, {
         'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3',
         'info_dict': {
@@ -227,6 +245,9 @@ class ZenYandexIE(InfoExtractor):
             'uploader': 'TechInsider',
             'timestamp': 1611378221,
             'upload_date': '20210123',
+            'view_count': int,
+            'duration': 243,
+            'tags': ['опыт', 'эксперимент', 'огонь'],
         },
         'params': {'skip_download': 'm3u8'},
     }, {
@@ -240,6 +261,9 @@ class ZenYandexIE(InfoExtractor):
             'uploader': 'TechInsider',
             'upload_date': '20210123',
             'timestamp': 1611378221,
+            'view_count': int,
+            'duration': 243,
+            'tags': ['опыт', 'эксперимент', 'огонь'],
         },
         'params': {'skip_download': 'm3u8'},
     }, {
@@ -252,44 +276,56 @@ class ZenYandexIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        redirect = self._search_json(r'var it\s*=', webpage, 'redirect', id, default={}).get('retpath')
-        if redirect:
-            video_id = self._match_id(redirect)
-            webpage = self._download_webpage(redirect, video_id, note='Redirecting')
-        data_json = self._search_json(
-            r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
-        serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state')
-        uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)',
-                                      webpage, 'uploader', default='<a>')
-        uploader_name = extract_attributes(uploader).get('aria-label')
-        item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str}))
-        video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {}
+        video_id, ssr_data = self._fetch_ssr_data(url, video_id)
+        video_data = ssr_data['videoMetaResponse']
 
         formats, subtitles = [], {}
-        for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})):
+        quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))
+        # Deduplicate stream URLs. The "dzen_dash" query parameter is present in some URLs but can be omitted
+        stream_urls = set(traverse_obj(video_data, (
+            'video', ('id', ('streams', ...), ('mp4Streams', ..., 'url'), ('oneVideoStreams', ..., 'url')),
+            {url_or_none}, {update_url_query(query={'dzen_dash': []})})))
+        for s_url in stream_urls:
             ext = determine_ext(s_url)
-            if ext == 'mpd':
-                fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')
-            elif ext == 'm3u8':
-                fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4')
+            content_type = traverse_obj(parse_qs(s_url), ('ct', 0))
+            if ext == 'mpd' or content_type == '6':
+                fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash', fatal=False)
+            elif ext == 'm3u8' or content_type == '8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(s_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+            elif content_type == '0':
+                format_type = traverse_obj(parse_qs(s_url), ('type', 0))
+                formats.append({
+                    'url': s_url,
+                    'format_id': format_type,
+                    'ext': 'mp4',
+                    'quality': quality(format_type),
+                })
+                continue
+            else:
+                self.report_warning(f'Unsupported stream URL: {s_url}{bug_reports_message()}')
+                continue
             formats.extend(fmts)
-            subtitles = self._merge_subtitles(subtitles, subs)
+            self._merge_subtitles(subs, target=subtitles)
+
         return {
             'id': video_id,
-            'title': video_json.get('title') or self._og_search_title(webpage),
             'formats': formats,
             'subtitles': subtitles,
-            'duration': int_or_none(video_json.get('duration')),
-            'view_count': int_or_none(video_json.get('views')),
-            'timestamp': int_or_none(video_json.get('publicationDate')),
-            'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']),
-            'description': video_json.get('description') or self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']),
+            **traverse_obj(video_data, {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'thumbnail': ('image', {url_or_none}),
+                'duration': ('video', 'duration', {int_or_none}),
+                'view_count': ('video', 'views', {int_or_none}),
+                'timestamp': ('publicationDate', {int_or_none}),
+                'tags': ('tags', ..., {str}),
+                'uploader': ('source', 'title', {str}),
+            }),
         }
 
 
-class ZenYandexChannelIE(InfoExtractor):
+class ZenYandexChannelIE(ZenYandexBaseIE):
+    IE_NAME = 'dzen.ru:channel'
     _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)'
     _TESTS = [{
        'url': 'https://zen.yandex.ru/tok_media',
@@ -323,8 +359,8 @@ class ZenYandexChannelIE(InfoExtractor):
         'url': 'https://zen.yandex.ru/jony_me',
         'info_dict': {
             'id': 'jony_me',
-            'description': 'md5:ce0a5cad2752ab58701b5497835b2cc5',
-            'title': 'JONY ',
+            'description': 'md5:7c30d11dc005faba8826feae99da3113',
+            'title': 'JONY',
         },
         'playlist_count': 18,
     }, {
@@ -333,9 +369,8 @@ class ZenYandexChannelIE(InfoExtractor):
         'url': 'https://zen.yandex.ru/tatyanareva',
         'info_dict': {
             'id': 'tatyanareva',
-            'description': 'md5:40a1e51f174369ec3ba9d657734ac31f',
+            'description': 'md5:92e56fa730a932ca2483ba5c2186ad96',
             'title': 'Татьяна Рева',
-            'entries': 'maxcount:200',
         },
         'playlist_mincount': 46,
     }, {
@@ -348,43 +383,31 @@ class ZenYandexChannelIE(InfoExtractor):
         'playlist_mincount': 657,
     }]
 
-    def _entries(self, item_id, server_state_json, server_settings_json):
-        items = (traverse_obj(server_state_json, ('feed', 'items', ...))
-                 or traverse_obj(server_settings_json, ('exportData', 'items', ...)))
-
-        more = (traverse_obj(server_state_json, ('links', 'more'))
-                or traverse_obj(server_settings_json, ('exportData', 'more', 'link')))
-
+    def _entries(self, feed_data, channel_id):
         next_page_id = None
         for page in itertools.count(1):
-            for item in items or []:
-                if item.get('type') != 'gif':
-                    continue
-                video_id = traverse_obj(item, 'publication_id', 'publicationId') or ''
-                yield self.url_result(item['link'], ZenYandexIE, video_id.split(':')[-1])
+            for item in traverse_obj(feed_data, (
+                    (None, ('items', lambda _, v: v['tab'] in ('shorts', 'longs'))),
+                    'items', lambda _, v: url_or_none(v['link']),
+            )):
+                yield self.url_result(item['link'], ZenYandexIE, item.get('id'), title=item.get('title'))
 
+            more = traverse_obj(feed_data, ('more', 'link', {url_or_none}))
             current_page_id = next_page_id
             next_page_id = traverse_obj(parse_qs(more), ('next_page_id', -1))
-            if not all((more, items, next_page_id, next_page_id != current_page_id)):
+            if not all((more, next_page_id, next_page_id != current_page_id)):
                 break
 
-            data = self._download_json(more, item_id, note=f'Downloading Page {page}')
-            items, more = data.get('items'), traverse_obj(data, ('more', 'link'))
+            feed_data = self._download_json(more, channel_id, note=f'Downloading Page {page}')
 
     def _real_extract(self, url):
-        item_id = self._match_id(url)
-        webpage = self._download_webpage(url, item_id)
-        redirect = self._search_json(
-            r'var it\s*=', webpage, 'redirect', item_id, default={}).get('retpath')
-        if redirect:
-            item_id = self._match_id(redirect)
-            webpage = self._download_webpage(redirect, item_id, note='Redirecting')
-        data = self._search_json(
-            r'("data"\s*:|data\s*=)', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}')
-        server_state_json = traverse_obj(data, lambda k, _: k.startswith('__serverState__'), get_all=False)
-        server_settings_json = traverse_obj(data, lambda k, _: k.startswith('__serverSettings__'), get_all=False)
+        channel_id = self._match_id(url)
+        channel_id, ssr_data = self._fetch_ssr_data(url, channel_id)
+        channel_data = ssr_data['exportResponse']
 
         return self.playlist_result(
-            self._entries(item_id, server_state_json, server_settings_json),
-            item_id, traverse_obj(server_state_json, ('channel', 'source', 'title')),
-            traverse_obj(server_state_json, ('channel', 'source', 'description')))
+            self._entries(channel_data['feedData'], channel_id),
+            channel_id, **traverse_obj(channel_data, ('channel', 'source', {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+            })))
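A quick note on the `qualities()` helper used in the ZenYandexIE change above: it ranks a format ID by its index in the given tuple (higher index = better) and maps unknown IDs to -1, which is how the bare-MP4 streams get ordered by their `type` query parameter:

    from yt_dlp.utils import qualities

    quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7'))
    assert quality('7') == 7   # best-ranked format type
    assert quality('4') == 0   # worst-ranked format type
    assert quality('x') == -1  # unknown IDs sort below all known ones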
diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py
index 8eb77aa03..b3c4b76d7 100644
--- a/yt_dlp/extractor/youporn.py
+++ b/yt_dlp/extractor/youporn.py
@@ -227,7 +227,7 @@ def extract_tag_box(regex, title):
     return result
 
 
-class YouPornListBase(InfoExtractor):
+class YouPornListBaseIE(InfoExtractor):
     def _get_next_url(self, url, pl_id, html):
         return urljoin(url, self._search_regex(
             r'''<a [^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''',
@@ -284,7 +284,7 @@ def _real_extract(self, url, html=None):
             playlist_id=pl_id, playlist_title=title)
 
 
-class YouPornCategoryIE(YouPornListBase):
+class YouPornCategoryIE(YouPornListBaseIE):
     IE_DESC = 'YouPorn category, with sorting, filtering and pagination'
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?youporn\.com/
@@ -319,7 +319,7 @@ class YouPornCategoryIE(YouPornListBase):
     }]
 
 
-class YouPornChannelIE(YouPornListBase):
+class YouPornChannelIE(YouPornListBaseIE):
     IE_DESC = 'YouPorn channel, with sorting and pagination'
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?youporn\.com/
@@ -349,7 +349,7 @@ def _get_title_from_slug(title_slug):
         return re.sub(r'_', ' ', title_slug).title()
 
 
-class YouPornCollectionIE(YouPornListBase):
+class YouPornCollectionIE(YouPornListBaseIE):
     IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination'
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?youporn\.com/
@@ -394,7 +394,7 @@ def _real_extract(self, url):
         return playlist
 
 
-class YouPornTagIE(YouPornListBase):
+class YouPornTagIE(YouPornListBaseIE):
     IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination'
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?youporn\.com/
@@ -442,7 +442,7 @@ def _real_extract(self, url):
         return super()._real_extract(url)
 
 
-class YouPornStarIE(YouPornListBase):
+class YouPornStarIE(YouPornListBaseIE):
     IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination'
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?youporn\.com/
@@ -493,7 +493,7 @@ def _real_extract(self, url):
         }
 
 
-class YouPornVideosIE(YouPornListBase):
+class YouPornVideosIE(YouPornListBaseIE):
     IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination'
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?youporn\.com/
diff --git a/yt_dlp/extractor/youtube/__init__.py b/yt_dlp/extractor/youtube/__init__.py
new file mode 100644
index 000000000..892d860b0
--- /dev/null
+++ b/yt_dlp/extractor/youtube/__init__.py
@@ -0,0 +1,50 @@
+# flake8: noqa: F401
+from ._base import YoutubeBaseInfoExtractor
+from ._clip import YoutubeClipIE
+from ._mistakes import YoutubeTruncatedIDIE, YoutubeTruncatedURLIE
+from ._notifications import YoutubeNotificationsIE
+from ._redirect import (
+    YoutubeConsentRedirectIE,
+    YoutubeFavouritesIE,
+    YoutubeFeedsInfoExtractor,
+    YoutubeHistoryIE,
+    YoutubeLivestreamEmbedIE,
+    YoutubeRecommendedIE,
+    YoutubeShortsAudioPivotIE,
+    YoutubeSubscriptionsIE,
+    YoutubeWatchLaterIE,
+    YoutubeYtBeIE,
+    YoutubeYtUserIE,
+)
+from ._search import YoutubeMusicSearchURLIE, YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE
+from ._tab import YoutubePlaylistIE, YoutubeTabBaseInfoExtractor, YoutubeTabIE
+from ._video import YoutubeIE
+
+# Hack to allow plugin overrides to work
+for _cls in [
+    YoutubeBaseInfoExtractor,
+    YoutubeClipIE,
+    YoutubeTruncatedIDIE,
+    YoutubeTruncatedURLIE,
+    YoutubeNotificationsIE,
+    YoutubeConsentRedirectIE,
+    YoutubeFavouritesIE,
+    YoutubeFeedsInfoExtractor,
+    YoutubeHistoryIE,
+    YoutubeLivestreamEmbedIE,
+    YoutubeRecommendedIE,
+    YoutubeShortsAudioPivotIE,
+    YoutubeSubscriptionsIE,
+    YoutubeWatchLaterIE,
+    YoutubeYtBeIE,
+    YoutubeYtUserIE,
+    YoutubeMusicSearchURLIE,
+    YoutubeSearchDateIE,
+    YoutubeSearchIE,
+    YoutubeSearchURLIE,
+    YoutubePlaylistIE,
+    YoutubeTabBaseInfoExtractor,
+    YoutubeTabIE,
+    YoutubeIE,
+]:
+    _cls.__module__ = 'yt_dlp.extractor.youtube'
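An observation on the `__module__` loop above (an illustration, not part of the patch): a class re-exported from a subpackage still reports the submodule that defined it, which is presumably what yt-dlp's plugin-override matching keys on, so the loop rewrites it:

    # Standalone illustration; Demo stands in for an extractor class
    class Demo:
        pass

    assert Demo.__module__ == '__main__'  # when run as a script: the defining module
    Demo.__module__ = 'yt_dlp.extractor.youtube'  # the same rewrite the loop applies
    assert Demo.__module__ == 'yt_dlp.extractor.youtube'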
diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py
new file mode 100644
index 000000000..4194e1c21
--- /dev/null
+++ b/yt_dlp/extractor/youtube/_base.py
@@ -0,0 +1,1102 @@
+import calendar
+import copy
+import datetime as dt
+import enum
+import functools
+import hashlib
+import json
+import re
+import time
+import urllib.parse
+
+from ..common import InfoExtractor
+from ...networking.exceptions import HTTPError, network_exceptions
+from ...utils import (
+    ExtractorError,
+    bug_reports_message,
+    datetime_from_str,
+    filter_dict,
+    get_first,
+    int_or_none,
+    is_html,
+    join_nonempty,
+    parse_count,
+    qualities,
+    str_to_int,
+    traverse_obj,
+    try_call,
+    try_get,
+    unified_timestamp,
+    url_or_none,
+    variadic,
+)
+
+
+class _PoTokenContext(enum.Enum):
+    PLAYER = 'player'
+    GVS = 'gvs'
+
+
+# any clients starting with _ cannot be explicitly requested by the user
+INNERTUBE_CLIENTS = {
+    'web': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB',
+                'clientVersion': '2.20250312.04.00',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+        'SUPPORTS_COOKIES': True,
+    },
+    # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats
+    'web_safari': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB',
+                'clientVersion': '2.20250312.04.00',
+                'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+        'SUPPORTS_COOKIES': True,
+    },
+    'web_embedded': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB_EMBEDDED_PLAYER',
+                'clientVersion': '1.20250310.01.00',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 56,
+        'SUPPORTS_COOKIES': True,
+    },
+    'web_music': {
+        'INNERTUBE_HOST': 'music.youtube.com',
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB_REMIX',
+                'clientVersion': '1.20250310.01.00',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+        'SUPPORTS_COOKIES': True,
+    },
+    # This client now requires sign-in for every video
+    'web_creator': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'WEB_CREATOR',
+                'clientVersion': '1.20250312.03.01',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+        'REQUIRE_AUTH': True,
+        'SUPPORTS_COOKIES': True,
+    },
+    'android': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID',
+                'clientVersion': '20.10.38',
+                'androidSdkVersion': 30,
+                'userAgent': 'com.google.android.youtube/20.10.38 (Linux; U; Android 11) gzip',
+                'osName': 'Android',
+                'osVersion': '11',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+        'REQUIRE_JS_PLAYER': False,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+    },
+    # YouTube Kids videos aren't returned on this client for some reason
+    'android_vr': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID_VR',
+                'clientVersion': '1.62.27',
+                'deviceMake': 'Oculus',
+                'deviceModel': 'Quest 3',
+                'androidSdkVersion': 32,
+                'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.62.27 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip',
+                'osName': 'Android',
+                'osVersion': '12L',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 28,
+        'REQUIRE_JS_PLAYER': False,
+    },
+    # iOS clients have HLS live streams. Setting device model to get 60fps formats.
+    # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558
+    'ios': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'IOS',
+                'clientVersion': '20.10.4',
+                'deviceMake': 'Apple',
+                'deviceModel': 'iPhone16,2',
+                'userAgent': 'com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X;)',
+                'osName': 'iPhone',
+                'osVersion': '18.3.2.22D82',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+        'REQUIRE_JS_PLAYER': False,
+    },
+    # mweb has 'ultralow' formats
+    # See: https://github.com/yt-dlp/yt-dlp/pull/557
+    'mweb': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'MWEB',
+                'clientVersion': '2.20250311.03.00',
+                # mweb previously did not require PO Token with this UA
+                'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 2,
+        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
+        'SUPPORTS_COOKIES': True,
+    },
+    'tv': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'TVHTML5',
+                'clientVersion': '7.20250312.16.00',
+                'userAgent': 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 7,
+        'SUPPORTS_COOKIES': True,
+    },
+    # This client now requires sign-in for every video
+    # It was previously an age-gate workaround for videos that were `playable_in_embed`
+    # It may still be useful if signed into an EU account that is not age-verified
+    'tv_embedded': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
+                'clientVersion': '2.0',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 85,
+        'REQUIRE_AUTH': True,
+        'SUPPORTS_COOKIES': True,
+    },
+}
+
+
+def _split_innertube_client(client_name):
+    variant, *base = client_name.rsplit('.', 1)
+    if base:
+        return variant, base[0], variant
+    base, *variant = client_name.split('_', 1)
+    return client_name, base, variant[0] if variant else None
+
+
+def short_client_name(client_name):
+    main, *parts = _split_innertube_client(client_name)[0].split('_')
+    return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper()
+
+
+def build_innertube_clients():
+    THIRD_PARTY = {
+        'embedUrl': 'https://www.youtube.com/',  # Can be any valid URL
+    }
+    BASE_CLIENTS = ('ios', 'web', 'tv', 'mweb', 'android')
+    priority = qualities(BASE_CLIENTS[::-1])
+
+    for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
+        ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
+        ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
+        ytcfg.setdefault('PO_TOKEN_REQUIRED_CONTEXTS', [])
+        ytcfg.setdefault('REQUIRE_AUTH', False)
+        ytcfg.setdefault('SUPPORTS_COOKIES', False)
+        ytcfg.setdefault('PLAYER_PARAMS', None)
+        ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
+
+        _, base_client, variant = _split_innertube_client(client)
+        ytcfg['priority'] = 10 * priority(base_client)
+
+        if variant == 'embedded':
+            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
+            ytcfg['priority'] -= 2
+        elif variant:
+            ytcfg['priority'] -= 3
+
+
+build_innertube_clients()
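A worked example of the client-name splitting and priority logic above (values follow directly from the definitions; not part of the patch): 'web_embedded' splits into base 'web' with variant 'embedded', and with BASE_CLIENTS reversed, `qualities()` ranks 'web' as 3, so its clients start at priority 30 before the variant penalty of -2 is applied.

    assert _split_innertube_client('web_embedded') == ('web_embedded', 'web', 'embedded')
    assert _split_innertube_client('ios') == ('ios', 'ios', None)
    # join_nonempty() joins with '-' by default
    assert short_client_name('web_embedded') == 'WEB-E'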
+
+
+class BadgeType(enum.Enum):
+    AVAILABILITY_UNLISTED = enum.auto()
+    AVAILABILITY_PRIVATE = enum.auto()
+    AVAILABILITY_PUBLIC = enum.auto()
+    AVAILABILITY_PREMIUM = enum.auto()
+    AVAILABILITY_SUBSCRIPTION = enum.auto()
+    LIVE_NOW = enum.auto()
+    VERIFIED = enum.auto()
+
+
+CONFIGURATION_ARG_KEY = 'youtube'
+
+
+class YoutubeBaseInfoExtractor(InfoExtractor):
+    """Provide base functions for Youtube extractors"""
+
+    _RESERVED_NAMES = (
+        r'channel|c|user|playlist|watch|w|v|embed|e|live|watch_popup|clip|'
+        r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|'
+        r'browse|oembed|get_video_info|iframe_api|s/player|source|'
+        r'storefront|oops|index|account|t/terms|about|upload|signin|logout')
+
+    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
+
+    # _NETRC_MACHINE = 'youtube'
+
+    # If True it will raise an error if no login info is provided
+    _LOGIN_REQUIRED = False
+
+    _INVIDIOUS_SITES = (
+        # invidious-redirect websites
+        r'(?:www\.)?redirect\.invidious\.io',
+        r'(?:(?:www|dev)\.)?invidio\.us',
+        # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/docs/instances.md
+        r'(?:www\.)?invidious\.pussthecat\.org',
+        r'(?:www\.)?invidious\.zee\.li',
+        r'(?:www\.)?invidious\.ethibox\.fr',
+        r'(?:www\.)?iv\.ggtyler\.dev',
+        r'(?:www\.)?inv\.vern\.i2p',
+        r'(?:www\.)?am74vkcrjp2d5v36lcdqgsj2m6x36tbrkhsruoegwfcizzabnfgf5zyd\.onion',
+        r'(?:www\.)?inv\.riverside\.rocks',
+        r'(?:www\.)?invidious\.silur\.me',
+        r'(?:www\.)?inv\.bp\.projectsegfau\.lt',
+        r'(?:www\.)?invidious\.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid\.onion',
+        r'(?:www\.)?invidious\.slipfox\.xyz',
+        r'(?:www\.)?invidious\.esmail5pdn24shtvieloeedh7ehz3nrwcdivnfhfcedl7gf4kwddhkqd\.onion',
+        r'(?:www\.)?inv\.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad\.onion',
+        r'(?:www\.)?invidious\.tiekoetter\.com',
+        r'(?:www\.)?iv\.odysfvr23q5wgt7i456o5t3trw2cw5dgn56vbjfbq2m7xsc5vqbqpcyd\.onion',
+        r'(?:www\.)?invidious\.nerdvpn\.de',
+        r'(?:www\.)?invidious\.weblibre\.org',
+        r'(?:www\.)?inv\.odyssey346\.dev',
+        r'(?:www\.)?invidious\.dhusch\.de',
+        r'(?:www\.)?iv\.melmac\.space',
+        r'(?:www\.)?watch\.thekitty\.zone',
+        r'(?:www\.)?invidious\.privacydev\.net',
+        r'(?:www\.)?ng27owmagn5amdm7l5s3rsqxwscl5ynppnis5dqcasogkyxcfqn7psid\.onion',
+        r'(?:www\.)?invidious\.drivet\.xyz',
+        r'(?:www\.)?vid\.priv\.au',
+        r'(?:www\.)?euxxcnhsynwmfidvhjf6uzptsmh4dipkmgdmcmxxuo7tunp3ad2jrwyd\.onion',
+        r'(?:www\.)?inv\.vern\.cc',
+        r'(?:www\.)?invidious\.esmailelbob\.xyz',
+        r'(?:www\.)?invidious\.sethforprivacy\.com',
+        r'(?:www\.)?yt\.oelrichsgarcia\.de',
+        r'(?:www\.)?yt\.artemislena\.eu',
+        r'(?:www\.)?invidious\.flokinet\.to',
+        r'(?:www\.)?invidious\.baczek\.me',
+        r'(?:www\.)?y\.com\.sb',
+        r'(?:www\.)?invidious\.epicsite\.xyz',
+        r'(?:www\.)?invidious\.lidarshield\.cloud',
+        r'(?:www\.)?yt\.funami\.tech',
+        r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
+        r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion',
+        r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion',
+        # youtube-dl invidious instances list
+        r'(?:(?:www|no)\.)?invidiou\.sh',
+        r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
+        r'(?:www\.)?invidious\.kabi\.tk',
+        r'(?:www\.)?invidious\.mastodon\.host',
+        r'(?:www\.)?invidious\.zapashcanon\.fr',
+        r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
+        r'(?:www\.)?invidious\.tinfoil-hat\.net',
+        r'(?:www\.)?invidious\.himiko\.cloud',
+        r'(?:www\.)?invidious\.reallyancient\.tech',
+        r'(?:www\.)?invidious\.tube',
+        r'(?:www\.)?invidiou\.site',
+        r'(?:www\.)?invidious\.site',
+        r'(?:www\.)?invidious\.xyz',
+        r'(?:www\.)?invidious\.nixnet\.xyz',
+        r'(?:www\.)?invidious\.048596\.xyz',
+        r'(?:www\.)?invidious\.drycat\.fr',
+        r'(?:www\.)?inv\.skyn3t\.in',
+        r'(?:www\.)?tube\.poal\.co',
+        r'(?:www\.)?tube\.connect\.cafe',
+        r'(?:www\.)?vid\.wxzm\.sx',
+        r'(?:www\.)?vid\.mint\.lgbt',
+        r'(?:www\.)?vid\.puffyan\.us',
+        r'(?:www\.)?yewtu\.be',
+        r'(?:www\.)?yt\.elukerio\.org',
+        r'(?:www\.)?yt\.lelux\.fi',
+        r'(?:www\.)?invidious\.ggc-project\.de',
+        r'(?:www\.)?yt\.maisputain\.ovh',
+        r'(?:www\.)?ytprivate\.com',
+        r'(?:www\.)?invidious\.13ad\.de',
+        r'(?:www\.)?invidious\.toot\.koeln',
+        r'(?:www\.)?invidious\.fdn\.fr',
+        r'(?:www\.)?watch\.nettohikari\.com',
+        r'(?:www\.)?invidious\.namazso\.eu',
+        r'(?:www\.)?invidious\.silkky\.cloud',
+        r'(?:www\.)?invidious\.exonip\.de',
+        r'(?:www\.)?invidious\.riverside\.rocks',
+        r'(?:www\.)?invidious\.blamefran\.net',
+        r'(?:www\.)?invidious\.moomoo\.de',
+        r'(?:www\.)?ytb\.trom\.tf',
+        r'(?:www\.)?yt\.cyberhost\.uk',
+        r'(?:www\.)?kgg2m7yk5aybusll\.onion',
+        r'(?:www\.)?qklhadlycap4cnod\.onion',
+        r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
+        r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
+        r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
+        r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
+        r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
+        r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
+        r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
+        r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
+        r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
+        r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
+        # piped instances from https://github.com/TeamPiped/Piped/wiki/Instances
+        r'(?:www\.)?piped\.kavin\.rocks',
+        r'(?:www\.)?piped\.tokhmi\.xyz',
+        r'(?:www\.)?piped\.syncpundit\.io',
+        r'(?:www\.)?piped\.mha\.fi',
+        r'(?:www\.)?watch\.whatever\.social',
+        r'(?:www\.)?piped\.garudalinux\.org',
+        r'(?:www\.)?piped\.rivo\.lol',
+        r'(?:www\.)?piped-libre\.kavin\.rocks',
+        r'(?:www\.)?yt\.jae\.fi',
+        r'(?:www\.)?piped\.mint\.lgbt',
+        r'(?:www\.)?il\.ax',
+        r'(?:www\.)?piped\.esmailelbob\.xyz',
+        r'(?:www\.)?piped\.projectsegfau\.lt',
+        r'(?:www\.)?piped\.privacydev\.net',
+        r'(?:www\.)?piped\.palveluntarjoaja\.eu',
+        r'(?:www\.)?piped\.smnz\.de',
+        r'(?:www\.)?piped\.adminforge\.de',
+        r'(?:www\.)?watch\.whatevertinfoil\.de',
+        r'(?:www\.)?piped\.qdi\.fi',
+        r'(?:(?:www|cf)\.)?piped\.video',
+        r'(?:www\.)?piped\.aeong\.one',
+        r'(?:www\.)?piped\.moomoo\.me',
+        r'(?:www\.)?piped\.chauvet\.pro',
+        r'(?:www\.)?watch\.leptons\.xyz',
+        r'(?:www\.)?pd\.vern\.cc',
+        r'(?:www\.)?piped\.hostux\.net',
+        r'(?:www\.)?piped\.lunar\.icu',
+        # Hyperpipe instances from https://hyperpipe.codeberg.page/
+        r'(?:www\.)?hyperpipe\.surge\.sh',
+        r'(?:www\.)?hyperpipe\.esmailelbob\.xyz',
+        r'(?:www\.)?listen\.whatever\.social',
+        r'(?:www\.)?music\.adminforge\.de',
+    )
+
+    # extracted from account/account_menu ep
+    # XXX: These are the supported YouTube UI and API languages,
+    # which is slightly different from languages supported for translation in YouTube studio
+    _SUPPORTED_LANG_CODES = [
+        'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es',
+        'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv',
+        'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi',
+        'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw',
+        'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml',
+        'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko',
+    ]
+
+    _IGNORED_WARNINGS = {
+        'Unavailable videos will be hidden during playback',
+        'Unavailable videos are hidden',
+    }
+
+    _YT_HANDLE_RE = r'@[\w.-]{3,30}'  # https://support.google.com/youtube/answer/11585688?hl=en
+    _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}'
+
+    _NETRC_MACHINE = 'youtube'
+
+    _COOKIE_HOWTO_WIKI_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies'
+
+    def ucid_or_none(self, ucid):
+        return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None)
+
+    def handle_or_none(self, handle):
+        return self._search_regex(rf'^({self._YT_HANDLE_RE})$', urllib.parse.unquote(handle or ''),
+                                  '@-handle', default=None)
+
+    def handle_from_url(self, url):
+        return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})',
+                                  urllib.parse.unquote(url or ''), 'channel handle', default=None)
+
+    def ucid_from_url(self, url):
+        return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})',
+                                  url, 'channel id', default=None)
+
+    @functools.cached_property
+    def _preferred_lang(self):
+        """
+        Returns a language code supported by YouTube for the user preferred language.
+        Returns None if no preferred language set.
+        """
+        preferred_lang = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])[0]
+        if not preferred_lang:
+            return
+        if preferred_lang not in self._SUPPORTED_LANG_CODES:
+            raise ExtractorError(
+                f'Unsupported language code: {preferred_lang}. Supported language codes (case-sensitive): {join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")}.',
+                expected=True)
+        elif preferred_lang != 'en':
+            self.report_warning(
+                f'Preferring "{preferred_lang}" translated fields. Note that some metadata extraction may fail or be incorrect.')
+        return preferred_lang
+
+    def _initialize_consent(self):
+        if self._has_auth_cookies:
+            return
+        socs = self._youtube_cookies.get('SOCS')
+        if socs and not socs.value.startswith('CAA'):  # not consented
+            return
+        self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True)  # accept all (required for mixes)
+
+    def _initialize_pref(self):
+        pref_cookie = self._youtube_cookies.get('PREF')
+        pref = {}
+        if pref_cookie:
+            try:
+                pref = dict(urllib.parse.parse_qsl(pref_cookie.value))
+            except ValueError:
+                self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
+        pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'})
+        self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref))
+
+    def _initialize_cookie_auth(self):
+        self._passed_auth_cookies = False
+        if self._has_auth_cookies:
+            self._passed_auth_cookies = True
+            self.write_debug('Found YouTube account cookies')
+
+    def _real_initialize(self):
+        self._initialize_pref()
+        self._initialize_consent()
+        self._initialize_cookie_auth()
+        self._check_login_required()
+
+    def _perform_login(self, username, password):
+        if username.startswith('oauth'):
+            raise ExtractorError(
+                f'Login with OAuth is no longer supported. {self._youtube_login_hint}', expected=True)
+
+        self.report_warning(
+            f'Login with password is not supported for YouTube. 
{self._youtube_login_hint}') + + @property + def _youtube_login_hint(self): + return (f'{self._login_hint(method="cookies")}. Also see {self._COOKIE_HOWTO_WIKI_URL} ' + 'for tips on effectively exporting YouTube cookies') + + def _check_login_required(self): + if self._LOGIN_REQUIRED and not self.is_authenticated: + self.raise_login_required( + f'Login details are needed to download this content. {self._youtube_login_hint}', method=None) + + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=' + + def _get_default_ytcfg(self, client='web'): + return copy.deepcopy(INNERTUBE_CLIENTS[client]) + + def _get_innertube_host(self, client='web'): + return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST'] + + def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'): + # try_get but with fallback to default ytcfg client values when present + _func = lambda y: try_get(y, getter, expected_type) + return _func(ytcfg) or _func(self._get_default_ytcfg(default_client)) + + def _extract_client_name(self, ytcfg, default_client='web'): + return self._ytcfg_get_safe( + ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], + lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), str, default_client) + + def _extract_client_version(self, ytcfg, default_client='web'): + return self._ytcfg_get_safe( + ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], + lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), str, default_client) + + def _select_api_hostname(self, req_api_hostname, default_client=None): + return (self._configuration_arg('innertube_host', [''], ie_key=CONFIGURATION_ARG_KEY)[0] + or req_api_hostname or self._get_innertube_host(default_client or 'web')) + + def _extract_context(self, ytcfg=None, default_client='web'): + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language and tz for extraction + client_context = traverse_obj(context, 'client', expected_type=dict, default={}) + client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) + return context + + @staticmethod + def _make_sid_authorization(scheme, sid, origin, additional_parts): + timestamp = str(round(time.time())) + + hash_parts = [] + if additional_parts: + hash_parts.append(':'.join(additional_parts.values())) + hash_parts.extend([timestamp, sid, origin]) + sidhash = hashlib.sha1(' '.join(hash_parts).encode()).hexdigest() + + parts = [timestamp, sidhash] + if additional_parts: + parts.append(''.join(additional_parts)) + + return f'{scheme} {"_".join(parts)}' + + @property + def _youtube_cookies(self): + return self._get_cookies('https://www.youtube.com') + + def _get_sid_cookies(self): + """ + Get SAPISID, 1PSAPISID, 3PSAPISID cookie values + @returns sapisid, 1psapisid, 3psapisid + """ + yt_cookies = self._youtube_cookies + yt_sapisid = try_call(lambda: yt_cookies['SAPISID'].value) + yt_3papisid = try_call(lambda: yt_cookies['__Secure-3PAPISID'].value) + yt_1papisid = try_call(lambda: yt_cookies['__Secure-1PAPISID'].value) + + # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is. + # YouTube also falls back to __Secure-3PAPISID if SAPISID is missing. 
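+        # (hence the first element returned below may actually hold the __Secure-3PAPISID value)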
+ # See: https://github.com/yt-dlp/yt-dlp/issues/393 + + return yt_sapisid or yt_3papisid, yt_1papisid, yt_3papisid + + def _get_sid_authorization_header(self, origin='https://www.youtube.com', user_session_id=None): + """ + Generate API Session ID Authorization for Innertube requests. Assumes all requests are secure (https). + @param origin: Origin URL + @param user_session_id: Optional User Session ID + @return: Authorization header value + """ + + authorizations = [] + additional_parts = {} + if user_session_id: + additional_parts['u'] = user_session_id + + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + + for scheme, sid in (('SAPISIDHASH', yt_sapisid), + ('SAPISID1PHASH', yt_1psapisid), + ('SAPISID3PHASH', yt_3psapisid)): + if sid: + authorizations.append(self._make_sid_authorization(scheme, sid, origin, additional_parts)) + + if not authorizations: + return None + + return ' '.join(authorizations) + + @property + def is_authenticated(self): + return self._has_auth_cookies + + @property + def _has_auth_cookies(self): + yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies() + # YouTube doesn't appear to clear 3PSAPISID when rotating cookies (as of 2025-04-26) + # But LOGIN_INFO is cleared and should exist if logged in + has_login_info = 'LOGIN_INFO' in self._youtube_cookies + return bool(has_login_info and (yt_sapisid or yt_1psapisid or yt_3psapisid)) + + def _request_webpage(self, *args, **kwargs): + response = super()._request_webpage(*args, **kwargs) + + # Check that we are still logged-in and cookies have not rotated after every request + if getattr(self, '_passed_auth_cookies', None) and not self._has_auth_cookies: + self.report_warning( + 'The provided YouTube account cookies are no longer valid. ' + 'They have likely been rotated in the browser as a security measure. ' + f'For tips on how to effectively export YouTube cookies, refer to {self._COOKIE_HOWTO_WIKI_URL} .', + only_once=False) + + return response + + def _call_api(self, ep, query, video_id, fatal=True, headers=None, + note='Downloading API JSON', errnote='Unable to download API page', + context=None, api_key=None, api_hostname=None, default_client='web'): + + data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)} + data.update(query) + real_headers = self.generate_api_headers(default_client=default_client) + real_headers.update({'content-type': 'application/json'}) + if headers: + real_headers.update(headers) + return self._download_json( + f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}', + video_id=video_id, fatal=fatal, note=note, errnote=errnote, + data=json.dumps(data).encode('utf8'), headers=real_headers, + query=filter_dict({ + 'key': self._configuration_arg( + 'innertube_key', [api_key], ie_key=CONFIGURATION_ARG_KEY, casesense=True)[0], + 'prettyPrint': 'false', + }, cndn=lambda _, v: v)) + + def extract_yt_initial_data(self, item_id, webpage, fatal=True): + return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal) + + @staticmethod + def _extract_session_index(*data): + """ + Index of current account in account list. 
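+        Appears to be used as the X-Goog-AuthUser header value when multiple accounts are signed in.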
+ See: https://github.com/yt-dlp/yt-dlp/pull/519 + """ + for ytcfg in data: + session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX'])) + if session_index is not None: + return session_index + + @staticmethod + def _parse_data_sync_id(data_sync_id): + """ + Parse data_sync_id into delegated_session_id and user_session_id. + + data_sync_id is of the form "delegated_session_id||user_session_id" for secondary channel + and just "user_session_id||" for primary channel. + + @param data_sync_id: data_sync_id string + @return: Tuple of (delegated_session_id, user_session_id) + """ + if not data_sync_id: + return None, None + first, _, second = data_sync_id.partition('||') + if second: + return first, second + return None, first + + def _extract_delegated_session_id(self, *args): + """ + Extract current delegated session ID required to download private playlists of secondary channels + @params response and/or ytcfg + @return: delegated session ID + """ + # ytcfg includes channel_syncid if on secondary channel + if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)): + return delegated_sid + + data_sync_id = self._extract_data_sync_id(*args) + return self._parse_data_sync_id(data_sync_id)[0] + + def _extract_user_session_id(self, *args): + """ + Extract current user session ID + @params response and/or ytcfg + @return: user session ID + """ + if user_sid := traverse_obj(args, (..., 'USER_SESSION_ID', {str}, any)): + return user_sid + + data_sync_id = self._extract_data_sync_id(*args) + return self._parse_data_sync_id(data_sync_id)[1] + + def _extract_data_sync_id(self, *args): + """ + Extract current account dataSyncId. + In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| + @params response and/or ytcfg + """ + if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=CONFIGURATION_ARG_KEY, casesense=True)[0]: + return data_sync_id + + return traverse_obj( + args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) + + def _extract_visitor_data(self, *args): + """ + Extracts visitorData from an API response or ytcfg + Appears to be used to track session state + """ + if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=CONFIGURATION_ARG_KEY, casesense=True)[0]: + return visitor_data + return get_first( + args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], + expected_type=str) + + def extract_ytcfg(self, video_id, webpage): + if not webpage: + return {} + return self._parse_json( + self._search_regex( + r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', + default='{}'), video_id, fatal=False) or {} + + def _generate_cookie_auth_headers(self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, origin=None, **kwargs): + headers = {} + delegated_session_id = delegated_session_id or self._extract_delegated_session_id(ytcfg) + if delegated_session_id: + headers['X-Goog-PageId'] = delegated_session_id + if session_index is None: + session_index = self._extract_session_index(ytcfg) + if delegated_session_id or session_index is not None: + headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 + + auth = self._get_sid_authorization_header(origin, user_session_id=user_session_id or self._extract_user_session_id(ytcfg)) + if auth is not None: + headers['Authorization'] = auth + headers['X-Origin'] = origin + + if traverse_obj(ytcfg, 
'LOGGED_IN', expected_type=bool): + headers['X-Youtube-Bootstrap-Logged-In'] = 'true' + + return headers + + def generate_api_headers( + self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, + visitor_data=None, api_hostname=None, default_client='web', **kwargs): + + origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) + headers = { + 'X-YouTube-Client-Name': str( + self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), + 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), + 'Origin': origin, + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), + 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), + **self._generate_cookie_auth_headers( + ytcfg=ytcfg, + delegated_session_id=delegated_session_id, + user_session_id=user_session_id, + session_index=session_index, + origin=origin), + } + return filter_dict(headers) + + def _download_webpage_with_retries(self, *args, retry_fatal=False, retry_on_status=None, **kwargs): + for retry in self.RetryManager(fatal=retry_fatal): + try: + return self._download_webpage(*args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, network_exceptions): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (retry_on_status or (403, 429)): + retry.error = e + continue + self._error_or_warning(e, fatal=retry_fatal) + break + + def _download_ytcfg(self, client, video_id): + url = { + 'web': 'https://www.youtube.com', + 'web_music': 'https://music.youtube.com', + 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', + 'tv': 'https://www.youtube.com/tv', + }.get(client) + if not url: + return {} + webpage = self._download_webpage_with_retries( + url, video_id, note=f'Downloading {client.replace("_", " ").strip()} client config', + headers=traverse_obj(self._get_default_ytcfg(client), { + 'User-Agent': ('INNERTUBE_CONTEXT', 'client', 'userAgent', {str}), + })) + return self.extract_ytcfg(video_id, webpage) or {} + + @staticmethod + def _build_api_continuation_query(continuation, ctp=None): + query = { + 'continuation': continuation, + } + # TODO: Inconsistency with clickTrackingParams. + # Currently we have a fixed ctp contained within context (from ytcfg) + # and a ctp in root query for continuation. 
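+        # Illustrative shape of the resulting query (token values here are hypothetical):
+        #   {'continuation': '4qmFsgK...', 'clickTracking': {'clickTrackingParams': 'CBQQ...'}}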
+ if ctp: + query['clickTracking'] = {'clickTrackingParams': ctp} + return query + + @classmethod + def _extract_next_continuation_data(cls, renderer): + next_continuation = try_get( + renderer, (lambda x: x['continuations'][0]['nextContinuationData'], + lambda x: x['continuation']['reloadContinuationData']), dict) + if not next_continuation: + return + continuation = next_continuation.get('continuation') + if not continuation: + return + ctp = next_continuation.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation_ep_data(cls, continuation_ep: dict): + continuation_commands = traverse_obj( + continuation_ep, ('commandExecutorCommand', 'commands', ..., {dict})) + continuation_commands.append(continuation_ep) + for command in continuation_commands: + continuation = traverse_obj(command, ('continuationCommand', 'token', {str})) + if not continuation: + continue + ctp = command.get('clickTrackingParams') + return cls._build_api_continuation_query(continuation, ctp) + + @classmethod + def _extract_continuation(cls, renderer): + next_continuation = cls._extract_next_continuation_data(renderer) + if next_continuation: + return next_continuation + + return traverse_obj(renderer, ( + ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), + ), get_all=False, expected_type=cls._extract_continuation_ep_data) + + @classmethod + def _extract_alerts(cls, data): + for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: + if not isinstance(alert_dict, dict): + continue + for alert in alert_dict.values(): + alert_type = alert.get('type') + if not alert_type: + continue + message = cls._get_text(alert, 'text') + if message: + yield alert_type, message + + def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): + errors, warnings = [], [] + for alert_type, alert_message in alerts: + if alert_type.lower() == 'error' and fatal: + errors.append([alert_type, alert_message]) + elif alert_message not in self._IGNORED_WARNINGS: + warnings.append([alert_type, alert_message]) + + for alert_type, alert_message in (warnings + errors[:-1]): + self.report_warning(f'YouTube said: {alert_type} - {alert_message}', only_once=only_once) + if errors: + raise ExtractorError(f'YouTube said: {errors[-1][1]}', expected=expected) + + def _extract_and_report_alerts(self, data, *args, **kwargs): + return self._report_alerts(self._extract_alerts(data), *args, **kwargs) + + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. 
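+        e.g. a renderer styled 'BADGE_STYLE_TYPE_MEMBERS_ONLY' maps to BadgeType.AVAILABILITY_SUBSCRIPTION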
+ @returns [{'type': BadgeType}] + """ + icon_type_map = { + 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, + 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, + } + + badge_style_map = { + 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, + } + + label_map = { + 'unlisted': BadgeType.AVAILABILITY_UNLISTED, + 'private': BadgeType.AVAILABILITY_PRIVATE, + 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'live': BadgeType.LIVE_NOW, + 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED, + } + + badges = [] + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): + badge_type = ( + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + or badge_style_map.get(traverse_obj(badge, 'style')) + ) + if badge_type: + badges.append({'type': badge_type}) + continue + + # fallback, won't work in some languages + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') + for match, label_badge_type in label_map.items(): + if match in label.lower(): + badges.append({'type': label_badge_type}) + break + + return badges + + @staticmethod + def _has_badge(badges, badge_type): + return bool(traverse_obj(badges, lambda _, v: v['type'] == badge_type)) + + @staticmethod + def _get_text(data, *path_list, max_runs=None): + for path in path_list or [None]: + if path is None: + obj = [data] + else: + obj = traverse_obj(data, path, default=[]) + if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)): + obj = [obj] + for item in obj: + text = try_get(item, lambda x: x['simpleText'], str) + if text: + return text + runs = try_get(item, lambda x: x['runs'], list) or [] + if not runs and isinstance(item, list): + runs = item + + runs = runs[:min(len(runs), max_runs or len(runs))] + text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str)) + if text: + return text + + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + + @staticmethod + def _extract_thumbnails(data, *path_list, final_key='thumbnails'): + """ + Extract thumbnails from thumbnails dict + @param path_list: path list to level that contains 'thumbnails' key + """ + thumbnails = [] + for path in path_list or [()]: + for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)): + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + # Sometimes youtube gives a wrong thumbnail URL. 
See: + # https://github.com/yt-dlp/yt-dlp/issues/233 + # https://github.com/ytdl-org/youtube-dl/issues/28023 + if 'maxresdefault' in thumbnail_url: + thumbnail_url = thumbnail_url.split('?')[0] + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) + return thumbnails + + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' + """ + + # XXX: this could be moved to a general function in utils/_utils.py + # The relative time text strings are roughly the same as what + # Javascript's Intl.RelativeTimeFormat function generates. + # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat + mobj = re.search( + r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago', + relative_time_text) + if mobj: + start = mobj.group('start') + if start: + return datetime_from_str(start) + try: + return datetime_from_str('now-{}{}'.format(mobj.group('time'), mobj.group('unit'))) + except ValueError: + return None + + def _parse_time_text(self, text): + if not text: + return + dt_ = self.extract_relative_time(text) + timestamp = None + if isinstance(dt_, dt.datetime): + timestamp = calendar.timegm(dt_.timetuple()) + + if timestamp is None: + timestamp = ( + unified_timestamp(text) or unified_timestamp( + self._search_regex( + (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), + text.lower(), 'time text', default=None))) + + if text and timestamp is None and self._preferred_lang in (None, 'en'): + self.report_warning( + f'Cannot parse localized time text "{text}"', only_once=True) + return timestamp + + def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, + ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, + default_client='web'): + raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=CONFIGURATION_ARG_KEY)) + # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal. 
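+        # (illustrative opt-in: --extractor-args "youtube:raise_incomplete_data=1" turns this into a hard error)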
+ icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete)) + icd_rm = next(icd_retries) + main_retries = iter(self.RetryManager()) + main_rm = next(main_retries) + # Manual retry loop for multiple RetryManagers + # The proper RetryManager MUST be advanced after an error + # and its result MUST be checked if the manager is non fatal + while True: + try: + response = self._call_api( + ep=ep, fatal=True, headers=headers, + video_id=item_id, query=query, note=note, + context=self._extract_context(ytcfg, default_client), + api_hostname=api_hostname, default_client=default_client) + except ExtractorError as e: + if not isinstance(e.cause, network_exceptions): + return self._error_or_warning(e, fatal=fatal) + elif not isinstance(e.cause, HTTPError): + main_rm.error = e + next(main_retries) + continue + + first_bytes = e.cause.response.read(512) + if not is_html(first_bytes): + yt_error = try_get( + self._parse_json( + self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + lambda x: x['error']['message'], str) + if yt_error: + self._report_alerts([('ERROR', yt_error)], fatal=False) + # Downloading page may result in intermittent 5xx HTTP error + # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # We also want to catch all other network exceptions since errors in later pages can be troublesome + # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 + if e.cause.status not in (403, 429): + main_rm.error = e + next(main_retries) + continue + return self._error_or_warning(e, fatal=fatal) + + try: + self._extract_and_report_alerts(response, only_once=True) + except ExtractorError as e: + # YouTube's servers may return errors we want to retry on in a 200 OK response + # See: https://github.com/yt-dlp/yt-dlp/issues/839 + if 'unknown error' in e.msg.lower(): + main_rm.error = e + next(main_retries) + continue + return self._error_or_warning(e, fatal=fatal) + # Youtube sometimes sends incomplete data + # See: https://github.com/ytdl-org/youtube-dl/issues/28194 + if not traverse_obj(response, *variadic(check_get_keys)): + icd_rm.error = ExtractorError('Incomplete data received', expected=True) + should_retry = next(icd_retries, None) + if not should_retry: + return None + continue + + return response + + @staticmethod + def is_music_url(url): + return re.match(r'(https?://)?music\.youtube\.com/', url) is not None diff --git a/yt_dlp/extractor/youtube/_clip.py b/yt_dlp/extractor/youtube/_clip.py new file mode 100644 index 000000000..7d063700e --- /dev/null +++ b/yt_dlp/extractor/youtube/_clip.py @@ -0,0 +1,66 @@ +from ._tab import YoutubeTabBaseInfoExtractor +from ._video import YoutubeIE +from ...utils import ExtractorError, traverse_obj + + +class YoutubeClipIE(YoutubeTabBaseInfoExtractor): + IE_NAME = 'youtube:clip' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/(?P<id>[^/?#]+)' + _TESTS = [{ + # FIXME: Other metadata should be extracted from the clip, not from the base video + 'url': 'https://www.youtube.com/clip/UgytZKpehg-hEMBSn3F4AaABCQ', + 'info_dict': { + 'id': 'UgytZKpehg-hEMBSn3F4AaABCQ', + 'ext': 'mp4', + 'section_start': 29.0, + 'section_end': 39.7, + 'duration': 10.7, + 'age_limit': 0, + 'availability': 'public', + 'categories': ['Gaming'], + 'channel': 'Scott The Woz', + 'channel_id': 'UC4rqhyiTs7XyuODcECvuiiQ', + 'channel_url': 'https://www.youtube.com/channel/UC4rqhyiTs7XyuODcECvuiiQ', + 'description': 'md5:7a4517a17ea9b4bd98996399d8bb36e7', + 
'like_count': int, + 'playable_in_embed': True, + 'tags': 'count:17', + 'thumbnail': 'https://i.ytimg.com/vi_webp/ScPX26pdQik/maxresdefault.webp', + 'title': 'Mobile Games on Console - Scott The Woz', + 'upload_date': '20210920', + 'uploader': 'Scott The Woz', + 'uploader_id': '@ScottTheWoz', + 'uploader_url': 'https://www.youtube.com/@ScottTheWoz', + 'view_count': int, + 'live_status': 'not_live', + 'channel_follower_count': int, + 'chapters': 'count:20', + 'comment_count': int, + 'heatmap': 'count:100', + }, + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + _, data = self._extract_webpage(url, clip_id) + + video_id = traverse_obj(data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId')) + if not video_id: + raise ExtractorError('Unable to find video ID') + + clip_data = traverse_obj(data, ( + 'engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'clipSectionRenderer', + 'contents', ..., 'clipAttributionRenderer', 'onScrubExit', 'commandExecutorCommand', 'commands', ..., + 'openPopupAction', 'popup', 'notificationActionRenderer', 'actionButton', 'buttonRenderer', 'command', + 'commandExecutorCommand', 'commands', ..., 'loopCommand'), get_all=False) + + return { + '_type': 'url_transparent', + 'url': f'https://www.youtube.com/watch?v={video_id}', + 'ie_key': YoutubeIE.ie_key(), + 'id': clip_id, + 'section_start': int(clip_data['startTimeMs']) / 1000, + 'section_end': int(clip_data['endTimeMs']) / 1000, + '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility + 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang'), + } diff --git a/yt_dlp/extractor/youtube/_mistakes.py b/yt_dlp/extractor/youtube/_mistakes.py new file mode 100644 index 000000000..c5eb5161c --- /dev/null +++ b/yt_dlp/extractor/youtube/_mistakes.py @@ -0,0 +1,69 @@ + +from ._base import YoutubeBaseInfoExtractor +from ...utils import ExtractorError + + +class YoutubeTruncatedURLIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:truncated_url' + IE_DESC = False # Do not list + _VALID_URL = r'''(?x) + (?:https?://)? + (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ + (?:watch\?(?: + feature=[a-z_]+| + annotation_id=annotation_[^&]+| + x-yt-cl=[0-9]+| + hl=[^&]*| + t=[0-9]+ + )? + | + attribution_link\?a=[^&]+ + ) + $ + ''' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?feature=foo', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?hl=en-GB', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?t=2372', + 'only_matching': True, + }] + + def _real_extract(self, url): + raise ExtractorError( + 'Did you forget to quote the URL? 
Remember that & is a meta ' + 'character in most shells, so you want to put the URL in quotes, ' + 'like yt-dlp ' + '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' + ' or simply yt-dlp BaW_jenozKc .', + expected=True) + + +class YoutubeTruncatedIDIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:truncated_id' + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + raise ExtractorError( + f'Incomplete YouTube ID {video_id}. URL {url} looks truncated.', + expected=True) diff --git a/yt_dlp/extractor/youtube/_notifications.py b/yt_dlp/extractor/youtube/_notifications.py new file mode 100644 index 000000000..ae55528da --- /dev/null +++ b/yt_dlp/extractor/youtube/_notifications.py @@ -0,0 +1,98 @@ +import itertools +import re + +from ._tab import YoutubeTabBaseInfoExtractor, YoutubeTabIE +from ._video import YoutubeIE +from ...utils import traverse_obj + + +class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor): + IE_NAME = 'youtube:notif' + IE_DESC = 'YouTube notifications; ":ytnotif" keyword (requires cookies)' + _VALID_URL = r':ytnotif(?:ication)?s?' + _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': ':ytnotif', + 'only_matching': True, + }, { + 'url': ':ytnotifications', + 'only_matching': True, + }] + + def _extract_notification_menu(self, response, continuation_list): + notification_list = traverse_obj( + response, + ('actions', 0, 'openPopupAction', 'popup', 'multiPageMenuRenderer', 'sections', 0, 'multiPageMenuNotificationSectionRenderer', 'items'), + ('actions', 0, 'appendContinuationItemsAction', 'continuationItems'), + expected_type=list) or [] + continuation_list[0] = None + for item in notification_list: + entry = self._extract_notification_renderer(item.get('notificationRenderer')) + if entry: + yield entry + continuation = item.get('continuationItemRenderer') + if continuation: + continuation_list[0] = continuation + + def _extract_notification_renderer(self, notification): + video_id = traverse_obj( + notification, ('navigationEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) + url = f'https://www.youtube.com/watch?v={video_id}' + channel_id = None + if not video_id: + browse_ep = traverse_obj( + notification, ('navigationEndpoint', 'browseEndpoint'), expected_type=dict) + channel_id = self.ucid_or_none(traverse_obj(browse_ep, 'browseId', expected_type=str)) + post_id = self._search_regex( + r'/post/(.+)', traverse_obj(browse_ep, 'canonicalBaseUrl', expected_type=str), + 'post id', default=None) + if not channel_id or not post_id: + return + # The direct /post url redirects to this in the browser + url = f'https://www.youtube.com/channel/{channel_id}/community?lb={post_id}' + + channel = traverse_obj( + notification, ('contextualMenu', 'menuRenderer', 'items', 1, 'menuServiceItemRenderer', 'text', 'runs', 1, 'text'), + expected_type=str) + notification_title = self._get_text(notification, 'shortMessage') + if notification_title: + notification_title = notification_title.replace('\xad', '') # remove soft hyphens + # TODO: handle recommended videos + title = self._search_regex( + rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title, + 'video title', default=None) + timestamp = (self._parse_time_text(self._get_text(notification, 'sentTimeText')) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else 
None) + return { + '_type': 'url', + 'url': url, + 'ie_key': (YoutubeIE if video_id else YoutubeTabIE).ie_key(), + 'video_id': video_id, + 'title': title, + 'channel_id': channel_id, + 'channel': channel, + 'uploader': channel, + 'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'), + 'timestamp': timestamp, + } + + def _notification_menu_entries(self, ytcfg): + continuation_list = [None] + response = None + for page in itertools.count(1): + ctoken = traverse_obj( + continuation_list, (0, 'continuationEndpoint', 'getNotificationMenuEndpoint', 'ctoken'), expected_type=str) + response = self._extract_response( + item_id=f'page {page}', query={'ctoken': ctoken} if ctoken else {}, ytcfg=ytcfg, + ep='notification/get_notification_menu', check_get_keys='actions', + headers=self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))) + yield from self._extract_notification_menu(response, continuation_list) + if not continuation_list[0]: + break + + def _real_extract(self, url): + display_id = 'notifications' + ytcfg = self._download_ytcfg('web', display_id) if not self.skip_webpage else {} + self._report_playlist_authcheck(ytcfg) + return self.playlist_result(self._notification_menu_entries(ytcfg), display_id, display_id) diff --git a/yt_dlp/extractor/youtube/_redirect.py b/yt_dlp/extractor/youtube/_redirect.py new file mode 100644 index 000000000..1908df124 --- /dev/null +++ b/yt_dlp/extractor/youtube/_redirect.py @@ -0,0 +1,247 @@ +import base64 +import urllib.parse + +from ._base import YoutubeBaseInfoExtractor +from ._tab import YoutubeTabIE +from ...utils import ExtractorError, classproperty, parse_qs, update_url_query, url_or_none + + +class YoutubeYtBeIE(YoutubeBaseInfoExtractor): + IE_DESC = 'youtu.be' + _VALID_URL = rf'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{{11}})/*?.*?\blist=(?P<playlist_id>{YoutubeBaseInfoExtractor._PLAYLIST_ID_RE})' + _TESTS = [{ + 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', + 'info_dict': { + 'id': 'yeWKywCrFtk', + 'ext': 'mp4', + 'title': 'Small Scale Baler and Braiding Rugs', + 'uploader': 'Backus-Page House Museum', + 'uploader_id': '@backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@backuspagemuseum', + 'upload_date': '20161008', + 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', + 'categories': ['Nonprofits & Activism'], + 'tags': list, + 'like_count': int, + 'age_limit': 0, + 'playable_in_embed': True, + 'thumbnail': r're:^https?://.*\.webp', + 'channel': 'Backus-Page House Museum', + 'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw', + 'live_status': 'not_live', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw', + 'availability': 'public', + 'duration': 59, + 'comment_count': int, + 'channel_follower_count': int, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id = mobj.group('id') + playlist_id = mobj.group('playlist_id') + return self.url_result( + update_url_query('https://www.youtube.com/watch', { + 'v': video_id, + 'list': playlist_id, + 'feature': 'youtu.be', + }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + + +class YoutubeLivestreamEmbedIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube livestream embeds' + _VALID_URL = 
r'https?://(?:\w+\.)?youtube\.com/embed/live_stream/?\?(?:[^#]+&)?channel=(?P<id>[^&#]+)' + _TESTS = [{ + 'url': 'https://www.youtube.com/embed/live_stream?channel=UC2_KI6RB__jGdlnK6dvFEZA', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + return self.url_result( + f'https://www.youtube.com/channel/{channel_id}/live', + ie=YoutubeTabIE.ie_key(), video_id=channel_id) + + +class YoutubeYtUserIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube user videos; "ytuser:" prefix' + IE_NAME = 'youtube:user' + _VALID_URL = r'ytuser:(?P<id>.+)' + _TESTS = [{ + 'url': 'ytuser:phihag', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + return self.url_result(f'https://www.youtube.com/user/{user_id}', YoutubeTabIE, user_id) + + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:favorites' + IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)' + _VALID_URL = r':ytfav(?:ou?rite)?s?' + _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': ':ytfav', + 'only_matching': True, + }, { + 'url': ':ytfavorites', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=LL', + ie=YoutubeTabIE.ie_key()) + + +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): + """ + Base class for feed extractors + Subclasses must re-define the _FEED_NAME property. + """ + _LOGIN_REQUIRED = True + _FEED_NAME = 'feeds' + + @classproperty + def IE_NAME(cls): + return f'youtube:{cls._FEED_NAME}' + + def _real_extract(self, url): + return self.url_result( + f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key()) + + +class YoutubeWatchLaterIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:watchlater' + IE_DESC = 'Youtube watch later list; ":ytwatchlater" keyword (requires cookies)' + _VALID_URL = r':ytwatchlater' + _TESTS = [{ + 'url': ':ytwatchlater', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) + + +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube recommended videos; ":ytrec" keyword' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _LOGIN_REQUIRED = False + _TESTS = [{ + 'url': ':ytrec', + 'only_matching': True, + }, { + 'url': ':ytrecommended', + 'only_matching': True, + }, { + 'url': 'https://youtube.com', + 'only_matching': True, + }] + + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)' + _VALID_URL = r':ytsub(?:scription)?s?' + _FEED_NAME = 'subscriptions' + _TESTS = [{ + 'url': ':ytsubs', + 'only_matching': True, + }, { + 'url': ':ytsubscriptions', + 'only_matching': True, + }] + + +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch history; ":ythis" keyword (requires cookies)' + _VALID_URL = r':ythis(?:tory)?' 
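+    # YoutubeFeedsInfoExtractor._real_extract resolves this to https://www.youtube.com/feed/history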
+ _FEED_NAME = 'history' + _TESTS = [{ + 'url': ':ythistory', + 'only_matching': True, + }] + + +class YoutubeShortsAudioPivotIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' + IE_NAME = 'youtube:shorts:pivot:audio' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/source/(?P<id>[\w-]{11})/shorts' + _TESTS = [{ + 'url': 'https://www.youtube.com/source/Lyj-MZSAA9o/shorts', + 'only_matching': True, + }] + + @staticmethod + def _generate_audio_pivot_params(video_id): + """ + Generates sfv_audio_pivot browse params for this video id + """ + pb_params = b'\xf2\x05+\n)\x12\'\n\x0b%b\x12\x0b%b\x1a\x0b%b' % ((video_id.encode(),) * 3) + return urllib.parse.quote(base64.b64encode(pb_params).decode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + f'https://www.youtube.com/feed/sfv_audio_pivot?bp={self._generate_audio_pivot_params(video_id)}', + ie=YoutubeTabIE) + + +class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:consent' + IE_DESC = False # Do not list + _VALID_URL = r'https?://consent\.youtube\.com/m\?' + _TESTS = [{ + 'url': 'https://consent.youtube.com/m?continue=https%3A%2F%2Fwww.youtube.com%2Flive%2FqVv6vCqciTM%3Fcbrd%3D1&gl=NL&m=0&pc=yt&hl=en&src=1', + 'info_dict': { + 'id': 'qVv6vCqciTM', + 'ext': 'mp4', + 'age_limit': 0, + 'uploader_id': '@sana_natori', + 'comment_count': int, + 'chapters': 'count:13', + 'upload_date': '20221223', + 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg', + 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', + 'uploader_url': 'https://www.youtube.com/@sana_natori', + 'like_count': int, + 'release_date': '20221223', + 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'], + 'title': '【 #インターネット女クリスマス 】3Dで歌ってはしゃぐインターネットの女たち【月ノ美兎/名取さな】', + 'view_count': int, + 'playable_in_embed': True, + 'duration': 4438, + 'availability': 'public', + 'channel_follower_count': int, + 'channel_id': 'UCIdEIHpS0TdkqRkHL5OkLtA', + 'categories': ['Entertainment'], + 'live_status': 'was_live', + 'release_timestamp': 1671793345, + 'channel': 'さなちゃんねる', + 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', + 'uploader': 'さなちゃんねる', + 'channel_is_verified': True, + 'heatmap': 'count:100', + }, + 'add_ie': ['Youtube'], + 'params': {'skip_download': 'Youtube'}, + }] + + def _real_extract(self, url): + redirect_url = url_or_none(parse_qs(url).get('continue', [None])[-1]) + if not redirect_url: + raise ExtractorError('Invalid cookie consent redirect URL', expected=True) + return self.url_result(redirect_url) diff --git a/yt_dlp/extractor/youtube/_search.py b/yt_dlp/extractor/youtube/_search.py new file mode 100644 index 000000000..be10a20da --- /dev/null +++ b/yt_dlp/extractor/youtube/_search.py @@ -0,0 +1,167 @@ +import urllib.parse + +from ._tab import YoutubeTabBaseInfoExtractor +from ..common import SearchInfoExtractor +from ...utils import join_nonempty, parse_qs + + +class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): + IE_DESC = 'YouTube search' + IE_NAME = 'youtube:search' + _SEARCH_KEY = 'ytsearch' + _SEARCH_PARAMS = 'EgIQAfABAQ==' # Videos only + _TESTS = [{ + 'url': 'ytsearch5:youtube-dl test video', + 'playlist_count': 5, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + }, + }, { + 'note': 'Suicide/self-harm search warning', + 'url': 'ytsearch1:i hate myself and i wanna die', + 'playlist_count': 1, + 'info_dict': { + 'id': 'i hate myself 
and i wanna die', + 'title': 'i hate myself and i wanna die', + }, + }] + + +class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): + IE_NAME = YoutubeSearchIE.IE_NAME + ':date' + _SEARCH_KEY = 'ytsearchdate' + IE_DESC = 'YouTube search, newest videos first' + _SEARCH_PARAMS = 'CAISAhAB8AEB' # Videos only, sorted by date + _TESTS = [{ + 'url': 'ytsearchdate5:youtube-dl test video', + 'playlist_count': 5, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + }, + }] + + +class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube search URLs with sorting and filter support' + IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + }, + }, { + 'url': 'https://www.youtube.com/results?search_query=python&sp=EgIQAg%253D%253D', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'python', + 'title': 'python', + }, + }, { + 'url': 'https://www.youtube.com/results?search_query=%23cats', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '#cats', + 'title': '#cats', + # The test suite does not have support for nested playlists + # 'entries': [{ + # 'url': r're:https://(www\.)?youtube\.com/hashtag/cats', + # 'title': '#cats', + # }], + }, + }, { + # Channel results + 'url': 'https://www.youtube.com/results?search_query=kurzgesagt&sp=EgIQAg%253D%253D', + 'info_dict': { + 'id': 'kurzgesagt', + 'title': 'kurzgesagt', + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'id': 'UCsXVk37bltHxD1rDPwtNM8Q', + 'url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', + 'ie_key': 'YoutubeTab', + 'channel': 'Kurzgesagt – In a Nutshell', + 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', + 'title': 'Kurzgesagt – In a Nutshell', + 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', + # No longer available for search as it is set to the handle. + # 'playlist_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', + 'thumbnails': list, + 'uploader_id': '@kurzgesagt', + 'uploader_url': 'https://www.youtube.com/@kurzgesagt', + 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_is_verified': True, + 'channel_follower_count': int, + }, + }], + 'params': {'extract_flat': True, 'playlist_items': '1'}, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[0] + return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query) + + +class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube music search URLs with selectable sections, e.g. 
#songs' + IE_NAME = 'youtube:music:search_url' + _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' + _TESTS = [{ + 'url': 'https://music.youtube.com/search?q=royalty+free+music', + 'playlist_count': 16, + 'info_dict': { + 'id': 'royalty free music', + 'title': 'royalty free music', + }, + }, { + 'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D', + 'playlist_mincount': 30, + 'info_dict': { + 'id': 'royalty free music - songs', + 'title': 'royalty free music - songs', + }, + 'params': {'extract_flat': 'in_playlist'}, + }, { + 'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists', + 'playlist_mincount': 30, + 'info_dict': { + 'id': 'royalty free music - community playlists', + 'title': 'royalty free music - community playlists', + }, + 'params': {'extract_flat': 'in_playlist'}, + }] + + _SECTIONS = { + 'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==', + 'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==', + 'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF', + 'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==', + 'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==', + 'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==', + } + + def _real_extract(self, url): + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[0] + params = qs.get('sp', (None,))[0] + if params: + section = next((k for k, v in self._SECTIONS.items() if v == params), params) + else: + section = urllib.parse.unquote_plus(([*url.split('#'), ''])[1]).lower() + params = self._SECTIONS.get(section) + if not params: + section = None + title = join_nonempty(query, section, delim=' - ') + return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title) diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py new file mode 100644 index 000000000..c018ee8cf --- /dev/null +++ b/yt_dlp/extractor/youtube/_tab.py @@ -0,0 +1,2385 @@ +import functools +import itertools +import re +import shlex +import urllib.parse + +from ._base import BadgeType, YoutubeBaseInfoExtractor +from ._video import YoutubeIE +from ...networking.exceptions import HTTPError, network_exceptions +from ...utils import ( + NO_DEFAULT, + ExtractorError, + UserNotLive, + bug_reports_message, + format_field, + get_first, + int_or_none, + parse_count, + parse_duration, + parse_qs, + smuggle_url, + str_to_int, + strftime_or_none, + traverse_obj, + try_get, + unsmuggle_url, + update_url_query, + url_or_none, + urljoin, + variadic, +) + + +class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): + @staticmethod + def passthrough_smuggled_data(func): + def _smuggle(info, smuggled_data): + if info.get('_type') not in ('url', 'url_transparent'): + return info + if smuggled_data.get('is_music_url'): + parsed_url = urllib.parse.urlparse(info['url']) + if parsed_url.netloc in ('www.youtube.com', 'music.youtube.com'): + smuggled_data.pop('is_music_url') + info['url'] = urllib.parse.urlunparse(parsed_url._replace(netloc='music.youtube.com')) + if smuggled_data: + info['url'] = smuggle_url(info['url'], smuggled_data) + return info + + @functools.wraps(func) + def wrapper(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + if self.is_music_url(url): + smuggled_data['is_music_url'] = True + info_dict = func(self, url, smuggled_data) + if smuggled_data: + _smuggle(info_dict, smuggled_data) + if info_dict.get('entries'): + info_dict['entries'] = (_smuggle(i, smuggled_data.copy()) for i in 
info_dict['entries']) + return info_dict + return wrapper + + @staticmethod + def _extract_basic_item_renderer(item): + # Modified from _extract_grid_item_renderer + known_basic_renderers = ( + 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer', + ) + for key, renderer in item.items(): + if not isinstance(renderer, dict): + continue + elif key in known_basic_renderers: + return renderer + elif key.startswith('grid') and key.endswith('Renderer'): + return renderer + + def _extract_video(self, renderer): + video_id = renderer.get('videoId') + + reel_header_renderer = traverse_obj(renderer, ( + 'navigationEndpoint', 'reelWatchEndpoint', 'overlay', 'reelPlayerOverlayRenderer', + 'reelPlayerHeaderSupportedRenderers', 'reelPlayerHeaderRenderer')) + + title = self._get_text(renderer, 'title', 'headline') or self._get_text(reel_header_renderer, 'reelTitleText') + description = self._get_text(renderer, 'descriptionSnippet') + + duration = int_or_none(renderer.get('lengthSeconds')) + if duration is None: + duration = parse_duration(self._get_text( + renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) + if duration is None: + # XXX: should write a parser to be more general to support more cases (e.g. shorts in shorts tab) + duration = parse_duration(self._search_regex( + r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$', + traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), + video_id, default=None, group='duration')) + + channel_id = traverse_obj( + renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), + expected_type=str, get_all=False) + if not channel_id: + channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) + + channel_id = self.ucid_or_none(channel_id) + + overlay_style = traverse_obj( + renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), + get_all=False, expected_type=str) + badges = self._extract_badges(traverse_obj(renderer, 'badges')) + owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges')) + navigation_url = urljoin('https://www.youtube.com/', traverse_obj( + renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), + expected_type=str)) or '' + url = f'https://www.youtube.com/watch?v={video_id}' + if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: + url = f'https://www.youtube.com/shorts/{video_id}' + + time_text = (self._get_text(renderer, 'publishedTimeText', 'videoInfo') + or self._get_text(reel_header_renderer, 'timestampText') or '') + scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + + live_status = ( + 'is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW) + else None) + + # videoInfo is a string like '50K views • 10 years ago'. 
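+        # (illustrative) _get_count feeds it through parse_count, e.g. '50K views' -> 50000;
+        # 'No views' is special-cased to 0 below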
+ view_count_text = self._get_text(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo') or '' + view_count = (0 if 'no views' in view_count_text.lower() + else self._get_count({'simpleText': view_count_text})) + view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count' + + channel = (self._get_text(renderer, 'ownerText', 'shortBylineText') + or self._get_text(reel_header_renderer, 'channelTitleText')) + + channel_handle = traverse_obj(renderer, ( + 'shortBylineText', 'runs', ..., 'navigationEndpoint', + (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))), + expected_type=self.handle_from_url, get_all=False) + return { + '_type': 'url', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': url, + 'title': title, + 'description': description, + 'duration': duration, + 'channel_id': channel_id, + 'channel': channel, + 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'uploader': channel, + 'uploader_id': channel_handle, + 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), + 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), + 'timestamp': (self._parse_time_text(time_text) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else None), + 'release_timestamp': scheduled_timestamp, + 'availability': + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), + view_count_field: view_count, + 'live_status': live_status, + 'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None, + } + + def _extract_channel_renderer(self, renderer): + channel_id = self.ucid_or_none(renderer['channelId']) + title = self._get_text(renderer, 'title') + channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None) + channel_handle = self.handle_from_url( + traverse_obj(renderer, ( + 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'), + ('browseEndpoint', 'canonicalBaseUrl')), + {str}), get_all=False)) + if not channel_handle: + # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search + channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText')) + return { + '_type': 'url', + 'url': channel_url, + 'id': channel_id, + 'ie_key': YoutubeTabIE.ie_key(), + 'channel': title, + 'uploader': title, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'title': title, + 'uploader_id': channel_handle, + 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), + # See above. YouTube sets videoCountText to the subscriber text in search channel renderers. 
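# A small sketch of the disambiguation applied just below, assuming the field
# semantics described in these comments: in search results,
# 'subscriberCountText' carries the @handle and 'videoCountText' carries the
# subscriber total, while in feed/channels both fields mean what their names
# say. parse_count() is yt-dlp's real helper; _resolve_channel_counts() is a
# hypothetical illustration returning (subscribers, video_count).
from yt_dlp.utils import parse_count

def _resolve_channel_counts(subscriber_count_text, video_count_text):
    if parse_count(subscriber_count_text) is not None:
        # feed-style renderer: both fields are trustworthy
        return parse_count(subscriber_count_text), parse_count(video_count_text)
    # search-style renderer: the subscriber total hides in videoCountText,
    # and no reliable video count is available at all
    return parse_count(video_count_text), None

assert _resolve_channel_counts('1.2M subscribers', '342 videos') == (1_200_000, 342)
assert _resolve_channel_counts('@SomeHandle', '1.2M subscribers') == (1_200_000, None)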
+ # However, in feed/channels this is set correctly to the subscriber count + 'channel_follower_count': traverse_obj( + renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count), + 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), + 'playlist_count': ( + # videoCountText may be the subscriber count + self._get_count(renderer, 'videoCountText') + if self._get_count(renderer, 'subscriberCountText') is not None else None), + 'description': self._get_text(renderer, 'descriptionSnippet'), + 'channel_is_verified': True if self._has_badge( + self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None, + } + + def _grid_entries(self, grid_renderer): + for item in grid_renderer['items']: + if not isinstance(item, dict): + continue + if lockup_view_model := traverse_obj(item, ('lockupViewModel', {dict})): + if entry := self._extract_lockup_view_model(lockup_view_model): + yield entry + continue + renderer = self._extract_basic_item_renderer(item) + if not isinstance(renderer, dict): + continue + title = self._get_text(renderer, 'title') + + # playlist + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) + continue + # video + video_id = renderer.get('videoId') + if video_id: + yield self._extract_video(renderer) + continue + # channel + channel_id = renderer.get('channelId') + if channel_id: + yield self._extract_channel_renderer(renderer) + continue + # generic endpoint URL support + ep_url = urljoin('https://www.youtube.com/', try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + str)) + if ep_url: + + for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): + if ie.suitable(ep_url): + yield self.url_result( + ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) + break + + def _music_reponsive_list_entry(self, renderer): + video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) + if video_id: + title = traverse_obj(renderer, ( + 'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer', + 'text', 'runs', 0, 'text')) + return self.url_result(f'https://music.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id, title=title) + playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) + if playlist_id: + video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) + if video_id: + return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId')) + if browse_id: + return self.url_result(f'https://music.youtube.com/browse/{browse_id}', + ie=YoutubeTabIE.ie_key(), video_id=browse_id) + + def _shelf_entries_from_content(self, shelf_renderer): + content = shelf_renderer.get('content') + if not isinstance(content, dict): + return + renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer') + if renderer: + # TODO: add support for nested playlists so each shelf is processed + # as separate playlist + # TODO: this includes only first N items + yield from self._grid_entries(renderer) + renderer = 
content.get('horizontalListRenderer') + if renderer: + # TODO: handle case + pass + + def _shelf_entries(self, shelf_renderer, skip_channels=False): + ep = try_get( + shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], + str) + shelf_url = urljoin('https://www.youtube.com', ep) + if shelf_url: + # Skipping links to other channels; note that checking for + # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL + # will not work + if skip_channels and '/channels?' in shelf_url: + return + title = self._get_text(shelf_renderer, 'title') + yield self.url_result(shelf_url, video_title=title) + # Shelf may not contain shelf URL, fallback to extraction from content + yield from self._shelf_entries_from_content(shelf_renderer) + + def _playlist_entries(self, video_list_renderer): + for content in video_list_renderer['contents']: + if not isinstance(content, dict): + continue + renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') + if not isinstance(renderer, dict): + continue + video_id = renderer.get('videoId') + if not video_id: + continue + yield self._extract_video(renderer) + + def _extract_lockup_view_model(self, view_model): + content_id = view_model.get('contentId') + if not content_id: + return + content_type = view_model.get('contentType') + if content_type not in ('LOCKUP_CONTENT_TYPE_PLAYLIST', 'LOCKUP_CONTENT_TYPE_PODCAST'): + self.report_warning( + f'Unsupported lockup view model content type "{content_type}"{bug_reports_message()}', only_once=True) + return + return self.url_result( + f'https://www.youtube.com/playlist?list={content_id}', ie=YoutubeTabIE, video_id=content_id, + title=traverse_obj(view_model, ( + 'metadata', 'lockupMetadataViewModel', 'title', 'content', {str})), + thumbnails=self._extract_thumbnails(view_model, ( + 'contentImage', 'collectionThumbnailViewModel', 'primaryThumbnail', 'thumbnailViewModel', 'image'), final_key='sources')) + + def _rich_entries(self, rich_grid_renderer): + if lockup_view_model := traverse_obj(rich_grid_renderer, ('content', 'lockupViewModel', {dict})): + if entry := self._extract_lockup_view_model(lockup_view_model): + yield entry + return + renderer = traverse_obj( + rich_grid_renderer, + ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} + video_id = renderer.get('videoId') + if video_id: + yield self._extract_video(renderer) + return + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=self._get_text(renderer, 'title')) + return + # shortsLockupViewModel extraction + entity_id = renderer.get('entityId') + if entity_id: + video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) + if not video_id: + return + yield self.url_result( + f'https://www.youtube.com/shorts/{video_id}', + ie=YoutubeIE, video_id=video_id, + **traverse_obj(renderer, { + 'title': (( + ('overlayMetadata', 'primaryText', 'content', {str}), + ('accessibilityText', {lambda x: re.fullmatch(r'(.+), (?:[\d,.]+(?:[KM]| million)?|No) views?
- play Short', x)}, 1)), any), + 'view_count': ('overlayMetadata', 'secondaryText', 'content', {parse_count}), + }), + thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) + return + + def _video_entry(self, video_renderer): + video_id = video_renderer.get('videoId') + if video_id: + return self._extract_video(video_renderer) + + def _hashtag_tile_entry(self, hashtag_tile_renderer): + url = urljoin('https://youtube.com', traverse_obj( + hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url'))) + if url: + return self.url_result( + url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag')) + + def _post_thread_entries(self, post_thread_renderer): + post_renderer = try_get( + post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) + if not post_renderer: + return + # video attachment + video_renderer = try_get( + post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {} + video_id = video_renderer.get('videoId') + if video_id: + entry = self._extract_video(video_renderer) + if entry: + yield entry + # playlist attachment + playlist_id = try_get( + post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], str) + if playlist_id: + yield self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', + ie=YoutubeTabIE.ie_key(), video_id=playlist_id) + # inline video links + runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] + for run in runs: + if not isinstance(run, dict): + continue + ep_url = try_get( + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], str) + if not ep_url: + continue + if not YoutubeIE.suitable(ep_url): + continue + ep_video_id = YoutubeIE._match_id(ep_url) + if video_id == ep_video_id: + continue + yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id) + + def _post_thread_continuation_entries(self, post_thread_continuation): + contents = post_thread_continuation.get('contents') + if not isinstance(contents, list): + return + for content in contents: + renderer = content.get('backstagePostThreadRenderer') + if isinstance(renderer, dict): + yield from self._post_thread_entries(renderer) + continue + renderer = content.get('videoRenderer') + if isinstance(renderer, dict): + yield self._video_entry(renderer) + + r''' # unused + def _rich_grid_entries(self, contents): + for content in contents: + video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + ''' + + def _report_history_entries(self, renderer): + for url in traverse_obj(renderer, ( + 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., + 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., + 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')): + yield self.url_result(urljoin('https://www.youtube.com', url), YoutubeIE) + + def _extract_entries(self, parent_renderer, continuation_list): + # continuation_list is modified in-place with continuation_list = [continuation_token] + continuation_list[:] = [None] + contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + is_renderer = traverse_obj( + content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', + expected_type=dict) + if not 
is_renderer: + if content.get('richItemRenderer'): + for entry in self._rich_entries(content['richItemRenderer']): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + elif content.get('reportHistorySectionRenderer'): # https://www.youtube.com/reporthistory + table = traverse_obj(content, ('reportHistorySectionRenderer', 'table', 'tableRenderer')) + yield from self._report_history_entries(table) + continuation_list[0] = self._extract_continuation(table) + continue + + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + + known_renderers = { + 'playlistVideoListRenderer': self._playlist_entries, + 'gridRenderer': self._grid_entries, + 'reelShelfRenderer': self._grid_entries, + 'shelfRenderer': self._shelf_entries, + 'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)], + 'backstagePostThreadRenderer': self._post_thread_entries, + 'videoRenderer': lambda x: [self._video_entry(x)], + 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), + 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), + 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)], + 'richGridRenderer': lambda x: self._extract_entries(x, continuation_list), + 'lockupViewModel': lambda x: [self._extract_lockup_view_model(x)], + } + for key, renderer in isr_content.items(): + if key not in known_renderers: + continue + for entry in known_renderers[key](renderer): + if entry: + yield entry + continuation_list[0] = self._extract_continuation(renderer) + break + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(is_renderer) + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) + + def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): + continuation_list = [None] + extract_entries = lambda x: self._extract_entries(x, continuation_list) + tab_content = try_get(tab, lambda x: x['content'], dict) + if not tab_content: + return + parent_renderer = ( + try_get(tab_content, lambda x: x['sectionListRenderer'], dict) + or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) + yield from extract_entries(parent_renderer) + continuation = continuation_list[0] + seen_continuations = set() + for page_num in itertools.count(1): + if not continuation: + break + continuation_token = continuation.get('continuation') + if continuation_token is not None and continuation_token in seen_continuations: + self.write_debug('Detected YouTube feed looping - assuming end of feed.') + break + seen_continuations.add(continuation_token) + headers = self.generate_api_headers( + ytcfg=ytcfg, delegated_session_id=delegated_session_id, visitor_data=visitor_data) + response = self._extract_response( + item_id=f'{item_id} page {page_num}', + query=continuation, headers=headers, ytcfg=ytcfg, + check_get_keys=( + 'continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints', + # Playlist recommendations may return with no data - ignore + ('responseContext', 'serviceTrackingParams', ..., 'params', ..., lambda k, v: k == 'key' and v == 'GetRecommendedMusicPlaylists_rid'), + )) + + if not response: + break + + continuation = None + # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases + # See: https://github.com/ytdl-org/youtube-dl/issues/28702 + 
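# A generic reduction of this pagination pattern, for illustration: follow
# continuation tokens page by page, but remember every token already used so
# that a feed which starts looping (as in the issue linked above) terminates
# instead of paginating forever. fetch_page() is a hypothetical stand-in for
# self._extract_response().
def _paginate(fetch_page, first_token):
    seen, token = set(), first_token
    while token:
        if token in seen:
            break  # the feed is looping - assume end of feed
        seen.add(token)
        page = fetch_page(token)
        yield from page['items']
        token = page.get('continuation')  # absent/None ends the loop

# 'a' -> 'b' -> 'a' would paginate forever without the seen-token check
_pages = {'a': {'items': [1, 2], 'continuation': 'b'},
          'b': {'items': [3], 'continuation': 'a'}}
assert list(_paginate(_pages.__getitem__, 'a')) == [1, 2, 3]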
visitor_data = self._extract_visitor_data(response) or visitor_data + + known_renderers = { + 'videoRenderer': (self._grid_entries, 'items'), # for membership tab + 'gridPlaylistRenderer': (self._grid_entries, 'items'), + 'gridVideoRenderer': (self._grid_entries, 'items'), + 'gridChannelRenderer': (self._grid_entries, 'items'), + 'playlistVideoRenderer': (self._playlist_entries, 'contents'), + 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds + 'richItemRenderer': (extract_entries, 'contents'), # for hashtag + 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents'), + 'reportHistoryTableRowRenderer': (self._report_history_entries, 'rows'), + 'playlistVideoListContinuation': (self._playlist_entries, None), + 'gridContinuation': (self._grid_entries, None), + 'itemSectionContinuation': (self._post_thread_continuation_entries, None), + 'sectionListContinuation': (extract_entries, None), # for feeds + } + + continuation_items = traverse_obj(response, ( + ('onResponseReceivedActions', 'onResponseReceivedEndpoints'), ..., + 'appendContinuationItemsAction', 'continuationItems', + ), 'continuationContents', get_all=False) + continuation_item = traverse_obj(continuation_items, 0, None, expected_type=dict, default={}) + + video_items_renderer = None + for key in continuation_item: + if key not in known_renderers: + continue + func, parent_key = known_renderers[key] + video_items_renderer = {parent_key: continuation_items} if parent_key else continuation_items + continuation_list = [None] + yield from func(video_items_renderer) + continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) + + # In the case only a continuation is returned, try to follow it. + # We extract this after trying to extract non-continuation items as otherwise this + # may be prioritized over other continuations. 
+ # see: https://github.com/yt-dlp/yt-dlp/issues/12933 + continuation = continuation or self._extract_continuation({'contents': [continuation_item]}) + + if not continuation and not video_items_renderer: + break + + @staticmethod + def _extract_selected_tab(tabs, fatal=True): + for tab_renderer in tabs: + if tab_renderer.get('selected'): + return tab_renderer + if fatal: + raise ExtractorError('Unable to find selected tab') + + @staticmethod + def _extract_tab_renderers(response): + return traverse_obj( + response, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., ('tabRenderer', 'expandableTabRenderer')), expected_type=dict) + + def _extract_from_tabs(self, item_id, ytcfg, data, tabs): + metadata = self._extract_metadata_from_tabs(item_id, data) + + selected_tab = self._extract_selected_tab(tabs) + metadata['title'] += format_field(selected_tab, 'title', ' - %s') + metadata['title'] += format_field(selected_tab, 'expandedText', ' - %s') + + return self.playlist_result( + self._entries( + selected_tab, metadata['id'], ytcfg, + self._extract_delegated_session_id(ytcfg, data), + self._extract_visitor_data(data, ytcfg)), + **metadata) + + def _extract_metadata_from_tabs(self, item_id, data): + info = {'id': item_id} + + metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) + if metadata_renderer: + channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}), + ('channelUrl', {self.ucid_from_url})) + info.update({ + 'channel': metadata_renderer.get('title'), + 'channel_id': channel_id, + }) + if info['channel_id']: + info['id'] = info['channel_id'] + else: + metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict) + + # pageHeaderViewModel slow rollout began April 2024 + page_header_view_model = traverse_obj(data, ( + 'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict})) + + # We can get the uncropped banner/avatar by replacing the crop params with '=s0' + # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714 + def _get_uncropped(url): + return url_or_none((url or '').split('=')[0] + '=s0') + + avatar_thumbnails = self._extract_thumbnails(metadata_renderer, 'avatar') + if avatar_thumbnails: + uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url']) + if uncropped_avatar: + avatar_thumbnails.append({ + 'url': uncropped_avatar, + 'id': 'avatar_uncropped', + 'preference': 1, + }) + + channel_banners = ( + self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) + or self._extract_thumbnails( + page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources')) + for banner in channel_banners: + banner['preference'] = -10 + + if channel_banners: + uncropped_banner = _get_uncropped(channel_banners[0]['url']) + if uncropped_banner: + channel_banners.append({ + 'url': uncropped_banner, + 'id': 'banner_uncropped', + 'preference': -5, + }) + + # Deprecated - remove primary_sidebar_renderer when layout discontinued + primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') + playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer'), expected_type=dict) + + primary_thumbnails = self._extract_thumbnails( + primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) + playlist_thumbnails = self._extract_thumbnails( + 
playlist_header_renderer, ('playlistHeaderBanner', 'heroPlaylistThumbnailRenderer', 'thumbnail')) + + info.update({ + 'title': (traverse_obj(metadata_renderer, 'title') + or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) + or info['id']), + 'availability': self._extract_availability(data), + 'channel_follower_count': ( + self._get_count(data, ('header', ..., 'subscriberCountText')) + or traverse_obj(page_header_view_model, ( + 'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts', + lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))), + 'description': try_get(metadata_renderer, lambda x: x.get('description', '')), + 'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str})) + or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))), + 'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners, + }) + + channel_handle = ( + traverse_obj(metadata_renderer, (('vanityChannelUrl', ('ownerUrls', ...)), {self.handle_from_url}), get_all=False) + or traverse_obj(data, ('header', ..., 'channelHandleText', {self.handle_or_none}), get_all=False)) + + if channel_handle: + info.update({ + 'uploader_id': channel_handle, + 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), + }) + + channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False)) + if self._has_badge(channel_badges, BadgeType.VERIFIED): + info['channel_is_verified'] = True + # Playlist stats is a text runs array containing [video count, view count, last updated]. + # last updated or (view count and last updated) may be missing. + playlist_stats = get_first( + (primary_sidebar_renderer, playlist_header_renderer), (('stats', 'briefStats', 'numVideosText'), )) + + last_updated_unix = self._parse_time_text( + self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued + or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text'))) + info['modified_date'] = strftime_or_none(last_updated_unix) + + info['view_count'] = self._get_count(playlist_stats, 1) + if info['view_count'] is None: # 0 is allowed + info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText') + if info['view_count'] is None: + info['view_count'] = self._get_count(data, ( + 'contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., 'tabRenderer', 'content', 'sectionListRenderer', + 'contents', ..., 'itemSectionRenderer', 'contents', ..., 'channelAboutFullMetadataRenderer', 'viewCountText')) + + info['playlist_count'] = self._get_count(playlist_stats, 0) + if info['playlist_count'] is None: # 0 is allowed + info['playlist_count'] = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text')) + + if not info.get('channel_id'): + owner = traverse_obj(playlist_header_renderer, 'ownerText') + if not owner: # Deprecated + owner = traverse_obj( + self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer'), + ('videoOwner', 'videoOwnerRenderer', 'title')) + owner_text = self._get_text(owner) + browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {} + info.update({ + 'channel': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text), + 'channel_id': self.ucid_or_none(browse_ep.get('browseId')), + 'uploader_id': 
self.handle_from_url(urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl'))), + }) + + info.update({ + 'uploader': info['channel'], + 'channel_url': format_field(info.get('channel_id'), None, 'https://www.youtube.com/channel/%s', default=None), + 'uploader_url': format_field(info.get('uploader_id'), None, 'https://www.youtube.com/%s', default=None), + }) + + return info + + def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg): + first_id = last_id = response = None + for page_num in itertools.count(1): + videos = list(self._playlist_entries(playlist)) + if not videos: + return + start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1 + if start >= len(videos): + return + yield from videos[start:] + first_id = first_id or videos[0]['id'] + last_id = videos[-1]['id'] + watch_endpoint = try_get( + playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint']) + headers = self.generate_api_headers( + ytcfg=ytcfg, delegated_session_id=self._extract_delegated_session_id(ytcfg, data), + visitor_data=self._extract_visitor_data(response, data, ytcfg)) + query = { + 'playlistId': playlist_id, + 'videoId': watch_endpoint.get('videoId') or last_id, + 'index': watch_endpoint.get('index') or len(videos), + 'params': watch_endpoint.get('params') or 'OAE%3D', + } + response = self._extract_response( + item_id=f'{playlist_id} page {page_num}', + query=query, ep='next', headers=headers, ytcfg=ytcfg, + check_get_keys='contents', + ) + playlist = try_get( + response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + + def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg): + title = playlist.get('title') or try_get( + data, lambda x: x['titleText']['simpleText'], str) + playlist_id = playlist.get('playlistId') or item_id + + # Delegating everything except mix playlists to regular tab-based playlist URL + playlist_url = urljoin(url, try_get( + playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], + str)) + + # Some playlists are unviewable but YouTube still provides a link to the (broken) playlist page [1] + # [1] MLCT, RLTDwFCb4jeqaKWnciAYM-ZVHg + is_known_unviewable = re.fullmatch(r'MLCT|RLTD[\w-]{22}', playlist_id) + + if playlist_url and playlist_url != url and not is_known_unviewable: + return self.url_result( + playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) + + return self.playlist_result( + self._extract_inline_playlist(playlist, playlist_id, data, ytcfg), + playlist_id=playlist_id, playlist_title=title) + + def _extract_availability(self, data): + """ + Gets the availability of a given playlist/tab. 
+ Note: Unless YouTube tells us explicitly, we do not assume it is public + @param data: response + """ + sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} + playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} + player_header_privacy = playlist_header_renderer.get('privacy') + + badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges')) + + # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge + privacy_setting_icon = get_first( + (playlist_header_renderer, sidebar_renderer), + ('privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries', + lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'), + expected_type=str) + + microformats_is_unlisted = traverse_obj( + data, ('microformat', 'microformatDataRenderer', 'unlisted'), expected_type=bool) + + return ( + 'public' if ( + self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + or player_header_privacy == 'PUBLIC' + or privacy_setting_icon == 'PRIVACY_PUBLIC') + else self._availability( + is_private=( + self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) + or player_header_privacy == 'PRIVATE' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_PRIVATE' if privacy_setting_icon is not None else None), + is_unlisted=( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or player_header_privacy == 'UNLISTED' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None + else microformats_is_unlisted if microformats_is_unlisted is not None else None), + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_auth=False)) + + @staticmethod + def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): + sidebar_renderer = try_get( + data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or [] + for item in sidebar_renderer: + renderer = try_get(item, lambda x: x[info_renderer], expected_type) + if renderer: + return renderer + + def _reload_with_unavailable_videos(self, item_id, data, ytcfg): + """ + Reload playlists with unavailable videos (e.g. private videos, region blocked, etc.) 
""" + is_playlist = bool(traverse_obj( + data, ('metadata', 'playlistMetadataRenderer'), ('header', 'playlistHeaderRenderer'))) + if not is_playlist: + return + headers = self.generate_api_headers( + ytcfg=ytcfg, delegated_session_id=self._extract_delegated_session_id(ytcfg, data), + visitor_data=self._extract_visitor_data(data, ytcfg)) + query = { + 'params': 'wgYCCAA=', + 'browseId': f'VL{item_id}', + } + return self._extract_response( + item_id=item_id, headers=headers, query=query, + check_get_keys='contents', fatal=False, ytcfg=ytcfg, + note='Redownloading playlist API JSON with unavailable videos') + + @functools.cached_property + def skip_webpage(self): + return 'webpage' in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) + + def _extract_webpage(self, url, item_id, fatal=True): + webpage, data = None, None + for retry in self.RetryManager(fatal=fatal): + try: + webpage = self._download_webpage(url, item_id, note='Downloading webpage') + data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} + except ExtractorError as e: + if isinstance(e.cause, network_exceptions): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429): + retry.error = e + continue + self._error_or_warning(e, fatal=fatal) + break + + try: + self._extract_and_report_alerts(data) + except ExtractorError as e: + self._error_or_warning(e, fatal=fatal) + break + + # Sometimes YouTube returns a webpage with incomplete ytInitialData + # See: https://github.com/yt-dlp/yt-dlp/issues/116 + if not traverse_obj(data, 'contents', 'currentVideoEndpoint', 'onResponseReceivedActions'): + retry.error = ExtractorError('Incomplete yt initial data received') + data = None + continue + + return webpage, data + + def _report_playlist_authcheck(self, ytcfg, fatal=True): + """Use if ytcfg (and data) could not be extracted from the initial webpage""" + if not ytcfg and self.is_authenticated: + msg = 'Playlists that require authentication may not extract correctly without a successful webpage download' + if 'authcheck' not in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) and fatal: + raise ExtractorError( + f'{msg}. 
If you are not downloading private content, or ' + 'your cookies are only for the first account and channel,' + ' pass "--extractor-args youtubetab:skip=authcheck" to skip this check', + expected=True) + self.report_warning(msg, only_once=True) + + def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=False, default_client='web'): + data = None + if not self.skip_webpage: + webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) + ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) + # Reject webpage data if redirected to home page without explicitly requesting + selected_tab = self._extract_selected_tab(self._extract_tab_renderers(data), fatal=False) or {} + if (url != 'https://www.youtube.com/feed/recommended' + and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page + and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): + msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page' + if fatal: + raise ExtractorError(msg, expected=True) + self.report_warning(msg, only_once=True) + if not data: + self._report_playlist_authcheck(ytcfg, fatal=fatal) + data = self._extract_tab_endpoint(url, item_id, ytcfg, fatal=fatal, default_client=default_client) + return data, ytcfg + + def _extract_tab_endpoint(self, url, item_id, ytcfg=None, fatal=True, default_client='web'): + headers = self.generate_api_headers(ytcfg=ytcfg, default_client=default_client) + resolve_response = self._extract_response( + item_id=item_id, query={'url': url}, check_get_keys='endpoint', headers=headers, ytcfg=ytcfg, fatal=fatal, + ep='navigation/resolve_url', note='Downloading API parameters API JSON', default_client=default_client) + endpoints = {'browseEndpoint': 'browse', 'watchEndpoint': 'next'} + for ep_key, ep in endpoints.items(): + params = try_get(resolve_response, lambda x: x['endpoint'][ep_key], dict) + if params: + return self._extract_response( + item_id=item_id, query=params, ep=ep, headers=headers, + ytcfg=ytcfg, fatal=fatal, default_client=default_client, + check_get_keys=('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')) + err_note = 'Failed to resolve url (does the playlist exist?)' + if fatal: + raise ExtractorError(err_note, expected=True) + self.report_warning(err_note, item_id) + + _SEARCH_PARAMS = None + + def _search_results(self, query, params=NO_DEFAULT, default_client='web'): + data = {'query': query} + if params is NO_DEFAULT: + params = self._SEARCH_PARAMS + if params: + data['params'] = params + + content_keys = ( + ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'), + ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'), + # ytmusic search + ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'), + ('continuationContents', ), + ) + display_id = f'query "{query}"' + check_get_keys = tuple({keys[0] for keys in content_keys}) + ytcfg = self._download_ytcfg(default_client, display_id) if not self.skip_webpage else {} + self._report_playlist_authcheck(ytcfg, fatal=False) + + continuation_list = [None] + search = None + for page_num in itertools.count(1): + data.update(continuation_list[0] or {}) + headers = self.generate_api_headers( + ytcfg=ytcfg, visitor_data=self._extract_visitor_data(search), default_client=default_client) + search = self._extract_response( + item_id=f'{display_id} page {page_num}', ep='search', query=data, + 
default_client=default_client, check_get_keys=check_get_keys, ytcfg=ytcfg, headers=headers) + slr_contents = traverse_obj(search, *content_keys) + yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list) + if not continuation_list[0]: + break + + +class YoutubeTabIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube Tabs' + _VALID_URL = r'''(?x: + https?:// + (?!consent\.)(?:\w+\.)? + (?: + youtube(?:kids)?\.com| + {invidious} + )/ + (?: + (?P<channel_type>channel|c|user|browse)/| + (?P<not_channel> + feed/|hashtag/| + (?:playlist|watch)\?.*?\blist= + )| + (?!(?:{reserved_names})\b) # Direct URLs + ) + (?P<id>[^/?\#&]+) + )'''.format( + reserved_names=YoutubeBaseInfoExtractor._RESERVED_NAMES, + invidious='|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + ) + IE_NAME = 'youtube:tab' + + _TESTS = [{ + 'note': 'playlists, multipage', + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Igor Kleiner - Playlists', + 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', + 'uploader': 'Igor Kleiner ', + 'uploader_id': '@IgorDataScience', + 'uploader_url': 'https://www.youtube.com/@IgorDataScience', + 'channel': 'Igor Kleiner ', + 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'tags': 'count:23', + 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'channel_follower_count': int, + }, + }, { + 'note': 'playlists, multipage, different order', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Igor Kleiner - Playlists', + 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', + 'uploader': 'Igor Kleiner ', + 'uploader_id': '@IgorDataScience', + 'uploader_url': 'https://www.youtube.com/@IgorDataScience', + 'tags': 'count:23', + 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'channel': 'Igor Kleiner ', + 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'channel_follower_count': int, + }, + }, { + # TODO: fix channel_is_verified extraction + 'note': 'playlists, series', + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', + 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'channel': '3Blue1Brown', + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'uploader_id': '@3blue1brown', + 'uploader_url': 'https://www.youtube.com/@3blue1brown', + 'uploader': '3Blue1Brown', + 'tags': ['Mathematics'], + 'channel_follower_count': int, + 'channel_is_verified': True, + }, + }, { + 'note': 'playlists, singlepage', + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_url': 'https://www.youtube.com/@ThirstForScience', + 'uploader_id': '@ThirstForScience', + 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', + 'tags': 'count:12', + 'channel': 'ThirstForScience', + 'channel_follower_count': int, + }, + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 
'only_matching': True, + }, { + # TODO: fix availability extraction + 'note': 'basic, single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', + 'info_dict': { + 'id': 'PLt5yu3-wZAlSLRHmI1qNm0wjyVNWw1pCU', + 'title': 'single video playlist', + 'description': '', + 'tags': [], + 'view_count': int, + 'modified_date': '20250417', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'availability': 'public', + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + }, + 'playlist_count': 1, + }, { + 'note': 'empty playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'info_dict': { + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', + 'tags': [], + 'channel': 'Sergey M.', + 'description': '', + 'modified_date': '20230921', + 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'availability': 'unlisted', + 'uploader_url': 'https://www.youtube.com/@sergeym.6173', + 'uploader_id': '@sergeym.6173', + 'uploader': 'Sergey M.', + }, + 'playlist_count': 0, + }, { + 'note': 'Home tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', + 'channel': 'lex will', + 'tags': ['bible', 'history', 'prophesy'], + 'uploader_url': 'https://www.youtube.com/@lexwill718', + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_follower_count': int, + }, + 'playlist_mincount': 2, + }, { + 'note': 'Videos tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/@lexwill718', + 'channel': 'lex will', + 'channel_follower_count': int, + }, + 'playlist_mincount': 975, + }, { + 'note': 'Videos tab, sorted by popular', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/@lexwill718', + 'channel': 'lex will', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_follower_count': int, + }, + 'playlist_mincount': 199, + }, { + 'note': 'Playlists tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': '@lexwill718', + 
'uploader_url': 'https://www.youtube.com/@lexwill718', + 'channel': 'lex will', + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_follower_count': int, + }, + 'playlist_mincount': 17, + }, { + 'note': 'Posts tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Posts', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'channel': 'lex will', + 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'tags': ['bible', 'history', 'prophesy'], + 'channel_follower_count': int, + 'uploader_url': 'https://www.youtube.com/@lexwill718', + 'uploader_id': '@lexwill718', + 'uploader': 'lex will', + }, + 'playlist_mincount': 18, + }, { + # TODO: fix channel_is_verified extraction + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', + 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'tags': ['Mathematics'], + 'channel': '3Blue1Brown', + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'channel_follower_count': int, + 'uploader_url': 'https://www.youtube.com/@3blue1brown', + 'uploader_id': '@3blue1brown', + 'uploader': '3Blue1Brown', + 'channel_is_verified': True, + }, + }, { + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + # TODO: fix availability extraction + 'note': 'Playlist with deleted videos (#651). 
As a bonus, the video #51 is also twice in this list.', + 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'info_dict': { + 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', + 'tags': [], + 'view_count': int, + 'modified_date': '20150605', + 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'channel_url': 'https://www.youtube.com/channel/UCEPzS1rYsrkqzSLNp76nrcg', + 'channel': 'Christiaan008', + 'availability': 'public', + 'uploader_id': '@ChRiStIaAn008', + 'uploader': 'Christiaan008', + 'uploader_url': 'https://www.youtube.com/@ChRiStIaAn008', + }, + 'playlist_count': 96, + }, { + 'note': 'Large playlist', + 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', + 'info_dict': { + 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'channel_url': 'https://www.youtube.com/channel/UCBABnxM4Ar9ten8Mdjj1j0Q', + 'tags': [], + 'modified_date': r're:\d{8}', + 'channel': 'Cauchemar', + 'view_count': int, + 'description': '', + 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'availability': 'public', + 'uploader_id': '@Cauchemar89', + 'uploader': 'Cauchemar', + 'uploader_url': 'https://www.youtube.com/@Cauchemar89', + }, + 'playlist_mincount': 1123, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + 'note': 'even larger playlist, 8832 videos', + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'tags': [], + 'view_count': int, + 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + 'channel_url': 'https://www.youtube.com/channel/UCXw-G3eDE9trcvY2sBMM_aA', + 'channel': 'Interstellar Movie', + 'description': '', + 'modified_date': r're:\d{8}', + 'availability': 'public', + 'uploader_id': '@InterstellarMovie', + 'uploader': 'Interstellar Movie', + 'uploader_url': 'https://www.youtube.com/@InterstellarMovie', + }, + 'playlist_mincount': 21, + }, { + # TODO: fix availability extraction + 'note': 'Playlist with "show unavailable videos" button', + 'url': 'https://www.youtube.com/playlist?list=PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', + 'info_dict': { + 'title': 'The Memes Of 2010s.....', + 'id': 'PLYwq8WOe86_xGmR7FrcJq8Sb7VW8K3Tt2', + 'view_count': int, + 'channel': "I'm Not JiNxEd", + 'tags': [], + 'description': 'md5:44dc3b315ba69394feaafa2f40e7b2a1', + 'channel_url': 'https://www.youtube.com/channel/UC5H5H85D1QE5-fuWWQ1hdNg', + 'channel_id': 'UC5H5H85D1QE5-fuWWQ1hdNg', + 'modified_date': r're:\d{8}', + 'availability': 'public', + 'uploader_url': 'https://www.youtube.com/@imnotjinxed1998', + 'uploader_id': '@imnotjinxed1998', + 'uploader': "I'm Not JiNxEd", + }, + 'playlist_mincount': 150, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + 'note': 'Playlist with unavailable videos in page 7', + 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', + 'info_dict': { + 'title': 'Uploads from BlankTV', + 'id': 'UU8l9frL61Yl5KFOl87nIm2w', + 'channel': 'BlankTV', + 'channel_url': 'https://www.youtube.com/channel/UC8l9frL61Yl5KFOl87nIm2w', + 'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w', + 'view_count': int, + 'tags': [], + 'modified_date': r're:\d{8}', + 'description': '', + 
'availability': 'public', + 'uploader_id': '@blanktv', + 'uploader': 'BlankTV', + 'uploader_url': 'https://www.youtube.com/@blanktv', + }, + 'playlist_mincount': 1000, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + }, { + # TODO: fix availability extraction + 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', + 'tags': [], + 'view_count': int, + 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'channel_url': 'https://www.youtube.com/channel/UC9-y-6csu5WGm29I7JiwpnA', + 'channel': 'Computerphile', + 'availability': 'public', + 'modified_date': '20190712', + 'uploader_id': '@Computerphile', + 'uploader': 'Computerphile', + 'uploader_url': 'https://www.youtube.com/@Computerphile', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'only_matching': True, + }, { + 'note': 'Playlist URL that does not actually serve a playlist', + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': 'YDvsBbKfLPA', # This will keep changing + 'ext': 'mp4', + 'title': str, + 'upload_date': r're:\d{8}', + 'description': str, + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'release_timestamp': int, + 'channel': 'Sky News', + 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ', + 'age_limit': 0, + 'view_count': int, + 'thumbnail': r're:https?://i\.ytimg\.com/vi/[^/]+/maxresdefault(?:_live)?\.jpg', + 'playable_in_embed': True, + 'release_date': r're:\d+', + 'availability': 'public', + 'live_status': 'is_live', + 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ', + 'channel_follower_count': int, + 'concurrent_view_count': int, + 'uploader_url': 'https://www.youtube.com/@SkyNews', + 'uploader_id': '@SkyNews', + 'uploader': 'Sky News', + 'channel_is_verified': True, + 'media_type': 'livestream', + 'timestamp': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Ignoring subtitle tracks found in '], + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + }, + 'params': { + 
+            'skip_download': True,
+        },
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
+        'only_matching': True,
+    }, {
+        'note': 'A channel that is not live. Should raise an error',
+        'url': 'https://www.youtube.com/user/numberphile/live',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/feed/trending',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/feed/library',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/feed/history',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/feed/subscriptions',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/feed/watch_later',
+        'only_matching': True,
+    }, {
+        'note': 'Recommended - redirects to home page.',
+        'url': 'https://www.youtube.com/feed/recommended',
+        'only_matching': True,
+    }, {
+        'note': 'inline playlist whose continuations do not always work',
+        'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/course',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/zsecurity',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.youtube.com/NASAgovVideo/videos',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/TheYoungTurks/live',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/hashtag/cctv9',
+        'info_dict': {
+            'id': 'cctv9',
+            'title': 'cctv9 - All',
+            'tags': [],
+        },
+        'playlist_mincount': 300,  # not consistent but should be over 300
+    }, {
+        'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
+        'only_matching': True,
+    }, {
+        'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist',
+        'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+        'only_matching': True,
+    }, {
+        'note': '/browse/ should redirect to /channel/',
+        'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng',
+        'only_matching': True,
+    }, {
+        # TODO: fix availability extraction
+        'note': 'VLPL, should redirect to playlist?list=PL...',
+        'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+        'info_dict': {
+            'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq',
+            'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
+            'title': 'NCS : All Releases 💿',
+            'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg',
+            'modified_date': r're:\d{8}',
+            'view_count': int,
+            'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
+            'tags': [],
+            'channel': 'NoCopyrightSounds',
+            'availability': 'public',
+            'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds',
+            'uploader': 'NoCopyrightSounds',
+            'uploader_id': '@NoCopyrightSounds',
+        },
+        'playlist_mincount': 166,
+        'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden', 'YouTube Music is not directly supported'],
+    }, {
+        # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos
+        'note': 'Topic, should redirect to playlist?list=UU...',
+        'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+        'info_dict': {
+            'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+            'title': 'Uploads from Royalty Free Music - Topic',
+            'tags': [],
+            'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+            'channel': 'Royalty Free Music - Topic',
+            'view_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+            'modified_date': r're:\d{8}',
+            'description': '',
+            'availability': 'public',
+            'uploader': 'Royalty Free Music - Topic',
+        },
+        'playlist_mincount': 101,
+        'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'],
+    }, {
+        # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg)
+        # Treat as a general feed
+        # TODO: fix extraction
+        'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg',
+        'info_dict': {
+            'id': 'UCtFRv9O2AHqOZjjynzrv-xg',
+            'title': 'UCtFRv9O2AHqOZjjynzrv-xg',
+            'tags': [],
+        },
+        'playlist_mincount': 9,
+    }, {
+        'note': 'YouTube Music album',
+        'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE',
+        'info_dict': {
+            'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0',
+            'title': 'Album - Royalty Free Music Library V2 (50 Songs)',
+            'tags': [],
+            'view_count': int,
+            'description': '',
+            'availability': 'unlisted',
+            'modified_date': r're:\d{8}',
+        },
+        'playlist_count': 50,
+        'expected_warnings': ['YouTube Music is not directly supported'],
+    }, {
+        'note': 'unlisted single video playlist',
+        'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_',
+        'info_dict': {
+            'id': 'PLt5yu3-wZAlQLfIN0MMgp0wVV6MP3bM4_',
+            'title': 'unlisted playlist',
+            'availability': 'unlisted',
+            'tags': [],
+            'modified_date': '20250417',
+            'channel': 'cole-dlp-test-acc',
+            'view_count': int,
+            'description': '',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'uploader_id': '@coletdjnz',
+            'uploader': 'cole-dlp-test-acc',
+        },
+        'playlist': [{
+            'info_dict': {
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'id': 'BaW_jenozKc',
+                '_type': 'url',
+                'ie_key': 'Youtube',
+                'duration': 10,
+                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
+                'view_count': int,
+                'url': 'https://www.youtube.com/watch?v=BaW_jenozKc',
+                'channel': 'Philipp Hagemeister',
+                'uploader_id': '@PhilippHagemeister',
+                'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
+                'uploader': 'Philipp Hagemeister',
+            },
+        }],
+        'playlist_count': 1,
+        'params': {'extract_flat': True},
+    }, {
+        # By default, recommended is always empty.
+        'note': 'API Fallback: Recommended - redirects to home page. Requires visitorData',
+        'url': 'https://www.youtube.com/feed/recommended',
+        'info_dict': {
+            'id': 'recommended',
+            'title': 'recommended',
+            'tags': [],
+        },
+        'playlist_count': 0,
+        'params': {
+            'skip_download': True,
+            'extractor_args': {'youtubetab': {'skip': ['webpage']}},
+        },
+    }, {
+        'note': 'API Fallback: /videos tab, sorted by oldest first',
+        'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid',
+        'info_dict': {
+            'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+            'title': 'Cody\'sLab - Videos',
+            'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa',
+            'channel': 'Cody\'sLab',
+            'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw',
+            'tags': [],
+            'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw',
+            'channel_follower_count': int,
+        },
+        'playlist_mincount': 650,
+        'params': {
+            'skip_download': True,
+            'extractor_args': {'youtubetab': {'skip': ['webpage']}},
+        },
+        'skip': 'Query for sorting no longer works',
+    }, {
+        # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos
+        'note': 'API Fallback: Topic, should redirect to playlist?list=UU...',
+        'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw',
+        'info_dict': {
+            'id': 'UU9ALqqC4aIeG5iDs7i90Bfw',
+            'title': 'Uploads from Royalty Free Music - Topic',
+            'modified_date': r're:\d{8}',
+            'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw',
+            'description': '',
+            'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw',
+            'tags': [],
+            'channel': 'Royalty Free Music - Topic',
+            'view_count': int,
+            'availability': 'public',
+            'uploader': 'Royalty Free Music - Topic',
+        },
+        'playlist_mincount': 101,
+        'params': {
+            'skip_download': True,
+            'extractor_args': {'youtubetab': {'skip': ['webpage']}},
+        },
+        'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'],
+    }, {
+        'note': 'non-standard redirect to regional channel',
+        'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ',
+        'only_matching': True,
+    }, {
+        # TODO: fix metadata extraction
+        'note': 'collaborative playlist (uploader name in the form "by <uploader> and x other(s)")',
+        'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6',
+        'info_dict': {
+            'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6',
+            'modified_date': '20250115',
+            'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q',
+            'tags': [],
+            'availability': 'unlisted',
+            'channel_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q',
+            'channel': 'pukkandan',
+            'description': 'Test for collaborative playlist',
+            'title': 'yt-dlp test - collaborative playlist',
+            'view_count': int,
+            'uploader_url': 'https://www.youtube.com/@pukkandan',
+            'uploader_id': '@pukkandan',
+            'uploader': 'pukkandan',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'note': 'translated tab name',
+        'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/playlists',
+        'info_dict': {
+            'id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'tags': [],
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'description': 'test description',
+            'title': 'cole-dlp-test-acc - 再生リスト',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'channel': 'cole-dlp-test-acc',
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'uploader_id': '@coletdjnz',
+            'uploader': 'cole-dlp-test-acc',
+        },
+        'playlist_mincount': 1,
+        'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},
+        'expected_warnings': ['Preferring "ja"'],
+    }, {
+        # XXX: this should really check flat playlist entries, but the test suite doesn't support that
+        # TODO: fix availability extraction
+        'note': 'preferred lang set with playlist with translated video titles',
+        'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0',
+        'info_dict': {
+            'id': 'PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0',
+            'tags': [],
+            'view_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'channel': 'cole-dlp-test-acc',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'description': 'test',
+            'title': 'dlp test playlist',
+            'availability': 'public',
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'uploader_id': '@coletdjnz',
+            'uploader': 'cole-dlp-test-acc',
+        },
+        'playlist_mincount': 1,
+        'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},
+        'expected_warnings': ['Preferring "ja"'],
+    }, {
+        # shorts audio pivot for 2GtVksBMYFM.
+        'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==',
+        # TODO: fix extraction
+        'info_dict': {
+            'id': 'sfv_audio_pivot',
+            'title': 'sfv_audio_pivot',
+            'tags': [],
+        },
+        'playlist_mincount': 50,
+
+    }, {
+        # Channel with a real live tab (not to be mistaken with streams tab)
+        # Do not treat it as if it should redirect to a live stream
+        'url': 'https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live',
+        'info_dict': {
+            'id': 'UCEH7P7kyJIkS_gJf93VYbmg',
+            'title': 'UCEH7P7kyJIkS_gJf93VYbmg - Live',
+            'tags': [],
+        },
+        'playlist_mincount': 20,
+    }, {
+        # Tab name is not the same as tab id
+        'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/letsplay',
+        'info_dict': {
+            'id': 'UCQvWX73GQygcwXOTSf_VDVg',
+            'title': 'UCQvWX73GQygcwXOTSf_VDVg - Let\'s play',
+            'tags': [],
+        },
+        'playlist_mincount': 8,
+    }, {
+        # Home tab id is literally home. Not to be mistaken with featured
+        'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/home',
+        'info_dict': {
+            'id': 'UCQvWX73GQygcwXOTSf_VDVg',
+            'title': 'UCQvWX73GQygcwXOTSf_VDVg - Home',
+            'tags': [],
+        },
+        'playlist_mincount': 8,
+    }, {
+        # Should get three playlists for videos, shorts and streams tabs
+        # TODO: fix channel_is_verified extraction
+        'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA',
+        'info_dict': {
+            'id': 'UCK9V2B22uJYu3N7eR_BT9QA',
+            'title': 'Polka Ch. 尾丸ポルカ',
+            'channel_follower_count': int,
+            'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA',
+            'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA',
+            'description': 'md5:01e53f350ab8ad6fcf7c4fedb3c1b99f',
+            'channel': 'Polka Ch. 尾丸ポルカ',
+            'tags': 'count:35',
+            'uploader_url': 'https://www.youtube.com/@OmaruPolka',
+            'uploader': 'Polka Ch. 尾丸ポルカ',
+            'uploader_id': '@OmaruPolka',
+            'channel_is_verified': True,
+        },
+        'playlist_count': 3,
+    }, {
+        # Shorts tab with channel with handle
+        # TODO: fix channel_is_verified extraction
+        'url': 'https://www.youtube.com/@NotJustBikes/shorts',
+        'info_dict': {
+            'id': 'UC0intLFzLaudFG-xAvUEO-A',
+            'title': 'Not Just Bikes - Shorts',
+            'tags': 'count:10',
+            'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A',
+            'description': 'md5:1d9fc1bad7f13a487299d1fe1712e031',
+            'channel_follower_count': int,
+            'channel_id': 'UC0intLFzLaudFG-xAvUEO-A',
+            'channel': 'Not Just Bikes',
+            'uploader_url': 'https://www.youtube.com/@NotJustBikes',
+            'uploader': 'Not Just Bikes',
+            'uploader_id': '@NotJustBikes',
+            'channel_is_verified': True,
+        },
+        'playlist_mincount': 10,
+    }, {
+        # Streams tab
+        'url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig/streams',
+        'info_dict': {
+            'id': 'UC3eYAvjCVwNHgkaGbXX3sig',
+            'title': '中村悠一 - Live',
+            'tags': 'count:7',
+            'channel_id': 'UC3eYAvjCVwNHgkaGbXX3sig',
+            'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig',
+            'channel': '中村悠一',
+            'channel_follower_count': int,
+            'description': 'md5:e8fd705073a594f27d6d6d020da560dc',
+            'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura',
+            'uploader_id': '@Yuichi-Nakamura',
+            'uploader': '中村悠一',
+        },
+        'playlist_mincount': 60,
+    }, {
+        # Channel with no uploads and hence no videos, streams, shorts tabs or uploads playlist. This should fail.
+        # See test_youtube_lists
+        'url': 'https://www.youtube.com/channel/UC2yXPzFejc422buOIzn_0CA',
+        'only_matching': True,
+    }, {
+        # No uploads and no UCID given. Should fail with no uploads error
+        # See test_youtube_lists
+        'url': 'https://www.youtube.com/news',
+        'only_matching': True,
+    }, {
+        # No videos tab but has a shorts tab
+        # TODO: fix metadata extraction
+        'url': 'https://www.youtube.com/c/TKFShorts',
+        'info_dict': {
+            'id': 'UCgJ5_1F6yJhYLnyMszUdmUg',
+            'title': 'Shorts Break - Shorts',
+            'tags': 'count:48',
+            'channel_id': 'UCgJ5_1F6yJhYLnyMszUdmUg',
+            'channel': 'Shorts Break',
+            'description': 'md5:6de33c5e7ba686e5f3efd4e19c7ef499',
+            'channel_follower_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg',
+            'uploader_url': 'https://www.youtube.com/@ShortsBreak_Official',
+            'uploader': 'Shorts Break',
+            'uploader_id': '@ShortsBreak_Official',
+        },
+        'playlist_mincount': 30,
+    }, {
+        # Trending Now tab. Tab id is empty
+        'url': 'https://www.youtube.com/feed/trending',
+        'info_dict': {
+            'id': 'trending',
+            'title': 'trending - Now',
+            'tags': [],
+        },
+        'playlist_mincount': 30,
+    }, {
+        # Trending Gaming tab. Tab id is empty
+        'url': 'https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D',
+        'info_dict': {
+            'id': 'trending',
+            'title': 'trending - Gaming',
+            'tags': [],
+        },
+        'playlist_mincount': 30,
+    }, {
+        # Shorts URL results in shorts tab
+        # TODO: Fix channel id extraction
+        # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
+        'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts',
+        'info_dict': {
+            'id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'title': 'cole-dlp-test-acc - Shorts',
+            'channel': 'cole-dlp-test-acc',
+            'description': 'test description',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'tags': [],
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'uploader_id': '@coletdjnz',
+            'uploader': 'cole-dlp-test-acc',
+        },
+        'playlist': [{
+            'info_dict': {
+                # Channel data is not currently available for short renderers (as of 2023-03-01)
+                '_type': 'url',
+                'ie_key': 'Youtube',
+                'url': 'https://www.youtube.com/shorts/sSM9J5YH_60',
+                'id': 'sSM9J5YH_60',
+                'title': 'SHORT short',
+                'view_count': int,
+                'thumbnails': list,
+            },
+        }],
+        'params': {'extract_flat': True},
+    }, {
+        # Live video status should be extracted
+        # TODO: fix test suite, 208163447408c78673b08c172beafe5c310fb167 broke this test
+        'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live',
+        'info_dict': {
+            'id': 'UCQvWX73GQygcwXOTSf_VDVg',
+            'title': 'UCQvWX73GQygcwXOTSf_VDVg - Live',  # TODO: should be Minecraft - Live or Minecraft - Topic - Live
+            'tags': [],
+        },
+        'playlist': [{
+            'info_dict': {
+                '_type': 'url',
+                'ie_key': 'Youtube',
+                'url': 'startswith:https://www.youtube.com/watch?v=',
+                'id': str,
+                'title': str,
+                'live_status': 'is_live',
+                'channel_id': str,
+                'channel_url': str,
+                'concurrent_view_count': int,
+                'channel': str,
+                'uploader': str,
+                'uploader_url': str,
+                'uploader_id': str,
+                'channel_is_verified': bool,  # this will keep changing
+            },
+        }],
+        'params': {'extract_flat': True, 'playlist_items': '1'},
+        'playlist_mincount': 1,
+    }, {
+        # Channel renderer metadata. Contains number of videos on the channel
+        # TODO: channels tab removed, change this test to use another page with channel renderer
+        'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels',
+        'info_dict': {
+            'id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'title': 'cole-dlp-test-acc - Channels',
+            'channel': 'cole-dlp-test-acc',
+            'description': 'test description',
+            'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+            'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+            'tags': [],
+            'uploader_url': 'https://www.youtube.com/@coletdjnz',
+            'uploader_id': '@coletdjnz',
+            'uploader': 'cole-dlp-test-acc',
+        },
+        'playlist': [{
+            'info_dict': {
+                '_type': 'url',
+                'ie_key': 'YoutubeTab',
+                'url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+                'id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+                'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+                'title': 'PewDiePie',
+                'channel': 'PewDiePie',
+                'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+                'thumbnails': list,
+                'channel_follower_count': int,
+                'playlist_count': int,
+                'uploader': 'PewDiePie',
+                'uploader_url': 'https://www.youtube.com/@PewDiePie',
+                'uploader_id': '@PewDiePie',
+                'channel_is_verified': True,
+            },
+        }],
+        'params': {'extract_flat': True},
+        'skip': 'channels tab removed',
+    }, {
+        # TODO: fix channel_is_verified extraction
+        'url': 'https://www.youtube.com/@3blue1brown/about',
+        'info_dict': {
+            'id': '@3blue1brown',
+            'tags': ['Mathematics'],
+            'title': '3Blue1Brown',
+            'channel_follower_count': int,
+            'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+            'channel': '3Blue1Brown',
+            'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+            'description': 'md5:602e3789e6a0cb7d9d352186b720e395',
+            'uploader_url': 'https://www.youtube.com/@3blue1brown',
+            'uploader_id': '@3blue1brown',
+            'uploader': '3Blue1Brown',
+            'channel_is_verified': True,
+        },
+        'playlist_count': 0,
+    }, {
+        # Podcasts tab, with rich entry lockupViewModel
+        'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts',
+        'info_dict': {
+            'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw',
+            'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw',
+            'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast',
+            'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c',
+            'title': '99% Invisible - Podcasts',
+            'uploader': '99% Invisible',
+            'channel_follower_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw',
+            'tags': [],
+            'channel': '99% Invisible',
+            'uploader_id': '@99percentinvisiblepodcast',
+        },
+        'playlist_count': 5,
+    }, {
+        # Releases tab, with rich entry playlistRenderers (same as Podcasts tab)
+        # TODO: fix channel_is_verified extraction
+        'url': 'https://www.youtube.com/@AHimitsu/releases',
+        'info_dict': {
+            'id': 'UCgFwu-j5-xNJml2FtTrrB3A',
+            'channel': 'A Himitsu',
+            'uploader_url': 'https://www.youtube.com/@AHimitsu',
+            'title': 'A Himitsu - Releases',
+            'uploader_id': '@AHimitsu',
+            'uploader': 'A Himitsu',
+            'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A',
+            'tags': 'count:12',
+            'description': 'I make music',
+            'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A',
+            'channel_follower_count': int,
+            'channel_is_verified': True,
+        },
+        'playlist_mincount': 10,
+    }, {
+        # Playlist with only shorts, shown as reel renderers
+        # FIXME: future: YouTube currently doesn't give continuation for this,
+        # may do so in the future.
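+        # (without a continuation only the first page of entries can be
+        # extracted, so the mincount below covers just a single page of results)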
+        'url': 'https://www.youtube.com/playlist?list=UUxqPAgubo4coVn9Lx1FuKcg',
+        'info_dict': {
+            'id': 'UUxqPAgubo4coVn9Lx1FuKcg',
+            'channel_url': 'https://www.youtube.com/channel/UCxqPAgubo4coVn9Lx1FuKcg',
+            'view_count': int,
+            'uploader_id': '@BangyShorts',
+            'description': '',
+            'uploader_url': 'https://www.youtube.com/@BangyShorts',
+            'channel_id': 'UCxqPAgubo4coVn9Lx1FuKcg',
+            'channel': 'Bangy Shorts',
+            'uploader': 'Bangy Shorts',
+            'tags': [],
+            'availability': 'public',
+            'modified_date': r're:\d{8}',
+            'title': 'Uploads from Bangy Shorts',
+        },
+        'playlist_mincount': 100,
+        'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+    }, {
+        # TODO: fix channel_is_verified extraction
+        'note': 'Tags containing spaces',
+        'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ',
+        'playlist_count': 3,
+        'info_dict': {
+            'id': 'UC7_YxT-KID8kRbqZo7MyscQ',
+            'channel': 'Markiplier',
+            'channel_id': 'UC7_YxT-KID8kRbqZo7MyscQ',
+            'title': 'Markiplier',
+            'channel_follower_count': int,
+            'description': 'md5:0c010910558658824402809750dc5d97',
+            'uploader_id': '@markiplier',
+            'uploader_url': 'https://www.youtube.com/@markiplier',
+            'uploader': 'Markiplier',
+            'channel_url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ',
+            'channel_is_verified': True,
+            'tags': ['markiplier', 'comedy', 'gaming', 'funny videos', 'funny moments',
+                     'sketch comedy', 'laughing', 'lets play', 'challenge videos', 'hilarious',
+                     'challenges', 'sketches', 'scary games', 'funny games', 'rage games',
+                     'mark fischbach'],
+        },
+    }, {
+        # https://github.com/yt-dlp/yt-dlp/issues/12933
+        'note': 'streams tab, some scheduled streams. Empty intermediate response with only continuation - must follow',
+        'url': 'https://www.youtube.com/@sbcitygov/streams',
+        'playlist_mincount': 150,
+        'info_dict': {
+            'id': 'UCH6-qfQwlUgz9SAf05jvc_w',
+            'channel': 'sbcitygov',
+            'channel_id': 'UCH6-qfQwlUgz9SAf05jvc_w',
+            'title': 'sbcitygov - Live',
+            'channel_follower_count': int,
+            'description': 'md5:ca1a92059835c071e33b3db52f4a6d67',
+            'uploader_id': '@sbcitygov',
+            'uploader_url': 'https://www.youtube.com/@sbcitygov',
+            'uploader': 'sbcitygov',
+            'channel_url': 'https://www.youtube.com/channel/UCH6-qfQwlUgz9SAf05jvc_w',
+            'tags': [],
+        },
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if YoutubeIE.suitable(url) else super().suitable(url)
+
+    _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/[^?#/]+))?(?P<post>.*)$')
+
+    def _get_url_mobj(self, url):
+        mobj = self._URL_RE.match(url).groupdict()
+        mobj.update((k, '') for k, v in mobj.items() if v is None)
+        return mobj
+
+    def _extract_tab_id_and_name(self, tab, base_url='https://www.youtube.com'):
+        tab_name = (tab.get('title') or '').lower()
+        tab_url = urljoin(base_url, traverse_obj(
+            tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url')))
+
+        tab_id = ((tab_url and self._get_url_mobj(tab_url)['tab'][1:])
+                  or traverse_obj(tab, 'tabIdentifier', expected_type=str))
+        if tab_id:
+            return {
+                'TAB_ID_SPONSORSHIPS': 'membership',
+            }.get(tab_id, tab_id), tab_name
+
+        # Fall back to the tab name if we cannot get the tab id.
+        # XXX: should we strip non-ascii letters? e.g. for the 'let's play' tab on the special gaming channel above
+        # Note that in the case of a translated tab name this may result in an empty string, which we don't want.
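+        # e.g. with the UI language set to 'ja' the playlists tab is titled
+        # '再生リスト' (see the 'translated tab name' test above), which this
+        # name-based fallback cannot map to a known tab id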
+        if tab_name:
+            self.write_debug(f'Falling back to selected tab name: {tab_name}')
+        return {
+            'home': 'featured',
+            'live': 'streams',
+        }.get(tab_name, tab_name), tab_name
+
+    def _has_tab(self, tabs, tab_id):
+        return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs)
+
+    def _empty_playlist(self, item_id, data):
+        return self.playlist_result([], item_id, **self._extract_metadata_from_tabs(item_id, data))
+
+    @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data
+    def _real_extract(self, url, smuggled_data):
+        item_id = self._match_id(url)
+        url = urllib.parse.urlunparse(
+            urllib.parse.urlparse(url)._replace(netloc='www.youtube.com'))
+        compat_opts = self.get_param('compat_opts', [])
+
+        mobj = self._get_url_mobj(url)
+        pre, tab, post, is_channel = mobj['pre'], mobj['tab'], mobj['post'], not mobj['not_channel']
+        if is_channel and smuggled_data.get('is_music_url'):
+            if item_id[:2] == 'VL':  # Youtube music VL channels have an equivalent playlist
+                return self.url_result(
+                    f'https://music.youtube.com/playlist?list={item_id[2:]}', YoutubeTabIE, item_id[2:])
+            elif item_id[:2] == 'MP':  # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
+                mdata = self._extract_tab_endpoint(
+                    f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music')
+                murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'),
+                                    get_all=False, expected_type=str)
+                if not murl:
+                    raise ExtractorError('Failed to resolve album to playlist')
+                return self.url_result(murl, YoutubeTabIE)
+            elif mobj['channel_type'] == 'browse':  # Youtube music /browse/ should be changed to /channel/
+                return self.url_result(
+                    f'https://music.youtube.com/channel/{item_id}{tab}{post}', YoutubeTabIE, item_id)
+
+        original_tab_id, display_id = tab[1:], f'{item_id}{tab}'
+        if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
+            url = f'{pre}/videos{post}'
+        if smuggled_data.get('is_music_url'):
+            self.report_warning(f'YouTube Music is not directly supported. Redirecting to {url}')
+
+        # Handle both video/playlist URLs
+        qs = parse_qs(url)
+        video_id, playlist_id = (traverse_obj(qs, (key, 0)) for key in ('v', 'list'))
+        if not video_id and mobj['not_channel'].startswith('watch'):
+            if not playlist_id:
+                # If there is neither a video nor a playlist ID, YouTube redirects to the home page, which is undesirable
+                raise ExtractorError('A video URL was given without video ID', expected=True)
+            # Common mistake: https://www.youtube.com/watch?list=playlist_id
+            self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}')
+            return self.url_result(
+                f'https://www.youtube.com/playlist?list={playlist_id}', YoutubeTabIE, playlist_id)
+
+        if not self._yes_playlist(playlist_id, video_id):
+            return self.url_result(
+                f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id)
+
+        data, ytcfg = self._extract_data(url, display_id)
+
+        # YouTube may provide a non-standard redirect to the regional channel
+        # See: https://github.com/yt-dlp/yt-dlp/issues/2694
+        # https://support.google.com/youtube/answer/2976814#zippy=,conditional-redirects
+        redirect_url = traverse_obj(
+            data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False)
+        if redirect_url and 'no-youtube-channel-redirect' not in compat_opts:
+            redirect_url = ''.join((urljoin('https://www.youtube.com', redirect_url), tab, post))
+            self.to_screen(f'This playlist is likely not available in your region. Following conditional redirect to {redirect_url}')
+            return self.url_result(redirect_url, YoutubeTabIE)
+
+        tabs, extra_tabs = self._extract_tab_renderers(data), []
+        if is_channel and tabs and 'no-youtube-channel-redirect' not in compat_opts:
+            selected_tab = self._extract_selected_tab(tabs)
+            selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url)  # NB: Name may be translated
+            self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}')
+
+            # /about is no longer a tab
+            if original_tab_id == 'about':
+                return self._empty_playlist(item_id, data)
+
+            if not original_tab_id and selected_tab_name:
+                self.to_screen('Downloading all uploads of the channel. '
+                               'To download only the videos in a specific tab, pass the tab\'s URL')
+                if self._has_tab(tabs, 'streams'):
+                    extra_tabs.append(''.join((pre, '/streams', post)))
+                if self._has_tab(tabs, 'shorts'):
+                    extra_tabs.append(''.join((pre, '/shorts', post)))
+                # XXX: Members-only tab should also be extracted
+
+                if not extra_tabs and selected_tab_id != 'videos':
+                    # Channel does not have streams, shorts or videos tabs
+                    if item_id[:2] != 'UC':
+                        return self._empty_playlist(item_id, data)
+
+                    # Topic channels don't have /videos. Use the equivalent playlist instead
+                    pl_id = f'UU{item_id[2:]}'
+                    pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
+                    try:
+                        data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True)
+                    except ExtractorError:
+                        return self._empty_playlist(item_id, data)
+                    else:
+                        item_id, url = pl_id, pl_url
+                        self.to_screen(
+                            f'The channel does not have a videos, shorts, or live tab. Redirecting to playlist {pl_id} instead')
+
+                elif extra_tabs and selected_tab_id != 'videos':
+                    # When there are shorts/live tabs but no videos tab
+                    url, data = f'{pre}{post}', None
+
+            elif (original_tab_id or 'videos') != selected_tab_id:
+                if original_tab_id == 'live':
+                    # Live tab should have redirected to the video
+                    # Except in the case where the channel has an actual live tab
+                    # Example: https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live
+                    raise UserNotLive(video_id=item_id)
+                elif selected_tab_name:
+                    raise ExtractorError(f'This channel does not have a {original_tab_id} tab', expected=True)
+
+                # For channels such as https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg
+                url = f'{pre}{post}'
+
+        # YouTube sometimes provides a button to reload playlist with unavailable videos.
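+        # _reload_with_unavailable_videos below emulates pressing that button,
+        # so that unavailable entries are still listed; the
+        # 'no-youtube-unavailable-videos' compat option opts out of this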
+        if 'no-youtube-unavailable-videos' not in compat_opts:
+            data = self._reload_with_unavailable_videos(display_id, data, ytcfg) or data
+        self._extract_and_report_alerts(data, only_once=True)
+
+        tabs, entries = self._extract_tab_renderers(data), []
+        if tabs:
+            entries = [self._extract_from_tabs(item_id, ytcfg, data, tabs)]
+            entries[0].update({
+                'extractor_key': YoutubeTabIE.ie_key(),
+                'extractor': YoutubeTabIE.IE_NAME,
+                'webpage_url': url,
+            })
+        if self.get_param('playlist_items') == '0':
+            entries.extend(self.url_result(u, YoutubeTabIE) for u in extra_tabs)
+        else:  # Users expect to get all `video_id`s even with `--flat-playlist`. So don't return `url_result`
+            entries.extend(map(self._real_extract, extra_tabs))
+
+        if len(entries) == 1:
+            return entries[0]
+        elif entries:
+            metadata = self._extract_metadata_from_tabs(item_id, data)
+            uploads_url = 'the Uploads (UU) playlist URL'
+            if try_get(metadata, lambda x: x['channel_id'].startswith('UC')):
+                uploads_url = f'https://www.youtube.com/playlist?list=UU{metadata["channel_id"][2:]}'
+            self.to_screen(
+                'Downloading as multiple playlists, separated by tabs. '
+                f'To download as a single playlist instead, pass {uploads_url}')
+            return self.playlist_result(entries, item_id, **metadata)
+
+        # Inline playlist
+        playlist = traverse_obj(
+            data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict)
+        if playlist:
+            return self._extract_from_playlist(item_id, url, data, playlist, ytcfg)
+
+        video_id = traverse_obj(
+            data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id
+        if video_id:
+            if tab != '/live':  # live tab is expected to redirect to video
+                self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}')
+            return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id)
+
+        raise ExtractorError('Unable to recognize tab page')
+
+
+# xxx: This is tightly coupled to YoutubeTabBaseInfoExtractor. Should be decoupled at some point
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+    IE_DESC = 'YouTube playlists'
+    _VALID_URL = r'''(?x)(?:
+                        (?:https?://)?
+                        (?:\w+\.)?
+                        (?:
+                            (?:
+                                youtube(?:kids)?\.com|
+                                {invidious}
+                            )
+                            /.*?\?.*?\blist=
+                        )?
+                        (?P<id>{playlist_id})
+                     )'''.format(
+        playlist_id=YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
+        invidious='|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES),
+    )
+    IE_NAME = 'youtube:playlist'
+    _TESTS = [{
+        'note': 'issue #673',
+        'url': 'PLBB231211A4F62143',
+        'info_dict': {
+            'title': '[OLD]Team Fortress 2 (Class-based LP)',
+            'id': 'PLBB231211A4F62143',
+            'uploader': 'Wickman',
+            'uploader_id': '@WickmanVT',
+            'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2',
+            'view_count': int,
+            'uploader_url': 'https://www.youtube.com/@WickmanVT',
+            'modified_date': r're:\d{8}',
+            'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+            'channel': 'Wickman',
+            'tags': [],
+            'channel_url': 'https://www.youtube.com/channel/UCKSpbfbl5kRQpTdL7kMc-1Q',
+            'availability': 'public',
+        },
+        'playlist_mincount': 29,
+    }, {
+        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+        'info_dict': {
+            'title': 'YDL_safe_search',
+            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+        },
+        'playlist_count': 2,
+        'skip': 'This playlist is private',
+    }, {
+        'note': 'embedded',
+        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+        'playlist_count': 4,
+        'info_dict': {
+            'title': 'JODA15',
+            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+            'uploader': 'milan',
+            'uploader_id': '@milan5503',
+            'description': '',
+            'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw',
+            'tags': [],
+            'modified_date': '20140919',
+            'view_count': int,
+            'channel': 'milan',
+            'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
+            'uploader_url': 'https://www.youtube.com/@milan5503',
+            'availability': 'public',
+        },
+        'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden', 'Retrying', 'Giving up'],
+    }, {
+        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+        'playlist_mincount': 455,
+        'info_dict': {
+            'title': '2018 Chinese New Singles (11/6 updated)',
+            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+            'uploader': 'LBK',
+            'uploader_id': '@music_king',
+            'description': 'md5:da521864744d60a198e3a88af4db0d9d',
+            'channel': 'LBK',
+            'view_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UC21nz3_MesPLqtDqwdvnoxA',
+            'tags': [],
+            'uploader_url': 'https://www.youtube.com/@music_king',
+            'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
+            'modified_date': r're:\d{8}',
+            'availability': 'public',
+        },
+        'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
+    }, {
+        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
+        'only_matching': True,
+    }, {
+        # music album playlist
+        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        if YoutubeTabIE.suitable(url):
+            return False
+        from yt_dlp.utils import parse_qs
+        qs = parse_qs(url)
+        if qs.get('v', [None])[0]:
+            return False
+        return super().suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
+        url = update_url_query(
+            'https://www.youtube.com/playlist',
+            parse_qs(url) or {'list': playlist_id})
+        if is_music_url:
+            url = smuggle_url(url, {'is_music_url': True})
+        return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube/_video.py
similarity index 50%
rename from yt_dlp/extractor/youtube.py
rename to yt_dlp/extractor/youtube/_video.py
index 0eed98c09..6c2d2fcb6 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube/_video.py
@@ -1,36 +1,36 @@
 import base64
 import binascii
-import calendar
 import collections
-import copy
 import datetime as dt
-import enum
 import functools
-import hashlib
 import itertools
 import json
 import math
 import os.path
 import random
 import re
-import shlex
 import sys
 import threading
 import time
 import traceback
 import urllib.parse
 
-from .common import InfoExtractor, SearchInfoExtractor
-from .openload import PhantomJSwrapper
-from ..jsinterp import JSInterpreter
-from ..networking.exceptions import HTTPError, network_exceptions
-from ..utils import (
+from ._base import (
+    INNERTUBE_CLIENTS,
+    BadgeType,
+    YoutubeBaseInfoExtractor,
+    _PoTokenContext,
+    _split_innertube_client,
+    short_client_name,
+)
+from ..openload import PhantomJSwrapper
+from ...jsinterp import JSInterpreter
+from ...networking.exceptions import HTTPError
+from ...utils import (
     NO_DEFAULT,
     ExtractorError,
     LazyList,
-    UserNotLive,
     bug_reports_message,
-    classproperty,
     clean_html,
     datetime_from_str,
     filesize_from_tbr,
@@ -39,7 +39,6 @@
     format_field,
     get_first,
     int_or_none,
-    is_html,
     join_nonempty,
     js_to_json,
     mimetype2ext,
@@ -61,7 +60,6 @@
     try_get,
     unescapeHTML,
     unified_strdate,
-    unified_timestamp,
     unsmuggle_url,
     update_url_query,
     url_or_none,
@@ -74,1199 +72,6 @@
 PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide'
 
-class _PoTokenContext(enum.Enum):
-    PLAYER = 'player'
-    GVS = 'gvs'
-
-
-# any clients starting with _ cannot be explicitly requested by the user
-INNERTUBE_CLIENTS = {
-    'web': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'WEB',
-                'clientVersion': '2.20241126.01.00',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'SUPPORTS_COOKIES': True,
-    },
-    # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats
-    'web_safari': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'WEB',
-                'clientVersion': '2.20241126.01.00',
-                'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'SUPPORTS_COOKIES': True,
-    },
-    'web_embedded': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'WEB_EMBEDDED_PLAYER',
-                'clientVersion': '1.20241201.00.00',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 56,
-        'SUPPORTS_COOKIES': True,
-    },
-    'web_music': {
-        'INNERTUBE_HOST': 'music.youtube.com',
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'WEB_REMIX',
-                'clientVersion': '1.20241127.01.00',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 67,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'SUPPORTS_COOKIES': True,
-    },
-    # This client now requires sign-in for every video
-    'web_creator': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'WEB_CREATOR',
-                'clientVersion': '1.20241203.01.00',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 62,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'REQUIRE_AUTH': True,
-        'SUPPORTS_COOKIES': True,
-    },
-    'android': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'ANDROID',
-                'clientVersion': '19.44.38',
-                'androidSdkVersion': 30,
-                'userAgent': 'com.google.android.youtube/19.44.38 (Linux; U; Android 11) gzip',
-                'osName': 'Android',
-                'osVersion': '11',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
-        'REQUIRE_JS_PLAYER': False,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-    },
-    # This client now requires sign-in for every video
-    'android_music': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'ANDROID_MUSIC',
-                'clientVersion': '7.27.52',
-                'androidSdkVersion': 30,
-                'userAgent': 'com.google.android.apps.youtube.music/7.27.52 (Linux; U; Android 11) gzip',
-                'osName': 'Android',
-                'osVersion': '11',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 21,
-        'REQUIRE_JS_PLAYER': False,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'REQUIRE_AUTH': True,
-    },
-    # This client now requires sign-in for every video
-    'android_creator': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'ANDROID_CREATOR',
-                'clientVersion': '24.45.100',
-                'androidSdkVersion': 30,
-                'userAgent': 'com.google.android.apps.youtube.creator/24.45.100 (Linux; U; Android 11) gzip',
-                'osName': 'Android',
-                'osVersion': '11',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 14,
-        'REQUIRE_JS_PLAYER': False,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'REQUIRE_AUTH': True,
-    },
-    # YouTube Kids videos aren't returned on this client for some reason
-    'android_vr': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'ANDROID_VR',
-                'clientVersion': '1.60.19',
-                'deviceMake': 'Oculus',
-                'deviceModel': 'Quest 3',
-                'androidSdkVersion': 32,
-                'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.60.19 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip',
-                'osName': 'Android',
-                'osVersion': '12L',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 28,
-        'REQUIRE_JS_PLAYER': False,
-    },
-    # iOS clients have HLS live streams. Setting device model to get 60fps formats.
-    # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558
-    'ios': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'IOS',
-                'clientVersion': '20.03.02',
-                'deviceMake': 'Apple',
-                'deviceModel': 'iPhone16,2',
-                'userAgent': 'com.google.ios.youtube/20.03.02 (iPhone16,2; U; CPU iOS 18_2_1 like Mac OS X;)',
-                'osName': 'iPhone',
-                'osVersion': '18.2.1.22C161',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'REQUIRE_JS_PLAYER': False,
-    },
-    # This client now requires sign-in for every video
-    'ios_music': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'IOS_MUSIC',
-                'clientVersion': '7.27.0',
-                'deviceMake': 'Apple',
-                'deviceModel': 'iPhone16,2',
-                'userAgent': 'com.google.ios.youtubemusic/7.27.0 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)',
-                'osName': 'iPhone',
-                'osVersion': '18.1.0.22B83',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 26,
-        'REQUIRE_JS_PLAYER': False,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'REQUIRE_AUTH': True,
-    },
-    # This client now requires sign-in for every video
-    'ios_creator': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'IOS_CREATOR',
-                'clientVersion': '24.45.100',
-                'deviceMake': 'Apple',
-                'deviceModel': 'iPhone16,2',
-                'userAgent': 'com.google.ios.ytcreator/24.45.100 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)',
-                'osName': 'iPhone',
-                'osVersion': '18.1.0.22B83',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 15,
-        'REQUIRE_JS_PLAYER': False,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'REQUIRE_AUTH': True,
-    },
-    # mweb has 'ultralow' formats
-    # See: https://github.com/yt-dlp/yt-dlp/pull/557
-    'mweb': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'MWEB',
-                'clientVersion': '2.20241202.07.00',
-                # mweb previously did not require PO Token with this UA
-                'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 2,
-        'PO_TOKEN_REQUIRED_CONTEXTS': [_PoTokenContext.GVS],
-        'SUPPORTS_COOKIES': True,
-    },
-    'tv': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'TVHTML5',
-                'clientVersion': '7.20250120.19.00',
-                'userAgent': 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 7,
-        'SUPPORTS_COOKIES': True,
-    },
-    # This client now requires sign-in for every video
-    # It was previously an age-gate workaround for videos that were `playable_in_embed`
-    # It may still be useful if signed into an EU account that is not age-verified
-    'tv_embedded': {
-        'INNERTUBE_CONTEXT': {
-            'client': {
-                'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
-                'clientVersion': '2.0',
-            },
-        },
-        'INNERTUBE_CONTEXT_CLIENT_NAME': 85,
-        'REQUIRE_AUTH': True,
-        'SUPPORTS_COOKIES': True,
-    },
-}
-
-
-def _split_innertube_client(client_name):
-    variant, *base = client_name.rsplit('.', 1)
-    if base:
-        return variant, base[0], variant
-    base, *variant = client_name.split('_', 1)
-    return client_name, base, variant[0] if variant else None
-
-
-def short_client_name(client_name):
-    main, *parts = _split_innertube_client(client_name)[0].split('_')
-    return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper()
-
-
-def build_innertube_clients():
-    THIRD_PARTY = {
-        'embedUrl': 'https://www.youtube.com/',  # Can be any valid URL
-    }
-    BASE_CLIENTS = ('ios', 'web', 'tv', 'mweb', 'android')
-    priority = qualities(BASE_CLIENTS[::-1])
-
-    for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
-        ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com')
-        ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
-        ytcfg.setdefault('PO_TOKEN_REQUIRED_CONTEXTS', [])
-        ytcfg.setdefault('REQUIRE_AUTH', False)
-        ytcfg.setdefault('SUPPORTS_COOKIES', False)
-        ytcfg.setdefault('PLAYER_PARAMS', None)
-        ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
-
-        _, base_client, variant = _split_innertube_client(client)
-        ytcfg['priority'] = 10 * priority(base_client)
-
-        if variant == 'embedded':
-            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
-            ytcfg['priority'] -= 2
-        elif variant:
-            ytcfg['priority'] -= 3
-
-
-build_innertube_clients()
-
-
-class BadgeType(enum.Enum):
-    AVAILABILITY_UNLISTED = enum.auto()
-    AVAILABILITY_PRIVATE = enum.auto()
-    AVAILABILITY_PUBLIC = enum.auto()
-    AVAILABILITY_PREMIUM = enum.auto()
-    AVAILABILITY_SUBSCRIPTION = enum.auto()
-    LIVE_NOW = enum.auto()
-    VERIFIED = enum.auto()
-
-
-class YoutubeBaseInfoExtractor(InfoExtractor):
-    """Provide base functions for Youtube extractors"""
-
-    _RESERVED_NAMES = (
-        r'channel|c|user|playlist|watch|w|v|embed|e|live|watch_popup|clip|'
-        r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|'
-        r'browse|oembed|get_video_info|iframe_api|s/player|source|'
-        r'storefront|oops|index|account|t/terms|about|upload|signin|logout')
-
-    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
-
-    # _NETRC_MACHINE = 'youtube'
-
-    # If True it will raise an error if no login info is provided
-    _LOGIN_REQUIRED = False
-
-    _INVIDIOUS_SITES = (
-        # invidious-redirect websites
-        r'(?:www\.)?redirect\.invidious\.io',
-        r'(?:(?:www|dev)\.)?invidio\.us',
-        # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/docs/instances.md
-        r'(?:www\.)?invidious\.pussthecat\.org',
-        r'(?:www\.)?invidious\.zee\.li',
-        r'(?:www\.)?invidious\.ethibox\.fr',
-        r'(?:www\.)?iv\.ggtyler\.dev',
-        r'(?:www\.)?inv\.vern\.i2p',
-        r'(?:www\.)?am74vkcrjp2d5v36lcdqgsj2m6x36tbrkhsruoegwfcizzabnfgf5zyd\.onion',
-        r'(?:www\.)?inv\.riverside\.rocks',
-        r'(?:www\.)?invidious\.silur\.me',
-        r'(?:www\.)?inv\.bp\.projectsegfau\.lt',
-        r'(?:www\.)?invidious\.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid\.onion',
-        r'(?:www\.)?invidious\.slipfox\.xyz',
-        r'(?:www\.)?invidious\.esmail5pdn24shtvieloeedh7ehz3nrwcdivnfhfcedl7gf4kwddhkqd\.onion',
-        r'(?:www\.)?inv\.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad\.onion',
-        r'(?:www\.)?invidious\.tiekoetter\.com',
-        r'(?:www\.)?iv\.odysfvr23q5wgt7i456o5t3trw2cw5dgn56vbjfbq2m7xsc5vqbqpcyd\.onion',
-        r'(?:www\.)?invidious\.nerdvpn\.de',
-        r'(?:www\.)?invidious\.weblibre\.org',
-        r'(?:www\.)?inv\.odyssey346\.dev',
-        r'(?:www\.)?invidious\.dhusch\.de',
-        r'(?:www\.)?iv\.melmac\.space',
-        r'(?:www\.)?watch\.thekitty\.zone',
-        r'(?:www\.)?invidious\.privacydev\.net',
-        r'(?:www\.)?ng27owmagn5amdm7l5s3rsqxwscl5ynppnis5dqcasogkyxcfqn7psid\.onion',
-        r'(?:www\.)?invidious\.drivet\.xyz',
-        r'(?:www\.)?vid\.priv\.au',
-        r'(?:www\.)?euxxcnhsynwmfidvhjf6uzptsmh4dipkmgdmcmxxuo7tunp3ad2jrwyd\.onion',
-        r'(?:www\.)?inv\.vern\.cc',
-        r'(?:www\.)?invidious\.esmailelbob\.xyz',
-        r'(?:www\.)?invidious\.sethforprivacy\.com',
-        r'(?:www\.)?yt\.oelrichsgarcia\.de',
-        r'(?:www\.)?yt\.artemislena\.eu',
-        r'(?:www\.)?invidious\.flokinet\.to',
-        r'(?:www\.)?invidious\.baczek\.me',
-        r'(?:www\.)?y\.com\.sb',
-        r'(?:www\.)?invidious\.epicsite\.xyz',
-        r'(?:www\.)?invidious\.lidarshield\.cloud',
-        r'(?:www\.)?yt\.funami\.tech',
-        r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
-        r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion',
-        r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion',
-        # youtube-dl invidious instances list
-        r'(?:(?:www|no)\.)?invidiou\.sh',
-        r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
-        r'(?:www\.)?invidious\.kabi\.tk',
-        r'(?:www\.)?invidious\.mastodon\.host',
-        r'(?:www\.)?invidious\.zapashcanon\.fr',
-        r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
-        r'(?:www\.)?invidious\.tinfoil-hat\.net',
-        r'(?:www\.)?invidious\.himiko\.cloud',
-        r'(?:www\.)?invidious\.reallyancient\.tech',
-        r'(?:www\.)?invidious\.tube',
-        r'(?:www\.)?invidiou\.site',
-        r'(?:www\.)?invidious\.site',
-        r'(?:www\.)?invidious\.xyz',
-        r'(?:www\.)?invidious\.nixnet\.xyz',
-        r'(?:www\.)?invidious\.048596\.xyz',
-        r'(?:www\.)?invidious\.drycat\.fr',
-        r'(?:www\.)?inv\.skyn3t\.in',
-        r'(?:www\.)?tube\.poal\.co',
-        r'(?:www\.)?tube\.connect\.cafe',
-        r'(?:www\.)?vid\.wxzm\.sx',
-        r'(?:www\.)?vid\.mint\.lgbt',
-        r'(?:www\.)?vid\.puffyan\.us',
-        r'(?:www\.)?yewtu\.be',
-        r'(?:www\.)?yt\.elukerio\.org',
-        r'(?:www\.)?yt\.lelux\.fi',
-        r'(?:www\.)?invidious\.ggc-project\.de',
-        r'(?:www\.)?yt\.maisputain\.ovh',
-        r'(?:www\.)?ytprivate\.com',
-        r'(?:www\.)?invidious\.13ad\.de',
-        r'(?:www\.)?invidious\.toot\.koeln',
-        r'(?:www\.)?invidious\.fdn\.fr',
-        r'(?:www\.)?watch\.nettohikari\.com',
-        r'(?:www\.)?invidious\.namazso\.eu',
-        r'(?:www\.)?invidious\.silkky\.cloud',
-        r'(?:www\.)?invidious\.exonip\.de',
-        r'(?:www\.)?invidious\.riverside\.rocks',
-        r'(?:www\.)?invidious\.blamefran\.net',
-        r'(?:www\.)?invidious\.moomoo\.de',
-        r'(?:www\.)?ytb\.trom\.tf',
-        r'(?:www\.)?yt\.cyberhost\.uk',
-        r'(?:www\.)?kgg2m7yk5aybusll\.onion',
-        r'(?:www\.)?qklhadlycap4cnod\.onion',
-        r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
-        r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
-        r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
-        r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
-        r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
-        r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
-        r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
-        r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
-        r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
-        r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
-        # piped instances from https://github.com/TeamPiped/Piped/wiki/Instances
-        r'(?:www\.)?piped\.kavin\.rocks',
-        r'(?:www\.)?piped\.tokhmi\.xyz',
-        r'(?:www\.)?piped\.syncpundit\.io',
-        r'(?:www\.)?piped\.mha\.fi',
-        r'(?:www\.)?watch\.whatever\.social',
-        r'(?:www\.)?piped\.garudalinux\.org',
-        r'(?:www\.)?piped\.rivo\.lol',
-        r'(?:www\.)?piped-libre\.kavin\.rocks',
-        r'(?:www\.)?yt\.jae\.fi',
-        r'(?:www\.)?piped\.mint\.lgbt',
-        r'(?:www\.)?il\.ax',
-        r'(?:www\.)?piped\.esmailelbob\.xyz',
-        r'(?:www\.)?piped\.projectsegfau\.lt',
-        r'(?:www\.)?piped\.privacydev\.net',
-        r'(?:www\.)?piped\.palveluntarjoaja\.eu',
-        r'(?:www\.)?piped\.smnz\.de',
-        r'(?:www\.)?piped\.adminforge\.de',
-        r'(?:www\.)?watch\.whatevertinfoil\.de',
-        r'(?:www\.)?piped\.qdi\.fi',
-        r'(?:(?:www|cf)\.)?piped\.video',
-        r'(?:www\.)?piped\.aeong\.one',
-        r'(?:www\.)?piped\.moomoo\.me',
-        r'(?:www\.)?piped\.chauvet\.pro',
-        r'(?:www\.)?watch\.leptons\.xyz',
-        r'(?:www\.)?pd\.vern\.cc',
-        r'(?:www\.)?piped\.hostux\.net',
-        r'(?:www\.)?piped\.lunar\.icu',
-        # Hyperpipe instances from https://hyperpipe.codeberg.page/
-        r'(?:www\.)?hyperpipe\.surge\.sh',
-        r'(?:www\.)?hyperpipe\.esmailelbob\.xyz',
-        r'(?:www\.)?listen\.whatever\.social',
-        r'(?:www\.)?music\.adminforge\.de',
-    )
-
-    # extracted from account/account_menu ep
-    # XXX: These are the supported YouTube UI and API languages,
-    # which is slightly different from languages supported for translation in YouTube studio
-    _SUPPORTED_LANG_CODES = [
-        'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es',
-        'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv',
-        'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi',
-        'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw',
-        'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml',
-        'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko',
-    ]
-
-    _IGNORED_WARNINGS = {
-        'Unavailable videos will be hidden during playback',
-        'Unavailable videos are hidden',
-    }
-
-    _YT_HANDLE_RE = r'@[\w.-]{3,30}'  # https://support.google.com/youtube/answer/11585688?hl=en
-    _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}'
-
-    _NETRC_MACHINE = 'youtube'
-
-    def ucid_or_none(self, ucid):
-        return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None)
-
-    def handle_or_none(self, handle):
-        return self._search_regex(rf'^({self._YT_HANDLE_RE})$', urllib.parse.unquote(handle or ''),
-                                  '@-handle', default=None)
-
-    def handle_from_url(self, url):
-        return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})',
-                                  urllib.parse.unquote(url or ''), 'channel handle', default=None)
-
-    def ucid_from_url(self, url):
-        return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})',
-                                  url, 'channel id', default=None)
-
-    @functools.cached_property
-    def _preferred_lang(self):
-        """
-        Returns a language code supported by YouTube for the user preferred language.
-        Returns None if no preferred language set.
-        """
-        preferred_lang = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])[0]
-        if not preferred_lang:
-            return
-        if preferred_lang not in self._SUPPORTED_LANG_CODES:
-            raise ExtractorError(
-                f'Unsupported language code: {preferred_lang}. Supported language codes (case-sensitive): {join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")}.',
-                expected=True)
-        elif preferred_lang != 'en':
-            self.report_warning(
-                f'Preferring "{preferred_lang}" translated fields. Note that some metadata extraction may fail or be incorrect.')
-        return preferred_lang
-
-    def _initialize_consent(self):
-        cookies = self._get_cookies('https://www.youtube.com/')
-        if cookies.get('__Secure-3PSID'):
-            return
-        socs = cookies.get('SOCS')
-        if socs and not socs.value.startswith('CAA'):  # not consented
-            return
-        self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True)  # accept all (required for mixes)
-
-    def _initialize_pref(self):
-        cookies = self._get_cookies('https://www.youtube.com/')
-        pref_cookie = cookies.get('PREF')
-        pref = {}
-        if pref_cookie:
-            try:
-                pref = dict(urllib.parse.parse_qsl(pref_cookie.value))
-            except ValueError:
-                self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
-        pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'})
-        self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref))
-
-    def _initialize_cookie_auth(self):
-        yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies()
-        if yt_sapisid or yt_1psapisid or yt_3psapisid:
-            self.write_debug('Found YouTube account cookies')
-
-    def _real_initialize(self):
-        self._initialize_pref()
-        self._initialize_consent()
-        self._initialize_cookie_auth()
-        self._check_login_required()
-
-    def _perform_login(self, username, password):
-        if username.startswith('oauth'):
-            raise ExtractorError(
-                f'Login with OAuth is no longer supported. {self._youtube_login_hint}', expected=True)
-
-        self.report_warning(
-            f'Login with password is not supported for YouTube. {self._youtube_login_hint}')
-
-    @property
-    def _youtube_login_hint(self):
-        return (f'{self._login_hint(method="cookies")}. Also see '
-                'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies '
-                'for tips on effectively exporting YouTube cookies')
-
-    def _check_login_required(self):
-        if self._LOGIN_REQUIRED and not self.is_authenticated:
-            self.raise_login_required(
-                f'Login details are needed to download this content. {self._youtube_login_hint}', method=None)
-
-    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*='
-    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*='
-
-    def _get_default_ytcfg(self, client='web'):
-        return copy.deepcopy(INNERTUBE_CLIENTS[client])
-
-    def _get_innertube_host(self, client='web'):
-        return INNERTUBE_CLIENTS[client]['INNERTUBE_HOST']
-
-    def _ytcfg_get_safe(self, ytcfg, getter, expected_type=None, default_client='web'):
-        # try_get but with fallback to default ytcfg client values when present
-        _func = lambda y: try_get(y, getter, expected_type)
-        return _func(ytcfg) or _func(self._get_default_ytcfg(default_client))
-
-    def _extract_client_name(self, ytcfg, default_client='web'):
-        return self._ytcfg_get_safe(
-            ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'],
-                    lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), str, default_client)
-
-    def _extract_client_version(self, ytcfg, default_client='web'):
-        return self._ytcfg_get_safe(
-            ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'],
-                    lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), str, default_client)
-
-    def _select_api_hostname(self, req_api_hostname, default_client=None):
-        return (self._configuration_arg('innertube_host', [''], ie_key=YoutubeIE.ie_key())[0]
-                or req_api_hostname or self._get_innertube_host(default_client or 'web'))
-
-    def _extract_context(self, ytcfg=None, default_client='web'):
-        context = get_first(
-            (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
-        # Enforce language and tz for extraction
-        client_context = traverse_obj(context, 'client', expected_type=dict, default={})
-        client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0})
-        return context
-
-    @staticmethod
-    def _make_sid_authorization(scheme, sid, origin, additional_parts):
-        timestamp = str(round(time.time()))
-
-        hash_parts = []
-        if additional_parts:
-            hash_parts.append(':'.join(additional_parts.values()))
-        hash_parts.extend([timestamp, sid, origin])
-        sidhash = hashlib.sha1(' '.join(hash_parts).encode()).hexdigest()
-
-        parts = [timestamp, sidhash]
-        if additional_parts:
-            parts.append(''.join(additional_parts))
-
-        return f'{scheme} {"_".join(parts)}'
-
-    def _get_sid_cookies(self):
-        """
-        Get SAPISID, 1PSAPISID, 3PSAPISID cookie values
-        @returns sapisid, 1psapisid, 3psapisid
-        """
-        yt_cookies = self._get_cookies('https://www.youtube.com')
-        yt_sapisid = try_call(lambda: yt_cookies['SAPISID'].value)
-        yt_3papisid = try_call(lambda: yt_cookies['__Secure-3PAPISID'].value)
-        yt_1papisid = try_call(lambda: yt_cookies['__Secure-1PAPISID'].value)
-
-        # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
-        # YouTube also falls back to __Secure-3PAPISID if SAPISID is missing.
-        # See: https://github.com/yt-dlp/yt-dlp/issues/393
-
-        return yt_sapisid or yt_3papisid, yt_1papisid, yt_3papisid
-
-    def _get_sid_authorization_header(self, origin='https://www.youtube.com', user_session_id=None):
-        """
-        Generate API Session ID Authorization for Innertube requests. Assumes all requests are secure (https).
-        @param origin: Origin URL
-        @param user_session_id: Optional User Session ID
-        @return: Authorization header value
-        """
-
-        authorizations = []
-        additional_parts = {}
-        if user_session_id:
-            additional_parts['u'] = user_session_id
-
-        yt_sapisid, yt_1psapisid, yt_3psapisid = self._get_sid_cookies()
-
-        for scheme, sid in (('SAPISIDHASH', yt_sapisid),
-                            ('SAPISID1PHASH', yt_1psapisid),
-                            ('SAPISID3PHASH', yt_3psapisid)):
-            if sid:
-                authorizations.append(self._make_sid_authorization(scheme, sid, origin, additional_parts))
-
-        if not authorizations:
-            return None
-
-        return ' '.join(authorizations)
-
-    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
-                  note='Downloading API JSON', errnote='Unable to download API page',
-                  context=None, api_key=None, api_hostname=None, default_client='web'):
-
-        data = {'context': context} if context else {'context': self._extract_context(default_client=default_client)}
-        data.update(query)
-        real_headers = self.generate_api_headers(default_client=default_client)
-        real_headers.update({'content-type': 'application/json'})
-        if headers:
-            real_headers.update(headers)
-        return self._download_json(
-            f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}',
-            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
-            data=json.dumps(data).encode('utf8'), headers=real_headers,
-            query=filter_dict({
-                'key': self._configuration_arg(
-                    'innertube_key', [api_key], ie_key=YoutubeIE.ie_key(), casesense=True)[0],
-                'prettyPrint': 'false',
-            }, cndn=lambda _, v: v))
-
-    def extract_yt_initial_data(self, item_id, webpage, fatal=True):
-        return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal)
-
-    @staticmethod
-    def _extract_session_index(*data):
-        """
-        Index of current account in account list.
-        See: https://github.com/yt-dlp/yt-dlp/pull/519
-        """
-        for ytcfg in data:
-            session_index = int_or_none(try_get(ytcfg, lambda x: x['SESSION_INDEX']))
-            if session_index is not None:
-                return session_index
-
-    @staticmethod
-    def _parse_data_sync_id(data_sync_id):
-        """
-        Parse data_sync_id into delegated_session_id and user_session_id.
-
-        data_sync_id is of the form "delegated_session_id||user_session_id" for secondary channel
-        and just "user_session_id||" for primary channel.
-
-        @param data_sync_id: data_sync_id string
-        @return: Tuple of (delegated_session_id, user_session_id)
-        """
-        if not data_sync_id:
-            return None, None
-        first, _, second = data_sync_id.partition('||')
-        if second:
-            return first, second
-        return None, first
-
-    def _extract_delegated_session_id(self, *args):
-        """
-        Extract current delegated session ID required to download private playlists of secondary channels
-        @params response and/or ytcfg
-        @return: delegated session ID
-        """
-        # ytcfg includes channel_syncid if on secondary channel
-        if delegated_sid := traverse_obj(args, (..., 'DELEGATED_SESSION_ID', {str}, any)):
-            return delegated_sid
-
-        data_sync_id = self._extract_data_sync_id(*args)
-        return self._parse_data_sync_id(data_sync_id)[0]
-
-    def _extract_user_session_id(self, *args):
-        """
-        Extract current user session ID
-        @params response and/or ytcfg
-        @return: user session ID
-        """
-        if user_sid := traverse_obj(args, (..., 'USER_SESSION_ID', {str}, any)):
-            return user_sid
-
-        data_sync_id = self._extract_data_sync_id(*args)
-        return self._parse_data_sync_id(data_sync_id)[1]
-
-    def _extract_data_sync_id(self, *args):
-        """
-        Extract current account dataSyncId.
- In the format DELEGATED_SESSION_ID||USER_SESSION_ID or USER_SESSION_ID|| - @params response and/or ytcfg - """ - if data_sync_id := self._configuration_arg('data_sync_id', [None], ie_key=YoutubeIE, casesense=True)[0]: - return data_sync_id - - return traverse_obj( - args, (..., ('DATASYNC_ID', ('responseContext', 'mainAppWebResponseContext', 'datasyncId')), {str}, any)) - - def _extract_visitor_data(self, *args): - """ - Extracts visitorData from an API response or ytcfg - Appears to be used to track session state - """ - if visitor_data := self._configuration_arg('visitor_data', [None], ie_key=YoutubeIE, casesense=True)[0]: - return visitor_data - return get_first( - args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], - expected_type=str) - - @functools.cached_property - def is_authenticated(self): - return bool(self._get_sid_authorization_header()) - - def extract_ytcfg(self, video_id, webpage): - if not webpage: - return {} - return self._parse_json( - self._search_regex( - r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) or {} - - def _generate_cookie_auth_headers(self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, origin=None, **kwargs): - headers = {} - delegated_session_id = delegated_session_id or self._extract_delegated_session_id(ytcfg) - if delegated_session_id: - headers['X-Goog-PageId'] = delegated_session_id - if session_index is None: - session_index = self._extract_session_index(ytcfg) - if delegated_session_id or session_index is not None: - headers['X-Goog-AuthUser'] = session_index if session_index is not None else 0 - - auth = self._get_sid_authorization_header(origin, user_session_id=user_session_id or self._extract_user_session_id(ytcfg)) - if auth is not None: - headers['Authorization'] = auth - headers['X-Origin'] = origin - - if traverse_obj(ytcfg, 'LOGGED_IN', expected_type=bool): - headers['X-Youtube-Bootstrap-Logged-In'] = 'true' - - return headers - - def generate_api_headers( - self, *, ytcfg=None, delegated_session_id=None, user_session_id=None, session_index=None, - visitor_data=None, api_hostname=None, default_client='web', **kwargs): - - origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) - headers = { - 'X-YouTube-Client-Name': str( - self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), - 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), - 'Origin': origin, - 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), - 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client), - **self._generate_cookie_auth_headers( - ytcfg=ytcfg, - delegated_session_id=delegated_session_id, - user_session_id=user_session_id, - session_index=session_index, - origin=origin), - } - return filter_dict(headers) - - def _download_ytcfg(self, client, video_id): - url = { - 'web': 'https://www.youtube.com', - 'web_music': 'https://music.youtube.com', - 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', - 'tv': 'https://www.youtube.com/tv', - }.get(client) - if not url: - return {} - webpage = self._download_webpage( - url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config', - headers=traverse_obj(self._get_default_ytcfg(client), { - 'User-Agent': ('INNERTUBE_CONTEXT', 'client', 
'userAgent', {str}), - })) - return self.extract_ytcfg(video_id, webpage) or {} - - @staticmethod - def _build_api_continuation_query(continuation, ctp=None): - query = { - 'continuation': continuation, - } - # TODO: Inconsistency with clickTrackingParams. - # Currently we have a fixed ctp contained within context (from ytcfg) - # and a ctp in root query for continuation. - if ctp: - query['clickTracking'] = {'clickTrackingParams': ctp} - return query - - @classmethod - def _extract_next_continuation_data(cls, renderer): - next_continuation = try_get( - renderer, (lambda x: x['continuations'][0]['nextContinuationData'], - lambda x: x['continuation']['reloadContinuationData']), dict) - if not next_continuation: - return - continuation = next_continuation.get('continuation') - if not continuation: - return - ctp = next_continuation.get('clickTrackingParams') - return cls._build_api_continuation_query(continuation, ctp) - - @classmethod - def _extract_continuation_ep_data(cls, continuation_ep: dict): - if isinstance(continuation_ep, dict): - continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], str) - if not continuation: - return - ctp = continuation_ep.get('clickTrackingParams') - return cls._build_api_continuation_query(continuation, ctp) - - @classmethod - def _extract_continuation(cls, renderer): - next_continuation = cls._extract_next_continuation_data(renderer) - if next_continuation: - return next_continuation - - return traverse_obj(renderer, ( - ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', - ('continuationEndpoint', ('button', 'buttonRenderer', 'command')), - ), get_all=False, expected_type=cls._extract_continuation_ep_data) - - @classmethod - def _extract_alerts(cls, data): - for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: - if not isinstance(alert_dict, dict): - continue - for alert in alert_dict.values(): - alert_type = alert.get('type') - if not alert_type: - continue - message = cls._get_text(alert, 'text') - if message: - yield alert_type, message - - def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): - errors, warnings = [], [] - for alert_type, alert_message in alerts: - if alert_type.lower() == 'error' and fatal: - errors.append([alert_type, alert_message]) - elif alert_message not in self._IGNORED_WARNINGS: - warnings.append([alert_type, alert_message]) - - for alert_type, alert_message in (warnings + errors[:-1]): - self.report_warning(f'YouTube said: {alert_type} - {alert_message}', only_once=only_once) - if errors: - raise ExtractorError(f'YouTube said: {errors[-1][1]}', expected=expected) - - def _extract_and_report_alerts(self, data, *args, **kwargs): - return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - - def _extract_badges(self, badge_list: list): - """ - Extract known BadgeType's from a list of badge renderers. 
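A concrete view of the continuation flow handled above: given a `continuationItemRenderer` from a browse response, `_extract_continuation_ep_data` builds the paging query roughly as follows. The token values are shortened placeholders:

```py
# Sketch: turning a continuation endpoint into the next /youtubei/v1/browse body.
continuation_ep = {
    'continuationCommand': {'token': '4qmFsgKa...'},  # opaque paging token (placeholder)
    'clickTrackingParams': 'CBQQ7zsYACIT...',         # optional CTP (placeholder)
}

query = {'continuation': continuation_ep['continuationCommand']['token']}
if ctp := continuation_ep.get('clickTrackingParams'):
    # Note the inconsistency flagged in the TODO above: this CTP rides in the
    # request root, while a separate fixed CTP lives inside the ytcfg context
    query['clickTracking'] = {'clickTrackingParams': ctp}

# `query` is then merged with {'context': ...} and POSTed to /youtubei/v1/browse
print(query)
```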
- @returns [{'type': BadgeType}] - """ - icon_type_map = { - 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, - 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, - 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, - 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, - 'CHECK': BadgeType.VERIFIED, - } - - badge_style_map = { - 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, - 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, - 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, - 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, - } - - label_map = { - 'unlisted': BadgeType.AVAILABILITY_UNLISTED, - 'private': BadgeType.AVAILABILITY_PRIVATE, - 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, - 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM, - 'verified': BadgeType.VERIFIED, - 'official artist channel': BadgeType.VERIFIED, - } - - badges = [] - for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): - badge_type = ( - icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) - or badge_style_map.get(traverse_obj(badge, 'style')) - ) - if badge_type: - badges.append({'type': badge_type}) - continue - - # fallback, won't work in some languages - label = traverse_obj( - badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') - for match, label_badge_type in label_map.items(): - if match in label.lower(): - badges.append({'type': label_badge_type}) - break - - return badges - - @staticmethod - def _has_badge(badges, badge_type): - return bool(traverse_obj(badges, lambda _, v: v['type'] == badge_type)) - - @staticmethod - def _get_text(data, *path_list, max_runs=None): - for path in path_list or [None]: - if path is None: - obj = [data] - else: - obj = traverse_obj(data, path, default=[]) - if not any(key is ... or isinstance(key, (list, tuple)) for key in variadic(path)): - obj = [obj] - for item in obj: - text = try_get(item, lambda x: x['simpleText'], str) - if text: - return text - runs = try_get(item, lambda x: x['runs'], list) or [] - if not runs and isinstance(item, list): - runs = item - - runs = runs[:min(len(runs), max_runs or len(runs))] - text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str)) - if text: - return text - - def _get_count(self, data, *path_list): - count_text = self._get_text(data, *path_list) or '' - count = parse_count(count_text) - if count is None: - count = str_to_int( - self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) - return count - - @staticmethod - def _extract_thumbnails(data, *path_list, final_key='thumbnails'): - """ - Extract thumbnails from thumbnails dict - @param path_list: path list to level that contains 'thumbnails' key - """ - thumbnails = [] - for path in path_list or [()]: - for thumbnail in traverse_obj(data, (*variadic(path), final_key, ...)): - thumbnail_url = url_or_none(thumbnail.get('url')) - if not thumbnail_url: - continue - # Sometimes youtube gives a wrong thumbnail URL. 
See: - # https://github.com/yt-dlp/yt-dlp/issues/233 - # https://github.com/ytdl-org/youtube-dl/issues/28023 - if 'maxresdefault' in thumbnail_url: - thumbnail_url = thumbnail_url.split('?')[0] - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) - return thumbnails - - @staticmethod - def extract_relative_time(relative_time_text): - """ - Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' - """ - - # XXX: this could be moved to a general function in utils/_utils.py - # The relative time text strings are roughly the same as what - # Javascript's Intl.RelativeTimeFormat function generates. - # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat - mobj = re.search( - r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago', - relative_time_text) - if mobj: - start = mobj.group('start') - if start: - return datetime_from_str(start) - try: - return datetime_from_str('now-{}{}'.format(mobj.group('time'), mobj.group('unit'))) - except ValueError: - return None - - def _parse_time_text(self, text): - if not text: - return - dt_ = self.extract_relative_time(text) - timestamp = None - if isinstance(dt_, dt.datetime): - timestamp = calendar.timegm(dt_.timetuple()) - - if timestamp is None: - timestamp = ( - unified_timestamp(text) or unified_timestamp( - self._search_regex( - (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), - text.lower(), 'time text', default=None))) - - if text and timestamp is None and self._preferred_lang in (None, 'en'): - self.report_warning( - f'Cannot parse localized time text "{text}"', only_once=True) - return timestamp - - def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, - ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, - default_client='web'): - raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=YoutubeIE)) - # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal. 
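The grammar accepted by `extract_relative_time` above ('streamed 6 days ago', '8 yr ago', 'updated today') can be approximated in isolation. A rough standalone sketch using `timedelta` instead of yt-dlp's `datetime_from_str`, with month/year lengths approximated:

```py
import datetime as dt
import re

# Longer prefixes first so e.g. 'min' is not swallowed by 'mo'
_UNITS = [('sec', 1), ('min', 60), ('mo', 2592000), ('h', 3600), ('d', 86400),
          ('w', 604800), ('y', 31536000), ('s', 1)]

def approx_relative_time(text):
    mobj = re.search(
        r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>[a-z]+?)s?\s+ago', text)
    if not mobj:
        return None
    now = dt.datetime.now(dt.timezone.utc)
    if start := mobj.group('start'):
        return now - dt.timedelta(days=1) if start == 'yesterday' else now
    unit = mobj.group('unit').lower()
    seconds = next((mult for prefix, mult in _UNITS if unit.startswith(prefix)), None)
    return now - dt.timedelta(seconds=int(mobj.group('time')) * seconds) if seconds else None

print(approx_relative_time('streamed 6 days ago'))  # ~6 days before now
print(approx_relative_time('8 yr ago'))             # ~8 years before now
```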
- icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete)) - icd_rm = next(icd_retries) - main_retries = iter(self.RetryManager()) - main_rm = next(main_retries) - # Manual retry loop for multiple RetryManagers - # The proper RetryManager MUST be advanced after an error - # and its result MUST be checked if the manager is non fatal - while True: - try: - response = self._call_api( - ep=ep, fatal=True, headers=headers, - video_id=item_id, query=query, note=note, - context=self._extract_context(ytcfg, default_client), - api_hostname=api_hostname, default_client=default_client) - except ExtractorError as e: - if not isinstance(e.cause, network_exceptions): - return self._error_or_warning(e, fatal=fatal) - elif not isinstance(e.cause, HTTPError): - main_rm.error = e - next(main_retries) - continue - - first_bytes = e.cause.response.read(512) - if not is_html(first_bytes): - yt_error = try_get( - self._parse_json( - self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), - lambda x: x['error']['message'], str) - if yt_error: - self._report_alerts([('ERROR', yt_error)], fatal=False) - # Downloading page may result in intermittent 5xx HTTP error - # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289 - # We also want to catch all other network exceptions since errors in later pages can be troublesome - # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 - if e.cause.status not in (403, 429): - main_rm.error = e - next(main_retries) - continue - return self._error_or_warning(e, fatal=fatal) - - try: - self._extract_and_report_alerts(response, only_once=True) - except ExtractorError as e: - # YouTube's servers may return errors we want to retry on in a 200 OK response - # See: https://github.com/yt-dlp/yt-dlp/issues/839 - if 'unknown error' in e.msg.lower(): - main_rm.error = e - next(main_retries) - continue - return self._error_or_warning(e, fatal=fatal) - # Youtube sometimes sends incomplete data - # See: https://github.com/ytdl-org/youtube-dl/issues/28194 - if not traverse_obj(response, *variadic(check_get_keys)): - icd_rm.error = ExtractorError('Incomplete data received', expected=True) - should_retry = next(icd_retries, None) - if not should_retry: - return None - continue - - return response - - @staticmethod - def is_music_url(url): - return re.match(r'(https?://)?music\.youtube\.com/', url) is not None - - def _extract_video(self, renderer): - video_id = renderer.get('videoId') - - reel_header_renderer = traverse_obj(renderer, ( - 'navigationEndpoint', 'reelWatchEndpoint', 'overlay', 'reelPlayerOverlayRenderer', - 'reelPlayerHeaderSupportedRenderers', 'reelPlayerHeaderRenderer')) - - title = self._get_text(renderer, 'title', 'headline') or self._get_text(reel_header_renderer, 'reelTitleText') - description = self._get_text(renderer, 'descriptionSnippet') - - duration = int_or_none(renderer.get('lengthSeconds')) - if duration is None: - duration = parse_duration(self._get_text( - renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) - if duration is None: - # XXX: should write a parser to be more general to support more cases (e.g. 
shorts in shorts tab) - duration = parse_duration(self._search_regex( - r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$', - traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), - video_id, default=None, group='duration')) - - channel_id = traverse_obj( - renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), - expected_type=str, get_all=False) - if not channel_id: - channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) - - channel_id = self.ucid_or_none(channel_id) - - overlay_style = traverse_obj( - renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), - get_all=False, expected_type=str) - badges = self._extract_badges(traverse_obj(renderer, 'badges')) - owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges')) - navigation_url = urljoin('https://www.youtube.com/', traverse_obj( - renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), - expected_type=str)) or '' - url = f'https://www.youtube.com/watch?v={video_id}' - if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: - url = f'https://www.youtube.com/shorts/{video_id}' - - time_text = (self._get_text(renderer, 'publishedTimeText', 'videoInfo') - or self._get_text(reel_header_renderer, 'timestampText') or '') - scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) - - live_status = ( - 'is_upcoming' if scheduled_timestamp is not None - else 'was_live' if 'streamed' in time_text.lower() - else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW) - else None) - - # videoInfo is a string like '50K views • 10 years ago'. 
-        view_count_text = self._get_text(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo') or ''
-        view_count = (0 if 'no views' in view_count_text.lower()
-                      else self._get_count({'simpleText': view_count_text}))
-        view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count'
-
-        channel = (self._get_text(renderer, 'ownerText', 'shortBylineText')
-                   or self._get_text(reel_header_renderer, 'channelTitleText'))
-
-        channel_handle = traverse_obj(renderer, (
-            'shortBylineText', 'runs', ..., 'navigationEndpoint',
-            (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))),
-            expected_type=self.handle_from_url, get_all=False)
-        return {
-            '_type': 'url',
-            'ie_key': YoutubeIE.ie_key(),
-            'id': video_id,
-            'url': url,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'channel_id': channel_id,
-            'channel': channel,
-            'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None,
-            'uploader': channel,
-            'uploader_id': channel_handle,
-            'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
-            'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'),
-            'timestamp': (self._parse_time_text(time_text)
-                          if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE)
-                          else None),
-            'release_timestamp': scheduled_timestamp,
-            'availability':
-                'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC)
-                else self._availability(
-                    is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None,
-                    needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
-                    needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
-                    is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None),
-            view_count_field: view_count,
-            'live_status': live_status,
-            'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None,
-        }
-
-
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube'
     _VALID_URL = r'''(?x)^
@@ -1326,7 +131,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
     _RETURN_TYPE = 'video'  # XXX: How to handle multifeed?
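The `videoInfo` handling in the removed `_extract_video` above ('50K views • 10 years ago', 'No views') comes down to suffix-aware count parsing. A simplified stand-in for what `yt_dlp.utils.parse_count` does for these strings:

```py
import re

def parse_view_count(text):
    # '50K views • 10 years ago' -> 50000; 'No views' -> 0
    if 'no views' in text.lower():
        return 0
    mobj = re.match(r'([\d,.]+)\s*([KMB])?', text.strip())
    if not mobj or not re.search(r'\d', mobj.group(1)):
        return None
    num = float(mobj.group(1).replace(',', ''))
    return int(num * {'K': 1e3, 'M': 1e6, 'B': 1e9}.get(mobj.group(2), 1))

print(parse_view_count('50K views • 10 years ago'))  # 50000
print(parse_view_count('No views'))                  # 0
```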
_PLAYER_INFO_RE = ( - r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', + r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) @@ -2956,9 +1761,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, ] + _PLAYER_JS_VARIANT_MAP = { + 'main': 'player_ias.vflset/en_US/base.js', + 'tce': 'player_ias_tce.vflset/en_US/base.js', + 'tv': 'tv-player-ias.vflset/tv-player-ias.js', + 'tv_es6': 'tv-player-es6.vflset/tv-player-es6.js', + 'phone': 'player-plasma-ias-phone-en_US.vflset/base.js', + 'tablet': 'player-plasma-ias-tablet-en_US.vflset/base.js', + } + _INVERSE_PLAYER_JS_VARIANT_MAP = {v: k for k, v in _PLAYER_JS_VARIANT_MAP.items()} + @classmethod def suitable(cls, url): - from ..utils import parse_qs + from yt_dlp.utils import parse_qs qs = parse_qs(url) if qs.get('list', [None])[0]: @@ -3004,6 +1819,12 @@ def mpd_feed(format_id, delay): else: retry.error = f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}' continue + + # Formats from ended premieres will be missing a manifest_url + # See https://github.com/yt-dlp/yt-dlp/issues/8543 + if not f.get('manifest_url'): + break + return f['manifest_url'], f['manifest_stream_number'], is_live return None @@ -3164,18 +1985,48 @@ def _extract_player_url(self, *ytcfgs, webpage=None): get_all=False, expected_type=str) if not player_url: return + + requested_js_variant = self._configuration_arg('player_js_variant', [''])[0] or 'actual' + if requested_js_variant in self._PLAYER_JS_VARIANT_MAP: + player_id = self._extract_player_info(player_url) + original_url = player_url + player_url = f'/s/player/{player_id}/{self._PLAYER_JS_VARIANT_MAP[requested_js_variant]}' + if original_url != player_url: + self.write_debug( + f'Forcing "{requested_js_variant}" player JS variant for player {player_id}\n' + f' original url = {original_url}', only_once=True) + elif requested_js_variant != 'actual': + self.report_warning( + f'Invalid player JS variant name "{requested_js_variant}" requested. 
' + f'Valid choices are: {", ".join(self._PLAYER_JS_VARIANT_MAP)}', only_once=True) + return urljoin('https://www.youtube.com', player_url) def _download_player_url(self, video_id, fatal=False): - res = self._download_webpage( + iframe_webpage = self._download_webpage_with_retries( 'https://www.youtube.com/iframe_api', - note='Downloading iframe API JS', video_id=video_id, fatal=fatal) - if res: + note='Downloading iframe API JS', + video_id=video_id, retry_fatal=fatal) + + if iframe_webpage: player_version = self._search_regex( - r'player\\?/([0-9a-fA-F]{8})\\?/', res, 'player version', fatal=fatal) + r'player\\?/([0-9a-fA-F]{8})\\?/', iframe_webpage, 'player version', fatal=fatal) if player_version: return f'https://www.youtube.com/s/player/{player_version}/player_ias.vflset/en_US/base.js' + def _player_js_cache_key(self, player_url): + player_id = self._extract_player_info(player_url) + player_path = remove_start(urllib.parse.urlparse(player_url).path, f'/s/player/{player_id}/') + variant = self._INVERSE_PLAYER_JS_VARIANT_MAP.get(player_path) or next(( + v for k, v in self._INVERSE_PLAYER_JS_VARIANT_MAP.items() + if re.fullmatch(re.escape(k).replace('en_US', r'[a-zA-Z0-9_]+'), player_path)), None) + if not variant: + self.write_debug( + f'Unable to determine player JS variant\n' + f' player = {player_url}', only_once=True) + variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js')) + return join_nonempty(player_id, variant) + def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(str(len(part)) for part in example_sig.split('.')) @@ -3191,30 +2042,29 @@ def _extract_player_info(cls, player_url): return id_m.group('id') def _load_player(self, video_id, player_url, fatal=True): - player_id = self._extract_player_info(player_url) - if player_id not in self._code_cache: + player_js_key = self._player_js_cache_key(player_url) + if player_js_key not in self._code_cache: code = self._download_webpage( player_url, video_id, fatal=fatal, - note='Downloading player ' + player_id, - errnote=f'Download of {player_url} failed') + note=f'Downloading player {player_js_key}', + errnote=f'Download of {player_js_key} failed') if code: - self._code_cache[player_id] = code - return self._code_cache.get(player_id) + self._code_cache[player_js_key] = code + return self._code_cache.get(player_js_key) def _extract_signature_function(self, video_id, player_url, example_sig): - player_id = self._extract_player_info(player_url) - # Read from filesystem cache - func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}' + func_id = join_nonempty( + self._player_js_cache_key(player_url), self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id self.write_debug(f'Extracting signature function {func_id}') - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.31'), None if not cache_spec: code = self._load_player(video_id, player_url) if code: - res = self._parse_sig_js(code) + res = self._parse_sig_js(code, player_url) test_string = ''.join(map(chr, range(len(example_sig)))) cache_spec = [ord(c) for c in res(test_string)] self.cache.store('youtube-sigfuncs', func_id, cache_spec) @@ -3262,7 +2112,7 @@ def _genslice(start, end, step): f' return {expr_code}\n') self.to_screen('Extracted signature function:\n' + code) - def _parse_sig_js(self, jscode): + def _parse_sig_js(self, jscode, player_url): # Examples where 
`sig` is funcname: # sig=function(a){a=a.split(""); ... ;return a.join("")}; # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; @@ -3286,8 +2136,9 @@ def _parse_sig_js(self, jscode): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) + initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list})) return lambda s: initial_function([s]) def _cached(self, func, *cache_id): @@ -3306,6 +2157,24 @@ def inner(*args, **kwargs): return ret return inner + def _load_player_data_from_cache(self, name, player_url): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) + + if data := self._player_cache.get(cache_id): + return data + + data = self.cache.load(*cache_id, min_ver='2025.03.31') + if data: + self._player_cache[cache_id] = data + + return data + + def _store_player_data_to_cache(self, name, player_url, data): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) + if cache_id not in self._player_cache: + self.cache.store(*cache_id, data) + self._player_cache[cache_id] = data + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" extract_sig = self._cached( @@ -3346,9 +2215,31 @@ def _decrypt_nsig(self, s, video_id, player_url): video_id=video_id, note='Executing signature code').strip() self.write_debug(f'Decrypted nsig {s} => {ret}') + # Only cache nsig func JS code to disk if successful, and only once + self._store_player_data_to_cache('nsig', player_url, func_code) return ret def _extract_n_function_name(self, jscode, player_url=None): + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) + if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('_w8_'), any)): + funcname = self._search_regex( + r'''(?xs) + [;\n](?: + (?P<f>function\s+)| + (?:var\s+)? 
+ )(?P<funcname>[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*) + \((?P<argname>[a-zA-Z0-9_$]+)\)\s*\{ + (?:(?!\}[;\n]).)+ + \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s* + \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n] + ''' % (re.escape(varname), global_list.index(debug_str)), + jscode, 'nsig function name', group='funcname', default=None) + if funcname: + return funcname + self.write_debug(join_nonempty( + 'Initial search was unable to find nsig function name', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + # Examples (with placeholders nfunc, narray, idx): # * .get("n"))&&(b=nfunc(b) # * .get("n"))&&(b=narray[idx](b) @@ -3378,7 +2269,7 @@ def _extract_n_function_name(self, jscode, player_url=None): if not funcname: self.report_warning(join_nonempty( 'Falling back to generic n function search', - player_url and f' player = {player_url}', delim='\n')) + player_url and f' player = {player_url}', delim='\n'), only_once=True) return self._search_regex( r'''(?xs) ;\s*(?P<name>[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) @@ -3391,14 +2282,60 @@ def _extract_n_function_name(self, jscode, player_url=None): rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - def _fixup_n_function_code(self, argnames, code): - return argnames, re.sub( - rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(["\'])undefined\1\s*\)\s*return\s+{argnames[0]};', - ';', code) + def _extract_player_js_global_var(self, jscode, player_url): + """Returns tuple of strings: variable assignment code, variable name, variable value code""" + extract_global_var = self._cached(self._search_regex, 'js global array', player_url) + varcode, varname, varvalue = extract_global_var( + r'''(?x) + (?P<q1>["\'])use\s+strict(?P=q1);\s* + (?P<code> + var\s+(?P<name>[a-zA-Z0-9_$]+)\s*=\s* + (?P<value> + (?P<q2>["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2) + \.split\((?P<q3>["\'])(?:(?!(?P=q3)).)+(?P=q3)\) + |\[\s*(?:(?P<q4>["\'])(?:(?!(?P=q4)).|\\.)*(?P=q4)\s*,?\s*)+\] + ) + )[;,] + ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) + if not varcode: + self.write_debug(join_nonempty( + 'No global array variable found in player JS', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return varcode, varname, varvalue + + def _interpret_player_js_global_var(self, jscode, player_url): + """Returns tuple of: variable name string, variable value list""" + _, varname, array_code = self._extract_player_js_global_var(jscode, player_url) + jsi = JSInterpreter(array_code) + interpret_global_var = self._cached(jsi.interpret_expression, 'js global list', player_url) + return varname, interpret_global_var(array_code, {}, allow_recursion=10) + + def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url): + varcode, varname, _ = self._extract_player_js_global_var(jscode, player_url) + if varcode and varname: + nsig_code = varcode + '; ' + nsig_code + _, global_list = self._interpret_player_js_global_var(jscode, player_url) + else: + varname = 'dlp_wins' + global_list = [] + + undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+' + fixed_code = re.sub( + rf'''(?x) + ;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?: + (["\'])undefined\1| + {re.escape(varname)}\[{undefined_idx}\] + )\s*\)\s*return\s+{re.escape(argnames[0])}; + ''', ';', nsig_code) + if fixed_code == nsig_code: + self.write_debug(join_nonempty( + 'No typeof statement found 
in nsig function code', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return argnames, fixed_code def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2024.07.09') + func_code = self._load_player_data_from_cache('nsig', player_url) jscode = func_code or self._load_player(video_id, player_url) jsi = JSInterpreter(jscode) @@ -3407,10 +2344,9 @@ def _extract_n_function_code(self, video_id, player_url): func_name = self._extract_n_function_name(jscode, player_url=player_url) - # XXX: Workaround for the `typeof` gotcha - func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name)) + # XXX: Workaround for the global array variable and lack of `typeof` implementation + func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url) - self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code def _extract_n_function_from_code(self, jsi, func_code): @@ -3435,23 +2371,27 @@ def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=F Extract signatureTimestamp (sts) Required to tell API what sig/player version is in use. """ - sts = None - if isinstance(ytcfg, dict): - sts = int_or_none(ytcfg.get('STS')) + if sts := traverse_obj(ytcfg, ('STS', {int_or_none})): + return sts + + if not player_url: + error_msg = 'Cannot extract signature timestamp without player url' + if fatal: + raise ExtractorError(error_msg) + self.report_warning(error_msg) + return None + + sts = self._load_player_data_from_cache('sts', player_url) + if sts: + return sts + + if code := self._load_player(video_id, player_url, fatal=fatal): + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, + 'JS player signature timestamp', group='sts', fatal=fatal)) + if sts: + self._store_player_data_to_cache('sts', player_url, sts) - if not sts: - # Attempt to extract from player - if player_url is None: - error_msg = 'Cannot extract signature timestamp without player_url.' - if fatal: - raise ExtractorError(error_msg) - self.report_warning(error_msg) - return - code = self._load_player(video_id, player_url, fatal=fatal) - if code: - sts = int_or_none(self._search_regex( - r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, - 'JS player signature timestamp', group='sts', fatal=fatal)) return sts def _mark_watched(self, video_id, player_responses): @@ -4204,9 +3144,19 @@ def append_client(*client_names): else: prs.append(pr) + # web_embedded can work around age-gate and age-verification for some embeddable videos + if self._is_agegated(pr) and variant != 'web_embedded': + append_client(f'web_embedded.{base_client}') + # Unauthenticated users will only get web_embedded client formats if age-gated + if self._is_agegated(pr) and not self.is_authenticated: + self.to_screen( + f'{video_id}: This video is age-restricted; some formats may be missing ' + f'without authentication. 
{self._youtube_login_hint}', only_once=True) + # EU countries require age-verification for accounts to access age-restricted videos # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients - if self.is_authenticated and self._is_agegated(pr): + embedding_is_disabled = variant == 'web_embedded' and self._is_unplayable(pr) + if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): self.to_screen( f'{video_id}: This video is age-restricted and YouTube is requiring ' 'account age-verification; some formats may be missing', only_once=True) @@ -4276,6 +3226,7 @@ def build_fragments(f): } for range_start in range(0, f['filesize'], CHUNK_SIZE)) for fmt in streaming_formats: + client_name = fmt[STREAMING_DATA_CLIENT_NAME] if fmt.get('targetDurationSec'): continue @@ -4308,18 +3259,40 @@ def build_fragments(f): if language_code and (is_original or (is_default and not original_language)): original_language = language_code + has_drm = bool(fmt.get('drmFamilies')) + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment # (adding `&sq=0` to the URL) and parsing emsg box to determine the # number of fragment that would subsequently requested with (`&sq=N`) - if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF' and not has_drm: continue + if has_drm: + msg = f'Some {client_name} client https formats have been skipped as they are DRM protected. ' + if client_name == 'tv': + msg += ( + f'{"Your account" if self.is_authenticated else "The current session"} may have ' + f'an experiment that applies DRM to all videos on the tv client. ' + f'See https://github.com/yt-dlp/yt-dlp/issues/12563 for more details.' + ) + self.report_warning(msg, video_id, only_once=True) + fmt_url = fmt.get('url') if not fmt_url: sc = urllib.parse.parse_qs(fmt.get('signatureCipher')) fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) if not all((sc, fmt_url, player_url, encrypted_sig)): + msg = f'Some {client_name} client https formats have been skipped as they are missing a url. ' + if client_name == 'web': + msg += 'YouTube is forcing SABR streaming for this client. ' + else: + msg += ( + f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for ' + f'{"your account" if self.is_authenticated else "the current session"}. ' + ) + msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details' + self.report_warning(msg, video_id, only_once=True) continue try: fmt_url += '&{}={}'.format( @@ -4340,14 +3313,12 @@ def build_fragments(f): 'n': decrypt_nsig(query['n'][0], video_id, player_url), }) except ExtractorError as e: - phantomjs_hint = '' - if isinstance(e, JSInterpreter.Exception): - phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' - f'to workaround the issue. 
{PhantomJSwrapper.INSTALL_HINT}\n') if player_url: self.report_warning( - f'nsig extraction failed: Some formats may be missing\n{phantomjs_hint}' - f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) + f'nsig extraction failed: Some formats may be missing\n' + f' n = {query["n"][0]} ; player = {player_url}\n' + f' {bug_reports_message(before="")}', + video_id=video_id, only_once=True) self.write_debug(e, only_once=True) else: self.report_warning( @@ -4364,9 +3335,8 @@ def build_fragments(f): is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( - f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) + 'Some formats are possibly damaged. They will be deprioritized', video_id, only_once=True) - client_name = fmt[STREAMING_DATA_CLIENT_NAME] po_token = fmt.get(STREAMING_DATA_INITIAL_PO_TOKEN) if po_token: @@ -4402,15 +3372,15 @@ def build_fragments(f): 'audio_channels': fmt.get('audioChannels'), 'height': height, 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, - 'has_drm': bool(fmt.get('drmFamilies')), + 'has_drm': has_drm, 'tbr': tbr, 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, - # Strictly de-prioritize broken, damaged and 3gp formats - 'preference': -20 if require_po_token else -10 if is_damaged else -2 if itag == '17' else None, + # Strictly de-prioritize damaged and 3gp formats + 'preference': -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -4588,8 +3558,7 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): pp = self._configuration_arg('player_params', [None], casesense=True)[0] if pp: query['pp'] = pp - webpage = self._download_webpage( - webpage_url, video_id, fatal=False, query=query) + webpage = self._download_webpage_with_retries(webpage_url, video_id, query=query) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() @@ -4735,6 +3704,15 @@ def feed_entry(name): if 'sign in' in reason.lower(): reason = remove_end(reason, 'This helps protect our community. Learn more') reason = f'{remove_end(reason.strip(), ".")}. {self._youtube_login_hint}' + elif get_first(playability_statuses, ('errorScreen', 'playerCaptchaViewModel', {dict})): + reason += '. YouTube is requiring a captcha challenge before playback' + elif "This content isn't available, try again later" in reason: + reason = ( + f'{remove_end(reason.strip(), ".")}. {"Your account" if self.is_authenticated else "The current session"} ' + f'has been rate-limited by YouTube for up to an hour. It is recommended to use `-t sleep` to add a delay ' + f'between video requests to avoid exceeding the rate limit. 
For more information, refer to ' + f'https://github.com/yt-dlp/yt-dlp/wiki/Extractors#this-content-isnt-available-try-again-later' + ) self.raise_no_formats(reason, expected=True) keywords = get_first(video_details, 'keywords', expected_type=list) or [] @@ -4963,7 +3941,7 @@ def process_language(container, base_url, lang_code, sub_name, query): if not traverse_obj(initial_data, 'contents'): self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.') initial_data = None - if not initial_data: + if not initial_data and 'initial_data' not in self._configuration_arg('player_skip'): query = {'videoId': video_id} query.update(self._get_checkok_params()) initial_data = self._extract_response( @@ -5174,2847 +4152,3 @@ def process_language(container, base_url, lang_code, sub_name, query): self.mark_watched(video_id, player_responses) return info - - -class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): - @staticmethod - def passthrough_smuggled_data(func): - def _smuggle(info, smuggled_data): - if info.get('_type') not in ('url', 'url_transparent'): - return info - if smuggled_data.get('is_music_url'): - parsed_url = urllib.parse.urlparse(info['url']) - if parsed_url.netloc in ('www.youtube.com', 'music.youtube.com'): - smuggled_data.pop('is_music_url') - info['url'] = urllib.parse.urlunparse(parsed_url._replace(netloc='music.youtube.com')) - if smuggled_data: - info['url'] = smuggle_url(info['url'], smuggled_data) - return info - - @functools.wraps(func) - def wrapper(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - if self.is_music_url(url): - smuggled_data['is_music_url'] = True - info_dict = func(self, url, smuggled_data) - if smuggled_data: - _smuggle(info_dict, smuggled_data) - if info_dict.get('entries'): - info_dict['entries'] = (_smuggle(i, smuggled_data.copy()) for i in info_dict['entries']) - return info_dict - return wrapper - - @staticmethod - def _extract_basic_item_renderer(item): - # Modified from _extract_grid_item_renderer - known_basic_renderers = ( - 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer', - ) - for key, renderer in item.items(): - if not isinstance(renderer, dict): - continue - elif key in known_basic_renderers: - return renderer - elif key.startswith('grid') and key.endswith('Renderer'): - return renderer - - def _extract_channel_renderer(self, renderer): - channel_id = self.ucid_or_none(renderer['channelId']) - title = self._get_text(renderer, 'title') - channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None) - channel_handle = self.handle_from_url( - traverse_obj(renderer, ( - 'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'), - ('browseEndpoint', 'canonicalBaseUrl')), - {str}), get_all=False)) - if not channel_handle: - # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search - channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText')) - return { - '_type': 'url', - 'url': channel_url, - 'id': channel_id, - 'ie_key': YoutubeTabIE.ie_key(), - 'channel': title, - 'uploader': title, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'title': title, - 'uploader_id': channel_handle, - 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), - # See above. YouTube sets videoCountText to the subscriber text in search channel renderers. 
- # However, in feed/channels this is set correctly to the subscriber count - 'channel_follower_count': traverse_obj( - renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count), - 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), - 'playlist_count': ( - # videoCountText may be the subscriber count - self._get_count(renderer, 'videoCountText') - if self._get_count(renderer, 'subscriberCountText') is not None else None), - 'description': self._get_text(renderer, 'descriptionSnippet'), - 'channel_is_verified': True if self._has_badge( - self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None, - } - - def _grid_entries(self, grid_renderer): - for item in grid_renderer['items']: - if not isinstance(item, dict): - continue - if lockup_view_model := traverse_obj(item, ('lockupViewModel', {dict})): - if entry := self._extract_lockup_view_model(lockup_view_model): - yield entry - continue - renderer = self._extract_basic_item_renderer(item) - if not isinstance(renderer, dict): - continue - title = self._get_text(renderer, 'title') - - # playlist - playlist_id = renderer.get('playlistId') - if playlist_id: - yield self.url_result( - f'https://www.youtube.com/playlist?list={playlist_id}', - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, - video_title=title) - continue - # video - video_id = renderer.get('videoId') - if video_id: - yield self._extract_video(renderer) - continue - # channel - channel_id = renderer.get('channelId') - if channel_id: - yield self._extract_channel_renderer(renderer) - continue - # generic endpoint URL support - ep_url = urljoin('https://www.youtube.com/', try_get( - renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - str)) - if ep_url: - for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): - if ie.suitable(ep_url): - yield self.url_result( - ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) - break - - def _music_reponsive_list_entry(self, renderer): - video_id = traverse_obj(renderer, ('playlistItemData', 'videoId')) - if video_id: - title = traverse_obj(renderer, ( - 'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer', - 'text', 'runs', 0, 'text')) - return self.url_result(f'https://music.youtube.com/watch?v={video_id}', - ie=YoutubeIE.ie_key(), video_id=video_id, title=title) - playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId')) - if playlist_id: - video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId')) - if video_id: - return self.url_result(f'https://music.youtube.com/watch?v={video_id}&list={playlist_id}', - ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - return self.url_result(f'https://music.youtube.com/playlist?list={playlist_id}', - ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - browse_id = traverse_obj(renderer, ('navigationEndpoint', 'browseEndpoint', 'browseId')) - if browse_id: - return self.url_result(f'https://music.youtube.com/browse/{browse_id}', - ie=YoutubeTabIE.ie_key(), video_id=browse_id) - - def _shelf_entries_from_content(self, shelf_renderer): - content = shelf_renderer.get('content') - if not isinstance(content, dict): - return - renderer = content.get('gridRenderer') or content.get('expandedShelfContentsRenderer') - if renderer: - # TODO: add support for nested playlists so each shelf is processed - # as separate playlist - # TODO: this includes only first N items - yield from self._grid_entries(renderer) - renderer = 
content.get('horizontalListRenderer') - if renderer: - # TODO: handle case - pass - - def _shelf_entries(self, shelf_renderer, skip_channels=False): - ep = try_get( - shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - str) - shelf_url = urljoin('https://www.youtube.com', ep) - if shelf_url: - # Skipping links to another channels, note that checking for - # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL - # will not work - if skip_channels and '/channels?' in shelf_url: - return - title = self._get_text(shelf_renderer, 'title') - yield self.url_result(shelf_url, video_title=title) - # Shelf may not contain shelf URL, fallback to extraction from content - yield from self._shelf_entries_from_content(shelf_renderer) - - def _playlist_entries(self, video_list_renderer): - for content in video_list_renderer['contents']: - if not isinstance(content, dict): - continue - renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') - if not isinstance(renderer, dict): - continue - video_id = renderer.get('videoId') - if not video_id: - continue - yield self._extract_video(renderer) - - def _extract_lockup_view_model(self, view_model): - content_id = view_model.get('contentId') - if not content_id: - return - content_type = view_model.get('contentType') - if content_type not in ('LOCKUP_CONTENT_TYPE_PLAYLIST', 'LOCKUP_CONTENT_TYPE_PODCAST'): - self.report_warning( - f'Unsupported lockup view model content type "{content_type}"{bug_reports_message()}', only_once=True) - return - return self.url_result( - f'https://www.youtube.com/playlist?list={content_id}', ie=YoutubeTabIE, video_id=content_id, - title=traverse_obj(view_model, ( - 'metadata', 'lockupMetadataViewModel', 'title', 'content', {str})), - thumbnails=self._extract_thumbnails(view_model, ( - 'contentImage', 'collectionThumbnailViewModel', 'primaryThumbnail', 'thumbnailViewModel', 'image'), final_key='sources')) - - def _rich_entries(self, rich_grid_renderer): - if lockup_view_model := traverse_obj(rich_grid_renderer, ('content', 'lockupViewModel', {dict})): - if entry := self._extract_lockup_view_model(lockup_view_model): - yield entry - return - renderer = traverse_obj( - rich_grid_renderer, - ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {} - video_id = renderer.get('videoId') - if video_id: - yield self._extract_video(renderer) - return - playlist_id = renderer.get('playlistId') - if playlist_id: - yield self.url_result( - f'https://www.youtube.com/playlist?list={playlist_id}', - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, - video_title=self._get_text(renderer, 'title')) - return - # shortsLockupViewModel extraction - entity_id = renderer.get('entityId') - if entity_id: - video_id = traverse_obj(renderer, ('onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId', {str})) - if not video_id: - return - yield self.url_result( - f'https://www.youtube.com/shorts/{video_id}', - ie=YoutubeIE, video_id=video_id, - **traverse_obj(renderer, { - 'title': (( - ('overlayMetadata', 'primaryText', 'content', {str}), - ('accessibilityText', {lambda x: re.fullmatch(r'(.+), (?:[\d,.]+(?:[KM]| million)?|No) views? 
- play Short', x)}, 1)), any), - 'view_count': ('overlayMetadata', 'secondaryText', 'content', {parse_count}), - }), - thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources')) - return - - def _video_entry(self, video_renderer): - video_id = video_renderer.get('videoId') - if video_id: - return self._extract_video(video_renderer) - - def _hashtag_tile_entry(self, hashtag_tile_renderer): - url = urljoin('https://youtube.com', traverse_obj( - hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url'))) - if url: - return self.url_result( - url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag')) - - def _post_thread_entries(self, post_thread_renderer): - post_renderer = try_get( - post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) - if not post_renderer: - return - # video attachment - video_renderer = try_get( - post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) or {} - video_id = video_renderer.get('videoId') - if video_id: - entry = self._extract_video(video_renderer) - if entry: - yield entry - # playlist attachment - playlist_id = try_get( - post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], str) - if playlist_id: - yield self.url_result( - f'https://www.youtube.com/playlist?list={playlist_id}', - ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - # inline video links - runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] - for run in runs: - if not isinstance(run, dict): - continue - ep_url = try_get( - run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], str) - if not ep_url: - continue - if not YoutubeIE.suitable(ep_url): - continue - ep_video_id = YoutubeIE._match_id(ep_url) - if video_id == ep_video_id: - continue - yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id) - - def _post_thread_continuation_entries(self, post_thread_continuation): - contents = post_thread_continuation.get('contents') - if not isinstance(contents, list): - return - for content in contents: - renderer = content.get('backstagePostThreadRenderer') - if isinstance(renderer, dict): - yield from self._post_thread_entries(renderer) - continue - renderer = content.get('videoRenderer') - if isinstance(renderer, dict): - yield self._video_entry(renderer) - - r''' # unused - def _rich_grid_entries(self, contents): - for content in contents: - video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) - if video_renderer: - entry = self._video_entry(video_renderer) - if entry: - yield entry - ''' - - def _report_history_entries(self, renderer): - for url in traverse_obj(renderer, ( - 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., - 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., - 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')): - yield self.url_result(urljoin('https://www.youtube.com', url), YoutubeIE) - - def _extract_entries(self, parent_renderer, continuation_list): - # continuation_list is modified in-place with continuation_list = [continuation_token] - continuation_list[:] = [None] - contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] - for content in contents: - if not isinstance(content, dict): - continue - is_renderer = traverse_obj( - content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', - expected_type=dict) - if not 
is_renderer: - if content.get('richItemRenderer'): - for entry in self._rich_entries(content['richItemRenderer']): - yield entry - continuation_list[0] = self._extract_continuation(parent_renderer) - elif content.get('reportHistorySectionRenderer'): # https://www.youtube.com/reporthistory - table = traverse_obj(content, ('reportHistorySectionRenderer', 'table', 'tableRenderer')) - yield from self._report_history_entries(table) - continuation_list[0] = self._extract_continuation(table) - continue - - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - - known_renderers = { - 'playlistVideoListRenderer': self._playlist_entries, - 'gridRenderer': self._grid_entries, - 'reelShelfRenderer': self._grid_entries, - 'shelfRenderer': self._shelf_entries, - 'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)], - 'backstagePostThreadRenderer': self._post_thread_entries, - 'videoRenderer': lambda x: [self._video_entry(x)], - 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), - 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), - 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)], - 'richGridRenderer': lambda x: self._extract_entries(x, continuation_list), - 'lockupViewModel': lambda x: [self._extract_lockup_view_model(x)], - } - for key, renderer in isr_content.items(): - if key not in known_renderers: - continue - for entry in known_renderers[key](renderer): - if entry: - yield entry - continuation_list[0] = self._extract_continuation(renderer) - break - - if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(is_renderer) - - if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(parent_renderer) - - def _entries(self, tab, item_id, ytcfg, delegated_session_id, visitor_data): - continuation_list = [None] - extract_entries = lambda x: self._extract_entries(x, continuation_list) - tab_content = try_get(tab, lambda x: x['content'], dict) - if not tab_content: - return - parent_renderer = ( - try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) - yield from extract_entries(parent_renderer) - continuation = continuation_list[0] - seen_continuations = set() - for page_num in itertools.count(1): - if not continuation: - break - continuation_token = continuation.get('continuation') - if continuation_token is not None and continuation_token in seen_continuations: - self.write_debug('Detected YouTube feed looping - assuming end of feed.') - break - seen_continuations.add(continuation_token) - headers = self.generate_api_headers( - ytcfg=ytcfg, delegated_session_id=delegated_session_id, visitor_data=visitor_data) - response = self._extract_response( - item_id=f'{item_id} page {page_num}', - query=continuation, headers=headers, ytcfg=ytcfg, - check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')) - - if not response: - break - # Extracting updated visitor data is required to prevent an infinite extraction loop in some cases - # See: https://github.com/ytdl-org/youtube-dl/issues/28702 - visitor_data = self._extract_visitor_data(response) or visitor_data - - known_renderers = { - 'videoRenderer': (self._grid_entries, 'items'), # for membership tab - 'gridPlaylistRenderer': (self._grid_entries, 'items'), - 
'gridVideoRenderer': (self._grid_entries, 'items'), - 'gridChannelRenderer': (self._grid_entries, 'items'), - 'playlistVideoRenderer': (self._playlist_entries, 'contents'), - 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds - 'richItemRenderer': (extract_entries, 'contents'), # for hashtag - 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents'), - 'reportHistoryTableRowRenderer': (self._report_history_entries, 'rows'), - 'playlistVideoListContinuation': (self._playlist_entries, None), - 'gridContinuation': (self._grid_entries, None), - 'itemSectionContinuation': (self._post_thread_continuation_entries, None), - 'sectionListContinuation': (extract_entries, None), # for feeds - } - - continuation_items = traverse_obj(response, ( - ('onResponseReceivedActions', 'onResponseReceivedEndpoints'), ..., - 'appendContinuationItemsAction', 'continuationItems', - ), 'continuationContents', get_all=False) - continuation_item = traverse_obj(continuation_items, 0, None, expected_type=dict, default={}) - - video_items_renderer = None - for key in continuation_item: - if key not in known_renderers: - continue - func, parent_key = known_renderers[key] - video_items_renderer = {parent_key: continuation_items} if parent_key else continuation_items - continuation_list = [None] - yield from func(video_items_renderer) - continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) - - if not video_items_renderer: - break - - @staticmethod - def _extract_selected_tab(tabs, fatal=True): - for tab_renderer in tabs: - if tab_renderer.get('selected'): - return tab_renderer - if fatal: - raise ExtractorError('Unable to find selected tab') - - @staticmethod - def _extract_tab_renderers(response): - return traverse_obj( - response, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., ('tabRenderer', 'expandableTabRenderer')), expected_type=dict) - - def _extract_from_tabs(self, item_id, ytcfg, data, tabs): - metadata = self._extract_metadata_from_tabs(item_id, data) - - selected_tab = self._extract_selected_tab(tabs) - metadata['title'] += format_field(selected_tab, 'title', ' - %s') - metadata['title'] += format_field(selected_tab, 'expandedText', ' - %s') - - return self.playlist_result( - self._entries( - selected_tab, metadata['id'], ytcfg, - self._extract_delegated_session_id(ytcfg, data), - self._extract_visitor_data(data, ytcfg)), - **metadata) - - def _extract_metadata_from_tabs(self, item_id, data): - info = {'id': item_id} - - metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) - if metadata_renderer: - channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}), - ('channelUrl', {self.ucid_from_url})) - info.update({ - 'channel': metadata_renderer.get('title'), - 'channel_id': channel_id, - }) - if info['channel_id']: - info['id'] = info['channel_id'] - else: - metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict) - - # pageHeaderViewModel slow rollout began April 2024 - page_header_view_model = traverse_obj(data, ( - 'header', 'pageHeaderRenderer', 'content', 'pageHeaderViewModel', {dict})) - - # We can get the uncropped banner/avatar by replacing the crop params with '=s0' - # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714 - def _get_uncropped(url): - return url_or_none((url or '').split('=')[0] + '=s0') - - avatar_thumbnails = self._extract_thumbnails(metadata_renderer, 'avatar') - if 
avatar_thumbnails: - uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url']) - if uncropped_avatar: - avatar_thumbnails.append({ - 'url': uncropped_avatar, - 'id': 'avatar_uncropped', - 'preference': 1, - }) - - channel_banners = ( - self._extract_thumbnails(data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) - or self._extract_thumbnails( - page_header_view_model, ('banner', 'imageBannerViewModel', 'image'), final_key='sources')) - for banner in channel_banners: - banner['preference'] = -10 - - if channel_banners: - uncropped_banner = _get_uncropped(channel_banners[0]['url']) - if uncropped_banner: - channel_banners.append({ - 'url': uncropped_banner, - 'id': 'banner_uncropped', - 'preference': -5, - }) - - # Deprecated - remove primary_sidebar_renderer when layout discontinued - primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') - playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer'), expected_type=dict) - - primary_thumbnails = self._extract_thumbnails( - primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) - playlist_thumbnails = self._extract_thumbnails( - playlist_header_renderer, ('playlistHeaderBanner', 'heroPlaylistThumbnailRenderer', 'thumbnail')) - - info.update({ - 'title': (traverse_obj(metadata_renderer, 'title') - or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) - or info['id']), - 'availability': self._extract_availability(data), - 'channel_follower_count': ( - self._get_count(data, ('header', ..., 'subscriberCountText')) - or traverse_obj(page_header_view_model, ( - 'metadata', 'contentMetadataViewModel', 'metadataRows', ..., 'metadataParts', - lambda _, v: 'subscribers' in v['text']['content'], 'text', 'content', {parse_count}, any))), - 'description': try_get(metadata_renderer, lambda x: x.get('description', '')), - 'tags': (traverse_obj(data, ('microformat', 'microformatDataRenderer', 'tags', ..., {str})) - or traverse_obj(metadata_renderer, ('keywords', {lambda x: x and shlex.split(x)}, ...))), - 'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners, - }) - - channel_handle = ( - traverse_obj(metadata_renderer, (('vanityChannelUrl', ('ownerUrls', ...)), {self.handle_from_url}), get_all=False) - or traverse_obj(data, ('header', ..., 'channelHandleText', {self.handle_or_none}), get_all=False)) - - if channel_handle: - info.update({ - 'uploader_id': channel_handle, - 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), - }) - - channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False)) - if self._has_badge(channel_badges, BadgeType.VERIFIED): - info['channel_is_verified'] = True - # Playlist stats is a text runs array containing [video count, view count, last updated]. - # last updated or (view count and last updated) may be missing. 
- playlist_stats = get_first( - (primary_sidebar_renderer, playlist_header_renderer), (('stats', 'briefStats', 'numVideosText'), )) - - last_updated_unix = self._parse_time_text( - self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued - or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text'))) - info['modified_date'] = strftime_or_none(last_updated_unix) - - info['view_count'] = self._get_count(playlist_stats, 1) - if info['view_count'] is None: # 0 is allowed - info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText') - if info['view_count'] is None: - info['view_count'] = self._get_count(data, ( - 'contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., 'tabRenderer', 'content', 'sectionListRenderer', - 'contents', ..., 'itemSectionRenderer', 'contents', ..., 'channelAboutFullMetadataRenderer', 'viewCountText')) - - info['playlist_count'] = self._get_count(playlist_stats, 0) - if info['playlist_count'] is None: # 0 is allowed - info['playlist_count'] = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text')) - - if not info.get('channel_id'): - owner = traverse_obj(playlist_header_renderer, 'ownerText') - if not owner: # Deprecated - owner = traverse_obj( - self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer'), - ('videoOwner', 'videoOwnerRenderer', 'title')) - owner_text = self._get_text(owner) - browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {} - info.update({ - 'channel': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text), - 'channel_id': self.ucid_or_none(browse_ep.get('browseId')), - 'uploader_id': self.handle_from_url(urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl'))), - }) - - info.update({ - 'uploader': info['channel'], - 'channel_url': format_field(info.get('channel_id'), None, 'https://www.youtube.com/channel/%s', default=None), - 'uploader_url': format_field(info.get('uploader_id'), None, 'https://www.youtube.com/%s', default=None), - }) - - return info - - def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg): - first_id = last_id = response = None - for page_num in itertools.count(1): - videos = list(self._playlist_entries(playlist)) - if not videos: - return - start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1 - if start >= len(videos): - return - yield from videos[start:] - first_id = first_id or videos[0]['id'] - last_id = videos[-1]['id'] - watch_endpoint = try_get( - playlist, lambda x: x['contents'][-1]['playlistPanelVideoRenderer']['navigationEndpoint']['watchEndpoint']) - headers = self.generate_api_headers( - ytcfg=ytcfg, delegated_session_id=self._extract_delegated_session_id(ytcfg, data), - visitor_data=self._extract_visitor_data(response, data, ytcfg)) - query = { - 'playlistId': playlist_id, - 'videoId': watch_endpoint.get('videoId') or last_id, - 'index': watch_endpoint.get('index') or len(videos), - 'params': watch_endpoint.get('params') or 'OAE%3D', - } - response = self._extract_response( - item_id=f'{playlist_id} page {page_num}', - query=query, ep='next', headers=headers, ytcfg=ytcfg, - check_get_keys='contents', - ) - playlist = try_get( - response, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) - - def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg): - title = playlist.get('title') or try_get( - data, 
lambda x: x['titleText']['simpleText'], str) - playlist_id = playlist.get('playlistId') or item_id - - # Delegating everything except mix playlists to regular tab-based playlist URL - playlist_url = urljoin(url, try_get( - playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - str)) - - # Some playlists are unviewable but YouTube still provides a link to the (broken) playlist page [1] - # [1] MLCT, RLTDwFCb4jeqaKWnciAYM-ZVHg - is_known_unviewable = re.fullmatch(r'MLCT|RLTD[\w-]{22}', playlist_id) - - if playlist_url and playlist_url != url and not is_known_unviewable: - return self.url_result( - playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, - video_title=title) - - return self.playlist_result( - self._extract_inline_playlist(playlist, playlist_id, data, ytcfg), - playlist_id=playlist_id, playlist_title=title) - - def _extract_availability(self, data): - """ - Gets the availability of a given playlist/tab. - Note: Unless YouTube tells us explicitly, we do not assume it is public - @param data: response - """ - sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} - playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} - player_header_privacy = playlist_header_renderer.get('privacy') - - badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges')) - - # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge - privacy_setting_icon = get_first( - (playlist_header_renderer, sidebar_renderer), - ('privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries', - lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'), - expected_type=str) - - microformats_is_unlisted = traverse_obj( - data, ('microformat', 'microformatDataRenderer', 'unlisted'), expected_type=bool) - - return ( - 'public' if ( - self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) - or player_header_privacy == 'PUBLIC' - or privacy_setting_icon == 'PRIVACY_PUBLIC') - else self._availability( - is_private=( - self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) - or player_header_privacy == 'PRIVATE' if player_header_privacy is not None - else privacy_setting_icon == 'PRIVACY_PRIVATE' if privacy_setting_icon is not None else None), - is_unlisted=( - self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) - or player_header_privacy == 'UNLISTED' if player_header_privacy is not None - else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None - else microformats_is_unlisted if microformats_is_unlisted is not None else None), - needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, - needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, - needs_auth=False)) - - @staticmethod - def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): - sidebar_renderer = try_get( - data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) or [] - for item in sidebar_renderer: - renderer = try_get(item, lambda x: x[info_renderer], expected_type) - if renderer: - return renderer - - def _reload_with_unavailable_videos(self, item_id, data, ytcfg): - """ - Reload playlists with unavailable videos (e.g. private videos, region blocked, etc.) 
- """ - is_playlist = bool(traverse_obj( - data, ('metadata', 'playlistMetadataRenderer'), ('header', 'playlistHeaderRenderer'))) - if not is_playlist: - return - headers = self.generate_api_headers( - ytcfg=ytcfg, delegated_session_id=self._extract_delegated_session_id(ytcfg, data), - visitor_data=self._extract_visitor_data(data, ytcfg)) - query = { - 'params': 'wgYCCAA=', - 'browseId': f'VL{item_id}', - } - return self._extract_response( - item_id=item_id, headers=headers, query=query, - check_get_keys='contents', fatal=False, ytcfg=ytcfg, - note='Redownloading playlist API JSON with unavailable videos') - - @functools.cached_property - def skip_webpage(self): - return 'webpage' in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) - - def _extract_webpage(self, url, item_id, fatal=True): - webpage, data = None, None - for retry in self.RetryManager(fatal=fatal): - try: - webpage = self._download_webpage(url, item_id, note='Downloading webpage') - data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} - except ExtractorError as e: - if isinstance(e.cause, network_exceptions): - if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429): - retry.error = e - continue - self._error_or_warning(e, fatal=fatal) - break - - try: - self._extract_and_report_alerts(data) - except ExtractorError as e: - self._error_or_warning(e, fatal=fatal) - break - - # Sometimes youtube returns a webpage with incomplete ytInitialData - # See: https://github.com/yt-dlp/yt-dlp/issues/116 - if not traverse_obj(data, 'contents', 'currentVideoEndpoint', 'onResponseReceivedActions'): - retry.error = ExtractorError('Incomplete yt initial data received') - data = None - continue - - return webpage, data - - def _report_playlist_authcheck(self, ytcfg, fatal=True): - """Use if failed to extract ytcfg (and data) from initial webpage""" - if not ytcfg and self.is_authenticated: - msg = 'Playlists that require authentication may not extract correctly without a successful webpage download' - if 'authcheck' not in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) and fatal: - raise ExtractorError( - f'{msg}. 
If you are not downloading private content, or ' - 'your cookies are only for the first account and channel,' - ' pass "--extractor-args youtubetab:skip=authcheck" to skip this check', - expected=True) - self.report_warning(msg, only_once=True) - - def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=False, default_client='web'): - data = None - if not self.skip_webpage: - webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) - ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) - # Reject webpage data if redirected to home page without explicitly requesting - selected_tab = self._extract_selected_tab(self._extract_tab_renderers(data), fatal=False) or {} - if (url != 'https://www.youtube.com/feed/recommended' - and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page - and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): - msg = 'The channel/playlist does not exist and the URL redirected to youtube.com home page' - if fatal: - raise ExtractorError(msg, expected=True) - self.report_warning(msg, only_once=True) - if not data: - self._report_playlist_authcheck(ytcfg, fatal=fatal) - data = self._extract_tab_endpoint(url, item_id, ytcfg, fatal=fatal, default_client=default_client) - return data, ytcfg - - def _extract_tab_endpoint(self, url, item_id, ytcfg=None, fatal=True, default_client='web'): - headers = self.generate_api_headers(ytcfg=ytcfg, default_client=default_client) - resolve_response = self._extract_response( - item_id=item_id, query={'url': url}, check_get_keys='endpoint', headers=headers, ytcfg=ytcfg, fatal=fatal, - ep='navigation/resolve_url', note='Downloading API parameters API JSON', default_client=default_client) - endpoints = {'browseEndpoint': 'browse', 'watchEndpoint': 'next'} - for ep_key, ep in endpoints.items(): - params = try_get(resolve_response, lambda x: x['endpoint'][ep_key], dict) - if params: - return self._extract_response( - item_id=item_id, query=params, ep=ep, headers=headers, - ytcfg=ytcfg, fatal=fatal, default_client=default_client, - check_get_keys=('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')) - err_note = 'Failed to resolve url (does the playlist exist?)' - if fatal: - raise ExtractorError(err_note, expected=True) - self.report_warning(err_note, item_id) - - _SEARCH_PARAMS = None - - def _search_results(self, query, params=NO_DEFAULT, default_client='web'): - data = {'query': query} - if params is NO_DEFAULT: - params = self._SEARCH_PARAMS - if params: - data['params'] = params - - content_keys = ( - ('contents', 'twoColumnSearchResultsRenderer', 'primaryContents', 'sectionListRenderer', 'contents'), - ('onResponseReceivedCommands', 0, 'appendContinuationItemsAction', 'continuationItems'), - # ytmusic search - ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'), - ('continuationContents', ), - ) - display_id = f'query "{query}"' - check_get_keys = tuple({keys[0] for keys in content_keys}) - ytcfg = self._download_ytcfg(default_client, display_id) if not self.skip_webpage else {} - self._report_playlist_authcheck(ytcfg, fatal=False) - - continuation_list = [None] - search = None - for page_num in itertools.count(1): - data.update(continuation_list[0] or {}) - headers = self.generate_api_headers( - ytcfg=ytcfg, visitor_data=self._extract_visitor_data(search), default_client=default_client) - search = self._extract_response( - item_id=f'{display_id} page {page_num}', ep='search', query=data, - 
default_client=default_client, check_get_keys=check_get_keys, ytcfg=ytcfg, headers=headers) - slr_contents = traverse_obj(search, *content_keys) - yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list) - if not continuation_list[0]: - break - - -class YoutubeTabIE(YoutubeTabBaseInfoExtractor): - IE_DESC = 'YouTube Tabs' - _VALID_URL = r'''(?x: - https?:// - (?!consent\.)(?:\w+\.)? - (?: - youtube(?:kids)?\.com| - {invidious} - )/ - (?: - (?P<channel_type>channel|c|user|browse)/| - (?P<not_channel> - feed/|hashtag/| - (?:playlist|watch)\?.*?\blist= - )| - (?!(?:{reserved_names})\b) # Direct URLs - ) - (?P<id>[^/?\#&]+) - )'''.format( - reserved_names=YoutubeBaseInfoExtractor._RESERVED_NAMES, - invidious='|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), - ) - IE_NAME = 'youtube:tab' - - _TESTS = [{ - 'note': 'playlists, multipage', - 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. - Playlists', - 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', - 'uploader_id': '@IgorDataScience', - 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'channel': 'Igor Kleiner Ph.D.', - 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], - 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', - 'channel_follower_count': int, - }, - }, { - 'note': 'playlists, multipage, different order', - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Igor Kleiner Ph.D. 
- Playlists', - 'description': 'md5:15d7dd9e333cb987907fcb0d604b233a', - 'uploader': 'Igor Kleiner Ph.D.', - 'uploader_id': '@IgorDataScience', - 'uploader_url': 'https://www.youtube.com/@IgorDataScience', - 'tags': ['критическое мышление', 'наука просто', 'математика', 'анализ данных'], - 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'channel': 'Igor Kleiner Ph.D.', - 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', - 'channel_follower_count': int, - }, - }, { - 'note': 'playlists, series', - 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Playlists', - 'description': 'md5:602e3789e6a0cb7d9d352186b720e395', - 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'channel': '3Blue1Brown', - 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'uploader_id': '@3blue1brown', - 'uploader_url': 'https://www.youtube.com/@3blue1brown', - 'uploader': '3Blue1Brown', - 'tags': ['Mathematics'], - 'channel_follower_count': int, - 'channel_is_verified': True, - }, - }, { - 'note': 'playlists, singlepage', - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'title': 'ThirstForScience - Playlists', - 'description': 'md5:609399d937ea957b0f53cbffb747a14c', - 'uploader': 'ThirstForScience', - 'uploader_url': 'https://www.youtube.com/@ThirstForScience', - 'uploader_id': '@ThirstForScience', - 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', - 'tags': 'count:12', - 'channel': 'ThirstForScience', - 'channel_follower_count': int, - }, - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }, { - 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'info_dict': { - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', - 'description': '', - 'tags': [], - 'view_count': int, - 'modified_date': '20201130', - 'channel': 'Sergey M.', - 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'availability': 'public', - 'uploader': 'Sergey M.', - 'uploader_url': 'https://www.youtube.com/@sergeym.6173', - 'uploader_id': '@sergeym.6173', - }, - 'playlist_count': 1, - }, { - 'note': 'empty playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'info_dict': { - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', - 'tags': [], - 'channel': 'Sergey M.', - 'description': '', - 'modified_date': '20230921', - 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'availability': 'unlisted', - 'uploader_url': 'https://www.youtube.com/@sergeym.6173', - 'uploader_id': '@sergeym.6173', - 'uploader': 'Sergey M.', - }, - 'playlist_count': 0, - }, { - 'note': 'Home tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Home', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': '@lexwill718', - 'channel': 'lex will', - 'tags': ['bible', 'history', 'prophesy'], - 'uploader_url': 
'https://www.youtube.com/@lexwill718', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_follower_count': int, - }, - 'playlist_mincount': 2, - }, { - 'note': 'Videos tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': '@lexwill718', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'channel': 'lex will', - 'channel_follower_count': int, - }, - 'playlist_mincount': 975, - }, { - 'note': 'Videos tab, sorted by popular', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': '@lexwill718', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'channel': 'lex will', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_follower_count': int, - }, - 'playlist_mincount': 199, - }, { - 'note': 'Playlists tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Playlists', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': '@lexwill718', - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - }, - 'playlist_mincount': 17, - }, { - 'note': 'Community tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'uploader_id': '@lexwill718', - 'uploader': 'lex will', - }, - 'playlist_mincount': 18, - }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'channel': 'lex will', - 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@lexwill718', - 'uploader_id': '@lexwill718', - 'uploader': 'lex will', - }, - 'playlist_mincount': 12, - }, { - 'note': 'Search tab', - 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', - 'playlist_mincount': 40, - 'info_dict': { - 'id': 
'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', - 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'tags': ['Mathematics'], - 'channel': '3Blue1Brown', - 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'channel_follower_count': int, - 'uploader_url': 'https://www.youtube.com/@3blue1brown', - 'uploader_id': '@3blue1brown', - 'uploader': '3Blue1Brown', - 'channel_is_verified': True, - }, - }, { - 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', - 'tags': [], - 'view_count': int, - 'modified_date': '20150605', - 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', - 'channel_url': 'https://www.youtube.com/channel/UCEPzS1rYsrkqzSLNp76nrcg', - 'channel': 'Christiaan008', - 'availability': 'public', - 'uploader_id': '@ChRiStIaAn008', - 'uploader': 'Christiaan008', - 'uploader_url': 'https://www.youtube.com/@ChRiStIaAn008', - }, - 'playlist_count': 96, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'channel_url': 'https://www.youtube.com/channel/UCBABnxM4Ar9ten8Mdjj1j0Q', - 'tags': [], - 'modified_date': r're:\d{8}', - 'channel': 'Cauchemar', - 'view_count': int, - 'description': '', - 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', - 'availability': 'public', - 'uploader_id': '@Cauchemar89', - 'uploader': 'Cauchemar', - 'uploader_url': 'https://www.youtube.com/@Cauchemar89', - }, - 'playlist_mincount': 1123, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], - }, { - 'note': 'even larger playlist, 8832 videos', - 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'tags': [], - 'view_count': int, - 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - 'channel_url': 'https://www.youtube.com/channel/UCXw-G3eDE9trcvY2sBMM_aA', - 'channel': 'Interstellar Movie', - 'description': '', - 'modified_date': r're:\d{8}', - 'availability': 'public', - 'uploader_id': '@InterstellarMovie', - 'uploader': 'Interstellar Movie', - 'uploader_url': 'https://www.youtube.com/@InterstellarMovie', - }, - 'playlist_mincount': 21, - }, { - 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', - 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', - 'view_count': int, - 'channel': 'Phim Siêu Nhân Nhật Bản', - 'tags': [], - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', - 'channel_id': 
'UCTYLiWFZy8xtPwxFwX9rV7Q', - 'modified_date': r're:\d{8}', - 'availability': 'public', - 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban', - 'uploader_id': '@phimsieunhannhatban', - 'uploader': 'Phim Siêu Nhân Nhật Bản', - }, - 'playlist_mincount': 200, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], - }, { - 'note': 'Playlist with unavailable videos in page 7', - 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', - 'info_dict': { - 'title': 'Uploads from BlankTV', - 'id': 'UU8l9frL61Yl5KFOl87nIm2w', - 'channel': 'BlankTV', - 'channel_url': 'https://www.youtube.com/channel/UC8l9frL61Yl5KFOl87nIm2w', - 'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w', - 'view_count': int, - 'tags': [], - 'modified_date': r're:\d{8}', - 'description': '', - 'availability': 'public', - 'uploader_id': '@blanktv', - 'uploader': 'BlankTV', - 'uploader_url': 'https://www.youtube.com/@blanktv', - }, - 'playlist_mincount': 1000, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], - }, { - 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', - 'tags': [], - 'view_count': int, - 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'channel_url': 'https://www.youtube.com/channel/UC9-y-6csu5WGm29I7JiwpnA', - 'channel': 'Computerphile', - 'availability': 'public', - 'modified_date': '20190712', - 'uploader_id': '@Computerphile', - 'uploader': 'Computerphile', - 'uploader_url': 'https://www.youtube.com/@Computerphile', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'only_matching': True, - }, { - 'note': 'Playlist URL that does not actually serve a playlist', - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', - 'info_dict': { - 'id': 'hGkQjiJLjWQ', # This will keep changing - 'ext': 'mp4', - 'title': str, - 'upload_date': r're:\d{8}', - 'description': str, - 'categories': ['News & Politics'], - 'tags': list, - 'like_count': int, - 'release_timestamp': int, - 'channel': 'Sky News', - 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ', - 'age_limit': 0, - 'view_count': int, - 'thumbnail': r're:https?://i\.ytimg\.com/vi/[^/]+/maxresdefault(?:_live)?\.jpg', - 'playable_in_embed': True, - 'release_date': r're:\d+', - 'availability': 'public', - 'live_status': 'is_live', - 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ', - 'channel_follower_count': int, - 'concurrent_view_count': int, - 'uploader_url': 
'https://www.youtube.com/@SkyNews', - 'uploader_id': '@SkyNews', - 'uploader': 'Sky News', - 'channel_is_verified': True, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Ignoring subtitle tracks found in '], - }, { - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'note': 'A channel that is not live. Should raise error', - 'url': 'https://www.youtube.com/user/numberphile/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/trending', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/library', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/history', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/subscriptions', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { - 'note': 'Recommended - redirects to home page.', - 'url': 'https://www.youtube.com/feed/recommended', - 'only_matching': True, - }, { - 'note': 'inline playlist with not always working continuations', - 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/zsecurity', - 'only_matching': True, - }, { - 'url': 'http://www.youtube.com/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/hashtag/cctv9', - 'info_dict': { - 'id': 'cctv9', - 'title': 'cctv9 - All', - 'tags': [], - }, - 'playlist_mincount': 300, # not consistent but should be over 300 - }, { - 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', - 'only_matching': True, - }, { - 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', - 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'only_matching': True, - }, { - 'note': '/browse/ should redirect to /channel/', - 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', - 'only_matching': True, - }, { - 'note': 'VLPL, should redirect to playlist?list=PL...', - 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'info_dict': { - 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', - 'title': 'NCS : All Releases 💿', - 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', - 'modified_date': r're:\d{8}', - 'view_count': int, - 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'tags': [], - 'channel': 'NoCopyrightSounds', - 'availability': 'public', - 'uploader_url': 
'https://www.youtube.com/@NoCopyrightSounds', - 'uploader': 'NoCopyrightSounds', - 'uploader_id': '@NoCopyrightSounds', - }, - 'playlist_mincount': 166, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden', 'YouTube Music is not directly supported'], - }, { - # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos - 'note': 'Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'tags': [], - 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'channel': 'Royalty Free Music - Topic', - 'view_count': int, - 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', - 'modified_date': r're:\d{8}', - 'description': '', - 'availability': 'public', - 'uploader': 'Royalty Free Music - Topic', - }, - 'playlist_mincount': 101, - 'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'], - }, { - # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) - # Treat as a general feed - 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', - 'info_dict': { - 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', - 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', - 'tags': [], - }, - 'playlist_mincount': 9, - }, { - 'note': 'Youtube music Album', - 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', - 'info_dict': { - 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', - 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', - 'tags': [], - 'view_count': int, - 'description': '', - 'availability': 'unlisted', - 'modified_date': r're:\d{8}', - }, - 'playlist_count': 50, - 'expected_warnings': ['YouTube Music is not directly supported'], - }, { - 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'info_dict': { - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', - 'availability': 'unlisted', - 'tags': [], - 'modified_date': '20220418', - 'channel': 'colethedj', - 'view_count': int, - 'description': '', - 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader_url': 'https://www.youtube.com/@colethedj1894', - 'uploader_id': '@colethedj1894', - 'uploader': 'colethedj', - }, - 'playlist': [{ - 'info_dict': { - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'id': 'BaW_jenozKc', - '_type': 'url', - 'ie_key': 'Youtube', - 'duration': 10, - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'view_count': int, - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc', - 'channel': 'Philipp Hagemeister', - 'uploader_id': '@PhilippHagemeister', - 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', - 'uploader': 'Philipp Hagemeister', - }, - }], - 'playlist_count': 1, - 'params': {'extract_flat': True}, - }, { - 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData', - 'url': 'https://www.youtube.com/feed/recommended', - 'info_dict': { - 'id': 'recommended', - 'title': 'recommended', - 'tags': [], - }, - 'playlist_mincount': 50, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, - }, - }, { - 'note': 'API Fallback: /videos tab, sorted by oldest first', - 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', - 'info_dict': { - 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - 'title': 'Cody\'sLab - Videos', - 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', - 'channel': 'Cody\'sLab', - 'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - 'tags': [], - 'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw', - 'channel_follower_count': int, - }, - 'playlist_mincount': 650, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, - }, - 'skip': 'Query for sorting no longer works', - }, { - 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'modified_date': r're:\d{8}', - 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', - 'tags': [], - 'channel': 'Royalty Free Music - Topic', - 'view_count': int, - 'availability': 'public', - 'uploader': 'Royalty Free Music - Topic', - }, - 'playlist_mincount': 101, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}}, - }, - 'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'], - }, { - 'note': 'non-standard redirect to regional channel', - 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', - 'only_matching': True, - }, { - 'note': 'collaborative playlist (uploader name in the form "by <uploader> and x other(s)")', - 'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', - 'info_dict': { - 'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', - 'modified_date': '20220407', - 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', - 'tags': [], - 'availability': 'unlisted', - 'channel_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q', - 'channel': 'pukkandan', - 'description': 'Test for collaborative playlist', - 'title': 'yt-dlp test - collaborative playlist', - 'view_count': int, - 'uploader_url': 'https://www.youtube.com/@pukkandan', - 'uploader_id': '@pukkandan', - 'uploader': 'pukkandan', - }, - 'playlist_mincount': 2, - }, { - 'note': 'translated tab name', - 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/playlists', - 'info_dict': { - 'id': 'UCiu-3thuViMebBjw_5nWYrA', - 'tags': [], - 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - 'description': 'test description', - 'title': 'cole-dlp-test-acc - 再生リスト', - 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', - 'channel': 'cole-dlp-test-acc', - 'uploader_url': 'https://www.youtube.com/@coletdjnz', - 'uploader_id': '@coletdjnz', - 'uploader': 'cole-dlp-test-acc', - }, - 'playlist_mincount': 1, - 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, - 'expected_warnings': ['Preferring "ja"'], - }, { - # XXX: this should really check flat playlist entries, but the test suite doesn't support that - 'note': 'preferred lang set with playlist with translated video 
titles', - 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', - 'info_dict': { - 'id': 'PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', - 'tags': [], - 'view_count': int, - 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - 'channel': 'cole-dlp-test-acc', - 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', - 'description': 'test', - 'title': 'dlp test playlist', - 'availability': 'public', - 'uploader_url': 'https://www.youtube.com/@coletdjnz', - 'uploader_id': '@coletdjnz', - 'uploader': 'cole-dlp-test-acc', - }, - 'playlist_mincount': 1, - 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, - 'expected_warnings': ['Preferring "ja"'], - }, { - # shorts audio pivot for 2GtVksBMYFM. - 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', - 'info_dict': { - 'id': 'sfv_audio_pivot', - 'title': 'sfv_audio_pivot', - 'tags': [], - }, - 'playlist_mincount': 50, - - }, { - # Channel with a real live tab (not to be mistaken with streams tab) - # Do not treat like it should redirect to live stream - 'url': 'https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live', - 'info_dict': { - 'id': 'UCEH7P7kyJIkS_gJf93VYbmg', - 'title': 'UCEH7P7kyJIkS_gJf93VYbmg - Live', - 'tags': [], - }, - 'playlist_mincount': 20, - }, { - # Tab name is not the same as tab id - 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/letsplay', - 'info_dict': { - 'id': 'UCQvWX73GQygcwXOTSf_VDVg', - 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Let\'s play', - 'tags': [], - }, - 'playlist_mincount': 8, - }, { - # Home tab id is literally home. Not to get mistaken with featured - 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/home', - 'info_dict': { - 'id': 'UCQvWX73GQygcwXOTSf_VDVg', - 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Home', - 'tags': [], - }, - 'playlist_mincount': 8, - }, { - # Should get three playlists for videos, shorts and streams tabs - 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'info_dict': { - 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', - 'title': 'Polka Ch. 尾丸ポルカ', - 'channel_follower_count': int, - 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', - 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'description': 'md5:49809d8bf9da539bc48ed5d1f83c33f2', - 'channel': 'Polka Ch. 尾丸ポルカ', - 'tags': 'count:35', - 'uploader_url': 'https://www.youtube.com/@OmaruPolka', - 'uploader': 'Polka Ch. 
尾丸ポルカ', - 'uploader_id': '@OmaruPolka', - 'channel_is_verified': True, - }, - 'playlist_count': 3, - }, { - # Shorts tab with channel with handle - # TODO: fix channel description - 'url': 'https://www.youtube.com/@NotJustBikes/shorts', - 'info_dict': { - 'id': 'UC0intLFzLaudFG-xAvUEO-A', - 'title': 'Not Just Bikes - Shorts', - 'tags': 'count:10', - 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:5e82545b3a041345927a92d0585df247', - 'channel_follower_count': int, - 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', - 'channel': 'Not Just Bikes', - 'uploader_url': 'https://www.youtube.com/@NotJustBikes', - 'uploader': 'Not Just Bikes', - 'uploader_id': '@NotJustBikes', - 'channel_is_verified': True, - }, - 'playlist_mincount': 10, - }, { - # Streams tab - 'url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig/streams', - 'info_dict': { - 'id': 'UC3eYAvjCVwNHgkaGbXX3sig', - 'title': '中村悠一 - Live', - 'tags': 'count:7', - 'channel_id': 'UC3eYAvjCVwNHgkaGbXX3sig', - 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', - 'channel': '中村悠一', - 'channel_follower_count': int, - 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', - 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura', - 'uploader_id': '@Yuichi-Nakamura', - 'uploader': '中村悠一', - }, - 'playlist_mincount': 60, - }, { - # Channel with no uploads and hence no videos, streams, shorts tabs or uploads playlist. This should fail. - # See test_youtube_lists - 'url': 'https://www.youtube.com/channel/UC2yXPzFejc422buOIzn_0CA', - 'only_matching': True, - }, { - # No uploads and no UCID given. Should fail with no uploads error - # See test_youtube_lists - 'url': 'https://www.youtube.com/news', - 'only_matching': True, - }, { - # No videos tab but has a shorts tab - 'url': 'https://www.youtube.com/c/TKFShorts', - 'info_dict': { - 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', - 'title': 'Shorts Break - Shorts', - 'tags': 'count:48', - 'channel_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', - 'channel': 'Shorts Break', - 'description': 'md5:6de33c5e7ba686e5f3efd4e19c7ef499', - 'channel_follower_count': int, - 'channel_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', - 'uploader_url': 'https://www.youtube.com/@ShortsBreak_Official', - 'uploader': 'Shorts Break', - 'uploader_id': '@ShortsBreak_Official', - }, - 'playlist_mincount': 30, - }, { - # Trending Now Tab. tab id is empty - 'url': 'https://www.youtube.com/feed/trending', - 'info_dict': { - 'id': 'trending', - 'title': 'trending - Now', - 'tags': [], - }, - 'playlist_mincount': 30, - }, { - # Trending Gaming Tab. 
tab id is empty - 'url': 'https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D', - 'info_dict': { - 'id': 'trending', - 'title': 'trending - Gaming', - 'tags': [], - }, - 'playlist_mincount': 30, - }, { - # Shorts url result in shorts tab - # TODO: Fix channel id extraction - 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', - 'info_dict': { - 'id': 'UCiu-3thuViMebBjw_5nWYrA', - 'title': 'cole-dlp-test-acc - Shorts', - 'channel': 'cole-dlp-test-acc', - 'description': 'test description', - 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', - 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - 'tags': [], - 'uploader_url': 'https://www.youtube.com/@coletdjnz', - 'uploader_id': '@coletdjnz', - 'uploader': 'cole-dlp-test-acc', - }, - 'playlist': [{ - 'info_dict': { - # Channel data is not currently available for short renderers (as of 2023-03-01) - '_type': 'url', - 'ie_key': 'Youtube', - 'url': 'https://www.youtube.com/shorts/sSM9J5YH_60', - 'id': 'sSM9J5YH_60', - 'title': 'SHORT short', - 'view_count': int, - 'thumbnails': list, - }, - }], - 'params': {'extract_flat': True}, - }, { - # Live video status should be extracted - 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', - 'info_dict': { - 'id': 'UCQvWX73GQygcwXOTSf_VDVg', - 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Live', # TODO: should be Minecraft - Live or Minecraft - Topic - Live - 'tags': [], - }, - 'playlist': [{ - 'info_dict': { - '_type': 'url', - 'ie_key': 'Youtube', - 'url': 'startswith:https://www.youtube.com/watch?v=', - 'id': str, - 'title': str, - 'live_status': 'is_live', - 'channel_id': str, - 'channel_url': str, - 'concurrent_view_count': int, - 'channel': str, - 'uploader': str, - 'uploader_url': str, - 'uploader_id': str, - 'channel_is_verified': bool, # this will keep changing - }, - }], - 'params': {'extract_flat': True, 'playlist_items': '1'}, - 'playlist_mincount': 1, - }, { - # Channel renderer metadata. 
Contains number of videos on the channel - 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', - 'info_dict': { - 'id': 'UCiu-3thuViMebBjw_5nWYrA', - 'title': 'cole-dlp-test-acc - Channels', - 'channel': 'cole-dlp-test-acc', - 'description': 'test description', - 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', - 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - 'tags': [], - 'uploader_url': 'https://www.youtube.com/@coletdjnz', - 'uploader_id': '@coletdjnz', - 'uploader': 'cole-dlp-test-acc', - }, - 'playlist': [{ - 'info_dict': { - '_type': 'url', - 'ie_key': 'YoutubeTab', - 'url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'title': 'PewDiePie', - 'channel': 'PewDiePie', - 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'thumbnails': list, - 'channel_follower_count': int, - 'playlist_count': int, - 'uploader': 'PewDiePie', - 'uploader_url': 'https://www.youtube.com/@PewDiePie', - 'uploader_id': '@PewDiePie', - 'channel_is_verified': True, - }, - }], - 'params': {'extract_flat': True}, - }, { - 'url': 'https://www.youtube.com/@3blue1brown/about', - 'info_dict': { - 'id': '@3blue1brown', - 'tags': ['Mathematics'], - 'title': '3Blue1Brown', - 'channel_follower_count': int, - 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'channel': '3Blue1Brown', - 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'description': 'md5:4d1da95432004b7ba840ebc895b6b4c9', - 'uploader_url': 'https://www.youtube.com/@3blue1brown', - 'uploader_id': '@3blue1brown', - 'uploader': '3Blue1Brown', - 'channel_is_verified': True, - }, - 'playlist_count': 0, - }, { - # Podcasts tab, with rich entry lockupViewModel - 'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts', - 'info_dict': { - 'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', - 'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', - 'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast', - 'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c', - 'title': '99% Invisible - Podcasts', - 'uploader': '99% Invisible', - 'channel_follower_count': int, - 'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw', - 'tags': [], - 'channel': '99% Invisible', - 'uploader_id': '@99percentinvisiblepodcast', - }, - 'playlist_count': 5, - }, { - # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) - 'url': 'https://www.youtube.com/@AHimitsu/releases', - 'info_dict': { - 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', - 'channel': 'A Himitsu', - 'uploader_url': 'https://www.youtube.com/@AHimitsu', - 'title': 'A Himitsu - Releases', - 'uploader_id': '@AHimitsu', - 'uploader': 'A Himitsu', - 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A', - 'tags': 'count:12', - 'description': 'I make music', - 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', - 'channel_follower_count': int, - 'channel_is_verified': True, - }, - 'playlist_mincount': 10, - }, { - # Playlist with only shorts, shown as reel renderers - # FIXME: future: YouTube currently doesn't give continuation for this, - # may do in future. 
- 'url': 'https://www.youtube.com/playlist?list=UUxqPAgubo4coVn9Lx1FuKcg', - 'info_dict': { - 'id': 'UUxqPAgubo4coVn9Lx1FuKcg', - 'channel_url': 'https://www.youtube.com/channel/UCxqPAgubo4coVn9Lx1FuKcg', - 'view_count': int, - 'uploader_id': '@BangyShorts', - 'description': '', - 'uploader_url': 'https://www.youtube.com/@BangyShorts', - 'channel_id': 'UCxqPAgubo4coVn9Lx1FuKcg', - 'channel': 'Bangy Shorts', - 'uploader': 'Bangy Shorts', - 'tags': [], - 'availability': 'public', - 'modified_date': r're:\d{8}', - 'title': 'Uploads from Bangy Shorts', - }, - 'playlist_mincount': 100, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], - }, { - 'note': 'Tags containing spaces', - 'url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', - 'playlist_count': 3, - 'info_dict': { - 'id': 'UC7_YxT-KID8kRbqZo7MyscQ', - 'channel': 'Markiplier', - 'channel_id': 'UC7_YxT-KID8kRbqZo7MyscQ', - 'title': 'Markiplier', - 'channel_follower_count': int, - 'description': 'md5:0c010910558658824402809750dc5d97', - 'uploader_id': '@markiplier', - 'uploader_url': 'https://www.youtube.com/@markiplier', - 'uploader': 'Markiplier', - 'channel_url': 'https://www.youtube.com/channel/UC7_YxT-KID8kRbqZo7MyscQ', - 'channel_is_verified': True, - 'tags': ['markiplier', 'comedy', 'gaming', 'funny videos', 'funny moments', - 'sketch comedy', 'laughing', 'lets play', 'challenge videos', 'hilarious', - 'challenges', 'sketches', 'scary games', 'funny games', 'rage games', - 'mark fischbach'], - }, - }] - - @classmethod - def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super().suitable(url) - - _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/[^?#/]+))?(?P<post>.*)$') - - def _get_url_mobj(self, url): - mobj = self._URL_RE.match(url).groupdict() - mobj.update((k, '') for k, v in mobj.items() if v is None) - return mobj - - def _extract_tab_id_and_name(self, tab, base_url='https://www.youtube.com'): - tab_name = (tab.get('title') or '').lower() - tab_url = urljoin(base_url, traverse_obj( - tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url'))) - - tab_id = ((tab_url and self._get_url_mobj(tab_url)['tab'][1:]) - or traverse_obj(tab, 'tabIdentifier', expected_type=str)) - if tab_id: - return { - 'TAB_ID_SPONSORSHIPS': 'membership', - }.get(tab_id, tab_id), tab_name - - # Fallback to tab name if we cannot get the tab id. - # XXX: should we strip non-ascii letters? e.g. in case of 'let's play' tab example on special gaming channel - # Note that in the case of translated tab name this may result in an empty string, which we don't want. 
- if tab_name: - self.write_debug(f'Falling back to selected tab name: {tab_name}') - return { - 'home': 'featured', - 'live': 'streams', - }.get(tab_name, tab_name), tab_name - - def _has_tab(self, tabs, tab_id): - return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs) - - def _empty_playlist(self, item_id, data): - return self.playlist_result([], item_id, **self._extract_metadata_from_tabs(item_id, data)) - - @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data - def _real_extract(self, url, smuggled_data): - item_id = self._match_id(url) - url = urllib.parse.urlunparse( - urllib.parse.urlparse(url)._replace(netloc='www.youtube.com')) - compat_opts = self.get_param('compat_opts', []) - - mobj = self._get_url_mobj(url) - pre, tab, post, is_channel = mobj['pre'], mobj['tab'], mobj['post'], not mobj['not_channel'] - if is_channel and smuggled_data.get('is_music_url'): - if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist - return self.url_result( - f'https://music.youtube.com/playlist?list={item_id[2:]}', YoutubeTabIE, item_id[2:]) - elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist - mdata = self._extract_tab_endpoint( - f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music') - murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), - get_all=False, expected_type=str) - if not murl: - raise ExtractorError('Failed to resolve album to playlist') - return self.url_result(murl, YoutubeTabIE) - elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/ - return self.url_result( - f'https://music.youtube.com/channel/{item_id}{tab}{post}', YoutubeTabIE, item_id) - - original_tab_id, display_id = tab[1:], f'{item_id}{tab}' - if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts: - url = f'{pre}/videos{post}' - if smuggled_data.get('is_music_url'): - self.report_warning(f'YouTube Music is not directly supported. Redirecting to {url}') - - # Handle both video/playlist URLs - qs = parse_qs(url) - video_id, playlist_id = (traverse_obj(qs, (key, 0)) for key in ('v', 'list')) - if not video_id and mobj['not_channel'].startswith('watch'): - if not playlist_id: - # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable - raise ExtractorError('A video URL was given without video ID', expected=True) - # Common mistake: https://www.youtube.com/watch?list=playlist_id - self.report_warning(f'A video URL was given without video ID. 
Trying to download playlist {playlist_id}') - return self.url_result( - f'https://www.youtube.com/playlist?list={playlist_id}', YoutubeTabIE, playlist_id) - - if not self._yes_playlist(playlist_id, video_id): - return self.url_result( - f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id) - - data, ytcfg = self._extract_data(url, display_id) - - # YouTube may provide a non-standard redirect to the regional channel - # See: https://github.com/yt-dlp/yt-dlp/issues/2694 - # https://support.google.com/youtube/answer/2976814#zippy=,conditional-redirects - redirect_url = traverse_obj( - data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False) - if redirect_url and 'no-youtube-channel-redirect' not in compat_opts: - redirect_url = ''.join((urljoin('https://www.youtube.com', redirect_url), tab, post)) - self.to_screen(f'This playlist is likely not available in your region. Following conditional redirect to {redirect_url}') - return self.url_result(redirect_url, YoutubeTabIE) - - tabs, extra_tabs = self._extract_tab_renderers(data), [] - if is_channel and tabs and 'no-youtube-channel-redirect' not in compat_opts: - selected_tab = self._extract_selected_tab(tabs) - selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url) # NB: Name may be translated - self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}') - - # /about is no longer a tab - if original_tab_id == 'about': - return self._empty_playlist(item_id, data) - - if not original_tab_id and selected_tab_name: - self.to_screen('Downloading all uploads of the channel. ' - 'To download only the videos in a specific tab, pass the tab\'s URL') - if self._has_tab(tabs, 'streams'): - extra_tabs.append(''.join((pre, '/streams', post))) - if self._has_tab(tabs, 'shorts'): - extra_tabs.append(''.join((pre, '/shorts', post))) - # XXX: Members-only tab should also be extracted - - if not extra_tabs and selected_tab_id != 'videos': - # Channel does not have streams, shorts or videos tabs - if item_id[:2] != 'UC': - return self._empty_playlist(item_id, data) - - # Topic channels don't have /videos. Use the equivalent playlist instead - pl_id = f'UU{item_id[2:]}' - pl_url = f'https://www.youtube.com/playlist?list={pl_id}' - try: - data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True) - except ExtractorError: - return self._empty_playlist(item_id, data) - else: - item_id, url = pl_id, pl_url - self.to_screen( - f'The channel does not have a videos, shorts, or live tab. Redirecting to playlist {pl_id} instead') - - elif extra_tabs and selected_tab_id != 'videos': - # When there are shorts/live tabs but not videos tab - url, data = f'{pre}{post}', None - - elif (original_tab_id or 'videos') != selected_tab_id: - if original_tab_id == 'live': - # Live tab should have redirected to the video - # Except in the case the channel has an actual live tab - # Example: https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live - raise UserNotLive(video_id=item_id) - elif selected_tab_name: - raise ExtractorError(f'This channel does not have a {original_tab_id} tab', expected=True) - - # For channels such as https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg - url = f'{pre}{post}' - - # YouTube sometimes provides a button to reload playlist with unavailable videos. 
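- # Emulate pressing that button unless the 'no-youtube-unavailable-videos' compat option is set.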
- if 'no-youtube-unavailable-videos' not in compat_opts: - data = self._reload_with_unavailable_videos(display_id, data, ytcfg) or data - self._extract_and_report_alerts(data, only_once=True) - - tabs, entries = self._extract_tab_renderers(data), [] - if tabs: - entries = [self._extract_from_tabs(item_id, ytcfg, data, tabs)] - entries[0].update({ - 'extractor_key': YoutubeTabIE.ie_key(), - 'extractor': YoutubeTabIE.IE_NAME, - 'webpage_url': url, - }) - if self.get_param('playlist_items') == '0': - entries.extend(self.url_result(u, YoutubeTabIE) for u in extra_tabs) - else: # Users expect to get all `video_id`s even with `--flat-playlist`. So don't return `url_result` - entries.extend(map(self._real_extract, extra_tabs)) - - if len(entries) == 1: - return entries[0] - elif entries: - metadata = self._extract_metadata_from_tabs(item_id, data) - uploads_url = 'the Uploads (UU) playlist URL' - if try_get(metadata, lambda x: x['channel_id'].startswith('UC')): - uploads_url = f'https://www.youtube.com/playlist?list=UU{metadata["channel_id"][2:]}' - self.to_screen( - 'Downloading as multiple playlists, separated by tabs. ' - f'To download as a single playlist instead, pass {uploads_url}') - return self.playlist_result(entries, item_id, **metadata) - - # Inline playlist - playlist = traverse_obj( - data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict) - if playlist: - return self._extract_from_playlist(item_id, url, data, playlist, ytcfg) - - video_id = traverse_obj( - data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id - if video_id: - if tab != '/live': # live tab is expected to redirect to video - self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}') - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id) - - raise ExtractorError('Unable to recognize tab page') - - -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube playlists' - _VALID_URL = r'''(?x)(?: - (?:https?://)? - (?:\w+\.)? - (?: - (?: - youtube(?:kids)?\.com| - {invidious} - ) - /.*?\?.*?\blist= - )? 
- (?P<id>{playlist_id}) - )'''.format( - playlist_id=YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, - invidious='|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), - ) - IE_NAME = 'youtube:playlist' - _TESTS = [{ - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickman', - 'uploader_id': '@WickmanVT', - 'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2', - 'view_count': int, - 'uploader_url': 'https://www.youtube.com/@WickmanVT', - 'modified_date': r're:\d{8}', - 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', - 'channel': 'Wickman', - 'tags': [], - 'channel_url': 'https://www.youtube.com/channel/UCKSpbfbl5kRQpTdL7kMc-1Q', - 'availability': 'public', - }, - 'playlist_mincount': 29, - }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'uploader': 'milan', - 'uploader_id': '@milan5503', - 'description': '', - 'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', - 'tags': [], - 'modified_date': '20140919', - 'view_count': int, - 'channel': 'milan', - 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - 'uploader_url': 'https://www.youtube.com/@milan5503', - 'availability': 'public', - }, - 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden', 'Retrying', 'Giving up'], - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 455, - 'info_dict': { - 'title': '2018 Chinese New Singles (11/6 updated)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'uploader': 'LBK', - 'uploader_id': '@music_king', - 'description': 'md5:da521864744d60a198e3a88af4db0d9d', - 'channel': 'LBK', - 'view_count': int, - 'channel_url': 'https://www.youtube.com/channel/UC21nz3_MesPLqtDqwdvnoxA', - 'tags': [], - 'uploader_url': 'https://www.youtube.com/@music_king', - 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', - 'modified_date': r're:\d{8}', - 'availability': 'public', - }, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - if YoutubeTabIE.suitable(url): - return False - from ..utils import parse_qs - qs = parse_qs(url) - if qs.get('v', [None])[0]: - return False - return super().suitable(url) - - def _real_extract(self, url): - playlist_id = self._match_id(url) - is_music_url = YoutubeBaseInfoExtractor.is_music_url(url) - url = update_url_query( - 'https://www.youtube.com/playlist', - parse_qs(url) or {'list': playlist_id}) - if is_music_url: - url = smuggle_url(url, {'is_music_url': True}) - return self.url_result(url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - - -class YoutubeYtBeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'youtu.be' - _VALID_URL = rf'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{{11}})/*?.*?\blist=(?P<playlist_id>{YoutubeBaseInfoExtractor._PLAYLIST_ID_RE})' - _TESTS = [{ - 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', - 
'info_dict': { - 'id': 'yeWKywCrFtk', - 'ext': 'mp4', - 'title': 'Small Scale Baler and Braiding Rugs', - 'uploader': 'Backus-Page House Museum', - 'uploader_id': '@backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@backuspagemuseum', - 'upload_date': '20161008', - 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', - 'categories': ['Nonprofits & Activism'], - 'tags': list, - 'like_count': int, - 'age_limit': 0, - 'playable_in_embed': True, - 'thumbnail': r're:^https?://.*\.webp', - 'channel': 'Backus-Page House Museum', - 'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw', - 'live_status': 'not_live', - 'view_count': int, - 'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw', - 'availability': 'public', - 'duration': 59, - 'comment_count': int, - 'channel_follower_count': int, - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - }, { - 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - playlist_id = mobj.group('playlist_id') - return self.url_result( - update_url_query('https://www.youtube.com/watch', { - 'v': video_id, - 'list': playlist_id, - 'feature': 'youtu.be', - }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id) - - -class YoutubeLivestreamEmbedIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube livestream embeds' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/embed/live_stream/?\?(?:[^#]+&)?channel=(?P<id>[^&#]+)' - _TESTS = [{ - 'url': 'https://www.youtube.com/embed/live_stream?channel=UC2_KI6RB__jGdlnK6dvFEZA', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - return self.url_result( - f'https://www.youtube.com/channel/{channel_id}/live', - ie=YoutubeTabIE.ie_key(), video_id=channel_id) - - -class YoutubeYtUserIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube user videos; "ytuser:" prefix' - IE_NAME = 'youtube:user' - _VALID_URL = r'ytuser:(?P<id>.+)' - _TESTS = [{ - 'url': 'ytuser:phihag', - 'only_matching': True, - }] - - def _real_extract(self, url): - user_id = self._match_id(url) - return self.url_result(f'https://www.youtube.com/user/{user_id}', YoutubeTabIE, user_id) - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)' - _VALID_URL = r':ytfav(?:ou?rite)?s?' - _LOGIN_REQUIRED = True - _TESTS = [{ - 'url': ':ytfav', - 'only_matching': True, - }, { - 'url': ':ytfavorites', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=LL', - ie=YoutubeTabIE.ie_key()) - - -class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor): - IE_NAME = 'youtube:notif' - IE_DESC = 'YouTube notifications; ":ytnotif" keyword (requires cookies)' - _VALID_URL = r':ytnotif(?:ication)?s?' 
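- # Matches the pseudo-URLs :ytnotif, :ytnotifs, :ytnotification and :ytnotifications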
- _LOGIN_REQUIRED = True - _TESTS = [{ - 'url': ':ytnotif', - 'only_matching': True, - }, { - 'url': ':ytnotifications', - 'only_matching': True, - }] - - def _extract_notification_menu(self, response, continuation_list): - notification_list = traverse_obj( - response, - ('actions', 0, 'openPopupAction', 'popup', 'multiPageMenuRenderer', 'sections', 0, 'multiPageMenuNotificationSectionRenderer', 'items'), - ('actions', 0, 'appendContinuationItemsAction', 'continuationItems'), - expected_type=list) or [] - continuation_list[0] = None - for item in notification_list: - entry = self._extract_notification_renderer(item.get('notificationRenderer')) - if entry: - yield entry - continuation = item.get('continuationItemRenderer') - if continuation: - continuation_list[0] = continuation - - def _extract_notification_renderer(self, notification): - video_id = traverse_obj( - notification, ('navigationEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) - url = f'https://www.youtube.com/watch?v={video_id}' - channel_id = None - if not video_id: - browse_ep = traverse_obj( - notification, ('navigationEndpoint', 'browseEndpoint'), expected_type=dict) - channel_id = self.ucid_or_none(traverse_obj(browse_ep, 'browseId', expected_type=str)) - post_id = self._search_regex( - r'/post/(.+)', traverse_obj(browse_ep, 'canonicalBaseUrl', expected_type=str), - 'post id', default=None) - if not channel_id or not post_id: - return - # The direct /post url redirects to this in the browser - url = f'https://www.youtube.com/channel/{channel_id}/community?lb={post_id}' - - channel = traverse_obj( - notification, ('contextualMenu', 'menuRenderer', 'items', 1, 'menuServiceItemRenderer', 'text', 'runs', 1, 'text'), - expected_type=str) - notification_title = self._get_text(notification, 'shortMessage') - if notification_title: - notification_title = notification_title.replace('\xad', '') # remove soft hyphens - # TODO: handle recommended videos - title = self._search_regex( - rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title, - 'video title', default=None) - timestamp = (self._parse_time_text(self._get_text(notification, 'sentTimeText')) - if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) - else None) - return { - '_type': 'url', - 'url': url, - 'ie_key': (YoutubeIE if video_id else YoutubeTabIE).ie_key(), - 'video_id': video_id, - 'title': title, - 'channel_id': channel_id, - 'channel': channel, - 'uploader': channel, - 'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'), - 'timestamp': timestamp, - } - - def _notification_menu_entries(self, ytcfg): - continuation_list = [None] - response = None - for page in itertools.count(1): - ctoken = traverse_obj( - continuation_list, (0, 'continuationEndpoint', 'getNotificationMenuEndpoint', 'ctoken'), expected_type=str) - response = self._extract_response( - item_id=f'page {page}', query={'ctoken': ctoken} if ctoken else {}, ytcfg=ytcfg, - ep='notification/get_notification_menu', check_get_keys='actions', - headers=self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))) - yield from self._extract_notification_menu(response, continuation_list) - if not continuation_list[0]: - break - - def _real_extract(self, url): - display_id = 'notifications' - ytcfg = self._download_ytcfg('web', display_id) if not self.skip_webpage else {} - self._report_playlist_authcheck(ytcfg) - return self.playlist_result(self._notification_menu_entries(ytcfg), display_id, display_id) - - -class 
YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): - IE_DESC = 'YouTube search' - IE_NAME = 'youtube:search' - _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = 'EgIQAfABAQ==' # Videos only - _TESTS = [{ - 'url': 'ytsearch5:youtube-dl test video', - 'playlist_count': 5, - 'info_dict': { - 'id': 'youtube-dl test video', - 'title': 'youtube-dl test video', - }, - }, { - 'note': 'Suicide/self-harm search warning', - 'url': 'ytsearch1:i hate myself and i wanna die', - 'playlist_count': 1, - 'info_dict': { - 'id': 'i hate myself and i wanna die', - 'title': 'i hate myself and i wanna die', - }, - }] - - -class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): - IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube search, newest videos first' - _SEARCH_PARAMS = 'CAISAhAB8AEB' # Videos only, sorted by date - _TESTS = [{ - 'url': 'ytsearchdate5:youtube-dl test video', - 'playlist_count': 5, - 'info_dict': { - 'id': 'youtube-dl test video', - 'title': 'youtube-dl test video', - }, - }] - - -class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): - IE_DESC = 'YouTube search URLs with sorting and filter support' - IE_NAME = YoutubeSearchIE.IE_NAME + '_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:results|search)\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'youtube-dl test video', - 'title': 'youtube-dl test video', - }, - }, { - 'url': 'https://www.youtube.com/results?search_query=python&sp=EgIQAg%253D%253D', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'python', - 'title': 'python', - }, - }, { - 'url': 'https://www.youtube.com/results?search_query=%23cats', - 'playlist_mincount': 1, - 'info_dict': { - 'id': '#cats', - 'title': '#cats', - # The test suite does not have support for nested playlists - # 'entries': [{ - # 'url': r're:https://(www\.)?youtube\.com/hashtag/cats', - # 'title': '#cats', - # }], - }, - }, { - # Channel results - 'url': 'https://www.youtube.com/results?search_query=kurzgesagt&sp=EgIQAg%253D%253D', - 'info_dict': { - 'id': 'kurzgesagt', - 'title': 'kurzgesagt', - }, - 'playlist': [{ - 'info_dict': { - '_type': 'url', - 'id': 'UCsXVk37bltHxD1rDPwtNM8Q', - 'url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', - 'ie_key': 'YoutubeTab', - 'channel': 'Kurzgesagt – In a Nutshell', - 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', - 'title': 'Kurzgesagt – In a Nutshell', - 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', - # No longer available for search as it is set to the handle. 
- # 'playlist_count': int, - 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', - 'thumbnails': list, - 'uploader_id': '@kurzgesagt', - 'uploader_url': 'https://www.youtube.com/@kurzgesagt', - 'uploader': 'Kurzgesagt – In a Nutshell', - 'channel_is_verified': True, - 'channel_follower_count': int, - }, - }], - 'params': {'extract_flat': True, 'playlist_items': '1'}, - 'playlist_mincount': 1, - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _real_extract(self, url): - qs = parse_qs(url) - query = (qs.get('search_query') or qs.get('q'))[0] - return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query) - - -class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): - IE_DESC = 'YouTube music search URLs with selectable sections, e.g. #songs' - IE_NAME = 'youtube:music:search_url' - _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' - _TESTS = [{ - 'url': 'https://music.youtube.com/search?q=royalty+free+music', - 'playlist_count': 16, - 'info_dict': { - 'id': 'royalty free music', - 'title': 'royalty free music', - }, - }, { - 'url': 'https://music.youtube.com/search?q=royalty+free+music&sp=EgWKAQIIAWoKEAoQAxAEEAkQBQ%3D%3D', - 'playlist_mincount': 30, - 'info_dict': { - 'id': 'royalty free music - songs', - 'title': 'royalty free music - songs', - }, - 'params': {'extract_flat': 'in_playlist'}, - }, { - 'url': 'https://music.youtube.com/search?q=royalty+free+music#community+playlists', - 'playlist_mincount': 30, - 'info_dict': { - 'id': 'royalty free music - community playlists', - 'title': 'royalty free music - community playlists', - }, - 'params': {'extract_flat': 'in_playlist'}, - }] - - _SECTIONS = { - 'albums': 'EgWKAQIYAWoKEAoQAxAEEAkQBQ==', - 'artists': 'EgWKAQIgAWoKEAoQAxAEEAkQBQ==', - 'community playlists': 'EgeKAQQoAEABagoQChADEAQQCRAF', - 'featured playlists': 'EgeKAQQoADgBagwQAxAJEAQQDhAKEAU==', - 'songs': 'EgWKAQIIAWoKEAoQAxAEEAkQBQ==', - 'videos': 'EgWKAQIQAWoKEAoQAxAEEAkQBQ==', - } - - def _real_extract(self, url): - qs = parse_qs(url) - query = (qs.get('search_query') or qs.get('q'))[0] - params = qs.get('sp', (None,))[0] - if params: - section = next((k for k, v in self._SECTIONS.items() if v == params), params) - else: - section = urllib.parse.unquote_plus(([*url.split('#'), ''])[1]).lower() - params = self._SECTIONS.get(section) - if not params: - section = None - title = join_nonempty(query, section, delim=' - ') - return self.playlist_result(self._search_results(query, params, default_client='web_music'), title, title) - - -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): - """ - Base class for feed extractors - Subclasses must re-define the _FEED_NAME property. 
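- e.g. _FEED_NAME = 'history' resolves to https://www.youtube.com/feed/history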
- """ - _LOGIN_REQUIRED = True - _FEED_NAME = 'feeds' - - @classproperty - def IE_NAME(cls): - return f'youtube:{cls._FEED_NAME}' - - def _real_extract(self, url): - return self.url_result( - f'https://www.youtube.com/feed/{self._FEED_NAME}', ie=YoutubeTabIE.ie_key()) - - -class YoutubeWatchLaterIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list; ":ytwatchlater" keyword (requires cookies)' - _VALID_URL = r':ytwatchlater' - _TESTS = [{ - 'url': ':ytwatchlater', - 'only_matching': True, - }] - - def _real_extract(self, url): - return self.url_result( - 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key()) - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube recommended videos; ":ytrec" keyword' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _LOGIN_REQUIRED = False - _TESTS = [{ - 'url': ':ytrec', - 'only_matching': True, - }, { - 'url': ':ytrecommended', - 'only_matching': True, - }, { - 'url': 'https://youtube.com', - 'only_matching': True, - }] - - -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)' - _VALID_URL = r':ytsub(?:scription)?s?' - _FEED_NAME = 'subscriptions' - _TESTS = [{ - 'url': ':ytsubs', - 'only_matching': True, - }, { - 'url': ':ytsubscriptions', - 'only_matching': True, - }] - - -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history; ":ythis" keyword (requires cookies)' - _VALID_URL = r':ythis(?:tory)?' - _FEED_NAME = 'history' - _TESTS = [{ - 'url': ':ythistory', - 'only_matching': True, - }] - - -class YoutubeShortsAudioPivotIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' - IE_NAME = 'youtube:shorts:pivot:audio' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/source/(?P<id>[\w-]{11})/shorts' - _TESTS = [{ - 'url': 'https://www.youtube.com/source/Lyj-MZSAA9o/shorts', - 'only_matching': True, - }] - - @staticmethod - def _generate_audio_pivot_params(video_id): - """ - Generates sfv_audio_pivot browse params for this video id - """ - pb_params = b'\xf2\x05+\n)\x12\'\n\x0b%b\x12\x0b%b\x1a\x0b%b' % ((video_id.encode(),) * 3) - return urllib.parse.quote(base64.b64encode(pb_params).decode()) - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result( - f'https://www.youtube.com/feed/sfv_audio_pivot?bp={self._generate_audio_pivot_params(video_id)}', - ie=YoutubeTabIE) - - -class YoutubeTruncatedURLIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:truncated_url' - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:https?://)? - (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ - (?:watch\?(?: - feature=[a-z_]+| - annotation_id=annotation_[^&]+| - x-yt-cl=[0-9]+| - hl=[^&]*| - t=[0-9]+ - )? 
- | - attribution_link\?a=[^&]+ - ) - $ - ''' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?feature=foo', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?hl=en-GB', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?t=2372', - 'only_matching': True, - }] - - def _real_extract(self, url): - raise ExtractorError( - 'Did you forget to quote the URL? Remember that & is a meta ' - 'character in most shells, so you want to put the URL in quotes, ' - 'like yt-dlp ' - '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply yt-dlp BaW_jenozKc .', - expected=True) - - -class YoutubeClipIE(YoutubeTabBaseInfoExtractor): - IE_NAME = 'youtube:clip' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/(?P<id>[^/?#]+)' - _TESTS = [{ - # FIXME: Other metadata should be extracted from the clip, not from the base video - 'url': 'https://www.youtube.com/clip/UgytZKpehg-hEMBSn3F4AaABCQ', - 'info_dict': { - 'id': 'UgytZKpehg-hEMBSn3F4AaABCQ', - 'ext': 'mp4', - 'section_start': 29.0, - 'section_end': 39.7, - 'duration': 10.7, - 'age_limit': 0, - 'availability': 'public', - 'categories': ['Gaming'], - 'channel': 'Scott The Woz', - 'channel_id': 'UC4rqhyiTs7XyuODcECvuiiQ', - 'channel_url': 'https://www.youtube.com/channel/UC4rqhyiTs7XyuODcECvuiiQ', - 'description': 'md5:7a4517a17ea9b4bd98996399d8bb36e7', - 'like_count': int, - 'playable_in_embed': True, - 'tags': 'count:17', - 'thumbnail': 'https://i.ytimg.com/vi_webp/ScPX26pdQik/maxresdefault.webp', - 'title': 'Mobile Games on Console - Scott The Woz', - 'upload_date': '20210920', - 'uploader': 'Scott The Woz', - 'uploader_id': '@ScottTheWoz', - 'uploader_url': 'https://www.youtube.com/@ScottTheWoz', - 'view_count': int, - 'live_status': 'not_live', - 'channel_follower_count': int, - 'chapters': 'count:20', - 'comment_count': int, - 'heatmap': 'count:100', - }, - }] - - def _real_extract(self, url): - clip_id = self._match_id(url) - _, data = self._extract_webpage(url, clip_id) - - video_id = traverse_obj(data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId')) - if not video_id: - raise ExtractorError('Unable to find video ID') - - clip_data = traverse_obj(data, ( - 'engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'clipSectionRenderer', - 'contents', ..., 'clipAttributionRenderer', 'onScrubExit', 'commandExecutorCommand', 'commands', ..., - 'openPopupAction', 'popup', 'notificationActionRenderer', 'actionButton', 'buttonRenderer', 'command', - 'commandExecutorCommand', 'commands', ..., 'loopCommand'), get_all=False) - - return { - '_type': 'url_transparent', - 'url': f'https://www.youtube.com/watch?v={video_id}', - 'ie_key': YoutubeIE.ie_key(), - 'id': clip_id, - 'section_start': int(clip_data['startTimeMs']) / 1000, - 'section_end': int(clip_data['endTimeMs']) / 1000, - '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility - 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang'), - } - - -class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:consent' - IE_DESC = False # Do not list - _VALID_URL = r'https?://consent\.youtube\.com/m\?' 
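- # The matched URL itself is never downloaded; _real_extract only follows its 'continue' query parameter.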
- _TESTS = [{ - 'url': 'https://consent.youtube.com/m?continue=https%3A%2F%2Fwww.youtube.com%2Flive%2FqVv6vCqciTM%3Fcbrd%3D1&gl=NL&m=0&pc=yt&hl=en&src=1', - 'info_dict': { - 'id': 'qVv6vCqciTM', - 'ext': 'mp4', - 'age_limit': 0, - 'uploader_id': '@sana_natori', - 'comment_count': int, - 'chapters': 'count:13', - 'upload_date': '20221223', - 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg', - 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', - 'uploader_url': 'https://www.youtube.com/@sana_natori', - 'like_count': int, - 'release_date': '20221223', - 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'], - 'title': '【 #インターネット女クリスマス 】3Dで歌ってはしゃぐインターネットの女たち【月ノ美兎/名取さな】', - 'view_count': int, - 'playable_in_embed': True, - 'duration': 4438, - 'availability': 'public', - 'channel_follower_count': int, - 'channel_id': 'UCIdEIHpS0TdkqRkHL5OkLtA', - 'categories': ['Entertainment'], - 'live_status': 'was_live', - 'release_timestamp': 1671793345, - 'channel': 'さなちゃんねる', - 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', - 'uploader': 'さなちゃんねる', - 'channel_is_verified': True, - 'heatmap': 'count:100', - }, - 'add_ie': ['Youtube'], - 'params': {'skip_download': 'Youtube'}, - }] - - def _real_extract(self, url): - redirect_url = url_or_none(parse_qs(url).get('continue', [None])[-1]) - if not redirect_url: - raise ExtractorError('Invalid cookie consent redirect URL', expected=True) - return self.url_result(redirect_url) - - -class YoutubeTruncatedIDIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:truncated_id' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - raise ExtractorError( - f'Incomplete YouTube ID {video_id}. URL {url} looks truncated.', - expected=True) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 703766cd7..10be582a3 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -1,212 +1,410 @@ +import itertools +import json import re +import time from .common import InfoExtractor from ..utils import ( - NO_DEFAULT, ExtractorError, determine_ext, + filter_dict, float_or_none, int_or_none, join_nonempty, - merge_dicts, + make_archive_id, parse_codecs, - qualities, - traverse_obj, - try_get, + parse_iso8601, + parse_qs, + smuggle_url, unified_timestamp, - update_url_query, + unsmuggle_url, url_or_none, urljoin, + variadic, ) +from ..utils.traversal import require, traverse_obj class ZDFBaseIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd') + _TOKEN_CACHE_PARAMS = ('zdf', 'api-token') + _token_cache = {} - def _download_v2_doc(self, document_id): - return self._download_json( - f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', - document_id) + def _get_api_token(self): + # As of 2025-03, this API is used by the Android app for getting tokens. + # An equivalent token could be extracted from the webpage should the API become unavailable. + # For now this allows the extractor to avoid dealing with Next.js hydration data. 
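+ # Cached token shape: {'type': ..., 'token': ..., 'expires': <unix time>}.
+ # Seed it from the on-disk cache once, then refresh it whenever it has expired.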
+ if not self._token_cache: + self._token_cache.update(self.cache.load(*self._TOKEN_CACHE_PARAMS, default={})) - def _call_api(self, url, video_id, item, api_token=None, referrer=None): - headers = {} - if api_token: - headers['Api-Auth'] = f'Bearer {api_token}' - if referrer: - headers['Referer'] = referrer + if traverse_obj(self._token_cache, ('expires', {int_or_none}), default=0) < int(time.time()): + self._token_cache.update(self._download_json( + 'https://zdf-prod-futura.zdf.de/mediathekV2/token', None, + 'Downloading API token', 'Failed to download API token')) + self.cache.store(*self._TOKEN_CACHE_PARAMS, self._token_cache) + + return f'{self._token_cache["type"]} {self._token_cache["token"]}' + + def _call_api(self, url, video_id, item, api_token=None): return self._download_json( - url, video_id, f'Downloading JSON {item}', headers=headers) + url, video_id, f'Downloading {item}', f'Failed to download {item}', + headers=filter_dict({'Api-Auth': api_token})) + + def _parse_aspect_ratio(self, aspect_ratio): + if not aspect_ratio or not isinstance(aspect_ratio, str): + return None + mobj = re.match(r'(?P<width>\d+):(?P<height>\d+)', aspect_ratio) + return int(mobj.group('width')) / int(mobj.group('height')) if mobj else None + + def _extract_chapters(self, data): + return traverse_obj(data, (lambda _, v: v['anchorOffset'], { + 'start_time': ('anchorOffset', {float_or_none}), + 'title': ('anchorLabel', {str}), + })) or None @staticmethod def _extract_subtitles(src): + seen_urls = set() subtitles = {} - for caption in try_get(src, lambda x: x['captions'], list) or []: + for caption in src: subtitle_url = url_or_none(caption.get('uri')) - if subtitle_url: - lang = caption.get('language', 'deu') - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - }) + if not subtitle_url or subtitle_url in seen_urls: + continue + seen_urls.add(subtitle_url) + lang = caption.get('language') or 'deu' + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, + }) return subtitles - def _extract_format(self, video_id, formats, format_urls, meta): - format_url = url_or_none(meta.get('url')) - if not format_url or format_url in format_urls: - return - format_urls.add(format_url) + def _expand_ptmd_template(self, api_base_url, template): + return urljoin(api_base_url, template.replace('{playerId}', 'android_native_6')) - mime_type, ext = meta.get('mimeType'), determine_ext(format_url) - if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - new_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False) - elif mime_type == 'application/f4m+xml' or ext == 'f4m': - new_formats = self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) - elif ext == 'mpd': - new_formats = self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False) - else: - f = parse_codecs(meta.get('mimeCodec')) - if not f and meta.get('type'): - data = meta['type'].split('_') - if try_get(data, lambda x: x[2]) == ext: - f = {'vcodec': data[0], 'acodec': data[1]} - f.update({ - 'url': format_url, - 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), - 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), - }) - new_formats = [f] - formats.extend(merge_dicts(f, { - 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '), - 'language': meta.get('language'), - 'language_preference': 10 if 
meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - }) for f in new_formats) + def _extract_ptmd(self, ptmd_urls, video_id, api_token=None, aspect_ratio=None): + content_id = None + duration = None + formats, src_captions = [], [] + seen_urls = set() - def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): - ptmd = self._call_api( - ptmd_url, video_id, 'metadata', api_token, referrer) + for ptmd_url in variadic(ptmd_urls): + ptmd_url, smuggled_data = unsmuggle_url(ptmd_url, {}) + # Is it a DGS variant? (*D*eutsche *G*ebärden*s*prache' / German Sign Language) + is_dgs = smuggled_data.get('vod_media_type') == 'DGS' + ptmd = self._call_api(ptmd_url, video_id, 'PTMD data', api_token) - content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] + basename = ( + ptmd.get('basename') + # ptmd_url examples: + # https://api.zdf.de/tmd/2/android_native_6/vod/ptmd/mediathek/250328_sendung_hsh/3 + # https://tmd.phoenix.de/tmd/2/android_native_6/vod/ptmd/phoenix/221215_phx_spitzbergen + or self._search_regex(r'/vod/ptmd/[^/?#]+/(\w+)', ptmd_url, 'content ID', default=None)) + # If this is_dgs, then it's from ZDFIE and it only uses content_id for _old_archive_ids, + # and the old version of the extractor didn't extract DGS variants, so ignore basename + if not content_id and not is_dgs: + content_id = basename - formats = [] - track_uris = set() - for p in ptmd['priorityList']: - formitaeten = p.get('formitaeten') - if not isinstance(formitaeten, list): - continue - for f in formitaeten: - f_qualities = f.get('qualities') - if not isinstance(f_qualities, list): - continue - for quality in f_qualities: - tracks = try_get(quality, lambda x: x['audio']['tracks'], list) - if not tracks: - continue - for track in tracks: - self._extract_format( - content_id, formats, track_uris, { - 'url': track.get('uri'), - 'type': f.get('type'), - 'mimeType': f.get('mimeType'), - 'quality': quality.get('quality'), - 'class': track.get('class'), - 'language': track.get('language'), + if not duration: + duration = traverse_obj(ptmd, ('attributes', 'duration', 'value', {float_or_none(scale=1000)})) + src_captions += traverse_obj(ptmd, ('captions', ..., {dict})) + + for stream in traverse_obj(ptmd, ('priorityList', ..., 'formitaeten', ..., {dict})): + for quality in traverse_obj(stream, ('qualities', ..., {dict})): + for variant in traverse_obj(quality, ('audio', 'tracks', lambda _, v: url_or_none(v['uri']))): + format_url = variant['uri'] + if format_url in seen_urls: + continue + seen_urls.add(format_url) + ext = determine_ext(format_url) + if ext == 'm3u8': + fmts = self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif ext == 'mpd': + fmts = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + else: + height = int_or_none(quality.get('highestVerticalResolution')) + width = round(aspect_ratio * height) if aspect_ratio and height else None + fmts = [{ + 'url': format_url, + **parse_codecs(quality.get('mimeCodec')), + 'height': height, + 'width': width, + 'format_id': join_nonempty('http', stream.get('type')), + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)), + }] + f_class = variant.get('class') + for f in fmts: + formats.append({ + **f, + 'format_id': join_nonempty(f.get('format_id'), is_dgs and 'dgs'), + 'format_note': join_nonempty( + f_class, is_dgs and 'German Sign Language', f.get('format_note'), delim=', '), + 
'language': variant.get('language') or f.get('language'), + 'preference': -2 if is_dgs else -1, + 'language_preference': 10 if f_class == 'main' else -10 if f_class == 'ad' else -1, }) - duration = float_or_none(try_get( - ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) - return { - 'extractor_key': ZDFIE.ie_key(), - 'id': content_id, + 'id': content_id or video_id, 'duration': duration, 'formats': formats, - 'subtitles': self._extract_subtitles(ptmd), - '_format_sort_fields': ('tbr', 'res', 'quality', 'language_preference'), + 'subtitles': self._extract_subtitles(src_captions), } - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) + def _download_graphql(self, item_id, data_desc, query=None, body=None): + assert query or body, 'One of query or body is required' + + return self._download_json( + 'https://api.zdf.de/graphql', item_id, + f'Downloading {data_desc}', f'Failed to download {data_desc}', + query=query, data=json.dumps(body).encode() if body else None, + headers=filter_dict({ + 'Api-Auth': self._get_api_token(), + 'Apollo-Require-Preflight': True, + 'Content-Type': 'application/json' if body else None, + })) + + @staticmethod + def _extract_thumbnails(source): + return [{ + 'id': str(format_id), + 'url': url, + 'preference': 1 if format_id == 'original' else 0, + **traverse_obj(re.search(r'(?P<width>\d+|auto)[Xx](?P<height>\d+|auto)', str(format_id)), { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + } for format_id, url in traverse_obj(source, ({dict.items}, lambda _, v: url_or_none(v[1])))] class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _VALID_URL = [ + r'https?://(?:www\.)?zdf\.de/(?:video|play)/(?:[^/?#]+/)*(?P<id>[^/?#]+)', + # /nachrichten/ sub-site URLs and legacy redirects from before the redesign in 2025-03 + r'https?://(?:www\.)?zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#]+)\.html', + ] + IE_NAME = 'zdf' _TESTS = [{ - # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', + # Standalone video (i.e. 
not part of a playlist), video URL + 'url': 'https://www.zdf.de/video/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100', 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613948400, - 'upload_date': '20210221', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { - # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html - 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', - 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + # Standalone video (i.e. not part of a playlist), play URL + 'url': 'https://www.zdf.de/play/dokus/sylt---deutschlands-edles-nordlicht-movie-100/sylt-deutschlands-edles-nordlicht-100', 'info_dict': { - 'id': '141007_ab18_10wochensommer_film', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'title': 'Ab 18! - 10 Wochen Sommer', - 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', - 'duration': 2660, - 'timestamp': 1608604200, - 'upload_date': '20201222', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + 'params': {'skip_download': True}, }, { - 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html', + # Standalone video (i.e. 
not part of a playlist), legacy URL before website redesign in 2025-03 + 'url': 'https://www.zdf.de/dokumentation/dokumentation-sonstige/sylt-deutschlands-edles-nordlicht-100.html', 'info_dict': { - 'id': '211230_sendung_hjo', + 'id': 'sylt-deutschlands-edles-nordlicht-100', 'ext': 'mp4', - 'description': 'md5:47dff85977bde9fb8cba9e9c9b929839', - 'duration': 1890.0, - 'upload_date': '20211230', - 'chapters': list, - 'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e', - 'title': 'heute journal vom 30.12.2021', - 'timestamp': 1640897100, + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], }, - 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + 'params': {'skip_download': True}, }, { - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + # Video belongs to a playlist, video URL + 'url': 'https://www.zdf.de/video/dokus/die-magie-der-farben-116/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', 'info_dict': { - 'id': '151025_magie_farben2_tex', + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', + 'title': 'Von Königspurpur bis Jeansblau', 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + }, { + # Video belongs to a playlist, play URL + 'url': 'https://www.zdf.de/play/dokus/die-magie-der-farben-116/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Von Königspurpur bis Jeansblau', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + 'params': {'skip_download': True}, + }, { + # Video belongs to a playlist, legacy URL before website redesign in 2025-03 + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'md5': '1eda17eb40a9ead3046326e10b9c5973', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Von Königspurpur bis Jeansblau', + 
'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615.0, + 'thumbnail': 'https://www.zdf.de/assets/koenigspurpur-bis-jeansblau-100~original?cb=1741857765971', + 'series': 'Die Magie der Farben', + 'series_id': 'die-magie-der-farben-116', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'timestamp': 1445797800, + 'upload_date': '20151025', + '_old_archive_ids': ['zdf 151025_magie_farben2_tex'], + }, + 'params': {'skip_download': True}, + }, { + # Video with chapters + # Also: video with sign-language variant + 'url': 'https://www.zdf.de/video/magazine/heute-journal-104/heute-journal-vom-19-12-2021-100', + 'md5': '6ada39465497a84fb98d48ffff69e7b7', + 'info_dict': { + 'id': 'heute-journal-vom-19-12-2021-100', + 'ext': 'mp4', + 'title': 'heute journal vom 19.12.2021', + 'description': 'md5:02504cf3b03777ff32fcc927d260c5dd', + 'duration': 1770.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/273e5545-16e7-4ca3-898e-52fe9e06d964?layout=1920x1080', + 'chapters': 'count:11', + 'series': 'heute journal', + 'series_id': 'heute-journal-104', + 'season': 'Season 2021', + 'season_number': 2021, + 'episode': 'Episode 370', + 'episode_number': 370, + 'timestamp': 1639946700, + 'upload_date': '20211219', + # Videos with sign language variants must not have a 'dgs' suffix on their old archive IDs. + '_old_archive_ids': ['zdf 211219_sendung_hjo'], + }, + }, { + # Video that requires fallback extraction + 'url': 'https://www.zdf.de/nachrichten/politik/deutschland/koalitionsverhandlungen-spd-cdu-csu-dobrindt-100.html', + 'md5': 'c3a78514dd993a5781aa3afe50db51e2', + 'info_dict': { + 'id': 'koalitionsverhandlungen-spd-cdu-csu-dobrindt-100', + 'ext': 'mp4', + 'title': 'Dobrindt schließt Steuererhöhungen aus', + 'description': 'md5:9a117646d7b8df6bc902eb543a9c9023', + 'duration': 325, + 'thumbnail': 'https://www.zdf.de/assets/dobrindt-csu-berlin-direkt-100~1920x1080?cb=1743357653736', + 'timestamp': 1743374520, + 'upload_date': '20250330', + '_old_archive_ids': ['zdf 250330_clip_2_bdi'], }, }, { 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', 'md5': '57af4423db0455a3975d2dc4578536bc', 'info_dict': { + 'id': 'funk-alles-ist-verzaubert-102', 'ext': 'mp4', - 'id': 'video_funk_1770473', - 'duration': 1278, - 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', 'title': 'Alles ist verzaubert', + 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'duration': 1278.0, + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~original?cb=1663848412907', + 'series': 'DRUCK', + 'series_id': 'funk-collection-funk-11790-1590', + 'season': 'Season 7', + 'season_number': 7, + 'episode': 'Episode 1', + 'episode_number': 1, 'timestamp': 1635520560, 'upload_date': '20211029', - 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~1920x1080?cb=1663848412907', + '_old_archive_ids': ['zdf video_funk_1770473'], }, + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': 'das-geld-anderer-leute-100', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=1920x1080', + 'series': 'SOKO Stuttgart', + 'series_id': 'soko-stuttgart-104', + 'season': 'Season 11', + 'season_number': 11, + 'episode': 
'Episode 10', + 'episode_number': 10, + 'timestamp': 1728983700, + 'upload_date': '20241015', + '_old_archive_ids': ['zdf 191205_1800_sendung_sok8'], + }, + }, { + 'url': 'https://www.zdf.de/serien/northern-lights/begegnung-auf-der-bruecke-100.html', + 'info_dict': { + 'id': 'begegnung-auf-der-bruecke-100', + 'ext': 'webm', + 'title': 'Begegnung auf der Brücke', + 'description': 'md5:e53a555da87447f7f1207f10353f8e45', + 'duration': 3083.0, + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/c5ff1d1f-f5c8-4468-86ac-1b2f1dbecc76?layout=1920x1080', + 'series': 'Northern Lights', + 'series_id': 'northern-lights-100', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + 'timestamp': 1738546500, + 'upload_date': '20250203', + '_old_archive_ids': ['zdf 240319_2310_sendung_not'], + }, + 'params': {'skip_download': 'geo-restricted http format'}, + }, { + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'only_matching': True, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', @@ -230,220 +428,323 @@ class ZDFIE(ZDFBaseIE): 'only_matching': True, }, { 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', - 'info_dict': { - 'id': 'video_artede_083871-001-A', - 'ext': 'mp4', - 'title': 'Tödliche Flucht (1/6)', - 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', - 'duration': 3193.0, - 'timestamp': 1641355200, - 'upload_date': '20220105', - }, - 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"', - }, { - 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', - 'info_dict': { - 'id': '191205_1800_sendung_sok8', - 'ext': 'mp4', - 'title': 'Das Geld anderer Leute', - 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', - 'duration': 2581.0, - 'timestamp': 1675160100, - 'upload_date': '20230131', - 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', - }, + 'only_matching': True, }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/unser-gruener-planet-wuesten-doku-100.html', - 'info_dict': { - 'id': '220605_dk_gruener_planet_wuesten_tex', - 'ext': 'mp4', - 'title': 'Unser grüner Planet - Wüsten', - 'description': 'md5:4fc647b6f9c3796eea66f4a0baea2862', - 'duration': 2613.0, - 'timestamp': 1654450200, - 'upload_date': '20220605', - 'format_note': 'uhd, main', - 'thumbnail': 'https://www.zdf.de/assets/saguaro-kakteen-102~3840x2160?cb=1655910690796', - }, + 'only_matching': True, }] - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] + _GRAPHQL_QUERY = ''' +query VideoByCanonical($canonical: String!) { + videoByCanonical(canonical: $canonical) { + canonical + title + leadParagraph + editorialDate + teaser { + description + image { + list + } + } + episodeInfo { + episodeNumber + seasonNumber + } + smartCollection { + canonical + title + } + currentMedia { + nodes { + ptmdTemplate + ... 
on VodMedia { + duration + aspectRatio + streamAnchorTags { + nodes { + anchorOffset + anchorLabel + } + } + vodMediaType + label + } + ... on LiveMedia { + start + stop + encryption + liveMediaType + label + } + id + } + } + } +} + ''' - t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = traverse_obj(t, ( - (('streams', 'default'), None), - ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template'), - ), get_all=False) - if not ptmd_path: - raise ExtractorError('Could not extract ptmd_path') + def _extract_ptmd(self, *args, **kwargs): + ptmd_data = super()._extract_ptmd(*args, **kwargs) + # This was the video id before the graphql redesign, other extractors still use it as such + old_archive_id = ptmd_data.pop('id') + ptmd_data['_old_archive_ids'] = [make_archive_id(self, old_archive_id)] + return ptmd_data - info = self._extract_ptmd( - urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url) + # This fallback should generally only happen for pages under `zdf.de/nachrichten`. + # They are on a separate website for which GraphQL often doesn't return results. + # The API used here is no longer in use by official clients and likely deprecated. + # Long-term, news documents probably should use the API used by the mobile apps: + # https://zdf-prod-futura.zdf.de/news/documents/ (note 'news' vs 'mediathekV2') + def _extract_fallback(self, document_id): + video = self._download_json( + f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}', + document_id, note='Downloading fallback metadata', + errnote='Failed to download fallback metadata') + document = video['document'] + + ptmd_url = traverse_obj(document, ( + ('streamApiUrlAndroid', ('streams', 0, 'streamApiUrlAndroid')), + {url_or_none}, any, {require('PTMD URL')})) thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) - - chapter_marks = t.get('streamAnchorTag') or [] - chapter_marks.append({'anchorOffset': int_or_none(t.get('duration'))}) - chapters = [{ - 'start_time': chap.get('anchorOffset'), - 'end_time': next_chap.get('anchorOffset'), - 'title': chap.get('anchorLabel'), - } for chap, next_chap in zip(chapter_marks, chapter_marks[1:])] - - return merge_dicts(info, { - 'title': title, - 'description': content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - 'chapters': chapters or None, - }) - - def _extract_regular(self, url, player, video_id): - content = self._call_api( - player['content'], video_id, 'content', player['apiToken'], url) - return self._extract_entry(player['content'], player, content, video_id) - - def _extract_mobile(self, video_id): - video = self._download_v2_doc(video_id) - - formats = [] - formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) - document = formitaeten and video['document'] - if formitaeten: - title = document['titel'] - content_id = document['basename'] - - format_urls = set() - for f in 
formitaeten or []: - self._extract_format(content_id, formats, format_urls, f) - - thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) + for thumbnail_key, thumbnail in traverse_obj(document, ('teaserBild', {dict.items}, ...)): + thumbnail_url = traverse_obj(thumbnail, ('url', {url_or_none})) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) return { - 'id': content_id, - 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( - try_get(video, lambda x: x['meta']['editorialDate'], str)), 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), - 'formats': formats, + **traverse_obj(video, { + 'title': ('document', 'titel', {str}), + 'description': ('document', 'beschreibung', {str}), + 'timestamp': ( + (('document', 'date'), ('meta', 'editorialDate')), + {unified_timestamp}, any), + 'subtitles': ('document', 'captions', {self._extract_subtitles}), + }), + **self._extract_ptmd(ptmd_url, document_id, self._get_api_token()), + 'id': document_id, } def _real_extract(self, url): video_id = self._match_id(url) + video_data = self._download_graphql(video_id, 'video metadata', body={ + 'operationName': 'VideoByCanonical', + 'query': self._GRAPHQL_QUERY, + 'variables': {'canonical': video_id}, + })['data']['videoByCanonical'] - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id) + if not video_data: + return self._extract_fallback(video_id) - return self._extract_mobile(video_id) + aspect_ratio = None + ptmd_urls = [] + for node in traverse_obj(video_data, ('currentMedia', 'nodes', lambda _, v: v['ptmdTemplate'])): + ptmd_url = self._expand_ptmd_template('https://api.zdf.de', node['ptmdTemplate']) + # Smuggle vod_media_type so that _extract_ptmd is aware of 'DGS' variants + if vod_media_type := node.get('vodMediaType'): + ptmd_url = smuggle_url(ptmd_url, {'vod_media_type': vod_media_type}) + ptmd_urls.append(ptmd_url) + if not aspect_ratio: + aspect_ratio = self._parse_aspect_ratio(node.get('aspectRatio')) + + return { + **traverse_obj(video_data, { + 'title': ('title', {str}), + 'description': (('leadParagraph', ('teaser', 'description')), any, {str}), + 'timestamp': ('editorialDate', {parse_iso8601}), + 'thumbnails': ('teaser', 'image', 'list', {self._extract_thumbnails}), + 'episode_number': ('episodeInfo', 'episodeNumber', {int_or_none}), + 'season_number': ('episodeInfo', 'seasonNumber', {int_or_none}), + 'series': ('smartCollection', 'title', {str}), + 'series_id': ('smartCollection', 'canonical', {str}), + 'chapters': ('currentMedia', 'nodes', 0, 'streamAnchorTags', 'nodes', {self._extract_chapters}), + }), + **self._extract_ptmd(ptmd_urls, video_id, self._get_api_token(), aspect_ratio), + 'id': video_id, + } class ZDFChannelIE(ZDFBaseIE): - _VALID_URL = 
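# [Example] The smuggling round-trip used above to tag sign-language ('DGS') PTMD
# variants; smuggle_url/unsmuggle_url are the existing yt_dlp.utils helpers, and the
# PTMD URL below is a placeholder:
from yt_dlp.utils import smuggle_url, unsmuggle_url

tagged = smuggle_url('https://api.zdf.de/tmd/example', {'vod_media_type': 'DGS'})
ptmd_url, smuggled = unsmuggle_url(tagged, {})
assert smuggled == {'vod_media_type': 'DGS'}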
r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/?#]+/)*(?P<id>[^/?#]+)' + IE_NAME = 'zdf:channel' _TESTS = [{ + # Playlist, legacy URL before website redesign in 2025-03 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { - 'id': 'das-aktuelle-sportstudio', + 'id': 'das-aktuelle-sportstudio-220', 'title': 'das aktuelle sportstudio', + 'description': 'md5:e46c785324238a03edcf8b301c5fd5dc', }, - 'playlist_mincount': 18, + 'playlist_mincount': 25, }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e', + # Playlist, current URL + 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio-220', 'info_dict': { - 'id': 'planet-e', - 'title': 'planet e.', - 'description': 'md5:87e3b9c66a63cf1407ee443d2c4eb88e', + 'id': 'das-aktuelle-sportstudio-220', + 'title': 'das aktuelle sportstudio', + 'description': 'md5:e46c785324238a03edcf8b301c5fd5dc', }, - 'playlist_mincount': 50, + 'playlist_mincount': 25, + }, { + # Standalone video (i.e. not part of a playlist), collection URL + 'add_ie': [ZDFIE.ie_key()], + 'url': 'https://www.zdf.de/dokus/sylt---deutschlands-edles-nordlicht-movie-100', + 'info_dict': { + 'id': 'sylt-deutschlands-edles-nordlicht-100', + 'ext': 'mp4', + 'title': 'Sylt - Deutschlands edles Nordlicht', + 'description': 'md5:35407b810c2e1e33efbe15ef6e4c06c3', + 'duration': 810.0, + 'thumbnail': 'https://www.zdf.de/assets/sylt-118~original?cb=1613992485011', + 'series': 'Sylt - Deutschlands edles Nordlicht', + 'series_id': 'sylt---deutschlands-edles-nordlicht-movie-100', + 'timestamp': 1612462500, + 'upload_date': '20210204', + '_old_archive_ids': ['zdf 210402_1915_sendung_dok'], + }, + 'params': {'skip_download': True}, }, { 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', 'info_dict': { - 'id': 'aktenzeichen-xy-ungeloest', + 'id': 'aktenzeichen-xy-ungeloest-110', 'title': 'Aktenzeichen XY... 
Ungelöst', - 'description': 'md5:623ede5819c400c6d04943fa8100e6e7', + 'description': 'md5:b79ac0d64b979e53cbe510c0ca9cb7be', }, 'playlist_mincount': 2, }, { 'url': 'https://www.zdf.de/serien/taunuskrimi/', - 'only_matching': True, + 'info_dict': { + 'id': 'taunuskrimi-100', + 'title': 'Taunuskrimi', + 'description': 'md5:ee7204e9c625c3b611d1274f9d0e3070', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://www.zdf.de/serien/taunuskrimi/?staffel=1', + 'info_dict': { + 'id': 'taunuskrimi-100-s1', + 'title': 'Taunuskrimi - Season 1', + 'description': 'md5:ee7204e9c625c3b611d1274f9d0e3070', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.zdf.de/magazine/heute-journal-104', + 'info_dict': { + 'id': 'heute-journal-104', + 'title': 'heute journal', + 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', + }, + 'playlist_mincount': 500, + }, { + 'url': 'https://www.zdf.de/magazine/heute-journal-104?staffel=2024', + 'info_dict': { + 'id': 'heute-journal-104-s2024', + 'title': 'heute journal - Season 2024', + 'description': 'md5:6edad39189abf8431795d3d6d7f986b3', + }, + 'playlist_count': 242, }] + _PAGE_SIZE = 24 + @classmethod def suitable(cls, url): return False if ZDFIE.suitable(url) else super().suitable(url) - def _extract_entry(self, entry): - return self.url_result( - entry['sharingUrl'], ZDFIE, **traverse_obj(entry, { - 'id': ('basename', {str}), - 'title': ('titel', {str}), - 'description': ('beschreibung', {str}), - 'duration': ('length', {float_or_none}), - # TODO: seasonNumber and episodeNumber can be extracted but need to also be in ZDFIE - })) + def _fetch_page(self, playlist_id, canonical_id, season_idx, season_number, page_number, cursor=None): + return self._download_graphql( + playlist_id, f'season {season_number} page {page_number} JSON', query={ + 'operationName': 'seasonByCanonical', + 'variables': json.dumps(filter_dict({ + 'seasonIndex': season_idx, + 'canonical': canonical_id, + 'episodesPageSize': self._PAGE_SIZE, + 'episodesAfter': cursor, + })), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': '9412a0f4ac55dc37d46975d461ec64bfd14380d815df843a1492348f77b5c99a', + }, + }), + })['data']['smartCollectionByCanonical'] - def _entries(self, data, document_id): - for entry in traverse_obj(data, ( - 'cluster', lambda _, v: v['type'] == 'teaser', - # If 'brandId' differs, it is a 'You might also like' video. 
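# [Example] How the season-suffixed playlist IDs in the tests above are derived from
# the `staffel` query parameter (standalone sketch; the extractor itself uses
# parse_qs/int_or_none/join_nonempty from yt_dlp.utils):
import urllib.parse

url = 'https://www.zdf.de/serien/taunuskrimi/?staffel=1'
season = urllib.parse.parse_qs(urllib.parse.urlparse(url).query).get('staffel', [None])[-1]
playlist_id = '-'.join(filter(None, ('taunuskrimi-100', season and f's{season}')))
assert playlist_id == 'taunuskrimi-100-s1'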
Filter these out - 'teaser', lambda _, v: v['type'] == 'video' and v['brandId'] == document_id and v['sharingUrl'], - )): - yield self._extract_entry(entry) + def _entries(self, playlist_id, canonical_id, season_numbers, requested_season_number): + for season_idx, season_number in enumerate(season_numbers): + if requested_season_number is not None and requested_season_number != season_number: + continue + + cursor = None + for page_number in itertools.count(1): + page = self._fetch_page( + playlist_id, canonical_id, season_idx, season_number, page_number, cursor) + + nodes = traverse_obj(page, ('seasons', 'nodes', ...)) + + for episode in traverse_obj(nodes, ( + ..., 'episodes', 'nodes', lambda _, v: url_or_none(v['sharingUrl']), + )): + yield self.url_result( + episode['sharingUrl'], ZDFIE, + **traverse_obj(episode, { + 'id': ('canonical', {str}), + 'title': ('teaser', 'title', {str}), + 'description': (('leadParagraph', ('teaser', 'description')), any, {str}), + 'timestamp': ('editorialDate', {parse_iso8601}), + 'episode_number': ('episodeInfo', 'episodeNumber', {int_or_none}), + 'season_number': ('episodeInfo', 'seasonNumber', {int_or_none}), + })) + + page_info = traverse_obj(nodes, (-1, 'episodes', 'pageInfo', {dict})) or {} + if not page_info.get('hasNextPage') or not page_info.get('endCursor'): + break + cursor = page_info['endCursor'] def _real_extract(self, url): - channel_id = self._match_id(url) - webpage = self._download_webpage(url, channel_id) - document_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<doc_id>(?:(?!\1).)+)\1', webpage, 'document id', group='doc_id') - data = self._download_v2_doc(document_id) + canonical_id = self._match_id(url) + # Make sure to get the correct ID in case of redirects + urlh = self._request_webpage(url, canonical_id) + canonical_id = self._search_regex(self._VALID_URL, urlh.url, 'channel id', group='id') + season_number = traverse_obj(parse_qs(url), ('staffel', -1, {int_or_none})) + playlist_id = join_nonempty(canonical_id, season_number and f's{season_number}') - main_video = traverse_obj(data, ( - 'cluster', lambda _, v: v['type'] == 'teaserContent', - 'teaser', lambda _, v: v['type'] == 'video' and v['basename'] and v['sharingUrl'], any)) or {} + collection_data = self._download_graphql( + playlist_id, 'smart collection data', query={ + 'operationName': 'GetSmartCollectionByCanonical', + 'variables': json.dumps({ + 'canonical': canonical_id, + 'videoPageSize': 100, # Use max page size to get episodes from all seasons + }), + 'extensions': json.dumps({ + 'persistedQuery': { + 'version': 1, + 'sha256Hash': 'cb49420e133bd668ad895a8cea0e65cba6aa11ac1cacb02341ff5cf32a17cd02', + }, + }), + })['data']['smartCollectionByCanonical'] + video_data = traverse_obj(collection_data, ('video', {dict})) or {} + season_numbers = traverse_obj(collection_data, ('seasons', 'seasons', ..., 'number', {int_or_none})) - if not self._yes_playlist(channel_id, main_video.get('basename')): - return self._extract_entry(main_video) + if not self._yes_playlist( + season_numbers and playlist_id, + url_or_none(video_data.get('sharingUrl')) and video_data.get('canonical'), + ): + return self.url_result(video_data['sharingUrl'], ZDFIE, video_data['canonical']) + + if season_number is not None and season_number not in season_numbers: + raise ExtractorError(f'Season {season_number} was not found in the collection data') return self.playlist_result( - self._entries(data, document_id), channel_id, - re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', self._og_search_title(webpage) or 
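# [Example] The generic shape of the cursor pagination implemented in _entries above:
# keep requesting pages until pageInfo reports no next page or no end cursor
# (fetch_page is a hypothetical callable returning (page_info, items)):
import itertools

def paginate(fetch_page):
    cursor = None
    for page_number in itertools.count(1):
        page_info, items = fetch_page(page_number, cursor)
        yield from items
        if not page_info.get('hasNextPage') or not page_info.get('endCursor'):
            break
        cursor = page_info['endCursor']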
'')[0] or None, - join_nonempty( - 'headline', 'text', delim='\n\n', - from_dict=traverse_obj(data, ('shortText', {dict}), default={})) or None) + self._entries(playlist_id, canonical_id, season_numbers, season_number), + playlist_id, join_nonempty( + traverse_obj(collection_data, ('title', {str})), + season_number and f'Season {season_number}', delim=' - '), + traverse_obj(collection_data, ('infoText', {str}))) diff --git a/yt_dlp/globals.py b/yt_dlp/globals.py new file mode 100644 index 000000000..0cf276cc9 --- /dev/null +++ b/yt_dlp/globals.py @@ -0,0 +1,30 @@ +from collections import defaultdict + +# Please Note: Due to necessary changes and the complex nature involved in the plugin/globals system, +# no backwards compatibility is guaranteed for the plugin system API. +# However, we will still try our best. + + +class Indirect: + def __init__(self, initial, /): + self.value = initial + + def __repr__(self, /): + return f'{type(self).__name__}({self.value!r})' + + +postprocessors = Indirect({}) +extractors = Indirect({}) + +# Plugins +all_plugins_loaded = Indirect(False) +plugin_specs = Indirect({}) +plugin_dirs = Indirect(['default']) + +plugin_ies = Indirect({}) +plugin_pps = Indirect({}) +plugin_ies_overrides = Indirect(defaultdict(list)) + +# Misc +IN_CLI = Indirect(False) +LAZY_EXTRACTORS = Indirect(None) # `False`=force, `None`=disabled, `True`=enabled diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index ba059babb..b59fb2c61 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -25,7 +25,7 @@ def zeroise(x): with contextlib.suppress(TypeError): if math.isnan(x): # NB: NaN cannot be checked by membership return 0 - return x + return int(float(x)) def wrapped(a, b): return op(zeroise(a), zeroise(b)) & 0xffffffff @@ -95,6 +95,61 @@ def _js_ternary(cndn, if_true=True, if_false=False): return if_true +# Ref: https://es5.github.io/#x9.8.1 +def js_number_to_string(val: float, radix: int = 10): + if radix in (JS_Undefined, None): + radix = 10 + assert radix in range(2, 37), 'radix must be an integer at least 2 and no greater than 36' + + if math.isnan(val): + return 'NaN' + if val == 0: + return '0' + if math.isinf(val): + return '-Infinity' if val < 0 else 'Infinity' + if radix == 10: + # TODO: implement special cases + ... 
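# [Example] Why globals.py wraps shared state in Indirect: importers bind the wrapper
# object once, so later rebinding of `.value` is observed everywhere, unlike rebinding
# a plain module-level name. Self-contained sketch:
class Indirect:
    def __init__(self, initial, /):
        self.value = initial

plugin_dirs = Indirect(['default'])
alias = plugin_dirs        # e.g. `from yt_dlp.globals import plugin_dirs` elsewhere
plugin_dirs.value = []     # what --no-plugin-dirs effectively does
assert alias.value == []   # the rebinding is visible through every reference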
+ + ALPHABET = b'0123456789abcdefghijklmnopqrstuvwxyz.-' + + result = collections.deque() + sign = val < 0 + val = abs(val) + fraction, integer = math.modf(val) + delta = max(math.nextafter(.0, math.inf), math.ulp(val) / 2) + + if fraction >= delta: + result.append(-2) # `.` + while fraction >= delta: + delta *= radix + fraction, digit = math.modf(fraction * radix) + result.append(int(digit)) + # if we need to round, propagate potential carry through fractional part + needs_rounding = fraction > 0.5 or (fraction == 0.5 and int(digit) & 1) + if needs_rounding and fraction + delta > 1: + for index in reversed(range(1, len(result))): + if result[index] + 1 < radix: + result[index] += 1 + break + result.pop() + + else: + integer += 1 + break + + integer, digit = divmod(int(integer), radix) + result.appendleft(digit) + while integer > 0: + integer, digit = divmod(integer, radix) + result.appendleft(digit) + + if sign: + result.appendleft(-1) # `-` + + return bytes(ALPHABET[digit] for digit in result).decode('ascii') + + # Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence _OPERATORS = { # None => Defined in JSInterpreter._operator '?': None, @@ -133,6 +188,7 @@ def _js_ternary(cndn, if_true=True, if_false=False): _NAME_RE = r'[a-zA-Z_$][\w$]*' _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) _QUOTES = '\'"/' +_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?' class JS_Undefined: @@ -246,7 +302,7 @@ def _separate(expr, delim=',', max_split=None): OP_CHARS = '+-*/%&|^=<>!,;{}:[' if not expr: return - counters = {k: 0 for k in _MATCHING_PARENS.values()} + counters = dict.fromkeys(_MATCHING_PARENS.values(), 0) start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): @@ -551,15 +607,18 @@ def dict_item(key, val): m = re.match(fr'''(?x) (?P<assign> - (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* + (?P<out>{_NAME_RE})(?:\[(?P<index>{_NESTED_BRACKETS})\])?\s* (?P<op>{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? 
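# [Example] Expected behaviour of js_number_to_string above, which follows the
# ECMAScript ToString(Number) algorithm (assumes the function is importable from
# yt_dlp.jsinterp as added in this diff):
from yt_dlp.jsinterp import js_number_to_string

assert js_number_to_string(255, 16) == 'ff'
assert js_number_to_string(8, 2) == '1000'
assert js_number_to_string(-1.5) == '-1.5'
assert js_number_to_string(float('inf')) == 'Infinity'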
=(?!=)(?P<expr>.*)$ )|(?P<return> (?!if|return|true|false|null|undefined|NaN)(?P<name>{_NAME_RE})$ + )|(?P<attribute> + (?P<var>{_NAME_RE})(?: + (?P<nullish>\?)?\.(?P<member>[^(]+)| + \[(?P<member2>{_NESTED_BRACKETS})\] + )\s* )|(?P<indexing> (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ - )|(?P<attribute> - (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* )|(?P<function> (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ )''', expr) @@ -652,7 +711,7 @@ def eval_method(): if obj is NO_DEFAULT: if variable not in self._objects: try: - self._objects[variable] = self.extract_object(variable) + self._objects[variable] = self.extract_object(variable, local_vars) except self.Exception: if not nullish: raise @@ -792,7 +851,7 @@ def interpret_expression(self, expr, local_vars, allow_recursion): raise self.Exception('Cannot return from an expression', expr) return ret - def extract_object(self, objname): + def extract_object(self, objname, *global_stack): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( @@ -814,7 +873,8 @@ def extract_object(self, objname): for f in fields_m: argnames = f.group('args').split(',') name = remove_quotes(f.group('key')) - obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), f'F<{name}>') + obj[name] = function_with_repr( + self.build_function(argnames, f.group('code'), *global_stack), f'F<{name}>') return obj @@ -835,9 +895,9 @@ def extract_function_code(self, funcname): code, _ = self._separate_at_paren(func_m.group('code')) return [x.strip() for x in func_m.group('args').split(',')], code - def extract_function(self, funcname): + def extract_function(self, funcname, *global_stack): return function_with_repr( - self.extract_function_from_code(*self.extract_function_code(funcname)), + self.extract_function_from_code(*self.extract_function_code(funcname), *global_stack), f'F<{funcname}>') def extract_function_from_code(self, argnames, code, *global_stack): diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 1eaa0ee5f..39158a8cc 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -3,6 +3,7 @@ from .common import ( HEADRequest, + PATCHRequest, PUTRequest, Request, RequestDirector, diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py index 0643348e7..c800f2c09 100644 --- a/yt_dlp/networking/_curlcffi.py +++ b/yt_dlp/networking/_curlcffi.py @@ -1,6 +1,7 @@ from __future__ import annotations import io +import itertools import math import re import urllib.parse @@ -31,9 +32,9 @@ curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3])) -if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)): +if curl_cffi_version != (0, 5, 10) and not (0, 10) <= curl_cffi_version: curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)' - raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported') + raise ImportError('Only curl_cffi versions 0.5.10 and 0.10.x are supported') import curl_cffi.requests from curl_cffi.const import CurlECode, CurlOpt @@ -97,7 +98,7 @@ def read(self, amt=None): return self.fp.read(amt) except curl_cffi.requests.errors.RequestsError as e: if e.code == CurlECode.PARTIAL_FILE: - content_length = int_or_none(e.response.headers.get('Content-Length')) + content_length = e.response and int_or_none(e.response.headers.get('Content-Length')) raise IncompleteRead( partial=self.fp.bytes_read, 
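# [Example] What the new _NESTED_BRACKETS pattern buys in the assignment regex above:
# indices with up to two levels of nested brackets now match, where the old lazy
# [^\]]+? index pattern stopped at the first closing bracket. Standalone check:
import re

_NAME_RE = r'[a-zA-Z_$][\w$]*'
_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?'
m = re.match(rf'(?P<out>{_NAME_RE})\[(?P<index>{_NESTED_BRACKETS})\]\s*=', 'c[d[e[0]]] = 1')
assert m and m.group('index') == 'd[e[0]]'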
expected=content_length - self.fp.bytes_read if content_length is not None else None, @@ -105,6 +106,51 @@ def read(self, amt=None): raise TransportError(cause=e) from e +# See: https://github.com/lexiforest/curl_cffi?tab=readme-ov-file#supported-impersonate-browsers +# https://github.com/lexiforest/curl-impersonate?tab=readme-ov-file#supported-browsers +BROWSER_TARGETS: dict[tuple[int, ...], dict[str, ImpersonateTarget]] = { + (0, 5): { + 'chrome99': ImpersonateTarget('chrome', '99', 'windows', '10'), + 'chrome99_android': ImpersonateTarget('chrome', '99', 'android', '12'), + 'chrome100': ImpersonateTarget('chrome', '100', 'windows', '10'), + 'chrome101': ImpersonateTarget('chrome', '101', 'windows', '10'), + 'chrome104': ImpersonateTarget('chrome', '104', 'windows', '10'), + 'chrome107': ImpersonateTarget('chrome', '107', 'windows', '10'), + 'chrome110': ImpersonateTarget('chrome', '110', 'windows', '10'), + 'edge99': ImpersonateTarget('edge', '99', 'windows', '10'), + 'edge101': ImpersonateTarget('edge', '101', 'windows', '10'), + 'safari15_3': ImpersonateTarget('safari', '15.3', 'macos', '11'), + 'safari15_5': ImpersonateTarget('safari', '15.5', 'macos', '12'), + }, + (0, 7): { + 'chrome116': ImpersonateTarget('chrome', '116', 'windows', '10'), + 'chrome119': ImpersonateTarget('chrome', '119', 'macos', '14'), + 'chrome120': ImpersonateTarget('chrome', '120', 'macos', '14'), + 'chrome123': ImpersonateTarget('chrome', '123', 'macos', '14'), + 'chrome124': ImpersonateTarget('chrome', '124', 'macos', '14'), + 'safari17_0': ImpersonateTarget('safari', '17.0', 'macos', '14'), + 'safari17_2_ios': ImpersonateTarget('safari', '17.2', 'ios', '17.2'), + }, + (0, 9): { + 'safari15_3': ImpersonateTarget('safari', '15.3', 'macos', '14'), + 'safari15_5': ImpersonateTarget('safari', '15.5', 'macos', '14'), + 'chrome119': ImpersonateTarget('chrome', '119', 'macos', '14'), + 'chrome120': ImpersonateTarget('chrome', '120', 'macos', '14'), + 'chrome123': ImpersonateTarget('chrome', '123', 'macos', '14'), + 'chrome124': ImpersonateTarget('chrome', '124', 'macos', '14'), + 'chrome131': ImpersonateTarget('chrome', '131', 'macos', '14'), + 'chrome131_android': ImpersonateTarget('chrome', '131', 'android', '14'), + 'chrome133a': ImpersonateTarget('chrome', '133', 'macos', '15'), + 'firefox133': ImpersonateTarget('firefox', '133', 'macos', '14'), + 'safari18_0': ImpersonateTarget('safari', '18.0', 'macos', '15'), + 'safari18_0_ios': ImpersonateTarget('safari', '18.0', 'ios', '18.0'), + }, + (0, 10): { + 'firefox135': ImpersonateTarget('firefox', '135', 'macos', '14'), + }, +} + + @register_rh class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): RH_NAME = 'curl_cffi' @@ -112,30 +158,21 @@ class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin): _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') _SUPPORTED_IMPERSONATE_TARGET_MAP = { - **({ - ImpersonateTarget('chrome', '124', 'macos', '14'): curl_cffi.requests.BrowserType.chrome124, - ImpersonateTarget('chrome', '123', 'macos', '14'): curl_cffi.requests.BrowserType.chrome123, - ImpersonateTarget('chrome', '120', 'macos', '14'): curl_cffi.requests.BrowserType.chrome120, - ImpersonateTarget('chrome', '119', 'macos', '14'): curl_cffi.requests.BrowserType.chrome119, - ImpersonateTarget('chrome', '116', 'windows', '10'): curl_cffi.requests.BrowserType.chrome116, - } if curl_cffi_version >= (0, 7, 0) else {}), - ImpersonateTarget('chrome', '110', 
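# [Example] How the BROWSER_TARGETS table above is consumed: merge every version bucket
# whose minimum curl_cffi version is satisfied (a sketch of the comprehension used in
# _SUPPORTED_IMPERSONATE_TARGET_MAP below; importing from this private module requires
# a supported curl_cffi install):
import itertools
from yt_dlp.networking._curlcffi import BROWSER_TARGETS

installed_version = (0, 10, 0)  # assumed installed curl_cffi version
available = dict(itertools.chain.from_iterable(
    targets.items()
    for version, targets in BROWSER_TARGETS.items()
    if installed_version >= version))
assert 'firefox135' in available and 'chrome131' in available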
'windows', '10'): curl_cffi.requests.BrowserType.chrome110, - ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107, - ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104, - ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101, - ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100, - ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99, - ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101, - ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99, - **({ - ImpersonateTarget('safari', '17.0', 'macos', '14'): curl_cffi.requests.BrowserType.safari17_0, - } if curl_cffi_version >= (0, 7, 0) else {}), - ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5, - ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3, - ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android, - **({ - ImpersonateTarget('safari', '17.2', 'ios', '17.2'): curl_cffi.requests.BrowserType.safari17_2_ios, - } if curl_cffi_version >= (0, 7, 0) else {}), + target: name if curl_cffi_version >= (0, 9) else curl_cffi.requests.BrowserType[name] + for name, target in dict(sorted(itertools.chain.from_iterable( + targets.items() + for version, targets in BROWSER_TARGETS.items() + if curl_cffi_version >= version + ), key=lambda x: ( + # deprioritize mobile targets since they give very different behavior + x[1].os not in ('ios', 'android'), + # prioritize edge < firefox < safari < chrome + ('edge', 'firefox', 'safari', 'chrome').index(x[1].client), + # prioritize newest version + float(x[1].version) if x[1].version else 0, + # group by os name + x[1].os, + ), reverse=True)).items() } def _create_instance(self, cookiejar=None): diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 7de95ab3b..5b6b264a6 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -21,9 +21,11 @@ urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.')) if urllib3_version < (1, 26, 17): + urllib3._yt_dlp__version = f'{urllib3.__version__} (unsupported)' raise ImportError('Only urllib3 >= 1.26.17 is supported') if requests.__build__ < 0x023202: + requests._yt_dlp__version = f'{requests.__version__} (unsupported)' raise ImportError('Only requests >= 2.32.2 is supported') import requests.adapters @@ -296,6 +298,7 @@ def _check_extensions(self, extensions): extensions.pop('cookiejar', None) extensions.pop('timeout', None) extensions.pop('legacy_ssl', None) + extensions.pop('keep_header_casing', None) def _create_instance(self, cookiejar, legacy_ssl_support=None): session = RequestsSession() @@ -312,11 +315,12 @@ def _create_instance(self, cookiejar, legacy_ssl_support=None): session.trust_env = False # no need, we already load proxies from env return session - def _send(self, request): - - headers = self._merge_headers(request.headers) + def _prepare_headers(self, _, headers): add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + def _send(self, request): + + headers = self._get_headers(request) max_redirects_exceeded = False session = self._get_instance( diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 510bb2a69..a188b35f5 100644 --- 
a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -379,13 +379,15 @@ def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None): opener.addheaders = [] return opener - def _send(self, request): - headers = self._merge_headers(request.headers) + def _prepare_headers(self, _, headers): add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) + + def _send(self, request): + headers = self._get_headers(request) urllib_req = urllib.request.Request( url=request.url, data=request.data, - headers=dict(headers), + headers=headers, method=request.method, ) diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py index ec55567da..d29f8e45a 100644 --- a/yt_dlp/networking/_websockets.py +++ b/yt_dlp/networking/_websockets.py @@ -34,6 +34,7 @@ websockets_version = tuple(map(int_or_none, websockets.version.version.split('.'))) if websockets_version < (13, 0): + websockets._yt_dlp__version = f'{websockets.version.version} (unsupported)' raise ImportError('Only websockets>=13.0 is supported') import websockets.sync.client @@ -116,6 +117,7 @@ def _check_extensions(self, extensions): extensions.pop('timeout', None) extensions.pop('cookiejar', None) extensions.pop('legacy_ssl', None) + extensions.pop('keep_header_casing', None) def close(self): # Remove the logging handler that contains a reference to our logger @@ -123,15 +125,16 @@ def close(self): for name, handler in self.__logging_handlers.items(): logging.getLogger(name).removeHandler(handler) - def _send(self, request): - timeout = self._calculate_timeout(request) - headers = self._merge_headers(request.headers) + def _prepare_headers(self, request, headers): if 'cookie' not in headers: cookiejar = self._get_cookiejar(request) cookie_header = cookiejar.get_cookie_header(request.url) if cookie_header: headers['cookie'] = cookie_header + def _send(self, request): + timeout = self._calculate_timeout(request) + headers = self._get_headers(request) wsuri = parse_uri(request.url) create_conn_kwargs = { 'source_address': (self.source_address, 0) if self.source_address else None, diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index e8951c7e7..e33769422 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -206,6 +206,7 @@ class RequestHandler(abc.ABC): - `cookiejar`: Cookiejar to use for this request. - `timeout`: socket timeout to use for this request. - `legacy_ssl`: Enable legacy SSL options for this request. See legacy_ssl_support. + - `keep_header_casing`: Keep the casing of headers when sending the request. To enable these, add extensions.pop('<extension>', None) to _check_extensions Apart from the url protocol, proxies dict may contain the following keys: @@ -259,6 +260,23 @@ def _make_sslcontext(self, legacy_ssl_support=None): def _merge_headers(self, request_headers): return HTTPHeaderDict(self.headers, request_headers) + def _prepare_headers(self, request: Request, headers: HTTPHeaderDict) -> None: # noqa: B027 + """Additional operations to prepare headers before building. To be extended by subclasses. + @param request: Request object + @param headers: Merged headers to prepare + """ + + def _get_headers(self, request: Request) -> dict[str, str]: + """ + Get headers for external use. + Subclasses may define a _prepare_headers method to modify headers after merge but before building. 
+ """ + headers = self._merge_headers(request.headers) + self._prepare_headers(request, headers) + if request.extensions.get('keep_header_casing'): + return headers.sensitive() + return dict(headers) + def _calculate_timeout(self, request): return float(request.extensions.get('timeout') or self.timeout) @@ -317,6 +335,7 @@ def _check_extensions(self, extensions): assert isinstance(extensions.get('cookiejar'), (YoutubeDLCookieJar, NoneType)) assert isinstance(extensions.get('timeout'), (float, int, NoneType)) assert isinstance(extensions.get('legacy_ssl'), (bool, NoneType)) + assert isinstance(extensions.get('keep_header_casing'), (bool, NoneType)) def _validate(self, request): self._check_url_scheme(request) @@ -486,6 +505,7 @@ def copy(self): HEADRequest = functools.partial(Request, method='HEAD') +PATCHRequest = functools.partial(Request, method='PATCH') PUTRequest = functools.partial(Request, method='PUT') diff --git a/yt_dlp/networking/impersonate.py b/yt_dlp/networking/impersonate.py index 0626b3b49..b90d10b76 100644 --- a/yt_dlp/networking/impersonate.py +++ b/yt_dlp/networking/impersonate.py @@ -5,11 +5,11 @@ from dataclasses import dataclass from typing import Any -from .common import RequestHandler, register_preference +from .common import RequestHandler, register_preference, Request from .exceptions import UnsupportedRequest from ..compat.types import NoneType from ..utils import classproperty, join_nonempty -from ..utils.networking import std_headers +from ..utils.networking import std_headers, HTTPHeaderDict @dataclass(order=True, frozen=True) @@ -123,7 +123,17 @@ def _get_request_target(self, request): """Get the requested target for the request""" return self._resolve_target(request.extensions.get('impersonate') or self.impersonate) - def _get_impersonate_headers(self, request): + def _prepare_impersonate_headers(self, request: Request, headers: HTTPHeaderDict) -> None: # noqa: B027 + """Additional operations to prepare headers before building. To be extended by subclasses. + @param request: Request object + @param headers: Merged headers to prepare + """ + + def _get_impersonate_headers(self, request: Request) -> dict[str, str]: + """ + Get headers for external impersonation use. + Subclasses may define a _prepare_impersonate_headers method to modify headers after merge but before building. 
+ """ headers = self._merge_headers(request.headers) if self._get_request_target(request) is not None: # remove all headers present in std_headers @@ -131,7 +141,11 @@ def _get_impersonate_headers(self, request): for k, v in std_headers.items(): if headers.get(k) == v: headers.pop(k) - return headers + + self._prepare_impersonate_headers(request, headers) + if request.extensions.get('keep_header_casing'): + return headers.sensitive() + return dict(headers) @register_preference(ImpersonateRequestHandler) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index a965fb7c0..5249236b0 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -150,6 +150,15 @@ def format_option_strings(option): return opts +_PRESET_ALIASES = { + 'mp3': ['-f', 'ba[acodec^=mp3]/ba/b', '-x', '--audio-format', 'mp3'], + 'aac': ['-f', 'ba[acodec^=aac]/ba[acodec^=mp4a.40.]/ba/b', '-x', '--audio-format', 'aac'], + 'mp4': ['--merge-output-format', 'mp4', '--remux-video', 'mp4', '-S', 'vcodec:h264,lang,quality,res,fps,hdr:12,acodec:aac'], + 'mkv': ['--merge-output-format', 'mkv', '--remux-video', 'mkv'], + 'sleep': ['--sleep-subtitles', '5', '--sleep-requests', '0.75', '--sleep-interval', '10', '--max-sleep-interval', '20'], +} + + class _YoutubeDLOptionParser(optparse.OptionParser): # optparse is deprecated since Python 3.2. So assume a stable interface even for private methods ALIAS_DEST = '_triggered_aliases' @@ -215,6 +224,22 @@ def _match_long_opt(self, opt): return e.possibilities[0] raise + def format_option_help(self, formatter=None): + assert formatter, 'Formatter can not be None' + formatted_help = super().format_option_help(formatter=formatter) + formatter.indent() + heading = formatter.format_heading('Preset Aliases') + formatter.indent() + result = [] + for name, args in _PRESET_ALIASES.items(): + option = optparse.Option('-t', help=shlex.join(args)) + formatter.option_strings[option] = f'-t {name}' + result.append(formatter.format_option(option)) + formatter.dedent() + formatter.dedent() + help_lines = '\n'.join(result) + return f'{formatted_help}\n{heading}{help_lines}' + def create_parser(): def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): @@ -317,6 +342,13 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): parser.rargs[:0] = shlex.split( opts if value is None else opts.format(*map(shlex.quote, value))) + def _preset_alias_callback(option, opt_str, value, parser): + if not value: + return + if value not in _PRESET_ALIASES: + raise optparse.OptionValueError(f'Unknown preset alias: {value}') + parser.rargs[:0] = _PRESET_ALIASES[value] + general = optparse.OptionGroup(parser, 'General Options') general.add_option( '-h', '--help', dest='print_help', action='store_true', @@ -398,7 +430,7 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '(Alias: --no-config)')) general.add_option( '--no-config-locations', - action='store_const', dest='config_locations', const=[], + action='store_const', dest='config_locations', const=None, help=( 'Do not load any custom configuration files (default). When given inside a ' 'configuration file, ignore all previous --config-locations defined in the current file')) @@ -410,12 +442,21 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): '("-" for stdin). 
Can be used multiple times and inside other configuration files')) general.add_option( '--plugin-dirs', - dest='plugin_dirs', metavar='PATH', action='append', + metavar='PATH', + dest='plugin_dirs', + action='callback', + callback=_list_from_options_callback, + type='str', + callback_kwargs={'delim': None}, + default=['default'], help=( 'Path to an additional directory to search for plugins. ' 'This option can be used multiple times to add multiple directories. ' - 'Note that this currently only works for extractor plugins; ' - 'postprocessor plugins can only be loaded from the default plugin directories')) + 'Use "default" to search the default plugin directories (default)')) + general.add_option( + '--no-plugin-dirs', + dest='plugin_dirs', action='store_const', const=[], + help='Clear plugin directories to search, including defaults and those provided by previous --plugin-dirs') general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, @@ -498,7 +539,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext', '-prefer-vp9-sort'], '2021': ['2022', 'no-certifi', 'filename-sanitization'], '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], - '2023': ['prefer-vp9-sort'], + '2023': ['2024', 'prefer-vp9-sort'], + '2024': [], }, }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' @@ -516,6 +558,15 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. ' 'This option can be used multiple times')) + general.add_option( + '-t', '--preset-alias', + metavar='PRESET', dest='_', type='str', + action='callback', callback=_preset_alias_callback, + help=( + 'Applies a predefined set of options. e.g. --preset-alias mp3. ' + f'The following presets are available: {", ".join(_PRESET_ALIASES)}. ' + 'See the "Preset Aliases" section at the end for more info. ' + 'This option can be used multiple times')) network = optparse.OptionGroup(parser, 'Network Options') network.add_option( diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 94335a9a3..941709b21 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -1,4 +1,5 @@ import contextlib +import dataclasses import functools import importlib import importlib.abc @@ -14,17 +15,48 @@ from pathlib import Path from zipfile import ZipFile +from .globals import ( + Indirect, + plugin_dirs, + all_plugins_loaded, + plugin_specs, +) + from .utils import ( - Config, get_executable_path, get_system_config_dirs, get_user_config_dirs, + merge_dicts, orderedSet, write_string, ) PACKAGE_NAME = 'yt_dlp_plugins' COMPAT_PACKAGE_NAME = 'ytdlp_plugins' +_BASE_PACKAGE_PATH = Path(__file__).parent + + +# Please Note: Due to necessary changes and the complex nature involved, +# no backwards compatibility is guaranteed for the plugin system API. +# However, we will still try our best. 
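# [Example] The compat-options year aliases now chain ('2022' -> '2023' -> '2024'); a
# sketch of the transitive expansion this implies (the real expansion happens inside
# the option callback machinery, not via this helper):
CHAIN = {
    '2023': ['2024', 'prefer-vp9-sort'],
    '2024': [],
}

def expand(alias):
    for item in CHAIN.get(alias, (alias,)):
        yield from expand(item) if item in CHAIN else (item,)

assert list(expand('2023')) == ['prefer-vp9-sort']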
+ +__all__ = [ + 'COMPAT_PACKAGE_NAME', + 'PACKAGE_NAME', + 'PluginSpec', + 'directories', + 'load_all_plugins', + 'load_plugins', + 'register_plugin_spec', +] + + +@dataclasses.dataclass +class PluginSpec: + module_name: str + suffix: str + destination: Indirect + plugin_destination: Indirect class PluginLoader(importlib.abc.Loader): @@ -44,7 +76,42 @@ def dirs_in_zip(archive): pass except Exception as e: write_string(f'WARNING: Could not read zip file {archive}: {e}\n') - return set() + return () + + +def default_plugin_paths(): + def _get_package_paths(*root_paths, containing_folder): + for config_dir in orderedSet(map(Path, root_paths), lazy=True): + # We need to filter the base path added when running __main__.py directly + if config_dir == _BASE_PACKAGE_PATH: + continue + with contextlib.suppress(OSError): + yield from (config_dir / containing_folder).iterdir() + + # Load from yt-dlp config folders + yield from _get_package_paths( + *get_user_config_dirs('yt-dlp'), + *get_system_config_dirs('yt-dlp'), + containing_folder='plugins', + ) + + # Load from yt-dlp-plugins folders + yield from _get_package_paths( + get_executable_path(), + *get_user_config_dirs(''), + *get_system_config_dirs(''), + containing_folder='yt-dlp-plugins', + ) + + # Load from PYTHONPATH directories + yield from (path for path in map(Path, sys.path) if path != _BASE_PACKAGE_PATH) + + +def candidate_plugin_paths(candidate): + candidate_path = Path(candidate) + if not candidate_path.is_dir(): + raise ValueError(f'Invalid plugin directory: {candidate_path}') + yield from candidate_path.iterdir() class PluginFinder(importlib.abc.MetaPathFinder): @@ -56,40 +123,16 @@ class PluginFinder(importlib.abc.MetaPathFinder): def __init__(self, *packages): self._zip_content_cache = {} - self.packages = set(itertools.chain.from_iterable( - itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b))) - for name in packages)) + self.packages = set( + itertools.chain.from_iterable( + itertools.accumulate(name.split('.'), lambda a, b: '.'.join((a, b))) + for name in packages)) def search_locations(self, fullname): - candidate_locations = [] - - def _get_package_paths(*root_paths, containing_folder='plugins'): - for config_dir in orderedSet(map(Path, root_paths), lazy=True): - with contextlib.suppress(OSError): - yield from (config_dir / containing_folder).iterdir() - - # Load from yt-dlp config folders - candidate_locations.extend(_get_package_paths( - *get_user_config_dirs('yt-dlp'), - *get_system_config_dirs('yt-dlp'), - containing_folder='plugins')) - - # Load from yt-dlp-plugins folders - candidate_locations.extend(_get_package_paths( - get_executable_path(), - *get_user_config_dirs(''), - *get_system_config_dirs(''), - containing_folder='yt-dlp-plugins')) - - candidate_locations.extend(map(Path, sys.path)) # PYTHONPATH - with contextlib.suppress(ValueError): # Added when running __main__.py directly - candidate_locations.remove(Path(__file__).parent) - - # TODO(coletdjnz): remove when plugin globals system is implemented - if Config._plugin_dirs: - candidate_locations.extend(_get_package_paths( - *Config._plugin_dirs, - containing_folder='')) + candidate_locations = itertools.chain.from_iterable( + default_plugin_paths() if candidate == 'default' else candidate_plugin_paths(candidate) + for candidate in plugin_dirs.value + ) parts = Path(*fullname.split('.')) for path in orderedSet(candidate_locations, lazy=True): @@ -109,7 +152,8 @@ def find_spec(self, fullname, path=None, target=None): search_locations = list(map(str, 
self.search_locations(fullname))) if not search_locations: - return None + # Prevent using built-in meta finders for searching plugins. + raise ModuleNotFoundError(fullname) spec = importlib.machinery.ModuleSpec(fullname, PluginLoader(), is_package=True) spec.submodule_search_locations = search_locations @@ -123,8 +167,10 @@ def invalidate_caches(self): def directories(): - spec = importlib.util.find_spec(PACKAGE_NAME) - return spec.submodule_search_locations if spec else [] + with contextlib.suppress(ModuleNotFoundError): + if spec := importlib.util.find_spec(PACKAGE_NAME): + return list(spec.submodule_search_locations) + return [] def iter_modules(subpackage): @@ -134,19 +180,23 @@ def iter_modules(subpackage): yield from pkgutil.iter_modules(path=pkg.__path__, prefix=f'{fullname}.') -def load_module(module, module_name, suffix): +def get_regular_classes(module, module_name, suffix): + # Find standard public plugin classes (not overrides) return inspect.getmembers(module, lambda obj: ( inspect.isclass(obj) and obj.__name__.endswith(suffix) and obj.__module__.startswith(module_name) and not obj.__name__.startswith('_') - and obj.__name__ in getattr(module, '__all__', [obj.__name__]))) + and obj.__name__ in getattr(module, '__all__', [obj.__name__]) + and getattr(obj, 'PLUGIN_NAME', None) is None + )) -def load_plugins(name, suffix): - classes = {} - if os.environ.get('YTDLP_NO_PLUGINS'): - return classes +def load_plugins(plugin_spec: PluginSpec): + name, suffix = plugin_spec.module_name, plugin_spec.suffix + regular_classes = {} + if os.environ.get('YTDLP_NO_PLUGINS') or not plugin_dirs.value: + return regular_classes for finder, module_name, _ in iter_modules(name): if any(x.startswith('_') for x in module_name.split('.')): @@ -163,24 +213,42 @@ def load_plugins(name, suffix): sys.modules[module_name] = module spec.loader.exec_module(module) except Exception: - write_string(f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}') + write_string( + f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}', + ) continue - classes.update(load_module(module, module_name, suffix)) + regular_classes.update(get_regular_classes(module, module_name, suffix)) # Compat: old plugin system using __init__.py # Note: plugins imported this way do not show up in directories() # nor are considered part of the yt_dlp_plugins namespace package - with contextlib.suppress(FileNotFoundError): - spec = importlib.util.spec_from_file_location( - name, Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py')) - plugins = importlib.util.module_from_spec(spec) - sys.modules[spec.name] = plugins - spec.loader.exec_module(plugins) - classes.update(load_module(plugins, spec.name, suffix)) + if 'default' in plugin_dirs.value: + with contextlib.suppress(FileNotFoundError): + spec = importlib.util.spec_from_file_location( + name, + Path(get_executable_path(), COMPAT_PACKAGE_NAME, name, '__init__.py'), + ) + plugins = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = plugins + spec.loader.exec_module(plugins) + regular_classes.update(get_regular_classes(plugins, spec.name, suffix)) - return classes + # Add the classes into the global plugin lookup for that type + plugin_spec.plugin_destination.value = regular_classes + # We want to prepend to the main lookup for that type + plugin_spec.destination.value = merge_dicts(regular_classes, plugin_spec.destination.value) + + return regular_classes -sys.meta_path.insert(0, 
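# [Example] Exercising the plugin-class predicate from get_regular_classes above
# against a synthetic module (module and class names here are hypothetical):
import inspect
import types

mod = types.ModuleType('yt_dlp_plugins.postprocessor.sample')
class ExamplePP: ...
class _HiddenPP: ...
ExamplePP.__module__ = _HiddenPP.__module__ = mod.__name__
mod.ExamplePP, mod._HiddenPP = ExamplePP, _HiddenPP

found = inspect.getmembers(mod, lambda obj: (
    inspect.isclass(obj)
    and obj.__name__.endswith('PP')
    and obj.__module__.startswith(mod.__name__)
    and not obj.__name__.startswith('_')
    and obj.__name__ in getattr(mod, '__all__', [obj.__name__])
    and getattr(obj, 'PLUGIN_NAME', None) is None))
assert [name for name, _ in found] == ['ExamplePP']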
PluginFinder(f'{PACKAGE_NAME}.extractor', f'{PACKAGE_NAME}.postprocessor')) +def load_all_plugins(): + for plugin_spec in plugin_specs.value.values(): + load_plugins(plugin_spec) + all_plugins_loaded.value = True -__all__ = ['COMPAT_PACKAGE_NAME', 'PACKAGE_NAME', 'directories', 'load_plugins'] + +def register_plugin_spec(plugin_spec: PluginSpec): + # If the plugin spec for a module is already registered, it will not be added again + if plugin_spec.module_name not in plugin_specs.value: + plugin_specs.value[plugin_spec.module_name] = plugin_spec + sys.meta_path.insert(0, PluginFinder(f'{PACKAGE_NAME}.{plugin_spec.module_name}')) diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py index 7b1620544..20e8b14b2 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -33,15 +33,38 @@ from .sponskrub import SponSkrubPP from .sponsorblock import SponsorBlockPP from .xattrpp import XAttrMetadataPP -from ..plugins import load_plugins +from ..globals import plugin_pps, postprocessors +from ..plugins import PACKAGE_NAME, register_plugin_spec, PluginSpec +from ..utils import deprecation_warning -_PLUGIN_CLASSES = load_plugins('postprocessor', 'PP') + +def __getattr__(name): + lookup = plugin_pps.value + if name in lookup: + deprecation_warning( + f'Importing a plugin Post-Processor from {__name__} is deprecated. ' + f'Please import {PACKAGE_NAME}.postprocessor.{name} instead.') + return lookup[name] + + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') def get_postprocessor(key): - return globals()[key + 'PP'] + return postprocessors.value[key + 'PP'] -globals().update(_PLUGIN_CLASSES) -__all__ = [name for name in globals() if name.endswith('PP')] -__all__.extend(('FFmpegPostProcessor', 'PostProcessor')) +register_plugin_spec(PluginSpec( + module_name='postprocessor', + suffix='PP', + destination=postprocessors, + plugin_destination=plugin_pps, +)) + +_default_pps = { + name: value + for name, value in globals().items() + if name.endswith('PP') or name in ('FFmpegPostProcessor', 'PostProcessor') +} +postprocessors.value.update(_default_pps) + +__all__ = list(_default_pps.values()) diff --git a/yt_dlp/postprocessor/common.py b/yt_dlp/postprocessor/common.py index be2bb33f6..f0a71c1ff 100644 --- a/yt_dlp/postprocessor/common.py +++ b/yt_dlp/postprocessor/common.py @@ -10,6 +10,7 @@ _configuration_args, deprecation_warning, ) +from ..utils._utils import _ProgressState class PostProcessorMetaClass(type): @@ -189,7 +190,7 @@ def report_progress(self, s): self._downloader.to_console_title(self._downloader.evaluate_outtmpl( progress_template.get('postprocess-title') or 'yt-dlp %(progress._default_template)s', - progress_dict)) + progress_dict), _ProgressState.from_dict(s), s.get('_percent')) def _retry_download(self, err, count, retries): # While this is not an extractor, it behaves similar to one and diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 8965806ae..59a49aa57 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -202,7 +202,7 @@ def _probe_version(self): @property def available(self): - return self.basename is not None + return bool(self._ffmpeg_location.get()) or self.basename is not None @property def executable(self): @@ -743,7 +743,7 @@ def add(meta_list, info_list=None): if value not in ('', None): value = ', '.join(map(str, variadic(value))) value = value.replace('\0', '') # nul character cannot be passed in command line - 
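# [Example] The deprecation shim above relies on PEP 562 module-level __getattr__;
# the same pattern in isolation (the file and names below are hypothetical):
# legacy_pp.py
_moved = {'ExamplePP': object}  # stand-in for the plugin lookup

def __getattr__(name):
    if name in _moved:
        print(f'Importing {name} from legacy_pp is deprecated')
        return _moved[name]
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
# `from legacy_pp import ExamplePP` then warns but still resolves.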
metadata['common'].update({meta_f: value for meta_f in variadic(meta_list)}) + metadata['common'].update(dict.fromkeys(variadic(meta_list), value)) # Info on media metadata/metadata supported by ffmpeg: # https://wiki.multimedia.cx/index.php/FFmpeg_Metadata diff --git a/yt_dlp/update.py b/yt_dlp/update.py index 360f5ad58..de289cb78 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -117,7 +117,7 @@ def current_git_head(): } _NON_UPDATEABLE_REASONS = { - **{variant: None for variant in _FILE_SUFFIXES}, # Updatable + **dict.fromkeys(_FILE_SUFFIXES), # Updatable **{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release' for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()}, 'py2exe': 'py2exe is no longer supported by yt-dlp; This executable cannot be updated', @@ -202,7 +202,7 @@ class UpdateInfo: requested_version: str | None = None commit: str | None = None - binary_name: str | None = _get_binary_name() # noqa: RUF009: Always returns the same value + binary_name: str | None = _get_binary_name() # noqa: RUF009 # Always returns the same value checksum: str | None = None diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 39f24d2e8..42bfe3974 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -8,6 +8,7 @@ import datetime as dt import email.header import email.utils +import enum import errno import functools import hashlib @@ -51,8 +52,9 @@ compat_HTMLParseError, ) from ..dependencies import xattr +from ..globals import IN_CLI -__name__ = __name__.rsplit('.', 1)[0] # noqa: A001: Pretend to be the parent module +__name__ = __name__.rsplit('.', 1)[0] # noqa: A001 # Pretend to be the parent module class NO_DEFAULT: @@ -1486,8 +1488,7 @@ def write_string(s, out=None, encoding=None): # TODO: Use global logger def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): - from .. import _IN_CLI - if _IN_CLI: + if IN_CLI.value: if msg in deprecation_warning._cache: return deprecation_warning._cache.add(msg) @@ -2043,7 +2044,7 @@ def url_or_none(url): if not url or not isinstance(url, str): return None url = url.strip() - return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None + return url if re.match(r'(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?|wss?):)?//', url) else None def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): @@ -2771,7 +2772,8 @@ def process_escape(match): def template_substitute(match): evaluated = js_to_json(match.group(1), vars, strict=strict) if evaluated[0] == '"': - return json.loads(evaluated) + with contextlib.suppress(json.JSONDecodeError): + return json.loads(evaluated) return evaluated def fix_kv(m): @@ -3251,7 +3253,7 @@ def _match_one(filter_part, dct, incomplete): op = lambda attr, value: not unnegated_op(attr, value) else: op = unnegated_op - comparison_value = m['quotedstrval'] or m['strval'] or m['intval'] + comparison_value = m['quotedstrval'] or m['strval'] if m['quote']: comparison_value = comparison_value.replace(r'\{}'.format(m['quote']), m['quote']) actual_value = dct.get(m['key']) @@ -4895,10 +4897,6 @@ class Config: filename = None __initialized = False - # Internal only, do not use! 
Hack to enable --plugin-dirs - # TODO(coletdjnz): remove when plugin globals system is implemented - _plugin_dirs = None - def __init__(self, parser, label=None): self.parser, self.label = parser, label self._loaded_paths, self.configs = set(), [] @@ -5636,6 +5634,24 @@ def filesize_from_tbr(tbr, duration): return int(duration * tbr * (1000 / 8)) +def _request_dump_filename(url, video_id, data=None, trim_length=None): + if data is not None: + data = hashlib.md5(data).hexdigest() + basen = join_nonempty(video_id, data, url, delim='_') + trim_length = trim_length or 240 + if len(basen) > trim_length: + h = '___' + hashlib.md5(basen.encode()).hexdigest() + basen = basen[:trim_length - len(h)] + h + filename = sanitize_filename(f'{basen}.dump', restricted=True) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if os.name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = fR'\\?\{absfilepath}' + return filename + + # XXX: Temporary class _YDLLogger: def __init__(self, ydl=None): @@ -5664,3 +5680,32 @@ def stdout(self, message): def stderr(self, message): if self._ydl: self._ydl.to_stderr(message) + + +class _ProgressState(enum.Enum): + """ + Represents a state for a progress bar. + + See: https://conemu.github.io/en/AnsiEscapeCodes.html#ConEmu_specific_OSC + """ + + HIDDEN = 0 + INDETERMINATE = 3 + VISIBLE = 1 + WARNING = 4 + ERROR = 2 + + @classmethod + def from_dict(cls, s, /): + if s['status'] == 'finished': + return cls.INDETERMINATE + + # Not currently used + if s['status'] == 'error': + return cls.ERROR + + return cls.INDETERMINATE if s.get('_percent') is None else cls.VISIBLE + + def get_ansi_escape(self, /, percent=None): + percent = 0 if percent is None else int(percent) + return f'\033]9;4;{self.value};{percent}\007' diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index 933b164be..542abace8 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -1,9 +1,16 @@ +from __future__ import annotations + import collections +import collections.abc import random +import typing import urllib.parse import urllib.request -from ._utils import remove_start +if typing.TYPE_CHECKING: + T = typing.TypeVar('T') + +from ._utils import NO_DEFAULT, remove_start def random_user_agent(): @@ -51,32 +58,141 @@ def random_user_agent(): return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) -class HTTPHeaderDict(collections.UserDict, dict): +class HTTPHeaderDict(dict): """ Store and access keys case-insensitively. The constructor can take multiple dicts, in which keys in the latter are prioritised. + + Retains a case sensitive mapping of the headers, which can be accessed via `.sensitive()`. 
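# [Example] The ConEmu OSC 9;4 sequence produced by get_ansi_escape above
# (importing from the private yt_dlp.utils._utils path added in this diff):
from yt_dlp.utils._utils import _ProgressState

assert _ProgressState.VISIBLE.get_ansi_escape(percent=42) == '\x1b]9;4;1;42\x07'
assert _ProgressState.HIDDEN.get_ansi_escape() == '\x1b]9;4;0;0\x07'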
""" + def __new__(cls, *args: typing.Any, **kwargs: typing.Any) -> typing.Self: + obj = dict.__new__(cls, *args, **kwargs) + obj.__sensitive_map = {} + return obj - def __init__(self, *args, **kwargs): + def __init__(self, /, *args, **kwargs): super().__init__() - for dct in args: - if dct is not None: - self.update(dct) - self.update(kwargs) + self.__sensitive_map = {} - def __setitem__(self, key, value): - if isinstance(value, bytes): - value = value.decode('latin-1') - super().__setitem__(key.title(), str(value).strip()) + for dct in filter(None, args): + self.update(dct) + if kwargs: + self.update(kwargs) - def __getitem__(self, key): + def sensitive(self, /) -> dict[str, str]: + return { + self.__sensitive_map[key]: value + for key, value in self.items() + } + + def __contains__(self, key: str, /) -> bool: + return super().__contains__(key.title() if isinstance(key, str) else key) + + def __delitem__(self, key: str, /) -> None: + key = key.title() + del self.__sensitive_map[key] + super().__delitem__(key) + + def __getitem__(self, key, /) -> str: return super().__getitem__(key.title()) - def __delitem__(self, key): - super().__delitem__(key.title()) + def __ior__(self, other, /): + if isinstance(other, type(self)): + other = other.sensitive() + if isinstance(other, dict): + self.update(other) + return + return NotImplemented - def __contains__(self, key): - return super().__contains__(key.title() if isinstance(key, str) else key) + def __or__(self, other, /) -> typing.Self: + if isinstance(other, type(self)): + other = other.sensitive() + if isinstance(other, dict): + return type(self)(self.sensitive(), other) + return NotImplemented + + def __ror__(self, other, /) -> typing.Self: + if isinstance(other, type(self)): + other = other.sensitive() + if isinstance(other, dict): + return type(self)(other, self.sensitive()) + return NotImplemented + + def __setitem__(self, key: str, value, /) -> None: + if isinstance(value, bytes): + value = value.decode('latin-1') + key_title = key.title() + self.__sensitive_map[key_title] = key + super().__setitem__(key_title, str(value).strip()) + + def clear(self, /) -> None: + self.__sensitive_map.clear() + super().clear() + + def copy(self, /) -> typing.Self: + return type(self)(self.sensitive()) + + @typing.overload + def get(self, key: str, /) -> str | None: ... + + @typing.overload + def get(self, key: str, /, default: T) -> str | T: ... + + def get(self, key, /, default=NO_DEFAULT): + key = key.title() + if default is NO_DEFAULT: + return super().get(key) + return super().get(key, default) + + @typing.overload + def pop(self, key: str, /) -> str: ... + + @typing.overload + def pop(self, key: str, /, default: T) -> str | T: ... + + def pop(self, key, /, default=NO_DEFAULT): + key = key.title() + if default is NO_DEFAULT: + self.__sensitive_map.pop(key) + return super().pop(key) + self.__sensitive_map.pop(key, default) + return super().pop(key, default) + + def popitem(self) -> tuple[str, str]: + self.__sensitive_map.popitem() + return super().popitem() + + @typing.overload + def setdefault(self, key: str, /) -> str: ... + + @typing.overload + def setdefault(self, key: str, /, default) -> str: ... 
+ + def setdefault(self, key, /, default=None) -> str: + key = key.title() + if key in self.__sensitive_map: + return super().__getitem__(key) + + self[key] = default or '' + return self[key] + + def update(self, other, /, **kwargs) -> None: + if isinstance(other, type(self)): + other = other.sensitive() + if isinstance(other, collections.abc.Mapping): + for key, value in other.items(): + self[key] = value + + elif hasattr(other, 'keys'): + for key in other.keys(): # noqa: SIM118 + self[key] = other[key] + + else: + for key, value in other: + self[key] = value + + for key, value in kwargs.items(): + self[key] = value std_headers = HTTPHeaderDict({ diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 4f7a2ec90..e8b2bf170 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.01.26' +__version__ = '2025.04.30' -RELEASE_GIT_HEAD = '3b4531934465580be22937fecbb6e1a3a9e2334f' +RELEASE_GIT_HEAD = '505b400795af557bdcfd9d4fa7e9133b26ef431c' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.01.26' +_pkg_version = '2025.04.30'
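# [Example] End-to-end behaviour of the reworked HTTPHeaderDict: lookups remain
# case-insensitive, while .sensitive() recovers the original casing that
# keep_header_casing ultimately sends on the wire:
from yt_dlp.utils.networking import HTTPHeaderDict

headers = HTTPHeaderDict({'x-cUstom-ID': 'abc'})
assert headers['x-custom-id'] == 'abc'                  # any casing resolves
assert headers.sensitive() == {'x-cUstom-ID': 'abc'}    # original casing retained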