From 6b291b52640b4523c63ed9762a12303fe852cbf4 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sat, 24 May 2025 00:05:49 +0900 Subject: [PATCH 1/2] [ie/tfo] Rework extractor --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/tfo.py | 268 +++++++++++++++++++++++++++----- 2 files changed, 237 insertions(+), 36 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 14a0068934..7ff6628f15 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2096,7 +2096,10 @@ ) from .testurl import TestURLIE from .tf1 import TF1IE -from .tfo import TFOIE +from .tfo import ( + TFOIE, + TFOSeriesIE, +) from .theguardian import ( TheGuardianPodcastIE, TheGuardianPodcastPlaylistIE, diff --git a/yt_dlp/extractor/tfo.py b/yt_dlp/extractor/tfo.py index 0d1b252175..b4d8b50312 100644 --- a/yt_dlp/extractor/tfo.py +++ b/yt_dlp/extractor/tfo.py @@ -1,48 +1,246 @@ -import json +import urllib.parse from .common import InfoExtractor -from ..networking import HEADRequest -from ..utils import ExtractorError, clean_html, int_or_none +from .uplynk import UplynkBaseIE +from ..utils import ( + ExtractorError, + clean_html, + int_or_none, + parse_iso8601, + str_or_none, + url_or_none, +) +from ..utils.traversal import traverse_obj -class TFOIE(InfoExtractor): - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P\d+)' - _TEST = { - 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', - 'md5': 'cafbe4f47a8dae0ca0159937878100d6', +class TFOIE(UplynkBaseIE): + IE_NAME = 'tfo' + IE_DESC = 'Télévision française de l\'Ontario' + + _BASE_URL = 'https://www.tfo.org' + _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:episode|film|regarder|titre)(?:/[\w-]+)+/(?P(?:GP)?\d{6})' + _TESTS = [{ + 'url': 'https://www.tfo.org/regarder/bardot-la-meprise/GP701766', 'info_dict': { - 'id': '7da3d50e495c406b8fc0b997659cc075', + 'id': '02a8e473f171403184a0a8cddc36845a', 'ext': 'mp4', - 'title': 'Video Game Hackathon', - 'description': 'md5:558afeba217c6c8d96c60e5421795c07', + 'title': 'Bardot, la Méprise', + 'age_limit': 13, + 'alt_title': 'bardot-la-meprise', + 'description': 'md5:16ca832101b6c3838bb61cd8fa06aa9e', + 'display_id': 'GP701766', + 'duration': 3134.8480000000022, + 'genres': ['Biographie et portraits'], + 'release_timestamp': 1747875610, + 'release_date': '20250522', + 'release_year': 2013, + 'series': 'Bardot, la Méprise', + 'tags': ['13+'], + 'thumbnail': r're:https?://.+\.jpg', + 'uploader_id': '872295f75a144bcf880cf68f4ad35db1', }, - } + 'skip': True, + }, { + 'url': 'https://www.tfo.org/regarder/pouletosaure-rex-partie-1-2/GP639511', + 'info_dict': { + 'id': 'e19eea3f3f604cd79a34293906cc5147', + 'ext': 'mp4', + 'title': 'Pouletosaure Rex - Partie 1 & 2', + 'age_limit': 6, + 'alt_title': 'pouletosaure-rex-partie-1-2', + 'description': 'md5:24e1b629fab54d537eb40a0ef6630afa', + 'display_id': 'GP639511', + 'duration': 1321.216000000001, + 'episode': 'Pouletosaure Rex - Partie 1 & 2', + 'episode_id': 'episode-1', + 'episode_number': 1, + 'genres': ['6 à 9 ans'], + 'release_date': '20250406', + 'release_timestamp': 1743912000, + 'release_year': 2025, + 'season': 'Saison 1', + 'season_id': 'saison-1', + 'season_number': 1, + 'series': 'Dino Dex', + 'series_id': '003051136', + 'tags': ['G'], + 'thumbnail': r're:https?://.+\.jpg', + 'uploader_id': '872295f75a144bcf880cf68f4ad35db1', + }, + 'skip': True, + }, { + 'url': 'https://www.tfo.org/episode/passeport-pour-le-monde/saison-2/episode-1/vietnam-dans-loeil-du-dragon/GP938523', + 'info_dict': { + 'id': 'aeaf612919794960b3bfb1a0c45e70bd', + 'ext': 'mp4', + 'title': 'VIETNAM : Dans l\'oeil du dragon', + 'age_limit': 18, + 'alt_title': 'vietnam-dans-loeil-du-dragon', + 'description': 'md5:ca182241d021ba832680ccbc09dc70fd', + 'display_id': 'GP938523', + 'duration': 3120.0000000000023, + 'episode': 'VIETNAM : Dans l\'oeil du dragon', + 'episode_id': 'episode-1', + 'episode_number': 1, + 'genres': ['Voyage et découverte'], + 'release_date': '20250331', + 'release_timestamp': 1743393600, + 'release_year': 2025, + 'season': 'Saison 2', + 'season_id': 'saison-2', + 'season_number': 2, + 'series': 'Passeport pour le monde', + 'series_id': '002968508', + 'tags': ['G'], + 'thumbnail': r're:https?://.+\.jpg', + 'uploader_id': '872295f75a144bcf880cf68f4ad35db1', + }, + 'skip': True, + }, { + 'url': 'https://www.tfo.org/titre/entre-les-lignes/GP704192', + 'info_dict': { + 'id': '160c720e2dea43eba1171a3e4fdf2042', + 'ext': 'mp4', + 'title': 'Entre les lignes', + 'age_limit': 0, + 'alt_title': 'entre-les-lignes', + 'display_id': 'GP704192', + 'duration': 2042.8800000000015, + 'genres': ['Société'], + 'release_date': '20231105', + 'release_timestamp': 1699146000, + 'release_year': 2008, + 'series': 'Entre les lignes', + 'tags': ['G'], + 'thumbnail': r're:https?://.+\.jpg', + 'uploader_id': '872295f75a144bcf880cf68f4ad35db1', + }, + 'skip': True, + }, { + 'url': 'https://www.tfo.org/film/pouic-pouic/498034', + 'info_dict': { + 'id': 'e942d3bf41fa437380d5a1529c049ee8', + 'ext': 'mp4', + 'title': 'Pouic-Pouic', + 'age_limit': 0, + 'alt_title': 'pouic-pouic', + 'description': 'md5:ec68140f0050fc854def36643058a9fe', + 'display_id': '498034', + 'duration': 5219.3279999998795, + 'genres': ['Comédie', 'Satirique'], + 'release_date': '20250516', + 'release_timestamp': 1747357215, + 'release_year': 1963, + 'series': 'Pouic-Pouic', + 'tags': ['G'], + 'thumbnail': r're:https?://.+\.jpg', + 'uploader_id': '872295f75a144bcf880cf68f4ad35db1', + }, + 'skip': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - self._request_webpage(HEADRequest('http://www.tfo.org/'), video_id) - infos = self._download_json( - 'http://www.tfo.org/api/web/video/get_infos', video_id, data=json.dumps({ - 'product_id': video_id, - }).encode(), headers={ - 'X-tfo-session': self._get_cookies('http://www.tfo.org/')['tfo-session'].value, - }) - if infos.get('success') == 0: - if infos.get('code') == 'ErrGeoBlocked': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - raise ExtractorError('{} said: {}'.format(self.IE_NAME, clean_html(infos['msg'])), expected=True) - video_data = infos['data'] + slug = urllib.parse.urlparse(url).path.rstrip('/').split('/')[-2] + webpage = self._download_webpage( + f'{self._BASE_URL}/regarder/{slug}/{video_id}', video_id) + + next_data = self._search_nextjs_data(webpage, video_id) + page_props = next_data['props']['pageProps'] + season_id = traverse_obj(page_props, ('seasonId', {str_or_none})) + + build_id, locale = traverse_obj(next_data, (('buildId', 'locale'), {str}, all)) + path = urllib.parse.urlparse(self._og_search_url(webpage)).path + + video_data = self._download_json( + f'{self._BASE_URL}/_next/data/{build_id}/{locale}{path}.json', + video_id, expected_status=404) + + if not (product := traverse_obj(video_data, ( + 'pageProps', 'product', {dict}), default={}, + )): + raise ExtractorError( + 'Failed to fetch video information, try again', expected=True) return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'limelight:media:' + video_data['llid'], - 'title': video_data['title'], - 'description': video_data.get('description'), - 'series': video_data.get('collection'), - 'season_number': int_or_none(video_data.get('season')), - 'episode_number': int_or_none(video_data.get('episode')), - 'duration': int_or_none(video_data.get('duration')), - 'ie_key': 'LimelightMedia', + 'display_id': video_id, + **self._extract_uplynk_info(traverse_obj(page_props, ( + 'metadata', 'video', {url_or_none}, + ))), + **traverse_obj(product, { + 'title': ('name', {str}), + 'age_limit': ('ratingCode', {int_or_none}), + 'alt_title': ('slug', {str_or_none}), + 'description': ('longDescription', {clean_html}), + 'genres': ('genres', ..., {str}), + 'release_timestamp': ('begin', {parse_iso8601}), + 'release_year': ('productionYear', {int_or_none}), + 'series': ('name', {str}), + 'series_id': ('serieId', {str_or_none}), + 'tags': ('tags', ..., 'label', {str}), + 'thumbnail': ('bannerUrl', {url_or_none}), + }), + **traverse_obj(product, ( + 'seasons', ..., 'episodes', + lambda _, v: v.get('id') == video_id, any, { + 'title': ('name', {str}), + 'age_limit': ('ageRangeCode', {int_or_none}), + 'alt_title': ('slug', {str_or_none}), + 'description': ('description', {clean_html}), + 'episode': ('episodeName', {str}), + 'episode_id': ( + 'episodeNumber', {str_or_none}, + {lambda x: f'episode-{x}' if x else None}, + ), + 'episode_number': ('episodeNumber', {int_or_none}), + 'genres': ('genres', ..., {str}), + 'release_timestamp': ('begin', {parse_iso8601}), + 'tags': ('tags', ..., 'label', {str}), + 'thumbnail': ('imageUrl', {url_or_none}), + }, + )), + **traverse_obj(product, ( + 'seasons', lambda _, v: v.get('id') == season_id, any, { + 'season': ('slug', {str_or_none}, {lambda x: f'Saison {x}' if x else None}), + 'season_id': ('slug', {str_or_none}, {lambda x: f'saison-{x}' if x else None}), + 'season_number': ('seasonNumber', {int_or_none}), + }, + )), } + + +class TFOSeriesIE(InfoExtractor): + IE_NAME = 'tfo:series' + + _VALID_URL = r'https?://(?:www\.)?tfo\.org/serie/[\w-]+(?:/saison-(?P\d+))?/(?P\d{9})' + _TESTS = [{ + 'url': 'https://www.tfo.org/serie/super-mini-monstres/002748228', + 'info_dict': { + 'id': '002748228', + 'title': 'Super mini monstres', + }, + 'playlist_count': 44, + }, { + 'url': 'https://www.tfo.org/serie/chacun-son-ile/saison-2/002981471', + 'info_dict': { + 'id': '002981471', + 'title': 'Chacun son île | Saison 2', + }, + 'playlist_count': 7, + }] + + def _real_extract(self, url): + season, series_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, series_id) + json_ld = next(self._yield_json_ld(webpage, series_id)) + + entries = [ + self.url_result(x, TFOIE) + for x in traverse_obj(json_ld, ( + '@graph', ..., *(() if season else ('seasons', ...)), + 'episode', ..., 'url', {url_or_none}, + )) + ] + + return self.playlist_result( + entries, series_id, self._html_search_meta(['og:image:alt', 'twitter:image:alt'], webpage)) From c900038cfe0ac97c536d43e03bbc46dd56049d45 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Fri, 20 Jun 2025 00:23:52 +0900 Subject: [PATCH 2/2] update --- yt_dlp/extractor/tfo.py | 54 +++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/tfo.py b/yt_dlp/extractor/tfo.py index b4d8b50312..5341c76832 100644 --- a/yt_dlp/extractor/tfo.py +++ b/yt_dlp/extractor/tfo.py @@ -3,14 +3,13 @@ from .common import InfoExtractor from .uplynk import UplynkBaseIE from ..utils import ( - ExtractorError, clean_html, int_or_none, parse_iso8601, str_or_none, url_or_none, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import require, traverse_obj class TFOIE(UplynkBaseIE): @@ -22,13 +21,12 @@ class TFOIE(UplynkBaseIE): _TESTS = [{ 'url': 'https://www.tfo.org/regarder/bardot-la-meprise/GP701766', 'info_dict': { - 'id': '02a8e473f171403184a0a8cddc36845a', + 'id': 'GP701766', 'ext': 'mp4', 'title': 'Bardot, la Méprise', 'age_limit': 13, 'alt_title': 'bardot-la-meprise', 'description': 'md5:16ca832101b6c3838bb61cd8fa06aa9e', - 'display_id': 'GP701766', 'duration': 3134.8480000000022, 'genres': ['Biographie et portraits'], 'release_timestamp': 1747875610, @@ -43,13 +41,12 @@ class TFOIE(UplynkBaseIE): }, { 'url': 'https://www.tfo.org/regarder/pouletosaure-rex-partie-1-2/GP639511', 'info_dict': { - 'id': 'e19eea3f3f604cd79a34293906cc5147', + 'id': 'GP639511', 'ext': 'mp4', 'title': 'Pouletosaure Rex - Partie 1 & 2', 'age_limit': 6, 'alt_title': 'pouletosaure-rex-partie-1-2', 'description': 'md5:24e1b629fab54d537eb40a0ef6630afa', - 'display_id': 'GP639511', 'duration': 1321.216000000001, 'episode': 'Pouletosaure Rex - Partie 1 & 2', 'episode_id': 'episode-1', @@ -71,13 +68,12 @@ class TFOIE(UplynkBaseIE): }, { 'url': 'https://www.tfo.org/episode/passeport-pour-le-monde/saison-2/episode-1/vietnam-dans-loeil-du-dragon/GP938523', 'info_dict': { - 'id': 'aeaf612919794960b3bfb1a0c45e70bd', + 'id': 'GP938523', 'ext': 'mp4', 'title': 'VIETNAM : Dans l\'oeil du dragon', 'age_limit': 18, 'alt_title': 'vietnam-dans-loeil-du-dragon', 'description': 'md5:ca182241d021ba832680ccbc09dc70fd', - 'display_id': 'GP938523', 'duration': 3120.0000000000023, 'episode': 'VIETNAM : Dans l\'oeil du dragon', 'episode_id': 'episode-1', @@ -99,12 +95,11 @@ class TFOIE(UplynkBaseIE): }, { 'url': 'https://www.tfo.org/titre/entre-les-lignes/GP704192', 'info_dict': { - 'id': '160c720e2dea43eba1171a3e4fdf2042', + 'id': 'GP704192', 'ext': 'mp4', 'title': 'Entre les lignes', 'age_limit': 0, 'alt_title': 'entre-les-lignes', - 'display_id': 'GP704192', 'duration': 2042.8800000000015, 'genres': ['Société'], 'release_date': '20231105', @@ -117,22 +112,21 @@ class TFOIE(UplynkBaseIE): }, 'skip': True, }, { - 'url': 'https://www.tfo.org/film/pouic-pouic/498034', + 'url': 'https://www.tfo.org/film/le-chat/498047', 'info_dict': { - 'id': 'e942d3bf41fa437380d5a1529c049ee8', + 'id': '498047', 'ext': 'mp4', - 'title': 'Pouic-Pouic', - 'age_limit': 0, - 'alt_title': 'pouic-pouic', - 'description': 'md5:ec68140f0050fc854def36643058a9fe', - 'display_id': '498034', - 'duration': 5219.3279999998795, - 'genres': ['Comédie', 'Satirique'], - 'release_date': '20250516', - 'release_timestamp': 1747357215, - 'release_year': 1963, - 'series': 'Pouic-Pouic', - 'tags': ['G'], + 'title': 'Le Chat', + 'age_limit': 16, + 'alt_title': 'le-chat', + 'description': 'md5:1e19c39fff1a48e3875feb73a52146b7', + 'duration': 5257.7279999998755, + 'genres': ['Drame', 'Psychologique'], + 'release_date': '20250617', + 'release_timestamp': 1750122010, + 'release_year': 1971, + 'series': 'Le Chat', + 'tags': ['16+'], 'thumbnail': r're:https?://.+\.jpg', 'uploader_id': '872295f75a144bcf880cf68f4ad35db1', }, @@ -155,15 +149,10 @@ def _real_extract(self, url): video_data = self._download_json( f'{self._BASE_URL}/_next/data/{build_id}/{locale}{path}.json', video_id, expected_status=404) - - if not (product := traverse_obj(video_data, ( - 'pageProps', 'product', {dict}), default={}, - )): - raise ExtractorError( - 'Failed to fetch video information, try again', expected=True) + product = traverse_obj(video_data, ( + 'pageProps', 'product', {require('video information')})) return { - 'display_id': video_id, **self._extract_uplynk_info(traverse_obj(page_props, ( 'metadata', 'video', {url_or_none}, ))), @@ -206,6 +195,7 @@ def _real_extract(self, url): 'season_number': ('seasonNumber', {int_or_none}), }, )), + 'id': video_id, } @@ -226,7 +216,7 @@ class TFOSeriesIE(InfoExtractor): 'id': '002981471', 'title': 'Chacun son île | Saison 2', }, - 'playlist_count': 7, + 'playlist_mincount': 8, }] def _real_extract(self, url):