From 6f44c90d7415b366f22fe4c7233d1a7f77abad52 Mon Sep 17 00:00:00 2001
From: Jorge Arriagada
Date: Thu, 7 Aug 2025 16:59:39 -0400
Subject: [PATCH] Fix support for NHK for School videos

---
 yt_dlp/extractor/nhk.py | 125 ++++++++++++++++++++++++++++++----------
 1 file changed, 96 insertions(+), 29 deletions(-)

diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 14fbd6ce82..a4d798702f 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -450,19 +450,28 @@ def entries():
 
 
 class NhkForSchoolBangumiIE(InfoExtractor):
-    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
+
+    def _decode_unicode_escapes(self, text):
+        """Decode %uXXXX Unicode escape sequences"""
+        if not text:
+            return text
+        # Convert %uXXXX to proper Unicode characters
+
+        def decode_match(match):
+            hex_code = match.group(1)
+            return chr(int(hex_code, 16))
+        return re.sub(r'%u([0-9A-Fa-f]{4})', decode_match, text)
+    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/watch/(?P<type>bangumi|clip)/\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
     _TESTS = [{
-        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
+        'url': 'https://www2.nhk.or.jp/school/watch/bangumi/?das_id=D0005110301_00000',
         'info_dict': {
-            'id': 'D0005150191_00003',
-            'title': 'にている かな',
-            'duration': 599.999,
-            'timestamp': 1396414800,
-
-            'upload_date': '20140402',
+            'id': 'D0005110301_00002',
+            'title': '考えるカラス~科学の考え方~ - #1',
+            'duration': 600,
             'ext': 'mp4',
-
-            'chapters': 'count:12',
+            'chapters': 'count:7',
+            'series': '考えるカラス~科学の考え方~',
+            'episode': '#1',
         },
         'params': {
             # m3u8 download
@@ -474,14 +483,28 @@ def _real_extract(self, url):
         program_type, video_id = self._match_valid_url(url).groups()
 
         webpage = self._download_webpage(
-            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)
+            f'https://www2.nhk.or.jp/school/watch/{program_type}/?das_id={video_id}', video_id)
 
-        # searches all variables
-        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
-        # and programObj values too
-        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
-        # extract all chapters
+        # searches all variables (both old var format and new let format)
+        base_values = {g.group(1): g.group(2) for g in re.finditer(r'(?:var|let)\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
+        # and programObj values in modern object format
+        program_values = {}
+        program_obj_match = re.search(r'let\s+programObj\s*=\s*\{([^}]+)\};', webpage)
+        if program_obj_match:
+            obj_content = program_obj_match.group(1)
+            for prop_match in re.finditer(r'([a-zA-Z_]+):\s*"([^"]*)"', obj_content):
+                program_values[prop_match.group(1)] = prop_match.group(2)
+        # fallback to old format
+        if not program_values:
+            program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
+        # extract all chapters (both old and new formats)
         chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
+        # new format: let chapterTime = ["0","86.186","144.811",...]
+        if not chapter_durations:
+            chapter_time_match = re.search(r'let\s+chapterTime\s*=\s*\[([^\]]+)\];', webpage)
+            if chapter_time_match:
+                chapter_values = chapter_time_match.group(1)
+                chapter_durations = [float(match.group(1)) for match in re.finditer(r'"([^"]+)"', chapter_values)]
         chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]
 
         # this is how player_core.js is actually doing (!)
@@ -490,26 +513,70 @@ def _real_extract(self, url):
         video_id = f'{video_id.split("_")[0]}_{version}'
 
         formats = self._extract_m3u8_formats(
-            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
+            f'https://vod-stream.nhk.jp/das/{video_id[0:8]}/{video_id}_V_000/index.m3u8',
             video_id, ext='mp4', m3u8_id='hls')
 
-        duration = parse_duration(base_values.get('r_duration'))
+        # Handle duration from either source
+        duration_str = base_values.get('r_duration') or program_values.get('duration')
+        if duration_str and ':' in duration_str:
+            # Handle format like '00:10:00:0' which is HH:MM:SS:frame, not standard HH:MM:SS
+            parts = duration_str.split(':')
+            if len(parts) == 4:  # HH:MM:SS:frame format
+                hours, minutes, seconds, _ = parts
+                duration = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
+            else:
+                duration = parse_duration(duration_str)
+        else:
+            duration = parse_duration(duration_str)
 
         chapters = None
-        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
-            start_time = chapter_durations
-            end_time = [*chapter_durations[1:], duration]
-            chapters = [{
-                'start_time': s,
-                'end_time': e,
-                'title': t,
-            } for s, e, t in zip(start_time, end_time, chapter_titles)]
+        if chapter_durations and chapter_titles:
+            # chapter_durations includes start (0.0) + chapter breaks, but we only have titles for actual chapters
+            if len(chapter_durations) == len(chapter_titles) + 1:
+                # Standard case: we have start + N chapter breaks, and N chapter titles
+                start_time = chapter_durations[:-1]  # All but the last
+                end_time = chapter_durations[1:]  # All but the first
+                if duration and end_time:
+                    end_time[-1] = duration  # Replace last end time with total duration
+                chapters = [{
+                    'start_time': s,
+                    'end_time': e,
+                    'title': t,
+                } for s, e, t in zip(start_time, end_time, chapter_titles)]
+            elif len(chapter_durations) == len(chapter_titles):
+                # Equal case: same number of durations and titles
+                start_time = chapter_durations
+                end_time = [*chapter_durations[1:], duration]
+                chapters = [{
+                    'start_time': s,
+                    'end_time': e,
+                    'title': t,
+                } for s, e, t in zip(start_time, end_time, chapter_titles)]
+
+        # Extract series title from HTML if available
+        series_title = self._html_search_regex(
+            r'<h1[^>]*>([^<]+)</h1>', webpage, 'series title', fatal=False)
+
+        # Try to get episode title from multiple sources
+        episode_title = (
+            self._decode_unicode_escapes(program_values.get('name'))
+            or self._html_search_regex(r'<h2[^>]*>([^<]+)</h2>', webpage, 'episode title', fatal=False)
+            or self._html_search_regex(r'<title>([^|]+)', webpage, 'page title', fatal=False)
+        )
+
+        # Combine series and episode titles
+        if series_title and episode_title:
+            title = f'{series_title} - {episode_title}'
+        else:
+            title = episode_title or series_title or 'Unknown'
 
         return {
             'id': video_id,
-            'title': program_values.get('name'),
-            'duration': parse_duration(base_values.get('r_duration')),
-            'timestamp': unified_timestamp(base_values['r_upload']),
+            'title': title,
+            'series': series_title,
+            'episode': episode_title,
+            'duration': duration,
+            'timestamp': unified_timestamp(base_values.get('r_upload')),
             'formats': formats,
             'chapters': chapters,
         }
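
The new watch pages embed programObj string values with %uXXXX escape sequences, which is what the added _decode_unicode_escapes() helper unwinds before the name is used as the episode title. A minimal standalone sketch of the same decoding logic (the sample input below is illustrative, not taken from a real page):

    import re

    def decode_unicode_escapes(text):
        # Replace each %uXXXX escape with the corresponding Unicode character
        return re.sub(r'%u([0-9A-Fa-f]{4})', lambda m: chr(int(m.group(1), 16)), text or '')

    # '%u8003%u3048%u308B%u30AB%u30E9%u30B9' decodes to '考えるカラス'
    print(decode_unicode_escapes('%u8003%u3048%u308B%u30AB%u30E9%u30B9'))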
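
The r_duration value on these pages can look like '00:10:00:0', i.e. HH:MM:SS:frame rather than plain HH:MM:SS, which is why the patch special-cases the four-part form instead of handing it straight to parse_duration(). A small sketch of that conversion, assuming the same input shape:

    def parse_hhmmssff(duration_str):
        # '00:10:00:0' is HH:MM:SS:frame; ignore the trailing frame count
        parts = (duration_str or '').split(':')
        if len(parts) != 4:
            return None  # caller falls back to yt_dlp.utils.parse_duration
        hours, minutes, seconds, _frame = parts
        return int(hours) * 3600 + int(minutes) * 60 + int(seconds)

    print(parse_hhmmssff('00:10:00:0'))  # 600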
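
With the new let chapterTime = ["0","86.186",...] array, the break list carries the leading 0 as well as the chapter breaks, so it has one more entry than there are chapter titles; that is the case the len(chapter_durations) == len(chapter_titles) + 1 branch covers. A toy illustration of how the start/end pairs are then assembled (the numbers are made up):

    chapter_durations = [0.0, 86.186, 144.811, 480.5]  # leading 0 + chapter breaks (made up)
    chapter_titles = ['scene 1', 'scene 2', 'scene 3']
    duration = 600  # total program length in seconds

    start_time = chapter_durations[:-1]
    end_time = chapter_durations[1:]
    end_time[-1] = duration  # clamp the last chapter to the full duration
    chapters = [{'start_time': s, 'end_time': e, 'title': t}
                for s, e, t in zip(start_time, end_time, chapter_titles)]
    print(chapters)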
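
With the patch applied, the reworked extractor can be exercised against the new URL scheme without downloading anything; a quick manual check along these lines (the das_id is the one used in the updated test case):

    from yt_dlp import YoutubeDL

    url = 'https://www2.nhk.or.jp/school/watch/bangumi/?das_id=D0005110301_00000'
    with YoutubeDL({'skip_download': True}) as ydl:
        info = ydl.extract_info(url, download=False)
        print(info['id'], info['title'], info.get('duration'), len(info.get('chapters') or []))

The updated test itself runs through the project's usual extractor test entry point (e.g. python test/test_download.py TestDownload.test_NhkForSchoolBangumi).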