1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-08-13 16:08:29 +00:00

Fix support for NHK for School videos

This commit is contained in:
Jorge Arriagada 2025-08-07 16:59:39 -04:00
parent f799a4b472
commit 6f44c90d74

View File

@ -450,19 +450,28 @@ def entries():
class NhkForSchoolBangumiIE(InfoExtractor):
_VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
def _decode_unicode_escapes(self, text):
"""Decode %uXXXX Unicode escape sequences"""
if not text:
return text
# Convert %uXXXX to proper Unicode characters
def decode_match(match):
hex_code = match.group(1)
return chr(int(hex_code, 16))
return re.sub(r'%u([0-9A-Fa-f]{4})', decode_match, text)
_VALID_URL = r'https?://www2\.nhk\.or\.jp/school/watch/(?P<type>bangumi|clip)/\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
_TESTS = [{
'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
'url': 'https://www2.nhk.or.jp/school/watch/bangumi/?das_id=D0005110301_00000',
'info_dict': {
'id': 'D0005150191_00003',
'title': 'にている かな',
'duration': 599.999,
'timestamp': 1396414800,
'upload_date': '20140402',
'id': 'D0005110301_00002',
'title': '考えるカラス~科学の考え方~ - ',
'duration': 600,
'ext': 'mp4',
'chapters': 'count:12',
'chapters': 'count:7',
'series': '考えるカラス~科学の考え方~',
'episode': '',
},
'params': {
# m3u8 download
@ -474,14 +483,28 @@ def _real_extract(self, url):
program_type, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(
f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)
f'https://www2.nhk.or.jp/school/watch/{program_type}/?das_id={video_id}', video_id)
# searches all variables
base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
# and programObj values too
program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
# extract all chapters
# searches all variables (both old var format and new let format)
base_values = {g.group(1): g.group(2) for g in re.finditer(r'(?:var|let)\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
# and programObj values in modern object format
program_values = {}
program_obj_match = re.search(r'let\s+programObj\s*=\s*\{([^}]+)\};', webpage)
if program_obj_match:
obj_content = program_obj_match.group(1)
for prop_match in re.finditer(r'([a-zA-Z_]+):\s*"([^"]*)"', obj_content):
program_values[prop_match.group(1)] = prop_match.group(2)
# fallback to old format
if not program_values:
program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
# extract all chapters (both old and new formats)
chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
# new format: let chapterTime =["0","86.186","144.811",...]
if not chapter_durations:
chapter_time_match = re.search(r'let\s+chapterTime\s*=\s*\[([^\]]+)\];', webpage)
if chapter_time_match:
chapter_values = chapter_time_match.group(1)
chapter_durations = [float(match.group(1)) for match in re.finditer(r'"([^"]+)"', chapter_values)]
chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]
# this is how player_core.js actually does it (!)
@ -490,26 +513,70 @@ def _real_extract(self, url):
video_id = f'{video_id.split("_")[0]}_{version}'
formats = self._extract_m3u8_formats(
f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
f'https://vod-stream.nhk.jp/das/{video_id[0:8]}/{video_id}_V_000/index.m3u8',
video_id, ext='mp4', m3u8_id='hls')
duration = parse_duration(base_values.get('r_duration'))
# Handle duration from either source
duration_str = base_values.get('r_duration') or program_values.get('duration')
if duration_str and ':' in duration_str:
# Handle format like '00:10:00:0' which is HH:MM:SS:frame, not standard HH:MM:SS
parts = duration_str.split(':')
if len(parts) == 4: # HH:MM:SS:frame format
hours, minutes, seconds, _ = parts
duration = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
else:
duration = parse_duration(duration_str)
else:
duration = parse_duration(duration_str)
chapters = None
if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
start_time = chapter_durations
end_time = [*chapter_durations[1:], duration]
chapters = [{
'start_time': s,
'end_time': e,
'title': t,
} for s, e, t in zip(start_time, end_time, chapter_titles)]
if chapter_durations and chapter_titles:
# chapter_durations includes start (0.0) + chapter breaks, but we only have titles for actual chapters
if len(chapter_durations) == len(chapter_titles) + 1:
# Standard case: we have start + N chapter breaks, and N chapter titles
start_time = chapter_durations[:-1] # All but the last
end_time = chapter_durations[1:] # All but the first
if duration and end_time:
end_time[-1] = duration # Replace last end time with total duration
chapters = [{
'start_time': s,
'end_time': e,
'title': t,
} for s, e, t in zip(start_time, end_time, chapter_titles)]
elif len(chapter_durations) == len(chapter_titles):
# Equal case: same number of durations and titles
start_time = chapter_durations
end_time = [*chapter_durations[1:], duration]
chapters = [{
'start_time': s,
'end_time': e,
'title': t,
} for s, e, t in zip(start_time, end_time, chapter_titles)]
# Extract series title from HTML if available
series_title = self._html_search_regex(
r'<div class="onair">([^<]+)</div>', webpage, 'series title', fatal=False)
# Try to get episode title from multiple sources
episode_title = (
self._decode_unicode_escapes(program_values.get('name'))
or self._html_search_regex(r'<div class="title">([^<]+)</div>', webpage, 'episode title', fatal=False)
or self._html_search_regex(r'<title>([^|]+)', webpage, 'page title', fatal=False)
)
# Combine series and episode titles
if series_title and episode_title:
title = f'{series_title} - {episode_title}'
else:
title = episode_title or series_title or 'Unknown'
return {
'id': video_id,
'title': program_values.get('name'),
'duration': parse_duration(base_values.get('r_duration')),
'timestamp': unified_timestamp(base_values['r_upload']),
'title': title,
'series': series_title,
'episode': episode_title,
'duration': duration,
'timestamp': unified_timestamp(base_values.get('r_upload')),
'formats': formats,
'chapters': chapters,
}