mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-08-13 16:08:29 +00:00
Merge 6f44c90d74
into f2919bd28e
This commit is contained in:
commit
838e4255ab
@ -450,19 +450,28 @@ def entries():
|
||||
|
||||
|
||||
class NhkForSchoolBangumiIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
|
||||
|
||||
def _decode_unicode_escapes(self, text):
    """Decode ``%uXXXX`` Unicode escape sequences in *text*.

    ``%uXXXX`` (exactly four hex digits) is the non-standard escape format
    emitted by JavaScript's legacy ``escape()``; it encodes UTF-16 code
    units, so characters outside the Basic Multilingual Plane arrive as a
    high surrogate followed by a low surrogate.

    - A high/low surrogate pair is combined into the single character it
      encodes (otherwise the result would hold two lone surrogates that
      cannot be encoded as UTF-8).
    - Any other ``%uXXXX`` sequence, including an unpaired surrogate, is
      replaced by ``chr(0xXXXX)``.
    - Everything else, including plain ``%XX`` escapes, is left untouched.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not text:
        return text

    def decode_match(match):
        high, low, single = match.groups()
        if single is not None:
            return chr(int(single, 16))
        # UTF-16 surrogate pair -> one supplementary-plane code point
        return chr(0x10000 + ((int(high, 16) - 0xD800) << 10) + (int(low, 16) - 0xDC00))

    # The pair alternative must come first so that a high surrogate is not
    # consumed on its own by the generic four-digit alternative.
    return re.sub(
        r'%u([Dd][89ABab][0-9A-Fa-f]{2})%u([Dd][C-Fc-f][0-9A-Fa-f]{2})'
        r'|%u([0-9A-Fa-f]{4})',
        decode_match, text)
|
||||
_VALID_URL = r'https?://www2\.nhk\.or\.jp/school/watch/(?P<type>bangumi|clip)/\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
|
||||
_TESTS = [{
|
||||
'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
|
||||
'url': 'https://www2.nhk.or.jp/school/watch/bangumi/?das_id=D0005110301_00000',
|
||||
'info_dict': {
|
||||
'id': 'D0005150191_00003',
|
||||
'title': 'にている かな',
|
||||
'duration': 599.999,
|
||||
'timestamp': 1396414800,
|
||||
|
||||
'upload_date': '20140402',
|
||||
'id': 'D0005110301_00002',
|
||||
'title': '考えるカラス~科学の考え方~ - #1',
|
||||
'duration': 600,
|
||||
'ext': 'mp4',
|
||||
|
||||
'chapters': 'count:12',
|
||||
'chapters': 'count:7',
|
||||
'series': '考えるカラス~科学の考え方~',
|
||||
'episode': '#1',
|
||||
},
|
||||
'params': {
|
||||
# m3u8 download
|
||||
@ -474,14 +483,28 @@ def _real_extract(self, url):
|
||||
program_type, video_id = self._match_valid_url(url).groups()
|
||||
|
||||
webpage = self._download_webpage(
|
||||
f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)
|
||||
f'https://www2.nhk.or.jp/school/watch/{program_type}/?das_id={video_id}', video_id)
|
||||
|
||||
# searches all variables
|
||||
base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
|
||||
# and programObj values too
|
||||
program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
|
||||
# extract all chapters
|
||||
# searches all variables (both old var format and new let format)
|
||||
base_values = {g.group(1): g.group(2) for g in re.finditer(r'(?:var|let)\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
|
||||
# and programObj values in modern object format
|
||||
program_values = {}
|
||||
program_obj_match = re.search(r'let\s+programObj\s*=\s*\{([^}]+)\};', webpage)
|
||||
if program_obj_match:
|
||||
obj_content = program_obj_match.group(1)
|
||||
for prop_match in re.finditer(r'([a-zA-Z_]+):\s*"([^"]*)"', obj_content):
|
||||
program_values[prop_match.group(1)] = prop_match.group(2)
|
||||
# fallback to old format
|
||||
if not program_values:
|
||||
program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
|
||||
# extract all chapters (both old and new formats)
|
||||
chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
|
||||
# new format: let chapterTime =["0","86.186","144.811",...]
|
||||
if not chapter_durations:
|
||||
chapter_time_match = re.search(r'let\s+chapterTime\s*=\s*\[([^\]]+)\];', webpage)
|
||||
if chapter_time_match:
|
||||
chapter_values = chapter_time_match.group(1)
|
||||
chapter_durations = [float(match.group(1)) for match in re.finditer(r'"([^"]+)"', chapter_values)]
|
||||
chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]
|
||||
|
||||
# this is how player_core.js is actually doing (!)
|
||||
@ -490,26 +513,70 @@ def _real_extract(self, url):
|
||||
video_id = f'{video_id.split("_")[0]}_{version}'
|
||||
|
||||
formats = self._extract_m3u8_formats(
|
||||
f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
|
||||
f'https://vod-stream.nhk.jp/das/{video_id[0:8]}/{video_id}_V_000/index.m3u8',
|
||||
video_id, ext='mp4', m3u8_id='hls')
|
||||
|
||||
duration = parse_duration(base_values.get('r_duration'))
|
||||
# Handle duration from either source
|
||||
duration_str = base_values.get('r_duration') or program_values.get('duration')
|
||||
if duration_str and ':' in duration_str:
|
||||
# Handle format like '00:10:00:0' which is HH:MM:SS:frame, not standard HH:MM:SS
|
||||
parts = duration_str.split(':')
|
||||
if len(parts) == 4: # HH:MM:SS:frame format
|
||||
hours, minutes, seconds, _ = parts
|
||||
duration = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
|
||||
else:
|
||||
duration = parse_duration(duration_str)
|
||||
else:
|
||||
duration = parse_duration(duration_str)
|
||||
|
||||
chapters = None
|
||||
if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
|
||||
start_time = chapter_durations
|
||||
end_time = [*chapter_durations[1:], duration]
|
||||
chapters = [{
|
||||
'start_time': s,
|
||||
'end_time': e,
|
||||
'title': t,
|
||||
} for s, e, t in zip(start_time, end_time, chapter_titles)]
|
||||
if chapter_durations and chapter_titles:
|
||||
# chapter_durations includes start (0.0) + chapter breaks, but we only have titles for actual chapters
|
||||
if len(chapter_durations) == len(chapter_titles) + 1:
|
||||
# Standard case: we have start + N chapter breaks, and N chapter titles
|
||||
start_time = chapter_durations[:-1] # All but the last
|
||||
end_time = chapter_durations[1:] # All but the first
|
||||
if duration and end_time:
|
||||
end_time[-1] = duration # Replace last end time with total duration
|
||||
chapters = [{
|
||||
'start_time': s,
|
||||
'end_time': e,
|
||||
'title': t,
|
||||
} for s, e, t in zip(start_time, end_time, chapter_titles)]
|
||||
elif len(chapter_durations) == len(chapter_titles):
|
||||
# Equal case: same number of durations and titles
|
||||
start_time = chapter_durations
|
||||
end_time = [*chapter_durations[1:], duration]
|
||||
chapters = [{
|
||||
'start_time': s,
|
||||
'end_time': e,
|
||||
'title': t,
|
||||
} for s, e, t in zip(start_time, end_time, chapter_titles)]
|
||||
|
||||
# Extract series title from HTML if available
|
||||
series_title = self._html_search_regex(
|
||||
r'<div class="onair">([^<]+)</div>', webpage, 'series title', fatal=False)
|
||||
|
||||
# Try to get episode title from multiple sources
|
||||
episode_title = (
|
||||
self._decode_unicode_escapes(program_values.get('name'))
|
||||
or self._html_search_regex(r'<div class="title">([^<]+)</div>', webpage, 'episode title', fatal=False)
|
||||
or self._html_search_regex(r'<title>([^|]+)', webpage, 'page title', fatal=False)
|
||||
)
|
||||
|
||||
# Combine series and episode titles
|
||||
if series_title and episode_title:
|
||||
title = f'{series_title} - {episode_title}'
|
||||
else:
|
||||
title = episode_title or series_title or 'Unknown'
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': program_values.get('name'),
|
||||
'duration': parse_duration(base_values.get('r_duration')),
|
||||
'timestamp': unified_timestamp(base_values['r_upload']),
|
||||
'title': title,
|
||||
'series': series_title,
|
||||
'episode': episode_title,
|
||||
'duration': duration,
|
||||
'timestamp': unified_timestamp(base_values.get('r_upload')),
|
||||
'formats': formats,
|
||||
'chapters': chapters,
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user