mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-07-18 03:08:31 +00:00
Merge 62305bbc02
into c1ac543c81
This commit is contained in:
commit
8b734ec622
@ -1,8 +1,11 @@
|
|||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
|
ExtractorError,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
|
js_to_json,
|
||||||
make_archive_id,
|
make_archive_id,
|
||||||
parse_iso8601,
|
parse_iso8601,
|
||||||
str_or_none,
|
str_or_none,
|
||||||
@ -13,9 +16,103 @@
|
|||||||
|
|
||||||
|
|
||||||
class KhanAcademyBaseIE(InfoExtractor):
|
class KhanAcademyBaseIE(InfoExtractor):
|
||||||
|
_RUNTIME_JS_URL = None
|
||||||
|
_MAIN_JS_URL = None
|
||||||
_VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
|
_VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
|
||||||
|
|
||||||
_PUBLISHED_CONTENT_VERSION = 'dc34750f0572c80f5effe7134082fe351143c1e4'
|
def _load_script_src_urls(self, webpage):
    """Find the hashed bundle URLs in the page HTML and remember them.

    Looks for the `runtime` and `khanacademy` <script> tags served from
    cdn.kastatic.org and stores their URLs on the instance as
    `_RUNTIME_JS_URL` and `_MAIN_JS_URL` respectively.

    @param webpage  HTML source of a Khan Academy page
    """
    def find_bundle_url(bundle):
        # The bundle name doubles as the field name reported by _search_regex
        pattern = rf'<script src="(https://cdn\.kastatic\.org/khanacademy/{bundle}\.[0-9a-f]+\.js)">'
        return self._search_regex(pattern, webpage, bundle)

    self._RUNTIME_JS_URL = find_bundle_url('runtime')
    self._MAIN_JS_URL = find_bundle_url('khanacademy')
|
||||||
|
|
||||||
|
def _extract_graphql(self, query_name):
    """Search the lazy-loaded JS chunks for the one that defines `query_name`.

    Requires `self._RUNTIME_JS_URL` and `self._MAIN_JS_URL` to be set
    (done by `_load_script_src_urls`).

    @param query_name  name of the GraphQL query, e.g. 'ContentForPath'
    @returns           the mapping built by `_parse_graphql_js` from the first
                       chunk whose source contains `query <query_name>`
    @raises ExtractorError  if no chunk contains the query
    """
    # runtime.js contains hash version for each js file, which is needed for building js src url
    runtime_js = self._download_webpage(self._RUNTIME_JS_URL, None, 'Downloading runtime.js')
    version_hashes = self._search_json(
        r'return\s*""\+e\+"\."\+\(', runtime_js, 'js resources', None, end_pattern=r'\)\[e\]\+"\.js"',
        transform_source=lambda s: re.sub(r'([\da-f]+):', r'"\1":', s))  # cannot use js_to_json, due to #13621

    # iterate all lazy-loaded js to find query-containing js file
    main_js = self._download_webpage(self._MAIN_JS_URL, None, 'Downloading khanacademy.js')
    needle = f'query {query_name}'
    # A chunk can be listed by several lazy() calls; fetch (and warn about) each only once
    seen_chunks = set()
    for lazy_load in re.finditer(r'lazy\(function\(\)\{return Promise\.all\(\[(.+?)\]\)\.then', main_js):
        # The dot is escaped so that only a literal `X.e("<chunk>")` call is matched
        for js_name in re.finditer(r'X\.e\("([0-9a-f]+)"\)', lazy_load[1]):
            chunk = js_name[1]
            if chunk in seen_chunks:
                continue
            seen_chunks.add(chunk)
            if not (js_hash := version_hashes.get(chunk)):
                self.report_warning(f'{chunk} has no hash record for it, skip')
                continue
            url = f'https://cdn.kastatic.org/khanacademy/{chunk}.{js_hash}.js'
            js_src = self._download_webpage(url, None, f'Downloading {chunk}.js')
            if needle in js_src:
                return self._parse_graphql_js(js_src)
    raise ExtractorError('Failed to find query js')
|
||||||
|
|
||||||
|
def _parse_graphql_js(self, src):
|
||||||
|
# extract gql strings for each object
|
||||||
|
queries = {match['obj_id']: json.loads(js_to_json(match['body'])) for match in re.finditer(
|
||||||
|
r'function (?P<obj_id>_templateObject\d*)\(\)\{var n=\(0,r\._\)\((?P<body>\[.+?\])\);return', src)}
|
||||||
|
|
||||||
|
# extract variable name to object query map at end: `x=o()(_templateObject00(), m, n, k)`
|
||||||
|
return {
|
||||||
|
match['name']: {
|
||||||
|
'sort': match['sort'] is not None,
|
||||||
|
'query': queries[match['obj_id']][0],
|
||||||
|
'embeds': match['embeds'].strip(',').split(',') if match['embeds'] else [],
|
||||||
|
} for match in re.finditer(
|
||||||
|
r'(?:var |,)(?P<name>[A-Za-z$_]+)=(?P<sort>\(0,s\.Fv\)\()?'
|
||||||
|
r'o\(\)\((?P<obj_id>_templateObject\d*)\(\)(?P<embeds>(?:,[A-Za-z$_]+)*)\)', src)}
|
||||||
|
|
||||||
|
def _sanitize_query(self, query: str):
|
||||||
|
outlines = []
|
||||||
|
indent = 0
|
||||||
|
for line in query.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line or line.startswith('#'):
|
||||||
|
continue
|
||||||
|
if line == '}':
|
||||||
|
# unlike fragment, query has no __typename at its ends
|
||||||
|
# only object inside query has tailing __typename
|
||||||
|
if indent > 2 or outlines[0].startswith('fragment'):
|
||||||
|
outlines.append(f'{" " * indent}__typename')
|
||||||
|
indent -= 2
|
||||||
|
outlines.append(f'{" " * indent}{line}')
|
||||||
|
if line[-1] == '{':
|
||||||
|
indent += 2
|
||||||
|
return '\n'.join(outlines)
|
||||||
|
|
||||||
|
def _compose_query(self, query_objs, key):
|
||||||
|
def _get_fragments(key):
|
||||||
|
yield self._sanitize_query(query_objs[key]['query'])
|
||||||
|
for sub in query_objs[key]['embeds']:
|
||||||
|
yield from _get_fragments(sub)
|
||||||
|
|
||||||
|
queries = set(_get_fragments(key))
|
||||||
|
if not (query := next((q for q in queries if q.startswith('query ')), None)):
|
||||||
|
raise ExtractorError(f'Failed to find "{key}" query from query objects')
|
||||||
|
fragments = sorted(q for q in queries if q.startswith('fragment '))
|
||||||
|
return '\n\n'.join([query, *fragments])
|
||||||
|
|
||||||
|
def _string_hash(self, input_str):
|
||||||
|
str_hash = 5381
|
||||||
|
for char in input_str[::-1]:
|
||||||
|
str_hash = ((str_hash * 33) ^ ord(char)) & 0xFFFFFFFF
|
||||||
|
return str_hash
|
||||||
|
|
||||||
|
def _get_query_hash(self, query_name):
    """Return the hash of the named GraphQL query, using the cache when valid.

    A cached entry is reused only if it was stored for the current
    `self._RUNTIME_JS_URL`; otherwise the query is re-extracted from the JS
    chunks, hashed with `_string_hash`, stored in the cache and returned.

    @param query_name  name of the GraphQL query, e.g. 'ContentForPath'
    @returns           the query hash
    @raises ExtractorError  if no extracted query object contains the query
    """
    cache_key = f'{query_name}-hash'
    cached = self.cache.load('khanacademy', cache_key)
    # change in hash of runtime.js may indicate change of graphql schema
    # consider cached hash as invalidated upon such change
    if cached and cached['runtime_js'] == self._RUNTIME_JS_URL:
        return cached['hash']

    # iterate all query objects to find matching query
    query_objs = self._extract_graphql(query_name)
    needle = f'query {query_name}'
    matching_key = next((name for name, obj in query_objs.items() if needle in obj['query']), None)
    if matching_key is None:
        raise ExtractorError(f'Failed to find query object for {query_name}')

    query_hash = self._string_hash(self._compose_query(query_objs, matching_key))
    self.cache.store('khanacademy', cache_key, {
        'hash': query_hash, 'runtime_js': self._RUNTIME_JS_URL})
    return query_hash
|
||||||
|
|
||||||
def _parse_video(self, video):
|
def _parse_video(self, video):
|
||||||
return {
|
return {
|
||||||
@ -34,29 +131,36 @@ def _parse_video(self, video):
|
|||||||
|
|
||||||
def _real_extract(self, url):
    """Resolve a Khan Academy path through the ContentForPath GraphQL endpoint.

    Downloads the page to learn the current JS bundle URLs and the published
    content version, queries the endpoint with the computed query hash, and
    hands the listed or unlisted path data to `_parse_component_props`.

    @param url  page URL matched by the extractor
    @returns    whatever `_parse_component_props` returns for the path data
    @raises ExtractorError  if the response carries neither listed nor unlisted path data
    """
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    self._load_script_src_urls(webpage)

    ka_data = self._search_json(r'__KA_DATA__ \s*=', webpage, 'initial state', display_id)
    data = self._download_json(
        'https://www.khanacademy.org/api/internal/graphql/ContentForPath', display_id,
        query={
            'fastly_cacheable': 'persist_until_publish',
            'pcv': ka_data['KA-published-content-version'],
            'hash': self._get_query_hash('ContentForPath'),
            'variables': json.dumps({
                'path': display_id,
                'countryCode': 'US',
            }),
            'lang': 'en',
            'app': 'khanacademy',
        })['data']['contentRoute']

    if data.get('listedPathData'):
        return self._parse_component_props(data['listedPathData'], display_id, listed=True)
    if data.get('unlistedPathData'):
        return self._parse_component_props(data['unlistedPathData'], display_id, listed=False)
    # Fail with a descriptive extractor error rather than a bare KeyError/TypeError
    raise ExtractorError(f'No listed or unlisted path data returned for "{display_id}"')
|
||||||
|
|
||||||
|
|
||||||
class KhanAcademyIE(KhanAcademyBaseIE):
|
class KhanAcademyIE(KhanAcademyBaseIE):
|
||||||
IE_NAME = 'khanacademy'
|
IE_NAME = 'khanacademy'
|
||||||
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
|
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
|
||||||
_TEST = {
|
_TESTS = [{
|
||||||
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
|
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
|
||||||
'md5': '1d5c2e70fa6aa29c38eca419f12515ce',
|
'md5': '2bd84e22fa3feea2e2a21352185a96bd',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'FlIG3TvQCBQ',
|
'id': 'FlIG3TvQCBQ',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
@ -87,12 +191,31 @@ class KhanAcademyIE(KhanAcademyBaseIE):
|
|||||||
'view_count': int,
|
'view_count': int,
|
||||||
'like_count': int,
|
'like_count': int,
|
||||||
'heatmap': list,
|
'heatmap': list,
|
||||||
|
'media_type': 'video',
|
||||||
},
|
},
|
||||||
|
'expected_warnings': ['[0-9a-f]+ has no hash record for it, skip'],
|
||||||
'add_ie': ['Youtube'],
|
'add_ie': ['Youtube'],
|
||||||
}
|
}, {
|
||||||
|
'note': 'unlisted path video',
|
||||||
|
'url': 'https://www.khanacademy.org/math/math-for-fun-and-glory/vi-hart/spirals-fibonacci/v/doodling-in-math-spirals-fibonacci-and-being-a-plant-1-of-3',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '537957955',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Doodling in math: Spirals, Fibonacci, and being a plant [1 of 3]',
|
||||||
|
'description': 'md5:4098102420babcf909097ec1633a52e7',
|
||||||
|
'upload_date': '20120131',
|
||||||
|
'timestamp': 1327972656,
|
||||||
|
'thumbnail': r're:https://cdn.kastatic.org/.*',
|
||||||
|
'duration': 355,
|
||||||
|
'creators': ['Vi Hart'],
|
||||||
|
'license': 'cc-by-nc-sa',
|
||||||
|
},
|
||||||
|
'expected_warnings': ['[0-9a-f]+ has no hash record for it, skip'],
|
||||||
|
}]
|
||||||
|
|
||||||
def _parse_component_props(self, component_props, display_id):
|
def _parse_component_props(self, component_props, display_id, listed=True):
|
||||||
video = component_props['content']
|
video = component_props['content']
|
||||||
|
if listed:
|
||||||
return {
|
return {
|
||||||
**self._parse_video(video),
|
**self._parse_video(video),
|
||||||
**traverse_obj(video, {
|
**traverse_obj(video, {
|
||||||
@ -101,6 +224,20 @@ def _parse_component_props(self, component_props, display_id):
|
|||||||
'license': ('kaUserLicense', {str}),
|
'license': ('kaUserLicense', {str}),
|
||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
'id': str(video['id']),
|
||||||
|
'formats': self._extract_m3u8_formats(json.loads(video['downloadUrls'])['m3u8'], display_id),
|
||||||
|
**traverse_obj(video, {
|
||||||
|
'title': ('translatedTitle', {str}),
|
||||||
|
'description': ('description', {str}),
|
||||||
|
'thumbnail': ('thumbnailUrls', ..., 'url', {url_or_none}, any),
|
||||||
|
'duration': ('duration', {int}),
|
||||||
|
'creators': ('authorNames', ..., {str}),
|
||||||
|
'timestamp': ('dateAdded', {parse_iso8601}),
|
||||||
|
'license': ('kaUserLicense', {str}),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class KhanAcademyUnitIE(KhanAcademyBaseIE):
|
class KhanAcademyUnitIE(KhanAcademyBaseIE):
|
||||||
@ -116,6 +253,7 @@ class KhanAcademyUnitIE(KhanAcademyBaseIE):
|
|||||||
'_old_archive_ids': ['khanacademyunit cryptography'],
|
'_old_archive_ids': ['khanacademyunit cryptography'],
|
||||||
},
|
},
|
||||||
'playlist_mincount': 31,
|
'playlist_mincount': 31,
|
||||||
|
'expected_warnings': ['[0-9a-f]+ has no hash record for it, skip'],
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.khanacademy.org/computing/computer-science',
|
'url': 'https://www.khanacademy.org/computing/computer-science',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
@ -126,9 +264,22 @@ class KhanAcademyUnitIE(KhanAcademyBaseIE):
|
|||||||
'_old_archive_ids': ['khanacademyunit computer-science'],
|
'_old_archive_ids': ['khanacademyunit computer-science'],
|
||||||
},
|
},
|
||||||
'playlist_mincount': 50,
|
'playlist_mincount': 50,
|
||||||
|
'expected_warnings': ['[0-9a-f]+ has no hash record for it, skip'],
|
||||||
|
}, {
|
||||||
|
'note': 'unlisted path unit',
|
||||||
|
'url': 'https://www.khanacademy.org/math/math-for-fun-and-glory/vi-hart',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'xf48ec4ac',
|
||||||
|
'title': 'Doodling in Math and more',
|
||||||
|
'description': 'md5:81ca50417783334a27e48d687a346f14',
|
||||||
|
'display_id': 'math/math-for-fun-and-glory/vi-hart',
|
||||||
|
'_old_archive_ids': ['khanacademyunit vi-hart'],
|
||||||
|
},
|
||||||
|
'playlist_mincount': 50,
|
||||||
|
'expected_warnings': ['[0-9a-f]+ has no hash record for it, skip'],
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _parse_component_props(self, component_props, display_id):
|
def _parse_component_props(self, component_props, display_id, listed=True):
|
||||||
course = component_props['course']
|
course = component_props['course']
|
||||||
selected_unit = traverse_obj(course, (
|
selected_unit = traverse_obj(course, (
|
||||||
'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course
|
'unitChildren', lambda _, v: v['relativeUrl'] == f'/{display_id}', any)) or course
|
||||||
|
Loading…
Reference in New Issue
Block a user