mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-07-10 15:28:33 +00:00
Further tidy-up of the facebook method
This commit is contained in:
parent
01c8529ab1
commit
a19df1be3d
@ -539,13 +539,7 @@ def _extract_metadata(self, webpage, video_id):
|
||||
or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
|
||||
return merge_dicts(info_json_ld, info_dict)
|
||||
|
||||
def _extract_from_url(self, url, video_id):
|
||||
webpage = self._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
|
||||
video_data = None
|
||||
|
||||
def extract_video_data(instances):
|
||||
def _extract_video_data(self, instances: list) -> list:
|
||||
video_data = []
|
||||
for item in instances:
|
||||
if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
|
||||
@ -554,67 +548,7 @@ def extract_video_data(instances):
|
||||
video_data.append(video_item['videoData'])
|
||||
return video_data
|
||||
|
||||
server_js_data = self._parse_json(self._search_regex(
|
||||
[r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
|
||||
webpage, 'server js data', default='{}'), video_id, fatal=False)
|
||||
|
||||
if server_js_data:
|
||||
video_data = extract_video_data(server_js_data.get('instances', []))
|
||||
|
||||
def extract_from_jsmods_instances(js_data):
|
||||
if js_data:
|
||||
return extract_video_data(try_get(
|
||||
js_data, lambda x: x['jsmods']['instances'], list) or [])
|
||||
|
||||
def extract_dash_manifest(vid_data, formats, mpd_url=None):
|
||||
dash_manifest = traverse_obj(
|
||||
vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
|
||||
if dash_manifest:
|
||||
formats.extend(self._parse_mpd_formats(
|
||||
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
|
||||
mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
|
||||
|
||||
def process_formats(info):
|
||||
# Downloads with browser's User-Agent are rate limited. Working around
|
||||
# with non-browser User-Agent.
|
||||
for f in info['formats']:
|
||||
# Downloads with browser's User-Agent are rate limited. Working around
|
||||
# with non-browser User-Agent.
|
||||
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
|
||||
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
|
||||
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
|
||||
|
||||
def yield_all_relay_data(_filter):
|
||||
for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})</script>', webpage):
|
||||
yield self._parse_json(relay_data, video_id, fatal=False) or {}
|
||||
|
||||
def extract_relay_data(_filter):
|
||||
return next(filter(None, yield_all_relay_data(_filter)), {})
|
||||
|
||||
def extract_relay_prefetched_data(_filter, target_keys=None):
|
||||
path = 'data'
|
||||
if target_keys is not None:
|
||||
path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
|
||||
return traverse_obj(yield_all_relay_data(_filter), (
|
||||
..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
|
||||
lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
|
||||
..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
|
||||
|
||||
if not video_data:
|
||||
server_js_data = self._parse_json(self._search_regex([
|
||||
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
|
||||
rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);',
|
||||
], webpage, 'js data', default='{}'), video_id, js_to_json, False)
|
||||
video_data = extract_from_jsmods_instances(server_js_data)
|
||||
|
||||
if not video_data:
|
||||
data = extract_relay_prefetched_data(
|
||||
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
|
||||
target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
|
||||
if data:
|
||||
entries = []
|
||||
|
||||
def parse_graphql_video(video):
|
||||
def _parse_graphql_video(self, video, video_id, webpage) -> dict:
|
||||
v_id = video.get('videoId') or video.get('id') or video_id
|
||||
reel_info = traverse_obj(
|
||||
video, ('creation_story', 'short_form_video_context', 'playback_video', {dict}))
|
||||
@ -643,14 +577,14 @@ def parse_graphql_video(video):
|
||||
'quality': q(format_id) - 3,
|
||||
'url': playable_url,
|
||||
})
|
||||
extract_dash_manifest(fmt_data, formats)
|
||||
self._extract_dash_manifest(fmt_data, formats)
|
||||
|
||||
# New videoDeliveryResponse formats extraction
|
||||
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
|
||||
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
|
||||
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
|
||||
for idx, dash_manifest in enumerate(dash_manifests):
|
||||
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
|
||||
self._extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
|
||||
if not dash_manifests:
|
||||
# Only extract from MPD URLs if the manifests are not already provided
|
||||
for mpd_url in mpd_urls:
|
||||
@ -704,7 +638,7 @@ def parse_graphql_video(video):
|
||||
'automatic_captions': automatic_captions,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
process_formats(info)
|
||||
self._process_formats(info)
|
||||
description = try_get(video, lambda x: x['savable_description']['text'])
|
||||
title = video.get('name')
|
||||
if title:
|
||||
@ -714,12 +648,75 @@ def parse_graphql_video(video):
|
||||
})
|
||||
else:
|
||||
info['title'] = description or f'Facebook video #{v_id}'
|
||||
entries.append(info)
|
||||
return info
|
||||
|
||||
def _extract_dash_manifest(self, vid_data, formats, mpd_url=None):
|
||||
dash_manifest = traverse_obj(
|
||||
vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
|
||||
if dash_manifest:
|
||||
formats.extend(self._parse_mpd_formats(
|
||||
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
|
||||
mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
|
||||
|
||||
def _process_formats(self, info: dict) -> None:
|
||||
# Downloads with browser's User-Agent are rate limited. Working around
|
||||
# with non-browser User-Agent.
|
||||
for f in info['formats']:
|
||||
# Downloads with browser's User-Agent are rate limited. Working around
|
||||
# with non-browser User-Agent.
|
||||
f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
|
||||
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
|
||||
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
|
||||
|
||||
def _extract_from_url(self, url, video_id):
|
||||
webpage = self._download_webpage(
|
||||
url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
|
||||
|
||||
video_data = None
|
||||
|
||||
server_js_data = self._parse_json(self._search_regex(
|
||||
[r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
|
||||
webpage, 'server js data', default='{}'), video_id, fatal=False)
|
||||
|
||||
if server_js_data:
|
||||
video_data = self._extract_video_data(server_js_data.get('instances', []))
|
||||
|
||||
def extract_from_jsmods_instances(js_data):
|
||||
if js_data:
|
||||
return self._extract_video_data(try_get(
|
||||
js_data, lambda x: x['jsmods']['instances'], list) or [])
|
||||
|
||||
def yield_all_relay_data(_filter):
|
||||
for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})</script>', webpage):
|
||||
yield self._parse_json(relay_data, video_id, fatal=False) or {}
|
||||
|
||||
def extract_relay_prefetched_data(_filter, target_keys=None):
|
||||
path = 'data'
|
||||
if target_keys is not None:
|
||||
path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
|
||||
return traverse_obj(yield_all_relay_data(_filter), (
|
||||
..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
|
||||
lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
|
||||
..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
|
||||
|
||||
if not video_data:
|
||||
server_js_data = self._parse_json(self._search_regex([
|
||||
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
|
||||
rf'bigPipe\.onPageletArrive\(({{.*?id\s*:\s*"{self._SUPPORTED_PAGLETS_REGEX}".*?}})\);',
|
||||
], webpage, 'js data', default='{}'), video_id, js_to_json, False)
|
||||
video_data = extract_from_jsmods_instances(server_js_data)
|
||||
|
||||
if not video_data:
|
||||
data = extract_relay_prefetched_data(
|
||||
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
|
||||
target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
|
||||
if data:
|
||||
entries = []
|
||||
|
||||
def parse_attachment(attachment, key='media'):
|
||||
media = attachment.get(key) or {}
|
||||
if media.get('__typename') == 'Video':
|
||||
return parse_graphql_video(media)
|
||||
entries.append(self._parse_graphql_video(media, video_id, webpage))
|
||||
|
||||
nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
|
||||
attachments = traverse_obj(nodes, (
|
||||
@ -747,7 +744,7 @@ def parse_attachment(attachment, key='media'):
|
||||
for attachment in attachments:
|
||||
parse_attachment(attachment)
|
||||
if not entries:
|
||||
parse_graphql_video(video)
|
||||
entries.append(self._parse_graphql_video(video, video_id, webpage))
|
||||
|
||||
if len(entries) > 1:
|
||||
return self.playlist_result(entries, video_id)
|
||||
@ -788,7 +785,8 @@ def parse_attachment(attachment, key='media'):
|
||||
if lsd:
|
||||
post_data[lsd['name']] = lsd['value']
|
||||
|
||||
relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
|
||||
relay_data = next(filter(None, yield_all_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')), {})
|
||||
|
||||
for define in (relay_data.get('define') or []):
|
||||
if define[0] == 'RelayAPIConfigDefaults':
|
||||
self._api_config = define[2]
|
||||
@ -874,7 +872,7 @@ def parse_attachment(attachment, key='media'):
|
||||
'quality': preference,
|
||||
'height': 720 if quality == 'hd' else None,
|
||||
})
|
||||
extract_dash_manifest(f[0], formats)
|
||||
self._extract_dash_manifest(f[0], formats)
|
||||
subtitles_src = f[0].get('subtitles_src')
|
||||
if subtitles_src:
|
||||
subtitles.setdefault('en', []).append({'url': subtitles_src})
|
||||
@ -884,7 +882,7 @@ def parse_attachment(attachment, key='media'):
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
process_formats(info_dict)
|
||||
self._process_formats(info_dict)
|
||||
info_dict.update(self._extract_metadata(webpage, video_id))
|
||||
|
||||
return info_dict
|
||||
|
Loading…
Reference in New Issue
Block a user