mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[ie/bbc] Fix and extend extraction (#9705)
Closes #9701 Authored by: kylegustavo, dirkf, pukkandan
This commit is contained in:
		| @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|         'url': 'http://www.bbc.com/news/world-europe-32668511', | ||||
|         'info_dict': { | ||||
|             'id': 'world-europe-32668511', | ||||
|             'title': 'Russia stages massive WW2 parade', | ||||
|             'title': 'Russia stages massive WW2 parade despite Western boycott', | ||||
|             'description': 'md5:00ff61976f6081841f759a08bf78cc9c', | ||||
|         }, | ||||
|         'playlist_count': 2, | ||||
| @@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|         'info_dict': { | ||||
|             'id': '3662a707-0af9-3149-963f-47bea720b460', | ||||
|             'title': 'BUGGER', | ||||
|             'description': r're:BUGGER  The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$', | ||||
|         }, | ||||
|         'playlist_count': 18, | ||||
|     }, { | ||||
| @@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|         'info_dict': { | ||||
|             'id': 'p02mprgb', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', | ||||
|             'description': 'md5:2868290467291b37feda7863f7a83f54', | ||||
|             'title': 'Germanwings crash site aerial video', | ||||
|             'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$', | ||||
|             'duration': 47, | ||||
|             'timestamp': 1427219242, | ||||
|             'upload_date': '20150324', | ||||
|             'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg', | ||||
|         }, | ||||
|         'params': { | ||||
|             # rtmp download | ||||
|             'skip_download': True, | ||||
|         } | ||||
|     }, { | ||||
| @@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
|         } | ||||
|         }, | ||||
|         'skip': 'now SIMORGH_DATA with no video', | ||||
|     }, { | ||||
|         # single video embedded with data-playable containing XML playlists (regional section) | ||||
|         'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', | ||||
|         'info_dict': { | ||||
|             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', | ||||
|             'id': '39275083', | ||||
|             'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', | ||||
|             'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', | ||||
|             'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', | ||||
|             'timestamp': 1434713142, | ||||
|             'upload_date': '20150619', | ||||
|             'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg', | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
|         } | ||||
|         }, | ||||
|     }, { | ||||
|         # single video from video playlist embedded with vxp-playlist-data JSON | ||||
|         'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', | ||||
| @@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
|         } | ||||
|         }, | ||||
|         'skip': '404 Not Found', | ||||
|     }, { | ||||
|         # single video story with digitalData | ||||
|         # single video story with __PWA_PRELOADED_STATE__ | ||||
|         'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', | ||||
|         'info_dict': { | ||||
|             'id': 'p02q6gc4', | ||||
|             'ext': 'flv', | ||||
|             'title': 'Sri Lanka’s spicy secret', | ||||
|             'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', | ||||
|             'timestamp': 1437674293, | ||||
|             'upload_date': '20150723', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Tasting the spice of life in Jaffna', | ||||
|             'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$', | ||||
|             'timestamp': 1646058397, | ||||
|             'upload_date': '20220228', | ||||
|             'duration': 255, | ||||
|             'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg', | ||||
|         }, | ||||
|         'params': { | ||||
|             # rtmp download | ||||
|             'skip_download': True, | ||||
|         } | ||||
|     }, { | ||||
|         # single video story without digitalData | ||||
|         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', | ||||
| @@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'timestamp': 1415867444, | ||||
|             'upload_date': '20141113', | ||||
|         }, | ||||
|         'params': { | ||||
|             # rtmp download | ||||
|             'skip_download': True, | ||||
|         } | ||||
|         'skip': 'redirects to TopGear home page', | ||||
|     }, { | ||||
|         # single video embedded with Morph | ||||
|         # TODO: replacement test page | ||||
|         'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', | ||||
|         'info_dict': { | ||||
|             'id': 'p041vhd0', | ||||
| @@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'uploader': 'BBC Sport', | ||||
|             'uploader_id': 'bbc_sport', | ||||
|         }, | ||||
|         'params': { | ||||
|             # m3u8 download | ||||
|             'skip_download': True, | ||||
|         }, | ||||
|         'skip': 'Georestricted to UK', | ||||
|         'skip': 'Video no longer in page', | ||||
|     }, { | ||||
|         # single video with playlist.sxml URL in playlist param | ||||
|         # single video in __INITIAL_DATA__ | ||||
|         'url': 'http://www.bbc.com/sport/0/football/33653409', | ||||
|         'info_dict': { | ||||
|             'id': 'p02xycnp', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', | ||||
|             'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', | ||||
|             'title': 'Ronaldo to Man Utd, Arsenal to spend?', | ||||
|             'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$', | ||||
|             'timestamp': 1437750175, | ||||
|             'upload_date': '20150724', | ||||
|             'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', | ||||
|             'duration': 140, | ||||
|         }, | ||||
|         'params': { | ||||
|             # rtmp download | ||||
|             'skip_download': True, | ||||
|         } | ||||
|     }, { | ||||
|         # article with multiple videos embedded with playlist.sxml in playlist param | ||||
|         # article with multiple videos embedded with Morph.setPayload | ||||
|         'url': 'http://www.bbc.com/sport/0/football/34475836', | ||||
|         'info_dict': { | ||||
|             'id': '34475836', | ||||
| @@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', | ||||
|         }, | ||||
|         'playlist_count': 3, | ||||
|     }, { | ||||
|         # Testing noplaylist | ||||
|         'url': 'http://www.bbc.com/sport/0/football/34475836', | ||||
|         'info_dict': { | ||||
|             'id': 'p034ppnv', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'All you need to know about Jurgen Klopp', | ||||
|             'timestamp': 1444335081, | ||||
|             'upload_date': '20151008', | ||||
|             'duration': 122.0, | ||||
|             'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg', | ||||
|         }, | ||||
|         'params': { | ||||
|             'noplaylist': True, | ||||
|         }, | ||||
|     }, { | ||||
|         # school report article with single video | ||||
|         'url': 'http://www.bbc.co.uk/schoolreport/35744779', | ||||
| @@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'title': 'School which breaks down barriers in Jerusalem', | ||||
|         }, | ||||
|         'playlist_count': 1, | ||||
|         'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt', | ||||
|     }, { | ||||
|         # single video with playlist URL from weather section | ||||
|         'url': 'http://www.bbc.com/weather/features/33601775', | ||||
| @@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'thumbnail': r're:https?://.+/.+\.jpg', | ||||
|             'timestamp': 1437785037, | ||||
|             'upload_date': '20150725', | ||||
|             'duration': 105, | ||||
|         }, | ||||
|     }, { | ||||
|         # video with window.__INITIAL_DATA__ and value as JSON string | ||||
|         'url': 'https://www.bbc.com/news/av/world-europe-59468682', | ||||
|         'info_dict': { | ||||
|             'id': 'p0b71qth', | ||||
|             'id': 'p0b779gc', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Why France is making this woman a national hero', | ||||
|             'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', | ||||
|             'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.', | ||||
|             'thumbnail': r're:https?://.+/.+\.jpg', | ||||
|             'timestamp': 1638230731, | ||||
|             'upload_date': '20211130', | ||||
|             'timestamp': 1638215626, | ||||
|             'upload_date': '20211129', | ||||
|             'duration': 125, | ||||
|         }, | ||||
|     }, { | ||||
|         # video with script id __NEXT_DATA__ and value as JSON string | ||||
|         'url': 'https://www.bbc.com/news/uk-68546268', | ||||
|         'info_dict': { | ||||
|             'id': 'p0hj0lq7', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Nasser Hospital doctor describes his treatment by IDF', | ||||
|             'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$', | ||||
|             'thumbnail': r're:https?://.+/.+\.jpg', | ||||
|             'timestamp': 1710188248, | ||||
|             'upload_date': '20240311', | ||||
|             'duration': 104, | ||||
|         }, | ||||
|     }, { | ||||
|         # single video article embedded with data-media-vpid | ||||
| @@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'uploader': 'Radio 3', | ||||
|             'uploader_id': 'bbc_radio_three', | ||||
|         }, | ||||
|         'skip': '404 Not Found', | ||||
|     }, { | ||||
|         'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', | ||||
|         'info_dict': { | ||||
| @@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             'ext': 'mp4', | ||||
|             'title': 'md5:2fabf12a726603193a2879a055f72514', | ||||
|             'description': 'Learn English words and phrases from this story', | ||||
|             'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg', | ||||
|         }, | ||||
|         'add_ie': [BBCCoUkIE.ie_key()], | ||||
|     }, { | ||||
| @@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|         'info_dict': { | ||||
|             'id': 'p07c6sb9', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'How positive thinking is harming your happiness', | ||||
|             'alt_title': 'The downsides of positive thinking', | ||||
|             'description': 'md5:fad74b31da60d83b8265954ee42d85b4', | ||||
|             'title': 'The downsides of positive thinking', | ||||
|             'description': 'The downsides of positive thinking', | ||||
|             'duration': 235, | ||||
|             'thumbnail': r're:https?://.+/p07c9dsr.jpg', | ||||
|             'upload_date': '20190604', | ||||
|             'categories': ['Psychology'], | ||||
|             'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)', | ||||
|             'upload_date': '20220223', | ||||
|             'timestamp': 1645632746, | ||||
|         }, | ||||
|     }, { | ||||
|         # BBC Sounds | ||||
|         'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', | ||||
|         'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx', | ||||
|         'info_dict': { | ||||
|             'id': 'm001q789', | ||||
|             'id': 'p0hrw4nr', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'The Night Tracks Mix - Music for the darkling hour', | ||||
|             'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', | ||||
|             'chapters': 'count:8', | ||||
|             'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', | ||||
|             'uploader': 'Radio 3', | ||||
|             'duration': 1800, | ||||
|             'uploader_id': 'bbc_radio_three', | ||||
|         }, | ||||
|             'title': 'Are our coastlines being washed away?', | ||||
|             'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$', | ||||
|             'timestamp': 1713556800, | ||||
|             'upload_date': '20240419', | ||||
|             'duration': 1588, | ||||
|             'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg', | ||||
|             'uploader': 'World Service', | ||||
|             'uploader_id': 'bbc_world_service', | ||||
|             'series': 'CrowdScience', | ||||
|             'chapters': [], | ||||
|         } | ||||
|     }, {  # onion routes | ||||
|         'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', | ||||
|         'only_matching': True, | ||||
| @@ -1008,8 +1039,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             webpage, 'group id', default=None) | ||||
|         if group_id: | ||||
|             return self.url_result( | ||||
|                 'https://www.bbc.co.uk/programmes/%s' % group_id, | ||||
|                 ie=BBCCoUkIE.ie_key()) | ||||
|                 f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE) | ||||
| 
 | ||||
|         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) | ||||
|         programme_id = self._search_regex( | ||||
| @@ -1069,83 +1099,133 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|                 } | ||||
| 
 | ||||
|         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) | ||||
|         # There are several setPayload calls may be present but the video | ||||
|         # seems to be always related to the first one | ||||
|         morph_payload = self._parse_json( | ||||
|             self._search_regex( | ||||
|                 r'Morph\.setPayload\([^,]+,\s*({.+?})\);', | ||||
|                 webpage, 'morph payload', default='{}'), | ||||
|             playlist_id, fatal=False) | ||||
|         # Several setPayload calls may be present but the video(s) | ||||
|         # should be in one that mentions leadMedia or videoData | ||||
|         morph_payload = self._search_json( | ||||
|             r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id, | ||||
|             contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}', | ||||
|             default={}) | ||||
|         if morph_payload: | ||||
|             components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] | ||||
|             for component in components: | ||||
|                 if not isinstance(component, dict): | ||||
|                     continue | ||||
|                 lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) | ||||
|                 if not lead_media: | ||||
|                     continue | ||||
|                 identifiers = lead_media.get('identifiers') | ||||
|                 if not identifiers or not isinstance(identifiers, dict): | ||||
|                     continue | ||||
|                 programme_id = identifiers.get('vpid') or identifiers.get('playablePid') | ||||
|             for lead_media in traverse_obj(morph_payload, ( | ||||
|                     'body', 'components', ..., 'props', 'leadMedia', {dict})): | ||||
|                 programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any)) | ||||
|                 if not programme_id: | ||||
|                     continue | ||||
|                 title = lead_media.get('title') or self._og_search_title(webpage) | ||||
|                 formats, subtitles = self._download_media_selector(programme_id) | ||||
|                 description = lead_media.get('summary') | ||||
|                 uploader = lead_media.get('masterBrand') | ||||
|                 uploader_id = lead_media.get('mid') | ||||
|                 duration = None | ||||
|                 duration_d = lead_media.get('duration') | ||||
|                 if isinstance(duration_d, dict): | ||||
|                     duration = parse_duration(dict_get( | ||||
|                         duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) | ||||
|                 return { | ||||
|                     'id': programme_id, | ||||
|                     'title': title, | ||||
|                     'description': description, | ||||
|                     'duration': duration, | ||||
|                     'uploader': uploader, | ||||
|                     'uploader_id': uploader_id, | ||||
|                     'title': lead_media.get('title') or self._og_search_title(webpage), | ||||
|                     **traverse_obj(lead_media, { | ||||
|                         'description': ('summary', {str}), | ||||
|                         'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}), | ||||
|                         'uploader': ('masterBrand', {str}), | ||||
|                         'uploader_id': ('mid', {str}), | ||||
|                     }), | ||||
|                     'formats': formats, | ||||
|                     'subtitles': subtitles, | ||||
|                 } | ||||
|             body = self._parse_json(traverse_obj(morph_payload, ( | ||||
|                 'body', 'content', 'article', 'body')), playlist_id, fatal=False) | ||||
|             for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')): | ||||
|                 if video_data.get('vpid'): | ||||
|                     video_id = video_data['vpid'] | ||||
|                     formats, subtitles = self._download_media_selector(video_id) | ||||
|                     entry = { | ||||
|                         'id': video_id, | ||||
|                         'formats': formats, | ||||
|                         'subtitles': subtitles, | ||||
|                     } | ||||
|                 else: | ||||
|                     video_id = video_data['pid'] | ||||
|                     entry = self.url_result( | ||||
|                         f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, | ||||
|                         video_id, url_transparent=True) | ||||
|                 entry.update({ | ||||
|                     'timestamp': traverse_obj(morph_payload, ( | ||||
|                         'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}) | ||||
|                     ), | ||||
|                     **traverse_obj(video_data, { | ||||
|                         'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), | ||||
|                         'title': (('title', 'caption'), {str}, any), | ||||
|                         'duration': ('duration', {parse_duration}), | ||||
|                     }), | ||||
|                 }) | ||||
|                 if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): | ||||
|                     return entry | ||||
|                 entries.append(entry) | ||||
|             if entries: | ||||
|                 playlist_title = traverse_obj(morph_payload, ( | ||||
|                     'body', 'content', 'article', 'headline', {str})) or playlist_title | ||||
|                 return self.playlist_result( | ||||
|                     entries, playlist_id, playlist_title, playlist_description) | ||||
| 
 | ||||
|         preload_state = self._parse_json(self._search_regex( | ||||
|             r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, | ||||
|             'preload state', default='{}'), playlist_id, fatal=False) | ||||
|         if preload_state: | ||||
|             current_programme = preload_state.get('programmes', {}).get('current') or {} | ||||
|             programme_id = current_programme.get('id') | ||||
|             if current_programme and programme_id and current_programme.get('type') == 'playable_item': | ||||
|                 title = current_programme.get('titles', {}).get('tertiary') or playlist_title | ||||
|                 formats, subtitles = self._download_media_selector(programme_id) | ||||
|                 synopses = current_programme.get('synopses') or {} | ||||
|                 network = current_programme.get('network') or {} | ||||
|                 duration = int_or_none( | ||||
|                     current_programme.get('duration', {}).get('value')) | ||||
|                 thumbnail = None | ||||
|                 image_url = current_programme.get('image_url') | ||||
|                 if image_url: | ||||
|                     thumbnail = image_url.replace('{recipe}', 'raw') | ||||
|         # various PRELOADED_STATE JSON | ||||
|         preload_state = self._search_json( | ||||
|             r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage, | ||||
|             'preload state', playlist_id, transform_source=js_to_json, default={}) | ||||
|         # PRELOADED_STATE with current programmme | ||||
|         current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict})) | ||||
|         programme_id = traverse_obj(current_programme, ('id', {str})) | ||||
|         if programme_id and current_programme.get('type') == 'playable_item': | ||||
|             title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title | ||||
|             formats, subtitles = self._download_media_selector(programme_id) | ||||
|             return { | ||||
|                 'id': programme_id, | ||||
|                 'title': title, | ||||
|                 'formats': formats, | ||||
|                 **traverse_obj(current_programme, { | ||||
|                     'description': ('synopses', ('long', 'medium', 'short'), {str}, any), | ||||
|                     'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}), | ||||
|                     'duration': ('duration', 'value', {int_or_none}), | ||||
|                     'uploader': ('network', 'short_title', {str}), | ||||
|                     'uploader_id': ('network', 'id', {str}), | ||||
|                     'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any), | ||||
|                     'series': ('titles', 'primary', {str}), | ||||
|                 }), | ||||
|                 'subtitles': subtitles, | ||||
|                 'chapters': traverse_obj(preload_state, ( | ||||
|                     'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { | ||||
|                         'title': ('titles', {lambda x: join_nonempty( | ||||
|                             'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), | ||||
|                         'start_time': ('offset', 'start', {float_or_none}), | ||||
|                         'end_time': ('offset', 'end', {float_or_none}), | ||||
|                     }) | ||||
|                 ), | ||||
|             } | ||||
| 
 | ||||
|         # PWA_PRELOADED_STATE with article video asset | ||||
|         asset_id = traverse_obj(preload_state, ( | ||||
|             'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id, | ||||
|             'assetVideo', 0, {str}, any)) | ||||
|         if asset_id: | ||||
|             video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str})) | ||||
|             if video_id: | ||||
|                 article = traverse_obj(preload_state, ( | ||||
|                     'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any)) | ||||
| 
 | ||||
|                 def image_url(image_id): | ||||
|                     return traverse_obj(preload_state, ( | ||||
|                         'entities', 'images', image_id, 'url', | ||||
|                         {lambda u: url_or_none(u.replace('$recipe', 'raw'))})) | ||||
| 
 | ||||
|                 formats, subtitles = self._download_media_selector(video_id) | ||||
|                 return { | ||||
|                     'id': programme_id, | ||||
|                     'title': title, | ||||
|                     'description': dict_get(synopses, ('long', 'medium', 'short')), | ||||
|                     'thumbnail': thumbnail, | ||||
|                     'duration': duration, | ||||
|                     'uploader': network.get('short_title'), | ||||
|                     'uploader_id': network.get('id'), | ||||
|                     'id': video_id, | ||||
|                     **traverse_obj(preload_state, ('entities', 'videos', asset_id, { | ||||
|                         'title': ('title', {str}), | ||||
|                         'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any), | ||||
|                         'thumbnail': (0, {image_url}), | ||||
|                         'duration': ('duration', {int_or_none}), | ||||
|                     })), | ||||
|                     'formats': formats, | ||||
|                     'subtitles': subtitles, | ||||
|                     'chapters': traverse_obj(preload_state, ( | ||||
|                         'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { | ||||
|                             'title': ('titles', {lambda x: join_nonempty( | ||||
|                                 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), | ||||
|                             'start_time': ('offset', 'start', {float_or_none}), | ||||
|                             'end_time': ('offset', 'end', {float_or_none}), | ||||
|                         })) or None, | ||||
|                     'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})), | ||||
|                 } | ||||
|             else: | ||||
|                 return self.url_result( | ||||
|                     f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE, | ||||
|                     asset_id, playlist_title, display_id=playlist_id, | ||||
|                     description=playlist_description) | ||||
| 
 | ||||
|         bbc3_config = self._parse_json( | ||||
|             self._search_regex( | ||||
| @@ -1191,6 +1271,28 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|                 return self.playlist_result( | ||||
|                     entries, playlist_id, playlist_title, playlist_description) | ||||
| 
 | ||||
|         def parse_model(model): | ||||
|             """Extract single video from model structure""" | ||||
|             item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) | ||||
|             if not item_id: | ||||
|                 return | ||||
|             formats, subtitles = self._download_media_selector(item_id) | ||||
|             return { | ||||
|                 'id': item_id, | ||||
|                 'formats': formats, | ||||
|                 'subtitles': subtitles, | ||||
|                 **traverse_obj(model, { | ||||
|                     'title': ('title', {str}), | ||||
|                     'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), | ||||
|                     'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any), | ||||
|                     'duration': ('versions', 0, 'duration', {int}), | ||||
|                     'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}), | ||||
|                 }) | ||||
|             } | ||||
| 
 | ||||
|         def is_type(*types): | ||||
|             return lambda _, v: v['type'] in types | ||||
| 
 | ||||
|         initial_data = self._search_regex( | ||||
|             r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, | ||||
|             'quoted preload state', default=None) | ||||
| @@ -1202,6 +1304,19 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|             initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) | ||||
|         initial_data = self._parse_json(initial_data, playlist_id, fatal=False) | ||||
|         if initial_data: | ||||
|             for video_data in traverse_obj(initial_data, ( | ||||
|                     'stores', 'article', 'articleBodyContent', is_type('video'))): | ||||
|                 model = traverse_obj(video_data, ( | ||||
|                     'model', 'blocks', is_type('aresMedia'), | ||||
|                     'model', 'blocks', is_type('aresMediaMetadata'), | ||||
|                     'model', {dict}, any)) | ||||
|                 entry = parse_model(model) | ||||
|                 if entry: | ||||
|                     entries.append(entry) | ||||
|             if entries: | ||||
|                 return self.playlist_result( | ||||
|                     entries, playlist_id, playlist_title, playlist_description) | ||||
| 
 | ||||
|             def parse_media(media): | ||||
|                 if not media: | ||||
|                     return | ||||
| @@ -1234,27 +1349,90 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE | ||||
|                         'subtitles': subtitles, | ||||
|                         'timestamp': item_time, | ||||
|                         'description': strip_or_none(item_desc), | ||||
|                         'duration': int_or_none(item.get('duration')), | ||||
|                     }) | ||||
|             for resp in (initial_data.get('data') or {}).values(): | ||||
|                 name = resp.get('name') | ||||
| 
 | ||||
|             for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])): | ||||
|                 name = resp['name'] | ||||
|                 if name == 'media-experience': | ||||
|                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) | ||||
|                 elif name == 'article': | ||||
|                     for block in (try_get(resp, | ||||
|                                           (lambda x: x['data']['blocks'], | ||||
|                                            lambda x: x['data']['content']['model']['blocks'],), | ||||
|                                           list) or []): | ||||
|                         if block.get('type') not in ['media', 'video']: | ||||
|                             continue | ||||
|                         parse_media(block.get('model')) | ||||
|                     for block in traverse_obj(resp, ( | ||||
|                             'data', (None, ('content', 'model')), 'blocks', | ||||
|                             is_type('media', 'video'), 'model', {dict})): | ||||
|                         parse_media(block) | ||||
|             return self.playlist_result( | ||||
|                 entries, playlist_id, playlist_title, playlist_description) | ||||
| 
 | ||||
|         # extract from SIMORGH_DATA hydration JSON | ||||
|         simorgh_data = self._search_json( | ||||
|             r'window\s*\.\s*SIMORGH_DATA\s*=', webpage, | ||||
|             'simorgh data', playlist_id, default={}) | ||||
|         if simorgh_data: | ||||
|             done = False | ||||
|             for video_data in traverse_obj(simorgh_data, ( | ||||
|                     'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))): | ||||
|                 model = traverse_obj(video_data, ( | ||||
|                     'model', 'blocks', is_type('aresMedia'), | ||||
|                     'model', 'blocks', is_type('aresMediaMetadata'), | ||||
|                     'model', {dict}, any)) | ||||
|                 if video_data['type'] == 'video': | ||||
|                     entry = parse_model(model) | ||||
|                 else:  # legacyMedia: no duration, subtitles | ||||
|                     block_id, entry = traverse_obj(model, ('blockId', {str})), None | ||||
|                     media_data = traverse_obj(simorgh_data, ( | ||||
|                         'pageData', 'promo', 'media', | ||||
|                         {lambda x: x if x['id'] == block_id else None})) | ||||
|                     formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), { | ||||
|                         'url': ('url', {url_or_none}), | ||||
|                         'ext': ('format', {str}), | ||||
|                         'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), | ||||
|                     })) | ||||
|                     if formats: | ||||
|                         entry = { | ||||
|                             'id': block_id, | ||||
|                             'display_id': playlist_id, | ||||
|                             'formats': formats, | ||||
|                             'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})), | ||||
|                             **traverse_obj(model, { | ||||
|                                 'title': ('title', {str}), | ||||
|                                 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), | ||||
|                                 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), | ||||
|                                 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}), | ||||
|                             }), | ||||
|                         } | ||||
|                         done = True | ||||
|                 if entry: | ||||
|                     entries.append(entry) | ||||
|                 if done: | ||||
|                     break | ||||
|             if entries: | ||||
|                 return self.playlist_result( | ||||
|                     entries, playlist_id, playlist_title, playlist_description) | ||||
| 
 | ||||
|         def extract_all(pattern): | ||||
|             return list(filter(None, map( | ||||
|                 lambda s: self._parse_json(s, playlist_id, fatal=False), | ||||
|                 re.findall(pattern, webpage)))) | ||||
| 
 | ||||
|         # US accessed article with single embedded video (e.g. | ||||
|         # https://www.bbc.com/news/uk-68546268) | ||||
|         next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), | ||||
|                                  ('props', 'pageProps', 'page')) | ||||
|         model = traverse_obj(next_data, ( | ||||
|             ..., 'contents', is_type('video'), | ||||
|             'model', 'blocks', is_type('media'), | ||||
|             'model', 'blocks', is_type('mediaMetadata'), | ||||
|             'model', {dict}, any)) | ||||
|         if model and (entry := parse_model(model)): | ||||
|             if not entry.get('timestamp'): | ||||
|                 entry['timestamp'] = traverse_obj(next_data, ( | ||||
|                     ..., 'contents', is_type('timestamp'), 'model', | ||||
|                     'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) | ||||
|             entries.append(entry) | ||||
|             return self.playlist_result( | ||||
|                 entries, playlist_id, playlist_title, playlist_description) | ||||
| 
 | ||||
|         # Multiple video article (e.g. | ||||
|         # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) | ||||
|         EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 kylegustavo
					kylegustavo