mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[extractor/nbc] Add NBCStations extractor (#5077)
Closes #4571 Authored by: bashonly
This commit is contained in:
		| @@ -7,14 +7,20 @@ from .theplatform import ThePlatformIE | ||||
| from .adobepass import AdobePassIE | ||||
| from ..compat import compat_urllib_parse_unquote | ||||
| from ..utils import ( | ||||
|     ExtractorError, | ||||
|     int_or_none, | ||||
|     parse_age_limit, | ||||
|     parse_duration, | ||||
|     RegexNotFoundError, | ||||
|     smuggle_url, | ||||
|     str_or_none, | ||||
|     traverse_obj, | ||||
|     try_get, | ||||
|     unified_strdate, | ||||
|     unified_timestamp, | ||||
|     update_url_query, | ||||
|     url_basename, | ||||
|     variadic, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @@ -584,3 +590,169 @@ class NBCOlympicsStreamIE(AdobePassIE): | ||||
|             'formats': formats, | ||||
|             'is_live': is_live, | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
| class NBCStationsIE(InfoExtractor): | ||||
|     _DOMAIN_RE = '|'.join(map(re.escape, ( | ||||
|         'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles', | ||||
|         'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington', | ||||
|         'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra', | ||||
|     ))) | ||||
|     _VALID_URL = rf'https?://(?:www\.)?(?P<site>{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P<id>[^/?#]+)/?(?:$|[#?])' | ||||
| 
 | ||||
|     _TESTS = [{ | ||||
|         'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', | ||||
|         'md5': '462041d91bd762ef5a38b7d85d6dc18f', | ||||
|         'info_dict': { | ||||
|             'id': '2968618', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', | ||||
|             'description': None, | ||||
|             'timestamp': 1661135892, | ||||
|             'upload_date': '20220821', | ||||
|             'uploader': 'NBC 4', | ||||
|             'uploader_id': 'KNBC', | ||||
|             'channel': 'nbclosangeles', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', | ||||
|         'md5': '0917dcf7885be1023a9220630d415f67', | ||||
|         'info_dict': { | ||||
|             'id': '2247002', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Huracán complica que televidente de Tucson reciba reembolso', | ||||
|             'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', | ||||
|             'timestamp': 1660886507, | ||||
|             'upload_date': '20220819', | ||||
|             'uploader': 'Telemundo Arizona', | ||||
|             'uploader_id': 'KTAZ', | ||||
|             'channel': 'telemundoarizona', | ||||
|         }, | ||||
|     }] | ||||
| 
 | ||||
|     _RESOLUTIONS = { | ||||
|         '1080': '1920', | ||||
|         '720': '1280', | ||||
|         '540': '960', | ||||
|         '360': '640', | ||||
|         '234': '416', | ||||
|     } | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         channel, video_id = self._match_valid_url(url).group('site', 'id') | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
| 
 | ||||
|         nbc_data = self._search_json( | ||||
|             r'<script>var\s*nbc\s*=\s*', webpage, 'NBC JSON data', video_id) | ||||
|         pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' | ||||
|         fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) | ||||
|         fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') | ||||
| 
 | ||||
|         video_data = self._parse_json(self._html_search_regex( | ||||
|             r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id) | ||||
|         video_data = variadic(video_data)[0] | ||||
|         video_data.update(self._parse_json(self._html_search_regex( | ||||
|             r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id)) | ||||
| 
 | ||||
|         formats = [] | ||||
| 
 | ||||
|         if video_data.get('mpx_is_livestream') == '1': | ||||
|             live = True | ||||
|             player_id = traverse_obj( | ||||
|                 video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid', | ||||
|                 ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium') | ||||
|             query = { | ||||
|                 'mbr': 'true', | ||||
|                 'assetTypes': 'LegacyRelease', | ||||
|                 'fwsitesection': fw_ssid, | ||||
|                 'fwNetworkID': fw_network_id, | ||||
|                 'pprofile': 'ots_desktop_html', | ||||
|                 'sensitive': 'false', | ||||
|                 'w': '1920', | ||||
|                 'h': '1080', | ||||
|                 'rnd': '1660303', | ||||
|                 'mode': 'LIVE', | ||||
|                 'format': 'SMIL', | ||||
|                 'tracking': 'true', | ||||
|                 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', | ||||
|                 'vpaid': 'script', | ||||
|                 'schema': '2.0', | ||||
|                 'SDK': 'PDK+6.1.3', | ||||
|             } | ||||
|             info = { | ||||
|                 'title': f'{channel} livestream', | ||||
|             } | ||||
| 
 | ||||
|         else: | ||||
|             live = False | ||||
|             player_id = traverse_obj( | ||||
|                 video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high', | ||||
|                 ('video', 'meta', 'mpx_pid'), 'mpx_pid') | ||||
| 
 | ||||
|             date_string = traverse_obj(video_data, 'date_string', 'date_gmt') | ||||
|             if date_string: | ||||
|                 date_string = self._search_regex( | ||||
|                     r'datetime="([^"]+)"', date_string, 'date string', fatal=False) | ||||
|             else: | ||||
|                 date_string = traverse_obj( | ||||
|                     nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'), | ||||
|                     ('dataLayer', 'adobe', 'eVar59')) | ||||
| 
 | ||||
|             video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url') | ||||
|             if video_url: | ||||
|                 height = url_basename(video_url).split('-')[1].split('p')[0] | ||||
|                 formats.append({ | ||||
|                     'url': video_url, | ||||
|                     'ext': 'mp4', | ||||
|                     'width': int_or_none(self._RESOLUTIONS.get(height)), | ||||
|                     'height': int_or_none(height), | ||||
|                     'format_id': f'http-{height}', | ||||
|                 }) | ||||
| 
 | ||||
|             query = { | ||||
|                 'mbr': 'true', | ||||
|                 'assetTypes': 'LegacyRelease', | ||||
|                 'fwsitesection': fw_ssid, | ||||
|                 'fwNetworkID': fw_network_id, | ||||
|                 'format': 'redirect', | ||||
|                 'manifest': 'm3u', | ||||
|                 'Tracking': 'true', | ||||
|                 'Embedded': 'true', | ||||
|                 'formats': 'MPEG4', | ||||
|             } | ||||
|             info = { | ||||
|                 'title': video_data.get('title') or traverse_obj( | ||||
|                     nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'), | ||||
|                     ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')), | ||||
|                 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'), | ||||
|                 'upload_date': str_or_none(unified_strdate(date_string)), | ||||
|                 'timestamp': int_or_none(unified_timestamp(date_string)), | ||||
|             } | ||||
| 
 | ||||
|         if not player_id: | ||||
|             raise ExtractorError( | ||||
|                 'No video player ID or livestream player ID found in webpage', expected=True) | ||||
| 
 | ||||
|         headers = {'Origin': f'https://www.{channel}.com'} | ||||
|         manifest, urlh = self._download_webpage_handle( | ||||
|             f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, | ||||
|             headers=headers, query=query, note='Downloading manifest') | ||||
|         if live: | ||||
|             manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL') | ||||
|         else: | ||||
|             manifest_url = urlh.geturl() | ||||
| 
 | ||||
|         formats.extend(self._extract_m3u8_formats( | ||||
|             manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls', | ||||
|             fatal=live, live=live, errnote='No HLS formats found')) | ||||
|         self._sort_formats(formats) | ||||
| 
 | ||||
|         return { | ||||
|             'id': str_or_none(video_id), | ||||
|             'channel': channel, | ||||
|             'uploader': str_or_none(nbc_data.get('on_air_name')), | ||||
|             'uploader_id': str_or_none(nbc_data.get('callLetters')), | ||||
|             'formats': formats, | ||||
|             'is_live': live, | ||||
|             **info, | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 bashonly
					bashonly