mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[extractor/la7] Improve extractor (#5538)
Authored by: nixxo Closes #5360
This commit is contained in:
		| @@ -2,7 +2,6 @@ import re | ||||
| 
 | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     determine_ext, | ||||
|     float_or_none, | ||||
|     HEADRequest, | ||||
|     int_or_none, | ||||
| @@ -13,13 +12,13 @@ from ..utils import ( | ||||
| 
 | ||||
| class LA7IE(InfoExtractor): | ||||
|     IE_NAME = 'la7.it' | ||||
|     _VALID_URL = r'''(?x)(https?://)?(?: | ||||
|         (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| | ||||
|     _VALID_URL = r'''(?x)https?://(?: | ||||
|         (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video|news)/| | ||||
|         tg\.la7\.it/repliche-tgla7\?id= | ||||
|     )(?P<id>.+)''' | ||||
| 
 | ||||
|     _TESTS = [{ | ||||
|         # 'src' is a plain URL | ||||
|         # single quality video | ||||
|         'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', | ||||
|         'md5': '8b613ffc0c4bf9b9e377169fc19c214c', | ||||
|         'info_dict': { | ||||
| @@ -29,6 +28,20 @@ class LA7IE(InfoExtractor): | ||||
|             'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', | ||||
|             'thumbnail': 're:^https?://.*', | ||||
|             'upload_date': '20151002', | ||||
|             'formats': 'count:4', | ||||
|         }, | ||||
|     }, { | ||||
|         # multiple quality video | ||||
|         'url': 'https://www.la7.it/calcio-femminile/news/il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736', | ||||
|         'md5': 'd2370e78f75e8d1238cb3a0db9a2eda3', | ||||
|         'info_dict': { | ||||
|             'id': 'il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile', | ||||
|             'description': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile', | ||||
|             'thumbnail': 're:^https?://.*', | ||||
|             'upload_date': '20221126', | ||||
|             'formats': 'count:8', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', | ||||
| @@ -39,7 +52,7 @@ class LA7IE(InfoExtractor): | ||||
|     def _generate_mp4_url(self, quality, m3u8_formats): | ||||
|         for f in m3u8_formats: | ||||
|             if f['vcodec'] != 'none' and quality in f['url']: | ||||
|                 http_url = '%s%s.mp4' % (self._HOST, quality) | ||||
|                 http_url = f'{self._HOST}{quality}.mp4' | ||||
| 
 | ||||
|                 urlh = self._request_webpage( | ||||
|                     HEADRequest(http_url), quality, | ||||
| @@ -58,12 +71,13 @@ class LA7IE(InfoExtractor): | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         video_id = self._match_id(url) | ||||
| 
 | ||||
|         if not url.startswith('http'): | ||||
|             url = '%s//%s' % (self.http_scheme(), url) | ||||
| 
 | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|         video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path') | ||||
| 
 | ||||
|         if re.search(r'(?i)(drmsupport\s*:\s*true)\s*', webpage): | ||||
|             self.report_drm(video_id) | ||||
| 
 | ||||
|         video_path = self._search_regex( | ||||
|             r'(/content/[\w/,]+?)\.mp4(?:\.csmil)?/master\.m3u8', webpage, 'video_path') | ||||
| 
 | ||||
|         formats = self._extract_mpd_formats( | ||||
|             f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd', | ||||
| @@ -90,8 +104,7 @@ class LA7IE(InfoExtractor): | ||||
| 
 | ||||
| class LA7PodcastEpisodeIE(InfoExtractor): | ||||
|     IE_NAME = 'la7.it:pod:episode' | ||||
|     _VALID_URL = r'''(?x)(https?://)? | ||||
|         (?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)''' | ||||
|     _VALID_URL = r'https?://(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)' | ||||
| 
 | ||||
|     _TESTS = [{ | ||||
|         'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497', | ||||
| @@ -125,14 +138,15 @@ class LA7PodcastEpisodeIE(InfoExtractor): | ||||
|                 webpage, 'video_id', group='vid') | ||||
| 
 | ||||
|         media_url = self._search_regex( | ||||
|             (r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1', | ||||
|              r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'), | ||||
|             (r'src\s*:\s*([\'"])(?P<url>\S+?mp3.+?)\1', | ||||
|              r'data-podcast\s*=\s*([\'"])(?P<url>\S+?mp3.+?)\1'), | ||||
|             webpage, 'media_url', group='url') | ||||
|         ext = determine_ext(media_url) | ||||
|         formats = [{ | ||||
|             'url': media_url, | ||||
|             'format_id': ext, | ||||
|             'ext': ext, | ||||
|             'format_id': 'http-mp3', | ||||
|             'ext': 'mp3', | ||||
|             'acodec': 'mp3', | ||||
|             'vcodec': 'none', | ||||
|         }] | ||||
| 
 | ||||
|         title = self._html_search_regex( | ||||
| @@ -173,7 +187,7 @@ class LA7PodcastEpisodeIE(InfoExtractor): | ||||
|         # and title is the same as the show_title | ||||
|         # add the date to the title | ||||
|         if date and not date_alt and ppn and ppn.lower() == title.lower(): | ||||
|             title += ' del %s' % date | ||||
|             title = f'{title} del {date}' | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': title, | ||||
| @@ -193,7 +207,7 @@ class LA7PodcastEpisodeIE(InfoExtractor): | ||||
| 
 | ||||
| class LA7PodcastIE(LA7PodcastEpisodeIE):  # XXX: Do not subclass from concrete IE | ||||
|     IE_NAME = 'la7.it:podcast' | ||||
|     _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])' | ||||
|     _VALID_URL = r'https?://(?:www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])' | ||||
| 
 | ||||
|     _TESTS = [{ | ||||
|         'url': 'https://www.la7.it/propagandalive/podcast', | ||||
| @@ -201,7 +215,7 @@ class LA7PodcastIE(LA7PodcastEpisodeIE):  # XXX: Do not subclass from concrete I | ||||
|             'id': 'propagandalive', | ||||
|             'title': "Propaganda Live", | ||||
|         }, | ||||
|         'playlist_count': 10, | ||||
|         'playlist_count_min': 10, | ||||
|     }] | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 nixxo
					nixxo