mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:35:12 +00:00 
			
		
		
		
	[NBC] Enhance embedURL extraction (closes #2549)
This commit is contained in:
		@@ -53,6 +53,7 @@ from youtube_dl.utils import (
 | 
				
			|||||||
    unified_strdate,
 | 
					    unified_strdate,
 | 
				
			||||||
    unsmuggle_url,
 | 
					    unsmuggle_url,
 | 
				
			||||||
    uppercase_escape,
 | 
					    uppercase_escape,
 | 
				
			||||||
 | 
					    lowercase_escape,
 | 
				
			||||||
    url_basename,
 | 
					    url_basename,
 | 
				
			||||||
    urlencode_postdata,
 | 
					    urlencode_postdata,
 | 
				
			||||||
    version_tuple,
 | 
					    version_tuple,
 | 
				
			||||||
@@ -418,6 +419,10 @@ class TestUtil(unittest.TestCase):
 | 
				
			|||||||
        self.assertEqual(uppercase_escape('aä'), 'aä')
 | 
					        self.assertEqual(uppercase_escape('aä'), 'aä')
 | 
				
			||||||
        self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
 | 
					        self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_lowercase_escape(self):
 | 
				
			||||||
 | 
					        self.assertEqual(lowercase_escape('aä'), 'aä')
 | 
				
			||||||
 | 
					        self.assertEqual(lowercase_escape('\\u0026'), '&')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_limit_length(self):
 | 
					    def test_limit_length(self):
 | 
				
			||||||
        self.assertEqual(limit_length(None, 12), None)
 | 
					        self.assertEqual(limit_length(None, 12), None)
 | 
				
			||||||
        self.assertEqual(limit_length('foo', 12), 'foo')
 | 
					        self.assertEqual(limit_length('foo', 12), 'foo')
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -10,6 +10,8 @@ from ..compat import (
 | 
				
			|||||||
from ..utils import (
 | 
					from ..utils import (
 | 
				
			||||||
    ExtractorError,
 | 
					    ExtractorError,
 | 
				
			||||||
    find_xpath_attr,
 | 
					    find_xpath_attr,
 | 
				
			||||||
 | 
					    lowercase_escape,
 | 
				
			||||||
 | 
					    unescapeHTML,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -46,18 +48,23 @@ class NBCIE(InfoExtractor):
 | 
				
			|||||||
                'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
 | 
					                'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            'skip': 'Only works from US',
 | 
					            'skip': 'Only works from US',
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            # This video has expired but with an escaped embedURL
 | 
				
			||||||
 | 
					            'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
 | 
				
			||||||
 | 
					            'skip': 'Expired'
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    ]
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _real_extract(self, url):
 | 
					    def _real_extract(self, url):
 | 
				
			||||||
        video_id = self._match_id(url)
 | 
					        video_id = self._match_id(url)
 | 
				
			||||||
        webpage = self._download_webpage(url, video_id)
 | 
					        webpage = self._download_webpage(url, video_id)
 | 
				
			||||||
        theplatform_url = self._search_regex(
 | 
					        theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
 | 
				
			||||||
            [
 | 
					            [
 | 
				
			||||||
                r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
 | 
					                r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
 | 
				
			||||||
                r'"embedURL"\s*:\s*"([^"]+)"'
 | 
					                r'"embedURL"\s*:\s*"([^"]+)"'
 | 
				
			||||||
            ],
 | 
					            ],
 | 
				
			||||||
            webpage, 'theplatform url').replace('_no_endcard', '')
 | 
					            webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
 | 
				
			||||||
        if theplatform_url.startswith('//'):
 | 
					        if theplatform_url.startswith('//'):
 | 
				
			||||||
            theplatform_url = 'http:' + theplatform_url
 | 
					            theplatform_url = 'http:' + theplatform_url
 | 
				
			||||||
        return self.url_result(theplatform_url)
 | 
					        return self.url_result(theplatform_url)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1486,6 +1486,14 @@ def uppercase_escape(s):
 | 
				
			|||||||
        s)
 | 
					        s)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def lowercase_escape(s):
 | 
				
			||||||
 | 
					    unicode_escape = codecs.getdecoder('unicode_escape')
 | 
				
			||||||
 | 
					    return re.sub(
 | 
				
			||||||
 | 
					        r'\\u[0-9a-fA-F]{4}',
 | 
				
			||||||
 | 
					        lambda m: unicode_escape(m.group(0))[0],
 | 
				
			||||||
 | 
					        s)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def escape_rfc3986(s):
 | 
					def escape_rfc3986(s):
 | 
				
			||||||
    """Escape non-ASCII characters as suggested by RFC 3986"""
 | 
					    """Escape non-ASCII characters as suggested by RFC 3986"""
 | 
				
			||||||
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
 | 
					    if sys.version_info < (3, 0) and isinstance(s, compat_str):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user