1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2026-02-22 16:36:54 +00:00

Merge branch 'master' into youtube-mix-fix

This commit is contained in:
insaneracist
2020-10-31 02:40:11 -07:00
28 changed files with 579 additions and 154 deletions

View File

@@ -1438,6 +1438,13 @@ class AdobePassIE(InfoExtractor):
provider_redirect_page, 'oauth redirect')
self._download_webpage(
oauth_redirect_url, video_id, 'Confirming auto login')
elif 'automatically signed in with' in provider_redirect_page:
# Seems like Comcast is rolling out a new way of automatically signing in customers
oauth_redirect_url = self._html_search_regex(
r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
'oauth redirect (signed)')
# Just need to process the request. No useful data comes back
self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
else:
if '<form name="signin"' in provider_redirect_page:
provider_login_page_res = provider_redirect_page_res

View File

@@ -471,12 +471,17 @@ class BrightcoveNewIE(AdobePassIE):
title = json_data['name'].strip()
formats = []
sources_num = len(json_data.get('sources'))
key_systems_present = 0
for source in json_data.get('sources', []):
container = source.get('container')
ext = mimetype2ext(source.get('type'))
src = source.get('src')
# https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
# https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html
if source.get('key_systems'):
key_systems_present += 1
continue
elif ext == 'ism' or container == 'WVM':
continue
elif ext == 'm3u8' or container == 'M2TS':
if not src:
@@ -533,6 +538,10 @@ class BrightcoveNewIE(AdobePassIE):
'format_id': build_format_id('rtmp'),
})
formats.append(f)
if sources_num == key_systems_present:
raise ExtractorError('This video is DRM protected', expected=True)
if not formats:
# for sonyliv.com DRM protected videos
s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')

View File

@@ -751,6 +751,7 @@ from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE
from .ninenow import NineNowIE
from .nintendo import NintendoIE
from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE
@@ -1037,6 +1038,10 @@ from .sky import (
SkyNewsIE,
SkySportsIE,
)
from .skyitalia import (
SkyArteItaliaIE,
SkyItaliaIE,
)
from .slideshare import SlideshareIE
from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE

View File

@@ -289,7 +289,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
return mgid
def _extract_mgid(self, webpage, url, data_zone=None):
def _extract_mgid(self, webpage, url, title=None, data_zone=None):
try:
# the url can be http://media.mtvnservices.com/fb/{mgid}.swf
# or http://media.mtvnservices.com/{mgid}
@@ -300,7 +300,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
except RegexNotFoundError:
mgid = None
title = self._match_id(url)
if not title:
title = url_basename(url)
try:
window_data = self._parse_json(self._search_regex(
@@ -336,7 +337,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _real_extract(self, url):
title = url_basename(url)
webpage = self._download_webpage(url, title)
mgid = self._extract_mgid(webpage, url)
mgid = self._extract_mgid(webpage, url, title=title)
videos_info = self._get_videos_info(mgid, url=url)
return videos_info

View File

@@ -13,17 +13,16 @@ from ..utils import (
class NetzkinoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
_VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)'
_TEST = {
'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
_TESTS = [{
'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond',
'md5': '92a3f8b76f8d7220acce5377ea5d4873',
'info_dict': {
'id': 'rakete-zum-mond',
'ext': 'mp4',
'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
'comments': 'mincount:3',
'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
'title': 'Rakete zum Mond \u2013 Jules Verne',
'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60',
'upload_date': '20120813',
'thumbnail': r're:https?://.*\.jpg$',
'timestamp': 1344858571,
@@ -32,17 +31,30 @@ class NetzkinoIE(InfoExtractor):
'params': {
'skip_download': 'Download only works from Germany',
}
}
}, {
'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2',
'md5': 'c7728b2dadd04ff6727814847a51ef03',
'info_dict': {
'id': 'dr-jekyll-mrs-hyde-2',
'ext': 'mp4',
'title': 'Dr. Jekyll & Mrs. Hyde 2',
'description': 'md5:c2e9626ebd02de0a794b95407045d186',
'upload_date': '20190130',
'thumbnail': r're:https?://.*\.jpg$',
'timestamp': 1548849437,
'age_limit': 18,
},
'params': {
'skip_download': 'Download only works from Germany',
}
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
category_id = mobj.group('category')
video_id = mobj.group('id')
api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
api_info = self._download_json(api_url, video_id)
info = next(
p for p in api_info['posts'] if p['slug'] == video_id)
api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id
info = self._download_json(api_url, video_id)
custom_fields = info['custom_fields']
production_js = self._download_webpage(
@@ -67,23 +79,12 @@ class NetzkinoIE(InfoExtractor):
} for key, tpl in templates.items()]
self._sort_formats(formats)
comments = [{
'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
'id': c['id'],
'author': c['name'],
'html': c['content'],
'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
} for c in info.get('comments', [])]
return {
'id': video_id,
'formats': formats,
'comments': comments,
'title': info['title'],
'age_limit': int_or_none(custom_fields.get('FSK')[0]),
'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
'description': clean_html(info.get('content')),
'thumbnail': info.get('thumbnail'),
'playlist_title': api_info.get('title'),
'playlist_id': category_id,
}

View File

@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
extract_attributes,
int_or_none,
parse_duration,
@@ -20,22 +21,22 @@ class NewgroundsIE(InfoExtractor):
'info_dict': {
'id': '549479',
'ext': 'mp3',
'title': 'B7 - BusMode',
'title': 'Burn7 - B7 - BusMode',
'uploader': 'Burn7',
'timestamp': 1378878540,
'upload_date': '20130911',
'duration': 143,
},
}, {
'url': 'https://www.newgrounds.com/portal/view/673111',
'md5': '3394735822aab2478c31b1004fe5e5bc',
'url': 'https://www.newgrounds.com/portal/view/1',
'md5': 'fbfb40e2dc765a7e830cb251d370d981',
'info_dict': {
'id': '673111',
'id': '1',
'ext': 'mp4',
'title': 'Dancin',
'uploader': 'Squirrelman82',
'timestamp': 1460256780,
'upload_date': '20160410',
'title': 'Brian-Beaton - Scrotum 1',
'uploader': 'Brian-Beaton',
'timestamp': 955064100,
'upload_date': '20000406',
},
}, {
# source format unavailable, additional mp4 formats
@@ -43,7 +44,7 @@ class NewgroundsIE(InfoExtractor):
'info_dict': {
'id': '689400',
'ext': 'mp4',
'title': 'ZTV News Episode 8',
'title': 'Bennettthesage - ZTV News Episode 8',
'uploader': 'BennettTheSage',
'timestamp': 1487965140,
'upload_date': '20170224',
@@ -55,42 +56,73 @@ class NewgroundsIE(InfoExtractor):
def _real_extract(self, url):
media_id = self._match_id(url)
formats = []
uploader = None
webpage = self._download_webpage(url, media_id)
title = self._html_search_regex(
r'<title>([^>]+)</title>', webpage, 'title')
media_url = self._parse_json(self._search_regex(
r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
media_url_string = self._search_regex(
r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False)
formats = [{
'url': media_url,
'format_id': 'source',
'quality': 1,
}]
if media_url_string:
media_url = self._parse_json(media_url_string, media_id)
formats = [{
'url': media_url,
'format_id': 'source',
'quality': 1,
}]
max_resolution = int_or_none(self._search_regex(
r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
default=None))
if max_resolution:
url_base = media_url.rpartition('.')[0]
for resolution in (360, 720, 1080):
if resolution > max_resolution:
break
formats.append({
'url': '%s.%dp.mp4' % (url_base, resolution),
'format_id': '%dp' % resolution,
'height': resolution,
})
max_resolution = int_or_none(self._search_regex(
r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
default=None))
if max_resolution:
url_base = media_url.rpartition('.')[0]
for resolution in (360, 720, 1080):
if resolution > max_resolution:
break
formats.append({
'url': '%s.%dp.mp4' % (url_base, resolution),
'format_id': '%dp' % resolution,
'height': resolution,
})
else:
video_id = int_or_none(self._search_regex(
r'data-movie-id=\\"([0-9]+)\\"', webpage, ''))
if not video_id:
raise ExtractorError('Could not extract media data')
url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id
headers = {
'Accept': 'application/json',
'Referer': url,
'X-Requested-With': 'XMLHttpRequest'
}
json_video = self._download_json(url_video_data, video_id, headers=headers, fatal=False)
if not json_video:
raise ExtractorError('Could not fetch media data')
uploader = json_video.get('author')
title = json_video.get('title')
media_formats = json_video.get('sources', [])
for media_format in media_formats:
media_sources = media_formats[media_format]
for source in media_sources:
formats.append({
'format_id': media_format,
'quality': int_or_none(media_format[:-1]),
'url': source.get('src')
})
self._check_formats(formats, media_id)
self._sort_formats(formats)
uploader = self._html_search_regex(
(r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>',
r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
fatal=False)
if not uploader:
uploader = self._html_search_regex(
(r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
fatal=False)
timestamp = unified_timestamp(self._html_search_regex(
(r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
@@ -109,6 +141,9 @@ class NewgroundsIE(InfoExtractor):
if '<dd>Song' in webpage:
formats[0]['vcodec'] = 'none'
if uploader:
title = "%s - %s" % (uploader, title)
return {
'id': media_id,
'title': title,

View File

@@ -0,0 +1,167 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
parse_count,
unified_strdate,
unified_timestamp,
remove_end,
determine_ext,
)
import re
class NitterIE(InfoExtractor):
    """Extract the video (or mp4-wrapped GIF) shown on a Nitter status page.

    Only hosts listed in ``INSTANCES`` are matched by ``_VALID_URL``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances
    INSTANCES = ('nitter.net',
                 'nitter.snopyta.org',
                 'nitter.42l.fr',
                 'nitter.nixnet.services',
                 'nitter.13ad.de',
                 'nitter.pussthecat.org',
                 'nitter.mastodont.cat',
                 'nitter.dark.fail',
                 'nitter.tedomum.net',
                 'nitter.cattube.org',
                 'nitter.fdn.fr',
                 'nitter.1d4.us',
                 'nitter.kavin.rocks',
                 'tweet.lambda.dance',
                 'nitter.cc',
                 'nitter.weaponizedhumiliation.com',
                 '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
                 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
                 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion')

    # Alternation of all (regex-escaped) instance host names.
    _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
    _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
    current_instance = INSTANCES[0]  # the test and official instance
    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
                'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': 'https://' + current_instance + '/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
            },
        }, {  # normal video
            'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Le Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': 'https://' + current_instance + '/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711341,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
                'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': 'https://' + current_instance + '/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820982
            },
        },
    ]

    def _real_extract(self, url):
        """Download the status page at *url* and build the info dict.

        The media URL is mandatory (its regex search is fatal); all other
        metadata is best-effort and left as ``None`` when it cannot be found.
        """
        video_id = self._match_id(url)
        parsed_url = compat_urlparse.urlparse(url)
        # Media and thumbnail paths found in the page are prefixed with this.
        base_url = parsed_url.scheme + '://' + parsed_url.netloc

        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        webpage = self._download_webpage(url, video_id)

        video_url = base_url + self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: handle the URL as an m3u8 manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        # Guard against a missing og:description so the tweet-content fallback
        # is actually reachable instead of failing on None.replace().
        og_description = self._og_search_description(webpage)
        title = og_description.replace('\n', ' ') if og_description else None
        if not title:
            title = self._html_search_regex(
                r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')
        description = title

        mobj = re.match(self._VALID_URL, url)
        uploader_id = (
            mobj.group('uploader_id')
            or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False))

        # Always bind uploader_url so the returned dict can reference it.
        uploader_url = base_url + '/' + uploader_id if uploader_id else None

        uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        if uploader:
            title = uploader + ' - ' + title

        view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False))
        like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False))
        repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
        comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'comment count', fatal=False))

        thumbnail_path = (
            self._html_search_meta('og:image', webpage, 'thumbnail url')
            or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))

        thumbnail = None
        thumbnails = []
        if thumbnail_path:
            # if parsed with regex, it should contain this
            thumbnail = remove_end(base_url + thumbnail_path, '%3Asmall')
            # One entry per size suffix appended to the base thumbnail URL.
            thumbnails = [{
                'id': thumbnail_id,
                'url': thumbnail + '%3A' + thumbnail_id,
            } for thumbnail_id in ('thumb', 'small', 'large', 'medium', 'orig')]

        date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
        upload_date = unified_strdate(date)
        timestamp = unified_timestamp(date)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
            'view_count': view_count,
            'like_count': like_count,
            'repost_count': repost_count,
            'comment_count': comment_count,
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }

View File

@@ -0,0 +1,119 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import ExtractorError
class SkyItaliaBaseIE(InfoExtractor):
    """Shared extraction logic for the sky.it family of sites.

    Subclasses are expected to provide ``_VALID_URL`` (with an optional ``id``
    group) and the ``_TOKEN`` sent to the video data endpoint.
    """

    _GET_VIDEO_DATA = 'https://apid.sky.it/vdp/v1/getVideoData?token={token}&caller=sky&rendition=web&id={id}'
    # Quality name (as used in the ``web_<name>_url`` response keys) mapped to
    # the [width, height] reported for that format.
    _RES = {
        'low': [426, 240],
        'med': [640, 360],
        'high': [854, 480],
        'hd': [1280, 720]
    }

    def _extract_video_id(self, url):
        """Download the page at *url* and return the video ID embedded in it.

        Raises ExtractorError when neither known pattern is present.
        """
        webpage = self._download_webpage(url, 'skyitalia')
        # default=None keeps the search non-fatal so the explicit error below
        # is reachable rather than dead code.
        video_id = self._html_search_regex(
            [r'data-videoid=\"(\d+)\"',
             r'http://player\.sky\.it/social\?id=(\d+)\&'],
            webpage, 'video_id', default=None)
        if video_id:
            return video_id
        raise ExtractorError('Video ID not found.')

    def _get_formats(self, video_id, token):
        """Query the video data endpoint and return the info dict for *video_id*."""
        data_url = self._GET_VIDEO_DATA.format(token=token, id=video_id)
        video_data = self._download_json(data_url, video_id)

        formats = []
        for quality, (width, height) in self._RES.items():
            format_url = video_data.get('web_%s_url' % quality)
            # Skip qualities that are absent or present with an empty/null URL.
            if not format_url:
                continue
            formats.append({
                'url': format_url,
                'format_id': quality,
                'width': width,
                'height': height
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': video_data.get('title'),
            'thumbnail': video_data.get('thumb'),
            'formats': formats
        }

    def _real_extract(self, url):
        """Resolve the video ID from the URL, or from the page when the URL has none."""
        video_id = self._match_id(url)
        # The ``id`` group is optional in the subclasses' patterns; accept both
        # a real None and its stringified form for an unmatched group.
        if not video_id or video_id == 'None':
            video_id = self._extract_video_id(url)
        return self._get_formats(video_id, self._TOKEN)
class SkyItaliaIE(SkyItaliaBaseIE):
    """Extractor for the sport, tg24 and video sub-domains of sky.it."""

    IE_NAME = 'sky.it'
    # Token passed to the video data endpoint by the base class.
    _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk'
    # Verbose-mode pattern: the line breaks inside the literal are ignored by
    # the regex engine. The trailing 6-digit ``id`` group is optional.
    _VALID_URL = r'''(?x)https?://
(?P<ie>sport|tg24|video)
\.sky\.it/(?:.+?)
(?P<id>[0-9]{6})?
(?:$|\?)'''
    _TESTS = [
        {
            'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162',
            'md5': '9c03b590b06e5952d8051f0e02b0feca',
            'info_dict': {
                'id': '616162',
                'ext': 'mp4',
                'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere',
                'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg',
            }
        },
        {
            # Article URL without a numeric ID
            'url': 'https://sport.sky.it/motogp/2020/09/18/motogp-gp-emilia-romagna-misano-2020-prove-libere-diretta',
            'md5': '9c03b590b06e5952d8051f0e02b0feca',
            'info_dict': {
                'id': '616162',
                'ext': 'mp4',
                'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere',
                'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg',
            }
        },
        {
            'url': 'https://tg24.sky.it/salute-e-benessere/2020/09/18/coronavirus-vaccino-ue-sanofi',
            'md5': 'caa25e62dadb529bc5e0b078da99f854',
            'info_dict': {
                'id': '615904',
                'ext': 'mp4',
                'title': 'Covid-19, al Buzzi di Milano tamponi drive-in per studenti',
                'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/17/1600351405841_error-coronavirus-al-buzzi-di-milano-tamponi_thumbnail_1.jpg',
            }
        },
        {
            # Same video URL followed by a query string
            'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162?itm_source=parsely-api',
            'only_matching': True,
        },
    ]
class SkyArteItaliaIE(SkyItaliaBaseIE):
    """Extractor for video pages on arte.sky.it."""

    IE_NAME = 'arte.sky.it'
    # Token passed to the video data endpoint by the base class.
    _TOKEN = 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd'
    # The trailing 6-digit ``id`` group is optional.
    _VALID_URL = r'https?://arte\.sky\.it/video/.+?(?P<id>[0-9]{6})?$'
    _TEST = {
        'url': 'https://arte.sky.it/video/federico-fellini-maestri-cinema/',
        'md5': '2f22513a89f45142f2746f878d690647',
        'info_dict': {
            'id': '612888',
            'ext': 'mp4',
            'title': 'I maestri del cinema Federico Felini',
            'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/03/1599146747305_i-maestri-del-cinema-federico-felini_thumbnail_1.jpg',
        }
    }

View File

@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
orderedSet,
@@ -33,27 +34,11 @@ class XTubeIE(InfoExtractor):
'title': 'strange erotica',
'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'duration': 449,
'view_count': int,
'comment_count': int,
'age_limit': 18,
}
}, {
# FLV videos with duplicated formats
'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
'md5': 'a406963eb349dd43692ec54631efd88b',
'info_dict': {
'id': '9299752',
'display_id': 'A-Super-Run-Part-1-YT',
'ext': 'flv',
'title': 'A Super Run - Part 1 (YT)',
'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616',
'uploader': 'tshirtguy59',
'duration': 579,
'view_count': int,
'comment_count': int,
'age_limit': 18,
},
}, {
# new URL schema
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
@@ -89,16 +74,24 @@ class XTubeIE(InfoExtractor):
title, thumbnail, duration = [None] * 3
config = self._parse_json(self._search_regex(
r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config',
default='{}'), video_id, transform_source=js_to_json, fatal=False)
if config:
config = config.get('mainRoll')
if isinstance(config, dict):
title = config.get('title')
thumbnail = config.get('poster')
duration = int_or_none(config.get('duration'))
sources = config.get('sources') or config.get('format')
json_config_string = self._search_regex(
r'playerConf=({.+?}),loaderConf',
webpage, 'config', default=None)
if not json_config_string:
raise ExtractorError("Could not extract video player data")
json_config_string = json_config_string.replace("!0", "true").replace("!1", "false")
config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False)
if not config:
raise ExtractorError("Could not extract video player data")
config = config.get('mainRoll')
if isinstance(config, dict):
title = config.get('title')
thumbnail = config.get('poster')
duration = int_or_none(config.get('duration'))
sources = config.get('sources') or config.get('format')
if not isinstance(sources, dict):
sources = self._parse_json(self._search_regex(

View File

@@ -1375,14 +1375,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': ext,
})
sub_lang_list[lang] = sub_formats
""" if has_live_chat_replay:
if has_live_chat_replay:
sub_lang_list['live_chat'] = [
{
'video_id': video_id,
'ext': 'json',
'protocol': 'youtube_live_chat_replay',
},
] """
]
if not sub_lang_list:
self._downloader.report_warning('video doesn\'t have subtitles')
return {}
@@ -1406,6 +1406,44 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)
def _get_music_metadata_from_yt_initial(self, yt_initial):
    """Collect music track metadata from the parsed initial-data object.

    Walks the metadata rows of every ``videoSecondaryInfoRenderer`` entry and
    maps the 'Album', 'Artist' and 'Song' rows to 'album', 'artist' and
    'track'. A repeated key within one entry is taken as the start of a new
    track.

    Returns a list of dicts (possibly empty), one per detected track.
    """
    key_map = {
        'Album': 'album',
        'Artist': 'artist',
        'Song': 'track'
    }
    music_metadata = []
    contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'])
    if not isinstance(contents, list):
        return music_metadata
    for content in contents:
        if not isinstance(content, dict):
            continue
        videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer'])
        if not isinstance(videoSecondaryInfoRenderer, dict):
            continue
        rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'])
        if not isinstance(rows, list):
            continue
        music_track = {}
        for row in rows:
            metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer'])
            if not isinstance(metadataRowRenderer, dict):
                continue
            key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText'])
            # The value is either plain text or the first run of a runs list.
            value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \
                try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text'])
            # NOTE(review): on Python 2 decoded JSON text may not be ``str``;
            # confirm whether a compat string type is needed here.
            if not isinstance(key, str) or not isinstance(value, str):
                continue
            field = key_map.get(key)
            if field is None:
                continue
            if field in music_track:
                # we've started on a new track
                music_metadata.append(music_track)
                music_track = {}
            music_track[field] = value
        if music_track:
            music_metadata.append(music_track)
    return music_metadata
def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
@@ -2328,6 +2366,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if release_year:
release_year = int(release_year)
yt_initial = self._get_yt_initial_data(video_id, video_webpage)
if yt_initial:
music_metadata = self._get_music_metadata_from_yt_initial(yt_initial)
if len(music_metadata):
album = music_metadata[0].get('album')
artist = music_metadata[0].get('artist')
track = music_metadata[0].get('track')
m_episode = re.search(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)