1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-06-27 17:08:32 +00:00
yt-dlp/yt_dlp/extractor/pbs.py
2025-05-26 18:48:11 -04:00

850 lines
43 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from .common import InfoExtractor
from ..utils import (
US_RATINGS,
ExtractorError,
determine_ext,
float_or_none,
int_or_none,
js_to_json,
orderedSet,
strip_jsonp,
strip_or_none,
traverse_obj,
unified_strdate,
url_or_none,
)
class PBSIE(InfoExtractor):
_STATIONS = (
(r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/
(r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/
(r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/
(r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org
(r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'), # http://www.wnpt.org
(r'video\.wfsu\.org', 'WFSU-TV (WFSU)'), # http://wfsu.org/
(r'video\.wsre\.org', 'WSRE (WSRE)'), # http://www.wsre.org
(r'video\.wtcitv\.org', 'WTCI (WTCI)'), # http://www.wtcitv.org
(r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'), # http://pba.org/
(r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'), # http://alaskapublic.org/kakm
# (r'kuac\.org', 'KUAC (KUAC)'), # http://kuac.org/kuac-tv/
# (r'ktoo\.org', '360 North (KTOO)'), # http://www.ktoo.org/
# (r'azpm\.org', 'KUAT 6 (KUAT)'), # http://www.azpm.org/
(r'video\.azpbs\.org', 'Arizona PBS (KAET)'), # http://www.azpbs.org
(r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'), # http://www.newmexicopbs.org/
(r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'), # http://vegaspbs.org/
(r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'), # http://www.aetn.org/
(r'video\.ket\.org', 'KET (WKLE)'), # http://www.ket.org/
(r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'), # http://www.wkno.org/
(r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'), # http://www.lpb.org/
(r'videos\.oeta\.tv', 'OETA (KETA)'), # http://www.oeta.tv
(r'video\.optv\.org', 'Ozarks Public Television (KOZK)'), # http://www.optv.org/
(r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'), # http://www.wsiu.org/
(r'video\.keet\.org', 'KEET TV (KEET)'), # http://www.keet.org
(r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'), # http://kixe.org/
(r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'), # http://www.kpbs.org/
(r'video\.kqed\.org', 'KQED (KQED)'), # http://www.kqed.org
(r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'), # http://www.kvie.org
(r'(?:video\.|www\.)pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'), # http://www.pbssocal.org/
(r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'), # http://www.valleypbs.org/
(r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'), # http://cptv.org
(r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'), # http://www.knpb.org/
(r'video\.soptv\.org', 'SOPTV (KSYS)'), # http://www.soptv.org
# (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'), # http://www.klcs.org
# (r'krcb\.org', 'KRCB Television & Radio (KRCB)'), # http://www.krcb.org
# (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'), # http://kvcr.org
(r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'), # http://www.rmpbs.org
(r'video\.kenw\.org', 'KENW-TV3 (KENW)'), # http://www.kenw.org
(r'video\.kued\.org', 'KUED Channel 7 (KUED)'), # http://www.kued.org
(r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'), # http://www.wyomingpbs.org
(r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'), # http://www.cpt12.org/
(r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'), # http://www.kbyutv.org/
(r'(?:video\.|www\.)thirteen\.org', 'Thirteen/WNET New York (WNET)'), # http://www.thirteen.org
(r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'), # http://wgbh.org
(r'video\.wgby\.org', 'WGBY (WGBY)'), # http://www.wgby.org
(r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'), # http://www.njtvonline.org/
# (r'ripbs\.org', 'Rhode Island PBS (WSBE)'), # http://www.ripbs.org/home/
(r'watch\.wliw\.org', 'WLIW21 (WLIW)'), # http://www.wliw.org/
(r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'), # http://www.mpt.org
(r'watch\.weta\.org', 'WETA Television and Radio (WETA)'), # http://www.weta.org
(r'video\.whyy\.org', 'WHYY (WHYY)'), # http://www.whyy.org
(r'video\.wlvt\.org', 'PBS 39 (WLVT)'), # http://www.wlvt.org/
(r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! (WVPT)'), # http://www.wvpt.net
(r'video\.whut\.org', 'Howard University Television (WHUT)'), # http://www.whut.org
(r'video\.wedu\.org', 'WEDU PBS (WEDU)'), # http://www.wedu.org
(r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'), # http://www.wgcu.org/
# (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'), # http://www.wjct.org
(r'video\.wpbt2\.org', 'WPBT2 (WPBT)'), # http://www.wpbt2.org
(r'video\.wucftv\.org', 'WUCF TV (WUCF)'), # http://wucftv.org
(r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'), # http://www.wuft.org
(r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'), # http://www.wxel.org/home/
(r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'), # http://www.wlrn.org/
(r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'), # http://wusf.org/
(r'video\.scetv\.org', 'ETV (WRLK)'), # http://www.scetv.org
(r'video\.unctv\.org', 'UNC-TV (WUNC)'), # http://www.unctv.org/
# (r'pbsguam\.org', 'PBS Guam (KGTF)'), # http://www.pbsguam.org/
(r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'), # http://www.pbshawaii.org/
(r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'), # http://idahoptv.org
(r'video\.ksps\.org', 'KSPS (KSPS)'), # http://www.ksps.org/home/
(r'watch\.opb\.org', 'OPB (KOPB)'), # http://www.opb.org
(r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'), # http://www.kwsu.org
(r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'), # http://will.illinois.edu/
(r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'), # http://www.wsec.tv
(r'video\.wttw\.com', 'WTTW11 (WTTW)'), # http://www.wttw.com/
# (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'), # http://www.wtvp.org/
(r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'), # http://www.iptv.org/
(r'video\.ninenet\.org', 'Nine Network (KETC)'), # http://www.ninenet.org
(r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'), # http://wfwa.org/
(r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'), # http://www.wfyi.org
(r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'), # http://www.mptv.org
(r'video\.wnin\.org', 'WNIN (WNIN)'), # http://www.wnin.org/
(r'video\.wnit\.org', 'WNIT Public Television (WNIT)'), # http://www.wnit.org/
(r'video\.wpt\.org', 'WPT (WPNE)'), # http://www.wpt.org/
(r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'), # http://wvut.org/
(r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'), # http://www.weiu.net
(r'video\.wqpt\.org', 'WQPT-TV (WQPT)'), # http://www.wqpt.org
(r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'), # http://www.wycc.org
# (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'), # http://lakeshorepublicmedia.org/
(r'video\.wipb\.org', 'WIPB-TV (WIPB)'), # http://wipb.org
(r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'), # http://indianapublicmedia.org/tv/
(r'watch\.cetconnect\.org', 'CET (WCET)'), # http://www.cetconnect.org
(r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'), # http://www.thinktv.org
(r'video\.wbgu\.org', 'WBGU-TV (WBGU)'), # http://wbgu.org
(r'video\.wgvu\.org', 'WGVU TV (WGVU)'), # http://www.wgvu.org/
(r'video\.netnebraska\.org', 'NET1 (KUON)'), # http://netnebraska.org
(r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'), # http://www.pioneer.org
(r'watch\.sdpb\.org', 'SDPB Television (KUSD)'), # http://www.sdpb.org
(r'video\.tpt\.org', 'TPT (KTCA)'), # http://www.tpt.org
(r'watch\.ksmq\.org', 'KSMQ (KSMQ)'), # http://www.ksmq.org/
(r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'), # http://www.kpts.org/
(r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'), # http://ktwu.org
# (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'), # http://www.shptv.org
# (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'), # http://kcpt.org/
# (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'), # http://www.blueridgepbs.org/
(r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'), # http://easttennesseepbs.org
(r'video\.wcte\.tv', 'WCTE-TV (WCTE)'), # http://www.wcte.org
(r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'), # http://wljt.org/
(r'video\.wosu\.org', 'WOSU TV (WOSU)'), # http://wosu.org/
(r'video\.woub\.org', 'WOUB/WOUC (WOUB)'), # http://woub.org/tv/index.php?section=5
(r'video\.wvpublic\.org', 'WVPB (WVPB)'), # http://wvpublic.org/
(r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'), # http://www.wkyupbs.org
# (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'), # http://www.wyes.org
(r'video\.kera\.org', 'KERA 13 (KERA)'), # http://www.kera.org/
(r'video\.mpbn\.net', 'MPBN (WCBB)'), # http://www.mpbn.net/
(r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'), # http://www.mountainlake.org/
(r'video\.nhptv\.org', 'NHPTV (WENH)'), # http://nhptv.org/
(r'video\.vpt\.org', 'Vermont PBS (WETK)'), # http://www.vpt.org
(r'video\.witf\.org', 'witf (WITF)'), # http://www.witf.org
(r'watch\.wqed\.org', 'WQED Multimedia (WQED)'), # http://www.wqed.org/
(r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'), # http://www.wmht.org/home/
(r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'), # http://www.deltabroadcasting.org
(r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'), # http://www.dptv.org/
(r'video\.wcmu\.org', 'CMU Public Television (WCMU)'), # http://www.wcmu.org
(r'video\.wkar\.org', 'WKAR-TV (WKAR)'), # http://wkar.org/
(r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'), # http://wnmutv.nmu.edu
(r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'), # http://www.wdse.org/
(r'video\.wgte\.org', 'WGTE TV (WGTE)'), # http://www.wgte.org
(r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'), # http://www.lakelandptv.org
# (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'), # http://www.prairiepublic.org/
(r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'), # http://www.kmos.org/
(r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'), # http://montanapbs.org
(r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'), # http://www.krwg.org
(r'video\.kacvtv\.org', 'KACV (KACV)'), # http://www.panhandlepbs.org/home/
(r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'), # www.kcostv.org
(r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'), # http://www.wcny.org
(r'video\.wned\.org', 'WNED (WNED)'), # http://www.wned.org/
(r'watch\.wpbstv\.org', 'WPBS (WPBS)'), # http://www.wpbstv.org
(r'video\.wskg\.org', 'WSKG Public TV (WSKG)'), # http://wskg.org
(r'video\.wxxi\.org', 'WXXI (WXXI)'), # http://wxxi.org
(r'video\.wpsu\.org', 'WPSU (WPSU)'), # http://www.wpsu.org
# (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'), # http://www.wqln.org
(r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'), # http://www.wvia.org/
(r'video\.wtvi\.org', 'WTVI (WTVI)'), # http://www.wtvi.org/
# (r'whro\.org', 'WHRO (WHRO)'), # http://whro.org
(r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'), # http://www.WesternReservePublicMedia.org/
(r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'), # http://www.wviz.org/
(r'video\.kcts9\.org', 'KCTS 9 (KCTS)'), # http://kcts9.org/
(r'video\.basinpbs\.org', 'Basin PBS (KPBT)'), # http://www.basinpbs.org
(r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'), # http://www.houstonpublicmedia.org/
# (r'tamu\.edu', 'KAMU - TV (KAMU)'), # http://KAMU.tamu.edu
# (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'), # http://www.kedt.org
(r'video\.klrn\.org', 'KLRN (KLRN)'), # http://www.klrn.org
(r'video\.klru\.tv', 'KLRU (KLRU)'), # http://www.klru.org
# (r'kmbh\.org', 'KMBH-TV (KMBH)'), # http://www.kmbh.org
# (r'knct\.org', 'KNCT (KNCT)'), # http://www.knct.org
# (r'ktxt\.org', 'KTTZ-TV (KTXT)'), # http://www.ktxt.org
(r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'), # http://www.wtjx.org/
(r'video\.ideastations\.org', 'WCVE PBS (WCVE)'), # http://ideastations.org/
(r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'), # http://kbtc.org
)
IE_NAME = 'pbs'
IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS))[1]))
_VALID_URL = r'''(?x)https?://
(?:
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/?#]+) |
# Direct video URL, or article with embedded player
(?:{})/(?:
(?:(?:vir|port)alplayer|video)/(?P<id>[0-9a-zA-Z-]+)(?:[?/#]|$) |
(?:[^/?#]+/){{1,5}}(?P<presumptive_id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])
)
)
'''.format('|'.join(next(zip(*_STATIONS))))
_GEO_COUNTRIES = ['US']
_TESTS = [
{
'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
'md5': '173dc391afd361fa72eab5d3d918968d',
'info_dict': {
'id': '2365006249',
'ext': 'mp4',
'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
'description': 'md5:31b664af3c65fd07fa460d306b837d00',
'duration': 3190,
},
'skip': 'dead URL',
},
{
'url': 'https://www.thirteen.org/programs/the-woodwrights-shop/carving-away-with-mary-may-tioglz/',
'info_dict': {
'id': '3004803331',
'ext': 'mp4',
'title': "The Woodwright's Shop - Carving Away with Mary May",
'description': 'md5:7cbaaaa8b9bcc78bd8f0e31911644e28',
'duration': 1606,
'display_id': 'carving-away-with-mary-may-tioglz',
'chapters': [],
'thumbnail': 'https://image.pbs.org/video-assets/NcnTxNl-asset-mezzanine-16x9-K0Keoyv.jpg',
},
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
'md5': '372b12b670070de39438b946474df92f',
'info_dict': {
'id': '2365297690',
'ext': 'mp4',
'title': 'FRONTLINE - Losing Iraq',
'description': 'md5:5979a4d069b157f622d02bff62fbe654',
'duration': 5050,
'chapters': [
{'start_time': 0.0, 'end_time': 1234.0, 'title': 'After Saddam, Chaos'},
{'start_time': 1233.0, 'end_time': 1719.0, 'title': 'The Insurgency Takes Root'},
{'start_time': 1718.0, 'end_time': 2461.0, 'title': 'A Light Footprint'},
{'start_time': 2460.0, 'end_time': 3589.0, 'title': 'The Surge '},
{'start_time': 3588.0, 'end_time': 4355.0, 'title': 'The Withdrawal '},
{'start_time': 4354.0, 'end_time': 5051.0, 'title': 'ISIS on the March '},
],
'display_id': 'losing-iraq',
'thumbnail': 'https://image.pbs.org/video-assets/pbs/frontline/138098/images/mezzanine_401.jpg',
},
},
{
'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
'info_dict': {
'id': '2201174722',
'ext': 'mp4',
'title': 'PBS News Hour - Cyber Schools Gain Popularity, but Quality Questions Persist',
'description': 'md5:e145f95a1e667414ba7e1a4ba9272a91',
'duration': 801,
'display_id': 'education-jan-june12-cyberschools_02-23',
'thumbnail': r're:^https?://.*\.jpg$',
'chapters': list,
},
},
{
'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/',
'md5': 'c62859342be2a0358d6c9eb306595978',
'info_dict': {
'id': '2365297708',
'ext': 'mp4',
'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b',
'duration': 6559,
'thumbnail': r're:^https?://.*\.jpg$',
},
'skip': 'Unavailable video',
},
{
'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
'md5': '908f3e5473a693b266b84e25e1cf9703',
'info_dict': {
'id': '2365160389',
'display_id': 'killer-typhoon',
'ext': 'mp4',
'description': 'md5:c741d14e979fc53228c575894094f157',
'title': 'NOVA - Killer Typhoon',
'duration': 3172,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140122',
'age_limit': 10,
},
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
'info_dict': {
'id': 'united-states-of-secrets',
},
'playlist_count': 2,
},
{
'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
'info_dict': {
'id': 'great-war',
},
'playlist_count': 3,
},
{
'url': 'https://www.pbs.org/wgbh/americanexperience/films/mr-polaroid/',
'info_dict': {
'id': '3100967977',
'display_id': '3100967977',
'ext': 'mp4',
'title': 'American Experience - Mr. Polaroid',
'duration': 3140,
'thumbnail': r're:^https?://.*\.jpg$',
'chapters': list,
},
'params': {
'skip_download': True, # requires ffmpeg
},
},
{
'url': 'http://www.pbs.org/video/2365245528/',
'md5': '115223d41bd55cda8ae5cd5ed4e11497',
'info_dict': {
'id': '2365245528',
'display_id': '2365245528',
'ext': 'mp4',
'title': 'FRONTLINE - United States of Secrets (Part One)',
'duration': 6851,
'thumbnail': r're:^https?://.*\.jpg$',
'chapters': [
{'start_time': 0.0, 'end_time': 623.0, 'title': 'The Snowden Leak '},
{'start_time': 622.0, 'end_time': 1408.0, 'title': '"The Program" Begins '},
{'start_time': 1407.0, 'end_time': 1986.0, 'title': 'ThinThread '},
{'start_time': 1985.0, 'end_time': 2437.0, 'title': '"They Have Gone Rogue" '},
{'start_time': 2436.0, 'end_time': 3626.0, 'title': 'Concerns at the Justice Department '},
{'start_time': 3625.0, 'end_time': 4800.0, 'title': 'Page One News '},
{'start_time': 4799.0, 'end_time': 5906.0, 'title': 'A Hunt for the Leaker '},
{'start_time': 5905.0, 'end_time': 6852.0, 'title': 'Obama Inherits "The Program" '},
],
},
'params': {
'skip_download': True, # video unavailable
},
},
{
# Video embedded in iframe containing angle brackets as attribute's value (e.g.
# "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
# https://github.com/ytdl-org/youtube-dl/issues/7059)
'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
'md5': '59b0ef5009f9ac8a319cc5efebcd865e',
'info_dict': {
'id': '2365546844',
'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
'ext': 'mp4',
'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
'duration': 1480,
'thumbnail': r're:^https?://.*\.jpg$',
},
'skip': 'Video unavailable',
},
{
'url': 'https://www.pbs.org/food/stories/tips-tricks-make-medieval-meat-roasting-pit',
'md5': 'c765f6b407b0b24de54bc622f2fed040',
'info_dict': {
'id': '2365807145',
'display_id': 'tips-tricks-make-medieval-meat-roasting-pit',
'ext': 'mp4',
'title': 'ChefSteps - Tips & Tricks: Make a Medieval Meat-Roasting Pit',
'description': 'md5:5f998c530823f7a0617c86361391e0b5',
'duration': 117,
'thumbnail': r're:^https?://.*\.jpg$',
'chapters': list,
},
},
{
# Frontline video embedded via flp2012.js
'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists',
'info_dict': {
'id': '2070868960',
'display_id': 'the-atomic-artists',
'ext': 'mp4',
'title': 'FRONTLINE - The Atomic Artists',
'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
'duration': 723,
'thumbnail': r're:^https?://.*\.jpg$',
'chapters': list,
},
'params': {
'skip_download': True, # requires ffmpeg
},
},
{
# Serves hd only via wigget/partnerplayer page
'url': 'http://www.pbs.org/video/2365641075/',
'md5': '2b138265167346172b771d65eea1bf9a',
'info_dict': {
'id': '2365641075',
'ext': 'mp4',
'title': 'FRONTLINE - Netanyahu at War',
'duration': 6852,
'thumbnail': r're:^https?://.*\.jpg$',
'formats': 'mincount:7',
'chapters': [
{'start_time': 0.0, 'end_time': 1212.0, 'title': 'Soldier, Pundit, Envoy'},
{'start_time': 1211.0, 'end_time': 2380.0, 'title': 'The Rabin Assassination'},
{'start_time': 2379.0, 'end_time': 3594.0, 'title': 'Netanyahu, Clinton and Obama'},
{'start_time': 3593.0, 'end_time': 4780.0, 'title': 'The Relationship Sours'},
{'start_time': 4779.0, 'end_time': 6083.0, 'title': 'Netanyahu and the 2012 Elections'},
{'start_time': 6082.0, 'end_time': 6853.0, 'title': '"A Very, Very Bad Deal"'},
],
},
},
{
# https://github.com/ytdl-org/youtube-dl/issues/13801
'url': 'https://www.pbs.org/video/e25-thank-you-roger-cook-ask-this-old-house-ZKzAfs/',
'md5': '78d7e9cf38cfb509a2b5ce461c4f400b',
'info_dict': {
'id': 'e25-thank-you-roger-cook-ask-this-old-house-ZKzAfs',
'ext': 'mp4',
'title': 'Ask This Old House - E25 | Thank You, Roger Cook | Ask This Old House',
'duration': 1422,
'thumbnail': r're:^https?://.*\.jpg$',
'chapters': [
{'start_time': 0.0, 'end_time': 45.0, 'title': 'Coming Up on Ask This Old House'},
{'start_time': 45.09, 'end_time': 502.09000000000003, 'title': 'Roger Cooks Secret to Hard Work and Teaching'},
{'start_time': 502.03, 'end_time': 884.03, 'title': 'Getting Dirty, Loving Nature, Doing It Right'},
{'start_time': 884.12, 'end_time': 1422.12, 'title': 'How Roger Cook Built His Legacy'},
],
},
},
{
'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/',
'info_dict': {
'id': '2365936247',
'ext': 'mp4',
'title': 'Antiques Roadshow - Indianapolis, Hour 2',
'description': 'md5:524b32249db55663e7231b6b8d1671a2',
'duration': 3180,
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
'skip_download': True,
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{
'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
'info_dict': {
'id': '3007193718',
'ext': 'mp4',
'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
'description': 'md5:37efbac85e0c09b009586523ec143652',
'duration': 6292,
'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
'display_id': 'victoria-s2-e1',
'chapters': [
{'start_time': 0.0, 'end_time': 510.0, 'title': '1.'},
{'start_time': 510.04, 'end_time': 1416.04, 'title': '2.'},
{'start_time': 1416.048, 'end_time': 4465.048, 'title': '3.'},
{'start_time': 4465.056, 'end_time': 5074.056, 'title': '4.'},
{'start_time': 5074.0, 'end_time': 6292.0, 'title': '5.'},
],
},
'params': {
'skip_download': True,
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{
'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
'info_dict': {
'id': '3011407934',
'ext': 'mp4',
'title': 'Stories from the Stage - Road Trip',
'duration': 1619,
'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
'display_id': 'tOz9tM5ljOXQqIIWke53UA==',
'chapters': list,
},
'params': {
'skip_download': True,
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{
'url': 'https://www.pbssocal.org/shows/newshour/clip/capehart-johnson-1715984001',
'info_dict': {
'id': '3091549094',
'ext': 'mp4',
'title': 'PBS News Hour - Capehart and Johnson on the unusual Biden-Trump debate plans',
'description': 'Capehart and Johnson on how the Biden-Trump debates could shape the campaign season',
'display_id': 'capehart-johnson-1715984001',
'duration': 593,
'thumbnail': 'https://image.pbs.org/video-assets/mF3oSVn-asset-mezzanine-16x9-QeXjXPy.jpg',
'chapters': [],
},
},
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
},
{
'url': 'http://watch.knpb.org/video/2365616055/',
'only_matching': True,
},
{
'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
'only_matching': True,
},
]
_ERRORS = {
101: 'We\'re sorry, but this video is not yet available.',
403: 'We\'re sorry, but this video is not available in your region due to right restrictions.',
404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.',
410: 'This video has expired and is no longer available for online streaming.',
}
def _real_initialize(self):
cookie = (self._download_json(
'http://localization.services.pbs.org/localize/auto/cookie/',
None, headers=self.geo_verification_headers(), fatal=False) or {}).get('cookie')
if cookie:
station = self._search_regex(r'#?s=\["([^"]+)"', cookie, 'station')
if station:
self._set_cookie('.pbs.org', 'pbsol.station', station)
def _extract_webpage(self, url):
mobj = self._match_valid_url(url)
description = None
presumptive_id = mobj.group('presumptive_id')
display_id = presumptive_id
if presumptive_id:
webpage = self._download_webpage(url, display_id)
description = strip_or_none(self._og_search_description(
webpage, default=None) or self._html_search_meta(
'description', webpage, default=None))
upload_date = unified_strdate(self._search_regex(
r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
webpage, 'upload date', default=None))
# tabbed frontline videos
MULTI_PART_REGEXES = (
r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
)
for p in MULTI_PART_REGEXES:
tabbed_videos = orderedSet(re.findall(p, webpage))
if tabbed_videos:
return tabbed_videos, presumptive_id, upload_date, description
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
r'class="coveplayerid">([^<]+)<', # coveplayer
r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
r'\sclass="passportcoveplayer"[^>]*\sdata-media="(\d+)', # https://www.thirteen.org/programs/the-woodwrights-shop/who-wrote-the-book-of-sloyd-fggvvq/
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
r'\bhttps?://player\.pbs\.org/[\w-]+player/(\d+)', # last pattern to avoid false positives
]
media_id = self._search_regex(
MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
if media_id:
return media_id, presumptive_id, upload_date, description
# Frontline video embedded via flp
video_id = self._search_regex(
r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
if video_id:
# pkg_id calculation is reverse engineered from
# http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js
prg_id = self._search_regex(
r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:]
if 'q' in prg_id:
prg_id = prg_id.split('q')[1]
prg_id = int(prg_id, 16)
getdir = self._download_json(
'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
presumptive_id, 'Downloading getdir JSON',
transform_source=strip_jsonp)
return getdir['mid'], presumptive_id, upload_date, description
for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
url = self._search_regex(
r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe,
'player URL', default=None, group='url')
if url:
break
if not url:
url = self._og_search_url(webpage)
mobj = re.match(
self._VALID_URL, self._proto_relative_url(url.strip()))
player_id = mobj.group('player_id')
if not display_id:
display_id = player_id
if player_id:
player_page = self._download_webpage(
url, display_id, note='Downloading player page',
errnote='Could not download player page')
video_id = self._search_regex(
r'<div\s+id=["\']video_(\d+)', player_page, 'video ID',
default=None)
if not video_id:
video_info = self._extract_video_data(
player_page, 'video data', display_id)
video_id = str(
video_info.get('id') or video_info['contentID'])
else:
video_id = mobj.group('id')
display_id = video_id
return video_id, display_id, None, description
def _extract_video_data(self, string, name, video_id, fatal=True):
return self._parse_json(
self._search_regex(
[r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
r'window\.videoBridge\s*=\s*({.+?});'],
string, name, default='{}'),
video_id, transform_source=js_to_json, fatal=fatal)
def _real_extract(self, url):
video_id, display_id, upload_date, description = self._extract_webpage(url)
if isinstance(video_id, list):
entries = [self.url_result(
f'http://video.pbs.org/video/{vid_id}', 'PBS', vid_id)
for vid_id in video_id]
return self.playlist_result(entries, display_id)
info = {}
redirects = []
redirect_urls = set()
def extract_redirect_urls(info):
for encoding_name in ('recommended_encoding', 'alternate_encoding'):
redirect = info.get(encoding_name)
if not redirect:
continue
redirect_url = redirect.get('url')
if redirect_url and redirect_url not in redirect_urls:
redirects.append(redirect)
redirect_urls.add(redirect_url)
encodings = info.get('encodings')
if isinstance(encodings, list):
for encoding in encodings:
encoding_url = url_or_none(encoding)
if encoding_url and encoding_url not in redirect_urls:
redirects.append({'url': encoding_url})
redirect_urls.add(encoding_url)
chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
f'http://player.pbs.org/{page}/{video_id}',
display_id, f'Downloading {page} page', fatal=False)
if player:
video_info = self._extract_video_data(
player, f'{page} video data', display_id, fatal=False)
if video_info:
extract_redirect_urls(video_info)
if not info:
info = video_info
if not chapters:
raw_chapters = video_info.get('chapters') or []
if not raw_chapters:
for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
if not chapter:
continue
raw_chapters.append(chapter)
for chapter in raw_chapters:
start_time = float_or_none(chapter.get('start_time'), 1000)
duration = float_or_none(chapter.get('duration'), 1000)
if start_time is None or duration is None:
continue
chapters.append({
'start_time': start_time,
'end_time': start_time + duration,
'title': chapter.get('title'),
})
formats = []
http_url = None
hls_subs = {}
for num, redirect in enumerate(redirects):
redirect_id = redirect.get('eeid')
redirect_info = self._download_json(
'{}?format=json'.format(redirect['url']), display_id,
'Downloading %s video url info' % (redirect_id or num),
headers=self.geo_verification_headers())
if redirect_info['status'] == 'error':
message = self._ERRORS.get(
redirect_info['http_code'], redirect_info['message'])
if redirect_info['http_code'] == 403:
self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
f'{self.IE_NAME} said: {message}', expected=True)
format_url = redirect_info.get('url')
if not format_url:
continue
if determine_ext(format_url) == 'm3u8':
hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
format_url, display_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(hls_formats)
else:
formats.append({
'url': format_url,
'format_id': redirect_id,
})
if re.search(r'^https?://.*(?:\d+k|baseline)', format_url):
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None)
# Lower qualities (150k and 192k) are not available as HTTP formats (see [1]),
# we won't try extracting them.
# Since summer 2016 higher quality formats (4500k and 6500k) are also available
# albeit they are not documented in [2].
# 1. https://github.com/ytdl-org/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656
# 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
if not bitrate or int(bitrate) < 400:
continue
f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url)
# This may produce invalid links sometimes (e.g.
# http://www.pbs.org/wgbh/frontline/film/suicide-plan)
if not self._is_valid_url(f_url, display_id, f'http-{bitrate}k video'):
continue
f = m3u8_format.copy()
f.update({
'url': f_url,
'format_id': m3u8_format['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
formats.append(f)
for f in formats:
if (f.get('format_note') or '').endswith(' AD'): # Audio description
f['language_preference'] = -10
rating_str = info.get('rating')
if rating_str is not None:
rating_str = rating_str.rpartition('-')[2]
age_limit = US_RATINGS.get(rating_str)
subtitles = {}
captions = info.get('cc') or {}
for caption_url in captions.values():
subtitles.setdefault('en', []).append({
'url': caption_url,
})
subtitles = self._merge_subtitles(subtitles, hls_subs)
# info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
# Try turning it to 'program - title' naming scheme if possible
alt_title = info.get('program', {}).get('title')
if alt_title:
info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title'])
description = info.get('description') or info.get(
'program', {}).get('description') or description
return {
'id': video_id,
'display_id': display_id,
'title': info['title'],
'description': description,
'thumbnail': info.get('image_url'),
'duration': int_or_none(info.get('duration')),
'age_limit': age_limit,
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
'chapters': chapters,
}
class PBSKidsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://pbskids.org/video/molly-of-denali/3030407927',
'md5': '1ded20a017cc6b53446238f1804ce4c7',
'info_dict': {
'id': '3030407927',
'title': 'Bird in the Hand/Bye-Bye Birdie',
'channel': 'molly-of-denali',
'duration': 1540,
'ext': 'mp4',
'series': 'Molly of Denali',
'description': 'md5:d006b2211633685d8ebc8d03b6d5611e',
'categories': ['Episode'],
'upload_date': '20190718',
},
},
{
'url': 'https://pbskids.org/video/plum-landing/2365205059',
'md5': '92e5d189851a64ae1d0237a965be71f5',
'info_dict': {
'id': '2365205059',
'title': 'Cooper\'s Favorite Place in Nature',
'channel': 'plum-landing',
'duration': 67,
'ext': 'mp4',
'series': 'Plum Landing',
'description': 'md5:657e5fc4356a84ead1c061eb280ff05d',
'categories': ['Episode'],
'upload_date': '20140302',
},
},
]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id)
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4')
return {
'id': video_id,
'formats': formats,
'subtitles': subtitles,
**traverse_obj(meta, {
'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}),
'channel': ('show_slug', {str}),
'description': ('video_obj', 'description', {str}),
'duration': ('video_obj', 'duration', {int_or_none}),
'series': ('video_obj', 'program_title', {str}),
'title': ('video_obj', 'title', {str}),
'upload_date': ('video_obj', 'air_date', {unified_strdate}),
}),
}