mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:35:12 +00:00 
			
		
		
		
	[facebook] Improve title and uploader extraction
Closes #1943, closes #795
This commit is contained in:
		@@ -20,13 +20,13 @@ from ..utils import (
 | 
				
			|||||||
    get_element_by_id,
 | 
					    get_element_by_id,
 | 
				
			||||||
    int_or_none,
 | 
					    int_or_none,
 | 
				
			||||||
    js_to_json,
 | 
					    js_to_json,
 | 
				
			||||||
    limit_length,
 | 
					 | 
				
			||||||
    merge_dicts,
 | 
					    merge_dicts,
 | 
				
			||||||
    network_exceptions,
 | 
					    network_exceptions,
 | 
				
			||||||
    parse_count,
 | 
					    parse_count,
 | 
				
			||||||
    parse_qs,
 | 
					    parse_qs,
 | 
				
			||||||
    qualities,
 | 
					    qualities,
 | 
				
			||||||
    sanitized_Request,
 | 
					    sanitized_Request,
 | 
				
			||||||
 | 
					    traverse_obj,
 | 
				
			||||||
    try_get,
 | 
					    try_get,
 | 
				
			||||||
    url_or_none,
 | 
					    url_or_none,
 | 
				
			||||||
    urlencode_postdata,
 | 
					    urlencode_postdata,
 | 
				
			||||||
@@ -398,28 +398,31 @@ class FacebookIE(InfoExtractor):
 | 
				
			|||||||
            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 | 
					            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def extract_metadata(webpage):
 | 
					        def extract_metadata(webpage):
 | 
				
			||||||
            video_title = self._html_search_regex(
 | 
					            media_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
 | 
				
			||||||
                r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
 | 
					                r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)]
 | 
				
			||||||
                'title', default=None)
 | 
					            media = traverse_obj(media_data, (
 | 
				
			||||||
            if not video_title:
 | 
					                ..., 'require', ..., ..., ..., '__bbox', 'result', 'data', 'attachments', ..., 'media'), expected_type=dict)
 | 
				
			||||||
                video_title = self._html_search_regex(
 | 
					            media = [m for m in media if str(m.get('id')) == video_id and m.get('__typename') == 'Video']
 | 
				
			||||||
                    r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
 | 
					
 | 
				
			||||||
                    webpage, 'alternative title', default=None)
 | 
					            video_title = traverse_obj(media, (..., 'title', 'text'), get_all=False)
 | 
				
			||||||
            if not video_title:
 | 
					            description = traverse_obj(media, (
 | 
				
			||||||
                video_title = self._html_search_meta(
 | 
					                ..., 'creation_story', 'comet_sections', 'message', 'story', 'message', 'text'), get_all=False)
 | 
				
			||||||
                    ['og:title', 'twitter:title', 'description'],
 | 
					            uploader = traverse_obj(media, (..., 'owner', 'name'), get_all=False)
 | 
				
			||||||
                    webpage, 'title', default=None)
 | 
					            uploader_id = traverse_obj(media, (..., 'owner', 'id'), get_all=False)
 | 
				
			||||||
            if video_title:
 | 
					
 | 
				
			||||||
                video_title = limit_length(video_title, 80)
 | 
					            video_title = video_title or self._html_search_regex((
 | 
				
			||||||
            else:
 | 
					                r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
 | 
				
			||||||
                video_title = 'Facebook video #%s' % video_id
 | 
					                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
 | 
				
			||||||
            description = self._html_search_meta(
 | 
					                self._meta_regex('og:title'), self._meta_regex('twitter:title'), self._meta_regex('description'),
 | 
				
			||||||
 | 
					            ), webpage, 'title', default=None, group='content')
 | 
				
			||||||
 | 
					            description = description or self._html_search_meta(
 | 
				
			||||||
                ['description', 'og:description', 'twitter:description'],
 | 
					                ['description', 'og:description', 'twitter:description'],
 | 
				
			||||||
                webpage, 'description', default=None)
 | 
					                webpage, 'description', default=None)
 | 
				
			||||||
            uploader = clean_html(get_element_by_id(
 | 
					            uploader = uploader or (
 | 
				
			||||||
                'fbPhotoPageAuthorName', webpage)) or self._search_regex(
 | 
					                clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
 | 
				
			||||||
                r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
 | 
					                or self._search_regex(
 | 
				
			||||||
                default=None) or self._og_search_title(webpage, fatal=False)
 | 
					                    (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            timestamp = int_or_none(self._search_regex(
 | 
					            timestamp = int_or_none(self._search_regex(
 | 
				
			||||||
                r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
 | 
					                r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
 | 
				
			||||||
                'timestamp', default=None))
 | 
					                'timestamp', default=None))
 | 
				
			||||||
@@ -434,17 +437,17 @@ class FacebookIE(InfoExtractor):
 | 
				
			|||||||
                r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
 | 
					                r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
 | 
				
			||||||
                default=None))
 | 
					                default=None))
 | 
				
			||||||
            info_dict = {
 | 
					            info_dict = {
 | 
				
			||||||
                'title': video_title,
 | 
					                'title': video_title or description.replace('\n', ' ') or f'Facebook video #{video_id}',
 | 
				
			||||||
                'description': description,
 | 
					                'description': description,
 | 
				
			||||||
                'uploader': uploader,
 | 
					                'uploader': uploader,
 | 
				
			||||||
 | 
					                'uploader_id': uploader_id,
 | 
				
			||||||
                'timestamp': timestamp,
 | 
					                'timestamp': timestamp,
 | 
				
			||||||
                'thumbnail': thumbnail,
 | 
					                'thumbnail': thumbnail,
 | 
				
			||||||
                'view_count': view_count,
 | 
					                'view_count': view_count,
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            info_json_ld = self._search_json_ld(webpage, video_id, default={})
 | 
					            info_json_ld = self._search_json_ld(webpage, video_id, default={})
 | 
				
			||||||
            if info_json_ld.get('title'):
 | 
					            if info_json_ld.get('title'):
 | 
				
			||||||
                info_json_ld['title'] = limit_length(
 | 
					                info_json_ld['title'] = re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title'])
 | 
				
			||||||
                    re.sub(r'\s*\|\s*Facebook$', '', info_json_ld['title']), 80)
 | 
					 | 
				
			||||||
            return merge_dicts(info_json_ld, info_dict)
 | 
					            return merge_dicts(info_json_ld, info_dict)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        video_data = None
 | 
					        video_data = None
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user