diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index e6c8d574e..4f881eb4b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -224,8 +224,15 @@ def test_search_json_ld_realworld(self): } ''', { - 'timestamp': 1636523400, 'title': 'md5:91fe569e952e4d146485740ae927662b', + 'categories': ['Κοινωνία'], + 'creators': ['Ant1news'], + 'description': 'md5:16756d0a18f33bf550e683d134a72f3c', + 'modified_timestamp': 1636523573, + 'release_timestamp': 1636523400, + 'tags': 'count:6', + 'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}], + 'uploader': 'Ant1news', }, {'expected_type': 'NewsArticle'}, ), @@ -328,6 +335,54 @@ def test_search_json_ld_realworld(self): }, {}, ), + ( + r''' + +''', + { + 'title': 'md5:3f077843a74f01f768bbf0853c210855', + 'categories': ['Reportages'], + 'creators': ['Sabine Dupont'], + 'description': 'md5:1dc04a3aa56c5228503071baa8b4cc97', + 'modified_timestamp': 1747319520, + 'release_timestamp': 1747319520, + 'tags': 'count:1', + 'timestamp': 1747319520, + 'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}], + 'uploader': 'Tele MB', + }, + {}, + ), ] for html, expected_dict, search_json_ld_kwargs in _TESTS: expect_dict( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 6058f66ae..5fde840e5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1742,11 +1742,21 @@ def traverse_json_ld(json_ld, at_top_level=True): 'timestamp': unified_timestamp(e.get('dateCreated')), }) elif is_type(e, 'Article', 'NewsArticle'): - info.update({ - 'timestamp': parse_iso8601(e.get('datePublished')), - 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody') or e.get('description')), - }) + info.update(**traverse_obj(e, { + 'title': ('headline', {clean_html}, filter), + 'alt_title': ('alternativeHeadline', {clean_html}, filter), + 'categories': ('articleSection', {clean_html}, filter, all, filter), + 'creators': ('author', (None, 'name'), {clean_html}, filter, all, filter), + 'description': (('description', 'articleBody'), {clean_html}, filter, any), + 'modified_timestamp': ('dateModified', {parse_iso8601}), + 'release_timestamp': ('datePublished', {parse_iso8601}), + 'tags': ('keywords', {clean_html}, {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter), + 'thumbnails': ('image', ..., { + 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), + }), + 'timestamp': ('dateCreated', {parse_iso8601}), + 'uploader': ('publisher', 'name', {clean_html}, filter), + })) if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):