From 0dab25e4358b1efbbf17430e710380d7611a3d55 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Thu, 5 Jun 2025 10:26:34 +0900 Subject: [PATCH 1/5] [ie] Improve JSON LD metadata extraction --- test/test_InfoExtractor.py | 57 +++++++++++++++++++++++++++++++++++++- yt_dlp/extractor/common.py | 19 +++++++++---- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index bc89b2955..a16bc16ef 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -224,8 +224,15 @@ def test_search_json_ld_realworld(self): } ''', { - 'timestamp': 1636523400, 'title': 'md5:91fe569e952e4d146485740ae927662b', + 'categories': ['Κοινωνία'], + 'creators': ['Ant1news'], + 'description': 'md5:16756d0a18f33bf550e683d134a72f3c', + 'modified_timestamp': 1636523573, + 'release_timestamp': 1636523400, + 'tags': ['μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news'], + 'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}], + 'uploader': 'Ant1news', }, {'expected_type': 'NewsArticle'}, ), @@ -328,6 +335,54 @@ def test_search_json_ld_realworld(self): }, {}, ), + ( + r''' + +''', + { + 'title': 'md5:3f077843a74f01f768bbf0853c210855', + 'categories': ['Reportages'], + 'creators': ['Sabine Dupont'], + 'description': 'md5:40eaf402631e0a77d8d74f66574bb978', + 'modified_timestamp': 1747319520, + 'release_timestamp': 1747319520, + 'tags': ['enseignement secondaire'], + 'timestamp': 1747319520, + 'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}], + 'uploader': 'Tele MB', + }, + {}, + ), ] for html, expected_dict, search_json_ld_kwargs in _TESTS: expect_dict( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 1174bd4f5..d3abba036 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1741,11 +1741,20 @@ def traverse_json_ld(json_ld, at_top_level=True): 'timestamp': unified_timestamp(e.get('dateCreated')), }) elif is_type(e, 'Article', 'NewsArticle'): - info.update({ - 'timestamp': parse_iso8601(e.get('datePublished')), - 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody') or e.get('description')), - }) + info.update(**traverse_obj(e, { + 'title': ('headline', {str}, {unescapeHTML}), + 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all), + 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all), + 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any), + 'modified_timestamp': ('dateModified', {parse_iso8601}), + 'release_timestamp': ('datePublished', {parse_iso8601}), + 'tags': ('keywords', {str}, {unescapeHTML}, filter, all), + 'thumbnails': ('image', ..., { + 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), + }), + 'timestamp': ('dateCreated', {parse_iso8601}), + 'uploader': ('publisher', 'name', {str}, {unescapeHTML}), + })) if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): From b9d2858b205e1e891dba4bd0aa00c98362d2821f Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Thu, 5 Jun 2025 10:50:05 +0900 Subject: [PATCH 2/5] fix tags --- test/test_InfoExtractor.py | 4 ++-- yt_dlp/extractor/common.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a16bc16ef..d22b61f62 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -230,7 +230,7 @@ def test_search_json_ld_realworld(self): 'description': 'md5:16756d0a18f33bf550e683d134a72f3c', 'modified_timestamp': 1636523573, 'release_timestamp': 1636523400, - 'tags': ['μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news'], + 'tags': 'count:6', 'thumbnails': [{'url': 'https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg'}], 'uploader': 'Ant1news', }, @@ -376,7 +376,7 @@ def test_search_json_ld_realworld(self): 'description': 'md5:40eaf402631e0a77d8d74f66574bb978', 'modified_timestamp': 1747319520, 'release_timestamp': 1747319520, - 'tags': ['enseignement secondaire'], + 'tags': 'count:1', 'timestamp': 1747319520, 'thumbnails': [{'url': 'https://www.telemb.be/cdn/ff/pKwkkhB7a5GqSf98QdDUcn9WlvGTYyilvXisHO3fHpI/1747320854/public/2025-05/00006554_avc-tmb-093031.jpeg'}], 'uploader': 'Tele MB', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d3abba036..74ed84050 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1748,7 +1748,7 @@ def traverse_json_ld(json_ld, at_top_level=True): 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any), 'modified_timestamp': ('dateModified', {parse_iso8601}), 'release_timestamp': ('datePublished', {parse_iso8601}), - 'tags': ('keywords', {str}, {unescapeHTML}, filter, all), + 'tags': ('keywords', {str}, {unescapeHTML}, {lambda x: x.split(',')}, filter), 'thumbnails': ('image', ..., { 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), }), From e2bb3a52f227897d223f229c285661279e161a42 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sun, 15 Jun 2025 17:01:00 +0900 Subject: [PATCH 3/5] filter --- yt_dlp/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index eec1742a4..ba32664e4 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1744,8 +1744,8 @@ def traverse_json_ld(json_ld, at_top_level=True): elif is_type(e, 'Article', 'NewsArticle'): info.update(**traverse_obj(e, { 'title': ('headline', {str}, {unescapeHTML}), - 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all), - 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all), + 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter), + 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter), 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any), 'modified_timestamp': ('dateModified', {parse_iso8601}), 'release_timestamp': ('datePublished', {parse_iso8601}), From f5091a346a02d2ed17d7f95d2a2bbc85b6eee93d Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sun, 15 Jun 2025 17:08:38 +0900 Subject: [PATCH 4/5] alt_title --- yt_dlp/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ba32664e4..e2ed97ea6 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1744,6 +1744,7 @@ def traverse_json_ld(json_ld, at_top_level=True): elif is_type(e, 'Article', 'NewsArticle'): info.update(**traverse_obj(e, { 'title': ('headline', {str}, {unescapeHTML}), + 'alt_title': ('alternativeHeadline', {str}, {unescapeHTML}), 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter), 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter), 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any), From a59c0bd4775fca77e90d54f3842b0d4171f0bf70 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Thu, 19 Jun 2025 11:53:38 +0900 Subject: [PATCH 5/5] filter --- test/test_InfoExtractor.py | 2 +- yt_dlp/extractor/common.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 8b367ffff..4f881eb4b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -373,7 +373,7 @@ def test_search_json_ld_realworld(self): 'title': 'md5:3f077843a74f01f768bbf0853c210855', 'categories': ['Reportages'], 'creators': ['Sabine Dupont'], - 'description': 'md5:40eaf402631e0a77d8d74f66574bb978', + 'description': 'md5:1dc04a3aa56c5228503071baa8b4cc97', 'modified_timestamp': 1747319520, 'release_timestamp': 1747319520, 'tags': 'count:1', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index e2ed97ea6..5fde840e5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1743,19 +1743,19 @@ def traverse_json_ld(json_ld, at_top_level=True): }) elif is_type(e, 'Article', 'NewsArticle'): info.update(**traverse_obj(e, { - 'title': ('headline', {str}, {unescapeHTML}), - 'alt_title': ('alternativeHeadline', {str}, {unescapeHTML}), - 'categories': ('articleSection', {str}, {unescapeHTML}, filter, all, filter), - 'creators': ('author', (None, 'name'), {str}, {unescapeHTML}, filter, all, filter), - 'description': (('description', 'articleBody'), {str}, {unescapeHTML}, any), + 'title': ('headline', {clean_html}, filter), + 'alt_title': ('alternativeHeadline', {clean_html}, filter), + 'categories': ('articleSection', {clean_html}, filter, all, filter), + 'creators': ('author', (None, 'name'), {clean_html}, filter, all, filter), + 'description': (('description', 'articleBody'), {clean_html}, filter, any), 'modified_timestamp': ('dateModified', {parse_iso8601}), 'release_timestamp': ('datePublished', {parse_iso8601}), - 'tags': ('keywords', {str}, {unescapeHTML}, {lambda x: x.split(',')}, filter), + 'tags': ('keywords', {clean_html}, {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter), 'thumbnails': ('image', ..., { 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), }), 'timestamp': ('dateCreated', {parse_iso8601}), - 'uploader': ('publisher', 'name', {str}, {unescapeHTML}), + 'uploader': ('publisher', 'name', {clean_html}, filter), })) if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0])