From d65753ce059bb3bd5590db75101f3ef18a1ab68c Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 19 Jul 2024 04:49:09 +1200 Subject: [PATCH 01/20] [GoogleDriveFolder] Fix Extractor --- yt_dlp/extractor/googledrive.py | 43 ++++++++------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index dfba2d3ba1..4f4a9cf3f2 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -1,3 +1,4 @@ +import json import re import urllib.parse @@ -298,46 +299,22 @@ class GoogleDriveFolderIE(InfoExtractor): }, 'playlist_count': 3, }] - _BOUNDARY = '=====vc17a3rwnndj=====' - _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1" - _DATA = f'''--{_BOUNDARY} -content-type: application/http -content-transfer-encoding: binary -GET %s + def _extract_json(self, idx, webpage): + RE = r'AF_initDataCallback\(\{key:\s*([\'"])ds:\s*%d\1,[^}]*data:(?P\[.*?\]),\s*sideChannel:\s\{' + return json.loads(self._html_search_regex(RE % idx, webpage, 'JSON', group='data')) ---{_BOUNDARY} -''' - - def _call_api(self, folder_id, key, data, **kwargs): - response = self._download_webpage( - 'https://clients6.google.com/batch/drive/v2beta', - folder_id, data=data.encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8;', - 'Origin': 'https://drive.google.com', - }, query={ - '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"', - 'key': key, - }, **kwargs) - return self._search_json('', response, 'api response', folder_id, **kwargs) or {} - - def _get_folder_items(self, folder_id, key): - page_token = '' - while page_token is not None: - request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key) - page = self._call_api(folder_id, key, self._DATA % request) - yield from page['items'] - page_token = page.get('nextPageToken') + def _get_folder_items(self, results): + yield from results def _real_extract(self, url): folder_id = self._match_id(url) webpage = self._download_webpage(url, folder_id) - key = self._search_regex(r'"(\w{39})"', webpage, 'key') - folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False) + title = self._extract_json(0, webpage)[1][2] + results_4 = self._extract_json(4, webpage)[-1] return self.playlist_from_matches( - self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'), - ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}') + self._get_folder_items(results_4), folder_id, title, + ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item[0]}') From 05403ea5ad7e57dcacde98055267285c93a3278b Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:31:08 +1200 Subject: [PATCH 02/20] add tests, fix regex, improve stability. --- yt_dlp/extractor/googledrive.py | 71 +++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 4f4a9cf3f2..7a9d0befb0 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -1,4 +1,3 @@ -import json import re import urllib.parse @@ -298,23 +297,69 @@ class GoogleDriveFolderIE(InfoExtractor): 'title': 'Forrest', }, 'playlist_count': 3, + }, { + # Contains various formats + 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', + 'info_dict': { + 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', + 'title': 'public folder', + }, + 'playlist_count': 4, }] + _JSON_DS_RE = r'key\s*?:\s*?([\'"])ds:\s*?%d\1,[^}]*data:' + _JSON_HASH_RE = r'hash\s*?:\s*?([\'"])%d\1,[^}]*data:' + _ARRAY_RE = r'\[(?s:.+)\]' - def _extract_json(self, idx, webpage): - RE = r'AF_initDataCallback\(\{key:\s*([\'"])ds:\s*%d\1,[^}]*data:(?P\[.*?\]),\s*sideChannel:\s\{' - return json.loads(self._html_search_regex(RE % idx, webpage, 'JSON', group='data')) + def _extract_json_ds(self, dsval, webpage, video_id, **kwargs): + """ + Searches for json with the 'ds' value(0~5) from the webpage with regex. + Folder info: ds=0; Folder items: ds=4. + For example, if the webpage contains the line below, the empty data array + can be got by passing dsval=3 to this function. + AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); + """ + return self._search_json(self._JSON_DS_RE % dsval, webpage, + f'webpage JSON ds:{dsval}', video_id, + contains_pattern=self._ARRAY_RE, **kwargs) - def _get_folder_items(self, results): - yield from results + def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): + """ + Searches for json with the 'hash' value(1~6) from the webpage with regex. + Folder info: hash=1; Folder items: hash=6. + For example, if the webpage contains the line below, the empty data array + can be got by passing hashval=2 to this function. + AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); + """ + return self._search_json(self._JSON_HASH_RE % hashval, webpage, + f'webpage JSON hash:{hashval}', video_id, + contains_pattern=self._ARRAY_RE, **kwargs) def _real_extract(self, url): + def item_url_getter(item): + url_from_0 = f'https://drive.google.com/file/d/{item[0]}' + if GoogleDriveIE.suitable(url_from_0): + return url_from_0 + else: + for attr in item: + if isinstance(attr, str) and GoogleDriveIE.suitable(attr): + return attr + self.write_debug('Failed to extract url!') + return None + folder_id = self._match_id(url) + headers = self.geo_verification_headers() - webpage = self._download_webpage(url, folder_id) + webpage = self._download_webpage(url, folder_id, headers=headers) + json_folder_info = ( + self._extract_json_ds(0, webpage, folder_id, default=None) + or self._extract_json_hash(1, webpage, folder_id) + ) + json_items = ( + self._extract_json_ds(4, webpage, folder_id, default=None) + or self._extract_json_hash(6, webpage, folder_id) + ) + title = json_folder_info[1][2] + items = json_items[-1] - title = self._extract_json(0, webpage)[1][2] - results_4 = self._extract_json(4, webpage)[-1] - - return self.playlist_from_matches( - self._get_folder_items(results_4), folder_id, title, - ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item[0]}') + return self.playlist_from_matches((item for item in items), folder_id, title, + ie=GoogleDriveIE, getter=item_url_getter) From a917af960c8ab3895aaea0f44a4cef2021d418db Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:48:36 +1200 Subject: [PATCH 03/20] fix code formating, fix test folder title --- yt_dlp/extractor/googledrive.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 7a9d0befb0..d8e64360ef 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -302,7 +302,7 @@ class GoogleDriveFolderIE(InfoExtractor): 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'info_dict': { 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', - 'title': 'public folder', + 'title': r'], sideChannel: {}});', }, 'playlist_count': 4, }] @@ -319,8 +319,8 @@ def _extract_json_ds(self, dsval, webpage, video_id, **kwargs): AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); """ return self._search_json(self._JSON_DS_RE % dsval, webpage, - f'webpage JSON ds:{dsval}', video_id, - contains_pattern=self._ARRAY_RE, **kwargs) + f'webpage JSON ds:{dsval}', video_id, + contains_pattern=self._ARRAY_RE, **kwargs) def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): """ @@ -331,8 +331,8 @@ def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); """ return self._search_json(self._JSON_HASH_RE % hashval, webpage, - f'webpage JSON hash:{hashval}', video_id, - contains_pattern=self._ARRAY_RE, **kwargs) + f'webpage JSON hash:{hashval}', video_id, + contains_pattern=self._ARRAY_RE, **kwargs) def _real_extract(self, url): def item_url_getter(item): From 64d4e93516e4ee78c312b7e6d84fb3b975658b8e Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:51:04 +1200 Subject: [PATCH 04/20] add support for subfolders(recursive) --- yt_dlp/extractor/googledrive.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index d8e64360ef..28e496a458 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -298,13 +298,13 @@ class GoogleDriveFolderIE(InfoExtractor): }, 'playlist_count': 3, }, { - # Contains various formats + # Contains various formats and a subfolder 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'info_dict': { 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'title': r'], sideChannel: {}});', }, - 'playlist_count': 4, + 'playlist_count': 5, }] _JSON_DS_RE = r'key\s*?:\s*?([\'"])ds:\s*?%d\1,[^}]*data:' _JSON_HASH_RE = r'hash\s*?:\s*?([\'"])%d\1,[^}]*data:' @@ -335,17 +335,24 @@ def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): contains_pattern=self._ARRAY_RE, **kwargs) def _real_extract(self, url): - def item_url_getter(item): - url_from_0 = f'https://drive.google.com/file/d/{item[0]}' - if GoogleDriveIE.suitable(url_from_0): - return url_from_0 - else: - for attr in item: - if isinstance(attr, str) and GoogleDriveIE.suitable(attr): - return attr - self.write_debug('Failed to extract url!') + def item_url_getter(item, video_id): + available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] + for attr in item: + if isinstance(attr, str): + for available_IE in available_IEs: + if available_IE.suitable(attr): + return self.url_result(attr, available_IE, video_id, item[2]) + self.to_screen(f'Failed to find a suitable extractor for {item[2]}.') return None + def make_playlist(items, playlist_id): + entries = [] + for item in items: + entry = item_url_getter(item, playlist_id) + if entry: + entries.append(entry) + return self.playlist_result(entries, playlist_id, title) + folder_id = self._match_id(url) headers = self.geo_verification_headers() @@ -361,5 +368,4 @@ def item_url_getter(item): title = json_folder_info[1][2] items = json_items[-1] - return self.playlist_from_matches((item for item in items), folder_id, title, - ie=GoogleDriveIE, getter=item_url_getter) + return make_playlist(items, folder_id) From cbe698b4b0d7d1b8a9af5bbc3381a82855861f36 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:28:45 +1200 Subject: [PATCH 05/20] Add support for empty folder --- yt_dlp/extractor/googledrive.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 28e496a458..950d877f97 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -367,5 +367,7 @@ def make_playlist(items, playlist_id): ) title = json_folder_info[1][2] items = json_items[-1] + if not isinstance(items, list): + return self.playlist_result([], folder_id, title) return make_playlist(items, folder_id) From 99628595959370d58a47a1d7c266e1224ad1ada9 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Wed, 24 Jul 2024 23:16:44 +1200 Subject: [PATCH 06/20] Replace function `make_playlist` with a more concise generator expression to improve code readability. All tests and code format checker are passing modified: yt_dlp/extractor/googledrive.py --- yt_dlp/extractor/googledrive.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 950d877f97..a4f2512119 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -345,14 +345,6 @@ def item_url_getter(item, video_id): self.to_screen(f'Failed to find a suitable extractor for {item[2]}.') return None - def make_playlist(items, playlist_id): - entries = [] - for item in items: - entry = item_url_getter(item, playlist_id) - if entry: - entries.append(entry) - return self.playlist_result(entries, playlist_id, title) - folder_id = self._match_id(url) headers = self.geo_verification_headers() @@ -365,9 +357,12 @@ def make_playlist(items, playlist_id): self._extract_json_ds(4, webpage, folder_id, default=None) or self._extract_json_hash(6, webpage, folder_id) ) + title = json_folder_info[1][2] items = json_items[-1] if not isinstance(items, list): return self.playlist_result([], folder_id, title) - return make_playlist(items, folder_id) + return self.playlist_result( + (entry for item in items if (entry := item_url_getter(item, folder_id))), + folder_id, title) From 017997068bb9b181c27e022daa6e184592faebe2 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 28 Jul 2024 03:18:17 +1200 Subject: [PATCH 07/20] read shortcuts url --- yt_dlp/extractor/googledrive.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index a4f2512119..1a58882944 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -12,6 +12,7 @@ get_element_html_by_id, int_or_none, lowercase_escape, + traverse_obj, try_get, update_url_query, ) @@ -304,7 +305,7 @@ class GoogleDriveFolderIE(InfoExtractor): 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'title': r'], sideChannel: {}});', }, - 'playlist_count': 5, + 'playlist_count': 6, }] _JSON_DS_RE = r'key\s*?:\s*?([\'"])ds:\s*?%d\1,[^}]*data:' _JSON_HASH_RE = r'hash\s*?:\s*?([\'"])%d\1,[^}]*data:' @@ -337,13 +338,17 @@ def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): def _real_extract(self, url): def item_url_getter(item, video_id): available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] - for attr in item: - if isinstance(attr, str): - for available_IE in available_IEs: - if available_IE.suitable(attr): - return self.url_result(attr, available_IE, video_id, item[2]) - self.to_screen(f'Failed to find a suitable extractor for {item[2]}.') - return None + if 'application/vnd.google-apps.shortcut' in item: + entry_url = traverse_obj( + item, (..., ..., lambda _, v: any(ie.suitable(v) for ie in available_IEs), + {str}, any)) + else: + entry_url = traverse_obj( + item, (lambda _, v: any(ie.suitable(v) for ie in available_IEs), + {str}, any)) + if not entry_url: + return None + return self.url_result(entry_url, video_id=video_id, video_title=item[2]) folder_id = self._match_id(url) headers = self.geo_verification_headers() From 99d9105f33ea72c58ed98fd0c951a7f50ea2f647 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 28 Jul 2024 03:36:55 +1200 Subject: [PATCH 08/20] [GoogleDrive] add support for shortcut urls: follow redirection Example url: https://drive.google.com/file/d/17OrYKQBPRm4J_D1rsGbo4eOmm1_SfoIY/ Redirects to(real video): https://drive.google.com/file/d/1Jp0I0tS-qMxtXNehGQW5_hWhwgC0FeeB/edit --- yt_dlp/extractor/googledrive.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 1a58882944..437b31db9b 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -167,6 +167,10 @@ def _get_automatic_captions(self, video_id, subtitles_id, hl): def _real_extract(self, url): video_id = self._match_id(url) + _, webpage_urlh = self._download_webpage_handle(url, video_id) + if webpage_urlh.url != url: + return self.url_result(webpage_urlh.url) + video_info = urllib.parse.parse_qs(self._download_webpage( 'https://drive.google.com/get_video_info', video_id, 'Downloading video webpage', query={'docid': video_id})) From 6e98d99dd5cb171f21fb07832fa62095bf2e55c9 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 17 Aug 2024 20:21:02 +1200 Subject: [PATCH 09/20] [GoogleDriveFolderIE] improve code readability - Combined `_extract_json_ds` and`_extract_json_hash` into one method(`_extract_json_meta`) - Improved `item_url_getter`'s traversal path of item info - Add notations to improve code readability --- yt_dlp/extractor/googledrive.py | 66 ++++++++++++++++----------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 437b31db9b..80ee06daba 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -303,7 +303,7 @@ class GoogleDriveFolderIE(InfoExtractor): }, 'playlist_count': 3, }, { - # Contains various formats and a subfolder + 'note': 'Contains various formats and a subfolder, folder name was formerly mismatched', 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', 'info_dict': { 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', @@ -311,45 +311,43 @@ class GoogleDriveFolderIE(InfoExtractor): }, 'playlist_count': 6, }] - _JSON_DS_RE = r'key\s*?:\s*?([\'"])ds:\s*?%d\1,[^}]*data:' - _JSON_HASH_RE = r'hash\s*?:\s*?([\'"])%d\1,[^}]*data:' - _ARRAY_RE = r'\[(?s:.+)\]' - def _extract_json_ds(self, dsval, webpage, video_id, **kwargs): + def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=None, **kwargs): """ - Searches for json with the 'ds' value(0~5) from the webpage with regex. - Folder info: ds=0; Folder items: ds=4. + Uses regex to search for json metadata with 'ds' value(0-5) or 'hash' value(1-6) + from the webpage. + Folder info: ds=0, hash=1; Folder items: ds=4, hash=6. For example, if the webpage contains the line below, the empty data array - can be got by passing dsval=3 to this function. + can be got by passing dsval=3 or hashval=2 to this method. AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); """ - return self._search_json(self._JSON_DS_RE % dsval, webpage, - f'webpage JSON ds:{dsval}', video_id, - contains_pattern=self._ARRAY_RE, **kwargs) - - def _extract_json_hash(self, hashval, webpage, video_id, **kwargs): - """ - Searches for json with the 'hash' value(1~6) from the webpage with regex. - Folder info: hash=1; Folder items: hash=6. - For example, if the webpage contains the line below, the empty data array - can be got by passing hashval=2 to this function. - AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); - """ - return self._search_json(self._JSON_HASH_RE % hashval, webpage, - f'webpage JSON hash:{hashval}', video_id, - contains_pattern=self._ARRAY_RE, **kwargs) + _ARRAY_RE = r'\[(?s:.+)\]' + _META_END_RE = r', sideChannel: \{\}\}\);' # greedy match to deal with the 2nd test case + if dsval: + if not name: + name = f'webpage JSON metadata ds:{dsval}' + return self._search_json( + rf'''key\s*?:\s*?(['"])ds:\s*?{dsval}\1,[^\[]*?data:''', webpage, name, video_id, + end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs) + elif hashval: + if not name: + name = f'webpage JSON metadata hash:{hashval}' + return self._search_json( + rf'''hash\s*?:\s*?(['"]){hashval}\1,[^\[]*?data:''', webpage, name, video_id, + end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs) + return None def _real_extract(self, url): def item_url_getter(item, video_id): - available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] - if 'application/vnd.google-apps.shortcut' in item: + available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] # subfolder or item + if 'application/vnd.google-apps.shortcut' in item: # extract real link entry_url = traverse_obj( - item, (..., ..., lambda _, v: any(ie.suitable(v) for ie in available_IEs), - {str}, any)) + item, + (..., ..., lambda _, v: any(ie.suitable(v) for ie in available_IEs), any)) else: entry_url = traverse_obj( - item, (lambda _, v: any(ie.suitable(v) for ie in available_IEs), - {str}, any)) + item, + (lambda _, v: any(ie.suitable(v) for ie in available_IEs), any)) if not entry_url: return None return self.url_result(entry_url, video_id=video_id, video_title=item[2]) @@ -359,17 +357,17 @@ def item_url_getter(item, video_id): webpage = self._download_webpage(url, folder_id, headers=headers) json_folder_info = ( - self._extract_json_ds(0, webpage, folder_id, default=None) - or self._extract_json_hash(1, webpage, folder_id) + self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None) + or self._extract_json_meta(webpage, folder_id, hashval=1) ) json_items = ( - self._extract_json_ds(4, webpage, folder_id, default=None) - or self._extract_json_hash(6, webpage, folder_id) + self._extract_json_meta(webpage, folder_id, dsval=4, name='folder items', default=None) + or self._extract_json_meta(webpage, folder_id, hashval=6) ) title = json_folder_info[1][2] items = json_items[-1] - if not isinstance(items, list): + if not isinstance(items, list): # empty folder return self.playlist_result([], folder_id, title) return self.playlist_result( From 8623ada2938124370211dd5682eab281690fdec4 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:49:00 +1200 Subject: [PATCH 10/20] [GoogleDriveFolderIE] Several fixes - Update test: GoogleDriveFolder_1 - Raise for log-in required - catch HTTP Error 404 --- yt_dlp/extractor/googledrive.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 80ee06daba..4603611dcc 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -3,6 +3,7 @@ from .common import InfoExtractor from .youtube import YoutubeIE +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, bug_reports_message, @@ -303,13 +304,14 @@ class GoogleDriveFolderIE(InfoExtractor): }, 'playlist_count': 3, }, { - 'note': 'Contains various formats and a subfolder, folder name was formerly mismatched', - 'url': 'https://drive.google.com/drive/folders/1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', + 'note': 'Contains various formats and a subfolder, folder name was formerly mismatched.' + 'also contains loop shortcut, shortcut to non-downloadable files, etc.', + 'url': 'https://drive.google.com/drive/folders/1jjrhqi94d8TSHSVMSdBjD49MOiHYpHfF', 'info_dict': { - 'id': '1CkqRsNlzZ0o3IL083j17s6sH5Q83DcGo', - 'title': r'], sideChannel: {}});', + 'id': '1jjrhqi94d8TSHSVMSdBjD49MOiHYpHfF', + 'title': '], sideChannel: {}});', }, - 'playlist_count': 6, + 'playlist_count': 8, }] def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=None, **kwargs): @@ -355,7 +357,14 @@ def item_url_getter(item, video_id): folder_id = self._match_id(url) headers = self.geo_verification_headers() - webpage = self._download_webpage(url, folder_id, headers=headers) + try: + webpage, urlh = self._download_webpage_handle(url, folder_id, headers=headers) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: + self.raise_no_formats(e.cause.msg) + if urllib.parse.urlparse(urlh.url).netloc == 'accounts.google.com': + self.raise_login_required('This video is only available for registered users') + json_folder_info = ( self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None) or self._extract_json_meta(webpage, folder_id, hashval=1) From 28ed64d87a4bfb7c27b96124c2dab41e9ba2c614 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:49:56 +1200 Subject: [PATCH 11/20] [GoogleDriveFolderIE] add bare raise in the except block --- yt_dlp/extractor/googledrive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 4603611dcc..a77ca24c07 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -362,6 +362,7 @@ def item_url_getter(item, video_id): except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 404: self.raise_no_formats(e.cause.msg) + raise if urllib.parse.urlparse(urlh.url).netloc == 'accounts.google.com': self.raise_login_required('This video is only available for registered users') From b81a41d5ff33f05379c7ae96997d9d366ff4061b Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 18 Aug 2024 00:01:03 +1200 Subject: [PATCH 12/20] [GoogleDriveFolderIE] raise for access denied --- yt_dlp/extractor/googledrive.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index a77ca24c07..a3d645f08c 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -360,8 +360,11 @@ def item_url_getter(item, video_id): try: webpage, urlh = self._download_webpage_handle(url, folder_id, headers=headers) except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 404: - self.raise_no_formats(e.cause.msg) + if isinstance(e.cause, HTTPError): + if e.cause.status == 404: + self.raise_no_formats(e.cause.msg) + elif e.cause.status == 403: + self.raise_login_required('Access Denied!') raise if urllib.parse.urlparse(urlh.url).netloc == 'accounts.google.com': self.raise_login_required('This video is only available for registered users') From 4a76306868e2387dc988edabe20ff4077b5b30c3 Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 18 Aug 2024 03:08:45 +1200 Subject: [PATCH 13/20] [GoogleDriveFolderIE] support private folders and my-drive --- yt_dlp/extractor/googledrive.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index a3d645f08c..f89718b5c1 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -295,7 +295,7 @@ def add_source_format(urlh): class GoogleDriveFolderIE(InfoExtractor): IE_NAME = 'GoogleDrive:Folder' - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P[\w-]{28,})' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/(?:folders/(?P[\w-]{28,})|my-drive)' _TESTS = [{ 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', 'info_dict': { @@ -318,7 +318,7 @@ def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=N """ Uses regex to search for json metadata with 'ds' value(0-5) or 'hash' value(1-6) from the webpage. - Folder info: ds=0, hash=1; Folder items: ds=4, hash=6. + Folder info: ds=0, hash=1; Folder items: ds=4(public folder)/5(private folder), hash=6. For example, if the webpage contains the line below, the empty data array can be got by passing dsval=3 or hashval=2 to this method. AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); @@ -341,6 +341,8 @@ def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=N def _real_extract(self, url): def item_url_getter(item, video_id): + if not isinstance(item, list): + return None available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] # subfolder or item if 'application/vnd.google-apps.shortcut' in item: # extract real link entry_url = traverse_obj( @@ -354,7 +356,7 @@ def item_url_getter(item, video_id): return None return self.url_result(entry_url, video_id=video_id, video_title=item[2]) - folder_id = self._match_id(url) + folder_id = self._match_id(url) or 'my-drive' headers = self.geo_verification_headers() try: @@ -370,13 +372,10 @@ def item_url_getter(item, video_id): self.raise_login_required('This video is only available for registered users') json_folder_info = ( - self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None) - or self._extract_json_meta(webpage, folder_id, hashval=1) - ) - json_items = ( - self._extract_json_meta(webpage, folder_id, dsval=4, name='folder items', default=None) - or self._extract_json_meta(webpage, folder_id, hashval=6) + self._extract_json_meta(webpage, folder_id, hashval=1, name='folder info', default=None) + or self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info - fallback') ) + json_items = self._extract_json_meta(webpage, folder_id, hashval=6, name='folder items') title = json_folder_info[1][2] items = json_items[-1] From d133c2c7f707b7dbc00a784c75ca0f2148326f1d Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 18 Aug 2024 03:15:45 +1200 Subject: [PATCH 14/20] [GoogleDriveFolderIE] revert part of last commit - change folder info json metadata extraction default appraoch to ds match --- yt_dlp/extractor/googledrive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index f89718b5c1..1b1e3a2daf 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -372,8 +372,8 @@ def item_url_getter(item, video_id): self.raise_login_required('This video is only available for registered users') json_folder_info = ( - self._extract_json_meta(webpage, folder_id, hashval=1, name='folder info', default=None) - or self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info - fallback') + self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None) + or self._extract_json_meta(webpage, folder_id, hashval=1, name='folder info - fallback') ) json_items = self._extract_json_meta(webpage, folder_id, hashval=6, name='folder items') From beb76094fa817376741be87113eb1a28e69d341c Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Sat, 28 Sep 2024 20:59:50 +1200 Subject: [PATCH 15/20] Update yt_dlp/extractor/googledrive.py --- yt_dlp/extractor/googledrive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 1b1e3a2daf..86900f5767 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -369,7 +369,7 @@ def item_url_getter(item, video_id): self.raise_login_required('Access Denied!') raise if urllib.parse.urlparse(urlh.url).netloc == 'accounts.google.com': - self.raise_login_required('This video is only available for registered users') + self.raise_login_required('Access Denied!') json_folder_info = ( self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None) From 8d827d2460e6eabea92dec7766a5dce9598388bb Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Sun, 29 Sep 2024 00:24:50 +1200 Subject: [PATCH 16/20] Update tests --- yt_dlp/extractor/googledrive.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 86900f5767..65441c6965 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -53,6 +53,17 @@ class GoogleDriveIE(InfoExtractor): 'duration': 184, 'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', }, + }, { + # shortcut url + 'url': 'https://drive.google.com/file/d/1_n3-8ZwEUV4OniMsLAJ_C1JEjuT2u5Pk/view?usp=drivesdk', + 'md5': '43d34f7be1acc0262f337a039d1ad12d', + 'info_dict': { + 'id': '1J1RCw2jcgUngrZRdpza-IHXYkardZ-4l', + 'ext': 'webm', + 'title': 'Forrest walk with Best Mind Refresh Music Mithran [tEvJKrE4cS0].webm', + 'duration': 512, + 'thumbnail': 'https://drive.google.com/thumbnail?id=1J1RCw2jcgUngrZRdpza-IHXYkardZ-4l', + }, }, { # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) @@ -306,7 +317,7 @@ class GoogleDriveFolderIE(InfoExtractor): }, { 'note': 'Contains various formats and a subfolder, folder name was formerly mismatched.' 'also contains loop shortcut, shortcut to non-downloadable files, etc.', - 'url': 'https://drive.google.com/drive/folders/1jjrhqi94d8TSHSVMSdBjD49MOiHYpHfF', + 'url': 'https://docs.google.com/drive/folders/1jjrhqi94d8TSHSVMSdBjD49MOiHYpHfF', 'info_dict': { 'id': '1jjrhqi94d8TSHSVMSdBjD49MOiHYpHfF', 'title': '], sideChannel: {}});', From 83e086083577fe50a1d24c34b141006402ce389f Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 30 Sep 2024 06:35:33 +1300 Subject: [PATCH 17/20] fix my-drive extraction --- yt_dlp/extractor/googledrive.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 65441c6965..8b6ac8d854 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -306,7 +306,7 @@ def add_source_format(urlh): class GoogleDriveFolderIE(InfoExtractor): IE_NAME = 'GoogleDrive:Folder' - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/(?:folders/(?P[\w-]{28,})|my-drive)' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/(?:folders/(?P[\w-]{19,})|my-drive)' _TESTS = [{ 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', 'info_dict': { @@ -329,32 +329,36 @@ def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=N """ Uses regex to search for json metadata with 'ds' value(0-5) or 'hash' value(1-6) from the webpage. - Folder info: ds=0, hash=1; Folder items: ds=4(public folder)/5(private folder), hash=6. + Folder info: ds=0(public folder), hash=1/5; + Folder items: ds=4(logged out)/5(logged in), hash=6. + public, logged in info:ds0hash1; items:ds5hash6 + public, logged out info:ds0hash1; items:ds4hash6 + my-drive, logged in info:ds0hash1/4; items:ds5hash6 + private, logged in info:ds0hash1; items:ds5hash6 For example, if the webpage contains the line below, the empty data array can be got by passing dsval=3 or hashval=2 to this method. AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); """ _ARRAY_RE = r'\[(?s:.+)\]' _META_END_RE = r', sideChannel: \{\}\}\);' # greedy match to deal with the 2nd test case - if dsval: + if dsval is not None: if not name: name = f'webpage JSON metadata ds:{dsval}' return self._search_json( rf'''key\s*?:\s*?(['"])ds:\s*?{dsval}\1,[^\[]*?data:''', webpage, name, video_id, end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs) - elif hashval: + elif hashval is not None: if not name: name = f'webpage JSON metadata hash:{hashval}' return self._search_json( rf'''hash\s*?:\s*?(['"]){hashval}\1,[^\[]*?data:''', webpage, name, video_id, end_pattern=_META_END_RE, contains_pattern=_ARRAY_RE, **kwargs) - return None def _real_extract(self, url): def item_url_getter(item, video_id): if not isinstance(item, list): return None - available_IEs = [GoogleDriveFolderIE, GoogleDriveIE] # subfolder or item + available_IEs = (GoogleDriveFolderIE, GoogleDriveIE) # subfolder or item if 'application/vnd.google-apps.shortcut' in item: # extract real link entry_url = traverse_obj( item, @@ -377,20 +381,17 @@ def item_url_getter(item, video_id): if e.cause.status == 404: self.raise_no_formats(e.cause.msg) elif e.cause.status == 403: - self.raise_login_required('Access Denied!') + # logged in with an account without access + self.raise_login_required('Access Denied') raise if urllib.parse.urlparse(urlh.url).netloc == 'accounts.google.com': - self.raise_login_required('Access Denied!') + # not logged in when visiting a private folder + self.raise_login_required('Access Denied') - json_folder_info = ( - self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info', default=None) - or self._extract_json_meta(webpage, folder_id, hashval=1, name='folder info - fallback') - ) - json_items = self._extract_json_meta(webpage, folder_id, hashval=6, name='folder items') + title = self._extract_json_meta(webpage, folder_id, dsval=0, name='folder info')[1][2] + items = self._extract_json_meta(webpage, folder_id, hashval=6, name='folder items')[-1] - title = json_folder_info[1][2] - items = json_items[-1] - if not isinstance(items, list): # empty folder + if items is False: # empty folder return self.playlist_result([], folder_id, title) return self.playlist_result( From 23ea25196deb91cfbf69ce2fd4bd56f7557f6d4b Mon Sep 17 00:00:00 2001 From: grqx_wsl <173253225+grqx@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:24:13 +1300 Subject: [PATCH 18/20] update docstring --- yt_dlp/extractor/googledrive.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 8b6ac8d854..6399c66ab8 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -331,10 +331,9 @@ def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=N from the webpage. Folder info: ds=0(public folder), hash=1/5; Folder items: ds=4(logged out)/5(logged in), hash=6. - public, logged in info:ds0hash1; items:ds5hash6 - public, logged out info:ds0hash1; items:ds4hash6 - my-drive, logged in info:ds0hash1/4; items:ds5hash6 - private, logged in info:ds0hash1; items:ds5hash6 + logged out info:ds0hash1; items:ds4hash6 + logged in info:ds0hash1; items:ds5hash6 + my-drive info:ds0hash1/ds0hash4; items:ds5hash6 For example, if the webpage contains the line below, the empty data array can be got by passing dsval=3 or hashval=2 to this method. AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}}); From 3582a238a09ee929b75b2030a99146f6b4aa5b23 Mon Sep 17 00:00:00 2001 From: grqx Date: Tue, 1 Oct 2024 23:51:00 +1300 Subject: [PATCH 19/20] update url and video id instead of returning a url result --- yt_dlp/extractor/googledrive.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 6399c66ab8..33ac5cb8a5 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -179,9 +179,16 @@ def _get_automatic_captions(self, video_id, subtitles_id, hl): def _real_extract(self, url): video_id = self._match_id(url) - _, webpage_urlh = self._download_webpage_handle(url, video_id) + try: + _, webpage_urlh = self._download_webpage_handle(url, video_id) + except ExtractorError as e: + if isinstance(e.cause, HTTPError): + if e.cause.status in (401, 403): + self.raise_login_required('Access Denied') + raise if webpage_urlh.url != url: - return self.url_result(webpage_urlh.url) + url = webpage_urlh.url + video_id = self._match_id(url) video_info = urllib.parse.parse_qs(self._download_webpage( 'https://drive.google.com/get_video_info', @@ -378,7 +385,7 @@ def item_url_getter(item, video_id): except ExtractorError as e: if isinstance(e.cause, HTTPError): if e.cause.status == 404: - self.raise_no_formats(e.cause.msg) + self.raise_no_formats(e.cause.msg, expected=True) elif e.cause.status == 403: # logged in with an account without access self.raise_login_required('Access Denied') From 1ef35f1c0001c59e4694e9cdd19873bab16c3489 Mon Sep 17 00:00:00 2001 From: grqx_termux Date: Wed, 2 Oct 2024 17:52:29 +1300 Subject: [PATCH 20/20] update docstring --- yt_dlp/extractor/googledrive.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index 33ac5cb8a5..0594cbae13 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -336,11 +336,9 @@ def _extract_json_meta(self, webpage, video_id, dsval=None, hashval=None, name=N """ Uses regex to search for json metadata with 'ds' value(0-5) or 'hash' value(1-6) from the webpage. - Folder info: ds=0(public folder), hash=1/5; - Folder items: ds=4(logged out)/5(logged in), hash=6. - logged out info:ds0hash1; items:ds4hash6 - logged in info:ds0hash1; items:ds5hash6 - my-drive info:ds0hash1/ds0hash4; items:ds5hash6 + logged out folder info:ds0hash1; items:ds4hash6 + logged in folder info:ds0hash1; items:ds5hash6 + my-drive folder info:ds0hash1/ds0hash4; items:ds5hash6 For example, if the webpage contains the line below, the empty data array can be got by passing dsval=3 or hashval=2 to this method. AF_initDataCallback({key: 'ds:3', hash: '2', data:[], sideChannel: {}});