1
0
mirror of https://github.com/yt-dlp/yt-dlp synced 2025-05-10 07:22:13 -05:00

[ie/nytimesarticle] Fix extraction (#13104)

Closes #13098
Authored by: bashonly
This commit is contained in:
bashonly 2025-05-06 15:32:41 -05:00 committed by GitHub
parent f123cc83b3
commit b26bc32579
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 119.0, 'duration': 119.0,
}, },
'skip': 'HTTP Error 500: Internal Server Error',
}, { }, {
# article with audio and no video # article with audio and no video
'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE):
'ext': 'mp3', 'ext': 'mp3',
'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
'timestamp': 1695960700, 'timestamp': 1696008129,
'upload_date': '20230929', 'upload_date': '20230929',
'creator': 'Stephanie Nolen, Natalija Gormalova', 'creators': ['Stephanie Nolen', 'Natalija Gormalova'],
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 1322, 'duration': 1322,
}, },
}, { }, {
# lede_media_block already has sourceId
'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
'md5': '3eb5ddb1d6f86254fe4f233826778737', 'md5': '3eb5ddb1d6f86254fe4f233826778737',
'info_dict': { 'info_dict': {
@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
'timestamp': 1701290997, 'timestamp': 1701290997,
'upload_date': '20231129', 'upload_date': '20231129',
'uploader': 'By The New York Times', 'uploader': 'By The New York Times',
'creator': 'Katie Rogers', 'creators': ['Katie Rogers'],
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
'duration': 97.631, 'duration': 97.631,
}, },
@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE):
'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
'upload_date': '20231202', 'upload_date': '20231202',
'creator': 'Emily Steel, Sydney Ember', 'creators': ['Emily Steel', 'Sydney Ember'],
'timestamp': 1701511264, 'timestamp': 1701511264,
}, },
'playlist_count': 3, 'playlist_count': 3,
}, {
# lede_media_block does not have sourceId
'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html',
'info_dict': {
'id': 'hip-mobility-routine',
'title': 'Tight Hips? These Moves Can Help.',
'description': 'Sitting all day is hard on your hips. Try this simple routine for better mobility.',
'creators': ['Alyssa Ages', 'Theodore Tae'],
'timestamp': 1746003629,
'upload_date': '20250430',
},
'playlist_count': 7,
}, { }, {
'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
'only_matching': True, 'only_matching': True,
@ -256,14 +270,18 @@ class NYTimesArticleIE(NYTimesBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id) webpage = self._download_webpage(url, page_id, impersonate=True)
art_json = self._search_json( art_json = self._search_json(
r'window\.__preloadedData\s*=', webpage, 'media details', page_id, r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article'] transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
content = art_json['sprinkledBody']['content']
blocks = traverse_obj(art_json, ( blocks = []
'sprinkledBody', 'content', ..., ('ledeMedia', None), block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio')
lambda _, v: v['__typename'] in ('Video', 'Audio'))) if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)):
lede_media_block.setdefault('sourceId', art_json.get('sourceId'))
blocks.append(lede_media_block)
blocks.extend(traverse_obj(content, (..., block_filter)))
if not blocks: if not blocks:
raise ExtractorError('Unable to extract any media blocks from webpage') raise ExtractorError('Unable to extract any media blocks from webpage')
@ -273,8 +291,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}), 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage), get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})), 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
'creator': ', '.join( 'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})),
traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list)
'thumbnails': self._extract_thumbnails(traverse_obj( 'thumbnails': self._extract_thumbnails(traverse_obj(
art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))), art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
} }