mirror of
https://github.com/yt-dlp/yt-dlp
synced 2025-05-10 07:22:13 -05:00
[ie/nytimesarticle] Fix extraction (#13104)
Closes #13098 Authored by: bashonly
This commit is contained in:
parent
f123cc83b3
commit
b26bc32579
@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
|
||||
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
|
||||
'duration': 119.0,
|
||||
},
|
||||
'skip': 'HTTP Error 500: Internal Server Error',
|
||||
}, {
|
||||
# article with audio and no video
|
||||
'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
|
||||
@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE):
|
||||
'ext': 'mp3',
|
||||
'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
|
||||
'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
|
||||
'timestamp': 1695960700,
|
||||
'timestamp': 1696008129,
|
||||
'upload_date': '20230929',
|
||||
'creator': 'Stephanie Nolen, Natalija Gormalova',
|
||||
'creators': ['Stephanie Nolen', 'Natalija Gormalova'],
|
||||
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
|
||||
'duration': 1322,
|
||||
},
|
||||
}, {
|
||||
# lede_media_block already has sourceId
|
||||
'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
|
||||
'md5': '3eb5ddb1d6f86254fe4f233826778737',
|
||||
'info_dict': {
|
||||
@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
|
||||
'timestamp': 1701290997,
|
||||
'upload_date': '20231129',
|
||||
'uploader': 'By The New York Times',
|
||||
'creator': 'Katie Rogers',
|
||||
'creators': ['Katie Rogers'],
|
||||
'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
|
||||
'duration': 97.631,
|
||||
},
|
||||
@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE):
|
||||
'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
|
||||
'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
|
||||
'upload_date': '20231202',
|
||||
'creator': 'Emily Steel, Sydney Ember',
|
||||
'creators': ['Emily Steel', 'Sydney Ember'],
|
||||
'timestamp': 1701511264,
|
||||
},
|
||||
'playlist_count': 3,
|
||||
}, {
|
||||
# lede_media_block does not have sourceId
|
||||
'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html',
|
||||
'info_dict': {
|
||||
'id': 'hip-mobility-routine',
|
||||
'title': 'Tight Hips? These Moves Can Help.',
|
||||
'description': 'Sitting all day is hard on your hips. Try this simple routine for better mobility.',
|
||||
'creators': ['Alyssa Ages', 'Theodore Tae'],
|
||||
'timestamp': 1746003629,
|
||||
'upload_date': '20250430',
|
||||
},
|
||||
'playlist_count': 7,
|
||||
}, {
|
||||
'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
|
||||
'only_matching': True,
|
||||
@ -256,14 +270,18 @@ class NYTimesArticleIE(NYTimesBaseIE):
|
||||
|
||||
def _real_extract(self, url):
|
||||
page_id = self._match_id(url)
|
||||
webpage = self._download_webpage(url, page_id)
|
||||
webpage = self._download_webpage(url, page_id, impersonate=True)
|
||||
art_json = self._search_json(
|
||||
r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
|
||||
transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
|
||||
content = art_json['sprinkledBody']['content']
|
||||
|
||||
blocks = traverse_obj(art_json, (
|
||||
'sprinkledBody', 'content', ..., ('ledeMedia', None),
|
||||
lambda _, v: v['__typename'] in ('Video', 'Audio')))
|
||||
blocks = []
|
||||
block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio')
|
||||
if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)):
|
||||
lede_media_block.setdefault('sourceId', art_json.get('sourceId'))
|
||||
blocks.append(lede_media_block)
|
||||
blocks.extend(traverse_obj(content, (..., block_filter)))
|
||||
if not blocks:
|
||||
raise ExtractorError('Unable to extract any media blocks from webpage')
|
||||
|
||||
@ -273,8 +291,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
|
||||
'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
|
||||
get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
|
||||
'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
|
||||
'creator': ', '.join(
|
||||
traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list)
|
||||
'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})),
|
||||
'thumbnails': self._extract_thumbnails(traverse_obj(
|
||||
art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user