From b26bc32579c00ef579d75a835807ccc87d20ee0a Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Tue, 6 May 2025 15:32:41 -0500
Subject: [PATCH] [ie/nytimesarticle] Fix extraction (#13104)

Closes #13098
Authored by: bashonly
---
 yt_dlp/extractor/nytimes.py | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py
index a97add71a..3472a2159 100644
--- a/yt_dlp/extractor/nytimes.py
+++ b/yt_dlp/extractor/nytimes.py
@@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
             'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
             'duration': 119.0,
         },
+        'skip': 'HTTP Error 500: Internal Server Error',
     }, {
         # article with audio and no video
         'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
@@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE):
             'ext': 'mp3',
             'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
             'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
-            'timestamp': 1695960700,
+            'timestamp': 1696008129,
             'upload_date': '20230929',
-            'creator': 'Stephanie Nolen, Natalija Gormalova',
+            'creators': ['Stephanie Nolen', 'Natalija Gormalova'],
             'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
             'duration': 1322,
         },
     }, {
+        # lede_media_block already has sourceId
         'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
         'md5': '3eb5ddb1d6f86254fe4f233826778737',
         'info_dict': {
@@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
             'timestamp': 1701290997,
             'upload_date': '20231129',
             'uploader': 'By The New York Times',
-            'creator': 'Katie Rogers',
+            'creators': ['Katie Rogers'],
             'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
             'duration': 97.631,
         },
@@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE):
             'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
             'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
             'upload_date': '20231202',
-            'creator': 'Emily Steel, Sydney Ember',
+            'creators': ['Emily Steel', 'Sydney Ember'],
             'timestamp': 1701511264,
         },
         'playlist_count': 3,
+    }, {
+        # lede_media_block does not have sourceId
+        'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html',
+        'info_dict': {
+            'id': 'hip-mobility-routine',
+            'title': 'Tight Hips? These Moves Can Help.',
+            'description': 'Sitting all day is hard on your hips. Try this simple routine for better mobility.',
+            'creators': ['Alyssa Ages', 'Theodore Tae'],
+            'timestamp': 1746003629,
+            'upload_date': '20250430',
+        },
+        'playlist_count': 7,
     }, {
         'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
         'only_matching': True,
@@ -256,14 +270,18 @@ class NYTimesArticleIE(NYTimesBaseIE):
 
     def _real_extract(self, url):
         page_id = self._match_id(url)
-        webpage = self._download_webpage(url, page_id)
+        webpage = self._download_webpage(url, page_id, impersonate=True)
         art_json = self._search_json(
             r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
             transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']
+        content = art_json['sprinkledBody']['content']
 
-        blocks = traverse_obj(art_json, (
-            'sprinkledBody', 'content', ..., ('ledeMedia', None),
-            lambda _, v: v['__typename'] in ('Video', 'Audio')))
+        blocks = []
+        block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio')
+        if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)):
+            lede_media_block.setdefault('sourceId', art_json.get('sourceId'))
+            blocks.append(lede_media_block)
+        blocks.extend(traverse_obj(content, (..., block_filter)))
         if not blocks:
             raise ExtractorError('Unable to extract any media blocks from webpage')
 
@@ -273,8 +291,7 @@ class NYTimesArticleIE(NYTimesBaseIE):
                 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
                 get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
             'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
-            'creator': ', '.join(
-                traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))),  # TODO: change to 'creators' (list)
+            'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})),
             'thumbnails': self._extract_thumbnails(traverse_obj(
                 art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
         }