From 5271ef48c6f61c145e03e18e960995d2e651d205 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 18 Feb 2025 20:20:50 -0600 Subject: [PATCH] [ie/gem.cbc.ca] Fix extractors (#12404) Does not fix broken login support Closes #11848 Authored by: bashonly, dirkf Co-authored-by: dirkf --- yt_dlp/extractor/cbc.py | 180 +++++++++++++++++++++------------------- 1 file changed, 95 insertions(+), 85 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index c0cf3da3d..e63c608fc 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -14,16 +14,18 @@ from ..utils import ( js_to_json, mimetype2ext, orderedSet, + parse_age_limit, parse_iso8601, replace_extension, smuggle_url, strip_or_none, - traverse_obj, try_get, + unified_timestamp, update_url, url_basename, url_or_none, ) +from ..utils.traversal import require, traverse_obj, trim_str class CBCIE(InfoExtractor): @@ -516,9 +518,43 @@ class CBCPlayerPlaylistIE(InfoExtractor): return self.playlist_result(entries(), playlist_id) -class CBCGemIE(InfoExtractor): +class CBCGemBaseIE(InfoExtractor): + _NETRC_MACHINE = 'cbcgem' + _GEO_COUNTRIES = ['CA'] + + def _call_show_api(self, item_id, display_id=None): + return self._download_json( + f'https://services.radio-canada.ca/ott/catalog/v2/gem/show/{item_id}', + display_id or item_id, query={'device': 'web'}) + + def _extract_item_info(self, item_info): + episode_number = None + title = traverse_obj(item_info, ('title', {str})) + if title and (mobj := re.match(r'(?P\d+)\. (?P.+)', title)): + episode_number = int_or_none(mobj.group('episode')) + title = mobj.group('title') + + return { + 'episode_number': episode_number, + **traverse_obj(item_info, { + 'id': ('url', {str}), + 'episode_id': ('url', {str}), + 'description': ('description', {str}), + 'thumbnail': ('images', 'card', 'url', {url_or_none}, {update_url(query=None)}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'duration': ('metadata', 'duration', {int_or_none}), + 'release_timestamp': ('metadata', 'airDate', {unified_timestamp}), + 'timestamp': ('metadata', 'availabilityDate', {unified_timestamp}), + 'age_limit': ('metadata', 'rating', {trim_str(start='C')}, {parse_age_limit}), + }), + 'episode': title, + 'title': title, + } + + +class CBCGemIE(CBCGemBaseIE): IE_NAME = 'gem.cbc.ca' - _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s(?P<season>[0-9]+)[a-z][0-9]+)' _TESTS = [{ # This is a normal, public, TV show video 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', @@ -529,7 +565,7 @@ class CBCGemIE(InfoExtractor): 'description': 'md5:929868d20021c924020641769eb3e7f1', 'thumbnail': r're:https://images\.radio-canada\.ca/[^#?]+/cbc_schitts_creek_season_06e01_thumbnail_v01\.jpg', 'duration': 1324, - 'categories': ['comedy'], + 'genres': ['Comédie et humour'], 'series': 'Schitt\'s Creek', 'season': 'Season 6', 'season_number': 6, @@ -537,9 +573,10 @@ class CBCGemIE(InfoExtractor): 'episode_number': 1, 'episode_id': 'schitts-creek/s06e01', 'upload_date': '20210618', - 'timestamp': 1623988800, + 'timestamp': 1623974400, 'release_date': '20200107', - 'release_timestamp': 1578427200, + 'release_timestamp': 1578355200, + 'age_limit': 14, }, 'params': {'format': 'bv'}, }, { @@ -557,12 +594,13 @@ class CBCGemIE(InfoExtractor): 'episode_number': 1, 'episode': 'The Cup Runneth Over', 'episode_id': 'schitts-creek/s01e01', - 'duration': 1309, - 'categories': ['comedy'], + 'duration': 1308, + 'genres': ['Comédie et humour'], 'upload_date': '20210617', - 'timestamp': 1623902400, - 'release_date': '20151124', - 'release_timestamp': 1448323200, + 'timestamp': 1623888000, + 'release_date': '20151123', + 'release_timestamp': 1448236800, + 'age_limit': 14, }, 'params': {'format': 'bv'}, }, { @@ -570,9 +608,7 @@ class CBCGemIE(InfoExtractor): 'only_matching': True, }] - _GEO_COUNTRIES = ['CA'] _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' - _NETRC_MACHINE = 'cbcgem' _claims_token = None def _new_claims_token(self, email, password): @@ -634,10 +670,12 @@ class CBCGemIE(InfoExtractor): self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token') def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_json( - f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}', - video_id, expected_status=426) + video_id, season_number = self._match_valid_url(url).group('id', 'season') + video_info = self._call_show_api(video_id) + item_info = traverse_obj(video_info, ( + 'content', ..., 'lineups', ..., 'items', + lambda _, v: v['url'] == video_id, any, {require('item info')})) + media_id = item_info['idMedia'] email, password = self._get_login_info() if email and password: @@ -645,7 +683,20 @@ class CBCGemIE(InfoExtractor): headers = {'x-claims-token': claims_token} else: headers = {} - m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + + m3u8_info = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', + video_id, headers=headers, query={ + 'appCode': 'gem', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestVersion': '2', + 'manifestType': 'desktop', + 'idMedia': media_id, + }) if m3u8_info.get('errorCode') == 1: self.raise_geo_restricted(countries=['CA']) @@ -671,26 +722,20 @@ class CBCGemIE(InfoExtractor): fmt['preference'] = -2 return { + 'season_number': int_or_none(season_number), + **traverse_obj(video_info, { + 'series': ('title', {str}), + 'season_number': ('structuredMetadata', 'partofSeason', 'seasonNumber', {int_or_none}), + 'genres': ('structuredMetadata', 'genre', ..., {str}), + }), + **self._extract_item_info(item_info), 'id': video_id, 'episode_id': video_id, 'formats': formats, - **traverse_obj(video_info, { - 'title': ('title', {str}), - 'episode': ('title', {str}), - 'description': ('description', {str}), - 'thumbnail': ('image', {url_or_none}), - 'series': ('series', {str}), - 'season_number': ('season', {int_or_none}), - 'episode_number': ('episode', {int_or_none}), - 'duration': ('duration', {int_or_none}), - 'categories': ('category', {str}, all), - 'release_timestamp': ('airDate', {int_or_none(scale=1000)}), - 'timestamp': ('availableDate', {int_or_none(scale=1000)}), - }), } -class CBCGemPlaylistIE(InfoExtractor): +class CBCGemPlaylistIE(CBCGemBaseIE): IE_NAME = 'gem.cbc.ca:playlist' _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)' _TESTS = [{ @@ -700,70 +745,35 @@ class CBCGemPlaylistIE(InfoExtractor): 'info_dict': { 'id': 'schitts-creek/s06', 'title': 'Season 6', - 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', 'series': 'Schitt\'s Creek', 'season_number': 6, 'season': 'Season 6', - 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75', }, }, { 'url': 'https://gem.cbc.ca/schitts-creek/s06', 'only_matching': True, }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/' + + def _entries(self, season_info): + for episode in traverse_obj(season_info, ('items', lambda _, v: v['url'])): + yield self.url_result( + f'https://gem.cbc.ca/media/{episode["url"]}', CBCGemIE, + **self._extract_item_info(episode)) def _real_extract(self, url): - match = self._match_valid_url(url) - season_id = match.group('id') - show = match.group('show') - show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426) - season = int(match.group('season')) + season_id, show, season = self._match_valid_url(url).group('id', 'show', 'season') + show_info = self._call_show_api(show, display_id=season_id) + season_info = traverse_obj(show_info, ( + 'content', ..., 'lineups', + lambda _, v: v['seasonNumber'] == int(season), any, {require('season info')})) - season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) - - if season_info is None: - raise ExtractorError(f'Couldn\'t find season {season} of {show}') - - episodes = [] - for episode in season_info['assets']: - episodes.append({ - '_type': 'url_transparent', - 'ie_key': 'CBCGem', - 'url': 'https://gem.cbc.ca/media/' + episode['id'], - 'id': episode['id'], - 'title': episode.get('title'), - 'description': episode.get('description'), - 'thumbnail': episode.get('image'), - 'series': episode.get('series'), - 'season_number': episode.get('season'), - 'season': season_info['title'], - 'season_id': season_info.get('id'), - 'episode_number': episode.get('episode'), - 'episode': episode.get('title'), - 'episode_id': episode['id'], - 'duration': episode.get('duration'), - 'categories': [episode.get('category')], - }) - - thumbnail = None - tn_uri = season_info.get('image') - # the-national was observed to use a "data:image/png;base64" - # URI for their 'image' value. The image was 1x1, and is - # probably just a placeholder, so it is ignored. - if tn_uri is not None and not tn_uri.startswith('data:'): - thumbnail = tn_uri - - return { - '_type': 'playlist', - 'entries': episodes, - 'id': season_id, - 'title': season_info['title'], - 'description': season_info.get('description'), - 'thumbnail': thumbnail, - 'series': show_info.get('title'), - 'season_number': season_info.get('season'), - 'season': season_info['title'], - } + return self.playlist_result( + self._entries(season_info), season_id, + **traverse_obj(season_info, { + 'title': ('title', {str}), + 'season': ('title', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + }), series=traverse_obj(show_info, ('title', {str}))) class CBCGemLiveIE(InfoExtractor):