From 48be862b32648bff5b3e553e40fca4dcc6e88b28 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 27 Mar 2025 17:31:01 -0500 Subject: [PATCH] [ie/youtube] Make signature and nsig extraction more robust (#12761) Authored by: bashonly, seproDev Co-authored-by: sepro --- test/test_youtube_signature.py | 78 ++++++++++++++++++++++++--- yt_dlp/extractor/youtube/_video.py | 87 ++++++++++++++++++++++-------- 2 files changed, 136 insertions(+), 29 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index c79fdc9df8..0f0885366e 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -88,11 +88,51 @@ _SIG_TESTS = [ '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', ), + ( + 'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), ( 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', ), + ( + 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js', + 
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', + ), ] _NSIG_TESTS = [ @@ -252,6 +292,30 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', ), + ( + 
'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', + 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', + ), + ( + 'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', + 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), + ( + 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', + 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', + ), ] @@ -302,33 +366,33 @@ def t_factory(name, sig_func, url_pattern): test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id')) def test_func(self): - basename = f'player-{name}-{test_id}.js' + basename = f'player-{test_id}.js' fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): urllib.request.urlretrieve(url, fn) with open(fn, encoding='utf-8') as testf: jscode = testf.read() - self.assertEqual(sig_func(jscode, sig_input), expected_sig) + self.assertEqual(sig_func(jscode, sig_input, url), expected_sig) test_func.__name__ = f'test_{name}_js_{test_id}' setattr(TestSignature, test_func.__name__, test_func) return make_tfunc -def signature(jscode, sig_input): - func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) +def signature(jscode, sig_input, player_url): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url) src_sig = ( str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) return func(src_sig) -def n_sig(jscode, sig_input): +def n_sig(jscode, sig_input, player_url): ie = YoutubeIE(FakeYDL()) - funcname = ie._extract_n_function_name(jscode) + funcname = ie._extract_n_function_name(jscode, 
player_url=player_url) jsi = JSInterpreter(jscode) - func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode)) + func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url)) return func([sig_input]) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index e349b36517..d86bbaff86 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -34,6 +34,7 @@ from ...utils import ( clean_html, datetime_from_str, filesize_from_tbr, + filter_dict, float_or_none, format_field, get_first, @@ -1986,12 +1987,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): assert os.path.basename(func_id) == func_id self.write_debug(f'Extracting signature function {func_id}') - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.03.27'), None if not cache_spec: code = self._load_player(video_id, player_url) if code: - res = self._parse_sig_js(code) + res = self._parse_sig_js(code, player_url) test_string = ''.join(map(chr, range(len(example_sig)))) cache_spec = [ord(c) for c in res(test_string)] self.cache.store('youtube-sigfuncs', func_id, cache_spec) @@ -2039,7 +2040,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f' return {expr_code}\n') self.to_screen('Extracted signature function:\n' + code) - def _parse_sig_js(self, jscode): + def _parse_sig_js(self, jscode, player_url): # Examples where `sig` is funcname: # sig=function(a){a=a.split(""); ... 
;return a.join("")}; # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; @@ -2063,12 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) jsi = JSInterpreter(jscode) - global_var_map = {} - _, varname, value = self._extract_player_js_global_var(jscode) - if varname: - global_var_map[varname] = jsi.interpret_expression(value, {}, allow_recursion=100) - initial_function = jsi.extract_function(funcname, global_var_map) + initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list})) return lambda s: initial_function([s]) def _cached(self, func, *cache_id): @@ -2093,7 +2091,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if func_code := self._player_cache.get(cache_id): return func_code - func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.26') + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.03.27') if func_code: self._player_cache[cache_id] = func_code @@ -2150,6 +2148,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret def _extract_n_function_name(self, jscode, player_url=None): + varname, global_list = self._interpret_player_js_global_var(jscode, player_url) + if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('_w8_'), any)): + funcname = self._search_regex( + r'''(?xs) + [;\n](?: + (?P<f>function\s+)| + (?:var\s+)? 
+ )(?P<funcname>[a-zA-Z0-9_$]+)\s*(?(f)|=\s*function\s*) + \((?P<argname>[a-zA-Z0-9_$]+)\)\s*\{ + (?:(?!\}[;\n]).)+ + \}\s*catch\(\s*[a-zA-Z0-9_$]+\s*\)\s* + \{\s*return\s+%s\[%d\]\s*\+\s*(?P=argname)\s*\}\s*return\s+[^}]+\}[;\n] + ''' % (re.escape(varname), global_list.index(debug_str)), + jscode, 'nsig function name', group='funcname', default=None) + if funcname: + return funcname + self.write_debug(join_nonempty( + 'Initial search was unable to find nsig function name', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + # Examples (with placeholders nfunc, narray, idx): # * .get("n"))&&(b=nfunc(b) # * .get("n"))&&(b=narray[idx](b) @@ -2179,7 +2197,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not funcname: self.report_warning(join_nonempty( 'Falling back to generic n function search', - player_url and f' player = {player_url}', delim='\n')) + player_url and f' player = {player_url}', delim='\n'), only_once=True) return self._search_regex( r'''(?xs) ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) @@ -2192,9 +2210,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - def _extract_player_js_global_var(self, jscode): + def _extract_player_js_global_var(self, jscode, player_url): """Returns tuple of strings: variable assignment code, variable name, variable value code""" - return self._search_regex( + extract_global_var = self._cached(self._search_regex, 'js global array', player_url) + varcode, varname, varvalue = extract_global_var( r'''(?x) (?P<q1>["\'])use\s+strict(?P=q1);\s* (?P<code> @@ -2206,17 +2225,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )[;,] ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) + if not varcode: + self.write_debug(join_nonempty( + 'No global array variable found in player JS', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + 
return varcode, varname, varvalue - def _fixup_n_function_code(self, argnames, code, full_code): - global_var, varname, _ = self._extract_player_js_global_var(full_code) - if global_var: - self.write_debug(f'Prepending n function code with global array variable "{varname}"') - code = global_var + '; ' + code + def _interpret_player_js_global_var(self, jscode, player_url): + """Returns tuple of: variable name string, variable value list""" + _, varname, array_code = self._extract_player_js_global_var(jscode, player_url) + jsi = JSInterpreter(array_code) + interpret_global_var = self._cached(jsi.interpret_expression, 'js global list', player_url) + return varname, interpret_global_var(array_code, {}, allow_recursion=10) + + def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url): + varcode, varname, _ = self._extract_player_js_global_var(jscode, player_url) + if varcode and varname: + nsig_code = varcode + '; ' + nsig_code + _, global_list = self._interpret_player_js_global_var(jscode, player_url) else: - self.write_debug('No global array variable found in player JS') - return argnames, re.sub( - rf';\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?:(["\'])undefined\1|{varname}\[\d+\])\s*\)\s*return\s+{argnames[0]};', - ';', code) + varname = 'dlp_wins' + global_list = [] + + undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+' + fixed_code = re.sub( + rf'''(?x) + ;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?: + (["\'])undefined\1| + {re.escape(varname)}\[{undefined_idx}\] + )\s*\)\s*return\s+{re.escape(argnames[0])}; + ''', ';', nsig_code) + if fixed_code == nsig_code: + self.write_debug(join_nonempty( + 'No typeof statement found in nsig function code', + player_url and f' player = {player_url}', delim='\n'), only_once=True) + return argnames, fixed_code def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) @@ -2230,7 +2273,7 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): func_name = self._extract_n_function_name(jscode, player_url=player_url) # XXX: Workaround for the global array variable and lack of `typeof` implementation - func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode) + func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url) return jsi, player_id, func_code