[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a timed sequence of stand-alone images, such as slideshows or thumbnail streams. This can be used for implementing: https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762 https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231 https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239 https://github.com/ytdl-org/youtube-dl/issues/9868 https://github.com/ytdl-org/youtube-dl/pull/14951 Authored by: fstirlitz
This also adds extraction of storyboards from DASH manifests as mhtml.
2025-07-03 20:42:53 -05:00 · 2021-05-23 18:34:49 +02:00 · 2021-06-13 22:41:29 +05:30
parent 4d85fbbdbb
commit cdb19aa4c2
6 changed files with 248 additions and 16 deletions
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -2126,6 +2126,7 @@ class InfoExtractor(object):
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
+                        'format_note': name,
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
@ -2637,7 +2638,7 @@ class InfoExtractor(object):
                    mime_type = representation_attrib['mimeType']
                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

-                    if content_type in ('video', 'audio', 'text'):
+                    if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
                        base_url = ''
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
@ -2654,9 +2655,15 @@ class InfoExtractor(object):
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+                        if representation_id is not None:
+                            format_id = representation_id
+                        else:
+                            format_id = content_type
+                        if mpd_id:
+                            format_id = mpd_id + '-' + format_id
                        if content_type in ('video', 'audio'):
                            f = {
-                                'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+                                'format_id': format_id,
                                'manifest_url': mpd_url,
                                'ext': mimetype2ext(mime_type),
                                'width': int_or_none(representation_attrib.get('width')),
@ -2676,6 +2683,17 @@ class InfoExtractor(object):
                                'manifest_url': mpd_url,
                                'filesize': filesize,
                            }
+                        elif mime_type == 'image/jpeg':
+                            # See test case in VikiIE
+                            # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
+                            f = {
+                                'format_id': format_id,
+                                'ext': 'mhtml',
+                                'manifest_url': mpd_url,
+                                'format_note': 'DASH storyboards (jpeg)',
+                                'acodec': 'none',
+                                'vcodec': 'none',
+                            }
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
@ -2694,7 +2712,8 @@ class InfoExtractor(object):
                                    t += c
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
-                            t = t.replace('$RepresentationID$', representation_id)
+                            if representation_id is not None:
+                                t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            t.replace('$$', '$')
@ -2811,7 +2830,7 @@ class InfoExtractor(object):
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'fragments': [],
-                                'protocol': 'http_dash_segments',
+                                'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
@ -2822,7 +2841,7 @@ class InfoExtractor(object):
                        else:
                            # Assuming direct URL to unfragmented media.
                            f['url'] = base_url
-                        if content_type in ('video', 'audio'):
+                        if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
                            formats.append(f)
                        elif content_type == 'text':
                            subtitles.setdefault(lang or 'und', []).append(f)