release 2013.07.08

[arte] Always look for the JSON URL (Fixes #1002 )
Merge branch 'master' of github.com:rg3/youtube-dl
2025-08-03 10:59:47 -05:00 · 2013-07-08 01:29:16 +02:00 · 2013-07-08 01:28:19 +02:00 · 2013-07-08 01:15:19 +02:00 · 2013-07-08 01:13:55 +02:00 · 2013-07-07 23:24:34 +02:00
8 changed files with 163 additions and 9 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -11,6 +11,7 @@ from .comedycentral import ComedyCentralIE
 from .cspan import CSpanIE
 from .dailymotion import DailymotionIE
 from .depositfiles import DepositFilesIE
+from .dreisat import DreiSatIE
 from .eighttracks import EightTracksIE
 from .escapist import EscapistIE
 from .facebook import FacebookIE
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -77,12 +77,8 @@ class ArteTvIE(InfoExtractor):

    def _extract_emission(self, url, video_id, lang):
        """Extract from www.arte.tv/guide"""
-        if video_id.replace('-','').isdigit():
-            json_url = 'http://org-www.arte.tv/papi/tvguide/videos/stream/player/F/%s_PLUS7-F/ALL/ALL.json' % video_id
-        else:
-            # We don't know the real id of the video, we have to search in the webpage
-            webpage = self._download_webpage(url, video_id)
-            json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+        webpage = self._download_webpage(url, video_id)
+        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -3,6 +3,7 @@ import os
 import re
 import socket
 import sys
+import netrc

 from ..utils import (
    compat_http_client,
@@ -36,6 +37,8 @@ class InfoExtractor(object):
    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
+    thumbnails:     A list of dictionaries (with the entries "resolution" and
+                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
@@ -161,6 +164,10 @@ class InfoExtractor(object):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

+    def report_login(self):
+        """Report attempt to log in (prints u'Logging in' via to_screen)."""
+        self.to_screen(u'Logging in')
+
    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
@@ -225,6 +232,36 @@ class InfoExtractor(object):
        else:
            return res

+    def _get_login_info(self):
+        """
+        Get the login info as (username, password)
+
+        The 'username' and 'password' downloader params take precedence.
+        Otherwise, if the 'usenetrc' param is set, it will look in the netrc
+        file using the _NETRC_MACHINE value; a missing entry or an unreadable
+        file is only reported as a warning.
+        If there's no info available, return (None, None)
+        """
+        if self._downloader is None:
+            return (None, None)
+
+        username = None
+        password = None
+        downloader_params = self._downloader.params
+
+        # Attempt to use provided username and password or .netrc data
+        if downloader_params.get('username', None) is not None:
+            username = downloader_params['username']
+            # A username may be given without a password: report the password
+            # as None instead of failing with a KeyError
+            password = downloader_params.get('password', None)
+        elif downloader_params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+                if info is not None:
+                    # authenticators() returns (login, account, password)
+                    username = info[0]
+                    password = info[2]
+                else:
+                    # Raised only to share the warning path below
+                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+
+        return (username, password)
+
 class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class DreiSatIE(InfoExtractor):
+    """Information extractor for the 3sat.de mediathek."""
+
+    IE_NAME = '3sat'
+    # The dots of the host and the script name are escaped so that they only
+    # match a literal '.'
+    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+    _TEST = {
+        u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
+        u'file': u'36983.webm',
+        u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
+        u'info_dict': {
+            u"title": u"Kaffeeland Schweiz",
+            u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",
+            u"uploader": u"3sat",
+            u"upload_date": u"20130622"
+        }
+    }
+
+    def _real_extract(self, url):
+        """
+        Download the details XML of the video and build its info dictionary.
+
+        The formats are sorted from worst to best, the last one is also
+        exposed as 'url' and 'ext'.
+        Raises ExtractorError if the details list no usable format.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
+        details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))
+
+        # The "key" attribute is split at the 'x' into width and height
+        # NOTE(review): InfoExtractor documents the thumbnails entries as
+        # "resolution" and "url", these carry "width" and "height" (as
+        # strings) instead - confirm which one is intended
+        thumbnail_els = details_doc.findall('.//teaserimage')
+        thumbnails = [{
+            'width': te.attrib['key'].partition('x')[0],
+            'height': te.attrib['key'].partition('x')[2],
+            'url': te.text,
+        } for te in thumbnail_els]
+
+        information_el = details_doc.find('.//information')
+        video_title = information_el.find('./title').text
+        video_description = information_el.find('./detail').text
+
+        details_el = details_doc.find('.//details')
+        video_uploader = details_el.find('./channel').text
+        upload_date = unified_strdate(details_el.find('./airtime').text)
+
+        # Entries pointing to metafilegenerator.de are skipped
+        format_els = details_doc.findall('.//formitaet')
+        formats = [{
+            'format_id': fe.attrib['basetype'],
+            'width': int(fe.find('./width').text),
+            'height': int(fe.find('./height').text),
+            'url': fe.find('./url').text,
+            'filesize': int(fe.find('./filesize').text),
+            'video_bitrate': int(fe.find('./videoBitrate').text),
+            '3sat_qualityname': fe.find('./quality').text,
+        } for fe in format_els
+            if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]
+
+        if not formats:
+            # Fail with a readable message instead of an IndexError below
+            raise ExtractorError(u'No formats found for video %s' % video_id)
+
+        # Known quality names, from worst to best
+        quality_order = ['low', 'med', 'high', 'veryhigh']
+
+        def _sortkey(fmt):
+            quality = fmt['3sat_qualityname']
+            # An unknown quality name sorts before the known ones instead of
+            # aborting the extraction with a ValueError
+            qidx = quality_order.index(quality) if quality in quality_order else -1
+            # NOTE(review): despite its name, this ranks rtmp URLs above the
+            # other ones of the same quality (the last format is the one
+            # that gets used) - confirm that this is intended
+            prefer_http = 1 if 'rtmp' in fmt['url'] else 0
+            return (qidx, prefer_http, fmt['video_bitrate'])
+        formats.sort(key=_sortkey)
+
+        info = {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'description': video_description,
+            'thumbnails': thumbnails,
+            'uploader': video_uploader,
+            'upload_date': upload_date,
+        }
+        # The details may not contain any teaser image, 'thumbnail' is
+        # optional and is left out in that case
+        if thumbnails:
+            info['thumbnail'] = thumbnails[-1]['url']
+
+        # TODO: Remove when #980 has been merged
+        info['url'] = formats[-1]['url']
+        info['ext'] = determine_ext(formats[-1]['url'])
+
+        return self.video_result(info)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -17,6 +17,7 @@ class VimeoIE(InfoExtractor):

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
+    _NETRC_MACHINE = 'vimeo'
    IE_NAME = u'vimeo'
    _TEST = {
        u'url': u'http://vimeo.com/56015672',
@@ -31,6 +32,25 @@ class VimeoIE(InfoExtractor):
        }
    }

+    def _login(self):
+        """
+        Log in to Vimeo with the credentials returned by _get_login_info()
+
+        Does nothing if no username is available. If the login page does not
+        contain the xsrft token, a warning is reported and the login is
+        skipped.
+        """
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        login_url = 'https://vimeo.com/log_in'
+        webpage = self._download_webpage(login_url, None, False)
+        # The token has to be sent both as a form field and as a cookie
+        m_token = re.search(r'xsrft: \'(.*?)\'', webpage)
+        if m_token is None:
+            # Don't fail with an AttributeError if the page layout changed
+            self._downloader.report_warning(u'unable to find the login token, continuing without logging in')
+            return
+        token = m_token.group(1)
+        # urlencode returns text on Python 3, but the POST data must be
+        # bytes; the urlencoded form is plain ASCII
+        data = compat_urllib_parse.urlencode({'email': username,
+                                              'password': password,
+                                              'action': 'login',
+                                              'service': 'vimeo',
+                                              'token': token,
+                                              }).encode('ascii')
+        login_request = compat_urllib_request.Request(login_url, data)
+        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_request.add_header('Cookie', 'xsrft=%s' % token)
+        self._download_webpage(login_request, None, False, u'Wrong login info')
+
    def _verify_video_password(self, url, video_id, webpage):
        password = self._downloader.params.get('videopassword', None)
        if password is None:
@@ -50,6 +70,9 @@ class VimeoIE(InfoExtractor):
                               u'Verifying the password',
                               u'Wrong password')

+    def _real_initialize(self):
+        """Initialize the extractor by attempting to log in (see _login)."""
+        self._login()
+
    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -473,7 +473,12 @@ class YoutubeIE(InfoExtractor):
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
-        if 'thumbnail_url' not in video_info:
+        # We try first to get a high quality image:
+        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+                            video_webpage, re.DOTALL)
+        if m_thumb is not None:
+            video_thumbnail = m_thumb.group(1)
+        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -623,7 +623,7 @@ def unified_strdate(date_str):
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
-    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
+    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -631,6 +631,13 @@ def unified_strdate(date_str):
            pass
    return upload_date

+def determine_ext(url):
+    """
+    Guess the file extension from the url.
+    Return u'unknown_video' if the guess is not purely alphanumeric.
+    """
+    # Drop the query string, then keep what follows the last dot
+    without_query = url.partition(u'?')[0]
+    candidate = without_query.rpartition(u'.')[2]
+    if not re.match(r'^[A-Za-z0-9]+$', candidate):
+        return u'unknown_video'
+    return candidate
+
 def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@

-__version__ = '2013.07.07.01'
+__version__ = '2013.07.08'
Author	SHA1	Message	Date
Philipp Hagemeister	b04621d155	release 2013.07.08	2013-07-08 01:29:16 +02:00
Philipp Hagemeister	b227060388	[arte] Always look for the JSON URL (Fixes #1002 )	2013-07-08 01:28:19 +02:00
Philipp Hagemeister	d93e4dcbb7	Merge branch 'master' of github.com:rg3/youtube-dl	2013-07-08 01:15:19 +02:00
Philipp Hagemeister	73e79f2a1b	[3sat] Add support (Fixes #1001 )	2013-07-08 01:13:55 +02:00
Jaime Marquínez Ferrándiz	fc79158de2	VimeoIE: authentication support (closes #885 ) and add a method in the base InfoExtractor to get the login info	2013-07-07 23:24:34 +02:00
Jaime Marquínez Ferrándiz	7763b04e5f	YoutubeIE: extract the thumbnail in the best possible quality	2013-07-07 21:21:15 +02:00