release 2013.08.28

Delete default user agent (Fixes #1309)
Revert "Install our own HTTPS handler as well (#1309)"
2025-08-03 02:50:01 -05:00 · 2013-08-27 23:31:36 +02:00 · 2013-08-27 23:31:24 +02:00 · 2013-08-27 23:28:20 +02:00 · 2013-08-27 13:36:39 -07:00 · 2013-08-27 14:38:50 -05:00
10 changed files with 146 additions and 31 deletions
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -430,6 +430,10 @@ def _real_main(argv=None):
    proxy_handler = compat_urllib_request.ProxyHandler(proxies)
    https_handler = make_HTTPS_handler(opts)
    opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
+    # Delete the default user-agent header, which would otherwise apply in
+    # cases where our custom HTTP handler doesn't come into play
+    # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+    opener.addheaders =[]
    compat_urllib_request.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -8,6 +8,8 @@ from .breakcom import BreakIE
 from .brightcove import BrightcoveIE
 from .c56 import C56IE
 from .canalplus import CanalplusIE
+from .canalc2 import Canalc2IE
+from .cnn import CNNIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
 from .condenast import CondeNastIE
@@ -52,6 +54,7 @@ from .muzu import MuzuTVIE
 from .myspass import MySpassIE
 from .myvideo import MyVideoIE
 from .nba import NBAIE
+from .nbc import NBCNewsIE
 from .ooyala import OoyalaIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+
+class Canalc2IE(InfoExtractor):
+    _IE_NAME = 'canalc2.tv'
+    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui'
+
+    _TEST = {
+        u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+        u'file': u'12163.mp4',
+        u'md5': u'060158428b650f896c542dfbb3d6487f',
+        u'info_dict': {
+            u'title': u'Terrasses du Numérique'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group(1)
+        webpage = self._download_webpage(url, video_id)
+        file_name = self._search_regex(
+            r"so\.addVariable\('file','(.*?)'\);",
+            webpage, 'file name')
+        video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+
+        title = self._html_search_regex(
+            r'class="evenement8">(.*?)</a>', webpage, u'title')
+        
+        return {'id': video_id,
+                'ext': 'mp4',
+                'url': video_url,
+                'title': title,
+                }
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
 from ..utils import unified_strdate

 class CanalplusIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)'
    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
    IE_NAME = u'canalplus.fr'

--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -0,0 +1,47 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class CNNIE(InfoExtractor):
+    _VALID_URL = r'https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/(?P<path>.+?/(?P<title>[^/]+?)\.cnn)'
+
+    _TEST = {
+        u'url': u'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
+        u'file': u'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
+        u'md5': u'3e6121ea48df7e2259fe73a0628605c4',
+        u'info_dict': {
+            u'title': u'Nadal wins 8th French Open title',
+            u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        path = mobj.group('path')
+        page_title = mobj.group('title')
+        info_xml = self._download_webpage(
+            'http://cnn.com/video/data/3.0/%s/index.xml' % path, page_title)
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+
+        formats = []
+        for f in info.findall('files/file'):
+            mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate'])
+            if mf is not None:
+                formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
+        formats = sorted(formats)
+        (_,_,_, video_path) = formats[-1]
+        video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path
+
+        thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
+        thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
+
+        return {'id': info.attrib['id'],
+                'title': info.find('headline').text,
+                'url': video_url,
+                'ext': determine_ext(video_url),
+                'thumbnail': thumbnails[-1][1],
+                'thumbnails': thumbs_dict,
+                'description': info.find('description').text,
+                }
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -57,8 +57,8 @@ class GooglePlusIE(InfoExtractor):
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
-        DOMAIN = 'https://plus.google.com'
-        video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
+        DOMAIN = 'https://plus.google.com/'
+        video_page = self._search_regex(r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
            webpage, u'video page URL')
        if not video_page.startswith(DOMAIN):
            video_page = DOMAIN + video_page
--- a/youtube_dl/extractor/hark.py
+++ b/youtube_dl/extractor/hark.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 import re
+import json

 from .common import InfoExtractor
 from ..utils import determine_ext
@@ -12,24 +13,25 @@ class HarkIE(InfoExtractor):
        u'file': u'mmbzyhkgny.mp3',
        u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
        u'info_dict': {
-            u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ",
+            u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
+            u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
+            u'duration': 11,
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group(1)
-        embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id)
-        webpage = self._download_webpage(embed_url, video_id)
-
-        final_url = self._search_regex(r'src="(.+?).mp3"',
-                                webpage, 'video url')+'.mp3'
-        title = self._html_search_regex(r'<title>(.+?)</title>',
-                                webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace(
-                                'Sound Clip , Quote, MP3, and Ringtone - Hark','')
+        json_url = "http://www.hark.com/clips/%s.json" %(video_id)
+        info_json = self._download_webpage(json_url, video_id)
+        info = json.loads(info_json)
+        final_url = info['url']

        return {'id': video_id,
                'url' : final_url,
-                'title': title,
+                'title': info['name'],
                'ext': determine_ext(final_url),
+                'description': info['description'],
+                'thumbnail': info['image_original'],
+                'duration': info['duration'],
                }
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -0,0 +1,33 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import find_xpath_attr, compat_str
+
+
+class NBCNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
+        u'file': u'52753292.flv',
+        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
+        u'info_dict': {
+            u'title': u'Crew emerges after four-month Mars food study',
+            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
+
+        return {'id': video_id,
+                'title': info.find('headline').text,
+                'ext': 'flv',
+                'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
+                'description': compat_str(info.find('caption').text),
+                'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
+                }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -476,7 +476,7 @@ def formatSeconds(secs):
 def make_HTTPS_handler(opts):
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
-        return YoutubeDLHandlerHTTPS()
+        return compat_urllib_request.HTTPSHandler()
    else:
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
@@ -485,7 +485,7 @@ def make_HTTPS_handler(opts):
        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
-        return YoutubeDLHandlerHTTPS(context=context)
+        return compat_urllib_request.HTTPSHandler(context=context)

 class ExtractorError(Exception):
    """Error during info extraction."""
@@ -569,8 +569,7 @@ class ContentTooShortError(Exception):
        self.downloaded = downloaded
        self.expected = expected

-
-class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler
+class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
@@ -603,8 +602,8 @@ class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler
        ret.code = code
        return ret

-    def _http_request(self, req):
-        for h, v in std_headers.items():
+    def http_request(self, req):
+        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
@@ -619,7 +618,7 @@ class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler
            del req.headers['Youtubedl-user-agent']
        return req

-    def _http_response(self, req, resp):
+    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
@@ -633,16 +632,8 @@ class YoutubeDLHandler_Template:  # Old-style class, like HTTPHandler
            resp.msg = old_resp.msg
        return resp

-
-class YoutubeDLHandler(YoutubeDLHandler_Template, compat_urllib_request.HTTPHandler):
-    http_request = YoutubeDLHandler_Template._http_request
-    http_response = YoutubeDLHandler_Template._http_response
-
-
-class YoutubeDLHandlerHTTPS(YoutubeDLHandler_Template, compat_urllib_request.HTTPSHandler):
-    https_request = YoutubeDLHandler_Template._http_request
-    https_response = YoutubeDLHandler_Template._http_response
-
+    https_request = http_request
+    https_response = http_response

 def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@

-__version__ = '2013.08.27'
+__version__ = '2013.08.28'
Author	SHA1	Message	Date
Philipp Hagemeister	1619e22f40	release 2013.08.28	2013-08-27 23:31:36 +02:00
Philipp Hagemeister	88a79ce6a6	Delete default user agent (Fixes #1309)	2013-08-27 23:31:24 +02:00
Philipp Hagemeister	acebc9cd6b	Revert "Install our own HTTPS handler as well (#1309)" This reverts commit `36399e8576` and fixes #1322.	2013-08-27 23:28:20 +02:00
Jaime Marquínez Ferrándiz	443c12a703	Merge pull request #1324 from whydoubt/fix_gplus Initial slash in Google+ photos link was removed	2013-08-27 13:36:39 -07:00
Jeff Smith	7f3c4f4f65	Initial slash in Google+ photos link was removed	2013-08-27 14:38:50 -05:00
Jaime Marquínez Ferrándiz	0bc56fa66a	Add an extractor for NBC news (closes #1320)	2013-08-27 12:38:57 +02:00
Jaime Marquínez Ferrándiz	1a582dd49d	Add an extractor for CNN (closes #1318)	2013-08-27 11:56:48 +02:00
Philipp Hagemeister	e86ea47c02	[canalc2] Small improvements	2013-08-27 10:35:20 +02:00
Philipp Hagemeister	aa5a63a5b5	Merge remote-tracking branch 'Rudloff/canalc2'	2013-08-27 10:31:46 +02:00
Jaime Marquínez Ferrándiz	2a7b4da9b2	[hark] get the song info in JSON and extract more information.	2013-08-27 10:25:38 +02:00
Jaime Marquínez Ferrándiz	069d098f84	[canalplus] Accept player.canalplus.fr urls	2013-08-27 10:21:57 +02:00
Pierre Rudloff	ff2424595a	lxml is not part of the standard library.	2013-08-22 14:47:51 +02:00
Pierre Rudloff	cd0abcc0bb	Extractor for canalc2.tv	2013-08-22 13:54:23 +02:00
Pierre Rudloff	05a2926c5c	Merge remote-tracking branch 'upstream/master'	2013-08-22 12:55:58 +02:00
Pierre Rudloff	7070b83687	Merge remote-tracking branch 'upstream/master'	2013-08-22 12:54:17 +02:00
Pierre Rudloff	8d212e604a	Merge remote-tracking branch 'upstream/master' Conflicts: youtube_dl/extractor/jeuxvideo.py	2013-08-22 12:52:05 +02:00
Pierre Rudloff	943f7f7a39	Download videos from jeuxvideo.com	2013-08-18 16:11:47 +02:00