Compare commits

...

129 Commits

Author SHA1 Message Date
34ca5d9ba0 release 2014.03.11 2014-03-11 16:51:50 +01:00
60cc4dc4b4 [generic/funnyordie] Add support for funnyordie embeds (Fixes #2546) 2014-03-11 16:51:36 +01:00
db95dc13a1 [playvid] Simplify (#2539) 2014-03-10 20:55:47 +01:00
777ac90791 Merge remote-tracking branch 'MikeCol/playvid_extract' 2014-03-10 20:45:45 +01:00
04f9bebbcb Merge remote-tracking branch 'jaimeMF/remove_global_opener' 2014-03-10 20:42:54 +01:00
4ea3137e41 Playvid extractor 2014-03-10 20:16:49 +01:00
a0792b738e Don't install the global url opener
All the code uses now the urlopen method of YoutubeDL
2014-03-10 19:04:51 +01:00
19a41fc613 Don't set the global socket timeout
Use the timeout argument of the `OpenerDirector.open` method instead
2014-03-10 19:03:37 +01:00
3ee52157fb [vgtrk] Rename vesti extractor 2014-03-11 00:58:05 +07:00
c4d197ee2d [vesti] Fix _VALID_URL regex 2014-03-11 00:49:41 +07:00
a33932cfe3 [vevo] Correct test value
The date is now interpreted as UTC for consistency.
2014-03-10 17:56:54 +01:00
bcf89ce62c [generic] Suppress warning about doctypes in RSS parser 2014-03-10 17:31:32 +01:00
e3899d0e00 Merge branch 'master' of github.com:rg3/youtube-dl 2014-03-10 16:42:22 +01:00
dcb00da49c [depositfiles] Remove extractor
This site requires a CAPTCHA to download, supports arbitrary files and not only audio/video, and I can't find a single uncopyrighted video with a quick google search.
Closes #1255
2014-03-10 16:41:08 +01:00
aa51d20d19 [vesti] Skip geo restricted test 2014-03-10 22:31:22 +07:00
ae7ed92057 [youtube] Fix up invalid JSON 2014-03-10 13:35:45 +01:00
e45b31d9bd [vevo] Interpret date as UTC instead of local time 2014-03-10 13:12:57 +01:00
5a25f39653 Correct extractor documentation 2014-03-10 13:09:55 +01:00
963d7ec412 release 2014.03.10 2014-03-10 13:04:20 +01:00
e712d94adf Merge branch 'master' of github.com:rg3/youtube-dl 2014-03-10 13:03:52 +01:00
6a72423955 [generic] Use a different URL for the generic RSS test (Closes #2532) 2014-03-10 13:03:39 +01:00
4126826b10 [photobucket] More unicode literals 2014-03-10 12:59:19 +01:00
b773ead7fd [vesti] Add support for more sites (Closes #2534) 2014-03-10 18:52:00 +07:00
855e2750bc Credit @mharrys for aftonbladet 2014-03-10 10:30:17 +01:00
805ef3c60b Correct automatic resolution determination 2014-03-10 10:29:25 +01:00
fbc2dcb40b [aftonbladet] Modernize 2014-03-10 10:28:56 +01:00
5375d7ad84 Merge remote-tracking branch 'mharrys/aftonbladet' 2014-03-10 10:23:45 +01:00
90f3476180 [photobucket] Modernize and remove the old extraction code 2014-03-09 19:36:46 +01:00
ee95c09333 [pornhub] Use compat_urllib_parse.unquote_plus (#2531) 2014-03-09 19:16:25 +01:00
75d06db9fc Merge branch 'pornhub_unquote_password' of github.com:MikeCol/youtube-dl 2014-03-09 19:15:33 +01:00
439a1fffcb [myvideo] Modernize 2014-03-09 18:58:34 +01:00
9d9d70c462 [facebook] Modernize 2014-03-09 18:42:44 +01:00
b4a186b7be [jukebox] Modernize and add a test 2014-03-09 18:33:17 +01:00
bdebf51c8f [xnxx] Modernize 2014-03-09 18:31:39 +01:00
264b86f9b4 Unquote password 2014-03-09 18:26:18 +01:00
9e55e37a2e Merge remote-tracking branch 'origin/master' 2014-03-09 18:08:16 +01:00
1471956573 Add a basic test suite for the InfoExtractor class 2014-03-09 17:05:29 +01:00
27865b2169 [aftonbladet] add extractor for aftonbladet.se 2014-03-09 16:59:18 +01:00
6d07ce0162 YoutubeDL: If the logger is set call its warning method in report_warning 2014-03-09 15:16:54 +01:00
edb7fc5435 [videodetective] Modernize 2014-03-09 18:39:39 +07:00
31f77343f2 [vube] Update the test's checksum 2014-03-09 12:27:38 +01:00
63ad031583 [soundcloud] Add the description field to the second test 2014-03-09 12:26:58 +01:00
957688cee6 [ustream:channel] Update test's number of entries 2014-03-09 12:03:49 +01:00
806d6c2e8c [gamekings] Modernize and update the test's description field 2014-03-09 11:57:30 +01:00
0ef68e04d9 [mtv] Transform the urls from the mobile version to get the best quality
And don't report a warning, just log a message, it allows to pass the test from Europe.
2014-03-08 22:09:42 +01:00
a496524db2 [collegehumor] Replace youtube test 2014-03-09 03:21:26 +07:00
935c7360cc [spike] Add support for mobile urls 2014-03-08 21:10:21 +01:00
340b046876 [spike] Add support for downloading the mobile version if the normal version is geoblocked 2014-03-08 20:59:11 +01:00
cc1db7f9b7 [mtv] Improve detection of geoblocked videos 2014-03-08 19:46:34 +01:00
a4ff6c4762 [arte] Raise a proper error when no video is found 2014-03-08 16:04:03 +01:00
1060425cbb [vimeo] Add a better error message for embed-only videos (#2527) 2014-03-08 12:25:09 +01:00
e9c092f125 YoutubeDL: Use its urlopen method for downloading the thumbnail. 2014-03-07 16:43:34 +01:00
22ff5d2105 [http] Use the YoutubeDL.urlopen method 2014-03-07 16:41:42 +01:00
136db7881b [lynda] Modernize 2014-03-07 22:11:01 +07:00
dae313e725 release 2014.03.07.1 2014-03-07 15:59:10 +01:00
b74fa8cd2c [facebook] Fix login process
It was broken and didn't work in python 3.
And use `_download_webpage` instead of `compat_urllib_request.urlopen`.
2014-03-07 15:25:33 +01:00
94eae04c94 release 2014.03.07 2014-03-07 06:41:48 +01:00
16ff7ebc77 [lynda] Fix successful login regex and fix formats extraction (Closes #2520) 2014-03-07 06:56:48 +07:00
c361c505b0 release 2014.03.06 2014-03-06 23:57:00 +01:00
d37c07c575 [vesti] Fix extraction and support more link formats (Closes #2517) 2014-03-07 02:27:39 +07:00
9d6105c9f0 Do not resume live streams
No resuming or seeking in live streams is possible (c) man rtmpdump
2014-03-05 22:46:20 +07:00
8dec03ecba Use unicode literals 2014-03-05 22:24:07 +07:00
826547870b Report no connect as error 2014-03-05 22:21:19 +07:00
52d6a9a61d Handle rtmpdump's no connection return value 2014-03-05 22:19:27 +07:00
ad242b5fbc Remove superfluous whitespace 2014-03-05 22:16:50 +07:00
3524175625 Use meaningful return value constants for rtmpdump 2014-03-05 22:12:02 +07:00
7b9965ea93 [ted] Remove unused import and modernize test 2014-03-05 14:27:45 +01:00
0a5bce566f [generic] Add all test attributes for embedly (#2447)
In the future, we may want to not only print something, but throw an error for untested properties.
2014-03-05 14:05:50 +01:00
8012bd2424 [generic] Get a better ID 2014-03-05 14:02:14 +01:00
f55a1f0a88 Merge remote-tracking branch 'rzhxeo/embedly'
Conflicts:
	youtube_dl/extractor/generic.py
2014-03-05 14:01:53 +01:00
bacac173a9 [ted] Style fixes 2014-03-05 13:27:26 +01:00
ca1fee34f2 [ted] Fix playlist extraction and add a test 2014-03-05 13:22:10 +01:00
6dadaa9930 [prosiebensat1] Replace test 2014-03-05 15:10:49 +07:00
553f6e4633 [dailymotion] Convert width and height fields from strings to integers 2014-03-04 22:24:38 +01:00
652bee05f0 [ted] Fix video extraction
The site has been redesigned
2014-03-04 21:47:01 +01:00
d63516e9cd release 2014.03.04.2 2014-03-04 20:56:31 +01:00
e477dcf649 [vesti] Fix width and height 2014-03-04 21:40:35 +07:00
9d3f7781f3 [soundcloud:set] Fix _VALID_URL regex (Closes #2509) 2014-03-04 21:29:14 +07:00
c7095dada3 [tvigle] Add support for another video link format 2014-03-04 19:22:48 +07:00
607dbbad76 [xtube] Fix extraction add more metafields 2014-03-04 16:12:11 +07:00
17b75c0de1 Document width, height, and resolution (#1445) 2014-03-04 03:49:33 +01:00
ab24f4f3be [facebook] Use consistent quotes 2014-03-04 03:49:12 +01:00
e1a52d9e10 release 2014.03.04.1 2014-03-04 03:40:00 +01:00
d0ff838433 [facebook] Correct regexp 2014-03-04 03:39:45 +01:00
b37b94501c [facebook] Fix login detection (#2505) 2014-03-04 03:39:04 +01:00
cb3bb2cfef [facebook] Modernize 2014-03-04 03:36:54 +01:00
e2cc7983e9 release 2014.03.04 2014-03-04 03:32:54 +01:00
c9ae7b9565 [youtube] Add support for search result URLs (Fixes #2495) 2014-03-04 03:32:28 +01:00
86fb4347f7 release 2014.03.03 2014-03-03 13:51:25 +01:00
2fcec131f5 Credit @juancri for canal13cl (#2498) 2014-03-03 12:54:01 +01:00
9f62eaf4ef [canal13cl] Add test and improve extraction (#2498) 2014-03-03 12:53:11 +01:00
f92259c026 Merge remote-tracking branch 'origin/master' 2014-03-03 12:34:34 +01:00
0afef30b23 Add display_id field 2014-03-03 12:06:28 +01:00
dcdfd1c711 Merge remote-tracking branch 'origin/master' 2014-03-03 12:05:59 +01:00
2acc1f8f50 [orf] Fix segments extraction (Closes #2501) 2014-03-03 18:05:46 +07:00
2c39b0c695 [tinypic] Fix import 2014-03-03 17:40:12 +07:00
e77c5b4f63 [4tube] Fix import 2014-03-03 17:39:49 +07:00
409a16cb72 Allowing URLs for 13.cl without the /programas prefix 2014-03-02 23:41:13 -03:00
94d5e90b4f FIX: Typo in the extractor's name 2014-03-02 23:40:35 -03:00
2d73b45805 Adding support for 13.cl 2014-03-02 23:15:12 -03:00
271a2dbfa2 [tvigle] Add age limit 2014-03-02 22:07:18 +07:00
bf4adcac66 [tvigle] Fix like count 2014-03-02 20:56:36 +07:00
fb8b8fdd62 [tvigle] Add support for tvigle.ru 2014-03-02 19:59:34 +07:00
5a0b26252e [ceskatelevize] Simplify 2014-03-01 23:05:33 +07:00
7d78f0cc48 [ceskatelevize] Fix video availability check and add geo unrestricted test 2014-03-01 22:54:37 +07:00
f00fc78674 Merge branch '_ceskatelevize' of https://github.com/pulpe/youtube-dl into pulpe-_ceskatelevize 2014-03-01 22:26:18 +07:00
392017874c [CeskaTelevize] raise ExtractorError if you are outside of CR 2014-03-01 16:17:29 +01:00
c3cb92d1ab [CeskaTelevize] fix python3 support @dstftw 2014-03-01 16:02:51 +01:00
aa5590fa07 skip test 2014-03-01 12:34:01 +01:00
8cfb5bbf92 [CeskaTelevize] Add initial support for ceskatelevize.cz 2014-03-01 11:47:52 +01:00
69bb54ebf9 [mailru] Add support for mail.ru video 2014-03-01 16:34:38 +07:00
ca97a56e4b [vk] Add support for embedded videos (Closes #2473) 2014-02-28 23:51:54 +07:00
fc26f3b4c2 [lifenews] Add support for multiple videos on the same page (#2482) 2014-02-28 22:52:06 +07:00
f604c93c64 [gdcvault] Formatting / Remove unused variables 2014-02-28 15:50:19 +01:00
dc3727b65c Credit @mnem dor GDCVault 2014-02-28 15:14:25 +01:00
aba3231de1 Merge remote-tracking branch 'mnem/gdc-vault' 2014-02-28 12:52:11 +01:00
9193bab91d release 2014.02.28 2014-02-28 12:31:37 +01:00
fbcf3e416d Merge pull request #2463 from rzhxeo/resume
Set resume_len to 0 if download is restarted
2014-02-28 12:30:34 +01:00
c0e5d85631 [vimeo] Improve thumbnail extraction 2014-02-28 18:00:12 +07:00
ca7fa3dcb3 [vimeo] Fix thumbs extraction (Closes #2480) 2014-02-28 17:43:54 +07:00
4ccfba28d9 [collegehumor] Fix test's uploader field 2014-02-27 19:10:30 +01:00
abb82f1ddc [mixcloud] Unquote the track id (#2462) 2014-02-27 18:58:09 +01:00
546582ec3e Removing MD5 check for ethereal file. 2014-02-27 14:28:55 +00:00
4534485586 Fix test, remove unused, tidy quotes and brackets 2014-02-27 12:50:48 +00:00
50a138d95c Add support for authenticated videos 2014-02-27 10:32:31 +00:00
1b86cc41cf Add support for embed.ly 2014-02-27 08:14:28 +01:00
83cebb8b7a Add support for FLV videos with speaker decks 2014-02-27 00:20:34 +00:00
9e68f9fdf1 Extractor for non-password protected GDC Vault videos 2014-02-26 22:33:33 +00:00
60daf7f0bb Set resume_len to 0 if download is restarted 2014-02-26 02:47:27 +01:00
53 changed files with 1485 additions and 517 deletions

View File

@ -124,8 +124,12 @@ which means you can modify it, redistribute it or use it however you like.
video id, %(playlist)s for the playlist the
video is in, %(playlist_index)s for the
position in the playlist and %% for a
literal percent. Use - to output to stdout.
Can also be used to download to a different
literal percent. %(height)s and %(width)s
for the width and height of the video
format. %(resolution)s for a textual
description of the resolution of the video
format. Use - to output to stdout. Can also
be used to download to a different
directory, for example with -o '/my/downloa
ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in

View File

@ -0,0 +1,44 @@
#!/usr/bin/env python
from __future__ import unicode_literals
# Allow direct execution
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import YoutubeIE, get_info_extractor
class TestIE(InfoExtractor):
pass
class TestInfoExtractor(unittest.TestCase):
def setUp(self):
self.ie = TestIE(FakeYDL())
def test_ie_key(self):
self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE)
def test_html_search_regex(self):
html = '<p id="foo">Watch this <a href="http://www.youtube.com/watch?v=BaW_jenozKc">video</a></p>'
search = lambda re, *args: self.ie._html_search_regex(re, html, *args)
self.assertEqual(search(r'<p id="foo">(.+?)</p>', 'foo'), 'Watch this video')
def test_opengraph(self):
ie = self.ie
html = '''
<meta name="og:title" content='Foo'/>
<meta content="Some video's description " name="og:description"/>
<meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/>
'''
self.assertEqual(ie._og_search_title(html), 'Foo')
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
if __name__ == '__main__':
unittest.main()

View File

@ -71,6 +71,10 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_truncated(self):
self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))

View File

@ -36,6 +36,7 @@ from youtube_dl.extractor import (
RutubeChannelIE,
GoogleSearchIE,
GenericIE,
TEDIE,
)
@ -98,7 +99,7 @@ class TestPlaylists(unittest.TestCase):
result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], '5124905')
self.assertTrue(len(result['entries']) >= 11)
self.assertTrue(len(result['entries']) >= 6)
def test_soundcloud_set(self):
dl = FakeYDL()
@ -253,11 +254,20 @@ class TestPlaylists(unittest.TestCase):
def test_generic_rss_feed(self):
dl = FakeYDL()
ie = GenericIE(dl)
result = ie.extract('http://www.escapistmagazine.com/rss/videos/list/1.xml')
result = ie.extract('http://phihag.de/2014/youtube-dl/rss.xml')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'http://www.escapistmagazine.com/rss/videos/list/1.xml')
self.assertEqual(result['id'], 'http://phihag.de/2014/youtube-dl/rss.xml')
self.assertEqual(result['title'], 'Zero Punctuation')
self.assertTrue(len(result['entries']) > 10)
def test_ted_playlist(self):
dl = FakeYDL()
ie = TEDIE(dl)
result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], '10')
self.assertEqual(result['title'], 'Who are the hackers?')
self.assertTrue(len(result['entries']) >= 6)
if __name__ == '__main__':
unittest.main()

View File

@ -33,6 +33,7 @@ from youtube_dl.utils import (
unified_strdate,
unsmuggle_url,
url_basename,
urlencode_postdata,
xpath_with_ns,
)
@ -261,5 +262,9 @@ class TestUtil(unittest.TestCase):
bam''')
self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
def test_urlencode_postdata(self):
data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
self.assertTrue(isinstance(data, bytes))
if __name__ == '__main__':
unittest.main()

View File

@ -16,6 +16,7 @@ from youtube_dl.extractor import (
YoutubeChannelIE,
YoutubeShowIE,
YoutubeTopListIE,
YoutubeSearchURLIE,
)
@ -133,5 +134,14 @@ class TestYoutubeLists(unittest.TestCase):
entries = result['entries']
self.assertTrue(len(entries) >= 5)
def test_youtube_search_url(self):
dl = FakeYDL()
ie = YoutubeSearchURLIE(dl)
result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
entries = result['entries']
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'youtube-dl test video')
self.assertTrue(len(entries) >= 5)
if __name__ == '__main__':
unittest.main()

View File

@ -370,12 +370,15 @@ class YoutubeDL(object):
Print the message to stderr, it will be prefixed with 'WARNING:'
If stderr is a tty file the 'WARNING:' will be colored
'''
if self._err_file.isatty() and os.name != 'nt':
_msg_header = '\033[0;33mWARNING:\033[0m'
if self.params.get('logger') is not None:
self.params['logger'].warning(message)
else:
_msg_header = 'WARNING:'
warning_message = '%s %s' % (_msg_header, message)
self.to_stderr(warning_message)
if self._err_file.isatty() and os.name != 'nt':
_msg_header = '\033[0;33mWARNING:\033[0m'
else:
_msg_header = 'WARNING:'
warning_message = '%s %s' % (_msg_header, message)
self.to_stderr(warning_message)
def report_error(self, message, tb=None):
'''
@ -409,6 +412,13 @@ class YoutubeDL(object):
template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict.get('playlist_index') is not None:
template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
elif template_dict.get('height'):
template_dict['resolution'] = '%sp' % template_dict['height']
elif template_dict.get('width'):
template_dict['resolution'] = '?x%d' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
@ -675,6 +685,9 @@ class YoutubeDL(object):
info_dict['playlist'] = None
info_dict['playlist_index'] = None
if 'display_id' not in info_dict and 'id' in info_dict:
info_dict['display_id'] = info_dict['id']
# This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']:
if download:
@ -688,8 +701,11 @@ class YoutubeDL(object):
else:
formats = info_dict['formats']
if not formats:
raise ExtractorError('No video formats found!')
# We check that all the formats have the format and format_id fields
for (i, format) in enumerate(formats):
for i, format in enumerate(formats):
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
if format.get('format') is None:
@ -908,7 +924,7 @@ class YoutubeDL(object):
self.to_screen('[%s] %s: Downloading thumbnail ...' %
(info_dict['extractor'], info_dict['id']))
try:
uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
uf = self.urlopen(info_dict['thumbnail'])
with open(thumb_filename, 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
self.to_screen('[%s] %s: Writing thumbnail to: %s' %
@ -1154,7 +1170,7 @@ class YoutubeDL(object):
def urlopen(self, req):
""" Start an HTTP download """
return self._opener.open(req)
return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
if not self.params.get('verbose'):
@ -1185,7 +1201,7 @@ class YoutubeDL(object):
def _setup_opener(self):
timeout_val = self.params.get('socket_timeout')
timeout = 600 if timeout_val is None else float(timeout_val)
self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
opts_cookiefile = self.params.get('cookiefile')
opts_proxy = self.params.get('proxy')
@ -1223,7 +1239,3 @@ class YoutubeDL(object):
# (See https://github.com/rg3/youtube-dl/issues/1309 for details)
opener.addheaders = []
self._opener = opener
# TODO remove this global modification
compat_urllib_request.install_opener(opener)
socket.setdefaulttimeout(timeout)

View File

@ -48,6 +48,9 @@ __authors__ = (
'Niklas Laxström',
'David Triendl',
'Anthony Weems',
'David Wagner',
'Juan C. Olivares',
'Mattias Harrysson',
)
__license__ = 'Public Domain'
@ -428,6 +431,8 @@ def parseOpts(overrideArguments=None):
'%(extractor)s for the provider (youtube, metacafe, etc), '
'%(id)s for the video id, %(playlist)s for the playlist the video is in, '
'%(playlist_index)s for the position in the playlist and %% for a literal percent. '
'%(height)s and %(width)s for the width and height of the video format. '
'%(resolution)s for a textual description of the resolution of the video format. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option('--autonumber-size',

View File

@ -49,7 +49,7 @@ class HttpFD(FileDownloader):
while count <= retries:
# Establish connection
try:
data = compat_urllib_request.urlopen(request)
data = self.ydl.urlopen(request)
break
except (compat_urllib_error.HTTPError, ) as err:
if (err.code < 500 or err.code >= 600) and err.code != 416:
@ -59,7 +59,7 @@ class HttpFD(FileDownloader):
# Unable to resume (requested range not satisfiable)
try:
# Open the connection again without the range header
data = compat_urllib_request.urlopen(basic_request)
data = self.ydl.urlopen(basic_request)
content_length = data.info()['Content-Length']
except (compat_urllib_error.HTTPError, ) as err:
if err.code < 500 or err.code >= 600:
@ -85,6 +85,7 @@ class HttpFD(FileDownloader):
else:
# The length does not match, we start the download over
self.report_unable_to_resume()
resume_len = 0
open_mode = 'wb'
break
# Retry

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import os
import re
import subprocess
@ -22,7 +24,7 @@ class RtmpFD(FileDownloader):
proc_stderr_closed = False
while not proc_stderr_closed:
# read line from stderr
line = u''
line = ''
while True:
char = proc.stderr.read(1)
if not char:
@ -46,7 +48,7 @@ class RtmpFD(FileDownloader):
data_len = None
if percent > 0:
data_len = int(downloaded_data_len * 100 / percent)
data_len_str = u'~' + format_bytes(data_len)
data_len_str = '~' + format_bytes(data_len)
self.report_progress(percent, data_len_str, speed, eta)
cursor_in_new_line = False
self._hook_progress({
@ -76,12 +78,12 @@ class RtmpFD(FileDownloader):
})
elif self.params.get('verbose', False):
if not cursor_in_new_line:
self.to_screen(u'')
self.to_screen('')
cursor_in_new_line = True
self.to_screen(u'[rtmpdump] '+line)
self.to_screen('[rtmpdump] '+line)
proc.wait()
if not cursor_in_new_line:
self.to_screen(u'')
self.to_screen('')
return proc.returncode
url = info_dict['url']
@ -102,7 +104,7 @@ class RtmpFD(FileDownloader):
try:
subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
except (OSError, IOError):
self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
self.report_error('RTMP download detected but "rtmpdump" could not be run')
return False
# Download using rtmpdump. rtmpdump returns exit code 2 when
@ -127,7 +129,7 @@ class RtmpFD(FileDownloader):
basic_args += ['--live']
if conn:
basic_args += ['--conn', conn]
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)]
if sys.platform == 'win32' and sys.version_info < (3, 0):
# Windows subprocess module does not actually support Unicode
@ -150,26 +152,35 @@ class RtmpFD(FileDownloader):
shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
except ImportError:
shell_quote = repr
self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args))
RD_SUCCESS = 0
RD_FAILED = 1
RD_INCOMPLETE = 2
RD_NO_CONNECT = 3
retval = run_rtmpdump(args)
while (retval == 2 or retval == 1) and not test:
if retval == RD_NO_CONNECT:
self.report_error('[rtmpdump] Could not connect to RTMP server.')
return False
while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
self.to_screen('[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
if prevsize == cursize and retval == RD_FAILED:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
if prevsize == cursize and retval == 2 and cursize > 1024:
self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = 0
if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = RD_SUCCESS
break
if retval == 0 or (test and retval == 2):
if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE):
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen(u'[rtmpdump] %s bytes' % fsize)
self.to_screen('[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@ -179,6 +190,6 @@ class RtmpFD(FileDownloader):
})
return True
else:
self.to_stderr(u"\n")
self.report_error(u'rtmpdump exited with code %d' % retval)
self.to_stderr('\n')
self.report_error('rtmpdump exited with code %d' % retval)
return False

View File

@ -1,5 +1,6 @@
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
from .aftonbladet import AftonbladetIE
from .anitube import AnitubeIE
from .aparat import AparatIE
from .appletrailers import AppleTrailersIE
@ -23,9 +24,11 @@ from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canal13cl import Canal13clIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
from .cinemassacre import CinemassacreIE
@ -50,7 +53,6 @@ from .dailymotion import (
DailymotionUserIE,
)
from .daum import DaumIE
from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
from .defense import DefenseGouvFrIE
@ -89,6 +91,7 @@ from .funnyordie import FunnyOrDieIE
from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
@ -133,6 +136,7 @@ from .lynda import (
)
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
from .mailru import MailRuIE
from .malemotion import MalemotionIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
@ -171,6 +175,7 @@ from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .playvid import PlayvidIE
from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
@ -235,14 +240,15 @@ from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
from .tvigle import TvigleIE
from .tvp import TvpIE
from .unistra import UnistraIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
from .vgtrk import VGTRKIE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
@ -280,19 +286,20 @@ from .youku import YoukuIE
from .youporn import YouPornIE
from .youtube import (
YoutubeIE,
YoutubePlaylistIE,
YoutubeSearchIE,
YoutubeSearchDateIE,
YoutubeUserIE,
YoutubeChannelIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeRecommendedIE,
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubePlaylistIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeTopListIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeWatchLaterIE,
)
from .zdf import ZDFIE

View File

@ -0,0 +1,69 @@
# encoding: utf-8
from __future__ import unicode_literals
import datetime
import re
from .common import InfoExtractor
class AftonbladetIE(InfoExtractor):
_VALID_URL = r'^http://tv\.aftonbladet\.se/webbtv.+?(?P<video_id>article[0-9]+)\.ab(?:$|[?#])'
_TEST = {
'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
'info_dict': {
'id': 'article36015',
'ext': 'mp4',
'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
'description': 'Jupiters måne mest aktiv av alla himlakroppar',
'upload_date': '20140306',
},
}
def _real_extract(self, url):
mobj = re.search(self._VALID_URL, url)
video_id = mobj.group('video_id')
webpage = self._download_webpage(url, video_id)
# find internal video meta data
META_URL = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
internal_meta_id = self._html_search_regex(
r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
internal_meta_url = META_URL % internal_meta_id
internal_meta_json = self._download_json(
internal_meta_url, video_id, 'Downloading video meta data')
# find internal video formats
FORMATS_URL = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s'
internal_video_id = internal_meta_json['videoId']
internal_formats_url = FORMATS_URL % internal_video_id
internal_formats_json = self._download_json(
internal_formats_url, video_id, 'Downloading video formats')
formats = []
for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']:
p = fmt['paths'][0]
formats.append({
'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']),
'ext': 'mp4',
'width': fmt['width'],
'height': fmt['height'],
'tbr': fmt['bitrate'],
'protocol': 'http',
})
self._sort_formats(formats)
timestamp = datetime.datetime.fromtimestamp(internal_meta_json['timePublished'])
upload_date = timestamp.strftime('%Y%m%d')
return {
'id': video_id,
'title': internal_meta_json['title'],
'formats': formats,
'thumbnail': internal_meta_json['imageUrl'],
'description': internal_meta_json['shortPreamble'],
'upload_date': upload_date,
'duration': internal_meta_json['duration'],
'view_count': internal_meta_json['views'],
}

View File

@ -72,18 +72,22 @@ class ArteTvIE(InfoExtractor):
return self._extract_liveweb(url, name, lang)
if re.search(self._LIVE_URL, url) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
raise ExtractorError('Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
raise ExtractorError('No video found')
def _extract_video(self, url, video_id, lang):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
ref_xml_doc = self._download_xml(
ref_xml_url, video_id, note='Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
config_xml = self._download_webpage(
config_xml_url, video_id, note='Downloading configuration')
video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
def _key(m):

View File

@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class Canal13clIE(InfoExtractor):
_VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
_TEST = {
'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
'md5': '4cb1fa38adcad8fea88487a078831755',
'info_dict': {
'id': '1403022125',
'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
'ext': 'mp4',
'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda',
'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
webpage = self._download_webpage(url, display_id)
title = self._html_search_meta(
'twitter:title', webpage, 'title', fatal=True)
description = self._html_search_meta(
'twitter:description', webpage, 'description')
url = self._html_search_regex(
r'articuloVideo = \"(.*?)\"', webpage, 'url')
real_id = self._search_regex(
r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id)
thumbnail = self._html_search_regex(
r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail')
return {
'id': real_id,
'display_id': display_id,
'url': url,
'title': title,
'description': description,
'ext': 'mp4',
'thumbnail': thumbnail,
}

View File

@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
compat_urllib_parse,
compat_urllib_parse_urlparse,
ExtractorError,
)
class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
_TESTS = [
{
'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
'info_dict': {
'id': '213512120230004',
'ext': 'flv',
'title': 'První republika: Španělská chřipka',
'duration': 3107.4,
},
'params': {
'skip_download': True, # requires rtmpdump
},
'skip': 'Works only from Czech Republic.',
},
{
'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
'info_dict': {
'id': '20138143440',
'ext': 'flv',
'title': 'Tsatsiki, maminka a policajt',
'duration': 6754.1,
},
'params': {
'skip_download': True, # requires rtmpdump
},
'skip': 'Works only from Czech Republic.',
},
{
'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
'info_dict': {
'id': '14716',
'ext': 'flv',
'title': 'První republika: Zpěvačka z Dupárny Bobina',
'duration': 90,
},
'params': {
'skip_download': True, # requires rtmpdump
},
},
]
def _real_extract(self, url):
url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
data = {
'playlist[0][type]': typ,
'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestSource': 'iVysilani',
}
req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
data=compat_urllib_parse.urlencode(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
req.add_header('x-addr', '127.0.0.1')
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', url)
playlistpage = self._download_json(req, video_id)
req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
req.add_header('Referer', url)
playlist = self._download_xml(req, video_id)
formats = []
for i in playlist.find('smilRoot/body'):
if 'AD' not in i.attrib['id']:
base_url = i.attrib['base']
parsedurl = compat_urllib_parse_urlparse(base_url)
duration = i.attrib['duration']
for video in i.findall('video'):
if video.attrib['label'] != 'AD':
format_id = video.attrib['label']
play_path = video.attrib['src']
vbr = int(video.attrib['system-bitrate'])
formats.append({
'format_id': format_id,
'url': base_url,
'vbr': vbr,
'play_path': play_path,
'app': parsedurl.path[1:] + '?' + parsedurl.query,
'rtmp_live': True,
'ext': 'flv',
})
self._sort_formats(formats)
return {
'id': episode_id,
'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
'duration': float(duration),
'formats': formats,
}

View File

@ -35,15 +35,15 @@ class CollegeHumorIE(InfoExtractor):
},
# embedded youtube video
{
'url': 'http://www.collegehumor.com/embed/6950457',
'url': 'http://www.collegehumor.com/embed/6950306',
'info_dict': {
'id': 'W5gMp3ZjYg4',
'id': 'Z-bao9fg6Yc',
'ext': 'mp4',
'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
'uploader': 'Funnyplox TV',
'uploader_id': 'funnyploxtv',
'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
'upload_date': '20140128',
'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
'uploader': 'Mark Dice',
'uploader_id': 'MarkDice',
'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
'upload_date': '20140127',
},
'params': {
'skip_download': True,

View File

@ -88,6 +88,10 @@ class InfoExtractor(object):
The following fields are optional:
display_id An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats",
and display_id "dancing-naked-mole-rats"
thumbnails: A list of dictionaries (with the entries "resolution" and
"url") for the varying thumbnails
thumbnail: Full URL to a video thumbnail image.
@ -114,9 +118,6 @@ class InfoExtractor(object):
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
_real_extract() must return a *list* of information dictionaries as
described above.
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
"""
@ -432,14 +433,14 @@ class InfoExtractor(object):
if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
def _html_search_meta(self, name, html, display_name=None):
def _html_search_meta(self, name, html, display_name=None, fatal=False):
if display_name is None:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
(?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
html, display_name, fatal=False)
html, display_name, fatal=fatal)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')

View File

@ -12,6 +12,7 @@ from ..utils import (
get_element_by_id,
orderedSet,
str_to_int,
int_or_none,
ExtractorError,
)
@ -124,7 +125,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
if video_url is not None:
m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
if m_size is not None:
width, height = m_size.group(1), m_size.group(2)
width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
else:
width, height = None, None
formats.append({

View File

@ -1,60 +0,0 @@
import re
import os
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
)
class DepositFilesIE(InfoExtractor):
"""Information extractor for depositfiles.com"""
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
def _real_extract(self, url):
file_id = url.split('/')[-1]
# Rebuild url in english locale
url = 'http://depositfiles.com/en/files/' + file_id
# Retrieve file webpage with 'Free download' button pressed
free_download_indication = {'gateway_result' : '1'}
request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
try:
self.report_download_webpage(file_id)
webpage = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
# Search for the real file URL
mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
if (mobj is None) or (mobj.group(1) is None):
# Try to figure out reason of the error.
mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
if (mobj is not None) and (mobj.group(1) is not None):
restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
raise ExtractorError(u'%s' % restriction_message)
else:
raise ExtractorError(u'Unable to extract download URL from: %s' % url)
file_url = mobj.group(1)
file_extension = os.path.splitext(file_url)[1][1:]
# Search for file title
file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
return [{
'id': file_id.decode('utf-8'),
'url': file_url.decode('utf-8'),
'uploader': None,
'upload_date': None,
'title': file_title,
'ext': file_extension.decode('utf-8'),
}]

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import json
import re
import socket
@ -9,16 +11,15 @@ from ..utils import (
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
urlencode_postdata,
ExtractorError,
)
class FacebookIE(InfoExtractor):
"""Information Extractor for Facebook"""
_VALID_URL = r'''(?x)
(?:https?://)?(?:\w+\.)?facebook\.com/
https?://(?:\w+\.)?facebook\.com/
(?:[^#?]*\#!/)?
(?:video/video\.php|photo\.php|video/embed)\?(?:.*?)
(?:v|video_id)=(?P<id>[0-9]+)
@ -26,21 +27,18 @@ class FacebookIE(InfoExtractor):
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
IE_NAME = u'facebook'
IE_NAME = 'facebook'
_TEST = {
u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
u'file': u'120708114770723.mp4',
u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
u'info_dict': {
u"duration": 279,
u"title": u"PEOPLE ARE AWESOME 2013"
'url': 'https://www.facebook.com/photo.php?v=120708114770723',
'md5': '48975a41ccc4b7a581abd68651c1a5a8',
'info_dict': {
'id': '120708114770723',
'ext': 'mp4',
'duration': 279,
'title': 'PEOPLE ARE AWESOME 2013',
}
}
def report_login(self):
"""Report attempt to log in."""
self.to_screen(u'Logging in')
def _login(self):
(useremail, password) = self._get_login_info()
if useremail is None:
@ -48,11 +46,13 @@ class FacebookIE(InfoExtractor):
login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
login_page_req.add_header('Cookie', 'locale=en_US')
self.report_login()
login_page = self._download_webpage(login_page_req, None, note=False,
errnote=u'Unable to download login page')
lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
login_page = self._download_webpage(login_page_req, None,
note='Downloading login page',
errnote='Unable to download login page')
lsd = self._search_regex(
r'<input type="hidden" name="lsd" value="([^"]*)"',
login_page, 'lsd')
lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
login_form = {
'email': useremail,
@ -65,27 +65,29 @@ class FacebookIE(InfoExtractor):
'timezone': '-60',
'trynum': '1',
}
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
try:
login_results = compat_urllib_request.urlopen(request).read()
login_results = self._download_webpage(request, None,
note='Logging in', errnote='unable to fetch login page')
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
check_form = {
'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'),
'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, 'nh'),
'name_action_selected': 'dont_save',
'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
'submit[Continue]': self._search_regex(r'<button[^>]+value="(.*?)"[^>]+name="submit\[Continue\]"', login_results, 'continue'),
}
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
check_response = compat_urllib_request.urlopen(check_req).read()
check_response = self._download_webpage(check_req, None,
note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
self._downloader.report_warning('unable to log in: %s' % compat_str(err))
return
def _real_initialize(self):
@ -93,8 +95,6 @@ class FacebookIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
@ -107,10 +107,10 @@ class FacebookIE(InfoExtractor):
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
'The video is not available, Facebook said: "%s"' % m_msg.group(1),
expected=True)
else:
raise ExtractorError(u'Cannot parse data')
raise ExtractorError('Cannot parse data')
data = dict(json.loads(m.group(1)))
params_raw = compat_urllib_parse.unquote(data['params'])
params = json.loads(params_raw)
@ -119,19 +119,15 @@ class FacebookIE(InfoExtractor):
if not video_url:
video_url = video_data['sd_src']
if not video_url:
raise ExtractorError(u'Cannot find video URL')
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
raise ExtractorError('Cannot find video URL')
video_title = self._html_search_regex(
r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
info = {
return {
'id': video_id,
'title': video_title,
'url': video_url,
'ext': 'mp4',
'duration': video_duration,
'thumbnail': thumbnail,
'duration': int(video_data['video_duration']),
'thumbnail': video_data['thumbnail_src'],
}
return [info]

View File

@ -8,8 +8,8 @@ from ..utils import (
unified_strdate,
str_to_int,
parse_duration,
clean_html,
)
from youtube_dl.utils import clean_html
class FourTubeIE(InfoExtractor):

View File

@ -1,12 +1,13 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
class FunnyOrDieIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
_VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
_TEST = {
'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
'file': '0732f586d7.mp4',
@ -30,10 +31,20 @@ class FunnyOrDieIE(InfoExtractor):
[r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''],
webpage, 'video URL', flags=re.DOTALL)
if mobj.group('type') == 'embed':
post_json = self._search_regex(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
post = json.loads(post_json)['attachment']
title = post['name']
description = post.get('description')
else:
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
return {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'title': title,
'description': description,
}

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -6,13 +8,14 @@ from .common import InfoExtractor
class GamekingsIE(InfoExtractor):
_VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
_TEST = {
u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
u'file': u'20130811.mp4',
'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/',
# MD5 is flaky, seems to change regularly
#u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
# 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3',
u'info_dict': {
u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
'id': '20130811',
'ext': 'mp4',
'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review',
'description': 'md5:632e61a9f97d700e83f43d77ddafb6a4',
}
}

View File

@ -0,0 +1,134 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
)
class GDCVaultIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
_TESTS = [
{
'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
'md5': '7ce8388f544c88b7ac11c7ab1b593704',
'info_dict': {
'id': '1019721',
'ext': 'mp4',
'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
}
},
{
'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
'info_dict': {
'id': '1015683',
'ext': 'flv',
'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
},
'params': {
'skip_download': True, # Requires rtmpdump
}
},
]
def _parse_mp4(self, xml_description):
video_formats = []
mp4_video = xml_description.find('./metadata/mp4video')
if mp4_video is None:
return None
mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
video_root = mobj.group('root')
formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
for format in formats:
mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
url = video_root + mobj.group('path')
vbr = format.find('bitrate').text
video_formats.append({
'url': url,
'vbr': int(vbr),
})
return video_formats
def _parse_flv(self, xml_description):
video_formats = []
akami_url = xml_description.find('./metadata/akamaiHost').text
slide_video_path = xml_description.find('./metadata/slideVideo').text
video_formats.append({
'url': 'rtmp://' + akami_url + '/' + slide_video_path,
'format_note': 'slide deck video',
'quality': -2,
'preference': -2,
'format_id': 'slides',
})
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
video_formats.append({
'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
'format_note': 'speaker video',
'quality': -1,
'preference': -1,
'format_id': 'speaker',
})
return video_formats
def _login(self, webpage_url, video_id):
(username, password) = self._get_login_info()
if username is None or password is None:
self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
return None
mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
login_url = mobj.group('root_url') + 'api/login.php'
logout_url = mobj.group('root_url') + 'logout'
login_form = {
'email': username,
'password': password,
}
request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self._download_webpage(request, video_id, 'Logging in')
start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
self._download_webpage(logout_url, video_id, 'Logging out')
return start_page
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage_url = 'http://www.gdcvault.com/play/' + video_id
start_page = self._download_webpage(webpage_url, video_id)
xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root', None, False)
if xml_root is None:
# Probably need to authenticate
start_page = self._login(webpage_url, video_id)
if start_page is None:
self.report_warning('Could not login.')
else:
# Grab the url from the authenticated page
xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False)
if xml_name is None:
# Fallback to the older format
xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
xml_decription_url = xml_root + 'xml/' + xml_name
xml_description = self._download_xml(xml_decription_url, video_id)
video_title = xml_description.find('./metadata/title').text
video_formats = self._parse_mp4(xml_description)
if video_formats is None:
video_formats = self._parse_flv(xml_description)
return {
'id': video_id,
'title': video_title,
'formats': video_formats,
}

View File

@ -4,7 +4,6 @@ from __future__ import unicode_literals
import os
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from .youtube import YoutubeIE
@ -17,6 +16,7 @@ from ..utils import (
ExtractorError,
HEADRequest,
parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
@ -116,7 +116,35 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': False,
}
}
},
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
'info_dict': {
'id': '9ODmcdjQcHQ',
'ext': 'mp4',
'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
'upload_date': '20140225',
'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
'uploader': 'Tested',
'uploader_id': 'testedcom',
},
# No need to test YoutubeIE here
'params': {
'skip_download': True,
},
},
# funnyordie embed
{
'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
'md5': '7cf780be104d40fea7bae52eed4a470e',
'info_dict': {
'id': '18e820ec3f',
'ext': 'mp4',
'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
}
},
]
def report_download_webpage(self, video_id):
@ -211,7 +239,7 @@ class GenericIE(InfoExtractor):
else:
assert ':' in default_search
return self.url_result(default_search + url)
video_id = os.path.splitext(url.split('/')[-1])[0]
video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
self.to_screen('%s: Requesting header' % video_id)
@ -257,7 +285,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed?
try:
doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
doc = parse_xml(webpage)
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
except compat_xml_parse_error:
@ -396,12 +424,33 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook')
# Look for embedded VK player
mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'VK')
# Look for embedded Huffington Post player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'HuffPost')
# Look for embed.ly
mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'))
mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
if mobj is not None:
return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
if matches:
urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
for eurl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:

View File

@ -1,56 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
RegexNotFoundError,
unescapeHTML,
)
class JukeboxIE(InfoExtractor):
_VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
_IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
_VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
_TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'
_IS_YOUTUBE = r'config":{"file":"(?P<youtube_url>http:[\\][/][\\][/]www[.]youtube[.]com[\\][/]watch[?]v=[^"]+)"'
_TEST = {
'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
'md5': '5dc6477e74b1e37042ac5acedd8413e5',
'info_dict': {
'id': 'r303r',
'ext': 'flv',
'title': 'Kosheen-En Vivo Pride',
'uploader': 'Kosheen',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
html = self._download_webpage(url, video_id)
mobj = re.search(self._IFRAME, html)
if mobj is None:
raise ExtractorError(u'Cannot extract iframe url')
iframe_url = unescapeHTML(mobj.group('iframe'))
iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
mobj = re.search(r'class="jkb_waiting"', iframe_html)
if mobj is not None:
raise ExtractorError(u'Video is not available(in your country?)!')
if re.search(r'class="jkb_waiting"', iframe_html) is not None:
raise ExtractorError('Video is not available(in your country?)!')
self.report_extraction(video_id)
mobj = re.search(self._VIDEO_URL, iframe_html)
if mobj is None:
mobj = re.search(self._IS_YOUTUBE, iframe_html)
if mobj is None:
raise ExtractorError(u'Cannot extract video url')
youtube_url = unescapeHTML(mobj.group('youtube_url')).replace('\/','/')
self.to_screen(u'Youtube video detected')
return self.url_result(youtube_url,ie='Youtube')
video_url = unescapeHTML(mobj.group('video_url')).replace('\/','/')
video_ext = unescapeHTML(mobj.group('video_ext'))
try:
video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
iframe_html, 'video url')
video_url = unescapeHTML(video_url).replace('\/', '/')
except RegexNotFoundError:
youtube_url = self._search_regex(
r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
iframe_html, 'youtube url')
youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
self.to_screen('Youtube video detected')
return self.url_result(youtube_url, ie='Youtube')
mobj = re.search(self._TITLE, html)
if mobj is None:
raise ExtractorError(u'Cannot extract title')
title = unescapeHTML(mobj.group('title'))
artist = unescapeHTML(mobj.group('artist'))
title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
html, 'title')
artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
html, 'artist')
return [{'id': video_id,
'url': video_url,
'title': artist + '-' + title,
'ext': video_ext
}]
return {
'id': video_id,
'url': video_url,
'title': artist + '-' + title,
'uploader': artist,
}

View File

@ -6,7 +6,8 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate
unified_strdate,
ExtractorError,
)
@ -34,11 +35,9 @@ class LifeNewsIE(InfoExtractor):
webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
video_url = self._html_search_regex(
r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
thumbnail = self._html_search_regex(
r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
if not videos:
raise ExtractorError('No media links available for %s' % video_id)
title = self._og_search_title(webpage)
TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
@ -57,13 +56,19 @@ class LifeNewsIE(InfoExtractor):
if upload_date is not None:
upload_date = unified_strdate(upload_date)
return {
'id': video_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count),
'upload_date': upload_date,
}
def make_entry(video_id, media, video_number=None):
return {
'id': video_id,
'url': media[1],
'thumbnail': media[0],
'title': title if video_number is None else '%s-video%s' % (title, video_number),
'description': description,
'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count),
'upload_date': upload_date,
}
if len(videos) == 1:
return make_entry(video_id, videos[0])
else:
return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)]

View File

@ -8,7 +8,9 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
ExtractorError
ExtractorError,
int_or_none,
compat_str,
)
@ -19,16 +21,17 @@ class LyndaIE(SubtitlesInfoExtractor):
_LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
_NETRC_MACHINE = 'lynda'
_SUCCESSFUL_LOGIN_REGEX = r'<a href="https://www.lynda.com/home/userAccount/ChangeContactInfo.aspx" data-qa="eyebrow_account_menu">My account'
_SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_TEST = {
'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
'file': '114408.mp4',
'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
'info_dict': {
'id': '114408',
'ext': 'mp4',
'title': 'Using the exercise files',
'duration': 68
}
@ -41,27 +44,44 @@ class LyndaIE(SubtitlesInfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
video_id, 'Downloading video JSON')
page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
'Downloading video JSON')
video_json = json.loads(page)
if 'Status' in video_json:
raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
if video_json['HasAccess'] is False:
raise ExtractorError('Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
raise ExtractorError(
'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
video_id = video_json['ID']
video_id = compat_str(video_json['ID'])
duration = video_json['DurationInSeconds']
title = video_json['Title']
formats = [{'url': fmt['Url'],
formats = []
fmts = video_json.get('Formats')
if fmts:
formats.extend([
{
'url': fmt['Url'],
'ext': fmt['Extension'],
'width': fmt['Width'],
'height': fmt['Height'],
'filesize': fmt['FileSize'],
'format_id': str(fmt['Resolution'])
} for fmt in video_json['Formats']]
} for fmt in fmts])
prioritized_streams = video_json.get('PrioritizedStreams')
if prioritized_streams:
formats.extend([
{
'url': video_url,
'width': int_or_none(format_id),
'format_id': format_id,
} for format_id, video_url in prioritized_streams['0'].items()
])
self._sort_formats(formats)
@ -91,7 +111,7 @@ class LyndaIE(SubtitlesInfoExtractor):
'stayPut': 'false'
}
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
# Not (yet) logged in
m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
@ -116,7 +136,7 @@ class LyndaIE(SubtitlesInfoExtractor):
'stayPut': 'false',
}
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
login_page = self._download_webpage(request, None, note='Confirming log in and log out from another device')
login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
raise ExtractorError('Unable to log in')
@ -150,7 +170,7 @@ class LyndaIE(SubtitlesInfoExtractor):
def _get_available_subtitles(self, video_id, webpage):
url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
sub = self._download_webpage(url, None, note=False)
sub = self._download_webpage(url, None, False)
sub_json = json.loads(sub)
return {'en': url} if len(sub_json) > 0 else {}
@ -179,6 +199,9 @@ class LyndaCourseIE(InfoExtractor):
videos = []
(username, _) = self._get_login_info()
# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
# by single video API anymore
for chapter in course_json['Chapters']:
for video in chapter['Videos']:
if username is None and video['HasAccess'] is False:

View File

@ -0,0 +1,66 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
import datetime
from .common import InfoExtractor
class MailRuIE(InfoExtractor):
IE_NAME = 'mailru'
IE_DESC = 'Видео@Mail.Ru'
_VALID_URL = r'http://(?:www\.)?my\.mail\.ru/video/.*#video=/?(?P<id>[^/]+/[^/]+/[^/]+/\d+)'
_TEST = {
'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
'md5': 'dea205f03120046894db4ebb6159879a',
'info_dict': {
'id': '46301138',
'ext': 'mp4',
'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
'upload_date': '20140224',
'uploader': 'sonypicturesrus',
'uploader_id': 'sonypicturesrus@mail.ru',
'duration': 184,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_data = self._download_json(
'http://videoapi.my.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
author = video_data['author']
uploader = author['name']
uploader_id = author['id']
movie = video_data['movie']
content_id = str(movie['contentId'])
title = movie['title']
thumbnail = movie['poster']
duration = movie['duration']
upload_date = datetime.datetime.fromtimestamp(video_data['timestamp']).strftime('%Y%m%d')
view_count = video_data['views_count']
formats = [
{
'url': video['url'],
'format_id': video['name'],
} for video in video_data['videos']
]
return {
'id': content_id,
'title': title,
'thumbnail': thumbnail,
'upload_date': upload_date,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
unified_strdate,
compat_urllib_parse,
ExtractorError,
)
@ -15,8 +16,9 @@ class MixcloudIE(InfoExtractor):
_TEST = {
'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
'file': 'dholbach-cryptkeeper.mp3',
'info_dict': {
'id': 'dholbach-cryptkeeper',
'ext': 'mp3',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
@ -45,7 +47,7 @@ class MixcloudIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group(1)
cloudcast_name = mobj.group(2)
track_id = '-'.join((uploader, cloudcast_name))
track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
webpage = self._download_webpage(url, track_id)

View File

@ -5,9 +5,12 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
HEADRequest,
unescapeHTML,
url_basename,
RegexNotFoundError,
)
@ -18,6 +21,7 @@ def _media_xml_tag(tag):
class MTVServicesInfoExtractor(InfoExtractor):
_MOBILE_TEMPLATE = None
@staticmethod
def _id_from_uri(uri):
return uri.split(':')[-1]
@ -39,9 +43,29 @@ class MTVServicesInfoExtractor(InfoExtractor):
else:
return thumb_node.attrib['url']
def _extract_video_formats(self, mdoc):
if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None:
raise ExtractorError('This video is not available from your country.', expected=True)
def _extract_mobile_video_formats(self, mtvn_id):
webpage_url = self._MOBILE_TEMPLATE % mtvn_id
req = compat_urllib_request.Request(webpage_url)
# Otherwise we get a webpage that would execute some javascript
req.add_header('Youtubedl-user-agent', 'curl/7')
webpage = self._download_webpage(req, mtvn_id,
'Downloading mobile page')
metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
req = HEADRequest(metrics_url)
response = self._request_webpage(req, mtvn_id, 'Resolving url')
url = response.geturl()
# Transform the url to get the best quality:
url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
return [{'url': url,'ext': 'mp4'}]
def _extract_video_formats(self, mdoc, mtvn_id):
if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None:
if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
self.to_screen('The normal version is not available from your '
'country, trying with the mobile version')
return self._extract_mobile_video_formats(mtvn_id)
raise ExtractorError('This video is not available from your country.',
expected=True)
formats = []
for rendition in mdoc.findall('.//rendition'):
@ -94,9 +118,16 @@ class MTVServicesInfoExtractor(InfoExtractor):
raise ExtractorError('Could not find video title')
title = title.strip()
# This a short id that's used in the webpage urls
mtvn_id = None
mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
'scheme', 'urn:mtvn:id')
if mtvn_id_node is not None:
mtvn_id = mtvn_id_node.text
return {
'title': title,
'formats': self._extract_video_formats(mediagen_doc),
'formats': self._extract_video_formats(mediagen_doc, mtvn_id),
'id': video_id,
'thumbnail': self._get_thumbnail_url(uri, itemdoc),
'description': description,

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import binascii
import base64
import hashlib
@ -14,18 +16,16 @@ from ..utils import (
)
class MyVideoIE(InfoExtractor):
"""Information Extractor for myvideo.de."""
_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*'
IE_NAME = u'myvideo'
_VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*'
IE_NAME = 'myvideo'
_TEST = {
u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
u'file': u'8229274.flv',
u'md5': u'2d2753e8130479ba2cb7e0a37002053e',
u'info_dict': {
u"title": u"bowling-fail-or-win"
'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
'md5': '2d2753e8130479ba2cb7e0a37002053e',
'info_dict': {
'id': '8229274',
'ext': 'flv',
'title': 'bowling-fail-or-win',
}
}
@ -53,10 +53,7 @@ class MyVideoIE(InfoExtractor):
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'invalid URL: %s' % url)
video_id = mobj.group(1)
video_id = mobj.group('id')
GK = (
b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
@ -74,37 +71,33 @@ class MyVideoIE(InfoExtractor):
video_url = mobj.group(1) + '.flv'
video_title = self._html_search_regex('<title>([^<]+)</title>',
webpage, u'title')
webpage, 'title')
video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
return [{
'id': video_id,
'url': video_url,
'uploader': None,
'upload_date': None,
'title': video_title,
'ext': video_ext,
}]
return {
'id': video_id,
'url': video_url,
'title': video_title,
}
mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage)
if mobj is not None:
request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '')
response = self._download_webpage(request, video_id,
u'Downloading video info')
'Downloading video info')
info = json.loads(base64.b64decode(response).decode('utf-8'))
return {'id': video_id,
'title': info['title'],
'url': info['streaming_url'].replace('rtmpe', 'rtmpt'),
'play_path': info['filename'],
'ext': 'flv',
'thumbnail': info['thumbnail'][0]['url'],
}
return {
'id': video_id,
'title': info['title'],
'url': info['streaming_url'].replace('rtmpe', 'rtmpt'),
'play_path': info['filename'],
'ext': 'flv',
'thumbnail': info['thumbnail'][0]['url'],
}
# try encxml
mobj = re.search('var flashvars={(.+?)}', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract video')
raise ExtractorError('Unable to extract video')
params = {}
encxml = ''
@ -118,7 +111,7 @@ class MyVideoIE(InfoExtractor):
params['domain'] = 'www.myvideo.de'
xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
if 'flash_playertype=MTV' in xmldata_url:
self._downloader.report_warning(u'avoiding MTV player')
self._downloader.report_warning('avoiding MTV player')
xmldata_url = (
'http://www.myvideo.de/dynamic/get_player_video_xml.php'
'?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
@ -144,7 +137,7 @@ class MyVideoIE(InfoExtractor):
video_url = compat_urllib_parse.unquote(mobj.group(1))
if 'myvideo2flash' in video_url:
self.report_warning(
u'Rewriting URL to use unencrypted rtmp:// ...',
'Rewriting URL to use unencrypted rtmp:// ...',
video_id)
video_url = video_url.replace('rtmpe://', 'rtmp://')
@ -152,39 +145,31 @@ class MyVideoIE(InfoExtractor):
# extract non rtmp videos
mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
if mobj is None:
raise ExtractorError(u'unable to extract url')
raise ExtractorError('unable to extract url')
video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
video_file = compat_urllib_parse.unquote(video_file)
if not video_file.endswith('f4m'):
ppath, prefix = video_file.split('.')
video_playpath = '%s:%s' % (prefix, ppath)
video_hls_playlist = ''
else:
video_playpath = ''
video_hls_playlist = (
video_file
).replace('.f4m', '.m3u8')
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
video_swfobj = compat_urllib_parse.unquote(video_swfobj)
video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
webpage, u'title')
webpage, 'title')
return [{
'id': video_id,
'url': video_url,
'tc_url': video_url,
'uploader': None,
'upload_date': None,
'title': video_title,
'ext': u'flv',
'play_path': video_playpath,
'video_file': video_file,
'video_hls_playlist': video_hls_playlist,
'player_url': video_swfobj,
}]
return {
'id': video_id,
'url': video_url,
'tc_url': video_url,
'title': video_title,
'ext': 'flv',
'play_path': video_playpath,
'player_url': video_swfobj,
}

View File

@ -8,6 +8,7 @@ from .common import InfoExtractor
from ..utils import (
HEADRequest,
unified_strdate,
ExtractorError,
)
@ -35,7 +36,15 @@ class ORFIE(InfoExtractor):
data_json = self._search_regex(
r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
all_data = json.loads(data_json)
sdata = all_data[0]['values']['segments']
def get_segments(all_data):
for data in all_data:
if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
return data['values']['segments']
sdata = get_segments(all_data)
if not sdata:
raise ExtractorError('Unable to extract segments')
def quality_to_int(s):
m = re.search('([0-9]+)', s)

View File

@ -1,76 +1,43 @@
from __future__ import unicode_literals
import datetime
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com."""
# TODO: the original _VALID_URL was:
# r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
# Check if it's necessary to keep the old extracion process
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
IE_NAME = u'photobucket'
_VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
_TEST = {
u'url': u'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
u'file': u'zpsc0c3b9fa.mp4',
u'md5': u'7dabfb92b0a31f6c16cebc0f8e60ff99',
u'info_dict': {
u"upload_date": u"20130504",
u"uploader": u"rachaneronas",
u"title": u"Tired of Link Building? Try BacklinkMyDomain.com!"
'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
'file': 'zpsc0c3b9fa.mp4',
'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
'info_dict': {
'upload_date': '20130504',
'uploader': 'rachaneronas',
'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',
}
}
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
video_extension = mobj.group('ext')
# Retrieve video webpage to extract further information
webpage = self._download_webpage(url, video_id)
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
# We try first by looking the javascript code:
mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
if mobj is not None:
info = json.loads(mobj.group('json'))
return [{
'id': video_id,
'url': info[u'downloadUrl'],
'uploader': info[u'username'],
'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
'title': info[u'title'],
'ext': video_extension,
'thumbnail': info[u'thumbUrl'],
}]
# We try looking in other parts of the webpage
video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
webpage, u'video URL')
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract title')
video_title = mobj.group(1).decode('utf-8')
video_uploader = mobj.group(2).decode('utf-8')
return [{
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
'ext': video_extension.decode('utf-8'),
}]
info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
webpage, 'info json')
info = json.loads(info_json)
return {
'id': video_id,
'url': info['downloadUrl'],
'uploader': info['username'],
'upload_date': datetime.date.fromtimestamp(info['creationDate']).strftime('%Y%m%d'),
'title': info['title'],
'ext': video_extension,
'thumbnail': info['thumbUrl'],
}

View File

@ -0,0 +1,80 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
)
class PlayvidIE(InfoExtractor):
_VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
_TEST = {
'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
'md5': '44930f8afa616efdf9482daf4fe53e1e',
'info_dict': {
'id': 'agbDDi7WZTV',
'ext': 'mp4',
'title': 'Michelle Lewin in Miami Beach',
'duration': 240,
'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_title = None
duration = None
video_thumbnail = None
formats = []
# most of the information is stored in the flashvars
flashvars = self._html_search_regex(
r'flashvars="(.+?)"', webpage, 'flashvars')
infos = compat_urllib_parse.unquote(flashvars).split(r'&')
for info in infos:
videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
if videovars_match:
key = videovars_match.group(1)
val = videovars_match.group(2)
if key == 'title':
video_title = compat_urllib_parse.unquote_plus(val)
if key == 'duration':
try:
duration = int(val)
except ValueError:
pass
if key == 'big_thumb':
video_thumbnail = val
videourl_match = re.match(
r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
if videourl_match:
height = int(videourl_match.group('resolution'))
formats.append({
'height': height,
'url': val,
})
self._sort_formats(formats)
# Extract title - should be in the flashvars; if not, look elsewhere
if video_title is None:
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
return {
'id': video_id,
'formats': formats,
'title': video_title,
'thumbnail': video_thumbnail,
'duration': duration,
'description': None,
'age_limit': 18
}

View File

@ -44,7 +44,7 @@ class PornHubIE(InfoExtractor):
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
if webpage.find('"encrypted":true') != -1:
password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password').replace('+', ' ')
password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
formats = []

View File

@ -51,14 +51,14 @@ class ProSiebenSat1IE(InfoExtractor):
'skip': 'Seems to be broken',
},
{
'url': 'http://www.prosiebenmaxx.de/yep/one-piece/video/148-folge-48-gold-rogers-heimat-ganze-folge',
'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
'info_dict': {
'id': '2437108',
'id': '2429369',
'ext': 'mp4',
'title': 'Folge 48: Gold Rogers Heimat',
'description': 'Ruffy erreicht die Insel, auf der der berühmte Gold Roger lebte und hingerichtet wurde.',
'upload_date': '20140226',
'duration': 1401.48,
'title': 'Countdown für die Autowerkstatt',
'description': 'md5:809fc051a457b5d8666013bc40698817',
'upload_date': '20140223',
'duration': 2595.04,
},
'params': {
# rtmp download

View File

@ -54,6 +54,7 @@ class SoundcloudIE(InfoExtractor):
'id': '47127627',
'ext': 'mp3',
'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
'upload_date': '20120521',
},
@ -217,7 +218,7 @@ class SoundcloudIE(InfoExtractor):
return self._extract_info_dict(info, full_title, secret_token=token)
class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
_VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
IE_NAME = 'soundcloud:set'
# it's in tests/test_playlists.py
_TESTS = []

View File

@ -1,10 +1,15 @@
from __future__ import unicode_literals
import re
from .mtv import MTVServicesInfoExtractor
class SpikeIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+'
_VALID_URL = r'''(?x)https?://
(www\.spike\.com/(video-clips|episodes)/.+|
m\.spike\.com/videos/video.rbml\?id=(?P<mobile_id>[^&]+))
'''
_TEST = {
'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
'md5': '1a9265f32b0c375793d6c4ce45255256',
@ -17,3 +22,11 @@ class SpikeIE(MTVServicesInfoExtractor):
}
_FEED_URL = 'http://www.spike.com/feeds/mrss/'
_MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s'
def _real_extract(self, url):
mobj = re.search(self._VALID_URL, url)
mobile_id = mobj.group('mobile_id')
if mobile_id is not None:
url = 'http://www.spike.com/video-clips/%s' % mobile_id
return super(SpikeIE, self)._real_extract(url)

View File

@ -6,115 +6,111 @@ import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
RegexNotFoundError,
compat_str,
)
class TEDIE(SubtitlesInfoExtractor):
_VALID_URL=r'''http://www\.ted\.com/
(
((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
_VALID_URL = r'''(?x)http://www\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
'''
_TEST = {
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'file': '102.mp4',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
"description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
"title": "Dan Dennett: The illusion of consciousness"
'id': '102',
'ext': 'mp4',
'title': 'The illusion of consciousness',
'description': ('Philosopher Dan Dennett makes a compelling '
'argument that not only don\'t we understand our own '
'consciousness, but that half the time our brains are '
'actively fooling us.'),
'uploader': 'Dan Dennett',
}
}
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
_FORMATS_PREFERENCE = {
'low': 1,
'medium': 2,
'high': 3,
}
def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url):
m=re.match(self._VALID_URL, url, re.VERBOSE)
m = re.match(self._VALID_URL, url, re.VERBOSE)
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url)
else :
playlist_id=m.group('playlist_id')
name=m.group('name')
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
return self._talk_info(url, name)
else:
return self._playlist_videos_info(url, name)
def _playlist_videos_info(self, url, name, playlist_id):
def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
webpage = self._download_webpage(
url, playlist_id, 'Downloading playlist webpage')
matches = re.finditer(
r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
webpage)
playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
webpage, 'playlist title')
webpage = self._download_webpage(url, name,
'Downloading playlist webpage')
info = self._extract_info(webpage)
playlist_info = info['playlist']
playlist_entries = [
self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
for m in matches
self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in info['talks']
]
return self.playlist_result(
playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
playlist_entries,
playlist_id=compat_str(playlist_info['id']),
playlist_title=playlist_info['title'])
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
m = re.match(self._VALID_URL, url,re.VERBOSE)
video_name = m.group('name')
webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
# If the url includes the language we get the title translated
title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
webpage, 'title')
json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
webpage, 'json data')
info = json.loads(json_data)
desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
webpage, 'description', flags = re.DOTALL)
thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
webpage, 'thumbnail')
talk_info = self._extract_info(webpage)['talks'][0]
formats = [{
'ext': 'mp4',
'url': stream['file'],
'format': stream['id']
} for stream in info['htmlStreams']]
video_id = info['id']
'url': format_url,
'format_id': format_id,
'format': format_id,
'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
} for (format_id, format_url) in talk_info['nativeDownloads'].items()]
self._sort_formats(formats)
video_id = compat_str(talk_info['id'])
# subtitles
video_subtitles = self.extract_subtitles(video_id, webpage)
video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, webpage)
self._list_available_subtitles(video_id, talk_info)
return
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'description': desc,
'title': talk_info['title'],
'uploader': talk_info['speaker'],
'thumbnail': talk_info['thumb'],
'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
}
def _get_available_subtitles(self, video_id, webpage):
try:
options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
languages = re.findall(r'(?:<option value=")(\S+)"', options)
if languages:
sub_lang_list = {}
for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
except RegexNotFoundError:
def _get_available_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages:
sub_lang_list = {}
for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
else:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
return {}

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from youtube_dl.utils import ExtractorError
from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):

View File

@ -0,0 +1,84 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
unified_strdate,
clean_html,
int_or_none,
)
class TvigleIE(InfoExtractor):
IE_NAME = 'tvigle'
IE_DESC = 'Интернет-телевидение Tvigle.ru'
_VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
_TESTS = [
{
'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
'md5': '09afba4616666249f087efc6dcf83cb3',
'info_dict': {
'id': '503081',
'ext': 'flv',
'title': 'Брат 2 ',
'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
'upload_date': '20110919',
},
},
{
'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '676433',
'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'upload_date': '20121218',
},
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_data = self._download_xml(
'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')
video = video_data.find('./video')
title = video.get('name')
description = video.get('anons')
if description:
description = clean_html(description)
thumbnail = video_data.get('img')
upload_date = unified_strdate(video.get('date'))
like_count = int_or_none(video.get('vtp'))
formats = []
for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
video_url = video.get(format_id)
if not video_url:
continue
formats.append({
'url': video_url,
'format_id': format_id,
'format_note': format_note,
'quality': num,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'like_count': like_count,
'age_limit': 18,
'formats': formats,
}

View File

@ -57,7 +57,7 @@ class VevoIE(InfoExtractor):
'age_limit': 18,
'title': 'Tunnel Vision (Explicit)',
'uploader': 'Justin Timberlake',
'upload_date': '20130704',
'upload_date': '20130703',
},
'params': {
'skip_download': 'true',
@ -169,7 +169,7 @@ class VevoIE(InfoExtractor):
timestamp_ms = int(self._search_regex(
r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
upload_date = datetime.datetime.utcfromtimestamp(timestamp_ms // 1000)
return {
'id': video_id,
'title': video_info['title'],

View File

@ -10,10 +10,9 @@ from ..utils import (
)
class VestiIE(InfoExtractor):
IE_NAME = 'vesti'
IE_DESC = 'Вести.Ru'
_VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
class VGTRKIE(InfoExtractor):
IE_DESC = 'ВГТРК'
_VALID_URL = r'http://(?:.+?\.)?(?:vesti\.ru|russia2?\.tv|tvkultura\.ru|rutv\.ru)/(?P<id>.+)'
_TESTS = [
{
@ -30,6 +29,20 @@ class VestiIE(InfoExtractor):
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/doc.html?id=1349233',
'info_dict': {
'id': '773865',
'ext': 'mp4',
'title': 'Участники митинга штурмуют Донецкую областную администрацию',
'description': 'md5:1a160e98b3195379b4c849f2f4958009',
'duration': 210,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://www.vesti.ru/only_video.html?vid=576180',
'info_dict': {
@ -44,6 +57,20 @@ class VestiIE(InfoExtractor):
'skip_download': True,
},
},
{
'url': 'http://hitech.vesti.ru/news/view/id/4000',
'info_dict': {
'id': '766888',
'ext': 'mp4',
'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
'duration': 279,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
'info_dict': {
@ -57,7 +84,7 @@ class VestiIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia'
'skip': 'Blocked outside Russia',
},
{
'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
@ -72,7 +99,78 @@ class VestiIE(InfoExtractor):
'skip_download': True,
},
'skip': 'Translation has finished'
}
},
{
'url': 'http://russia.tv/video/show/brand_id/5169/episode_id/970443/video_id/975648',
'info_dict': {
'id': '771852',
'ext': 'mp4',
'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
'duration': 3096,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://russia.tv/brand/show/brand_id/57638',
'info_dict': {
'id': '774016',
'ext': 'mp4',
'title': 'Чужой в семье Сталина',
'description': '',
'duration': 2539,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://2.russia.tv/video/show/brand_id/48863/episode_id/972920/video_id/978667/viewtype/picture',
'info_dict': {
'id': '775081',
'ext': 'mp4',
'title': 'XXII зимние Олимпийские игры. Россияне заняли весь пьедестал в лыжных гонках',
'description': 'md5:15d3741dd8d04b203fbc031c6a47fb0f',
'duration': 101,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Blocked outside Russia',
},
{
'url': 'http://tvkultura.ru/video/show/brand_id/31724/episode_id/972347/video_id/978186',
'info_dict': {
'id': '774471',
'ext': 'mp4',
'title': 'Монологи на все времена',
'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
'duration': 2906,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
{
'url': 'http://rutv.ru/brand/show/id/6792/channel/75',
'info_dict': {
'id': '125521',
'ext': 'mp4',
'title': 'Грустная дама червей. Х',
'description': '',
'duration': 4882,
},
'params': {
# m3u8 download
'skip_download': True,
},
},
]
def _real_extract(self, url):
@ -81,16 +179,26 @@ class VestiIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page')
mobj = re.search(r'<meta property="og:video" content=".+?\.swf\?v?id=(?P<id>\d+).*?" />', page)
mobj = re.search(
r'<meta property="og:video" content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
page)
if mobj:
video_id = mobj.group('id')
page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
'Downloading video page')
mobj = re.search(
r'<meta property="og:video" content="http://player\.rutv\.ru/flash2v/container\.swf\?id=(?P<id>\d+)', page)
if mobj:
video_type = 'video'
video_id = mobj.group('id')
else:
mobj = re.search(
r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page)
r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>',
page)
if not mobj:
raise ExtractorError('No media found')
raise ExtractorError('No media found', expected=True)
video_type = mobj.group('type')
video_id = mobj.group('id')
@ -113,8 +221,8 @@ class VestiIE(InfoExtractor):
priority_transport = playlist['priority_transport']
thumbnail = media['picture']
width = media['width']
height = media['height']
width = int_or_none(media['width'])
height = int_or_none(media['height'])
description = media['anons']
title = media['title']
duration = int_or_none(media.get('duration'))

View File

@ -1,22 +1,23 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .internetvideoarchive import InternetVideoArchiveIE
from ..utils import (
compat_urlparse,
)
from ..utils import compat_urlparse
class VideoDetectiveIE(InfoExtractor):
_VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
u'file': u'194487.mp4',
u'info_dict': {
u'title': u'KICK-ASS 2',
u'description': u'md5:65ba37ad619165afac7d432eaded6013',
u'duration': 135,
'url': 'http://www.videodetective.com/movies/kick-ass-2/194487',
'info_dict': {
'id': '194487',
'ext': 'mp4',
'title': 'KICK-ASS 2',
'description': 'md5:65ba37ad619165afac7d432eaded6013',
'duration': 135,
},
}
@ -26,5 +27,4 @@ class VideoDetectiveIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage)
query = compat_urlparse.urlparse(og_video).query
return self.url_result(InternetVideoArchiveIE._build_url(query),
ie=InternetVideoArchiveIE.ie_key())
return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key())

View File

@ -8,6 +8,7 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
clean_html,
@ -172,7 +173,18 @@ class VimeoIE(SubtitlesInfoExtractor):
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url, None, headers)
webpage = self._download_webpage(request, video_id)
try:
webpage = self._download_webpage(request, video_id)
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
errmsg = ee.cause.read()
if b'Because of its privacy settings, this video cannot be played here' in errmsg:
raise ExtractorError(
'Cannot download embed-only video without embedding '
'URL. Please call youtube-dl with the URL of the page '
'that embeds this video.',
expected=True)
raise
# Now we begin extracting as much information as we can from what we
# retrieved. First we extract the information common to all extractors,
@ -221,7 +233,9 @@ class VimeoIE(SubtitlesInfoExtractor):
# Extract video thumbnail
video_thumbnail = config["video"].get("thumbnail")
if video_thumbnail is None:
_, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
video_thumbs = config["video"].get("thumbs")
if video_thumbs and isinstance(video_thumbs, dict):
_, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1]
# Extract video description
video_description = None

View File

@ -16,7 +16,7 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
_VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
_VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [
@ -42,6 +42,18 @@ class VKIE(InfoExtractor):
'duration': 558,
}
},
{
'note': 'Embedded video',
'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
'info_dict': {
'id': '162925554',
'ext': 'mp4',
'uploader': 'Vladimir Gavrin',
'title': 'Lin Dan',
'duration': 101,
}
},
{
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
@ -54,7 +66,7 @@ class VKIE(InfoExtractor):
'duration': 8352,
},
'skip': 'Requires vk account credentials',
}
},
]
def _login(self):
@ -82,7 +94,10 @@ class VKIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = mobj.group('videoid')
if not video_id:
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
info_page = self._download_webpage(info_url, video_id)

View File

@ -13,7 +13,7 @@ class VubeIE(InfoExtractor):
_TEST = {
'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
'md5': 'f81dcf6d0448e3291f54380181695821',
'md5': 'db7aba89d4603dadd627e9d1973946fe',
'info_dict': {
'id': 'YL2qNPkqon',
'ext': 'mp4',
@ -77,4 +77,4 @@ class VubeIE(InfoExtractor):
'like_count': like_count,
'dislike_count': dislike_count,
'comment_count': comment_count,
}
}

View File

@ -1,55 +1,49 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
)
class XNXXIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'
VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
_VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
_TEST = {
u'url': u'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
u'file': u'1135332.flv',
u'md5': u'0831677e2b4761795f68d417e0b7b445',
u'info_dict': {
u"title": u"lida \u00bb Naked Funny Actress (5)",
u"age_limit": 18,
'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
'md5': '0831677e2b4761795f68d417e0b7b445',
'info_dict': {
'id': '1135332',
'ext': 'flv',
'title': 'lida » Naked Funny Actress (5)',
'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1)
video_id = mobj.group('id')
# Get webpage content
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(self.VIDEO_URL_RE,
webpage, u'video URL')
video_url = self._search_regex(r'flv_url=(.*?)&amp;',
webpage, 'video URL')
video_url = compat_urllib_parse.unquote(video_url)
video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
webpage, u'title')
video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
webpage, 'title')
video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
webpage, u'thumbnail', fatal=False)
video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&amp;',
webpage, 'thumbnail', fatal=False)
return [{
return {
'id': video_id,
'url': video_url,
'uploader': None,
'upload_date': None,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
'description': None,
'age_limit': 18,
}]
}

View File

@ -7,19 +7,24 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
parse_duration,
str_to_int,
)
class XTubeIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'file': 'kVTUy_G222_.mp4',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
"title": "strange erotica",
"description": "surreal gay themed erotica...almost an ET kind of thing",
"uploader": "greenshowers",
"age_limit": 18,
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
'description': 'surreal gay themed erotica...almost an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,
}
}
@ -32,10 +37,23 @@ class XTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
video_url = self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
duration = parse_duration(self._html_search_regex(
r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
view_count = self._html_search_regex(
r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
if comment_count:
comment_count = str_to_int(comment_count)
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
@ -48,6 +66,9 @@ class XTubeIE(InfoExtractor):
'title': video_title,
'uploader': video_uploader,
'description': video_description,
'duration': duration,
'view_count': view_count,
'comment_count': comment_count,
'url': video_url,
'ext': extension,
'format': format,

View File

@ -1285,10 +1285,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Decide which formats to download
try:
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
if not mobj:
raise ValueError('Could not find vevo ID')
ytplayer_config = json.loads(mobj.group(1))
json_code = uppercase_escape(mobj.group(1))
ytplayer_config = json.loads(json_code)
args = ytplayer_config['args']
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# this signatures are encrypted
@ -1645,7 +1646,7 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@ -1744,12 +1745,50 @@ class YoutubeSearchIE(SearchInfoExtractor):
for video_id in video_ids]
return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = u'YouTube.com searches, newest videos first'
class YoutubeSearchURLIE(InfoExtractor):
IE_DESC = u'YouTube.com search URLs'
IE_NAME = u'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = compat_urllib_parse.unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
part_codes = re.findall(
r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
entries = []
for part_code in part_codes:
part_title = self._html_search_regex(
r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
part_url_snippet = self._html_search_regex(
r'(?s)href="([^"]+)"', part_code, 'item URL')
part_url = compat_urlparse.urljoin(
'https://www.youtube.com/', part_url_snippet)
entries.append({
'_type': 'url',
'url': part_url,
'title': part_title,
})
return {
'_type': 'playlist',
'entries': entries,
'title': query,
}
class YoutubeShowIE(InfoExtractor):
IE_DESC = u'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

View File

@ -22,6 +22,7 @@ import struct
import subprocess
import sys
import traceback
import xml.etree.ElementTree
import zlib
try:
@ -1263,3 +1264,17 @@ def read_batch_urls(batch_fd):
with contextlib.closing(batch_fd) as fd:
return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
def parse_xml(s):
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
def doctype(self, name, pubid, system):
pass # Ignore doctypes
parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)

View File

@ -1,2 +1,2 @@
__version__ = '2014.02.27.1'
__version__ = '2014.03.11'