Compare commits

...

67 Commits

Author SHA1 Message Date
8c5850eeb4 release 2014.03.29 2014-03-29 14:01:53 +01:00
bd3e077a2d Merge branch 'master' of github.com:rg3/youtube-dl 2014-03-29 14:01:19 +01:00
7e70ac36b3 [bloomberg] Fix extraction (fixes #2154)
Stop using the OoyalaIE, extract the f4m url instead.
2014-03-29 11:55:12 +01:00
2cc0082dc0 Credit @phaer for OE1 (#2646) 2014-03-29 10:11:32 +01:00
056b56688a [ntv] Simplify 2014-03-29 15:55:03 +07:00
b17418313f [oe1] Simplify (#2646) 2014-03-28 23:23:58 +01:00
e9a6fd6a68 Merge remote-tracking branch 'phaer/add-oe1-support' 2014-03-28 23:21:58 +01:00
bf30f3bd9d release 2014.03.28 2014-03-28 23:14:54 +01:00
330edf2d84 Mention where to find keys in --dump-json (Fixes #2648) 2014-03-28 23:13:03 +01:00
43f775e4ca [comedycentral] Duration can now be a float (Fixes #2647) 2014-03-28 23:06:34 +01:00
8f6562448c [ntv] Move app guess outside formats loop 2014-03-28 23:09:56 +07:00
263f4b514b [ntv] Add support for ntv.ru (Closes #2581) 2014-03-28 23:01:08 +07:00
f0da3f1ef9 [oe1] Add support for oe1.orf.at. 2014-03-28 17:57:25 +02:00
cb3ac1c610 [smotri] Modernize and add support for emdebbed videos (Closes #2585) 2014-03-28 19:58:49 +07:00
8efd15f477 [canalplus] Fix video id extraction (Closes #2645) 2014-03-28 18:47:15 +07:00
d26ebe990f [ehow] Modernize 2014-03-27 21:23:02 +01:00
28acf5500a [appletrailers] Modernize 2014-03-27 21:10:51 +01:00
214c22c704 [niconico] Modernize 2014-03-27 21:01:09 +01:00
8cdafb47b9 [mooshare] Add support for URLs starting with 'www' 2014-03-27 19:08:35 +07:00
0dae5083f1 [urort] Add date 2014-03-27 02:56:23 +01:00
4c89bbd22c release 2014.03.27.1 2014-03-27 02:52:06 +01:00
e2b06e76c1 [urort] Add extractor (Fixes #2634) 2014-03-27 02:51:50 +01:00
e9c076c317 [clipsyndicate] Modernize 2014-03-27 02:30:00 +01:00
6c072e7d25 release 2014.03.27 2014-03-27 02:22:57 +01:00
ac6c104871 [ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 02:22:40 +01:00
69c01a9f68 [comedycentral] Add a testcase for extended-interviews URLs (#2636) 2014-03-27 02:02:48 +01:00
e55213ce35 Merge remote-tracking branch 'malept/tds-extended-interviews' 2014-03-27 02:02:18 +01:00
24a2aac445 [comedycentral] fix TDS extended interviews
The new website broke the URL format.
Added "playlist" as a valid ID keyword.
2014-03-26 10:51:02 -07:00
98acdc895b Merge remote-tracking branch 'dstftw/download-referer-header' (closes #2628) 2014-03-26 15:20:11 +01:00
bd3b5b8b10 [slashdot] Remove extractor
The generic ooyala detection works fine.
2014-03-26 15:09:14 +01:00
9a90636805 [vice] Remove extractor
The generic ooyala detection works fine.
2014-03-26 15:03:34 +01:00
6a66ae96ed [cspan] Roll back unfinished rtmp support 2014-03-26 19:51:54 +07:00
2c8a4ba6b5 Makefile: include the docs in the tarball 2014-03-26 12:01:08 +01:00
ad8915b729 Add --no-warnings option (Fixes #2630) 2014-03-26 00:43:46 +01:00
34cbc7ee8d [comedycentral] Better titles 2014-03-25 23:46:51 +01:00
a59e40a1ea Replace 'referer' with 'http_referer' 2014-03-25 21:53:26 +07:00
ad0a75db6b [auengine] Add referer 2014-03-25 21:22:41 +07:00
1d0e49e1c7 Use explicitly set Referer header for downloading 2014-03-25 21:22:27 +07:00
b4461b6ebe [auengine] Modernize 2014-03-25 21:16:10 +07:00
80959224fe release 2014.03.25.1 2014-03-25 14:27:40 +01:00
865cbf4fc5 [comedycentral] Correct uri (Fixes #2627) 2014-03-25 14:27:23 +01:00
196f061cac release 2014.03.25 2014-03-25 04:01:13 +01:00
99b380c33b [comedycentral] Fix thedailyshow / thecolbertreport (Fixes #2600, #2596) 2014-03-25 04:00:57 +01:00
02e4482e22 release 2014.03.24.5 2014-03-24 23:23:38 +01:00
b8a792de80 Merge remote-tracking branch 'origin/master' into HEAD
Conflicts:
	youtube_dl/extractor/arte.py
2014-03-24 23:23:17 +01:00
fac55558ad [washingtonpost] Add extractor (Fixes #2622) 2014-03-24 23:21:20 +01:00
b2799ff96d [arte] Fix videos.arte.tv extraction 2014-03-24 22:38:51 +01:00
7a249480b4 [arte] Fix video.arte.tv extractor 2014-03-24 22:34:03 +01:00
f605128d13 [rts] Add thumbnail support 2014-03-24 22:32:04 +01:00
ba40a74666 [clipfish] Modernize 2014-03-24 22:30:32 +01:00
fb8ae2d438 release 2014.03.24.4 2014-03-24 22:03:51 +01:00
893f8832b5 [arte] Add support for embedded videos (Fixes #2620) 2014-03-24 22:01:47 +01:00
878d11ec29 [arte] Add support for multiple formats 2014-03-24 21:36:26 +01:00
515bbe4b5b [arte] Remove liveweb support
liveweb.arte.tv is no longer functional, everything has moved to concert
2014-03-24 21:31:19 +01:00
75f2e25ba9 [downloader/hls] Encode filename (Fixes #2609) 2014-03-24 21:23:05 +01:00
0d466d34a3 release 2014.03.24.3 2014-03-24 17:12:42 +01:00
6949d81095 [byutv] Add support (Fixes #2612) 2014-03-24 17:12:15 +01:00
f847ca02d3 [addanime] Modernize 2014-03-24 16:39:53 +01:00
510243ba58 release 2014.03.24.2 2014-03-24 15:00:47 +01:00
b540697a8a [veoh] Improve extraction, fix youtube extraction (Closes #2616) 2014-03-24 20:53:03 +07:00
0d3641e589 [cinemassacre] Fix #2815 2014-03-24 13:43:13 +01:00
72546c831e Merge pull request #2553 from anisse/master
Add an option to specify custom HTTP headers
2014-03-24 10:42:58 +01:00
d26db9269d release 2014.03.24.1 2014-03-24 10:25:58 +01:00
4c0941853a [devscripts/release] Check version number 2014-03-24 10:25:49 +01:00
c11726364e release 2014.03.24 2014-03-24 10:17:35 +01:00
c577d735c6 release 2013.03.24.2 2014-03-24 02:24:31 +01:00
410afb2003 Add an option to specify custom HTTP headers 2014-03-17 16:40:41 +01:00
38 changed files with 990 additions and 433 deletions

View File

@ -72,8 +72,9 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
--exclude '__pycache' \
--exclude '.git' \
--exclude 'testdata' \
--exclude 'docs/_build' \
-- \
bin devscripts test youtube_dl \
bin devscripts test youtube_dl docs \
CHANGELOG LICENSE README.md README.txt \
Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion setup.py \
youtube-dl

View File

@ -28,6 +28,9 @@ which means you can modify it, redistribute it or use it however you like.
--user-agent UA specify a custom user agent
--referer REF specify a custom referer, use if the video
access is restricted to one domain
--add-header FIELD:VALUE specify a custom HTTP header and its value,
separated by a colon ':'. You can use this
option multiple times
--list-extractors List all supported extractors and the URLs
they would handle
--extractor-descriptions Output descriptions of all supported
@ -166,6 +169,7 @@ which means you can modify it, redistribute it or use it however you like.
## Verbosity / Simulation Options:
-q, --quiet activates quiet mode
--no-warnings Ignore warnings
-s, --simulate do not download the video and do not write
anything to disk
--skip-download do not download the video
@ -177,7 +181,9 @@ which means you can modify it, redistribute it or use it however you like.
--get-duration simulate, quiet but print video length
--get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format
-j, --dump-json simulate, quiet but print JSON information
-j, --dump-json simulate, quiet but print JSON information.
See --output for a description of available
keys.
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar

View File

@ -22,6 +22,12 @@ fi
if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
version="$1"
major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p')
if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then
echo "$version does not start with today's date!"
exit 1
fi
if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi
if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi
useless_files=$(find youtube_dl -type f -not -name '*.py')

View File

@ -143,5 +143,8 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
def test_ComedyCentralShows(self):
self.assertMatch('http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', ['ComedyCentralShows'])
if __name__ == '__main__':
unittest.main()

View File

@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests
import io
import json
import xml.etree.ElementTree
#from youtube_dl.utils import htmlentity_transform
@ -36,6 +37,7 @@ from youtube_dl.utils import (
urlencode_postdata,
xpath_with_ns,
parse_iso8601,
strip_jsonp,
)
if sys.version_info < (3, 0):
@ -272,5 +274,11 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
def test_strip_jsonp(self):
stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
d = json.loads(stripped)
self.assertEqual(d, [{"id": "532cb", "x": 3}])
if __name__ == '__main__':
unittest.main()

View File

@ -94,6 +94,7 @@ class YoutubeDL(object):
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
no_warnings: Do not print out anything for warnings.
forceurl: Force printing final URL.
forcetitle: Force printing title.
forceid: Force printing ID.
@ -376,6 +377,8 @@ class YoutubeDL(object):
if self.params.get('logger') is not None:
self.params['logger'].warning(message)
else:
if self.params.get('no_warnings'):
return
if self._err_file.isatty() and os.name != 'nt':
_msg_header = '\033[0;33mWARNING:\033[0m'
else:

View File

@ -51,6 +51,7 @@ __authors__ = (
'David Wagner',
'Juan C. Olivares',
'Mattias Harrysson',
'phaer',
)
__license__ = 'Public Domain'
@ -227,6 +228,9 @@ def parseOpts(overrideArguments=None):
general.add_option('--referer',
dest='referer', help='specify a custom referer, use if the video access is restricted to one domain',
metavar='REF', default=None)
general.add_option('--add-header',
dest='headers', help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', action="append",
metavar='FIELD:VALUE')
general.add_option('--list-extractors',
action='store_true', dest='list_extractors',
help='List all supported extractors and the URLs they would handle', default=False)
@ -361,6 +365,10 @@ def parseOpts(overrideArguments=None):
verbosity.add_option('-q', '--quiet',
action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option(
'--no-warnings',
dest='no_warnings', action='store_true', default=False,
help='Ignore warnings')
verbosity.add_option('-s', '--simulate',
action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
@ -388,7 +396,7 @@ def parseOpts(overrideArguments=None):
help='simulate, quiet but print output format', default=False)
verbosity.add_option('-j', '--dump-json',
action='store_true', dest='dumpjson',
help='simulate, quiet but print JSON information', default=False)
help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
verbosity.add_option('--newline',
action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
verbosity.add_option('--no-progress',
@ -556,6 +564,16 @@ def _real_main(argv=None):
if opts.referer is not None:
std_headers['Referer'] = opts.referer
# Custom HTTP headers
if opts.headers is not None:
for h in opts.headers:
if h.find(':', 1) < 0:
parser.error(u'wrong header formatting, it should be key:value, not "%s"'%h)
key, value = h.split(':', 2)
if opts.verbose:
write_string(u'[debug] Adding header from command line option %s:%s\n'%(key, value))
std_headers[key] = value
# Dump user agent
if opts.dump_user_agent:
compat_print(std_headers['User-Agent'])
@ -695,6 +713,7 @@ def _real_main(argv=None):
'password': opts.password,
'videopassword': opts.videopassword,
'quiet': (opts.quiet or any_printing),
'no_warnings': opts.no_warnings,
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,

View File

@ -13,8 +13,10 @@ class HlsFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
'-bsf:a', 'aac_adtstoasc', tmpfilename]
args = [
'-y', '-i', url, '-f', 'mp4', '-c', 'copy',
'-bsf:a', 'aac_adtstoasc',
encodeFilename(tmpfilename, for_subprocess=True)]
for program in ['avconv', 'ffmpeg']:
try:

View File

@ -23,6 +23,8 @@ class HttpFD(FileDownloader):
headers = {'Youtubedl-no-compression': 'True'}
if 'user_agent' in info_dict:
headers['Youtubedl-user-agent'] = info_dict['user_agent']
if 'http_referer' in info_dict:
headers['Referer'] = info_dict['http_referer']
basic_request = compat_urllib_request.Request(url, None, headers)
request = compat_urllib_request.Request(url, None, headers)

View File

@ -14,6 +14,7 @@ from .arte import (
ArteTVConcertIE,
ArteTVFutureIE,
ArteTVDDCIE,
ArteTVEmbedIE,
)
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
@ -25,6 +26,7 @@ from .bloomberg import BloombergIE
from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
from .byutv import BYUtvIE
from .c56 import C56IE
from .canal13cl import Canal13clIE
from .canalplus import CanalplusIE
@ -175,6 +177,8 @@ from .normalboots import NormalbootsIE
from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
from .ntv import NTVIE
from .oe1 import OE1IE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .parliamentliveuk import ParliamentLiveUKIE
@ -206,7 +210,6 @@ from .rutv import RUTVIE
from .savefrom import SaveFromIE
from .servingsys import ServingSysIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
from .smotri import (
SmotriIE,
@ -255,13 +258,13 @@ from .udemy import (
UdemyCourseIE
)
from .unistra import UnistraIE
from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
@ -280,6 +283,7 @@ from .vine import VineIE
from .viki import VikiIE
from .vk import VKIE
from .vube import VubeIE
from .washingtonpost import WashingtonPostIE
from .wat import WatIE
from .wdr import WDRIE
from .weibo import WeiboIE

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -14,14 +16,14 @@ from ..utils import (
class AddAnimeIE(InfoExtractor):
_VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
IE_NAME = u'AddAnime'
_TEST = {
u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
u'file': u'24MR3YO5SAS9.mp4',
u'md5': u'72954ea10bc979ab5e2eb288b21425a0',
u'info_dict': {
u"description": u"One Piece 606",
u"title": u"One Piece 606"
'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
'md5': '72954ea10bc979ab5e2eb288b21425a0',
'info_dict': {
'id': '24MR3YO5SAS9',
'ext': 'mp4',
'description': 'One Piece 606',
'title': 'One Piece 606',
}
}
@ -38,10 +40,10 @@ class AddAnimeIE(InfoExtractor):
redir_webpage = ee.cause.read().decode('utf-8')
action = self._search_regex(
r'<form id="challenge-form" action="([^"]+)"',
redir_webpage, u'Redirect form')
redir_webpage, 'Redirect form')
vc = self._search_regex(
r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
redir_webpage, u'redirect vc value')
redir_webpage, 'redirect vc value')
av = re.search(
r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
redir_webpage)
@ -52,19 +54,19 @@ class AddAnimeIE(InfoExtractor):
parsed_url = compat_urllib_parse_urlparse(url)
av_val = av_res + len(parsed_url.netloc)
confirm_url = (
parsed_url.scheme + u'://' + parsed_url.netloc +
parsed_url.scheme + '://' + parsed_url.netloc +
action + '?' +
compat_urllib_parse.urlencode({
'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
self._download_webpage(
confirm_url, video_id,
note=u'Confirming after redirect')
note='Confirming after redirect')
webpage = self._download_webpage(url, video_id)
formats = []
for format_id in ('normal', 'hq'):
rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
video_url = self._search_regex(rex, webpage, u'video file URLx',
video_url = self._search_regex(rex, webpage, 'video file URLx',
fatal=False)
if not video_url:
continue
@ -72,14 +74,13 @@ class AddAnimeIE(InfoExtractor):
'format_id': format_id,
'url': video_url,
})
if not formats:
raise ExtractorError(u'Cannot find any video format!')
self._sort_formats(formats)
video_title = self._og_search_title(webpage)
video_description = self._og_search_description(webpage)
return {
'_type': 'video',
'id': video_id,
'id': video_id,
'formats': formats,
'title': video_title,
'description': video_description

View File

@ -16,9 +16,10 @@ class AppleTrailersIE(InfoExtractor):
"url": "http://trailers.apple.com/trailers/wb/manofsteel/",
"playlist": [
{
"file": "manofsteel-trailer4.mov",
"md5": "d97a8e575432dbcb81b7c3acb741f8a8",
"info_dict": {
"id": "manofsteel-trailer4",
"ext": "mov",
"duration": 111,
"title": "Trailer 4",
"upload_date": "20130523",
@ -26,9 +27,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
"file": "manofsteel-trailer3.mov",
"md5": "b8017b7131b721fb4e8d6f49e1df908c",
"info_dict": {
"id": "manofsteel-trailer3",
"ext": "mov",
"duration": 182,
"title": "Trailer 3",
"upload_date": "20130417",
@ -36,9 +38,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
"file": "manofsteel-trailer.mov",
"md5": "d0f1e1150989b9924679b441f3404d48",
"info_dict": {
"id": "manofsteel-trailer",
"ext": "mov",
"duration": 148,
"title": "Trailer",
"upload_date": "20121212",
@ -46,15 +49,16 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
"file": "manofsteel-teaser.mov",
"md5": "5fe08795b943eb2e757fa95cb6def1cb",
"info_dict": {
"id": "manofsteel-teaser",
"ext": "mov",
"duration": 93,
"title": "Teaser",
"upload_date": "20120721",
"uploader_id": "wb",
},
}
},
]
}
@ -65,16 +69,16 @@ class AppleTrailersIE(InfoExtractor):
movie = mobj.group('movie')
uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s):
s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed
# like: http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m):
return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
s = re.sub(self._JSON_RE, _clean_json, s)
s = u'<html>' + s + u'</html>'
s = '<html>' + s + u'</html>'
return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
@ -82,7 +86,7 @@ class AppleTrailersIE(InfoExtractor):
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
trailer_info_json = self._search_regex(self._JSON_RE,
on_click, u'trailer info')
on_click, 'trailer info')
trailer_info = json.loads(trailer_info_json)
title = trailer_info['title']
video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@ -98,8 +102,7 @@ class AppleTrailersIE(InfoExtractor):
first_url = trailer_info['url']
trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
settings = json.loads(settings_json)
settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
formats = []
for format in settings['metadata']['sizes']:
@ -107,7 +110,6 @@ class AppleTrailersIE(InfoExtractor):
format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
formats.append({
'url': format_url,
'ext': determine_ext(format_url),
'format': format['type'],
'width': format['width'],
'height': int(format['height']),

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import (
@ -19,114 +18,41 @@ from ..utils import (
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTvIE(InfoExtractor):
_VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
_LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
_LIVE_URL = r'index-[0-9]+\.html$'
class ArteTvIE(InfoExtractor):
_VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
IE_NAME = 'arte.tv'
@classmethod
def suitable(cls, url):
return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
# TODO implement Live Stream
# from ..utils import compat_urllib_parse
# def extractLiveStream(self, url):
# video_lang = url.split('/')[-4]
# info = self.grep_webpage(
# url,
# r'src="(.*?/videothek_js.*?\.js)',
# 0,
# [
# (1, 'url', 'Invalid URL: %s' % url)
# ]
# )
# http_host = url.split('/')[2]
# next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
# info = self.grep_webpage(
# next_url,
# r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
# '(http://.*?\.swf).*?' +
# '(rtmp://.*?)\'',
# re.DOTALL,
# [
# (1, 'path', 'could not extract video path: %s' % url),
# (2, 'player', 'could not extract video player: %s' % url),
# (3, 'url', 'could not extract video url: %s' % url)
# ]
# )
# video_url = '%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
mobj = re.match(self._VIDEOS_URL, url)
if mobj is not None:
id = mobj.group('id')
lang = mobj.group('lang')
return self._extract_video(url, id, lang)
mobj = re.match(self._VALID_URL, url)
lang = mobj.group('lang')
video_id = mobj.group('id')
mobj = re.match(self._LIVEWEB_URL, url)
if mobj is not None:
name = mobj.group('name')
lang = mobj.group('lang')
return self._extract_liveweb(url, name, lang)
if re.search(self._LIVE_URL, url) is not None:
raise ExtractorError('Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
raise ExtractorError('No video found')
def _extract_video(self, url, video_id, lang):
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml_doc = self._download_xml(
ref_xml_url, video_id, note='Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(
config = self._download_xml(
config_xml_url, video_id, note='Downloading configuration')
video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
def _key(m):
quality = m.group('quality')
if quality == 'hd':
return 2
else:
return 1
# We pick the best quality
video_urls = sorted(video_urls, key=_key)
video_url = list(video_urls)[-1].group('url')
title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
config_xml, 'thumbnail')
return {'id': video_id,
'title': title,
'thumbnail': thumbnail,
'url': video_url,
'ext': 'flv',
}
formats = [{
'forma_id': q.attrib['quality'],
'url': q.text,
'ext': 'flv',
'quality': 2 if q.attrib['quality'] == 'hd' else 1,
} for q in config.findall('./urls/url')]
self._sort_formats(formats)
def _extract_liveweb(self, url, name, lang):
"""Extract form http://liveweb.arte.tv/"""
webpage = self._download_webpage(url, name)
video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id')
config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
video_id, 'Downloading information')
event_doc = config_doc.find('event')
url_node = event_doc.find('video').find('urlHd')
if url_node is None:
url_node = event_doc.find('urlSd')
return {'id': video_id,
'title': event_doc.find('name%s' % lang.capitalize()).text,
'url': url_node.text.replace('MP4', 'mp4'),
'ext': 'flv',
'thumbnail': self._og_search_thumbnail(webpage),
}
title = config.find('.//name').text
thumbnail = config.find('.//firstThumbnailUrl').text
return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
}
class ArteTVPlus7IE(InfoExtractor):
@ -152,9 +78,7 @@ class ArteTVPlus7IE(InfoExtractor):
return self._extract_from_json_url(json_url, video_id, lang)
def _extract_from_json_url(self, json_url, video_id, lang):
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
self.report_extraction(video_id)
info = json.loads(json_info)
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
info_dict = {
@ -176,6 +100,8 @@ class ArteTVPlus7IE(InfoExtractor):
l = 'F'
elif lang == 'de':
l = 'A'
else:
l = lang
regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
@ -305,3 +231,22 @@ class ArteTVConcertIE(ArteTVPlus7IE):
'description': 'md5:486eb08f991552ade77439fe6d82c305',
},
}
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
http://www\.arte\.tv
/playerv2/embed\.php\?json_url=
(?P<json_url>
http://arte\.tv/papi/tvguide/videos/stream/player/
(?P<lang>[^/]+)/(?P<id>[^/]+)[^&]*
)
'''
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang')
json_url = mobj.group('json_url')
return self._extract_from_json_url(json_url, video_id, lang)

View File

@ -11,22 +11,24 @@ from ..utils import (
class AUEngineIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?auengine\.com/embed\.php\?.*?file=(?P<id>[^&]+).*?'
_TEST = {
'url': 'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
'file': 'lfvlytY6.mp4',
'md5': '48972bdbcf1a3a2f5533e62425b41d4f',
'info_dict': {
'id': 'lfvlytY6',
'ext': 'mp4',
'title': '[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]'
}
}
_VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
webpage, 'title')
title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title')
title = title.strip()
links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
links = map(compat_urllib_parse.unquote, links)
@ -39,14 +41,15 @@ class AUEngineIE(InfoExtractor):
elif '/videos/' in link:
video_url = link
if not video_url:
raise ExtractorError(u'Could not find video URL')
raise ExtractorError('Could not find video URL')
ext = '.' + determine_ext(video_url)
if ext == title[-len(ext):]:
title = title[:-len(ext)]
return {
'id': video_id,
'url': video_url,
'title': title,
'id': video_id,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
'http_referer': 'http://www.auengine.com/flowplayer/flowplayer.commercial-3.2.14.swf',
}

View File

@ -1,22 +1,21 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .ooyala import OoyalaIE
class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
_TEST = {
u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4',
u'info_dict': {
u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies',
u'description': u'md5:abc86e5236f9f0e4866c59ad36736686',
},
u'params': {
# Requires ffmpeg (m3u8 manifest)
u'skip_download': True,
'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
'md5': '7bf08858ff7c203c870e8a6190e221e5',
'info_dict': {
'id': 'qurhIVlJSB6hzkVi229d8g',
'ext': 'flv',
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
},
}
@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
webpage = self._download_webpage(url, name)
embed_code = self._search_regex(
r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
'embed code')
return OoyalaIE._build_url_result(embed_code)
f4m_url = self._search_regex(
r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
'f4m url')
title = re.sub(': Video$', '', self._og_search_title(webpage))
return {
'id': name.split('-')[-1],
'title': title,
'url': f4m_url,
'ext': 'flv',
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}

View File

@ -0,0 +1,50 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
)
class BYUtvIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
_TEST = {
'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
'info_dict': {
'id': 'granite-flats-talking',
'ext': 'mp4',
'description': 'md5:1a7ae3e153359b7cc355ef3963441e5f',
'title': 'Talking',
'thumbnail': 're:^https?://.*promo.*'
},
'params': {
'skip_download': True,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
webpage = self._download_webpage(url, video_id)
episode_code = self._search_regex(
r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information')
episode_json = re.sub(
r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code)
ep = json.loads(episode_json)
if ep['providerType'] == 'Ooyala':
return {
'_type': 'url_transparent',
'ie_key': 'Ooyala',
'url': 'ooyala:%s' % ep['providerId'],
'id': video_id,
'title': ep['title'],
'description': ep.get('description'),
'thumbnail': ep.get('imageThumbnail'),
}
else:
raise ExtractorError('Unsupported provider %s' % ep['provider'])

View File

@ -28,7 +28,7 @@ class CanalplusIE(InfoExtractor):
video_id = mobj.groupdict().get('id')
if video_id is None:
webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id
doc = self._download_xml(info_url,video_id,
u'Downloading video info')

View File

@ -9,12 +9,12 @@ from ..utils import (
class CinemassacreIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?'
_VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
_TESTS = [
{
'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
'file': '19911.mp4',
'md5': 'fde81fbafaee331785f58cd6c0d46190',
'md5': '782f8504ca95a0eba8fc9177c373eec7',
'info_dict': {
'upload_date': '20121110',
'title': '“Angry Video Game Nerd: The Movie” Trailer',
@ -24,7 +24,7 @@ class CinemassacreIE(InfoExtractor):
{
'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
'file': '521be8ef82b16.mp4',
'md5': 'd72f10cd39eac4215048f62ab477a511',
'md5': 'dec39ee5118f8d9cc067f45f9cbe3a35',
'info_dict': {
'upload_date': '20131002',
'title': 'The Mummys Hand (1940)',
@ -34,8 +34,9 @@ class CinemassacreIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, None) # Don't know video id yet
webpage = self._download_webpage(url, display_id)
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
@ -43,33 +44,36 @@ class CinemassacreIE(InfoExtractor):
playerdata_url = mobj.group('embed_url')
video_id = mobj.group('video_id')
video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
webpage, 'title')
video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
video_title = self._html_search_regex(
r'<title>(?P<title>.+?)\|', webpage, 'title')
video_description = self._html_search_regex(
r'<div class="entry-content">(?P<description>.+?)</div>',
webpage, 'description', flags=re.DOTALL, fatal=False)
if len(video_description) == 0:
video_description = None
playerdata = self._download_webpage(playerdata_url, video_id)
sd_url = self._html_search_regex(r'file: \'(?P<sd_file>[^\']+)\', label: \'SD\'', playerdata, 'sd_file')
hd_url = self._html_search_regex(r'file: \'(?P<hd_file>[^\']+)\', label: \'HD\'', playerdata, 'hd_file')
sd_url = self._html_search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
hd_url = self._html_search_regex(
r'file: \'([^\']+)\', label: \'HD\'', playerdata, 'hd_file',
default=None)
video_thumbnail = self._html_search_regex(r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
formats = [
{
'url': sd_url,
'ext': 'mp4',
'format': 'sd',
'format_id': 'sd',
},
{
formats = [{
'url': sd_url,
'ext': 'mp4',
'format': 'sd',
'format_id': 'sd',
'quality': 1,
}]
if hd_url:
formats.append({
'url': hd_url,
'ext': 'mp4',
'format': 'hd',
'format_id': 'hd',
},
]
'quality': 2,
})
self._sort_formats(formats)
return {
'id': video_id,

View File

@ -1,22 +1,28 @@
from __future__ import unicode_literals
import re
import time
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import ExtractorError
from ..utils import (
ExtractorError,
parse_duration,
)
class ClipfishIE(InfoExtractor):
IE_NAME = u'clipfish'
IE_NAME = 'clipfish'
_VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
_TEST = {
u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
u'file': u'3966754.mp4',
u'md5': u'2521cd644e862936cf2e698206e47385',
u'info_dict': {
u'title': u'FIFA 14 - E3 2013 Trailer',
u'duration': 82,
'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
'md5': '2521cd644e862936cf2e698206e47385',
'info_dict': {
'id': '3966754',
'ext': 'mp4',
'title': 'FIFA 14 - E3 2013 Trailer',
'duration': 82,
},
u'skip': 'Blocked in the US'
}
@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor):
video_url = doc.find('filename').text
if video_url is None:
xml_bytes = xml.etree.ElementTree.tostring(doc)
raise ExtractorError(u'Cannot find video URL in document %r' %
raise ExtractorError('Cannot find video URL in document %r' %
xml_bytes)
thumbnail = doc.find('imageurl').text
duration_str = doc.find('duration').text
m = re.match(
r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
duration_str)
if m:
duration = (
(int(m.group('hours')) * 60 * 60) +
(int(m.group('minutes')) * 60) +
(int(m.group('seconds')))
)
else:
duration = None
duration = parse_duration(doc.find('duration').text)
return {
'id': video_id,

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor):
_VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
_TEST = {
u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
u'info_dict': {
u'id': u'4629301',
u'ext': u'mp4',
u'title': u'Brick Briscoe',
u'duration': 612,
'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
'md5': '4d7d549451bad625e0ff3d7bd56d776c',
'info_dict': {
'id': '4629301',
'ext': 'mp4',
'title': 'Brick Briscoe',
'duration': 612,
'thumbnail': 're:^https?://.+\.jpg',
},
}
@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor):
video_id = mobj.group('id')
js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
video_id, u'Downlaoding player')
video_id, 'Downlaoding player')
# it includes a required token
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars')
pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info',
video_id, 'Downloading video info',
transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track')

View File

@ -7,8 +7,8 @@ from .mtv import MTVServicesInfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
ExtractorError,
float_or_none,
unified_strdate,
)
@ -32,31 +32,32 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
class ComedyCentralShowsIE(InfoExtractor):
IE_DESC = 'The Daily Show / Colbert Report'
IE_DESC = 'The Daily Show / The Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
# or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
# or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
# or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
_VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
|(https?://)?(www\.)?
(?P<showname>thedailyshow|colbertnation)\.com/
_VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
|https?://(:www\.)?
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
(full-episodes/(?P<episode>.*)|
(?P<clip>
(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
(?P<interview>
extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
$"""
extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
$'''
_TEST = {
'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
'file': '422212.mp4',
'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
'info_dict': {
"upload_date": "20121214",
"description": "Kristen Stewart",
"uploader": "thedailyshow",
"title": "thedailyshow-kristen-stewart part 1"
'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55',
'ext': 'mp4',
'upload_date': '20121213',
'description': 'Kristen Stewart learns to let loose in "On the Road."',
'uploader': 'thedailyshow',
'title': 'thedailyshow-kristen-stewart part 1',
}
}
@ -79,11 +80,6 @@ class ComedyCentralShowsIE(InfoExtractor):
'400': (384, 216),
}
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
@staticmethod
def _transform_rtmp_url(rtmp_video_url):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
@ -99,9 +95,9 @@ class ComedyCentralShowsIE(InfoExtractor):
if mobj.group('shortname'):
if mobj.group('shortname') in ('tds', 'thedailyshow'):
url = 'http://www.thedailyshow.com/full-episodes/'
url = 'http://thedailyshow.cc.com/full-episodes/'
else:
url = 'http://www.colbertnation.com/full-episodes/'
url = 'http://thecolbertreport.cc.com/full-episodes/'
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
assert mobj is not None
@ -120,9 +116,9 @@ class ComedyCentralShowsIE(InfoExtractor):
epTitle = mobj.group('showname')
else:
epTitle = mobj.group('episode')
show_name = mobj.group('showname')
self.report_extraction(epTitle)
webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
if dlNewest:
url = htmlHandle.geturl()
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@ -130,71 +126,86 @@ class ComedyCentralShowsIE(InfoExtractor):
raise ExtractorError('Invalid redirected URL: ' + url)
if mobj.group('episode') == '':
raise ExtractorError('Redirected URL is still not specific: ' + url)
epTitle = mobj.group('episode')
epTitle = mobj.group('episode').rpartition('/')[-1]
mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
if len(mMovieParams) == 0:
# The Colbert Report embeds the information in a without
# a URL prefix; so extract the alternate reference
# and then add the URL prefix manually.
altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage)
if len(altMovieParams) == 0:
raise ExtractorError('unable to find Flash URL in webpage ' + url)
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
idoc = self._download_xml(indexUrl, epTitle,
'Downloading show index',
'unable to download episode index')
# Correct cc.com in uri
uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
results = []
index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
idoc = self._download_xml(
index_url, epTitle,
'Downloading show index', 'Unable to download episode index')
itemEls = idoc.findall('.//item')
for partNum,itemEl in enumerate(itemEls):
mediaId = itemEl.findall('./guid')[0].text
shortMediaId = mediaId.split(':')[-1]
showId = mediaId.split(':')[-2].replace('.com', '')
officialTitle = itemEl.findall('./title')[0].text
officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
title = idoc.find('./channel/title').text
description = idoc.find('./channel/description').text
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
cdoc = self._download_xml(configUrl, epTitle,
'Downloading configuration for %s' % shortMediaId)
entries = []
item_els = idoc.findall('.//item')
for part_num, itemEl in enumerate(item_els):
upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text)
thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url')
content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
duration = float_or_none(content.attrib.get('duration'))
mediagen_url = content.attrib['url']
guid = itemEl.find('.//guid').text.rpartition(':')[-1]
cdoc = self._download_xml(
mediagen_url, epTitle,
'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els)))
turls = []
for rendition in cdoc.findall('.//rendition'):
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
turls.append(finfo)
if len(turls) == 0:
self._downloader.report_error('unable to download ' + mediaId + ': No videos found')
continue
formats = []
for format, rtmp_video_url in turls:
w, h = self._video_dimensions.get(format, (None, None))
formats.append({
'format_id': 'vhttp-%s' % format,
'url': self._transform_rtmp_url(rtmp_video_url),
'ext': self._video_extensions.get(format, 'mp4'),
'format_id': format,
'height': h,
'width': w,
})
formats.append({
'format_id': 'rtmp-%s' % format,
'url': rtmp_video_url,
'ext': self._video_extensions.get(format, 'mp4'),
'height': h,
'width': w,
})
self._sort_formats(formats)
effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1)
results.append({
'id': shortMediaId,
virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
entries.append({
'id': guid,
'title': virtual_id,
'formats': formats,
'uploader': showId,
'upload_date': officialDate,
'title': effTitle,
'thumbnail': None,
'description': compat_str(officialTitle),
'uploader': show_name,
'upload_date': upload_date,
'duration': duration,
'thumbnail': thumbnail,
'description': description,
})
return results
return {
'_type': 'playlist',
'entries': entries,
'title': show_name + ' ' + title,
'description': description,
}

View File

@ -1,23 +1,25 @@
from __future__ import unicode_literals
import re
from ..utils import (
compat_urllib_parse,
determine_ext
)
from .common import InfoExtractor
class EHowIE(InfoExtractor):
IE_NAME = u'eHow'
_VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
IE_NAME = 'eHow'
_VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
_TEST = {
u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
u'file': u'12245069.flv',
u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
u'info_dict': {
u"title": u"Hardwood Flooring Basics",
u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
u"uploader": u"Erick Nathan"
'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
'md5': '9809b4e3f115ae2088440bcb4efbf371',
'info_dict': {
'id': '12245069',
'ext': 'flv',
'title': 'Hardwood Flooring Basics',
'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...',
'uploader': 'Erick Nathan',
}
}
@ -26,21 +28,16 @@ class EHowIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
webpage, u'video URL')
final_url = compat_urllib_parse.unquote(video_url)
uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
webpage, u'uploader')
webpage, 'video URL')
final_url = compat_urllib_parse.unquote(video_url)
uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
ext = determine_ext(final_url)
return {
'_type': 'video',
'id': video_id,
'url': final_url,
'ext': ext,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'id': video_id,
'url': final_url,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
'uploader': uploader,
'uploader': uploader,
}

View File

@ -25,6 +25,7 @@ from ..utils import (
from .brightcove import BrightcoveIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
from .smotri import SmotriIE
class GenericIE(InfoExtractor):
@ -197,6 +198,36 @@ class GenericIE(InfoExtractor):
'description': 'No description',
},
},
# arte embed
{
'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
'md5': '7653032cbb25bf6c80d80f217055fa43',
'info_dict': {
'id': '048195-004_PLUS7-F',
'ext': 'flv',
'title': 'X:enius',
'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
'upload_date': '20140320',
},
'params': {
'skip_download': 'Requires rtmpdump'
}
},
# smotri embed
{
'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
'md5': 'ec40048448e9284c9a1de77bb188108b',
'info_dict': {
'id': 'v27008541fad',
'ext': 'mp4',
'title': 'Крым и Севастополь вошли в состав России',
'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
'duration': 900,
'upload_date': '20140318',
'uploader': 'rbctv_2012_4',
'uploader_id': 'rbctv_2012_4',
},
},
]
def report_download_webpage(self, video_id):
@ -525,6 +556,18 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
# Look for embedded arte.tv player
mobj = re.search(
r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
# Look for embedded smotri.com player
smotri_url = SmotriIE._extract_url(webpage)
if smotri_url:
return self.url_result(smotri_url, 'Smotri')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:

View File

@ -14,7 +14,7 @@ from ..utils import (
class MooshareIE(InfoExtractor):
IE_NAME = 'mooshare'
IE_DESC = 'Mooshare.biz'
_VALID_URL = r'http://mooshare\.biz/(?P<id>[\da-z]{12})'
_VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})'
_TESTS = [
{

View File

@ -1,12 +1,10 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
import socket
from .common import InfoExtractor
from ..utils import (
compat_http_client,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
@ -18,57 +16,54 @@ from ..utils import (
class NiconicoIE(InfoExtractor):
IE_NAME = u'niconico'
IE_DESC = u'ニコニコ動画'
IE_NAME = 'niconico'
IE_DESC = 'ニコニコ動画'
_TEST = {
u'url': u'http://www.nicovideo.jp/watch/sm22312215',
u'file': u'sm22312215.mp4',
u'md5': u'd1a75c0823e2f629128c43e1212760f9',
u'info_dict': {
u'title': u'Big Buck Bunny',
u'uploader': u'takuya0301',
u'uploader_id': u'2698420',
u'upload_date': u'20131123',
u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
'url': 'http://www.nicovideo.jp/watch/sm22312215',
'md5': 'd1a75c0823e2f629128c43e1212760f9',
'info_dict': {
'id': 'sm22312215',
'ext': 'mp4',
'title': 'Big Buck Bunny',
'uploader': 'takuya0301',
'uploader_id': '2698420',
'upload_date': '20131123',
'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
},
u'params': {
u'username': u'ydl.niconico@gmail.com',
u'password': u'youtube-dl',
'params': {
'username': 'ydl.niconico@gmail.com',
'password': 'youtube-dl',
},
}
_VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
_NETRC_MACHINE = 'niconico'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = True
def _real_initialize(self):
self._login()
def _login(self):
(username, password) = self._get_login_info()
# No authentication to be performed
if username is None:
if self._LOGIN_REQUIRED:
raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return False
# Login is required
raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Log in
login_form_strs = {
u'mail': username,
u'password': password,
'mail': username,
'password': password,
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
request = compat_urllib_request.Request(
u'https://secure.nicovideo.jp/secure/login', login_data)
'https://secure.nicovideo.jp/secure/login', login_data)
login_results = self._download_webpage(
request, u'', note=u'Logging in', errnote=u'Unable to log in')
request, None, note='Logging in', errnote='Unable to log in')
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
self._downloader.report_warning(u'unable to log in: bad username or password')
self._downloader.report_warning('unable to log in: bad username or password')
return False
return True
@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor):
video_info = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
note=u'Downloading video info page')
note='Downloading video info page')
# Get flv info
flv_info_webpage = self._download_webpage(
u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
video_id, u'Downloading flv info')
'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
video_id, 'Downloading flv info')
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
@ -106,22 +101,22 @@ class NiconicoIE(InfoExtractor):
url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
try:
user_info = self._download_xml(
url, video_id, note=u'Downloading user information')
url, video_id, note='Downloading user information')
video_uploader = user_info.find('.//nickname').text
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
except ExtractorError as err:
self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err))
return {
'id': video_id,
'url': video_real_url,
'title': video_title,
'ext': video_extension,
'format': video_format,
'thumbnail': video_thumbnail,
'id': video_id,
'url': video_real_url,
'title': video_title,
'ext': video_extension,
'format': video_format,
'thumbnail': video_thumbnail,
'description': video_description,
'uploader': video_uploader,
'uploader': video_uploader,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'view_count': video_view_count,
'view_count': video_view_count,
'webpage_url': video_webpage_url,
}

157
youtube_dl/extractor/ntv.py Normal file
View File

@ -0,0 +1,157 @@
# encoding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML
)
class NTVIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
_TESTS = [
{
'url': 'http://www.ntv.ru/novosti/863142/',
'info_dict': {
'id': '746000',
'ext': 'flv',
'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
'duration': 136,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/video/novosti/750370/',
'info_dict': {
'id': '750370',
'ext': 'flv',
'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
'duration': 172,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
'info_dict': {
'id': '747480',
'ext': 'flv',
'title': '«Сегодня». 21 марта 2014 года. 16:00 ',
'description': '«Сегодня». 21 марта 2014 года. 16:00 ',
'duration': 1496,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/kino/Koma_film',
'info_dict': {
'id': '750783',
'ext': 'flv',
'title': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ',
'description': 'Остросюжетный фильм «Кома» — 4 апреля вечером на НТВ',
'duration': 28,
},
'params': {
# rtmp download
'skip_download': True,
},
},
{
'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
'info_dict': {
'id': '751482',
'ext': 'flv',
'title': '«Дело врачей»: «Деревце жизни»',
'description': '«Дело врачей»: «Деревце жизни»',
'duration': 2590,
},
'params': {
# rtmp download
'skip_download': True,
},
},
]
_VIDEO_ID_REGEXES = [
r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
r'<video embed=[^>]+><id>(\d+)</id>',
r'<video restriction[^>]+><key>(\d+)</key>'
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
page = self._download_webpage(url, video_id, 'Downloading page')
for pattern in self._VIDEO_ID_REGEXES:
mobj = re.search(pattern, page)
if mobj:
break
if not mobj:
raise ExtractorError('No media links available for %s' % video_id)
video_id = mobj.group(1)
player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML')
title = unescapeHTML(player.find('./data/title').text)
description = unescapeHTML(player.find('./data/description').text)
video = player.find('./data/video')
video_id = video.find('./id').text
thumbnail = video.find('./splash').text
duration = int(video.find('./totaltime').text)
view_count = int(video.find('./views').text)
puid22 = video.find('./puid22').text
apps = {
'4': 'video1',
'7': 'video2',
}
app = apps[puid22] if puid22 in apps else apps['4']
formats = []
for format_id in ['', 'hi', 'webm']:
file = video.find('./%sfile' % format_id)
if file is None:
continue
size = video.find('./%ssize' % format_id)
formats.append({
'url': 'rtmp://media.ntv.ru/%s' % app,
'app': app,
'play_path': file.text,
'rtmp_conn': 'B:1',
'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
'page_url': 'http://www.ntv.ru',
'flash_ver': 'LNX 11,2,202,341',
'rtmp_live': True,
'ext': 'flv',
'filesize': int(size.text),
})
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View File

@ -0,0 +1,40 @@
# coding: utf-8
from __future__ import unicode_literals
import calendar
import datetime
import re
from .common import InfoExtractor
# audios on oe1.orf.at are only available for 7 days, so we can't
# add tests.
class OE1IE(InfoExtractor):
IE_DESC = 'oe1.orf.at'
_VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
show_id = mobj.group('id')
data = self._download_json(
'http://oe1.orf.at/programm/%s/konsole' % show_id,
show_id
)
timestamp = datetime.datetime.strptime('%s %s' % (
data['item']['day_label'],
data['item']['time']
), '%d.%m.%Y %H:%M')
unix_timestamp = calendar.timegm(timestamp.utctimetuple())
return {
'id': show_id,
'title': data['item']['title'],
'url': data['item']['url_stream'],
'ext': 'mp3',
'description': data['item'].get('info'),
'timestamp': unix_timestamp
}

View File

@ -7,7 +7,7 @@ from ..utils import unescapeHTML
class OoyalaIE(InfoExtractor):
_VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'
_VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
_TEST = {
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video

View File

@ -28,6 +28,7 @@ class RTSIE(InfoExtractor):
'uploader': 'Divers',
'upload_date': '19680921',
'timestamp': -40280400,
'thumbnail': 're:^https?://.*\.image'
},
}
@ -58,4 +59,5 @@ class RTSIE(InfoExtractor):
'duration': duration,
'uploader': info.get('programName'),
'timestamp': upload_timestamp,
'thumbnail': thumbnail,
}

View File

@ -1,24 +0,0 @@
import re
from .common import InfoExtractor
class SlashdotIE(InfoExtractor):
_VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
_TEST = {
u'add_ie': ['Ooyala'],
u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',
u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',
u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735',
u'info_dict': {
u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
ooyala_url = self._search_regex(r'<script src="(.*?)"', webpage, 'ooyala url')
return self.url_result(ooyala_url, 'Ooyala')

View File

@ -13,22 +13,24 @@ from ..utils import (
compat_urllib_request,
ExtractorError,
url_basename,
int_or_none,
)
class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com'
IE_NAME = 'smotri'
_VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
_VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
_NETRC_MACHINE = 'smotri'
_TESTS = [
# real video id 2610366
{
'url': 'http://smotri.com/video/view/?id=v261036632ab',
'file': 'v261036632ab.mp4',
'md5': '2a7b08249e6f5636557579c368040eb9',
'info_dict': {
'id': 'v261036632ab',
'ext': 'mp4',
'title': 'катастрофа с камер видеонаблюдения',
'uploader': 'rbc2008',
'uploader_id': 'rbc08',
@ -40,9 +42,10 @@ class SmotriIE(InfoExtractor):
# real video id 57591
{
'url': 'http://smotri.com/video/view/?id=v57591cb20',
'file': 'v57591cb20.flv',
'md5': '830266dfc21f077eac5afd1883091bcd',
'info_dict': {
'id': 'v57591cb20',
'ext': 'flv',
'title': 'test',
'uploader': 'Support Photofile@photofile',
'uploader_id': 'support-photofile',
@ -54,9 +57,10 @@ class SmotriIE(InfoExtractor):
# video-password
{
'url': 'http://smotri.com/video/view/?id=v1390466a13c',
'file': 'v1390466a13c.mp4',
'md5': 'f6331cef33cad65a0815ee482a54440b',
'info_dict': {
'id': 'v1390466a13c',
'ext': 'mp4',
'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
'uploader': 'timoxa40',
'uploader_id': 'timoxa40',
@ -71,9 +75,10 @@ class SmotriIE(InfoExtractor):
# age limit + video-password
{
'url': 'http://smotri.com/video/view/?id=v15408898bcf',
'file': 'v15408898bcf.flv',
'md5': '91e909c9f0521adf5ee86fbe073aad70',
'info_dict': {
'id': 'v15408898bcf',
'ext': 'flv',
'title': 'этот ролик не покажут по ТВ',
'uploader': 'zzxxx',
'uploader_id': 'ueggb',
@ -85,7 +90,22 @@ class SmotriIE(InfoExtractor):
'params': {
'videopassword': '333'
}
}
},
# swf player
{
'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
'md5': '4d47034979d9390d14acdf59c4935bc2',
'info_dict': {
'id': 'v9188090500',
'ext': 'mp4',
'title': 'Shakira - Don\'t Bother',
'uploader': 'HannahL',
'uploader_id': 'lisaha95',
'upload_date': '20090331',
'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother',
'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
},
},
]
_SUCCESS = 0
@ -93,6 +113,21 @@ class SmotriIE(InfoExtractor):
_PASSWORD_DETECTED = 2
_VIDEO_NOT_FOUND = 3
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
webpage)
if mobj is not None:
return mobj.group('url')
mobj = re.search(
r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
<div\s+class="video_image">[^<]+</div>\s*
<div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
if mobj is not None:
return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
def _search_meta(self, name, html, display_name=None):
if display_name is None:
display_name = name
@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor):
# Video JSON does not provide enough meta data
# We will extract some from the video web page instead
video_page_url = 'http://' + mobj.group('url')
video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id
video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')
# Warning if video is unavailable
@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor):
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'duration': video_duration,
'view_count': video_view_count,
'view_count': int_or_none(video_view_count),
'age_limit': 18 if adult_content else 0,
'video_page_url': video_page_url
}

View File

@ -18,12 +18,14 @@ class TEDIE(SubtitlesInfoExtractor):
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
((?P<type_talk>talks)) # We have a simple talk
|
(?P<type_watch>watch)/[^/]+/[^/]+
)
(/lang/(.*?))? # The url may contain the language
/(?P<name>\w+) # Here goes the name and then ".html"
/(?P<name>[\w-]+) # Here goes the name and then ".html"
.*)$
'''
_TEST = {
_TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
@ -36,7 +38,17 @@ class TEDIE(SubtitlesInfoExtractor):
'actively fooling us.'),
'uploader': 'Dan Dennett',
}
}
}, {
'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
'md5': '226f4fb9c62380d11b7995efa4c87994',
'info_dict': {
'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
'ext': 'mp4',
'title': 'Vishal Sikka: The beauty and power of algorithms',
'thumbnail': 're:^https?://.+\.jpg',
'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
}
}]
_FORMATS_PREFERENCE = {
'low': 1,
@ -57,6 +69,8 @@ class TEDIE(SubtitlesInfoExtractor):
name = m.group('name')
if m.group('type_talk'):
return self._talk_info(url, name)
elif m.group('type_watch'):
return self._watch_info(url, name)
else:
return self._playlist_videos_info(url, name)
@ -123,3 +137,26 @@ class TEDIE(SubtitlesInfoExtractor):
else:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
def _watch_info(self, url, name):
webpage = self._download_webpage(url, name)
config_json = self._html_search_regex(
r"data-config='([^']+)", webpage, 'config')
config = json.loads(config_json)
video_url = config['video']['url']
thumbnail = config.get('image', {}).get('url')
title = self._html_search_regex(
r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
description = self._html_search_regex(
r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
webpage, 'description', fatal=False)
return {
'id': name,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
'description': description,
}

View File

@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
unified_strdate,
)
class UrortIE(InfoExtractor):
IE_DESC = 'NRK P3 Urørt'
_VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
_TEST = {
'url': 'https://urort.p3.no/#!/Band/Gerilja',
'md5': '5ed31a924be8a05e47812678a86e127b',
'info_dict': {
'id': '33124-4',
'ext': 'mp3',
'title': 'The Bomb',
'thumbnail': 're:^https?://.+\.jpg',
'like_count': int,
'uploader': 'Gerilja',
'uploader_id': 'Gerilja',
'upload_date': '20100323',
},
'params': {
'matchtitle': '^The Bomb$', # To test, we want just one video
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
songs = self._download_json(json_url, playlist_id)
print(songs[0])
entries = [{
'id': '%d-%s' % (s['BandId'], s['$id']),
'title': s['Title'],
'url': s['TrackUrl'],
'ext': 'mp3',
'uploader_id': playlist_id,
'uploader': s.get('BandName', playlist_id),
'like_count': s.get('LikeCount'),
'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
'upload_date': unified_strdate(s.get('Released')),
} for s in songs]
return {
'_type': 'playlist',
'id': playlist_id,
'title': playlist_id,
'entries': entries,
}

View File

@ -4,26 +4,99 @@ import re
import json
from .common import InfoExtractor
from ..utils import compat_urllib_request
from ..utils import (
compat_urllib_request,
int_or_none,
)
class VeohIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'
_VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
_TEST = {
'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
'file': '56314296.mp4',
'md5': '620e68e6a3cff80086df3348426c9ca3',
'info_dict': {
'title': 'Straight Backs Are Stronger',
'uploader': 'LUMOback',
'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
_TESTS = [
{
'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
'md5': '620e68e6a3cff80086df3348426c9ca3',
'info_dict': {
'id': '56314296',
'ext': 'mp4',
'title': 'Straight Backs Are Stronger',
'uploader': 'LUMOback',
'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
},
},
{
'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
'info_dict': {
'id': '27701988',
'ext': 'mp4',
'title': 'Chile workers cover up to avoid skin damage',
'description': 'md5:2bd151625a60a32822873efc246ba20d',
'uploader': 'afp-news',
'duration': 123,
},
},
{
'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
'md5': '4fde7b9e33577bab2f2f8f260e30e979',
'note': 'Embedded ooyala video',
'info_dict': {
'id': '69525809',
'ext': 'mp4',
'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
'uploader': 'newsy-videos',
},
},
]
def _extract_formats(self, source):
formats = []
link = source.get('aowPermalink')
if link:
formats.append({
'url': link,
'ext': 'mp4',
'format_id': 'aow',
})
link = source.get('fullPreviewHashLowPath')
if link:
formats.append({
'url': link,
'format_id': 'low',
})
link = source.get('fullPreviewHashHighPath')
if link:
formats.append({
'url': link,
'format_id': 'high',
})
return formats
def _extract_video(self, source):
return {
'id': source.get('videoId'),
'title': source.get('title'),
'description': source.get('description'),
'thumbnail': source.get('highResImage') or source.get('medResImage'),
'uploader': source.get('username'),
'duration': int_or_none(source.get('length')),
'view_count': int_or_none(source.get('views')),
'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
'formats': self._extract_formats(source),
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
if video_id.startswith('v'):
rsp = self._download_xml(
r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML')
if rsp.get('stat') == 'ok':
return self._extract_video(rsp.find('./videoList/video'))
webpage = self._download_webpage(url, video_id)
age_limit = 0
if 'class="adultwarning-container"' in webpage:
@ -33,24 +106,16 @@ class VeohIE(InfoExtractor):
request.add_header('Cookie', 'confirmedAdult=true')
webpage = self._download_webpage(request, video_id)
m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|"|\?)', webpage)
if m_youtube is not None:
youtube_id = m_youtube.group(1)
self.to_screen('%s: detected Youtube video.' % video_id)
return self.url_result(youtube_id, 'Youtube')
self.report_extraction(video_id)
info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
info = json.loads(info)
video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
info = json.loads(
self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info').replace('\\\'', '\''))
return {
'id': info['videoId'],
'title': info['title'],
'url': video_url,
'uploader': info['username'],
'thumbnail': info.get('highResImage') or info.get('medResImage'),
'description': info['description'],
'view_count': info['views'],
'age_limit': age_limit,
}
video = self._extract_video(info)
video['age_limit'] = age_limit
return video

View File

@ -1,38 +0,0 @@
import re
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..utils import ExtractorError
class ViceIE(InfoExtractor):
_VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
_TEST = {
u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4',
u'info_dict': {
u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
},
u'params': {
# Requires ffmpeg (m3u8 manifest)
u'skip_download': True,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
webpage = self._download_webpage(url, name)
try:
ooyala_url = self._og_search_video_url(webpage)
except ExtractorError:
try:
embed_code = self._search_regex(
r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage,
u'ooyala embed code')
ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
except ExtractorError:
raise ExtractorError(u'The page doesn\'t contain a video', expected=True)
return self.url_result(ooyala_url, ie='Ooyala')

View File

@ -0,0 +1,103 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
strip_jsonp,
)
class WashingtonPostIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
_TEST = {
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
'playlist': [{
'md5': 'c3f4b4922ffa259243f68e928db2db8c',
'info_dict': {
'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
'title': 'Breaking Points: The Paper Mine',
'duration': 1287,
'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
'uploader': 'The Washington Post',
'timestamp': 1395527908,
'upload_date': '20140322',
},
}, {
'md5': 'f645a07652c2950cd9134bb852c5f5eb',
'info_dict': {
'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
'title': 'The town bureaucracy sustains',
'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
'duration': 2217,
'timestamp': 1395528005,
'upload_date': '20140322',
'uploader': 'The Washington Post',
},
}]
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_id = mobj.group('id')
webpage = self._download_webpage(url, page_id)
title = self._og_search_title(webpage)
uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
entries = []
for i, uuid in enumerate(uuids, start=1):
vinfo_all = self._download_json(
'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
page_id,
transform_source=strip_jsonp,
note='Downloading information of video %d/%d' % (i, len(uuids))
)
vinfo = vinfo_all[0]['contentConfig']
uploader = vinfo.get('credits', {}).get('source')
timestamp = int_or_none(
vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
formats = [{
'format_id': (
'%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
if s.get('width')
else s.get('type')),
'vbr': s.get('bitrate') if s.get('width') != 0 else None,
'width': s.get('width'),
'height': s.get('height'),
'acodec': s.get('audioCodec'),
'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
'filesize': s.get('fileSize'),
'url': s.get('url'),
'ext': 'mp4',
'protocol': {
'MP4': 'http',
'F4F': 'f4m',
}.get(s.get('type'))
} for s in vinfo.get('streams', [])]
source_media_url = vinfo.get('sourceMediaURL')
if source_media_url:
formats.append({
'format_id': 'source_media',
'url': source_media_url,
})
self._sort_formats(formats)
entries.append({
'id': uuid,
'title': vinfo['title'],
'description': vinfo.get('blurb'),
'uploader': uploader,
'formats': formats,
'duration': int_or_none(vinfo.get('videoDuration'), 100),
'timestamp': timestamp,
})
return {
'_type': 'playlist',
'entries': entries,
'id': page_id,
'title': title,
}

View File

@ -1181,12 +1181,16 @@ def int_or_none(v, scale=1):
return v if v is None else (int(v) // scale)
def float_or_none(v, scale=1):
return v if v is None else (float(v) / scale)
def parse_duration(s):
if s is None:
return None
m = re.match(
r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
if not m:
return None
res = int(m.group('secs'))
@ -1328,3 +1332,7 @@ US_RATINGS = {
'R': 16,
'NC': 18,
}
def strip_jsonp(code):
return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code)

View File

@ -1,2 +1,2 @@
__version__ = '2013.03.24.1'
__version__ = '2014.03.29'