Compare commits

..

24 Commits

Author SHA1 Message Date
Ricardo Garcia
0c8beb43f2 Bump version number 2010-10-31 11:24:44 +01:00
Ricardo Garcia
71b7300e63 Use get_video_info to work around captcha problems (fixes issue #31) 2010-10-31 11:24:44 +01:00
Ricardo Garcia
8497c36d5a Fix minor problem with size formatting method 2010-10-31 11:24:44 +01:00
Ricardo Garcia
110cd3462e Update User-agent string 2010-10-31 11:24:43 +01:00
Ricardo Garcia
18963a36b0 Fix metacafe.com code due to recent changes in the site 2010-10-31 11:24:43 +01:00
Ricardo Garcia
df1ceb1fd9 Include format 5 in best quality list 2010-10-31 11:24:43 +01:00
Ricardo Garcia
7eb0e89742 Properly encode messages sent to stderr (fixes issue #34) 2010-10-31 11:24:43 +01:00
Ricardo Garcia
8b07dec5f6 Bump version number 2010-10-31 11:24:40 +01:00
Ricardo Garcia
113e5266cc Modify "more pages" check in YouTube playlist (fixes issue #29) 2010-10-31 11:24:40 +01:00
Ricardo Garcia
55e7c75e12 Delay opening file until there is data to write
Fixes issue #19.
2010-10-31 11:24:40 +01:00
Ricardo Garcia
ff21a710ae Restore INTERNAL version number 2010-10-31 11:24:36 +01:00
Ricardo Garcia
7374795552 Bump version number 2010-10-31 11:24:36 +01:00
Ricardo Garcia
0cd61126fc Document new "continuedl" FileDownloader option 2010-10-31 11:24:36 +01:00
Ricardo Garcia
e1f18b8a84 Remove integer casts and replace them with long integer casts 2010-10-31 11:24:36 +01:00
Ricardo Garcia
6a0015a7e0 Fix missing cast preventing detection of already downloaded file 2010-10-31 11:24:36 +01:00
Ricardo Garcia
7db85b2c70 Tweaks to ivanov's code 2010-10-31 11:24:36 +01:00
Paul Ivanov
f76c2df64e Added -c option (--continue)
interrupted downloads will properly resume and append to the previously downloaded data, instead of overwriting the file.

There's some error checking - if the length of the file to be downloaded matches the length of the previously downloaded data, we report that this file has already been downloaded and do nothing.

If there is some other HTTP 416 'Requested range not satisfiable' error, we simply re-download the whole file (reverting to the original functionality)

All other HTTP errors are simply raised.

Resuming does not override -w (--nooverwrite), since it is not clear what should happen if the file on disk is larger than the file to be downloaded.

Thus, -c does nothing if -w is present.
2010-10-31 11:24:36 +01:00
Ricardo Garcia
daa88ccc2e Fix TypeError when using the -f option (fixes issue #24) 2010-10-31 11:24:36 +01:00
Ricardo Garcia
eb5d184157 Restore INTERNAL version number 2010-10-31 11:24:36 +01:00
Ricardo Garcia
5745bfdcdc Bump version number 2010-10-31 11:24:32 +01:00
Ricardo Garcia
320becd692 Remove trails from the "append_const" change (fixes issue #23) 2010-10-31 11:24:32 +01:00
Ricardo Garcia
968aa88438 Only catch UnavailableFormatError in call to process_info 2010-10-31 11:24:32 +01:00
Ricardo Garcia
cbfff4db63 Verify URLs in simulate mode (fixes issue #22) 2010-10-31 11:24:32 +01:00
Ricardo Garcia
781daeabdb Restore "INTERNAL" version number 2010-10-31 11:24:32 +01:00
2 changed files with 170 additions and 106 deletions

View File

@@ -1 +1 @@
2009.05.23
2009.08.08

View File

@@ -19,7 +19,7 @@ import urllib
import urllib2
std_headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
@@ -114,6 +114,7 @@ class FileDownloader(object):
ignoreerrors: Do not stop on download errors.
ratelimit: Download speed limit, in bytes/sec.
nooverwrites: Prevent overwriting files.
continuedl: Try to continue downloads if possible.
"""
params = None
@@ -142,10 +143,12 @@ class FileDownloader(object):
def format_bytes(bytes):
if bytes is None:
return 'N/A'
if bytes == 0:
if type(bytes) is str:
bytes = float(bytes)
if bytes == 0.0:
exponent = 0
else:
exponent = long(math.log(float(bytes), 1024.0))
exponent = long(math.log(bytes, 1024.0))
suffix = 'bkMGTPEZY'[exponent]
converted = float(bytes) / float(1024**exponent)
return '%.2f%s' % (converted, suffix)
@@ -182,13 +185,13 @@ class FileDownloader(object):
new_min = max(bytes / 2.0, 1.0)
new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
if elapsed_time < 0.001:
return int(new_max)
return long(new_max)
rate = bytes / elapsed_time
if rate > new_max:
return int(new_max)
return long(new_max)
if rate < new_min:
return int(new_min)
return int(rate)
return long(new_min)
return long(rate)
@staticmethod
def parse_bytes(bytestr):
@@ -200,6 +203,14 @@ class FileDownloader(object):
multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
return long(round(number * multiplier))
@staticmethod
def verify_url(url):
"""Verify a URL is valid and data could be downloaded."""
request = urllib2.Request(url, None, std_headers)
data = urllib2.urlopen(request)
data.read(1)
data.close()
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
self._ies.append(ie)
@@ -218,7 +229,7 @@ class FileDownloader(object):
def to_stderr(self, message):
"""Print message to stderr."""
print >>sys.stderr, message
print >>sys.stderr, message.encode(locale.getpreferredencoding())
def fixed_template(self):
"""Checks if the output template is fixed."""
@@ -258,6 +269,18 @@ class FileDownloader(object):
"""Report download progress."""
self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
def report_resuming_byte(self, resume_len):
"""Report attempt to resume at given byte."""
self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
self.to_stdout(u'[download] %s has already been downloaded' % file_name)
def report_unable_to_resume(self):
"""Report it was impossible to resume download."""
self.to_stdout(u'[download] Unable to resume')
def report_finish(self):
"""Report download finished."""
@@ -265,25 +288,29 @@ class FileDownloader(object):
def process_info(self, info_dict):
"""Process a single dictionary returned by an InfoExtractor."""
# Forced printings
if self.params.get('forcetitle', False):
print info_dict['title'].encode(locale.getpreferredencoding())
if self.params.get('forceurl', False):
print info_dict['url'].encode(locale.getpreferredencoding())
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
return
try:
self.verify_url(info_dict['url'])
except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
raise UnavailableFormatError
# Forced printings
if self.params.get('forcetitle', False):
print info_dict['title'].encode(locale.getpreferredencoding())
if self.params.get('forceurl', False):
print info_dict['url'].encode(locale.getpreferredencoding())
return
try:
template_dict = dict(info_dict)
template_dict['epoch'] = unicode(long(time.time()))
filename = self.params['outtmpl'] % template_dict
self.report_destination(filename)
except (ValueError, KeyError), err:
self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
if self.params['nooverwrites'] and os.path.exists(filename):
self.to_stderr('WARNING: file exists: %s; skipping' % filename)
self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
return
try:
@@ -293,17 +320,8 @@ class FileDownloader(object):
return
try:
outstream = open(filename, 'wb')
success = self._do_download(filename, info_dict['url'])
except (OSError, IOError), err:
self.trouble('ERROR: unable to open for writing: %s' % str(err))
return
try:
self._do_download(outstream, info_dict['url'])
outstream.close()
except (OSError, IOError), err:
outstream.close()
os.remove(filename)
raise UnavailableFormatError
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self.trouble('ERROR: unable to download video data: %s' % str(err))
@@ -312,11 +330,12 @@ class FileDownloader(object):
self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
return
try:
self.post_process(filename, info_dict)
except (PostProcessingError), err:
self.trouble('ERROR: postprocessing: %s' % str(err))
return
if success:
try:
self.post_process(filename, info_dict)
except (PostProcessingError), err:
self.trouble('ERROR: postprocessing: %s' % str(err))
return
def download(self, url_list):
"""Download a given list of URLs."""
@@ -353,21 +372,43 @@ class FileDownloader(object):
if info is None:
break
def _do_download(self, stream, url):
def _do_download(self, filename, url):
stream = None
open_mode = 'ab'
basic_request = urllib2.Request(url, None, std_headers)
request = urllib2.Request(url, None, std_headers)
data = urllib2.urlopen(request)
# Attempt to resume download with "continuedl" option
if os.path.isfile(filename):
resume_len = os.path.getsize(filename)
else:
resume_len = 0
if self.params['continuedl'] and resume_len != 0:
self.report_resuming_byte(resume_len)
request.add_header('Range','bytes=%d-' % resume_len)
# Establish connection
try:
data = urllib2.urlopen(request)
except (urllib2.HTTPError, ), err:
if err.code != 416: # 416 is 'Requested range not satisfiable'
raise
data = urllib2.urlopen(basic_request)
content_length = data.info()['Content-Length']
if content_length is not None and long(content_length) == resume_len:
self.report_file_already_downloaded(filename)
return True
else:
self.report_unable_to_resume()
open_mode = 'wb'
data_len = data.info().get('Content-length', None)
data_len_str = self.format_bytes(data_len)
byte_counter = 0
block_size = 1024
start = time.time()
while True:
# Progress message
percent_str = self.calc_percent(byte_counter, data_len)
eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
speed_str = self.calc_speed(start, time.time(), byte_counter)
self.report_progress(percent_str, data_len_str, speed_str, eta_str)
# Download and write
before = time.time()
data_block = data.read(block_size)
@@ -376,15 +417,31 @@ class FileDownloader(object):
if data_block_len == 0:
break
byte_counter += data_block_len
# Open file just in time
if stream is None:
try:
stream = open(filename, open_mode)
self.report_destination(filename)
except (OSError, IOError), err:
self.trouble('ERROR: unable to open for writing: %s' % str(err))
return False
stream.write(data_block)
block_size = self.best_block_size(after - before, data_block_len)
# Progress message
percent_str = self.calc_percent(byte_counter, data_len)
eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
speed_str = self.calc_speed(start, time.time(), byte_counter)
self.report_progress(percent_str, data_len_str, speed_str, eta_str)
# Apply rate limit
self.slow_down(start, byte_counter)
self.report_finish()
if data_len is not None and str(byte_counter) != data_len:
raise ContentTooShortError(byte_counter, long(data_len))
return True
class InfoExtractor(object):
"""Information Extractor class.
@@ -455,7 +512,7 @@ class YoutubeIE(InfoExtractor):
_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
_available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
_available_formats = ['22', '35', '18', '5', '17', '13'] # listed in order of priority for -b flag
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -502,9 +559,9 @@ class YoutubeIE(InfoExtractor):
"""Report attempt to confirm age."""
self._downloader.to_stdout(u'[youtube] Confirming age')
def report_webpage_download(self, video_id):
"""Report attempt to download webpage."""
self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
"""Report attempt to download video info webpage."""
self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
@@ -607,53 +664,62 @@ class YoutubeIE(InfoExtractor):
best_quality = True
while True:
# Extension
video_extension = self._video_extensions.get(format_param, 'flv')
# Get video info
video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
request = urllib2.Request(video_info_url, None, std_headers)
try:
# Extension
video_extension = self._video_extensions.get(format_param, 'flv')
self.report_video_info_webpage_download(video_id)
video_info_webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
return
self.report_information_extraction(video_id)
# Normalize URL, including format
normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
if format_param is not None:
normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
request = urllib2.Request(normalized_url, None, std_headers)
try:
self.report_webpage_download(video_id)
video_webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
return
self.report_information_extraction(video_id)
# "t" param
mobj = re.search(r', "t": "([^"]+)"', video_webpage)
# "t" param
mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
# Attempt to see if YouTube has issued an error message
mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
return
video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
if format_param is not None:
video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
self.report_video_url(video_id, video_real_url)
self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
stream.write(video_info_webpage)
stream.close()
else:
reason = urllib.unquote_plus(mobj.group(1))
self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
return
token = urllib.unquote(mobj.group(1))
video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
if format_param is not None:
video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
self.report_video_url(video_id, video_real_url)
# uploader
mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
video_uploader = mobj.group(1)
# uploader
mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
video_uploader = urllib.unquote(mobj.group(1))
# title
mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = mobj.group(1).decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
video_title = video_title.replace(os.sep, u'%')
# title
mobj = re.search(r'(?m)&title=([^&]+)(?:&|$)', video_info_webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = urllib.unquote(mobj.group(1))
video_title = video_title.decode('utf-8')
video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
video_title = video_title.replace(os.sep, u'%')
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
simple_title = simple_title.strip(ur'_')
# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
simple_title = simple_title.strip(ur'_')
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
@@ -766,19 +832,21 @@ class MetacafeIE(InfoExtractor):
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract media URL')
return
mediaURL = urllib.unquote(mobj.group(1))
mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract gdaKey')
return
gdaKey = mobj.group(1)
#mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
#if mobj is None:
# self._downloader.trouble(u'ERROR: unable to extract gdaKey')
# return
#gdaKey = mobj.group(1)
#
#video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
video_url = mediaURL
mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
if mobj is None:
@@ -846,7 +914,7 @@ class YoutubeSearchIE(InfoExtractor):
return
else:
try:
n = int(prefix)
n = long(prefix)
if n <= 0:
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
return
@@ -855,7 +923,7 @@ class YoutubeSearchIE(InfoExtractor):
n = self._max_youtube_results
self._download_n_results(query, n)
return
except ValueError: # parsing prefix as int fails
except ValueError: # parsing prefix as integer fails
self._download_n_results(query, 1)
return
@@ -901,7 +969,7 @@ class YoutubePlaylistIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
_youtube_ie = None
def __init__(self, youtube_ie, downloader=None):
@@ -947,7 +1015,7 @@ class YoutubePlaylistIE(InfoExtractor):
ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page)
if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
break
pagenum = pagenum + 1
@@ -1016,7 +1084,7 @@ if __name__ == '__main__':
# Parse command line
parser = optparse.OptionParser(
usage='Usage: %prog [options] url...',
version='2009.05.23',
version='2009.08.08',
conflict_handler='resolve',
)
@@ -1040,7 +1108,7 @@ if __name__ == '__main__':
video_format = optparse.OptionGroup(parser, 'Video Format Options')
video_format.add_option('-f', '--format',
action='append', dest='format', metavar='FMT', help='video format code')
action='store', dest='format', metavar='FMT', help='video format code')
video_format.add_option('-b', '--best-quality',
action='store_const', dest='format', help='download the best quality video possible', const='0')
video_format.add_option('-m', '--mobile-version',
@@ -1071,6 +1139,8 @@ if __name__ == '__main__':
dest='batchfile', metavar='F', help='file containing URLs to download')
filesystem.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
parser.add_option_group(filesystem)
(opts, args) = parser.parse_args()
@@ -1104,13 +1174,6 @@ if __name__ == '__main__':
if numeric_limit is None:
parser.error(u'invalid rate limit specified')
opts.ratelimit = numeric_limit
if opts.format is not None and len(opts.format) > 1:
parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
if opts.format is None:
real_format = None
else:
real_format = opts.format[0]
# Information extractors
youtube_ie = YoutubeIE()
@@ -1127,7 +1190,7 @@ if __name__ == '__main__':
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'simulate': (opts.simulate or opts.geturl or opts.gettitle),
'format': real_format,
'format': opts.format,
'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
@@ -1135,6 +1198,7 @@ if __name__ == '__main__':
'ignoreerrors': opts.ignoreerrors,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
'continuedl': opts.continue_dl,
})
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)