mercurial/byterange.py
changeset 0 9117c6561b0b
child 575 7f5ce4bbdd7b
equal deleted inserted replaced
-1:000000000000 0:9117c6561b0b
       
     1 #   This library is free software; you can redistribute it and/or
       
     2 #   modify it under the terms of the GNU Lesser General Public
       
     3 #   License as published by the Free Software Foundation; either
       
     4 #   version 2.1 of the License, or (at your option) any later version.
       
     5 #
       
     6 #   This library is distributed in the hope that it will be useful,
       
     7 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
       
     8 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
     9 #   Lesser General Public License for more details.
       
    10 #
       
    11 #   You should have received a copy of the GNU Lesser General Public
       
    12 #   License along with this library; if not, write to the 
       
    13 #      Free Software Foundation, Inc., 
       
    14 #      59 Temple Place, Suite 330, 
       
    15 #      Boston, MA  02111-1307  USA
       
    16 
       
    17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
       
    18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
       
    19 
       
    20 # $Id: byterange.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $
       
    21 
       
    22 import os
       
    23 import stat
       
    24 import urllib
       
    25 import urllib2
       
    26 import rfc822
       
    27 
       
    28 try:    
       
    29     from cStringIO import StringIO
       
    30 except ImportError, msg: 
       
    31     from StringIO import StringIO
       
    32 
       
    33 class RangeError(IOError):
       
    34     """Error raised when an unsatisfiable range is requested."""
       
    35     pass
       
    36     
       
    37 class HTTPRangeHandler(urllib2.BaseHandler):
       
    38     """Handler that enables HTTP Range headers.
       
    39     
       
    40     This was extremely simple. The Range header is a HTTP feature to
       
    41     begin with so all this class does is tell urllib2 that the 
       
    42     "206 Partial Content" reponse from the HTTP server is what we 
       
    43     expected.
       
    44     
       
    45     Example:
       
    46         import urllib2
       
    47         import byterange
       
    48         
       
    49         range_handler = range.HTTPRangeHandler()
       
    50         opener = urllib2.build_opener(range_handler)
       
    51         
       
    52         # install it
       
    53         urllib2.install_opener(opener)
       
    54         
       
    55         # create Request and set Range header
       
    56         req = urllib2.Request('http://www.python.org/')
       
    57         req.header['Range'] = 'bytes=30-50'
       
    58         f = urllib2.urlopen(req)
       
    59     """
       
    60     
       
    61     def http_error_206(self, req, fp, code, msg, hdrs):
       
    62         # 206 Partial Content Response
       
    63         r = urllib.addinfourl(fp, hdrs, req.get_full_url())
       
    64         r.code = code
       
    65         r.msg = msg
       
    66         return r
       
    67     
       
    68     def http_error_416(self, req, fp, code, msg, hdrs):
       
    69         # HTTP's Range Not Satisfiable error
       
    70         raise RangeError('Requested Range Not Satisfiable')
       
    71 
       
    72 class RangeableFileObject:
       
    73     """File object wrapper to enable raw range handling.
       
    74     This was implemented primarilary for handling range 
       
    75     specifications for file:// urls. This object effectively makes 
       
    76     a file object look like it consists only of a range of bytes in 
       
    77     the stream.
       
    78     
       
    79     Examples:
       
    80         # expose 10 bytes, starting at byte position 20, from 
       
    81         # /etc/aliases.
       
    82         >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
       
    83         # seek seeks within the range (to position 23 in this case)
       
    84         >>> fo.seek(3)
       
    85         # tell tells where your at _within the range_ (position 3 in
       
    86         # this case)
       
    87         >>> fo.tell()
       
    88         # read EOFs if an attempt is made to read past the last
       
    89         # byte in the range. the following will return only 7 bytes.
       
    90         >>> fo.read(30)
       
    91     """
       
    92     
       
    93     def __init__(self, fo, rangetup):
       
    94         """Create a RangeableFileObject.
       
    95         fo       -- a file like object. only the read() method need be 
       
    96                     supported but supporting an optimized seek() is 
       
    97                     preferable.
       
    98         rangetup -- a (firstbyte,lastbyte) tuple specifying the range
       
    99                     to work over.
       
   100         The file object provided is assumed to be at byte offset 0.
       
   101         """
       
   102         self.fo = fo
       
   103         (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
       
   104         self.realpos = 0
       
   105         self._do_seek(self.firstbyte)
       
   106         
       
   107     def __getattr__(self, name):
       
   108         """This effectively allows us to wrap at the instance level.
       
   109         Any attribute not found in _this_ object will be searched for
       
   110         in self.fo.  This includes methods."""
       
   111         if hasattr(self.fo, name):
       
   112             return getattr(self.fo, name)
       
   113         raise AttributeError, name
       
   114     
       
   115     def tell(self):
       
   116         """Return the position within the range.
       
   117         This is different from fo.seek in that position 0 is the 
       
   118         first byte position of the range tuple. For example, if
       
   119         this object was created with a range tuple of (500,899),
       
   120         tell() will return 0 when at byte position 500 of the file.
       
   121         """
       
   122         return (self.realpos - self.firstbyte)
       
   123     
       
   124     def seek(self,offset,whence=0):
       
   125         """Seek within the byte range.
       
   126         Positioning is identical to that described under tell().
       
   127         """
       
   128         assert whence in (0, 1, 2)
       
   129         if whence == 0:   # absolute seek
       
   130             realoffset = self.firstbyte + offset
       
   131         elif whence == 1: # relative seek
       
   132             realoffset = self.realpos + offset
       
   133         elif whence == 2: # absolute from end of file
       
   134             # XXX: are we raising the right Error here?
       
   135             raise IOError('seek from end of file not supported.')
       
   136         
       
   137         # do not allow seek past lastbyte in range
       
   138         if self.lastbyte and (realoffset >= self.lastbyte):
       
   139             realoffset = self.lastbyte
       
   140         
       
   141         self._do_seek(realoffset - self.realpos)
       
   142         
       
   143     def read(self, size=-1):
       
   144         """Read within the range.
       
   145         This method will limit the size read based on the range.
       
   146         """
       
   147         size = self._calc_read_size(size)
       
   148         rslt = self.fo.read(size)
       
   149         self.realpos += len(rslt)
       
   150         return rslt
       
   151     
       
   152     def readline(self, size=-1):
       
   153         """Read lines within the range.
       
   154         This method will limit the size read based on the range.
       
   155         """
       
   156         size = self._calc_read_size(size)
       
   157         rslt = self.fo.readline(size)
       
   158         self.realpos += len(rslt)
       
   159         return rslt
       
   160     
       
   161     def _calc_read_size(self, size):
       
   162         """Handles calculating the amount of data to read based on
       
   163         the range.
       
   164         """
       
   165         if self.lastbyte:
       
   166             if size > -1:
       
   167                 if ((self.realpos + size) >= self.lastbyte):
       
   168                     size = (self.lastbyte - self.realpos)
       
   169             else:
       
   170                 size = (self.lastbyte - self.realpos)
       
   171         return size
       
   172         
       
   173     def _do_seek(self,offset):
       
   174         """Seek based on whether wrapped object supports seek().
       
   175         offset is relative to the current position (self.realpos).
       
   176         """
       
   177         assert offset >= 0
       
   178         if not hasattr(self.fo, 'seek'):
       
   179             self._poor_mans_seek(offset)
       
   180         else:
       
   181             self.fo.seek(self.realpos + offset)
       
   182         self.realpos+= offset
       
   183         
       
   184     def _poor_mans_seek(self,offset):
       
   185         """Seek by calling the wrapped file objects read() method.
       
   186         This is used for file like objects that do not have native
       
   187         seek support. The wrapped objects read() method is called
       
   188         to manually seek to the desired position.
       
   189         offset -- read this number of bytes from the wrapped
       
   190                   file object.
       
   191         raise RangeError if we encounter EOF before reaching the 
       
   192         specified offset.
       
   193         """
       
   194         pos = 0
       
   195         bufsize = 1024
       
   196         while pos < offset:
       
   197             if (pos + bufsize) > offset:
       
   198                 bufsize = offset - pos
       
   199             buf = self.fo.read(bufsize)
       
   200             if len(buf) != bufsize:
       
   201                 raise RangeError('Requested Range Not Satisfiable')
       
   202             pos+= bufsize
       
   203 
       
   204 class FileRangeHandler(urllib2.FileHandler):
       
   205     """FileHandler subclass that adds Range support.
       
   206     This class handles Range headers exactly like an HTTP
       
   207     server would.
       
   208     """
       
   209     def open_local_file(self, req):
       
   210         import mimetypes
       
   211         import mimetools
       
   212         host = req.get_host()
       
   213         file = req.get_selector()
       
   214         localfile = urllib.url2pathname(file)
       
   215         stats = os.stat(localfile)
       
   216         size = stats[stat.ST_SIZE]
       
   217         modified = rfc822.formatdate(stats[stat.ST_MTIME])
       
   218         mtype = mimetypes.guess_type(file)[0]
       
   219         if host:
       
   220             host, port = urllib.splitport(host)
       
   221             if port or socket.gethostbyname(host) not in self.get_names():
       
   222                 raise URLError('file not on local host')
       
   223         fo = open(localfile,'rb')
       
   224         brange = req.headers.get('Range',None)
       
   225         brange = range_header_to_tuple(brange)
       
   226         assert brange != ()
       
   227         if brange:
       
   228             (fb,lb) = brange
       
   229             if lb == '': lb = size
       
   230             if fb < 0 or fb > size or lb > size:
       
   231                 raise RangeError('Requested Range Not Satisfiable')
       
   232             size = (lb - fb)
       
   233             fo = RangeableFileObject(fo, (fb,lb))
       
   234         headers = mimetools.Message(StringIO(
       
   235             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
       
   236             (mtype or 'text/plain', size, modified)))
       
   237         return urllib.addinfourl(fo, headers, 'file:'+file)
       
   238 
       
   239 
       
   240 # FTP Range Support 
       
   241 # Unfortunately, a large amount of base FTP code had to be copied
       
   242 # from urllib and urllib2 in order to insert the FTP REST command.
       
   243 # Code modifications for range support have been commented as 
       
   244 # follows:
       
   245 # -- range support modifications start/end here
       
   246 
       
   247 from urllib import splitport, splituser, splitpasswd, splitattr, \
       
   248                    unquote, addclosehook, addinfourl
       
   249 import ftplib
       
   250 import socket
       
   251 import sys
       
   252 import ftplib
       
   253 import mimetypes
       
   254 import mimetools
       
   255 
       
   256 class FTPRangeHandler(urllib2.FTPHandler):
       
   257     def ftp_open(self, req):
       
   258         host = req.get_host()
       
   259         if not host:
       
   260             raise IOError, ('ftp error', 'no host given')
       
   261         host, port = splitport(host)
       
   262         if port is None:
       
   263             port = ftplib.FTP_PORT
       
   264 
       
   265         # username/password handling
       
   266         user, host = splituser(host)
       
   267         if user:
       
   268             user, passwd = splitpasswd(user)
       
   269         else:
       
   270             passwd = None
       
   271         host = unquote(host)
       
   272         user = unquote(user or '')
       
   273         passwd = unquote(passwd or '')
       
   274         
       
   275         try:
       
   276             host = socket.gethostbyname(host)
       
   277         except socket.error, msg:
       
   278             raise URLError(msg)
       
   279         path, attrs = splitattr(req.get_selector())
       
   280         dirs = path.split('/')
       
   281         dirs = map(unquote, dirs)
       
   282         dirs, file = dirs[:-1], dirs[-1]
       
   283         if dirs and not dirs[0]:
       
   284             dirs = dirs[1:]
       
   285         try:
       
   286             fw = self.connect_ftp(user, passwd, host, port, dirs)
       
   287             type = file and 'I' or 'D'
       
   288             for attr in attrs:
       
   289                 attr, value = splitattr(attr)
       
   290                 if attr.lower() == 'type' and \
       
   291                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
       
   292                     type = value.upper()
       
   293             
       
   294             # -- range support modifications start here
       
   295             rest = None
       
   296             range_tup = range_header_to_tuple(req.headers.get('Range',None))    
       
   297             assert range_tup != ()
       
   298             if range_tup:
       
   299                 (fb,lb) = range_tup
       
   300                 if fb > 0: rest = fb
       
   301             # -- range support modifications end here
       
   302             
       
   303             fp, retrlen = fw.retrfile(file, type, rest)
       
   304             
       
   305             # -- range support modifications start here
       
   306             if range_tup:
       
   307                 (fb,lb) = range_tup
       
   308                 if lb == '': 
       
   309                     if retrlen is None or retrlen == 0:
       
   310                         raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
       
   311                     lb = retrlen
       
   312                     retrlen = lb - fb
       
   313                     if retrlen < 0:
       
   314                         # beginning of range is larger than file
       
   315                         raise RangeError('Requested Range Not Satisfiable')
       
   316                 else:
       
   317                     retrlen = lb - fb
       
   318                     fp = RangeableFileObject(fp, (0,retrlen))
       
   319             # -- range support modifications end here
       
   320             
       
   321             headers = ""
       
   322             mtype = mimetypes.guess_type(req.get_full_url())[0]
       
   323             if mtype:
       
   324                 headers += "Content-Type: %s\n" % mtype
       
   325             if retrlen is not None and retrlen >= 0:
       
   326                 headers += "Content-Length: %d\n" % retrlen
       
   327             sf = StringIO(headers)
       
   328             headers = mimetools.Message(sf)
       
   329             return addinfourl(fp, headers, req.get_full_url())
       
   330         except ftplib.all_errors, msg:
       
   331             raise IOError, ('ftp error', msg), sys.exc_info()[2]
       
   332 
       
   333     def connect_ftp(self, user, passwd, host, port, dirs):
       
   334         fw = ftpwrapper(user, passwd, host, port, dirs)
       
   335         return fw
       
   336 
       
   337 class ftpwrapper(urllib.ftpwrapper):
       
   338     # range support note:
       
   339     # this ftpwrapper code is copied directly from
       
   340     # urllib. The only enhancement is to add the rest
       
   341     # argument and pass it on to ftp.ntransfercmd
       
   342     def retrfile(self, file, type, rest=None):
       
   343         self.endtransfer()
       
   344         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
       
   345         else: cmd = 'TYPE ' + type; isdir = 0
       
   346         try:
       
   347             self.ftp.voidcmd(cmd)
       
   348         except ftplib.all_errors:
       
   349             self.init()
       
   350             self.ftp.voidcmd(cmd)
       
   351         conn = None
       
   352         if file and not isdir:
       
   353             # Use nlst to see if the file exists at all
       
   354             try:
       
   355                 self.ftp.nlst(file)
       
   356             except ftplib.error_perm, reason:
       
   357                 raise IOError, ('ftp error', reason), sys.exc_info()[2]
       
   358             # Restore the transfer mode!
       
   359             self.ftp.voidcmd(cmd)
       
   360             # Try to retrieve as a file
       
   361             try:
       
   362                 cmd = 'RETR ' + file
       
   363                 conn = self.ftp.ntransfercmd(cmd, rest)
       
   364             except ftplib.error_perm, reason:
       
   365                 if str(reason)[:3] == '501':
       
   366                     # workaround for REST not supported error
       
   367                     fp, retrlen = self.retrfile(file, type)
       
   368                     fp = RangeableFileObject(fp, (rest,''))
       
   369                     return (fp, retrlen)
       
   370                 elif str(reason)[:3] != '550':
       
   371                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
       
   372         if not conn:
       
   373             # Set transfer mode to ASCII!
       
   374             self.ftp.voidcmd('TYPE A')
       
   375             # Try a directory listing
       
   376             if file: cmd = 'LIST ' + file
       
   377             else: cmd = 'LIST'
       
   378             conn = self.ftp.ntransfercmd(cmd)
       
   379         self.busy = 1
       
   380         # Pass back both a suitably decorated object and a retrieval length
       
   381         return (addclosehook(conn[0].makefile('rb'),
       
   382                             self.endtransfer), conn[1])
       
   383 
       
   384 
       
   385 ####################################################################
       
   386 # Range Tuple Functions
       
   387 # XXX: These range tuple functions might go better in a class.
       
   388 
       
   389 _rangere = None
       
   390 def range_header_to_tuple(range_header):
       
   391     """Get a (firstbyte,lastbyte) tuple from a Range header value.
       
   392     
       
   393     Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
       
   394     function pulls the firstbyte and lastbyte values and returns
       
   395     a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
       
   396     the header value, it is returned as an empty string in the
       
   397     tuple.
       
   398     
       
   399     Return None if range_header is None
       
   400     Return () if range_header does not conform to the range spec 
       
   401     pattern.
       
   402     
       
   403     """
       
   404     global _rangere
       
   405     if range_header is None: return None
       
   406     if _rangere is None:
       
   407         import re
       
   408         _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
       
   409     match = _rangere.match(range_header)
       
   410     if match: 
       
   411         tup = range_tuple_normalize(match.group(1,2))
       
   412         if tup and tup[1]: 
       
   413             tup = (tup[0],tup[1]+1)
       
   414         return tup
       
   415     return ()
       
   416 
       
   417 def range_tuple_to_header(range_tup):
       
   418     """Convert a range tuple to a Range header value.
       
   419     Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None
       
   420     if no range is needed.
       
   421     """
       
   422     if range_tup is None: return None
       
   423     range_tup = range_tuple_normalize(range_tup)
       
   424     if range_tup:
       
   425         if range_tup[1]: 
       
   426             range_tup = (range_tup[0],range_tup[1] - 1)
       
   427         return 'bytes=%s-%s' % range_tup
       
   428     
       
   429 def range_tuple_normalize(range_tup):
       
   430     """Normalize a (first_byte,last_byte) range tuple.
       
   431     Return a tuple whose first element is guaranteed to be an int
       
   432     and whose second element will be '' (meaning: the last byte) or 
       
   433     an int. Finally, return None if the normalized tuple == (0,'')
       
   434     as that is equivelant to retrieving the entire file.
       
   435     """
       
   436     if range_tup is None: return None
       
   437     # handle first byte
       
   438     fb = range_tup[0]
       
   439     if fb in (None,''): fb = 0
       
   440     else: fb = int(fb)
       
   441     # handle last byte
       
   442     try: lb = range_tup[1]
       
   443     except IndexError: lb = ''
       
   444     else:  
       
   445         if lb is None: lb = ''
       
   446         elif lb != '': lb = int(lb)
       
   447     # check if range is over the entire file
       
   448     if (fb,lb) == (0,''): return None
       
   449     # check that the range is valid
       
   450     if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
       
   451     return (fb,lb)
       
   452