mercurial/byterange.py
branchstable
changeset 37788 ed5448edcbfa
parent 37287 fb92df8b634c
parent 37787 92213f6745ed
child 37789 bfd32db06952
equal deleted inserted replaced
37287:fb92df8b634c 37788:ed5448edcbfa
     1 #   This library is free software; you can redistribute it and/or
       
     2 #   modify it under the terms of the GNU Lesser General Public
       
     3 #   License as published by the Free Software Foundation; either
       
     4 #   version 2.1 of the License, or (at your option) any later version.
       
     5 #
       
     6 #   This library is distributed in the hope that it will be useful,
       
     7 #   but WITHOUT ANY WARRANTY; without even the implied warranty of
       
     8 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
     9 #   Lesser General Public License for more details.
       
    10 #
       
    11 #   You should have received a copy of the GNU Lesser General Public
       
    12 #   License along with this library; if not, see
       
    13 #   <http://www.gnu.org/licenses/>.
       
    14 
       
    15 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
       
    16 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
       
    17 
       
    18 # $Id: byterange.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $
       
    19 
       
    20 from __future__ import absolute_import
       
    21 
       
    22 import email
       
    23 import ftplib
       
    24 import mimetypes
       
    25 import os
       
    26 import re
       
    27 import socket
       
    28 import stat
       
    29 
       
    30 from . import (
       
    31     urllibcompat,
       
    32     util,
       
    33 )
       
    34 
       
# Shortcuts to the urllib error / request namespaces exposed by util.
urlerr = util.urlerr
urlreq = util.urlreq

# Module-level aliases for the urlreq helpers used by the handlers below.
addclosehook = urlreq.addclosehook
addinfourl = urlreq.addinfourl
splitattr = urlreq.splitattr
splitpasswd = urlreq.splitpasswd
splitport = urlreq.splitport
splituser = urlreq.splituser
unquote = urlreq.unquote
       
    45 
       
    46 class RangeError(IOError):
       
    47     """Error raised when an unsatisfiable range is requested."""
       
    48 
       
    49 class HTTPRangeHandler(urlreq.basehandler):
       
    50     """Handler that enables HTTP Range headers.
       
    51 
       
    52     This was extremely simple. The Range header is a HTTP feature to
       
    53     begin with so all this class does is tell urllib2 that the
       
    54     "206 Partial Content" response from the HTTP server is what we
       
    55     expected.
       
    56 
       
    57     Example:
       
    58         import urllib2
       
    59         import byterange
       
    60 
       
    61         range_handler = range.HTTPRangeHandler()
       
    62         opener = urlreq.buildopener(range_handler)
       
    63 
       
    64         # install it
       
    65         urlreq.installopener(opener)
       
    66 
       
    67         # create Request and set Range header
       
    68         req = urlreq.request('http://www.python.org/')
       
    69         req.header['Range'] = 'bytes=30-50'
       
    70         f = urlreq.urlopen(req)
       
    71     """
       
    72 
       
    73     def http_error_206(self, req, fp, code, msg, hdrs):
       
    74         # 206 Partial Content Response
       
    75         r = urlreq.addinfourl(fp, hdrs, req.get_full_url())
       
    76         r.code = code
       
    77         r.msg = msg
       
    78         return r
       
    79 
       
    80     def http_error_416(self, req, fp, code, msg, hdrs):
       
    81         # HTTP's Range Not Satisfiable error
       
    82         raise RangeError('Requested Range Not Satisfiable')
       
    83 
       
    84 class RangeableFileObject(object):
       
    85     """File object wrapper to enable raw range handling.
       
    86     This was implemented primarily for handling range
       
    87     specifications for file:// urls. This object effectively makes
       
    88     a file object look like it consists only of a range of bytes in
       
    89     the stream.
       
    90 
       
    91     Examples:
       
    92         # expose 10 bytes, starting at byte position 20, from
       
    93         # /etc/aliases.
       
    94         >>> fo = RangeableFileObject(file(b'/etc/passwd', b'r'), (20,30))
       
    95         # seek seeks within the range (to position 23 in this case)
       
    96         >>> fo.seek(3)
       
    97         # tell tells where your at _within the range_ (position 3 in
       
    98         # this case)
       
    99         >>> fo.tell()
       
   100         # read EOFs if an attempt is made to read past the last
       
   101         # byte in the range. the following will return only 7 bytes.
       
   102         >>> fo.read(30)
       
   103     """
       
   104 
       
   105     def __init__(self, fo, rangetup):
       
   106         """Create a RangeableFileObject.
       
   107         fo       -- a file like object. only the read() method need be
       
   108                     supported but supporting an optimized seek() is
       
   109                     preferable.
       
   110         rangetup -- a (firstbyte,lastbyte) tuple specifying the range
       
   111                     to work over.
       
   112         The file object provided is assumed to be at byte offset 0.
       
   113         """
       
   114         self.fo = fo
       
   115         (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
       
   116         self.realpos = 0
       
   117         self._do_seek(self.firstbyte)
       
   118 
       
   119     def __getattr__(self, name):
       
   120         """This effectively allows us to wrap at the instance level.
       
   121         Any attribute not found in _this_ object will be searched for
       
   122         in self.fo.  This includes methods."""
       
   123         return getattr(self.fo, name)
       
   124 
       
   125     def tell(self):
       
   126         """Return the position within the range.
       
   127         This is different from fo.seek in that position 0 is the
       
   128         first byte position of the range tuple. For example, if
       
   129         this object was created with a range tuple of (500,899),
       
   130         tell() will return 0 when at byte position 500 of the file.
       
   131         """
       
   132         return (self.realpos - self.firstbyte)
       
   133 
       
   134     def seek(self, offset, whence=0):
       
   135         """Seek within the byte range.
       
   136         Positioning is identical to that described under tell().
       
   137         """
       
   138         assert whence in (0, 1, 2)
       
   139         if whence == 0:   # absolute seek
       
   140             realoffset = self.firstbyte + offset
       
   141         elif whence == 1: # relative seek
       
   142             realoffset = self.realpos + offset
       
   143         elif whence == 2: # absolute from end of file
       
   144             # XXX: are we raising the right Error here?
       
   145             raise IOError('seek from end of file not supported.')
       
   146 
       
   147         # do not allow seek past lastbyte in range
       
   148         if self.lastbyte and (realoffset >= self.lastbyte):
       
   149             realoffset = self.lastbyte
       
   150 
       
   151         self._do_seek(realoffset - self.realpos)
       
   152 
       
   153     def read(self, size=-1):
       
   154         """Read within the range.
       
   155         This method will limit the size read based on the range.
       
   156         """
       
   157         size = self._calc_read_size(size)
       
   158         rslt = self.fo.read(size)
       
   159         self.realpos += len(rslt)
       
   160         return rslt
       
   161 
       
   162     def readline(self, size=-1):
       
   163         """Read lines within the range.
       
   164         This method will limit the size read based on the range.
       
   165         """
       
   166         size = self._calc_read_size(size)
       
   167         rslt = self.fo.readline(size)
       
   168         self.realpos += len(rslt)
       
   169         return rslt
       
   170 
       
   171     def _calc_read_size(self, size):
       
   172         """Handles calculating the amount of data to read based on
       
   173         the range.
       
   174         """
       
   175         if self.lastbyte:
       
   176             if size > -1:
       
   177                 if ((self.realpos + size) >= self.lastbyte):
       
   178                     size = (self.lastbyte - self.realpos)
       
   179             else:
       
   180                 size = (self.lastbyte - self.realpos)
       
   181         return size
       
   182 
       
   183     def _do_seek(self, offset):
       
   184         """Seek based on whether wrapped object supports seek().
       
   185         offset is relative to the current position (self.realpos).
       
   186         """
       
   187         assert offset >= 0
       
   188         seek = getattr(self.fo, 'seek', self._poor_mans_seek)
       
   189         seek(self.realpos + offset)
       
   190         self.realpos += offset
       
   191 
       
   192     def _poor_mans_seek(self, offset):
       
   193         """Seek by calling the wrapped file objects read() method.
       
   194         This is used for file like objects that do not have native
       
   195         seek support. The wrapped objects read() method is called
       
   196         to manually seek to the desired position.
       
   197         offset -- read this number of bytes from the wrapped
       
   198                   file object.
       
   199         raise RangeError if we encounter EOF before reaching the
       
   200         specified offset.
       
   201         """
       
   202         pos = 0
       
   203         bufsize = 1024
       
   204         while pos < offset:
       
   205             if (pos + bufsize) > offset:
       
   206                 bufsize = offset - pos
       
   207             buf = self.fo.read(bufsize)
       
   208             if len(buf) != bufsize:
       
   209                 raise RangeError('Requested Range Not Satisfiable')
       
   210             pos += bufsize
       
   211 
       
   212 class FileRangeHandler(urlreq.filehandler):
       
   213     """FileHandler subclass that adds Range support.
       
   214     This class handles Range headers exactly like an HTTP
       
   215     server would.
       
   216     """
       
   217     def open_local_file(self, req):
       
   218         host = urllibcompat.gethost(req)
       
   219         file = urllibcompat.getselector(req)
       
   220         localfile = urlreq.url2pathname(file)
       
   221         stats = os.stat(localfile)
       
   222         size = stats[stat.ST_SIZE]
       
   223         modified = email.Utils.formatdate(stats[stat.ST_MTIME])
       
   224         mtype = mimetypes.guess_type(file)[0]
       
   225         if host:
       
   226             host, port = urlreq.splitport(host)
       
   227             if port or socket.gethostbyname(host) not in self.get_names():
       
   228                 raise urlerr.urlerror('file not on local host')
       
   229         fo = open(localfile,'rb')
       
   230         brange = req.headers.get('Range', None)
       
   231         brange = range_header_to_tuple(brange)
       
   232         assert brange != ()
       
   233         if brange:
       
   234             (fb, lb) = brange
       
   235             if lb == '':
       
   236                 lb = size
       
   237             if fb < 0 or fb > size or lb > size:
       
   238                 raise RangeError('Requested Range Not Satisfiable')
       
   239             size = (lb - fb)
       
   240             fo = RangeableFileObject(fo, (fb, lb))
       
   241         headers = email.message_from_string(
       
   242             'Content-Type: %s\nContent-Length: %d\nLast-Modified: %s\n' %
       
   243             (mtype or 'text/plain', size, modified))
       
   244         return urlreq.addinfourl(fo, headers, 'file:'+file)
       
   245 
       
   246 
       
   247 # FTP Range Support
       
   248 # Unfortunately, a large amount of base FTP code had to be copied
       
   249 # from urllib and urllib2 in order to insert the FTP REST command.
       
   250 # Code modifications for range support have been commented as
       
   251 # follows:
       
   252 # -- range support modifications start/end here
       
   253 
       
   254 class FTPRangeHandler(urlreq.ftphandler):
       
   255     def ftp_open(self, req):
       
   256         host = urllibcompat.gethost(req)
       
   257         if not host:
       
   258             raise IOError('ftp error', 'no host given')
       
   259         host, port = splitport(host)
       
   260         if port is None:
       
   261             port = ftplib.FTP_PORT
       
   262         else:
       
   263             port = int(port)
       
   264 
       
   265         # username/password handling
       
   266         user, host = splituser(host)
       
   267         if user:
       
   268             user, passwd = splitpasswd(user)
       
   269         else:
       
   270             passwd = None
       
   271         host = unquote(host)
       
   272         user = unquote(user or '')
       
   273         passwd = unquote(passwd or '')
       
   274 
       
   275         try:
       
   276             host = socket.gethostbyname(host)
       
   277         except socket.error as msg:
       
   278             raise urlerr.urlerror(msg)
       
   279         path, attrs = splitattr(req.get_selector())
       
   280         dirs = path.split('/')
       
   281         dirs = map(unquote, dirs)
       
   282         dirs, file = dirs[:-1], dirs[-1]
       
   283         if dirs and not dirs[0]:
       
   284             dirs = dirs[1:]
       
   285         try:
       
   286             fw = self.connect_ftp(user, passwd, host, port, dirs)
       
   287             if file:
       
   288                 type = 'I'
       
   289             else:
       
   290                 type = 'D'
       
   291 
       
   292             for attr in attrs:
       
   293                 attr, value = splitattr(attr)
       
   294                 if attr.lower() == 'type' and \
       
   295                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
       
   296                     type = value.upper()
       
   297 
       
   298             # -- range support modifications start here
       
   299             rest = None
       
   300             range_tup = range_header_to_tuple(req.headers.get('Range', None))
       
   301             assert range_tup != ()
       
   302             if range_tup:
       
   303                 (fb, lb) = range_tup
       
   304                 if fb > 0:
       
   305                     rest = fb
       
   306             # -- range support modifications end here
       
   307 
       
   308             fp, retrlen = fw.retrfile(file, type, rest)
       
   309 
       
   310             # -- range support modifications start here
       
   311             if range_tup:
       
   312                 (fb, lb) = range_tup
       
   313                 if lb == '':
       
   314                     if retrlen is None or retrlen == 0:
       
   315                         raise RangeError('Requested Range Not Satisfiable due'
       
   316                                          ' to unobtainable file length.')
       
   317                     lb = retrlen
       
   318                     retrlen = lb - fb
       
   319                     if retrlen < 0:
       
   320                         # beginning of range is larger than file
       
   321                         raise RangeError('Requested Range Not Satisfiable')
       
   322                 else:
       
   323                     retrlen = lb - fb
       
   324                     fp = RangeableFileObject(fp, (0, retrlen))
       
   325             # -- range support modifications end here
       
   326 
       
   327             headers = ""
       
   328             mtype = mimetypes.guess_type(req.get_full_url())[0]
       
   329             if mtype:
       
   330                 headers += "Content-Type: %s\n" % mtype
       
   331             if retrlen is not None and retrlen >= 0:
       
   332                 headers += "Content-Length: %d\n" % retrlen
       
   333             headers = email.message_from_string(headers)
       
   334             return addinfourl(fp, headers, req.get_full_url())
       
   335         except ftplib.all_errors as msg:
       
   336             raise IOError('ftp error', msg)
       
   337 
       
   338     def connect_ftp(self, user, passwd, host, port, dirs):
       
   339         fw = ftpwrapper(user, passwd, host, port, dirs)
       
   340         return fw
       
   341 
       
   342 class ftpwrapper(urlreq.ftpwrapper):
       
   343     # range support note:
       
   344     # this ftpwrapper code is copied directly from
       
   345     # urllib. The only enhancement is to add the rest
       
   346     # argument and pass it on to ftp.ntransfercmd
       
   347     def retrfile(self, file, type, rest=None):
       
   348         self.endtransfer()
       
   349         if type in ('d', 'D'):
       
   350             cmd = 'TYPE A'
       
   351             isdir = 1
       
   352         else:
       
   353             cmd = 'TYPE ' + type
       
   354             isdir = 0
       
   355         try:
       
   356             self.ftp.voidcmd(cmd)
       
   357         except ftplib.all_errors:
       
   358             self.init()
       
   359             self.ftp.voidcmd(cmd)
       
   360         conn = None
       
   361         if file and not isdir:
       
   362             # Use nlst to see if the file exists at all
       
   363             try:
       
   364                 self.ftp.nlst(file)
       
   365             except ftplib.error_perm as reason:
       
   366                 raise IOError('ftp error', reason)
       
   367             # Restore the transfer mode!
       
   368             self.ftp.voidcmd(cmd)
       
   369             # Try to retrieve as a file
       
   370             try:
       
   371                 cmd = 'RETR ' + file
       
   372                 conn = self.ftp.ntransfercmd(cmd, rest)
       
   373             except ftplib.error_perm as reason:
       
   374                 if str(reason).startswith('501'):
       
   375                     # workaround for REST not supported error
       
   376                     fp, retrlen = self.retrfile(file, type)
       
   377                     fp = RangeableFileObject(fp, (rest,''))
       
   378                     return (fp, retrlen)
       
   379                 elif not str(reason).startswith('550'):
       
   380                     raise IOError('ftp error', reason)
       
   381         if not conn:
       
   382             # Set transfer mode to ASCII!
       
   383             self.ftp.voidcmd('TYPE A')
       
   384             # Try a directory listing
       
   385             if file:
       
   386                 cmd = 'LIST ' + file
       
   387             else:
       
   388                 cmd = 'LIST'
       
   389             conn = self.ftp.ntransfercmd(cmd)
       
   390         self.busy = 1
       
   391         # Pass back both a suitably decorated object and a retrieval length
       
   392         return (addclosehook(conn[0].makefile('rb'),
       
   393                             self.endtransfer), conn[1])
       
   394 
       
   395 
       
   396 ####################################################################
       
   397 # Range Tuple Functions
       
   398 # XXX: These range tuple functions might go better in a class.
       
   399 
       
   400 _rangere = None
       
   401 def range_header_to_tuple(range_header):
       
   402     """Get a (firstbyte,lastbyte) tuple from a Range header value.
       
   403 
       
   404     Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
       
   405     function pulls the firstbyte and lastbyte values and returns
       
   406     a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
       
   407     the header value, it is returned as an empty string in the
       
   408     tuple.
       
   409 
       
   410     Return None if range_header is None
       
   411     Return () if range_header does not conform to the range spec
       
   412     pattern.
       
   413 
       
   414     """
       
   415     global _rangere
       
   416     if range_header is None:
       
   417         return None
       
   418     if _rangere is None:
       
   419         _rangere = re.compile(br'^bytes=(\d{1,})-(\d*)')
       
   420     match = _rangere.match(range_header)
       
   421     if match:
       
   422         tup = range_tuple_normalize(match.group(1, 2))
       
   423         if tup and tup[1]:
       
   424             tup = (tup[0], tup[1]+1)
       
   425         return tup
       
   426     return ()
       
   427 
       
   428 def range_tuple_to_header(range_tup):
       
   429     """Convert a range tuple to a Range header value.
       
   430     Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None
       
   431     if no range is needed.
       
   432     """
       
   433     if range_tup is None:
       
   434         return None
       
   435     range_tup = range_tuple_normalize(range_tup)
       
   436     if range_tup:
       
   437         if range_tup[1]:
       
   438             range_tup = (range_tup[0], range_tup[1] - 1)
       
   439         return 'bytes=%s-%s' % range_tup
       
   440 
       
   441 def range_tuple_normalize(range_tup):
       
   442     """Normalize a (first_byte,last_byte) range tuple.
       
   443     Return a tuple whose first element is guaranteed to be an int
       
   444     and whose second element will be '' (meaning: the last byte) or
       
   445     an int. Finally, return None if the normalized tuple == (0,'')
       
   446     as that is equivalent to retrieving the entire file.
       
   447     """
       
   448     if range_tup is None:
       
   449         return None
       
   450     # handle first byte
       
   451     fb = range_tup[0]
       
   452     if fb in (None, ''):
       
   453         fb = 0
       
   454     else:
       
   455         fb = int(fb)
       
   456     # handle last byte
       
   457     try:
       
   458         lb = range_tup[1]
       
   459     except IndexError:
       
   460         lb = ''
       
   461     else:
       
   462         if lb is None:
       
   463             lb = ''
       
   464         elif lb != '':
       
   465             lb = int(lb)
       
   466     # check if range is over the entire file
       
   467     if (fb, lb) == (0, ''):
       
   468         return None
       
   469     # check that the range is valid
       
   470     if lb < fb:
       
   471         raise RangeError('Invalid byte range: %s-%s' % (fb, lb))
       
   472     return (fb, lb)