# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <http://www.gnu.org/licenses/>.

# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

# $Id: byterange.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $
|
19 |
|
20 from __future__ import absolute_import |
|
21 |
|
22 import email |
|
23 import ftplib |
|
24 import mimetypes |
|
25 import os |
|
26 import re |
|
27 import socket |
|
28 import stat |
|
29 |
|
30 from . import ( |
|
31 urllibcompat, |
|
32 util, |
|
33 ) |
|
34 |
|
# Compatibility namespaces re-exported by the local ``util`` module.
urlreq = util.urlreq
urlerr = util.urlerr

# Module-level shortcuts for the ``urlreq`` helpers used by the FTP code
# further down in this file.
unquote = urlreq.unquote
splituser = urlreq.splituser
splitport = urlreq.splitport
splitpasswd = urlreq.splitpasswd
splitattr = urlreq.splitattr
addinfourl = urlreq.addinfourl
addclosehook = urlreq.addclosehook
|
45 |
|
class RangeError(IOError):
    """Signal that a requested byte range cannot be satisfied.

    Derives from IOError, so callers that already handle IOError will
    also catch this exception.
    """
|
48 |
|
class HTTPRangeHandler(urlreq.basehandler):
    """Handler that enables HTTP Range headers.

    The Range header is an HTTP feature to begin with, so all this
    class does is tell the opener that a "206 Partial Content" response
    from the HTTP server is what we expected, and turn a
    "416 Range Not Satisfiable" response into a RangeError.

    Example:
        import byterange

        range_handler = byterange.HTTPRangeHandler()
        opener = urlreq.buildopener(range_handler)

        # install it
        urlreq.installopener(opener)

        # create Request and set Range header
        req = urlreq.request('http://www.python.org/')
        req.headers['Range'] = 'bytes=30-50'
        f = urlreq.urlopen(req)
    """

    def http_error_206(self, req, fp, code, msg, hdrs):
        """Return the 206 Partial Content response as a normal result.

        The response body ``fp`` and headers ``hdrs`` are wrapped in an
        addinfourl object carrying the request's full URL; ``code`` and
        ``msg`` are attached to it unchanged.
        """
        r = urlreq.addinfourl(fp, hdrs, req.get_full_url())
        r.code = code
        r.msg = msg
        return r

    def http_error_416(self, req, fp, code, msg, hdrs):
        """Raise RangeError for HTTP's Range Not Satisfiable error."""
        raise RangeError('Requested Range Not Satisfiable')
|
83 |
|
class RangeableFileObject(object):
    """File object wrapper to enable raw range handling.

    This was implemented primarily for handling range specifications
    for file:// urls. This object effectively makes a file object look
    like it consists only of a range of bytes in the stream.

    Examples:
        # expose 10 bytes, starting at byte position 20, from
        # /etc/aliases.
        >>> fo = RangeableFileObject(open('/etc/aliases', 'rb'), (20, 30))
        # seek seeks within the range (to position 23 in this case)
        >>> fo.seek(3)
        # tell tells where you're at _within the range_ (position 3 in
        # this case)
        >>> fo.tell()
        # read EOFs if an attempt is made to read past the last
        # byte in the range. the following will return only 7 bytes.
        >>> fo.read(30)
    """

    def __init__(self, fo, rangetup):
        """Create a RangeableFileObject.

        fo       -- a file like object. only the read() method need be
                    supported but supporting an optimized seek() is
                    preferable.
        rangetup -- a (firstbyte,lastbyte) tuple specifying the range
                    to work over.

        The file object provided is assumed to be at byte offset 0.
        Raises RangeError if the range is invalid or, for objects
        without seek(), if EOF is hit before reaching firstbyte.
        """
        self.fo = fo
        # range_tuple_normalize() returns None for a range that covers the
        # whole file; treat that as an unbounded range starting at 0
        # instead of failing to unpack None.
        normalized = range_tuple_normalize(rangetup) or (0, '')
        (self.firstbyte, self.lastbyte) = normalized
        self.realpos = 0
        self._do_seek(self.firstbyte)

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.

        Any attribute not found in _this_ object will be searched for
        in self.fo. This includes methods.
        """
        # Without this guard, looking up a missing 'fo' attribute (e.g. on
        # an instance created without running __init__) would re-enter
        # __getattr__ forever.
        if name == 'fo':
            raise AttributeError(name)
        return getattr(self.fo, name)

    def tell(self):
        """Return the position within the range.

        This is different from fo.tell in that position 0 is the
        first byte position of the range tuple. For example, if
        this object was created with a range tuple of (500,899),
        tell() will return 0 when at byte position 500 of the file.
        """
        return (self.realpos - self.firstbyte)

    def seek(self, offset, whence=0):
        """Seek within the byte range.

        Positioning is identical to that described under tell().
        Only forward movement is supported (see _do_seek), and seeking
        relative to the end of the file (whence == 2) raises IOError.
        """
        assert whence in (0, 1, 2)
        if whence == 0:   # absolute seek
            realoffset = self.firstbyte + offset
        elif whence == 1:   # relative seek
            realoffset = self.realpos + offset
        elif whence == 2:   # absolute from end of file
            # XXX: are we raising the right Error here?
            raise IOError('seek from end of file not supported.')

        # do not allow seek past lastbyte in range
        if self.lastbyte and (realoffset >= self.lastbyte):
            realoffset = self.lastbyte

        self._do_seek(realoffset - self.realpos)

    def read(self, size=-1):
        """Read within the range.

        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.read(size)
        self.realpos += len(rslt)
        return rslt

    def readline(self, size=-1):
        """Read lines within the range.

        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.readline(size)
        self.realpos += len(rslt)
        return rslt

    def _calc_read_size(self, size):
        """Handles calculating the amount of data to read based on
        the range.

        Returns ``size`` unchanged when the range has no upper bound;
        otherwise caps it at the number of bytes left before lastbyte.
        """
        if self.lastbyte:
            if size > -1:
                if ((self.realpos + size) >= self.lastbyte):
                    size = (self.lastbyte - self.realpos)
            else:
                size = (self.lastbyte - self.realpos)
        return size

    def _do_seek(self, offset):
        """Seek based on whether wrapped object supports seek().

        offset is relative to the current position (self.realpos).
        """
        assert offset >= 0
        seek = getattr(self.fo, 'seek', None)
        if seek is None:
            # _poor_mans_seek() consumes exactly the number of bytes it is
            # given, so it must receive the relative offset, not the
            # absolute target position.
            self._poor_mans_seek(offset)
        else:
            seek(self.realpos + offset)
        self.realpos += offset

    def _poor_mans_seek(self, offset):
        """Seek by calling the wrapped file objects read() method.

        This is used for file like objects that do not have native
        seek support. The wrapped objects read() method is called
        to manually seek to the desired position.

        offset -- read this number of bytes from the wrapped
                  file object.

        raise RangeError if we encounter EOF before reaching the
        specified offset.
        """
        pos = 0
        bufsize = 1024
        while pos < offset:
            if (pos + bufsize) > offset:
                bufsize = offset - pos
            buf = self.fo.read(bufsize)
            if len(buf) != bufsize:
                raise RangeError('Requested Range Not Satisfiable')
            pos += bufsize
|
211 |
|
class FileRangeHandler(urlreq.filehandler):
    """FileHandler subclass that adds Range support.

    This class handles Range headers exactly like an HTTP
    server would.
    """
    def open_local_file(self, req):
        """Open the local file named by ``req``, honouring a Range header.

        Returns an addinfourl object whose headers carry Content-Type,
        Content-Length (the length of the selected range) and
        Last-Modified. Raises RangeError when the requested range does
        not fit inside the file, and a urlerror when the request names
        a host that is not the local machine.
        """
        # email.utils is the spelling available on both Python 2.7 and 3;
        # it is imported here because a plain "import email" does not
        # guarantee the submodule is loaded.
        from email.utils import formatdate

        host = urllibcompat.gethost(req)
        selector = urllibcompat.getselector(req)
        localfile = urlreq.url2pathname(selector)
        stats = os.stat(localfile)
        size = stats[stat.ST_SIZE]
        modified = formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(selector)[0]
        if host:
            host, port = urlreq.splitport(host)
            if port or socket.gethostbyname(host) not in self.get_names():
                raise urlerr.urlerror('file not on local host')

        # Validate the range before opening the file so that a rejected
        # request does not leave an open file handle behind.
        brange = range_header_to_tuple(req.headers.get('Range', None))
        assert brange != ()
        if brange:
            (fb, lb) = brange
            if lb == '':
                lb = size
            if fb < 0 or fb > size or lb > size:
                raise RangeError('Requested Range Not Satisfiable')
            size = (lb - fb)

        fo = open(localfile, 'rb')
        if brange:
            rawfo = fo
            try:
                fo = RangeableFileObject(rawfo, (fb, lb))
            except Exception:
                # do not leak the handle if positioning the range fails
                rawfo.close()
                raise
        headers = email.message_from_string(
            'Content-Type: %s\nContent-Length: %d\nLast-Modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        return urlreq.addinfourl(fo, headers, 'file:' + selector)
|
245 |
|
246 |
|
# FTP Range Support
# Unfortunately, a large amount of base FTP code had to be copied
# from urllib and urllib2 in order to insert the FTP REST command.
# Code modifications for range support have been commented as
# follows:
# -- range support modifications start/end here
|
253 |
|
class FTPRangeHandler(urlreq.ftphandler):
    """FTP handler that adds Range support via the FTP REST command."""

    def ftp_open(self, req):
        """Open the FTP URL in ``req``, honouring a Range header.

        Returns an addinfourl object wrapping the data connection, with
        Content-Type and Content-Length headers when they can be
        determined. Raises RangeError when the range cannot be
        satisfied, a urlerror when the host cannot be resolved, and
        IOError('ftp error', ...) for FTP-level failures.
        """
        host = urllibcompat.gethost(req)
        if not host:
            raise IOError('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error as msg:
            raise urlerr.urlerror(msg)
        # Same selector accessor as FileRangeHandler uses.
        path, attrs = splitattr(urllibcompat.getselector(req))
        # Build a real list: the result is sliced below, which a lazy
        # map() object does not support.
        dirs = [unquote(part) for part in path.split('/')]
        dirs, filename = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            if filename:
                ftype = 'I'
            else:
                ftype = 'D'

            for attr in attrs:
                # each attribute has the form "name=value"
                name, _sep, value = attr.partition('=')
                if name.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    ftype = value.upper()

            # -- range support modifications start here
            rest = None
            range_tup = range_header_to_tuple(req.headers.get('Range', None))
            assert range_tup != ()
            if range_tup:
                (fb, lb) = range_tup
                if fb > 0:
                    rest = fb
            # -- range support modifications end here

            fp, retrlen = fw.retrfile(filename, ftype, rest)

            # -- range support modifications start here
            if range_tup:
                (fb, lb) = range_tup
                if lb == '':
                    if retrlen is None or retrlen == 0:
                        raise RangeError('Requested Range Not Satisfiable due'
                                         ' to unobtainable file length.')
                    lb = retrlen
                    retrlen = lb - fb
                    if retrlen < 0:
                        # beginning of range is larger than file
                        raise RangeError('Requested Range Not Satisfiable')
                else:
                    retrlen = lb - fb
                    fp = RangeableFileObject(fp, (0, retrlen))
            # -- range support modifications end here

            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = email.message_from_string(headers)
            return addinfourl(fp, headers, req.get_full_url())
        except RangeError:
            # RangeError is an IOError and would otherwise be swallowed by
            # the ftplib.all_errors clause below and lose its type.
            raise
        except ftplib.all_errors as msg:
            raise IOError('ftp error', msg)

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a range-aware ftpwrapper for the given connection data."""
        fw = ftpwrapper(user, passwd, host, port, dirs)
        return fw
|
341 |
|
class ftpwrapper(urlreq.ftpwrapper):
    # range support note:
    # this ftpwrapper code is copied directly from
    # urllib. The only enhancement is to add the rest
    # argument and pass it on to ftp.ntransfercmd
    def retrfile(self, file, type, rest=None):
        """Start retrieving ``file`` and return (fileobj, length).

        file -- name of the file (or directory) to fetch; empty for a
                listing of the current directory.
        type -- transfer type letter; 'd'/'D' requests a directory
                listing, anything else is sent as 'TYPE <type>'.
        rest -- optional restart offset handed to ntransfercmd().

        The second element of the result is the length value returned
        by ntransfercmd(). Raises IOError('ftp error', reason) on
        permanent FTP errors other than 550.
        """
        self.endtransfer()
        if type in ('d', 'D'):
            cmd = 'TYPE A'
            isdir = 1
        else:
            cmd = 'TYPE ' + type
            isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # re-initialise the connection and retry once
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm as reason:
                raise IOError('ftp error', reason)
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd, rest)
            except ftplib.error_perm as reason:
                message = str(reason)
                if rest is not None and message.startswith('501'):
                    # workaround for REST not supported error: fetch from
                    # the start and skip the first ``rest`` bytes locally.
                    # Only meaningful when a restart offset was requested;
                    # without one, retrying would repeat the identical
                    # command and recurse without end.
                    fp, retrlen = self.retrfile(file, type)
                    fp = RangeableFileObject(fp, (rest, ''))
                    return (fp, retrlen)
                elif not message.startswith('550'):
                    raise IOError('ftp error', reason)
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file:
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])
|
394 |
|
395 |
|
####################################################################
# Range Tuple Functions
# XXX: These range tuple functions might go better in a class.
|
399 |
|
_rangere = None


def range_header_to_tuple(range_header):
    """Get a (firstbyte,lastbyte) tuple from a Range header value.

    Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
    function pulls the firstbyte and lastbyte values and returns
    a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
    the header value, it is returned as an empty string in the
    tuple. A specified lastbyte is returned incremented by one, i.e.
    as an exclusive end position.

    The header may be given as bytes or as text.

    Return None if range_header is None
    Return () if range_header does not conform to the range spec
    pattern.
    """
    global _rangere
    if range_header is None:
        return None
    if _rangere is None:
        _rangere = re.compile(br'^bytes=(\d{1,})-(\d*)')
    if not isinstance(range_header, bytes):
        # The pattern is a bytes pattern; a text header has to be encoded
        # first. Anything that is not plain ASCII cannot match anyway.
        try:
            range_header = range_header.encode('ascii')
        except UnicodeError:
            return ()
    match = _rangere.match(range_header)
    if match is None:
        return ()
    first, last = match.group(1, 2)
    # Convert the matched digits here so the normalizer only ever sees
    # ints and '' (never an empty bytes string).
    tup = range_tuple_normalize((int(first), int(last) if last else ''))
    # The header's last byte is inclusive; the tuple's end is exclusive.
    # Compare against '' rather than testing truthiness so that
    # "bytes=0-0" (the first byte only) is converted as well.
    if tup and tup[1] != '':
        tup = (tup[0], tup[1] + 1)
    return tup
|
427 |
|
def range_tuple_to_header(range_tup):
    """Build a Range header value from a range tuple.

    The result has the form "bytes=<firstbyte>-<lastbyte>". None is
    returned when no header is needed, i.e. when range_tup is None or
    normalizes to a whole-file range.
    """
    if range_tup is None:
        return None
    normalized = range_tuple_normalize(range_tup)
    if not normalized:
        return None
    first, last = normalized
    if last:
        # the tuple's end position is one past the header's last byte
        last = last - 1
    return 'bytes=%s-%s' % (first, last)
|
440 |
|
def range_tuple_normalize(range_tup):
    """Normalize a (first_byte,last_byte) range tuple.

    Return a tuple whose first element is guaranteed to be an int
    and whose second element will be '' (meaning: the last byte) or
    an int. Finally, return None if the normalized tuple == (0,'')
    as that is equivalent to retrieving the entire file.

    Elements may be ints, digit strings (text or bytes), None or an
    empty string; the second element may also be missing entirely.
    Raise RangeError if the last byte is smaller than the first byte.
    """
    if range_tup is None:
        return None
    # handle first byte
    fb = range_tup[0]
    if fb in (None, '', b''):
        fb = 0
    else:
        fb = int(fb)
    # handle last byte
    try:
        lb = range_tup[1]
    except IndexError:
        lb = ''
    else:
        if lb in (None, '', b''):
            lb = ''
        else:
            lb = int(lb)
    if lb == '':
        # Open-ended range: there is no upper bound to validate, and
        # comparing '' with an int is an error on Python 3. A range that
        # starts at 0 covers the entire file.
        if fb == 0:
            return None
        return (fb, lb)
    # check that the range is valid
    if lb < fb:
        raise RangeError('Invalid byte range: %s-%s' % (fb, lb))
    return (fb, lb)
|