106 |
106 |
107 The localstr class is used to cache the known UTF-8 encoding of |
107 The localstr class is used to cache the known UTF-8 encoding of |
108 strings next to their local representation to allow lossless |
108 strings next to their local representation to allow lossless |
109 round-trip conversion back to UTF-8. |
109 round-trip conversion back to UTF-8. |
110 |
110 |
111 >>> u = 'foo: \\xc3\\xa4' # utf-8 |
111 >>> u = b'foo: \\xc3\\xa4' # utf-8 |
112 >>> l = tolocal(u) |
112 >>> l = tolocal(u) |
113 >>> l |
113 >>> l |
114 'foo: ?' |
114 'foo: ?' |
115 >>> fromlocal(l) |
115 >>> fromlocal(l) |
116 'foo: \\xc3\\xa4' |
116 'foo: \\xc3\\xa4' |
117 >>> u2 = 'foo: \\xc3\\xa1' |
117 >>> u2 = b'foo: \\xc3\\xa1' |
118 >>> d = { l: 1, tolocal(u2): 2 } |
118 >>> d = { l: 1, tolocal(u2): 2 } |
119 >>> len(d) # no collision |
119 >>> len(d) # no collision |
120 2 |
120 2 |
121 >>> 'foo: ?' in d |
121 >>> b'foo: ?' in d |
122 False |
122 False |
123 >>> l1 = 'foo: \\xe4' # historical latin1 fallback |
123 >>> l1 = b'foo: \\xe4' # historical latin1 fallback |
124 >>> l = tolocal(l1) |
124 >>> l = tolocal(l1) |
125 >>> l |
125 >>> l |
126 'foo: ?' |
126 'foo: ?' |
127 >>> fromlocal(l) # magically in utf-8 |
127 >>> fromlocal(l) # magically in utf-8 |
128 'foo: \\xc3\\xa4' |
128 'foo: \\xc3\\xa4' |
245 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
245 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
246 |
246 |
247 If 'leftside' is True, left side of string 's' is trimmed. |
247 If 'leftside' is True, left side of string 's' is trimmed. |
248 'ellipsis' is always placed at trimmed side. |
248 'ellipsis' is always placed at trimmed side. |
249 |
249 |
250 >>> ellipsis = '+++' |
250 >>> ellipsis = b'+++' |
251 >>> from . import encoding |
251 >>> from . import encoding |
252 >>> encoding.encoding = 'utf-8' |
252 >>> encoding.encoding = b'utf-8' |
253 >>> t= '1234567890' |
253 >>> t = b'1234567890' |
254 >>> print trim(t, 12, ellipsis=ellipsis) |
254 >>> print trim(t, 12, ellipsis=ellipsis) |
255 1234567890 |
255 1234567890 |
256 >>> print trim(t, 10, ellipsis=ellipsis) |
256 >>> print trim(t, 10, ellipsis=ellipsis) |
257 1234567890 |
257 1234567890 |
258 >>> print trim(t, 8, ellipsis=ellipsis) |
258 >>> print trim(t, 8, ellipsis=ellipsis) |
283 \xe3\x81\x88\xe3\x81\x8a |
283 \xe3\x81\x88\xe3\x81\x8a |
284 >>> print trim(t, 4, ellipsis=ellipsis) |
284 >>> print trim(t, 4, ellipsis=ellipsis) |
285 +++ |
285 +++ |
286 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True) |
286 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True) |
287 +++ |
287 +++ |
288 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence |
288 >>> t = b'\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence |
289 >>> print trim(t, 12, ellipsis=ellipsis) |
289 >>> print trim(t, 12, ellipsis=ellipsis) |
290 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
290 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
291 >>> print trim(t, 10, ellipsis=ellipsis) |
291 >>> print trim(t, 10, ellipsis=ellipsis) |
292 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
292 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
293 >>> print trim(t, 8, ellipsis=ellipsis) |
293 >>> print trim(t, 8, ellipsis=ellipsis) |
404 - other strings are converted to UTF-8b surrogate encoding |
404 - other strings are converted to UTF-8b surrogate encoding |
405 - apply JSON-specified string escaping |
405 - apply JSON-specified string escaping |
406 |
406 |
407 (escapes are doubled in these tests) |
407 (escapes are doubled in these tests) |
408 |
408 |
409 >>> jsonescape('this is a test') |
409 >>> jsonescape(b'this is a test') |
410 'this is a test' |
410 'this is a test' |
411 >>> jsonescape('escape characters: \\0 \\x0b \\x7f') |
411 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f') |
412 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
412 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
413 >>> jsonescape('escape characters: \\b \\t \\n \\f \\r \\" \\\\') |
413 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\') |
414 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' |
414 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' |
415 >>> jsonescape('a weird byte: \\xdd') |
415 >>> jsonescape(b'a weird byte: \\xdd') |
416 'a weird byte: \\xed\\xb3\\x9d' |
416 'a weird byte: \\xed\\xb3\\x9d' |
417 >>> jsonescape('utf-8: caf\\xc3\\xa9') |
417 >>> jsonescape(b'utf-8: caf\\xc3\\xa9') |
418 'utf-8: caf\\xc3\\xa9' |
418 'utf-8: caf\\xc3\\xa9' |
419 >>> jsonescape('') |
419 >>> jsonescape(b'') |
420 '' |
420 '' |
421 |
421 |
422 If paranoid, non-ascii and common troublesome characters are also escaped. |
422 If paranoid, non-ascii and common troublesome characters are also escaped. |
423 This is suitable for web output. |
423 This is suitable for web output. |
424 |
424 |
425 >>> s = 'escape characters: \\0 \\x0b \\x7f' |
425 >>> s = b'escape characters: \\0 \\x0b \\x7f' |
426 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
426 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
427 >>> s = 'escape characters: \\b \\t \\n \\f \\r \\" \\\\' |
427 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\' |
428 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
428 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
429 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
429 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
430 'escape boundary: ~ \\\\u007f \\\\u0080' |
430 'escape boundary: ~ \\\\u007f \\\\u0080' |
431 >>> jsonescape('a weird byte: \\xdd', paranoid=True) |
431 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True) |
432 'a weird byte: \\\\udcdd' |
432 'a weird byte: \\\\udcdd' |
433 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True) |
433 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True) |
434 'utf-8: caf\\\\u00e9' |
434 'utf-8: caf\\\\u00e9' |
435 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
435 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
436 'non-BMP: \\\\ud834\\\\udd1e' |
436 'non-BMP: \\\\ud834\\\\udd1e' |
437 >>> jsonescape('<foo@example.org>', paranoid=True) |
437 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
438 '\\\\u003cfoo@example.org\\\\u003e' |
438 '\\\\u003cfoo@example.org\\\\u003e' |
439 ''' |
439 ''' |
440 |
440 |
441 u8chars = toutf8b(s) |
441 u8chars = toutf8b(s) |
442 try: |
442 try: |
529 return the original binary string. This |
529 return the original binary string. This |
530 is a round-trip process for strings like filenames, but metadata |
530 is a round-trip process for strings like filenames, but metadata |
531 that was passed through tolocal will remain in UTF-8. |
531 that was passed through tolocal will remain in UTF-8. |
532 |
532 |
533 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x |
533 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x |
534 >>> m = "\\xc3\\xa9\\x99abcd" |
534 >>> m = b"\\xc3\\xa9\\x99abcd" |
535 >>> toutf8b(m) |
535 >>> toutf8b(m) |
536 '\\xc3\\xa9\\xed\\xb2\\x99abcd' |
536 '\\xc3\\xa9\\xed\\xb2\\x99abcd' |
537 >>> roundtrip(m) |
537 >>> roundtrip(m) |
538 True |
538 True |
539 >>> roundtrip("\\xc2\\xc2\\x80") |
539 >>> roundtrip(b"\\xc2\\xc2\\x80") |
540 True |
540 True |
541 >>> roundtrip("\\xef\\xbf\\xbd") |
541 >>> roundtrip(b"\\xef\\xbf\\xbd") |
542 True |
542 True |
543 >>> roundtrip("\\xef\\xef\\xbf\\xbd") |
543 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") |
544 True |
544 True |
545 >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80") |
545 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") |
546 True |
546 True |
547 ''' |
547 ''' |
548 |
548 |
549 if isasciistr(s): |
549 if isasciistr(s): |
550 return s |
550 return s |