mercurial/encoding.py
changeset 34131 0fa781320203
parent 33928 6c119dbfd0c0
child 34135 e9e225f16932
equal deleted inserted replaced
34130:ada8a19672ab 34131:0fa781320203
   106 
   106 
   107     The localstr class is used to cache the known UTF-8 encoding of
   107     The localstr class is used to cache the known UTF-8 encoding of
   108     strings next to their local representation to allow lossless
   108     strings next to their local representation to allow lossless
   109     round-trip conversion back to UTF-8.
   109     round-trip conversion back to UTF-8.
   110 
   110 
   111     >>> u = 'foo: \\xc3\\xa4' # utf-8
   111     >>> u = b'foo: \\xc3\\xa4' # utf-8
   112     >>> l = tolocal(u)
   112     >>> l = tolocal(u)
   113     >>> l
   113     >>> l
   114     'foo: ?'
   114     'foo: ?'
   115     >>> fromlocal(l)
   115     >>> fromlocal(l)
   116     'foo: \\xc3\\xa4'
   116     'foo: \\xc3\\xa4'
   117     >>> u2 = 'foo: \\xc3\\xa1'
   117     >>> u2 = b'foo: \\xc3\\xa1'
   118     >>> d = { l: 1, tolocal(u2): 2 }
   118     >>> d = { l: 1, tolocal(u2): 2 }
   119     >>> len(d) # no collision
   119     >>> len(d) # no collision
   120     2
   120     2
   121     >>> 'foo: ?' in d
   121     >>> b'foo: ?' in d
   122     False
   122     False
   123     >>> l1 = 'foo: \\xe4' # historical latin1 fallback
   123     >>> l1 = b'foo: \\xe4' # historical latin1 fallback
   124     >>> l = tolocal(l1)
   124     >>> l = tolocal(l1)
   125     >>> l
   125     >>> l
   126     'foo: ?'
   126     'foo: ?'
   127     >>> fromlocal(l) # magically in utf-8
   127     >>> fromlocal(l) # magically in utf-8
   128     'foo: \\xc3\\xa4'
   128     'foo: \\xc3\\xa4'
   245     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   245     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   246 
   246 
   247     If 'leftside' is True, left side of string 's' is trimmed.
   247     If 'leftside' is True, left side of string 's' is trimmed.
   248     'ellipsis' is always placed at trimmed side.
   248     'ellipsis' is always placed at trimmed side.
   249 
   249 
   250     >>> ellipsis = '+++'
   250     >>> ellipsis = b'+++'
   251     >>> from . import encoding
   251     >>> from . import encoding
   252     >>> encoding.encoding = 'utf-8'
   252     >>> encoding.encoding = b'utf-8'
   253     >>> t= '1234567890'
   253     >>> t = b'1234567890'
   254     >>> print trim(t, 12, ellipsis=ellipsis)
   254     >>> print trim(t, 12, ellipsis=ellipsis)
   255     1234567890
   255     1234567890
   256     >>> print trim(t, 10, ellipsis=ellipsis)
   256     >>> print trim(t, 10, ellipsis=ellipsis)
   257     1234567890
   257     1234567890
   258     >>> print trim(t, 8, ellipsis=ellipsis)
   258     >>> print trim(t, 8, ellipsis=ellipsis)
   283     \xe3\x81\x88\xe3\x81\x8a
   283     \xe3\x81\x88\xe3\x81\x8a
   284     >>> print trim(t, 4, ellipsis=ellipsis)
   284     >>> print trim(t, 4, ellipsis=ellipsis)
   285     +++
   285     +++
   286     >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
   286     >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
   287     +++
   287     +++
   288     >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
   288     >>> t = b'\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
   289     >>> print trim(t, 12, ellipsis=ellipsis)
   289     >>> print trim(t, 12, ellipsis=ellipsis)
   290     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
   290     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
   291     >>> print trim(t, 10, ellipsis=ellipsis)
   291     >>> print trim(t, 10, ellipsis=ellipsis)
   292     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
   292     \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
   293     >>> print trim(t, 8, ellipsis=ellipsis)
   293     >>> print trim(t, 8, ellipsis=ellipsis)
   404     - other strings are converted to UTF-8b surrogate encoding
   404     - other strings are converted to UTF-8b surrogate encoding
   405     - apply JSON-specified string escaping
   405     - apply JSON-specified string escaping
   406 
   406 
   407     (escapes are doubled in these tests)
   407     (escapes are doubled in these tests)
   408 
   408 
   409     >>> jsonescape('this is a test')
   409     >>> jsonescape(b'this is a test')
   410     'this is a test'
   410     'this is a test'
   411     >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
   411     >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
   412     'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
   412     'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
   413     >>> jsonescape('escape characters: \\b \\t \\n \\f \\r \\" \\\\')
   413     >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
   414     'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
   414     'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
   415     >>> jsonescape('a weird byte: \\xdd')
   415     >>> jsonescape(b'a weird byte: \\xdd')
   416     'a weird byte: \\xed\\xb3\\x9d'
   416     'a weird byte: \\xed\\xb3\\x9d'
   417     >>> jsonescape('utf-8: caf\\xc3\\xa9')
   417     >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
   418     'utf-8: caf\\xc3\\xa9'
   418     'utf-8: caf\\xc3\\xa9'
   419     >>> jsonescape('')
   419     >>> jsonescape(b'')
   420     ''
   420     ''
   421 
   421 
   422     If paranoid, non-ascii and common troublesome characters are also escaped.
   422     If paranoid, non-ascii and common troublesome characters are also escaped.
   423     This is suitable for web output.
   423     This is suitable for web output.
   424 
   424 
   425     >>> s = 'escape characters: \\0 \\x0b \\x7f'
   425     >>> s = b'escape characters: \\0 \\x0b \\x7f'
   426     >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
   426     >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
   427     >>> s = 'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
   427     >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
   428     >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
   428     >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
   429     >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
   429     >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
   430     'escape boundary: ~ \\\\u007f \\\\u0080'
   430     'escape boundary: ~ \\\\u007f \\\\u0080'
   431     >>> jsonescape('a weird byte: \\xdd', paranoid=True)
   431     >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
   432     'a weird byte: \\\\udcdd'
   432     'a weird byte: \\\\udcdd'
   433     >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
   433     >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
   434     'utf-8: caf\\\\u00e9'
   434     'utf-8: caf\\\\u00e9'
   435     >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
   435     >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
   436     'non-BMP: \\\\ud834\\\\udd1e'
   436     'non-BMP: \\\\ud834\\\\udd1e'
   437     >>> jsonescape('<foo@example.org>', paranoid=True)
   437     >>> jsonescape(b'<foo@example.org>', paranoid=True)
   438     '\\\\u003cfoo@example.org\\\\u003e'
   438     '\\\\u003cfoo@example.org\\\\u003e'
   439     '''
   439     '''
   440 
   440 
   441     u8chars = toutf8b(s)
   441     u8chars = toutf8b(s)
   442     try:
   442     try:
   529     return the original binary string. This
   529     return the original binary string. This
   530     is a round-trip process for strings like filenames, but metadata
   530     is a round-trip process for strings like filenames, but metadata
   531     that was passed through tolocal will remain in UTF-8.
   531     that was passed through tolocal will remain in UTF-8.
   532 
   532 
   533     >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
   533     >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
   534     >>> m = "\\xc3\\xa9\\x99abcd"
   534     >>> m = b"\\xc3\\xa9\\x99abcd"
   535     >>> toutf8b(m)
   535     >>> toutf8b(m)
   536     '\\xc3\\xa9\\xed\\xb2\\x99abcd'
   536     '\\xc3\\xa9\\xed\\xb2\\x99abcd'
   537     >>> roundtrip(m)
   537     >>> roundtrip(m)
   538     True
   538     True
   539     >>> roundtrip("\\xc2\\xc2\\x80")
   539     >>> roundtrip(b"\\xc2\\xc2\\x80")
   540     True
   540     True
   541     >>> roundtrip("\\xef\\xbf\\xbd")
   541     >>> roundtrip(b"\\xef\\xbf\\xbd")
   542     True
   542     True
   543     >>> roundtrip("\\xef\\xef\\xbf\\xbd")
   543     >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
   544     True
   544     True
   545     >>> roundtrip("\\xf1\\x80\\x80\\x80\\x80")
   545     >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
   546     True
   546     True
   547     '''
   547     '''
   548 
   548 
   549     if isasciistr(s):
   549     if isasciistr(s):
   550         return s
   550         return s