mercurial/encoding.py
changeset 45942 89a2afe31e82
parent 45681 a736ab681b78
child 46319 3dfebba99ef6
equal deleted inserted replaced
45941:346af7687c6f 45942:89a2afe31e82
   111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
   111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
   112 fallbackencoding = b'ISO-8859-1'
   112 fallbackencoding = b'ISO-8859-1'
   113 
   113 
   114 
   114 
   115 class localstr(bytes):
   115 class localstr(bytes):
   116     '''This class allows strings that are unmodified to be
   116     """This class allows strings that are unmodified to be
   117     round-tripped to the local encoding and back'''
   117     round-tripped to the local encoding and back"""
   118 
   118 
   119     def __new__(cls, u, l):
   119     def __new__(cls, u, l):
   120         s = bytes.__new__(cls, l)
   120         s = bytes.__new__(cls, l)
   121         s._utf8 = u
   121         s._utf8 = u
   122         return s
   122         return s
   327     return len(d)
   327     return len(d)
   328 
   328 
   329 
   329 
   330 def getcols(s, start, c):
   330 def getcols(s, start, c):
   331     # type: (bytes, int, int) -> bytes
   331     # type: (bytes, int, int) -> bytes
   332     '''Use colwidth to find a c-column substring of s starting at byte
   332     """Use colwidth to find a c-column substring of s starting at byte
   333     index start'''
   333     index start"""
   334     for x in pycompat.xrange(start + c, len(s)):
   334     for x in pycompat.xrange(start + c, len(s)):
   335         t = s[start:x]
   335         t = s[start:x]
   336         if colwidth(t) == c:
   336         if colwidth(t) == c:
   337             return t
   337             return t
   338     raise ValueError('substring not found')
   338     raise ValueError('substring not found')
   485     except LookupError as k:
   485     except LookupError as k:
   486         raise error.Abort(k, hint=b"please check your locale settings")
   486         raise error.Abort(k, hint=b"please check your locale settings")
   487 
   487 
   488 
   488 
   489 class normcasespecs(object):
   489 class normcasespecs(object):
   490     '''what a platform's normcase does to ASCII strings
   490     """what a platform's normcase does to ASCII strings
   491 
   491 
   492     This is specified per platform, and should be consistent with what normcase
   492     This is specified per platform, and should be consistent with what normcase
   493     on that platform actually does.
   493     on that platform actually does.
   494 
   494 
   495     lower: normcase lowercases ASCII strings
   495     lower: normcase lowercases ASCII strings
   496     upper: normcase uppercases ASCII strings
   496     upper: normcase uppercases ASCII strings
   497     other: the fallback function should always be called
   497     other: the fallback function should always be called
   498 
   498 
   499     This should be kept in sync with normcase_spec in util.h.'''
   499     This should be kept in sync with normcase_spec in util.h."""
   500 
   500 
   501     lower = -1
   501     lower = -1
   502     upper = 1
   502     upper = 1
   503     other = 0
   503     other = 0
   504 
   504 
   505 
   505 
   506 def jsonescape(s, paranoid=False):
   506 def jsonescape(s, paranoid=False):
   507     # type: (Any, Any) -> Any
   507     # type: (Any, Any) -> Any
   508     '''returns a string suitable for JSON
   508     """returns a string suitable for JSON
   509 
   509 
   510     JSON is problematic for us because it doesn't support non-Unicode
   510     JSON is problematic for us because it doesn't support non-Unicode
   511     bytes. To deal with this, we take the following approach:
   511     bytes. To deal with this, we take the following approach:
   512 
   512 
   513     - localstr/safelocalstr objects are converted back to UTF-8
   513     - localstr/safelocalstr objects are converted back to UTF-8
   545     'utf-8: caf\\\\u00e9'
   545     'utf-8: caf\\\\u00e9'
   546     >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
   546     >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
   547     'non-BMP: \\\\ud834\\\\udd1e'
   547     'non-BMP: \\\\ud834\\\\udd1e'
   548     >>> jsonescape(b'<foo@example.org>', paranoid=True)
   548     >>> jsonescape(b'<foo@example.org>', paranoid=True)
   549     '\\\\u003cfoo@example.org\\\\u003e'
   549     '\\\\u003cfoo@example.org\\\\u003e'
   550     '''
   550     """
   551 
   551 
   552     u8chars = toutf8b(s)
   552     u8chars = toutf8b(s)
   553     try:
   553     try:
   554         return _jsonescapeu8fast(u8chars, paranoid)
   554         return _jsonescapeu8fast(u8chars, paranoid)
   555     except ValueError:
   555     except ValueError:
   567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   568 
   568 
   569 
   569 
   570 def getutf8char(s, pos):
   570 def getutf8char(s, pos):
   571     # type: (bytes, int) -> bytes
   571     # type: (bytes, int) -> bytes
   572     '''get the next full utf-8 character in the given string, starting at pos
   572     """get the next full utf-8 character in the given string, starting at pos
   573 
   573 
   574     Raises a UnicodeError if the given location does not start a valid
   574     Raises a UnicodeError if the given location does not start a valid
   575     utf-8 character.
   575     utf-8 character.
   576     '''
   576     """
   577 
   577 
   578     # find how many bytes to attempt decoding from first nibble
   578     # find how many bytes to attempt decoding from first nibble
   579     l = _utf8len[ord(s[pos : pos + 1]) >> 4]
   579     l = _utf8len[ord(s[pos : pos + 1]) >> 4]
   580     if not l:  # ascii
   580     if not l:  # ascii
   581         return s[pos : pos + 1]
   581         return s[pos : pos + 1]
   586     return c
   586     return c
   587 
   587 
   588 
   588 
   589 def toutf8b(s):
   589 def toutf8b(s):
   590     # type: (bytes) -> bytes
   590     # type: (bytes) -> bytes
   591     '''convert a local, possibly-binary string into UTF-8b
   591     """convert a local, possibly-binary string into UTF-8b
   592 
   592 
   593     This is intended as a generic method to preserve data when working
   593     This is intended as a generic method to preserve data when working
   594     with schemes like JSON and XML that have no provision for
   594     with schemes like JSON and XML that have no provision for
   595     arbitrary byte strings. As Mercurial often doesn't know
   595     arbitrary byte strings. As Mercurial often doesn't know
   596     what encoding data is in, we use so-called UTF-8b.
   596     what encoding data is in, we use so-called UTF-8b.
   614 
   614 
   615     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
   615     (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
   616     arbitrary bytes into an internal Unicode format that can be
   616     arbitrary bytes into an internal Unicode format that can be
   617     re-encoded back into the original. Here we are exposing the
   617     re-encoded back into the original. Here we are exposing the
   618     internal surrogate encoding as a UTF-8 string.)
   618     internal surrogate encoding as a UTF-8 string.)
   619     '''
   619     """
   620 
   620 
   621     if isinstance(s, localstr):
   621     if isinstance(s, localstr):
   622         # assume that the original UTF-8 sequence would never contain
   622         # assume that the original UTF-8 sequence would never contain
   623         # invalid characters in U+DCxx range
   623         # invalid characters in U+DCxx range
   624         return s._utf8
   624         return s._utf8
   655     return r
   655     return r
   656 
   656 
   657 
   657 
   658 def fromutf8b(s):
   658 def fromutf8b(s):
   659     # type: (bytes) -> bytes
   659     # type: (bytes) -> bytes
   660     '''Given a UTF-8b string, return a local, possibly-binary string.
   660     """Given a UTF-8b string, return a local, possibly-binary string.
   661 
   661 
   662     return the original binary string. This
   662     return the original binary string. This
   663     is a round-trip process for strings like filenames, but metadata
   663     is a round-trip process for strings like filenames, but metadata
   664     that's was passed through tolocal will remain in UTF-8.
   664     that's was passed through tolocal will remain in UTF-8.
   665 
   665 
   675     True
   675     True
   676     >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
   676     >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
   677     True
   677     True
   678     >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
   678     >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
   679     True
   679     True
   680     '''
   680     """
   681 
   681 
   682     if isasciistr(s):
   682     if isasciistr(s):
   683         return s
   683         return s
   684     # fast path - look for uDxxx prefixes in s
   684     # fast path - look for uDxxx prefixes in s
   685     if b"\xed" not in s:
   685     if b"\xed" not in s: