mercurial/encoding.py
changeset 43077 687b865b95ad
parent 43076 2372284d9457
child 43089 c59eb1560c44
equal deleted inserted replaced
43076:2372284d9457 43077:687b865b95ad
    34 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
    34 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
    35 # "Unicode Subtleties"), so we need to ignore them in some places for
    35 # "Unicode Subtleties"), so we need to ignore them in some places for
    36 # sanity.
    36 # sanity.
    37 _ignore = [
    37 _ignore = [
    38     unichr(int(x, 16)).encode("utf-8")
    38     unichr(int(x, 16)).encode("utf-8")
    39     for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e "
    39     for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    40     "206a 206b 206c 206d 206e 206f feff".split()
    40     b"206a 206b 206c 206d 206e 206f feff".split()
    41 ]
    41 ]
    42 # verify the next function will work
    42 # verify the next function will work
    43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
    43 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
    44 
    44 
    45 
    45 
    46 def hfsignoreclean(s):
    46 def hfsignoreclean(s):
    47     """Remove codepoints ignored by HFS+ from s.
    47     """Remove codepoints ignored by HFS+ from s.
    48 
    48 
    49     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    49     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    50     '.hg'
    50     '.hg'
    51     >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    51     >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    52     '.hg'
    52     '.hg'
    53     """
    53     """
    54     if "\xe2" in s or "\xef" in s:
    54     if b"\xe2" in s or b"\xef" in s:
    55         for c in _ignore:
    55         for c in _ignore:
    56             s = s.replace(c, '')
    56             s = s.replace(c, b'')
    57     return s
    57     return s
    58 
    58 
    59 
    59 
    60 # encoding.environ is provided read-only, which may not be used to modify
    60 # encoding.environ is provided read-only, which may not be used to modify
    61 # the process environment
    61 # the process environment
    71         (k.encode(r'utf-8'), v.encode(r'utf-8'))
    71         (k.encode(r'utf-8'), v.encode(r'utf-8'))
    72         for k, v in os.environ.items()  # re-exports
    72         for k, v in os.environ.items()  # re-exports
    73     )
    73     )
    74 
    74 
    75 _encodingrewrites = {
    75 _encodingrewrites = {
    76     '646': 'ascii',
    76     b'646': b'ascii',
    77     'ANSI_X3.4-1968': 'ascii',
    77     b'ANSI_X3.4-1968': b'ascii',
    78 }
    78 }
    79 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
    79 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
    80 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
    80 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
    81 # https://bugs.python.org/issue13216
    81 # https://bugs.python.org/issue13216
    82 if pycompat.iswindows and not pycompat.ispy3:
    82 if pycompat.iswindows and not pycompat.ispy3:
    83     _encodingrewrites['cp65001'] = 'utf-8'
    83     _encodingrewrites[b'cp65001'] = b'utf-8'
    84 
    84 
    85 try:
    85 try:
    86     encoding = environ.get("HGENCODING")
    86     encoding = environ.get(b"HGENCODING")
    87     if not encoding:
    87     if not encoding:
    88         encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
    88         encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
    89         encoding = _encodingrewrites.get(encoding, encoding)
    89         encoding = _encodingrewrites.get(encoding, encoding)
    90 except locale.Error:
    90 except locale.Error:
    91     encoding = 'ascii'
    91     encoding = b'ascii'
    92 encodingmode = environ.get("HGENCODINGMODE", "strict")
    92 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
    93 fallbackencoding = 'ISO-8859-1'
    93 fallbackencoding = b'ISO-8859-1'
    94 
    94 
    95 
    95 
    96 class localstr(bytes):
    96 class localstr(bytes):
    97     '''This class allows strings that are unmodified to be
    97     '''This class allows strings that are unmodified to be
    98     round-tripped to the local encoding and back'''
    98     round-tripped to the local encoding and back'''
   156 
   156 
   157     try:
   157     try:
   158         try:
   158         try:
   159             # make sure string is actually stored in UTF-8
   159             # make sure string is actually stored in UTF-8
   160             u = s.decode('UTF-8')
   160             u = s.decode('UTF-8')
   161             if encoding == 'UTF-8':
   161             if encoding == b'UTF-8':
   162                 # fast path
   162                 # fast path
   163                 return s
   163                 return s
   164             r = u.encode(_sysstr(encoding), r"replace")
   164             r = u.encode(_sysstr(encoding), r"replace")
   165             if u == r.decode(_sysstr(encoding)):
   165             if u == r.decode(_sysstr(encoding)):
   166                 # r is a safe, non-lossy encoding of s
   166                 # r is a safe, non-lossy encoding of s
   178             except UnicodeDecodeError:
   178             except UnicodeDecodeError:
   179                 u = s.decode("utf-8", "replace")  # last ditch
   179                 u = s.decode("utf-8", "replace")  # last ditch
   180                 # can't round-trip
   180                 # can't round-trip
   181                 return u.encode(_sysstr(encoding), r"replace")
   181                 return u.encode(_sysstr(encoding), r"replace")
   182     except LookupError as k:
   182     except LookupError as k:
   183         raise error.Abort(k, hint="please check your locale settings")
   183         raise error.Abort(k, hint=b"please check your locale settings")
   184 
   184 
   185 
   185 
   186 def fromlocal(s):
   186 def fromlocal(s):
   187     """
   187     """
   188     Convert a string from the local character encoding to UTF-8
   188     Convert a string from the local character encoding to UTF-8
   204         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   204         u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   205         return u.encode("utf-8")
   205         return u.encode("utf-8")
   206     except UnicodeDecodeError as inst:
   206     except UnicodeDecodeError as inst:
   207         sub = s[max(0, inst.start - 10) : inst.start + 10]
   207         sub = s[max(0, inst.start - 10) : inst.start + 10]
   208         raise error.Abort(
   208         raise error.Abort(
   209             "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
   209             b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
   210         )
   210         )
   211     except LookupError as k:
   211     except LookupError as k:
   212         raise error.Abort(k, hint="please check your locale settings")
   212         raise error.Abort(k, hint=b"please check your locale settings")
   213 
   213 
   214 
   214 
   215 def unitolocal(u):
   215 def unitolocal(u):
   216     """Convert a unicode string to a byte string of local encoding"""
   216     """Convert a unicode string to a byte string of local encoding"""
   217     return tolocal(u.encode('utf-8'))
   217     return tolocal(u.encode('utf-8'))
   264 else:
   264 else:
   265     getcwd = os.getcwd  # re-exports
   265     getcwd = os.getcwd  # re-exports
   266 
   266 
   267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
   267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
   268 _wide = _sysstr(
   268 _wide = _sysstr(
   269     environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF"
   269     environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
       
   270     and b"WFA"
       
   271     or b"WF"
   270 )
   272 )
   271 
   273 
   272 
   274 
   273 def colwidth(s):
   275 def colwidth(s):
   274     "Find the column width of a string for display in the local encoding"
   276     b"Find the column width of a string for display in the local encoding"
   275     return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
   277     return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
   276 
   278 
   277 
   279 
   278 def ucolwidth(d):
   280 def ucolwidth(d):
   279     "Find the column width of a Unicode string for display"
   281     b"Find the column width of a Unicode string for display"
   280     eaw = getattr(unicodedata, 'east_asian_width', None)
   282     eaw = getattr(unicodedata, 'east_asian_width', None)
   281     if eaw is not None:
   283     if eaw is not None:
   282         return sum([eaw(c) in _wide and 2 or 1 for c in d])
   284         return sum([eaw(c) in _wide and 2 or 1 for c in d])
   283     return len(d)
   285     return len(d)
   284 
   286 
   290         t = s[start:x]
   292         t = s[start:x]
   291         if colwidth(t) == c:
   293         if colwidth(t) == c:
   292             return t
   294             return t
   293 
   295 
   294 
   296 
   295 def trim(s, width, ellipsis='', leftside=False):
   297 def trim(s, width, ellipsis=b'', leftside=False):
   296     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   298     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   297 
   299 
   298     If 'leftside' is True, left side of string 's' is trimmed.
   300     If 'leftside' is True, left side of string 's' is trimmed.
   299     'ellipsis' is always placed at trimmed side.
   301     'ellipsis' is always placed at trimmed side.
   300 
   302 
   388             return concat(usub.encode(_sysstr(encoding)))
   390             return concat(usub.encode(_sysstr(encoding)))
   389     return ellipsis  # not enough room for multi-column characters
   391     return ellipsis  # not enough room for multi-column characters
   390 
   392 
   391 
   393 
   392 def lower(s):
   394 def lower(s):
   393     "best-effort encoding-aware case-folding of local string s"
   395     b"best-effort encoding-aware case-folding of local string s"
   394     try:
   396     try:
   395         return asciilower(s)
   397         return asciilower(s)
   396     except UnicodeDecodeError:
   398     except UnicodeDecodeError:
   397         pass
   399         pass
   398     try:
   400     try:
   406             return s  # preserve localstring
   408             return s  # preserve localstring
   407         return lu.encode(_sysstr(encoding))
   409         return lu.encode(_sysstr(encoding))
   408     except UnicodeError:
   410     except UnicodeError:
   409         return s.lower()  # we don't know how to fold this except in ASCII
   411         return s.lower()  # we don't know how to fold this except in ASCII
   410     except LookupError as k:
   412     except LookupError as k:
   411         raise error.Abort(k, hint="please check your locale settings")
   413         raise error.Abort(k, hint=b"please check your locale settings")
   412 
   414 
   413 
   415 
   414 def upper(s):
   416 def upper(s):
   415     "best-effort encoding-aware case-folding of local string s"
   417     b"best-effort encoding-aware case-folding of local string s"
   416     try:
   418     try:
   417         return asciiupper(s)
   419         return asciiupper(s)
   418     except UnicodeDecodeError:
   420     except UnicodeDecodeError:
   419         return upperfallback(s)
   421         return upperfallback(s)
   420 
   422 
   431             return s  # preserve localstring
   433             return s  # preserve localstring
   432         return uu.encode(_sysstr(encoding))
   434         return uu.encode(_sysstr(encoding))
   433     except UnicodeError:
   435     except UnicodeError:
   434         return s.upper()  # we don't know how to fold this except in ASCII
   436         return s.upper()  # we don't know how to fold this except in ASCII
   435     except LookupError as k:
   437     except LookupError as k:
   436         raise error.Abort(k, hint="please check your locale settings")
   438         raise error.Abort(k, hint=b"please check your locale settings")
   437 
   439 
   438 
   440 
   439 class normcasespecs(object):
   441 class normcasespecs(object):
   440     '''what a platform's normcase does to ASCII strings
   442     '''what a platform's normcase does to ASCII strings
   441 
   443 
   573         # already verified that s is non-lossy in legacy encoding, which
   575         # already verified that s is non-lossy in legacy encoding, which
   574         # shouldn't contain characters in U+DCxx range
   576         # shouldn't contain characters in U+DCxx range
   575         return fromlocal(s)
   577         return fromlocal(s)
   576     elif isasciistr(s):
   578     elif isasciistr(s):
   577         return s
   579         return s
   578     if "\xed" not in s:
   580     if b"\xed" not in s:
   579         try:
   581         try:
   580             s.decode('utf-8', _utf8strict)
   582             s.decode('utf-8', _utf8strict)
   581             return s
   583             return s
   582         except UnicodeDecodeError:
   584         except UnicodeDecodeError:
   583             pass
   585             pass
   584 
   586 
   585     s = pycompat.bytestr(s)
   587     s = pycompat.bytestr(s)
   586     r = ""
   588     r = b""
   587     pos = 0
   589     pos = 0
   588     l = len(s)
   590     l = len(s)
   589     while pos < l:
   591     while pos < l:
   590         try:
   592         try:
   591             c = getutf8char(s, pos)
   593             c = getutf8char(s, pos)
   592             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   594             if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
   593                 # have to re-escape existing U+DCxx characters
   595                 # have to re-escape existing U+DCxx characters
   594                 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
   596                 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
   595                 pos += 1
   597                 pos += 1
   596             else:
   598             else:
   597                 pos += len(c)
   599                 pos += len(c)
   626     '''
   628     '''
   627 
   629 
   628     if isasciistr(s):
   630     if isasciistr(s):
   629         return s
   631         return s
   630     # fast path - look for uDxxx prefixes in s
   632     # fast path - look for uDxxx prefixes in s
   631     if "\xed" not in s:
   633     if b"\xed" not in s:
   632         return s
   634         return s
   633 
   635 
   634     # We could do this with the unicode type but some Python builds
   636     # We could do this with the unicode type but some Python builds
   635     # use UTF-16 internally (issue5031) which causes non-BMP code
   637     # use UTF-16 internally (issue5031) which causes non-BMP code
   636     # points to be escaped. Instead, we use our handy getutf8char
   638     # points to be escaped. Instead, we use our handy getutf8char
   637     # helper again to walk the string without "decoding" it.
   639     # helper again to walk the string without "decoding" it.
   638 
   640 
   639     s = pycompat.bytestr(s)
   641     s = pycompat.bytestr(s)
   640     r = ""
   642     r = b""
   641     pos = 0
   643     pos = 0
   642     l = len(s)
   644     l = len(s)
   643     while pos < l:
   645     while pos < l:
   644         c = getutf8char(s, pos)
   646         c = getutf8char(s, pos)
   645         pos += len(c)
   647         pos += len(c)
   646         # unescape U+DCxx characters
   648         # unescape U+DCxx characters
   647         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
   649         if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
   648             c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
   650             c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
   649         r += c
   651         r += c
   650     return r
   652     return r