mercurial/encoding.py
changeset 51287 f15cb5111a1e
parent 51285 9d3721552b6c
child 51290 f4a0806081f2
equal deleted inserted replaced
51286:81224afd938d 51287:f15cb5111a1e
    57 ]
    57 ]
    58 # verify the next function will work
    58 # verify the next function will work
    59 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
    59 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
    60 
    60 
    61 
    61 
    62 def hfsignoreclean(s):
    62 def hfsignoreclean(s: bytes) -> bytes:
    63     # type: (bytes) -> bytes
       
    64     """Remove codepoints ignored by HFS+ from s.
    63     """Remove codepoints ignored by HFS+ from s.
    65 
    64 
    66     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    65     >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    67     '.hg'
    66     '.hg'
    68     >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    67     >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
   131         s._utf8 = u
   130         s._utf8 = u
   132         return s
   131         return s
   133 
   132 
   134     if typing.TYPE_CHECKING:
   133     if typing.TYPE_CHECKING:
   135         # pseudo implementation to help pytype see localstr() constructor
   134         # pseudo implementation to help pytype see localstr() constructor
   136         def __init__(self, u, l):
   135         def __init__(self, u: bytes, l: bytes) -> None:
   137             # type: (bytes, bytes) -> None
       
   138             super(localstr, self).__init__(l)
   136             super(localstr, self).__init__(l)
   139             self._utf8 = u
   137             self._utf8 = u
   140 
   138 
   141     def __hash__(self):
   139     def __hash__(self):
   142         return hash(self._utf8)  # avoid collisions in local string space
   140         return hash(self._utf8)  # avoid collisions in local string space
   151     >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
   149     >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
   152     >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
   150     >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
   153     """
   151     """
   154 
   152 
   155 
   153 
   156 def tolocal(s):
   154 def tolocal(s: bytes) -> bytes:
   157     # type: (bytes) -> bytes
       
   158     """
   155     """
   159     Convert a string from internal UTF-8 to local encoding
   156     Convert a string from internal UTF-8 to local encoding
   160 
   157 
   161     All internal strings should be UTF-8 but some repos before the
   158     All internal strings should be UTF-8 but some repos before the
   162     implementation of locale support may contain latin1 or possibly
   159     implementation of locale support may contain latin1 or possibly
   220         raise error.Abort(
   217         raise error.Abort(
   221             pycompat.bytestr(k), hint=b"please check your locale settings"
   218             pycompat.bytestr(k), hint=b"please check your locale settings"
   222         )
   219         )
   223 
   220 
   224 
   221 
   225 def fromlocal(s):
   222 def fromlocal(s: bytes) -> bytes:
   226     # type: (bytes) -> bytes
       
   227     """
   223     """
   228     Convert a string from the local character encoding to UTF-8
   224     Convert a string from the local character encoding to UTF-8
   229 
   225 
   230     We attempt to decode strings using the encoding mode set by
   226     We attempt to decode strings using the encoding mode set by
   231     HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
   227     HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
   252         raise error.Abort(
   248         raise error.Abort(
   253             pycompat.bytestr(k), hint=b"please check your locale settings"
   249             pycompat.bytestr(k), hint=b"please check your locale settings"
   254         )
   250         )
   255 
   251 
   256 
   252 
   257 def unitolocal(u):
   253 def unitolocal(u: str) -> bytes:
   258     # type: (Text) -> bytes
       
   259     """Convert a unicode string to a byte string of local encoding"""
   254     """Convert a unicode string to a byte string of local encoding"""
   260     return tolocal(u.encode('utf-8'))
   255     return tolocal(u.encode('utf-8'))
   261 
   256 
   262 
   257 
   263 def unifromlocal(s):
   258 def unifromlocal(s: bytes) -> str:
   264     # type: (bytes) -> Text
       
   265     """Convert a byte string of local encoding to a unicode string"""
   259     """Convert a byte string of local encoding to a unicode string"""
   266     return fromlocal(s).decode('utf-8')
   260     return fromlocal(s).decode('utf-8')
   267 
   261 
   268 
   262 
   269 def unimethod(bytesfunc):
   263 def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]:
   270     # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
       
   271     """Create a proxy method that forwards __unicode__() and __str__() of
   264     """Create a proxy method that forwards __unicode__() and __str__() of
   272     Python 3 to __bytes__()"""
   265     Python 3 to __bytes__()"""
   273 
   266 
   274     def unifunc(obj):
   267     def unifunc(obj):
   275         return unifromlocal(bytesfunc(obj))
   268         return unifromlocal(bytesfunc(obj))
   283 strtolocal = unitolocal
   276 strtolocal = unitolocal
   284 strfromlocal = unifromlocal
   277 strfromlocal = unifromlocal
   285 strmethod = unimethod
   278 strmethod = unimethod
   286 
   279 
   287 
   280 
   288 def lower(s):
   281 def lower(s: bytes) -> bytes:
   289     # type: (bytes) -> bytes
       
   290     """best-effort encoding-aware case-folding of local string s"""
   282     """best-effort encoding-aware case-folding of local string s"""
   291     try:
   283     try:
   292         return asciilower(s)
   284         return asciilower(s)
   293     except UnicodeDecodeError:
   285     except UnicodeDecodeError:
   294         pass
   286         pass
   308         raise error.Abort(
   300         raise error.Abort(
   309             pycompat.bytestr(k), hint=b"please check your locale settings"
   301             pycompat.bytestr(k), hint=b"please check your locale settings"
   310         )
   302         )
   311 
   303 
   312 
   304 
   313 def upper(s):
   305 def upper(s: bytes) -> bytes:
   314     # type: (bytes) -> bytes
       
   315     """best-effort encoding-aware case-folding of local string s"""
   306     """best-effort encoding-aware case-folding of local string s"""
   316     try:
   307     try:
   317         return asciiupper(s)
   308         return asciiupper(s)
   318     except UnicodeDecodeError:
   309     except UnicodeDecodeError:
   319         return upperfallback(s)
   310         return upperfallback(s)
   320 
   311 
   321 
   312 
   322 def upperfallback(s):
   313 def upperfallback(s: Any) -> Any:
   323     # type: (Any) -> Any
       
   324     try:
   314     try:
   325         if isinstance(s, localstr):
   315         if isinstance(s, localstr):
   326             u = s._utf8.decode("utf-8")
   316             u = s._utf8.decode("utf-8")
   327         else:
   317         else:
   328             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   318             u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
   393     and b"WFA"
   383     and b"WFA"
   394     or b"WF"
   384     or b"WF"
   395 )
   385 )
   396 
   386 
   397 
   387 
   398 def colwidth(s):
   388 def colwidth(s: bytes) -> int:
   399     # type: (bytes) -> int
       
   400     """Find the column width of a string for display in the local encoding"""
   389     """Find the column width of a string for display in the local encoding"""
   401     return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
   390     return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
   402 
   391 
   403 
   392 
   404 def ucolwidth(d):
   393 def ucolwidth(d: Text) -> int:
   405     # type: (Text) -> int
       
   406     """Find the column width of a Unicode string for display"""
   394     """Find the column width of a Unicode string for display"""
   407     eaw = getattr(unicodedata, 'east_asian_width', None)
   395     eaw = getattr(unicodedata, 'east_asian_width', None)
   408     if eaw is not None:
   396     if eaw is not None:
   409         return sum([eaw(c) in _wide and 2 or 1 for c in d])
   397         return sum([eaw(c) in _wide and 2 or 1 for c in d])
   410     return len(d)
   398     return len(d)
   411 
   399 
   412 
   400 
   413 def getcols(s, start, c):
   401 def getcols(s: bytes, start: int, c: int) -> bytes:
   414     # type: (bytes, int, int) -> bytes
       
   415     """Use colwidth to find a c-column substring of s starting at byte
   402     """Use colwidth to find a c-column substring of s starting at byte
   416     index start"""
   403     index start"""
   417     for x in range(start + c, len(s)):
   404     for x in range(start + c, len(s)):
   418         t = s[start:x]
   405         t = s[start:x]
   419         if colwidth(t) == c:
   406         if colwidth(t) == c:
   420             return t
   407             return t
   421     raise ValueError('substring not found')
   408     raise ValueError('substring not found')
   422 
   409 
   423 
   410 
   424 def trim(s, width, ellipsis=b'', leftside=False):
   411 def trim(
   425     # type: (bytes, int, bytes, bool) -> bytes
   412     s: bytes,
       
   413     width: int,
       
   414     ellipsis: bytes = b'',
       
   415     leftside: bool = False,
       
   416 ) -> bytes:
   426     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   417     """Trim string 's' to at most 'width' columns (including 'ellipsis').
   427 
   418 
   428     If 'leftside' is True, left side of string 's' is trimmed.
   419     If 'leftside' is True, left side of string 's' is trimmed.
   429     'ellipsis' is always placed at trimmed side.
   420     'ellipsis' is always placed at trimmed side.
   430 
   421 
   538     lower = -1
   529     lower = -1
   539     upper = 1
   530     upper = 1
   540     other = 0
   531     other = 0
   541 
   532 
   542 
   533 
   543 def jsonescape(s, paranoid=False):
   534 def jsonescape(s: Any, paranoid: Any = False) -> Any:
   544     # type: (Any, Any) -> Any
       
   545     """returns a string suitable for JSON
   535     """returns a string suitable for JSON
   546 
   536 
   547     JSON is problematic for us because it doesn't support non-Unicode
   537     JSON is problematic for us because it doesn't support non-Unicode
   548     bytes. To deal with this, we take the following approach:
   538     bytes. To deal with this, we take the following approach:
   549 
   539 
   599 _utf8strict = r'surrogatepass'
   589 _utf8strict = r'surrogatepass'
   600 
   590 
   601 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   591 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
   602 
   592 
   603 
   593 
   604 def getutf8char(s, pos):
   594 def getutf8char(s: bytes, pos: int) -> bytes:
   605     # type: (bytes, int) -> bytes
       
   606     """get the next full utf-8 character in the given string, starting at pos
   595     """get the next full utf-8 character in the given string, starting at pos
   607 
   596 
   608     Raises a UnicodeError if the given location does not start a valid
   597     Raises a UnicodeError if the given location does not start a valid
   609     utf-8 character.
   598     utf-8 character.
   610     """
   599     """
   618     # validate with attempted decode
   607     # validate with attempted decode
   619     c.decode("utf-8", _utf8strict)
   608     c.decode("utf-8", _utf8strict)
   620     return c
   609     return c
   621 
   610 
   622 
   611 
   623 def toutf8b(s):
   612 def toutf8b(s: bytes) -> bytes:
   624     # type: (bytes) -> bytes
       
   625     """convert a local, possibly-binary string into UTF-8b
   613     """convert a local, possibly-binary string into UTF-8b
   626 
   614 
   627     This is intended as a generic method to preserve data when working
   615     This is intended as a generic method to preserve data when working
   628     with schemes like JSON and XML that have no provision for
   616     with schemes like JSON and XML that have no provision for
   629     arbitrary byte strings. As Mercurial often doesn't know
   617     arbitrary byte strings. As Mercurial often doesn't know
   687             pos += 1
   675             pos += 1
   688         r += c
   676         r += c
   689     return bytes(r)
   677     return bytes(r)
   690 
   678 
   691 
   679 
   692 def fromutf8b(s):
   680 def fromutf8b(s: bytes) -> bytes:
   693     # type: (bytes) -> bytes
       
   694     """Given a UTF-8b string, return a local, possibly-binary string.
   681     """Given a UTF-8b string, return a local, possibly-binary string.
   695 
   682 
   696     return the original binary string. This
   683     return the original binary string. This
   697     is a round-trip process for strings like filenames, but metadata
   684     is a round-trip process for strings like filenames, but metadata
   698     that was passed through tolocal will remain in UTF-8.
   685     that was passed through tolocal will remain in UTF-8.