changeset 45942 | 89a2afe31e82 |
parent 45681 | a736ab681b78 |
child 46319 | 3dfebba99ef6 |
45941:346af7687c6f | 45942:89a2afe31e82 |
---|---|
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") |
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") |
112 fallbackencoding = b'ISO-8859-1' |
112 fallbackencoding = b'ISO-8859-1' |
113 |
113 |
114 |
114 |
115 class localstr(bytes): |
115 class localstr(bytes): |
116 '''This class allows strings that are unmodified to be |
116 """This class allows strings that are unmodified to be |
117 round-tripped to the local encoding and back''' |
117 round-tripped to the local encoding and back""" |
118 |
118 |
119 def __new__(cls, u, l): |
119 def __new__(cls, u, l): |
120 s = bytes.__new__(cls, l) |
120 s = bytes.__new__(cls, l) |
121 s._utf8 = u |
121 s._utf8 = u |
122 return s |
122 return s |
327 return len(d) |
327 return len(d) |
328 |
328 |
329 |
329 |
330 def getcols(s, start, c): |
330 def getcols(s, start, c): |
331 # type: (bytes, int, int) -> bytes |
331 # type: (bytes, int, int) -> bytes |
332 '''Use colwidth to find a c-column substring of s starting at byte |
332 """Use colwidth to find a c-column substring of s starting at byte |
333 index start''' |
333 index start""" |
334 for x in pycompat.xrange(start + c, len(s)): |
334 for x in pycompat.xrange(start + c, len(s)): |
335 t = s[start:x] |
335 t = s[start:x] |
336 if colwidth(t) == c: |
336 if colwidth(t) == c: |
337 return t |
337 return t |
338 raise ValueError('substring not found') |
338 raise ValueError('substring not found') |
485 except LookupError as k: |
485 except LookupError as k: |
486 raise error.Abort(k, hint=b"please check your locale settings") |
486 raise error.Abort(k, hint=b"please check your locale settings") |
487 |
487 |
488 |
488 |
489 class normcasespecs(object): |
489 class normcasespecs(object): |
490 '''what a platform's normcase does to ASCII strings |
490 """what a platform's normcase does to ASCII strings |
491 |
491 |
492 This is specified per platform, and should be consistent with what normcase |
492 This is specified per platform, and should be consistent with what normcase |
493 on that platform actually does. |
493 on that platform actually does. |
494 |
494 |
495 lower: normcase lowercases ASCII strings |
495 lower: normcase lowercases ASCII strings |
496 upper: normcase uppercases ASCII strings |
496 upper: normcase uppercases ASCII strings |
497 other: the fallback function should always be called |
497 other: the fallback function should always be called |
498 |
498 |
499 This should be kept in sync with normcase_spec in util.h.''' |
499 This should be kept in sync with normcase_spec in util.h.""" |
500 |
500 |
501 lower = -1 |
501 lower = -1 |
502 upper = 1 |
502 upper = 1 |
503 other = 0 |
503 other = 0 |
504 |
504 |
505 |
505 |
506 def jsonescape(s, paranoid=False): |
506 def jsonescape(s, paranoid=False): |
507 # type: (Any, Any) -> Any |
507 # type: (Any, Any) -> Any |
508 '''returns a string suitable for JSON |
508 """returns a string suitable for JSON |
509 |
509 |
510 JSON is problematic for us because it doesn't support non-Unicode |
510 JSON is problematic for us because it doesn't support non-Unicode |
511 bytes. To deal with this, we take the following approach: |
511 bytes. To deal with this, we take the following approach: |
512 |
512 |
513 - localstr/safelocalstr objects are converted back to UTF-8 |
513 - localstr/safelocalstr objects are converted back to UTF-8 |
545 'utf-8: caf\\\\u00e9' |
545 'utf-8: caf\\\\u00e9' |
546 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
546 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
547 'non-BMP: \\\\ud834\\\\udd1e' |
547 'non-BMP: \\\\ud834\\\\udd1e' |
548 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
548 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
549 '\\\\u003cfoo@example.org\\\\u003e' |
549 '\\\\u003cfoo@example.org\\\\u003e' |
550 ''' |
550 """ |
551 |
551 |
552 u8chars = toutf8b(s) |
552 u8chars = toutf8b(s) |
553 try: |
553 try: |
554 return _jsonescapeu8fast(u8chars, paranoid) |
554 return _jsonescapeu8fast(u8chars, paranoid) |
555 except ValueError: |
555 except ValueError: |
567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
568 |
568 |
569 |
569 |
570 def getutf8char(s, pos): |
570 def getutf8char(s, pos): |
571 # type: (bytes, int) -> bytes |
571 # type: (bytes, int) -> bytes |
572 '''get the next full utf-8 character in the given string, starting at pos |
572 """get the next full utf-8 character in the given string, starting at pos |
573 |
573 |
574 Raises a UnicodeError if the given location does not start a valid |
574 Raises a UnicodeError if the given location does not start a valid |
575 utf-8 character. |
575 utf-8 character. |
576 ''' |
576 """ |
577 |
577 |
578 # find how many bytes to attempt decoding from first nibble |
578 # find how many bytes to attempt decoding from first nibble |
579 l = _utf8len[ord(s[pos : pos + 1]) >> 4] |
579 l = _utf8len[ord(s[pos : pos + 1]) >> 4] |
580 if not l: # ascii |
580 if not l: # ascii |
581 return s[pos : pos + 1] |
581 return s[pos : pos + 1] |
586 return c |
586 return c |
587 |
587 |
588 |
588 |
589 def toutf8b(s): |
589 def toutf8b(s): |
590 # type: (bytes) -> bytes |
590 # type: (bytes) -> bytes |
591 '''convert a local, possibly-binary string into UTF-8b |
591 """convert a local, possibly-binary string into UTF-8b |
592 |
592 |
593 This is intended as a generic method to preserve data when working |
593 This is intended as a generic method to preserve data when working |
594 with schemes like JSON and XML that have no provision for |
594 with schemes like JSON and XML that have no provision for |
595 arbitrary byte strings. As Mercurial often doesn't know |
595 arbitrary byte strings. As Mercurial often doesn't know |
596 what encoding data is in, we use so-called UTF-8b. |
596 what encoding data is in, we use so-called UTF-8b. |
614 |
614 |
615 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
615 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
616 arbitrary bytes into an internal Unicode format that can be |
616 arbitrary bytes into an internal Unicode format that can be |
617 re-encoded back into the original. Here we are exposing the |
617 re-encoded back into the original. Here we are exposing the |
618 internal surrogate encoding as a UTF-8 string.) |
618 internal surrogate encoding as a UTF-8 string.) |
619 ''' |
619 """ |
620 |
620 |
621 if isinstance(s, localstr): |
621 if isinstance(s, localstr): |
622 # assume that the original UTF-8 sequence would never contain |
622 # assume that the original UTF-8 sequence would never contain |
623 # invalid characters in U+DCxx range |
623 # invalid characters in U+DCxx range |
624 return s._utf8 |
624 return s._utf8 |
655 return r |
655 return r |
656 |
656 |
657 |
657 |
658 def fromutf8b(s): |
658 def fromutf8b(s): |
659 # type: (bytes) -> bytes |
659 # type: (bytes) -> bytes |
660 '''Given a UTF-8b string, return a local, possibly-binary string. |
660 """Given a UTF-8b string, return a local, possibly-binary string. |
661 |
661 |
662 return the original binary string. This |
662 return the original binary string. This |
663 is a round-trip process for strings like filenames, but metadata |
663 is a round-trip process for strings like filenames, but metadata |
664 that's was passed through tolocal will remain in UTF-8. |
664 that's was passed through tolocal will remain in UTF-8. |
665 |
665 |
675 True |
675 True |
676 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") |
676 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") |
677 True |
677 True |
678 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") |
678 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") |
679 True |
679 True |
680 ''' |
680 """ |
681 |
681 |
682 if isasciistr(s): |
682 if isasciistr(s): |
683 return s |
683 return s |
684 # fast path - look for uDxxx prefixes in s |
684 # fast path - look for uDxxx prefixes in s |
685 if b"\xed" not in s: |
685 if b"\xed" not in s: |