# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
#
# Each entry is the UTF-8 byte sequence of one ignored code point.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: its fast path only looks for the
# lead bytes 0xe2 / 0xef, so every ignored sequence must start with one
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)


def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    ``s`` is a UTF-8 encoded byte string. Every occurrence of a sequence
    listed in ``_ignore`` is deleted; a string containing neither of the
    lead bytes 0xe2 / 0xef is returned unchanged without scanning the list.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # All operands are bytes: mixing native str literals with the UTF-8
    # input raises TypeError on Python 3.
    if b"\xe2" in s or b"\xef" in s:
        for c in _ignore:
            s = s.replace(c, b'')
    return s
58 |
58 |
59 |
59 |
60 # encoding.environ is provided read-only, which may not be used to modify |
60 # encoding.environ is provided read-only, which may not be used to modify |
61 # the process environment |
61 # the process environment |
71 (k.encode(r'utf-8'), v.encode(r'utf-8')) |
71 (k.encode(r'utf-8'), v.encode(r'utf-8')) |
72 for k, v in os.environ.items() # re-exports |
72 for k, v in os.environ.items() # re-exports |
73 ) |
73 ) |
74 |
74 |
# Encoding names that are replaced by another name before use. Keys and
# values are bytes because the encoding name they are compared against is
# a byte string; native str keys would never match on Python 3.
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'
84 |
84 |
# Pick the local encoding: HGENCODING from the (bytes-keyed) environment
# wins; otherwise fall back to the locale's preferred encoding, passed
# through _encodingrewrites. Everything here is bytes, so the environ
# lookups and the rewrite table actually match on Python 3.
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    # the locale could not be queried; use plain ASCII
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
94 |
94 |
95 |
95 |
96 class localstr(bytes): |
96 class localstr(bytes): |
97 '''This class allows strings that are unmodified to be |
97 '''This class allows strings that are unmodified to be |
98 round-tripped to the local encoding and back''' |
98 round-tripped to the local encoding and back''' |
156 |
156 |
157 try: |
157 try: |
158 try: |
158 try: |
159 # make sure string is actually stored in UTF-8 |
159 # make sure string is actually stored in UTF-8 |
160 u = s.decode('UTF-8') |
160 u = s.decode('UTF-8') |
161 if encoding == 'UTF-8': |
161 if encoding == b'UTF-8': |
162 # fast path |
162 # fast path |
163 return s |
163 return s |
164 r = u.encode(_sysstr(encoding), r"replace") |
164 r = u.encode(_sysstr(encoding), r"replace") |
165 if u == r.decode(_sysstr(encoding)): |
165 if u == r.decode(_sysstr(encoding)): |
166 # r is a safe, non-lossy encoding of s |
166 # r is a safe, non-lossy encoding of s |
178 except UnicodeDecodeError: |
178 except UnicodeDecodeError: |
179 u = s.decode("utf-8", "replace") # last ditch |
179 u = s.decode("utf-8", "replace") # last ditch |
180 # can't round-trip |
180 # can't round-trip |
181 return u.encode(_sysstr(encoding), r"replace") |
181 return u.encode(_sysstr(encoding), r"replace") |
182 except LookupError as k: |
182 except LookupError as k: |
183 raise error.Abort(k, hint="please check your locale settings") |
183 raise error.Abort(k, hint=b"please check your locale settings") |
184 |
184 |
185 |
185 |
186 def fromlocal(s): |
186 def fromlocal(s): |
187 """ |
187 """ |
188 Convert a string from the local character encoding to UTF-8 |
188 Convert a string from the local character encoding to UTF-8 |
204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
205 return u.encode("utf-8") |
205 return u.encode("utf-8") |
206 except UnicodeDecodeError as inst: |
206 except UnicodeDecodeError as inst: |
207 sub = s[max(0, inst.start - 10) : inst.start + 10] |
207 sub = s[max(0, inst.start - 10) : inst.start + 10] |
208 raise error.Abort( |
208 raise error.Abort( |
209 "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) |
209 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) |
210 ) |
210 ) |
211 except LookupError as k: |
211 except LookupError as k: |
212 raise error.Abort(k, hint="please check your locale settings") |
212 raise error.Abort(k, hint=b"please check your locale settings") |
213 |
213 |
214 |
214 |
def unitolocal(u):
    """Return unicode string ``u`` as a byte string in the local encoding.

    ``u`` is first encoded as UTF-8 and the result is handed to tolocal().
    """
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
264 else: |
264 else: |
265 getcwd = os.getcwd # re-exports |
265 getcwd = os.getcwd # re-exports |
266 |
266 |
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
#
# The value is the set of east-asian-width class letters counted as two
# columns. The environ lookup uses bytes because environ is bytes-keyed.
_wide = _sysstr(
    b"WFA"
    if environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    else b"WF"
)
271 |
273 |
272 |
274 |
def colwidth(s):
    """Find the column width of a string for display in the local encoding

    ``s`` is decoded using the module-level ``encoding`` with the
    ``replace`` error handler, and the result is measured by ucolwidth().
    """
    # NOTE: the docstring must be a plain string literal; a b"..." literal
    # is just an expression statement and leaves __doc__ unset.
    return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
276 |
278 |
277 |
279 |
def ucolwidth(d):
    """Find the column width of a Unicode string for display

    A character whose ``unicodedata.east_asian_width`` class is contained
    in ``_wide`` counts as two columns, any other character as one. When
    ``unicodedata`` offers no ``east_asian_width``, ``len(d)`` is returned.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
284 |
286 |
290 t = s[start:x] |
292 t = s[start:x] |
291 if colwidth(t) == c: |
293 if colwidth(t) == c: |
292 return t |
294 return t |
293 |
295 |
294 |
296 |
295 def trim(s, width, ellipsis='', leftside=False): |
297 def trim(s, width, ellipsis=b'', leftside=False): |
296 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
298 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
297 |
299 |
298 If 'leftside' is True, left side of string 's' is trimmed. |
300 If 'leftside' is True, left side of string 's' is trimmed. |
299 'ellipsis' is always placed at trimmed side. |
301 'ellipsis' is always placed at trimmed side. |
300 |
302 |
406 return s # preserve localstring |
408 return s # preserve localstring |
407 return lu.encode(_sysstr(encoding)) |
409 return lu.encode(_sysstr(encoding)) |
408 except UnicodeError: |
410 except UnicodeError: |
409 return s.lower() # we don't know how to fold this except in ASCII |
411 return s.lower() # we don't know how to fold this except in ASCII |
410 except LookupError as k: |
412 except LookupError as k: |
411 raise error.Abort(k, hint="please check your locale settings") |
413 raise error.Abort(k, hint=b"please check your locale settings") |
412 |
414 |
413 |
415 |
def upper(s):
    """best-effort encoding-aware case-folding of local string s

    asciiupper() is tried first; upperfallback() is used only when that
    raises UnicodeDecodeError.
    """
    try:
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)
420 |
422 |
431 return s # preserve localstring |
433 return s # preserve localstring |
432 return uu.encode(_sysstr(encoding)) |
434 return uu.encode(_sysstr(encoding)) |
433 except UnicodeError: |
435 except UnicodeError: |
434 return s.upper() # we don't know how to fold this except in ASCII |
436 return s.upper() # we don't know how to fold this except in ASCII |
435 except LookupError as k: |
437 except LookupError as k: |
436 raise error.Abort(k, hint="please check your locale settings") |
438 raise error.Abort(k, hint=b"please check your locale settings") |
437 |
439 |
438 |
440 |
439 class normcasespecs(object): |
441 class normcasespecs(object): |
440 '''what a platform's normcase does to ASCII strings |
442 '''what a platform's normcase does to ASCII strings |
441 |
443 |
573 # already verified that s is non-lossy in legacy encoding, which |
575 # already verified that s is non-lossy in legacy encoding, which |
574 # shouldn't contain characters in U+DCxx range |
576 # shouldn't contain characters in U+DCxx range |
575 return fromlocal(s) |
577 return fromlocal(s) |
576 elif isasciistr(s): |
578 elif isasciistr(s): |
577 return s |
579 return s |
578 if "\xed" not in s: |
580 if b"\xed" not in s: |
579 try: |
581 try: |
580 s.decode('utf-8', _utf8strict) |
582 s.decode('utf-8', _utf8strict) |
581 return s |
583 return s |
582 except UnicodeDecodeError: |
584 except UnicodeDecodeError: |
583 pass |
585 pass |
584 |
586 |
585 s = pycompat.bytestr(s) |
587 s = pycompat.bytestr(s) |
586 r = "" |
588 r = b"" |
587 pos = 0 |
589 pos = 0 |
588 l = len(s) |
590 l = len(s) |
589 while pos < l: |
591 while pos < l: |
590 try: |
592 try: |
591 c = getutf8char(s, pos) |
593 c = getutf8char(s, pos) |
592 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
594 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
593 # have to re-escape existing U+DCxx characters |
595 # have to re-escape existing U+DCxx characters |
594 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
596 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
595 pos += 1 |
597 pos += 1 |
596 else: |
598 else: |
597 pos += len(c) |
599 pos += len(c) |
626 ''' |
628 ''' |
627 |
629 |
628 if isasciistr(s): |
630 if isasciistr(s): |
629 return s |
631 return s |
630 # fast path - look for uDxxx prefixes in s |
632 # fast path - look for uDxxx prefixes in s |
631 if "\xed" not in s: |
633 if b"\xed" not in s: |
632 return s |
634 return s |
633 |
635 |
634 # We could do this with the unicode type but some Python builds |
636 # We could do this with the unicode type but some Python builds |
635 # use UTF-16 internally (issue5031) which causes non-BMP code |
637 # use UTF-16 internally (issue5031) which causes non-BMP code |
636 # points to be escaped. Instead, we use our handy getutf8char |
638 # points to be escaped. Instead, we use our handy getutf8char |
637 # helper again to walk the string without "decoding" it. |
639 # helper again to walk the string without "decoding" it. |
638 |
640 |
639 s = pycompat.bytestr(s) |
641 s = pycompat.bytestr(s) |
640 r = "" |
642 r = b"" |
641 pos = 0 |
643 pos = 0 |
642 l = len(s) |
644 l = len(s) |
643 while pos < l: |
645 while pos < l: |
644 c = getutf8char(s, pos) |
646 c = getutf8char(s, pos) |
645 pos += len(c) |
647 pos += len(c) |
646 # unescape U+DCxx characters |
648 # unescape U+DCxx characters |
647 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": |
649 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
648 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) |
650 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) |
649 r += c |
651 r += c |
650 return r |
652 return r |