changeset 51287 | f15cb5111a1e |
parent 51285 | 9d3721552b6c |
child 51290 | f4a0806081f2 |
51286:81224afd938d | 51287:f15cb5111a1e |
---|---|
57 ] |
57 ] |
58 # verify the next function will work |
58 # verify the next function will work |
59 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) |
59 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) |
60 |
60 |
61 |
61 |
62 def hfsignoreclean(s): |
62 def hfsignoreclean(s: bytes) -> bytes: |
63 # type: (bytes) -> bytes |
|
64 """Remove codepoints ignored by HFS+ from s. |
63 """Remove codepoints ignored by HFS+ from s. |
65 |
64 |
66 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
65 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
67 '.hg' |
66 '.hg' |
68 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
67 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
131 s._utf8 = u |
130 s._utf8 = u |
132 return s |
131 return s |
133 |
132 |
134 if typing.TYPE_CHECKING: |
133 if typing.TYPE_CHECKING: |
135 # pseudo implementation to help pytype see localstr() constructor |
134 # pseudo implementation to help pytype see localstr() constructor |
136 def __init__(self, u, l): |
135 def __init__(self, u: bytes, l: bytes) -> None: |
137 # type: (bytes, bytes) -> None |
|
138 super(localstr, self).__init__(l) |
136 super(localstr, self).__init__(l) |
139 self._utf8 = u |
137 self._utf8 = u |
140 |
138 |
141 def __hash__(self): |
139 def __hash__(self): |
142 return hash(self._utf8) # avoid collisions in local string space |
140 return hash(self._utf8) # avoid collisions in local string space |
151 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
149 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
152 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
150 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
153 """ |
151 """ |
154 |
152 |
155 |
153 |
156 def tolocal(s): |
154 def tolocal(s: bytes) -> bytes: |
157 # type: (bytes) -> bytes |
|
158 """ |
155 """ |
159 Convert a string from internal UTF-8 to local encoding |
156 Convert a string from internal UTF-8 to local encoding |
160 |
157 |
161 All internal strings should be UTF-8 but some repos before the |
158 All internal strings should be UTF-8 but some repos before the |
162 implementation of locale support may contain latin1 or possibly |
159 implementation of locale support may contain latin1 or possibly |
220 raise error.Abort( |
217 raise error.Abort( |
221 pycompat.bytestr(k), hint=b"please check your locale settings" |
218 pycompat.bytestr(k), hint=b"please check your locale settings" |
222 ) |
219 ) |
223 |
220 |
224 |
221 |
225 def fromlocal(s): |
222 def fromlocal(s: bytes) -> bytes: |
226 # type: (bytes) -> bytes |
|
227 """ |
223 """ |
228 Convert a string from the local character encoding to UTF-8 |
224 Convert a string from the local character encoding to UTF-8 |
229 |
225 |
230 We attempt to decode strings using the encoding mode set by |
226 We attempt to decode strings using the encoding mode set by |
231 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
227 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
252 raise error.Abort( |
248 raise error.Abort( |
253 pycompat.bytestr(k), hint=b"please check your locale settings" |
249 pycompat.bytestr(k), hint=b"please check your locale settings" |
254 ) |
250 ) |
255 |
251 |
256 |
252 |
257 def unitolocal(u): |
253 def unitolocal(u: str) -> bytes: |
258 # type: (Text) -> bytes |
|
259 """Convert a unicode string to a byte string of local encoding""" |
254 """Convert a unicode string to a byte string of local encoding""" |
260 return tolocal(u.encode('utf-8')) |
255 return tolocal(u.encode('utf-8')) |
261 |
256 |
262 |
257 |
263 def unifromlocal(s): |
258 def unifromlocal(s: bytes) -> str: |
264 # type: (bytes) -> Text |
|
265 """Convert a byte string of local encoding to a unicode string""" |
259 """Convert a byte string of local encoding to a unicode string""" |
266 return fromlocal(s).decode('utf-8') |
260 return fromlocal(s).decode('utf-8') |
267 |
261 |
268 |
262 |
269 def unimethod(bytesfunc): |
263 def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]: |
270 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text] |
|
271 """Create a proxy method that forwards __unicode__() and __str__() of |
264 """Create a proxy method that forwards __unicode__() and __str__() of |
272 Python 3 to __bytes__()""" |
265 Python 3 to __bytes__()""" |
273 |
266 |
274 def unifunc(obj): |
267 def unifunc(obj): |
275 return unifromlocal(bytesfunc(obj)) |
268 return unifromlocal(bytesfunc(obj)) |
283 strtolocal = unitolocal |
276 strtolocal = unitolocal |
284 strfromlocal = unifromlocal |
277 strfromlocal = unifromlocal |
285 strmethod = unimethod |
278 strmethod = unimethod |
286 |
279 |
287 |
280 |
288 def lower(s): |
281 def lower(s: bytes) -> bytes: |
289 # type: (bytes) -> bytes |
|
290 """best-effort encoding-aware case-folding of local string s""" |
282 """best-effort encoding-aware case-folding of local string s""" |
291 try: |
283 try: |
292 return asciilower(s) |
284 return asciilower(s) |
293 except UnicodeDecodeError: |
285 except UnicodeDecodeError: |
294 pass |
286 pass |
308 raise error.Abort( |
300 raise error.Abort( |
309 pycompat.bytestr(k), hint=b"please check your locale settings" |
301 pycompat.bytestr(k), hint=b"please check your locale settings" |
310 ) |
302 ) |
311 |
303 |
312 |
304 |
313 def upper(s): |
305 def upper(s: bytes) -> bytes: |
314 # type: (bytes) -> bytes |
|
315 """best-effort encoding-aware case-folding of local string s""" |
306 """best-effort encoding-aware case-folding of local string s""" |
316 try: |
307 try: |
317 return asciiupper(s) |
308 return asciiupper(s) |
318 except UnicodeDecodeError: |
309 except UnicodeDecodeError: |
319 return upperfallback(s) |
310 return upperfallback(s) |
320 |
311 |
321 |
312 |
322 def upperfallback(s): |
313 def upperfallback(s: Any) -> Any: |
323 # type: (Any) -> Any |
|
324 try: |
314 try: |
325 if isinstance(s, localstr): |
315 if isinstance(s, localstr): |
326 u = s._utf8.decode("utf-8") |
316 u = s._utf8.decode("utf-8") |
327 else: |
317 else: |
328 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
318 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
393 and b"WFA" |
383 and b"WFA" |
394 or b"WF" |
384 or b"WF" |
395 ) |
385 ) |
396 |
386 |
397 |
387 |
398 def colwidth(s): |
388 def colwidth(s: bytes) -> int: |
399 # type: (bytes) -> int |
|
400 """Find the column width of a string for display in the local encoding""" |
389 """Find the column width of a string for display in the local encoding""" |
401 return ucolwidth(s.decode(_sysstr(encoding), 'replace')) |
390 return ucolwidth(s.decode(_sysstr(encoding), 'replace')) |
402 |
391 |
403 |
392 |
404 def ucolwidth(d): |
393 def ucolwidth(d: Text) -> int: |
405 # type: (Text) -> int |
|
406 """Find the column width of a Unicode string for display""" |
394 """Find the column width of a Unicode string for display""" |
407 eaw = getattr(unicodedata, 'east_asian_width', None) |
395 eaw = getattr(unicodedata, 'east_asian_width', None) |
408 if eaw is not None: |
396 if eaw is not None: |
409 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
397 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
410 return len(d) |
398 return len(d) |
411 |
399 |
412 |
400 |
413 def getcols(s, start, c): |
401 def getcols(s: bytes, start: int, c: int) -> bytes: |
414 # type: (bytes, int, int) -> bytes |
|
415 """Use colwidth to find a c-column substring of s starting at byte |
402 """Use colwidth to find a c-column substring of s starting at byte |
416 index start""" |
403 index start""" |
417 for x in range(start + c, len(s)): |
404 for x in range(start + c, len(s)): |
418 t = s[start:x] |
405 t = s[start:x] |
419 if colwidth(t) == c: |
406 if colwidth(t) == c: |
420 return t |
407 return t |
421 raise ValueError('substring not found') |
408 raise ValueError('substring not found') |
422 |
409 |
423 |
410 |
424 def trim(s, width, ellipsis=b'', leftside=False): |
411 def trim( |
425 # type: (bytes, int, bytes, bool) -> bytes |
412 s: bytes, |
413 width: int, |
|
414 ellipsis: bytes = b'', |
|
415 leftside: bool = False, |
|
416 ) -> bytes: |
|
426 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
417 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
427 |
418 |
428 If 'leftside' is True, left side of string 's' is trimmed. |
419 If 'leftside' is True, left side of string 's' is trimmed. |
429 'ellipsis' is always placed at trimmed side. |
420 'ellipsis' is always placed at trimmed side. |
430 |
421 |
538 lower = -1 |
529 lower = -1 |
539 upper = 1 |
530 upper = 1 |
540 other = 0 |
531 other = 0 |
541 |
532 |
542 |
533 |
543 def jsonescape(s, paranoid=False): |
534 def jsonescape(s: Any, paranoid: Any = False) -> Any: |
544 # type: (Any, Any) -> Any |
|
545 """returns a string suitable for JSON |
535 """returns a string suitable for JSON |
546 |
536 |
547 JSON is problematic for us because it doesn't support non-Unicode |
537 JSON is problematic for us because it doesn't support non-Unicode |
548 bytes. To deal with this, we take the following approach: |
538 bytes. To deal with this, we take the following approach: |
549 |
539 |
599 _utf8strict = r'surrogatepass' |
589 _utf8strict = r'surrogatepass' |
600 |
590 |
601 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
591 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
602 |
592 |
603 |
593 |
604 def getutf8char(s, pos): |
594 def getutf8char(s: bytes, pos: int) -> bytes: |
605 # type: (bytes, int) -> bytes |
|
606 """get the next full utf-8 character in the given string, starting at pos |
595 """get the next full utf-8 character in the given string, starting at pos |
607 |
596 |
608 Raises a UnicodeError if the given location does not start a valid |
597 Raises a UnicodeError if the given location does not start a valid |
609 utf-8 character. |
598 utf-8 character. |
610 """ |
599 """ |
618 # validate with attempted decode |
607 # validate with attempted decode |
619 c.decode("utf-8", _utf8strict) |
608 c.decode("utf-8", _utf8strict) |
620 return c |
609 return c |
621 |
610 |
622 |
611 |
623 def toutf8b(s): |
612 def toutf8b(s: bytes) -> bytes: |
624 # type: (bytes) -> bytes |
|
625 """convert a local, possibly-binary string into UTF-8b |
613 """convert a local, possibly-binary string into UTF-8b |
626 |
614 |
627 This is intended as a generic method to preserve data when working |
615 This is intended as a generic method to preserve data when working |
628 with schemes like JSON and XML that have no provision for |
616 with schemes like JSON and XML that have no provision for |
629 arbitrary byte strings. As Mercurial often doesn't know |
617 arbitrary byte strings. As Mercurial often doesn't know |
687 pos += 1 |
675 pos += 1 |
688 r += c |
676 r += c |
689 return bytes(r) |
677 return bytes(r) |
690 |
678 |
691 |
679 |
692 def fromutf8b(s): |
680 def fromutf8b(s: bytes) -> bytes: |
693 # type: (bytes) -> bytes |
|
694 """Given a UTF-8b string, return a local, possibly-binary string. |
681 """Given a UTF-8b string, return a local, possibly-binary string. |
695 |
682 |
696 return the original binary string. This |
683 return the original binary string. This |
697 is a round-trip process for strings like filenames, but metadata |
684 is a round-trip process for strings like filenames, but metadata |
698 that's was passed through tolocal will remain in UTF-8. |
685 that's was passed through tolocal will remain in UTF-8. |