mercurial/encoding.py
author Martin von Zweigbergk <martinvonz@google.com>
Tue, 18 Jan 2022 12:57:55 -0800
changeset 48756 86e4b86df932
parent 48671 f1ed5c304f45
child 48875 6000f5b25c9b
permissions -rw-r--r--
filemerge: when not keeping premerge, don't write markers to context When premerge is enabled (as it is for non-binary inputs by default) and the markers are not kept, we currently still write it to the output context and then restore the previous content right after. With the refactoring in the previous patch, we can easily avoid that step and instead write the output in the opposite case (i.e. when it's successful or when the markers are supposed to be kept). Differential Revision: https://phab.mercurial-scm.org/D12149
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     1
# encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     2
#
46819
d4ba4d51f85f contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents: 46319
diff changeset
     3
#  Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     4
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     5
# This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
     6
# GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
     7
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
     8
from __future__ import absolute_import, print_function
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
     9
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    10
import locale
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    11
import os
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
    12
import re
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    13
import unicodedata
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    14
43089
c59eb1560c44 py3: manually import getattr where it is needed
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    15
from .pycompat import getattr
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    16
from . import (
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    17
    error,
32372
df448de7cf3b parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents: 32299
diff changeset
    18
    policy,
30031
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    19
    pycompat,
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    20
)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    21
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    22
from .pure import charencode as charencodepure
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
    23
43773
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43685
diff changeset
    24
if pycompat.TYPE_CHECKING:
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    25
    from typing import (
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    26
        Any,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    27
        Callable,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    28
        List,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    29
        Text,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    30
        Type,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    31
        TypeVar,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    32
        Union,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    33
    )
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    34
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    35
    # keep pyflakes happy
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    36
    for t in (Any, Callable, List, Text, Type, Union):
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    37
        assert t
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    38
43680
3364a15f61f0 typing: fix forward reference in _Tlocalstr type bound
Yuya Nishihara <yuya@tcha.org>
parents: 43679
diff changeset
    39
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    40
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
    41
charencode = policy.importmod('charencode')
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    42
33926
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33925
diff changeset
    43
isasciistr = charencode.isasciistr
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    44
asciilower = charencode.asciilower
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    45
asciiupper = charencode.asciiupper
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    46
_jsonescapeu8fast = charencode.jsonescapeu8fast
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    47
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
    48
_sysstr = pycompat.sysstr
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
    49
30031
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    50
if pycompat.ispy3:
28507
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    51
    unichr = chr
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    52
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    53
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    54
# "Unicode Subtleties"), so we need to ignore them in some places for
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    55
# sanity.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    56
_ignore = [
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    57
    unichr(int(x, 16)).encode("utf-8")
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    58
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    59
    b"206a 206b 206c 206d 206e 206f feff".split()
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    60
]
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    61
# verify the next function will work
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    62
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    63
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    64
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    65
def hfsignoreclean(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    66
    # type: (bytes) -> bytes
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    67
    """Remove codepoints ignored by HFS+ from s.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    68
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    69
    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    70
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    71
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    72
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    73
    """
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    74
    if b"\xe2" in s or b"\xef" in s:
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    75
        for c in _ignore:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    76
            s = s.replace(c, b'')
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    77
    return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    78
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    79
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    80
# encoding.environ is provided read-only, which may not be used to modify
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    81
# the process environment
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    82
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    83
if not pycompat.ispy3:
32184
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32156
diff changeset
    84
    environ = os.environ  # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    85
elif _nativeenviron:
32184
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32156
diff changeset
    86
    environ = os.environb  # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    87
else:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    88
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    89
    # and recreate it once encoding is settled
44452
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
    90
    environ = {
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
    91
        k.encode('utf-8'): v.encode('utf-8')
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    92
        for k, v in os.environ.items()  # re-exports
44452
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
    93
    }
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    94
39839
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39819
diff changeset
    95
_encodingrewrites = {
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    96
    b'646': b'ascii',
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    97
    b'ANSI_X3.4-1968': b'ascii',
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    98
}
38615
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
    99
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   100
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   101
# https://bugs.python.org/issue13216
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   102
if pycompat.iswindows and not pycompat.ispy3:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   103
    _encodingrewrites[b'cp65001'] = b'utf-8'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   104
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   105
try:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   106
    encoding = environ.get(b"HGENCODING")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   107
    if not encoding:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   108
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
39839
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39819
diff changeset
   109
        encoding = _encodingrewrites.get(encoding, encoding)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   110
except locale.Error:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   111
    encoding = b'ascii'
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   112
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   113
fallbackencoding = b'ISO-8859-1'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   114
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   115
33811
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
   116
class localstr(bytes):
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   117
    """This class allows strings that are unmodified to be
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   118
    round-tripped to the local encoding and back"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   119
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   120
    def __new__(cls, u, l):
33811
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
   121
        s = bytes.__new__(cls, l)
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   122
        s._utf8 = u
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   123
        return s
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   124
43773
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43685
diff changeset
   125
    if pycompat.TYPE_CHECKING:
43685
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
   126
        # pseudo implementation to help pytype see localstr() constructor
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
   127
        def __init__(self, u, l):
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
   128
            # type: (bytes, bytes) -> None
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
   129
            super(localstr, self).__init__(l)
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
   130
            self._utf8 = u
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
   131
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   132
    def __hash__(self):
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   133
        return hash(self._utf8)  # avoid collisions in local string space
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   134
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   135
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   136
class safelocalstr(bytes):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   137
    """Tagged string denoting it was previously an internal UTF-8 string,
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   138
    and can be converted back to UTF-8 losslessly
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   139
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   140
    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   141
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   142
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   143
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   144
    """
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   145
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   146
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   147
def tolocal(s):
43681
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43680
diff changeset
   148
    # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   149
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   150
    Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   151
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   152
    All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   153
    implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   154
    other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   155
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   156
    replace unknown characters.
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   157
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   158
    The localstr class is used to cache the known UTF-8 encoding of
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   159
    strings next to their local representation to allow lossless
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   160
    round-trip conversion back to UTF-8.
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   161
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   162
    >>> u = b'foo: \\xc3\\xa4' # utf-8
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   163
    >>> l = tolocal(u)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   164
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   165
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   166
    >>> fromlocal(l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   167
    'foo: \\xc3\\xa4'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   168
    >>> u2 = b'foo: \\xc3\\xa1'
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   169
    >>> d = { l: 1, tolocal(u2): 2 }
18378
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   170
    >>> len(d) # no collision
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   171
    2
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   172
    >>> b'foo: ?' in d
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   173
    False
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   174
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   175
    >>> l = tolocal(l1)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   176
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   177
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   178
    >>> fromlocal(l) # magically in utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   179
    'foo: \\xc3\\xa4'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   180
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   181
33927
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   182
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   183
        return s
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   184
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   185
    try:
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   186
        try:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   187
            # make sure string is actually stored in UTF-8
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   188
            u = s.decode('UTF-8')
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   189
            if encoding == b'UTF-8':
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   190
                # fast path
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   191
                return s
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
   192
            r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   193
            if u == r.decode(_sysstr(encoding)):
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   194
                # r is a safe, non-lossy encoding of s
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   195
                return safelocalstr(r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   196
            return localstr(s, r)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   197
        except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   198
            # we should only get here if we're looking at an ancient changeset
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   199
            try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   200
                u = s.decode(_sysstr(fallbackencoding))
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
   201
                r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   202
                if u == r.decode(_sysstr(encoding)):
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   203
                    # r is a safe, non-lossy encoding of s
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   204
                    return safelocalstr(r)
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   205
                return localstr(u.encode('UTF-8'), r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   206
            except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   207
                u = s.decode("utf-8", "replace")  # last ditch
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   208
                # can't round-trip
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
   209
                return u.encode(_sysstr(encoding), "replace")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   210
    except LookupError as k:
45681
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44452
diff changeset
   211
        raise error.Abort(
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44452
diff changeset
   212
            pycompat.bytestr(k), hint=b"please check your locale settings"
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44452
diff changeset
   213
        )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   214
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   215
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   216
def fromlocal(s):
43637
7edc07fb890c encoding: fix bad type annotation
Augie Fackler <augie@google.com>
parents: 43506
diff changeset
   217
    # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   218
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   219
    Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   220
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   221
    We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   222
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   223
    characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   224
    'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   225
    Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   226
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   227
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   228
    # can we do a lossless round-trip?
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   229
    if isinstance(s, localstr):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   230
        return s._utf8
33927
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   231
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   232
        return s
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   233
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   234
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   235
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   236
        return u.encode("utf-8")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   237
    except UnicodeDecodeError as inst:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   238
        sub = s[max(0, inst.start - 10) : inst.start + 10]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   239
        raise error.Abort(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   240
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   241
        )
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   242
    except LookupError as k:
48007
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   243
        raise error.Abort(
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   244
            pycompat.bytestr(k), hint=b"please check your locale settings"
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   245
        )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   246
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   247
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   248
def unitolocal(u):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   249
    # type: (Text) -> bytes
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   250
    """Convert a unicode string to a byte string of local encoding"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   251
    return tolocal(u.encode('utf-8'))
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   252
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   253
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   254
def unifromlocal(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   255
    # type: (bytes) -> Text
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   256
    """Convert a byte string of local encoding to a unicode string"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   257
    return fromlocal(s).decode('utf-8')
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   258
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   259
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   260
def unimethod(bytesfunc):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   261
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   262
    """Create a proxy method that forwards __unicode__() and __str__() of
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   263
    Python 3 to __bytes__()"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   264
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   265
    def unifunc(obj):
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   266
        return unifromlocal(bytesfunc(obj))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   267
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   268
    return unifunc
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   269
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   270
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   271
# converter functions between native str and byte string. use these if the
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   272
# character encoding is not aware (e.g. exception message) or is known to
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   273
# be locale dependent (e.g. date formatting.)
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   274
if pycompat.ispy3:
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   275
    strtolocal = unitolocal
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   276
    strfromlocal = unifromlocal
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   277
    strmethod = unimethod
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   278
else:
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   279
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   280
    def strtolocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   281
        # type: (str) -> bytes
43683
7f51bc36194d typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents: 43682
diff changeset
   282
        return s  # pytype: disable=bad-return-type
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   283
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   284
    def strfromlocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   285
        # type: (bytes) -> str
43683
7f51bc36194d typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents: 43682
diff changeset
   286
        return s  # pytype: disable=bad-return-type
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   287
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   288
    strmethod = pycompat.identity
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   289
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   290
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   291
def lower(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   292
    # type: (bytes) -> bytes
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   293
    """best-effort encoding-aware case-folding of local string s"""
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   294
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   295
        return asciilower(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   296
    except UnicodeDecodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   297
        pass
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   298
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   299
        if isinstance(s, localstr):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   300
            u = s._utf8.decode("utf-8")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   301
        else:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   302
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   303
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   304
        lu = u.lower()
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   305
        if u == lu:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   306
            return s  # preserve localstring
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   307
        return lu.encode(_sysstr(encoding))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   308
    except UnicodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   309
        return s.lower()  # we don't know how to fold this except in ASCII
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   310
    except LookupError as k:
48007
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   311
        raise error.Abort(
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   312
            pycompat.bytestr(k), hint=b"please check your locale settings"
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   313
        )
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   314
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   315
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   316
def upper(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   317
    # type: (bytes) -> bytes
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   318
    """best-effort encoding-aware case-folding of local string s"""
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   319
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   320
        return asciiupper(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   321
    except UnicodeDecodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   322
        return upperfallback(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   323
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   324
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   325
def upperfallback(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   326
    # type: (Any) -> Any
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   327
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   328
        if isinstance(s, localstr):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   329
            u = s._utf8.decode("utf-8")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   330
        else:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   331
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   332
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   333
        uu = u.upper()
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   334
        if u == uu:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   335
            return s  # preserve localstring
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   336
        return uu.encode(_sysstr(encoding))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   337
    except UnicodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   338
        return s.upper()  # we don't know how to fold this except in ASCII
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   339
    except LookupError as k:
48007
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   340
        raise error.Abort(
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   341
            pycompat.bytestr(k), hint=b"please check your locale settings"
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   342
        )
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   343
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   344
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   345
if not _nativeenviron:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   346
    # now encoding and helper functions are available, recreate the environ
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   347
    # dict to be exported to other modules
47560
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   348
    if pycompat.iswindows and pycompat.ispy3:
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   349
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   350
        class WindowsEnviron(dict):
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   351
            """`os.environ` normalizes environment variables to uppercase on windows"""
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   352
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   353
            def get(self, key, default=None):
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   354
                return super().get(upper(key), default)
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   355
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   356
        environ = WindowsEnviron()
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   357
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   358
    for k, v in os.environ.items():  # re-exports
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   359
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   360
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   361
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   362
DRIVE_RE = re.compile(b'^[a-z]:')
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   363
39818
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   364
if pycompat.ispy3:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   365
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   366
    # returns bytes.
39819
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   367
    if pycompat.iswindows:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   368
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   369
        # API when os.getcwdb() is called.
46319
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
   370
        #
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
   371
        # Additionally, py3.8+ uppercases the drive letter when calling
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
   372
        # os.path.realpath(), which is used on ``repo.root``.  Since those
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
   373
        # strings are compared in various places as simple strings, also call
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
   374
        # realpath here.  See https://bugs.python.org/issue40368
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   375
        #
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   376
        # However this is not reliable, so lets explicitly make this drive
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   377
        # letter upper case.
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   378
        #
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   379
        # note: we should consider dropping realpath here since it seems to
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   380
        # change the semantic of `getcwd`.
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   381
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   382
        def getcwd():
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   383
            cwd = os.getcwd()  # re-exports
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   384
            cwd = os.path.realpath(cwd)
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   385
            cwd = strtolocal(cwd)
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   386
            if DRIVE_RE.match(cwd):
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   387
                cwd = cwd[0:1].upper() + cwd[1:]
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   388
            return cwd
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   389
39819
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   390
    else:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   391
        getcwd = os.getcwdb  # re-exports
39818
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   392
else:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   393
    getcwd = os.getcwd  # re-exports
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   394
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   395
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   396
_wide = _sysstr(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   397
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   398
    and b"WFA"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   399
    or b"WF"
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   400
)
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   401
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   402
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   403
def colwidth(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   404
    # type: (bytes) -> int
43787
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43773
diff changeset
   405
    """Find the column width of a string for display in the local encoding"""
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
   406
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   407
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   408
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   409
def ucolwidth(d):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   410
    # type: (Text) -> int
43787
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43773
diff changeset
   411
    """Find the column width of a Unicode string for display"""
14951
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   412
    eaw = getattr(unicodedata, 'east_asian_width', None)
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   413
    if eaw is not None:
32537
044f3d7eb9ae encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents: 32529
diff changeset
   414
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   415
    return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   416
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   417
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   418
def getcols(s, start, c):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   419
    # type: (bytes, int, int) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   420
    """Use colwidth to find a c-column substring of s starting at byte
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   421
    index start"""
38783
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
   422
    for x in pycompat.xrange(start + c, len(s)):
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   423
        t = s[start:x]
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   424
        if colwidth(t) == c:
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   425
            return t
43679
7cf332318f62 encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents: 43637
diff changeset
   426
    raise ValueError('substring not found')
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   427
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   428
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   429
def trim(s, width, ellipsis=b'', leftside=False):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   430
    # type: (bytes, int, bytes, bool) -> bytes
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   431
    """Trim string 's' to at most 'width' columns (including 'ellipsis').
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   432
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   433
    If 'leftside' is True, left side of string 's' is trimmed.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   434
    'ellipsis' is always placed at trimmed side.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   435
34136
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34135
diff changeset
   436
    >>> from .node import bin
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   437
    >>> def bprint(s):
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   438
    ...     print(pycompat.sysstr(s))
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   439
    >>> ellipsis = b'+++'
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
   440
    >>> from . import encoding
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   441
    >>> encoding.encoding = b'utf-8'
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   442
    >>> t = b'1234567890'
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   443
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   444
    1234567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   445
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   446
    1234567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   447
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   448
    12345+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   449
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   450
    +++67890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   451
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   452
    12345678
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   453
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   454
    34567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   455
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   456
    +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   457
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   458
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   459
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
34135
e9e225f16932 doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents: 34131
diff changeset
   460
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   461
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   462
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   463
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   464
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   465
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   466
    \xe3\x81\x82\xe3\x81\x84+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   467
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   468
    +++\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   469
    >>> bprint(trim(t, 5))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   470
    \xe3\x81\x82\xe3\x81\x84
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   471
    >>> bprint(trim(t, 5, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   472
    \xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   473
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   474
    +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   475
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   476
    +++
34136
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34135
diff changeset
   477
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   478
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   479
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   480
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   481
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   482
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   483
    \x11\x22\x33\x44\x55+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   484
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   485
    +++\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   486
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   487
    \x11\x22\x33\x44\x55\x66\x77\x88
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   488
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   489
    \x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   490
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   491
    +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   492
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   493
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   494
    """
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   495
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   496
        u = s.decode(_sysstr(encoding))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   497
    except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   498
        if len(s) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   499
            return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   500
        width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   501
        if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   502
            return ellipsis[: width + len(ellipsis)]
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   503
        if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   504
            return ellipsis + s[-width:]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   505
        return s[:width] + ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   506
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   507
    if ucolwidth(u) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   508
        return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   509
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   510
    width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   511
    if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   512
        return ellipsis[: width + len(ellipsis)]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   513
48671
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   514
    chars = list(u)
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   515
    if leftside:
48671
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   516
        chars.reverse()
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   517
    width_so_far = 0
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   518
    for i, c in enumerate(chars):
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   519
        width_so_far += ucolwidth(c)
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   520
        if width_so_far > width:
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   521
            break
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   522
    chars = chars[:i]
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   523
    if leftside:
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   524
        chars.reverse()
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   525
    u = u''.join(chars).encode(_sysstr(encoding))
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   526
    if leftside:
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   527
        return ellipsis + u
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48007
diff changeset
   528
    return u + ellipsis
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   529
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   530
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   531
class normcasespecs(object):
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   532
    """what a platform's normcase does to ASCII strings
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   533
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   534
    This is specified per platform, and should be consistent with what normcase
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   535
    on that platform actually does.
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   536
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   537
    lower: normcase lowercases ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   538
    upper: normcase uppercases ASCII strings
24608
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   539
    other: the fallback function should always be called
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   540
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   541
    This should be kept in sync with normcase_spec in util.h."""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   542
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   543
    lower = -1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   544
    upper = 1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   545
    other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   546
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   547
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   548
def jsonescape(s, paranoid=False):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   549
    # type: (Any, Any) -> Any
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   550
    """returns a string suitable for JSON
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   551
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   552
    JSON is problematic for us because it doesn't support non-Unicode
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   553
    bytes. To deal with this, we take the following approach:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   554
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   555
    - localstr/safelocalstr objects are converted back to UTF-8
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   556
    - valid UTF-8/ASCII strings are passed as-is
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   557
    - other strings are converted to UTF-8b surrogate encoding
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   558
    - apply JSON-specified string escaping
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   559
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   560
    (escapes are doubled in these tests)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   561
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   562
    >>> jsonescape(b'this is a test')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   563
    'this is a test'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   564
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
27881
ffa599f3f503 encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 27699
diff changeset
   565
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   566
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
   567
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   568
    >>> jsonescape(b'a weird byte: \\xdd')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   569
    'a weird byte: \\xed\\xb3\\x9d'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   570
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   571
    'utf-8: caf\\xc3\\xa9'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   572
    >>> jsonescape(b'')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   573
    ''
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   574
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   575
    If paranoid, non-ascii and common troublesome characters are also escaped.
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   576
    This is suitable for web output.
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   577
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   578
    >>> s = b'escape characters: \\0 \\x0b \\x7f'
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
   579
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   580
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
   581
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   582
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   583
    'escape boundary: ~ \\\\u007f \\\\u0080'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   584
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   585
    'a weird byte: \\\\udcdd'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   586
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   587
    'utf-8: caf\\\\u00e9'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   588
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   589
    'non-BMP: \\\\ud834\\\\udd1e'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   590
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   591
    '\\\\u003cfoo@example.org\\\\u003e'
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   592
    """
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   593
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   594
    u8chars = toutf8b(s)
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   595
    try:
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
   596
        return _jsonescapeu8fast(u8chars, paranoid)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
   597
    except ValueError:
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   598
        pass
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
   599
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   600
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   601
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   602
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   603
# bytes are mapped to that range.
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   604
if pycompat.ispy3:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   605
    _utf8strict = r'surrogatepass'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   606
else:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   607
    _utf8strict = r'strict'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   608
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   609
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   610
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   611
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   612
def getutf8char(s, pos):
43682
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43681
diff changeset
   613
    # type: (bytes, int) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   614
    """get the next full utf-8 character in the given string, starting at pos
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   615
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   616
    Raises a UnicodeError if the given location does not start a valid
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   617
    utf-8 character.
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   618
    """
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   619
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   620
    # find how many bytes to attempt decoding from first nibble
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   621
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   622
    if not l:  # ascii
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   623
        return s[pos : pos + 1]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   624
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   625
    c = s[pos : pos + l]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   626
    # validate with attempted decode
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   627
    c.decode("utf-8", _utf8strict)
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   628
    return c
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   629
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   630
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   631
def toutf8b(s):
43682
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43681
diff changeset
   632
    # type: (bytes) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   633
    """convert a local, possibly-binary string into UTF-8b
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   634
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   635
    This is intended as a generic method to preserve data when working
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   636
    with schemes like JSON and XML that have no provision for
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   637
    arbitrary byte strings. As Mercurial often doesn't know
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   638
    what encoding data is in, we use so-called UTF-8b.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   639
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   640
    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   641
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   642
    uDC00-uDCFF.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   643
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   644
    Principles of operation:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   645
17424
e7cfe3587ea4 fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents: 17236
diff changeset
   646
    - ASCII and UTF-8 data successfully round-trips and is understood
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   647
      by Unicode-oriented clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   648
    - filenames and file contents in arbitrary other encodings can have
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   649
      be round-tripped or recovered by clueful clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   650
    - local strings that have a cached known UTF-8 encoding (aka
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   651
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   652
      Unicode data they want
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   653
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   654
    - because we must preserve UTF-8 bytestring in places such as
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   655
      filenames, metadata can't be roundtripped without help
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   656
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   657
    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   658
    arbitrary bytes into an internal Unicode format that can be
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   659
    re-encoded back into the original. Here we are exposing the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   660
    internal surrogate encoding as a UTF-8 string.)
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   661
    """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   662
37946
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   663
    if isinstance(s, localstr):
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   664
        # assume that the original UTF-8 sequence would never contain
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   665
        # invalid characters in U+DCxx range
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   666
        return s._utf8
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   667
    elif isinstance(s, safelocalstr):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   668
        # already verified that s is non-lossy in legacy encoding, which
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   669
        # shouldn't contain characters in U+DCxx range
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   670
        return fromlocal(s)
37946
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   671
    elif isasciistr(s):
33928
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
   672
        return s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   673
    if b"\xed" not in s:
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   674
        try:
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   675
            s.decode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   676
            return s
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   677
        except UnicodeDecodeError:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   678
            pass
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   679
34216
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34200
diff changeset
   680
    s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   681
    r = b""
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   682
    pos = 0
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   683
    l = len(s)
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   684
    while pos < l:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   685
        try:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   686
            c = getutf8char(s, pos)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   687
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   688
                # have to re-escape existing U+DCxx characters
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   689
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   690
                pos += 1
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   691
            else:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   692
                pos += len(c)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   693
        except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   694
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   695
            pos += 1
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   696
        r += c
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   697
    return r
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   698
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   699
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   700
def fromutf8b(s):
43681
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43680
diff changeset
   701
    # type: (bytes) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   702
    """Given a UTF-8b string, return a local, possibly-binary string.
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   703
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   704
    return the original binary string. This
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   705
    is a round-trip process for strings like filenames, but metadata
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   706
    that's was passed through tolocal will remain in UTF-8.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   707
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   708
    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   709
    >>> m = b"\\xc3\\xa9\\x99abcd"
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   710
    >>> toutf8b(m)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   711
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   712
    >>> roundtrip(m)
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   713
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   714
    >>> roundtrip(b"\\xc2\\xc2\\x80")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   715
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   716
    >>> roundtrip(b"\\xef\\xbf\\xbd")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   717
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   718
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   719
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   720
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   721
    True
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   722
    """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   723
33928
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
   724
    if isasciistr(s):
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
   725
        return s
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   726
    # fast path - look for uDxxx prefixes in s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   727
    if b"\xed" not in s:
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   728
        return s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   729
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   730
    # We could do this with the unicode type but some Python builds
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   731
    # use UTF-16 internally (issue5031) which causes non-BMP code
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   732
    # points to be escaped. Instead, we use our handy getutf8char
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   733
    # helper again to walk the string without "decoding" it.
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   734
34216
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34200
diff changeset
   735
    s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   736
    r = b""
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   737
    pos = 0
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   738
    l = len(s)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   739
    while pos < l:
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   740
        c = getutf8char(s, pos)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   741
        pos += len(c)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   742
        # unescape U+DCxx characters
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   743
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   744
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   745
        r += c
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   746
    return r