mercurial/encoding.py
author Siddharth Agarwal <sid0@fb.com>
Fri, 20 Nov 2015 16:43:25 -0800
changeset 27079 a88a10a933b2
parent 26963 de5ae97ce9f4
child 27355 b479fc425a81
permissions -rw-r--r--
mergestate: add a method to compute actions to perform on dirstate We're going to use this to extend the action lists in merge.applyupdates. The somewhat funky return value is to make passing this dict directly into recordactions easier. We're going to exploit that in an upcoming patch.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     1
# encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     2
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     3
#  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     4
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     5
# This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
     6
# GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
     7
22973
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
     8
import error
12062
c327bfa5e831 cleanup: remove unused imports
Brodie Rao <brodie@bitheap.org>
parents: 11892
diff changeset
     9
import unicodedata, locale, os
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    10
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    11
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    12
# "Unicode Subtleties"), so we need to ignore them in some places for
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    13
# sanity.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    14
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    15
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    16
           "206a 206b 206c 206d 206e 206f feff".split()]
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    17
# verify the next function will work
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    18
assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"])
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    19
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    20
def hfsignoreclean(s):
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    21
    """Remove codepoints ignored by HFS+ from s.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    22
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    23
    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    24
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    25
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    26
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    27
    """
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    28
    if "\xe2" in s or "\xef" in s:
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    29
        for c in _ignore:
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    30
            s = s.replace(c, '')
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    31
    return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    32
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    33
def _getpreferredencoding():
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    34
    '''
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    35
    On darwin, getpreferredencoding ignores the locale environment and
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    36
    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    37
    for Python 2.7 and up. This is the same corrected code for earlier
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    38
    Python versions.
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    39
12770
614f0d8724ab check-code: find trailing whitespace
Martin Geisler <mg@lazybytes.net>
parents: 12062
diff changeset
    40
    However, we can't use a version check for this method, as some distributions
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    41
    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    42
    encoding, as it is unlikely that this encoding is the actually expected.
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    43
    '''
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    44
    try:
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    45
        locale.CODESET
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    46
    except AttributeError:
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    47
        # Fall back to parsing environment variables :-(
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    48
        return locale.getdefaultlocale()[1]
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    49
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    50
    oldloc = locale.setlocale(locale.LC_CTYPE)
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    51
    locale.setlocale(locale.LC_CTYPE, "")
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    52
    result = locale.nl_langinfo(locale.CODESET)
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    53
    locale.setlocale(locale.LC_CTYPE, oldloc)
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    54
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    55
    return result
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    56
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    57
_encodingfixers = {
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    58
    '646': lambda: 'ascii',
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    59
    'ANSI_X3.4-1968': lambda: 'ascii',
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    60
    'mac-roman': _getpreferredencoding
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    61
}
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    62
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    63
try:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    64
    encoding = os.environ.get("HGENCODING")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    65
    if not encoding:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    66
        encoding = locale.getpreferredencoding() or 'ascii'
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    67
        encoding = _encodingfixers.get(encoding, lambda: encoding)()
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    68
except locale.Error:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    69
    encoding = 'ascii'
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    70
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    71
fallbackencoding = 'ISO-8859-1'
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    72
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    73
class localstr(str):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    74
    '''This class allows strings that are unmodified to be
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    75
    round-tripped to the local encoding and back'''
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    76
    def __new__(cls, u, l):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    77
        s = str.__new__(cls, l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    78
        s._utf8 = u
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    79
        return s
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    80
    def __hash__(self):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    81
        return hash(self._utf8) # avoid collisions in local string space
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    82
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    83
def tolocal(s):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    84
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    85
    Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    86
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    87
    All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    88
    implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    89
    other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    90
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    91
    replace unknown characters.
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    92
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    93
    The localstr class is used to cache the known UTF-8 encoding of
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    94
    strings next to their local representation to allow lossless
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    95
    round-trip conversion back to UTF-8.
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    96
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    97
    >>> u = 'foo: \\xc3\\xa4' # utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    98
    >>> l = tolocal(u)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
    99
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   100
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   101
    >>> fromlocal(l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   102
    'foo: \\xc3\\xa4'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   103
    >>> u2 = 'foo: \\xc3\\xa1'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   104
    >>> d = { l: 1, tolocal(u2): 2 }
18378
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   105
    >>> len(d) # no collision
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   106
    2
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   107
    >>> 'foo: ?' in d
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   108
    False
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   109
    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   110
    >>> l = tolocal(l1)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   111
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   112
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   113
    >>> fromlocal(l) # magically in utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   114
    'foo: \\xc3\\xa4'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   115
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   116
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   117
    try:
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   118
        try:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   119
            # make sure string is actually stored in UTF-8
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   120
            u = s.decode('UTF-8')
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   121
            if encoding == 'UTF-8':
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   122
                # fast path
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   123
                return s
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   124
            r = u.encode(encoding, "replace")
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   125
            if u == r.decode(encoding):
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   126
                # r is a safe, non-lossy encoding of s
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   127
                return r
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   128
            return localstr(s, r)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   129
        except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   130
            # we should only get here if we're looking at an ancient changeset
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   131
            try:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   132
                u = s.decode(fallbackencoding)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   133
                r = u.encode(encoding, "replace")
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   134
                if u == r.decode(encoding):
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   135
                    # r is a safe, non-lossy encoding of s
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   136
                    return r
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   137
                return localstr(u.encode('UTF-8'), r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   138
            except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   139
                u = s.decode("utf-8", "replace") # last ditch
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   140
                return u.encode(encoding, "replace") # can't round-trip
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   141
    except LookupError as k:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   142
        raise error.Abort(k, hint="please check your locale settings")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   143
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   144
def fromlocal(s):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   145
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   146
    Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   147
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   148
    We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   149
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   150
    characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   151
    'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   152
    Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   153
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   154
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   155
    # can we do a lossless round-trip?
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   156
    if isinstance(s, localstr):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   157
        return s._utf8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   158
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   159
    try:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   160
        return s.decode(encoding, encodingmode).encode("utf-8")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   161
    except UnicodeDecodeError as inst:
10282
08a0f04b56bd many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents: 10263
diff changeset
   162
        sub = s[max(0, inst.start - 10):inst.start + 10]
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   163
        raise error.Abort("decoding near '%s': %s!" % (sub, inst))
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   164
    except LookupError as k:
15769
afdf4f5bac61 encoding: use hint markup for "please check your locale settings"
Mads Kiilerich <mads@kiilerich.com>
parents: 15672
diff changeset
   165
        raise error.Abort(k, hint="please check your locale settings")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   166
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   167
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   168
wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   169
        and "WFA" or "WF")
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   170
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   171
def colwidth(s):
15142
176882876780 encoding: colwidth input is in the local encoding
Matt Mackall <mpm@selenic.com>
parents: 15066
diff changeset
   172
    "Find the column width of a string for display in the local encoding"
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   173
    return ucolwidth(s.decode(encoding, 'replace'))
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   174
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   175
def ucolwidth(d):
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   176
    "Find the column width of a Unicode string for display"
14951
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   177
    eaw = getattr(unicodedata, 'east_asian_width', None)
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   178
    if eaw is not None:
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   179
        return sum([eaw(c) in wide and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   180
    return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   181
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   182
def getcols(s, start, c):
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   183
    '''Use colwidth to find a c-column substring of s starting at byte
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   184
    index start'''
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   185
    for x in xrange(start + c, len(s)):
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   186
        t = s[start:x]
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   187
        if colwidth(t) == c:
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   188
            return t
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   189
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   190
def trim(s, width, ellipsis='', leftside=False):
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   191
    """Trim string 's' to at most 'width' columns (including 'ellipsis').
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   192
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   193
    If 'leftside' is True, left side of string 's' is trimmed.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   194
    'ellipsis' is always placed at trimmed side.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   195
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   196
    >>> ellipsis = '+++'
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   197
    >>> from mercurial import encoding
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   198
    >>> encoding.encoding = 'utf-8'
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   199
    >>> t= '1234567890'
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   200
    >>> print trim(t, 12, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   201
    1234567890
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   202
    >>> print trim(t, 10, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   203
    1234567890
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   204
    >>> print trim(t, 8, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   205
    12345+++
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   206
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   207
    +++67890
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   208
    >>> print trim(t, 8)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   209
    12345678
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   210
    >>> print trim(t, 8, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   211
    34567890
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   212
    >>> print trim(t, 3, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   213
    +++
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   214
    >>> print trim(t, 1, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   215
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   216
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   217
    >>> t = u.encode(encoding.encoding)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   218
    >>> print trim(t, 12, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   219
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   220
    >>> print trim(t, 10, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   221
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   222
    >>> print trim(t, 8, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   223
    \xe3\x81\x82\xe3\x81\x84+++
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   224
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   225
    +++\xe3\x81\x88\xe3\x81\x8a
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   226
    >>> print trim(t, 5)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   227
    \xe3\x81\x82\xe3\x81\x84
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   228
    >>> print trim(t, 5, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   229
    \xe3\x81\x88\xe3\x81\x8a
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   230
    >>> print trim(t, 4, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   231
    +++
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   232
    >>> print trim(t, 4, ellipsis=ellipsis, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   233
    +++
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   234
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   235
    >>> print trim(t, 12, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   236
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   237
    >>> print trim(t, 10, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   238
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   239
    >>> print trim(t, 8, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   240
    \x11\x22\x33\x44\x55+++
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   241
    >>> print trim(t, 8, ellipsis=ellipsis, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   242
    +++\x66\x77\x88\x99\xaa
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   243
    >>> print trim(t, 8)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   244
    \x11\x22\x33\x44\x55\x66\x77\x88
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   245
    >>> print trim(t, 8, leftside=True)
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   246
    \x33\x44\x55\x66\x77\x88\x99\xaa
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   247
    >>> print trim(t, 3, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   248
    +++
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   249
    >>> print trim(t, 1, ellipsis=ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   250
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   251
    """
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   252
    try:
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   253
        u = s.decode(encoding)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   254
    except UnicodeDecodeError:
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   255
        if len(s) <= width: # trimming is not needed
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   256
            return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   257
        width -= len(ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   258
        if width <= 0: # no enough room even for ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   259
            return ellipsis[:width + len(ellipsis)]
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   260
        if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   261
            return ellipsis + s[-width:]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   262
        return s[:width] + ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   263
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   264
    if ucolwidth(u) <= width: # trimming is not needed
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   265
        return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   266
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   267
    width -= len(ellipsis)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   268
    if width <= 0: # no enough room even for ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   269
        return ellipsis[:width + len(ellipsis)]
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   270
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   271
    if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   272
        uslice = lambda i: u[i:]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   273
        concat = lambda s: ellipsis + s
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   274
    else:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   275
        uslice = lambda i: u[:-i]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   276
        concat = lambda s: s + ellipsis
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   277
    for i in xrange(1, len(u)):
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   278
        usub = uslice(i)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   279
        if ucolwidth(usub) <= width:
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   280
            return concat(usub.encode(encoding))
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   281
    return ellipsis # no enough room for multi-column characters
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   282
22973
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   283
def _asciilower(s):
22778
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   284
    '''convert a string to lowercase if ASCII
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   285
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   286
    Raises UnicodeDecodeError if non-ASCII characters are found.'''
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   287
    s.decode('ascii')
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   288
    return s.lower()
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   289
22973
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   290
def asciilower(s):
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   291
    # delay importing avoids cyclic dependency around "parsers" in
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   292
    # pure Python build (util => i18n => encoding => parsers => util)
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   293
    import parsers
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   294
    impl = getattr(parsers, 'asciilower', _asciilower)
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   295
    global asciilower
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   296
    asciilower = impl
bcff9ecdaae0 encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 22779
diff changeset
   297
    return impl(s)
22778
80f2b63dd83a parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 22426
diff changeset
   298
24578
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   299
def _asciiupper(s):
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   300
    '''convert a string to uppercase if ASCII
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   301
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   302
    Raises UnicodeDecodeError if non-ASCII characters are found.'''
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   303
    s.decode('ascii')
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   304
    return s.upper()
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   305
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   306
def asciiupper(s):
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   307
    # delay importing avoids cyclic dependency around "parsers" in
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   308
    # pure Python build (util => i18n => encoding => parsers => util)
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   309
    import parsers
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   310
    impl = getattr(parsers, 'asciiupper', _asciiupper)
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   311
    global asciiupper
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   312
    asciiupper = impl
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   313
    return impl(s)
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   314
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   315
def lower(s):
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   316
    "best-effort encoding-aware case-folding of local string s"
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   317
    try:
22779
d9585dda63c3 encoding.lower: use fast ASCII lower
Siddharth Agarwal <sid0@fb.com>
parents: 22778
diff changeset
   318
        return asciilower(s)
17235
3745ae495ce5 encoding: use s.decode to trigger UnicodeDecodeError
Martin Geisler <mg@aragost.com>
parents: 16493
diff changeset
   319
    except UnicodeDecodeError:
16387
c481761033bd encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents: 16274
diff changeset
   320
        pass
c481761033bd encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents: 16274
diff changeset
   321
    try:
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   322
        if isinstance(s, localstr):
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   323
            u = s._utf8.decode("utf-8")
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   324
        else:
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   325
            u = s.decode(encoding, encodingmode)
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   326
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   327
        lu = u.lower()
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   328
        if u == lu:
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   329
            return s # preserve localstring
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   330
        return lu.encode(encoding)
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   331
    except UnicodeError:
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   332
        return s.lower() # we don't know how to fold this except in ASCII
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   333
    except LookupError as k:
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   334
        raise error.Abort(k, hint="please check your locale settings")
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   335
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   336
def upper(s):
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   337
    "best-effort encoding-aware case-folding of local string s"
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   338
    try:
24578
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   339
        return asciiupper(s)
17236
9fb8312dbdbd encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents: 17235
diff changeset
   340
    except UnicodeDecodeError:
24597
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
   341
        return upperfallback(s)
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
   342
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
   343
def upperfallback(s):
17236
9fb8312dbdbd encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents: 17235
diff changeset
   344
    try:
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   345
        if isinstance(s, localstr):
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   346
            u = s._utf8.decode("utf-8")
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   347
        else:
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   348
            u = s.decode(encoding, encodingmode)
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   349
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   350
        uu = u.upper()
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   351
        if u == uu:
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   352
            return s # preserve localstring
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   353
        return uu.encode(encoding)
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   354
    except UnicodeError:
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   355
        return s.upper() # we don't know how to fold this except in ASCII
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   356
    except LookupError as k:
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   357
        raise error.Abort(k, hint="please check your locale settings")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   358
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   359
class normcasespecs(object):
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   360
    '''what a platform's normcase does to ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   361
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   362
    This is specified per platform, and should be consistent with what normcase
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   363
    on that platform actually does.
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   364
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   365
    lower: normcase lowercases ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   366
    upper: normcase uppercases ASCII strings
24608
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   367
    other: the fallback function should always be called
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   368
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   369
    This should be kept in sync with normcase_spec in util.h.'''
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   370
    lower = -1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   371
    upper = 1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   372
    other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   373
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   374
_jsonmap = {}
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   375
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   376
def jsonescape(s):
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   377
    '''returns a string suitable for JSON
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   378
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   379
    JSON is problematic for us because it doesn't support non-Unicode
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   380
    bytes. To deal with this, we take the following approach:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   381
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   382
    - localstr objects are converted back to UTF-8
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   383
    - valid UTF-8/ASCII strings are passed as-is
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   384
    - other strings are converted to UTF-8b surrogate encoding
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   385
    - apply JSON-specified string escaping
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   386
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   387
    (escapes are doubled in these tests)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   388
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   389
    >>> jsonescape('this is a test')
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   390
    'this is a test'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   391
    >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\')
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   392
    'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   393
    >>> jsonescape('a weird byte: \\xdd')
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   394
    'a weird byte: \\xed\\xb3\\x9d'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   395
    >>> jsonescape('utf-8: caf\\xc3\\xa9')
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   396
    'utf-8: caf\\xc3\\xa9'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   397
    >>> jsonescape('')
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   398
    ''
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   399
    '''
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   400
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   401
    if not _jsonmap:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   402
        for x in xrange(32):
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   403
            _jsonmap[chr(x)] = "\u%04x" %x
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   404
        for x in xrange(32, 256):
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   405
            c = chr(x)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   406
            _jsonmap[c] = c
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   407
        _jsonmap['\t'] = '\\t'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   408
        _jsonmap['\n'] = '\\n'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   409
        _jsonmap['\"'] = '\\"'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   410
        _jsonmap['\\'] = '\\\\'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   411
        _jsonmap['\b'] = '\\b'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   412
        _jsonmap['\f'] = '\\f'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   413
        _jsonmap['\r'] = '\\r'
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   414
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   415
    return ''.join(_jsonmap[c] for c in toutf8b(s))
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   416
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   417
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   418
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   419
def getutf8char(s, pos):
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   420
    '''get the next full utf-8 character in the given string, starting at pos
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   421
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   422
    Raises a UnicodeError if the given location does not start a valid
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   423
    utf-8 character.
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   424
    '''
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   425
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   426
    # find how many bytes to attempt decoding from first nibble
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   427
    l = _utf8len[ord(s[pos]) >> 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   428
    if not l: # ascii
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   429
        return s[pos]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   430
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   431
    c = s[pos:pos + l]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   432
    # validate with attempted decode
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   433
    c.decode("utf-8")
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   434
    return c
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   435
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   436
def toutf8b(s):
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   437
    '''convert a local, possibly-binary string into UTF-8b
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   438
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   439
    This is intended as a generic method to preserve data when working
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   440
    with schemes like JSON and XML that have no provision for
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   441
    arbitrary byte strings. As Mercurial often doesn't know
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   442
    what encoding data is in, we use so-called UTF-8b.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   443
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   444
    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   445
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   446
    uDC00-uDCFF.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   447
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   448
    Principles of operation:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   449
17424
e7cfe3587ea4 fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents: 17236
diff changeset
   450
    - ASCII and UTF-8 data successfully round-trips and is understood
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   451
      by Unicode-oriented clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   452
    - filenames and file contents in arbitrary other encodings can have
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   453
      be round-tripped or recovered by clueful clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   454
    - local strings that have a cached known UTF-8 encoding (aka
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   455
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   456
      Unicode data they want
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   457
    - because we must preserve UTF-8 bytestring in places such as
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   458
      filenames, metadata can't be roundtripped without help
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   459
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   460
    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   461
    arbitrary bytes into an internal Unicode format that can be
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   462
    re-encoded back into the original. Here we are exposing the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   463
    internal surrogate encoding as a UTF-8 string.)
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   464
    '''
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   465
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   466
    if "\xed" not in s:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   467
        if isinstance(s, localstr):
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   468
            return s._utf8
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   469
        try:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   470
            s.decode('utf-8')
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   471
            return s
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   472
        except UnicodeDecodeError:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   473
            pass
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   474
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   475
    r = ""
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   476
    pos = 0
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   477
    l = len(s)
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   478
    while pos < l:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   479
        try:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   480
            c = getutf8char(s, pos)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   481
            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   482
                # have to re-escape existing U+DCxx characters
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   483
                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   484
                pos += 1
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   485
            else:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   486
                pos += len(c)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   487
        except UnicodeDecodeError:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   488
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   489
            pos += 1
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   490
        r += c
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   491
    return r
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   492
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   493
def fromutf8b(s):
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   494
    '''Given a UTF-8b string, return a local, possibly-binary string.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   495
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   496
    return the original binary string. This
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   497
    is a round-trip process for strings like filenames, but metadata
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   498
    that's was passed through tolocal will remain in UTF-8.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   499
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   500
    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   501
    >>> m = "\\xc3\\xa9\\x99abcd"
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   502
    >>> toutf8b(m)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   503
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   504
    >>> roundtrip(m)
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   505
    True
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   506
    >>> roundtrip("\\xc2\\xc2\\x80")
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   507
    True
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   508
    >>> roundtrip("\\xef\\xbf\\xbd")
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   509
    True
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   510
    >>> roundtrip("\\xef\\xef\\xbf\\xbd")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   511
    True
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   512
    '''
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   513
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   514
    # fast path - look for uDxxx prefixes in s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   515
    if "\xed" not in s:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   516
        return s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   517
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   518
    u = s.decode("utf-8")
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   519
    r = ""
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   520
    for c in u:
26877
cb467a9d7593 encoding: handle non-BMP characters in fromutf8b
Matt Mackall <mpm@selenic.com>
parents: 26875
diff changeset
   521
        if ord(c) & 0xffff00 == 0xdc00:
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   522
            r += chr(ord(c) & 0xff)
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   523
        else:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   524
            r += c.encode("utf-8")
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   525
    return r