mercurial/pure/charencode.py
author Yuya Nishihara <yuya@tcha.org>
Sun, 23 Apr 2017 12:59:42 +0900
changeset 33926 f4433f2713d0
parent 33924 b9101467d88b
child 34217 5307cc57f271
permissions -rw-r--r--
encoding: add function to test if a str consists of ASCII characters Most strings are ASCII. Let's optimize for it. Using uint64_t is slightly faster than uint32_t on 64bit system, but there isn't huge difference.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     1
# charencode.py - miscellaneous character encoding
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     2
#
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     3
#  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     4
#
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     5
# This software may be used and distributed according to the terms of the
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     6
# GNU General Public License version 2 or any later version.
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     7
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     8
from __future__ import absolute_import
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
     9
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    10
import array
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    11
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    12
from .. import (
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    13
    pycompat,
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    14
)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    15
33926
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    16
def isasciistr(s):
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    17
    try:
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    18
        s.decode('ascii')
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    19
        return True
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    20
    except UnicodeDecodeError:
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    21
        return False
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    22
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    23
def asciilower(s):
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    24
    '''convert a string to lowercase if ASCII
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    25
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    26
    Raises UnicodeDecodeError if non-ASCII characters are found.'''
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    27
    s.decode('ascii')
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    28
    return s.lower()
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    29
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    30
def asciiupper(s):
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    31
    '''convert a string to uppercase if ASCII
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    32
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    33
    Raises UnicodeDecodeError if non-ASCII characters are found.'''
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    34
    s.decode('ascii')
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
diff changeset
    35
    return s.upper()
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    36
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    37
_jsonmap = []
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    38
_jsonmap.extend("\\u%04x" % x for x in range(32))
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    39
_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    40
_jsonmap.append('\\u007f')
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    41
_jsonmap[0x09] = '\\t'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    42
_jsonmap[0x0a] = '\\n'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    43
_jsonmap[0x22] = '\\"'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    44
_jsonmap[0x5c] = '\\\\'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    45
_jsonmap[0x08] = '\\b'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    46
_jsonmap[0x0c] = '\\f'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    47
_jsonmap[0x0d] = '\\r'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    48
_paranoidjsonmap = _jsonmap[:]
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    49
_paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    50
_paranoidjsonmap[0x3e] = '\\u003e'  # '>'
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    51
_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    52
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    53
def jsonescapeu8fast(u8chars, paranoid):
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    54
    """Convert a UTF-8 byte string to JSON-escaped form (fast path)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    55
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    56
    Raises ValueError if non-ASCII characters have to be escaped.
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    57
    """
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    58
    if paranoid:
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    59
        jm = _paranoidjsonmap
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    60
    else:
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    61
        jm = _jsonmap
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    62
    try:
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    63
        return ''.join(jm[x] for x in bytearray(u8chars))
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    64
    except IndexError:
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    65
        raise ValueError
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    66
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    67
def jsonescapeu8fallback(u8chars, paranoid):
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    68
    """Convert a UTF-8 byte string to JSON-escaped form (slow path)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    69
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    70
    Escapes all non-ASCII characters no matter if paranoid is False.
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    71
    """
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    72
    if paranoid:
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    73
        jm = _paranoidjsonmap
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    74
    else:
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    75
        jm = _jsonmap
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    76
    # non-BMP char is represented as UTF-16 surrogate pair
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    77
    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    78
    u16codes.pop(0)  # drop BOM
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
    79
    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)