# HG changeset patch # User Yuya Nishihara # Date 1492926472 -32400 # Node ID 2c37f9dabc326370004ae9b66604e6c5ef6d6389 # Parent b9101467d88b232011609d63b4ddaa3e594cd489 encoding: add fast path of jsonescape() (issue5533) This isn't highly optimized as it copies characters one by one, but seems reasonably simple and not slow. (with mercurial repo) $ export HGRCPATH=/dev/null HGPLAIN= $ hg log --time --config experimental.stabilization=all -Tjson > /dev/null (original) time: real 6.830 secs (user 6.740+0.000 sys 0.080+0.000) time: real 6.690 secs (user 6.650+0.000 sys 0.040+0.000) time: real 6.700 secs (user 6.640+0.000 sys 0.060+0.000) (this patch) time: real 5.630 secs (user 5.550+0.000 sys 0.070+0.000) time: real 5.700 secs (user 5.650+0.000 sys 0.050+0.000) time: real 5.690 secs (user 5.640+0.000 sys 0.050+0.000) diff -r b9101467d88b -r 2c37f9dabc32 mercurial/cext/charencode.c --- a/mercurial/cext/charencode.c Sun Apr 23 16:10:51 2017 +0900 +++ b/mercurial/cext/charencode.c Sun Apr 23 14:47:52 2017 +0900 @@ -9,6 +9,7 @@ #define PY_SSIZE_T_CLEAN #include <Python.h> +#include <assert.h> #include "charencode.h" #include "util.h" @@ -63,6 +64,42 @@ '\x7b', '\x7c', '\x7d', '\x7e', '\x7f' }; +/* 1: no escape, 2: \, 6: \u */ +static const uint8_t jsonlentable[256] = { + 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 6, 2, 2, 6, 6, /* b, t, n, f, r */ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* " */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +}; + +static const uint8_t jsonparanoidlentable[128] = { + 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 6, 2, 2, 6, 6, /* b, t, n, f, r */ + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* " */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 6, 1, /* <, > */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */ +}; + +static const char hexchartable[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', +}; + /* * Turn a hex-encoded string into binary. */ @@ -217,3 +254,105 @@ Py_XDECREF(file_foldmap); return NULL; } + +/* calculate length of JSON-escaped string; returns -1 if unsupported */ +static Py_ssize_t jsonescapelen(const char *buf, Py_ssize_t len, bool paranoid) +{ + Py_ssize_t i, esclen = 0; + + if (paranoid) { + /* don't want to process multi-byte escapes in C */ + for (i = 0; i < len; i++) { + char c = buf[i]; + if (c & 0x80) { + PyErr_SetString(PyExc_ValueError, + "cannot process non-ascii str"); + return -1; + } + esclen += jsonparanoidlentable[(unsigned char)c]; + } + } else { + for (i = 0; i < len; i++) { + char c = buf[i]; + esclen += jsonlentable[(unsigned char)c]; + } + } + + return esclen; +} + +/* map '\' escape character */ +static char jsonescapechar2(char c) +{ + switch (c) { + case '\b': return 'b'; + case '\t': return 't'; + case '\n': return 'n'; + case '\f': return 'f'; + case '\r': return 'r'; + case '"': return '"'; + case '\\': return '\\'; + } + return '\0'; /* should not happen */ +} + +/* convert 'origbuf' to JSON-escaped form 'escbuf'; 'origbuf' should only + include characters mappable by json(paranoid)lentable */ +static void encodejsonescape(char *escbuf, Py_ssize_t esclen, + const char 
*origbuf, Py_ssize_t origlen, + bool paranoid) +{ + const uint8_t *lentable = + (paranoid) ? jsonparanoidlentable : jsonlentable; + Py_ssize_t i, j; + + for (i = 0, j = 0; i < origlen; i++) { + char c = origbuf[i]; + uint8_t l = lentable[(unsigned char)c]; + assert(j + l <= esclen); + switch (l) { + case 1: + escbuf[j] = c; + break; + case 2: + escbuf[j] = '\\'; + escbuf[j + 1] = jsonescapechar2(c); + break; + case 6: + memcpy(escbuf + j, "\\u00", 4); + escbuf[j + 4] = hexchartable[(unsigned char)c >> 4]; + escbuf[j + 5] = hexchartable[(unsigned char)c & 0xf]; + break; + } + j += l; + } +} + +PyObject *jsonescapeu8fast(PyObject *self, PyObject *args) +{ + PyObject *origstr, *escstr; + const char *origbuf; + Py_ssize_t origlen, esclen; + int paranoid; + if (!PyArg_ParseTuple(args, "O!i:jsonescapeu8fast", + &PyBytes_Type, &origstr, &paranoid)) + return NULL; + + origbuf = PyBytes_AS_STRING(origstr); + origlen = PyBytes_GET_SIZE(origstr); + esclen = jsonescapelen(origbuf, origlen, paranoid); + if (esclen < 0) + return NULL; /* unsupported char found */ + if (origlen == esclen) { + Py_INCREF(origstr); + return origstr; + } + + escstr = PyBytes_FromStringAndSize(NULL, esclen); + if (!escstr) + return NULL; + encodejsonescape(PyBytes_AS_STRING(escstr), esclen, origbuf, origlen, + paranoid); + + return escstr; +} diff -r b9101467d88b -r 2c37f9dabc32 mercurial/cext/charencode.h --- a/mercurial/cext/charencode.h Sun Apr 23 16:10:51 2017 +0900 +++ b/mercurial/cext/charencode.h Sun Apr 23 14:47:52 2017 +0900 @@ -22,6 +22,7 @@ PyObject *asciilower(PyObject *self, PyObject *args); PyObject *asciiupper(PyObject *self, PyObject *args); PyObject *make_file_foldmap(PyObject *self, PyObject *args); +PyObject *jsonescapeu8fast(PyObject *self, PyObject *args); static const int8_t hextable[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, diff -r b9101467d88b -r 2c37f9dabc32 mercurial/cext/parsers.c --- a/mercurial/cext/parsers.c Sun Apr 23 16:10:51 2017 +0900 +++ 
b/mercurial/cext/parsers.c Sun Apr 23 14:47:52 2017 +0900 @@ -702,6 +702,8 @@ "construct a dict with an expected size\n"}, {"make_file_foldmap", make_file_foldmap, METH_VARARGS, "make file foldmap\n"}, + {"jsonescapeu8fast", jsonescapeu8fast, METH_VARARGS, + "escape a UTF-8 byte string to JSON (fast path)\n"}, {"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"}, {"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"}, {"lowerencode", lowerencode, METH_VARARGS, "lower-encode a path\n"}, @@ -714,7 +716,7 @@ void manifest_module_init(PyObject *mod); void revlog_module_init(PyObject *mod); -static const int version = 1; +static const int version = 2; static void module_init(PyObject *mod) { diff -r b9101467d88b -r 2c37f9dabc32 mercurial/encoding.py --- a/mercurial/encoding.py Sun Apr 23 16:10:51 2017 +0900 +++ b/mercurial/encoding.py Sun Apr 23 14:47:52 2017 +0900 @@ -26,7 +26,7 @@ asciilower = charencode.asciilower asciiupper = charencode.asciiupper -_jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure" +_jsonescapeu8fast = charencode.jsonescapeu8fast _sysstr = pycompat.sysstr @@ -404,8 +404,8 @@ 'this is a test' >>> jsonescape('escape characters: \\0 \\x0b \\x7f') 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' - >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\') - 'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\' + >>> jsonescape('escape characters: \\b \\t \\n \\f \\r \\" \\\\') + 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' >>> jsonescape('a weird byte: \\xdd') 'a weird byte: \\xed\\xb3\\x9d' >>> jsonescape('utf-8: caf\\xc3\\xa9') @@ -416,6 +416,10 @@ If paranoid, non-ascii and common troublesome characters are also escaped. This is suitable for web output. 
+ >>> s = 'escape characters: \\0 \\x0b \\x7f' + >>> assert jsonescape(s) == jsonescape(s, paranoid=True) + >>> s = 'escape characters: \\b \\t \\n \\f \\r \\" \\\\' + >>> assert jsonescape(s) == jsonescape(s, paranoid=True) >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) 'escape boundary: ~ \\\\u007f \\\\u0080' >>> jsonescape('a weird byte: \\xdd', paranoid=True) diff -r b9101467d88b -r 2c37f9dabc32 mercurial/policy.py --- a/mercurial/policy.py Sun Apr 23 16:10:51 2017 +0900 +++ b/mercurial/policy.py Sun Apr 23 14:47:52 2017 +0900 @@ -75,7 +75,7 @@ (r'cext', r'diffhelpers'): 1, (r'cext', r'mpatch'): 1, (r'cext', r'osutil'): 1, - (r'cext', r'parsers'): 1, + (r'cext', r'parsers'): 2, } # map import request to other package or module