py3: use 'surrogatepass' error handler to process U+DCxx transparently
author Yuya Nishihara <yuya@tcha.org>
Sat, 16 Sep 2017 22:55:48 +0900
changeset 34218 aa877860d4d7
parent 34217 5307cc57f271
child 34219 21fc747e1bc5
py3: use 'surrogatepass' error handler to process U+DCxx transparently. Encoding or decoding lone surrogate code points (U+DCxx) is disallowed by the default 'strict' error handler on Python 3. See https://docs.python.org/3/library/codecs.html#error-handlers
mercurial/encoding.py
mercurial/pure/charencode.py
tests/test-doctest.py
--- a/mercurial/encoding.py	Sat Sep 16 22:42:19 2017 +0900
+++ b/mercurial/encoding.py	Sat Sep 16 22:55:48 2017 +0900
@@ -448,6 +448,13 @@
         pass
     return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
+# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+# bytes are mapped to that range.
+if pycompat.ispy3:
+    _utf8strict = r'surrogatepass'
+else:
+    _utf8strict = r'strict'
+
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
 def getutf8char(s, pos):
@@ -464,7 +471,7 @@
 
     c = s[pos:pos + l]
     # validate with attempted decode
-    c.decode("utf-8")
+    c.decode("utf-8", _utf8strict)
     return c
 
 def toutf8b(s):
@@ -503,7 +510,7 @@
         if isinstance(s, localstr):
             return s._utf8
         try:
-            s.decode('utf-8')
+            s.decode('utf-8', _utf8strict)
             return s
         except UnicodeDecodeError:
             pass
@@ -517,12 +524,12 @@
             c = getutf8char(s, pos)
             if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
                 # have to re-escape existing U+DCxx characters
-                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
                 pos += 1
             else:
                 pos += len(c)
         except UnicodeDecodeError:
-            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
             pos += 1
         r += c
     return r
@@ -570,7 +577,7 @@
         pos += len(c)
         # unescape U+DCxx characters
         if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
-            c = chr(ord(c.decode("utf-8")) & 0xff)
+            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
         r += c
     return r
 
--- a/mercurial/pure/charencode.py	Sat Sep 16 22:42:19 2017 +0900
+++ b/mercurial/pure/charencode.py	Sat Sep 16 22:55:48 2017 +0900
@@ -64,6 +64,11 @@
     except IndexError:
         raise ValueError
 
+if pycompat.ispy3:
+    _utf8strict = r'surrogatepass'
+else:
+    _utf8strict = r'strict'
+
 def jsonescapeu8fallback(u8chars, paranoid):
     """Convert a UTF-8 byte string to JSON-escaped form (slow path)
 
@@ -74,6 +79,7 @@
     else:
         jm = _jsonmap
     # non-BMP char is represented as UTF-16 surrogate pair
-    u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
+    u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
+    u16codes = array.array(r'H', u16b)
     u16codes.pop(0)  # drop BOM
     return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
--- a/tests/test-doctest.py	Sat Sep 16 22:42:19 2017 +0900
+++ b/tests/test-doctest.py	Sat Sep 16 22:55:48 2017 +0900
@@ -50,7 +50,7 @@
 testmod('mercurial.context')
 testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
 testmod('mercurial.dispatch')
-testmod('mercurial.encoding', py3=False)  # py3: multiple encoding issues
+testmod('mercurial.encoding')
 testmod('mercurial.formatter', py3=False)  # py3: write bytes to stdout
 testmod('mercurial.hg')
 testmod('mercurial.hgweb.hgwebdir_mod', py3=False)  # py3: repr(bytes) ?