posix: use getutf8char to handle OS X filename percent-escaping
author Matt Mackall <mpm@selenic.com>
Thu, 05 Nov 2015 17:09:00 -0600
changeset 26876 b8381832ce2b
parent 26875 cf47bdb2183c
child 26877 cb467a9d7593
posix: use getutf8char to handle OS X filename percent-escaping. This replaces an open-coded utf-8 parser that was ignoring subtle issues like overlong encodings.
mercurial/posix.py
--- a/mercurial/posix.py	Thu Nov 05 16:48:46 2015 -0600
+++ b/mercurial/posix.py	Thu Nov 05 17:09:00 2015 -0600
@@ -255,40 +255,17 @@
         except UnicodeDecodeError:
             # OS X percent-encodes any bytes that aren't valid utf-8
             s = ''
-            g = ''
-            l = 0
-            for c in path:
-                o = ord(c)
-                if l and o < 128 or o >= 192:
-                    # we want a continuation byte, but didn't get one
-                    s += ''.join(["%%%02X" % ord(x) for x in g])
-                    g = ''
-                    l = 0
-                if l == 0 and o < 128:
-                    # ascii
-                    s += c
-                elif l == 0 and 194 <= o < 245:
-                    # valid leading bytes
-                    if o < 224:
-                        l = 1
-                    elif o < 240:
-                        l = 2
-                    else:
-                        l = 3
-                    g = c
-                elif l > 0 and 128 <= o < 192:
-                    # valid continuations
-                    g += c
-                    l -= 1
-                    if not l:
-                        s += g
-                        g = ''
-                else:
-                    # invalid
-                    s += "%%%02X" % o
+            pos = 0
+            l = len(s)
+            while pos < l:
+                try:
+                    c = encoding.getutf8char(path, pos)
+                    pos += len(c)
+                except ValueError:
+                    c = '%%%%02X' % path[pos]
+                    pos += 1
+                s += c
 
-            # any remaining partial characters
-            s += ''.join(["%%%02X" % ord(x) for x in g])
             u = s.decode('utf-8')
 
         # Decompose then lowercase (HFS+ technote specifies lower)