minirst: use unicode string as intermediate form for replacement stable
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
Mon, 31 Oct 2011 21:06:18 +0900
branch stable
changeset 15393 87bb6b7644f6
parent 15392 d7bfbc92a1c0
child 15397 f32f71f6c20c
minirst: use unicode string as intermediate form for replacement # this change redoes part of 521c8e0c93bf, which was backed out by 0ad0ebe67815. Some character encodings use ASCII characters other than control characters, letters, and digits as part of multi-byte characters, so directly replacing such characters in strings in the local encoding produces invalid byte sequences. [mpm: test changed to simple doctest]
mercurial/minirst.py
tests/test-doctest.py
--- a/mercurial/minirst.py	Mon Oct 31 15:41:39 2011 -0500
+++ b/mercurial/minirst.py	Mon Oct 31 21:06:18 2011 +0900
@@ -23,9 +23,27 @@
 from i18n import _
 
 def replace(text, substs):
+    '''
+    Apply a list of (find, replace) pairs to a text.
+
+    >>> replace("foo bar", [('f', 'F'), ('b', 'B')])
+    'Foo Bar'
+    >>> encoding.encoding = 'latin1'
+    >>> replace('\\x81\\\\', [('\\\\', '/')])
+    '\\x81/'
+    >>> encoding.encoding = 'shiftjis'
+    >>> replace('\\x81\\\\', [('\\\\', '/')])
+    '\\x81\\\\'
+    '''
+
+    # some character encodings (cp932 for Japanese, at least) use
+    # ASCII characters other than control/alphabet/digit as a part of
+    # multi-bytes characters, so direct replacing with such characters
+    # on strings in local encoding causes invalid byte sequences.
+    utext = text.decode(encoding.encoding)
     for f, t in substs:
-        text = text.replace(f, t)
-    return text
+        utext = utext.replace(f, t)
+    return utext.encode(encoding.encoding)
 
 _blockre = re.compile(r"\n(?:\s*\n)+")
 
--- a/tests/test-doctest.py	Mon Oct 31 15:41:39 2011 -0500
+++ b/tests/test-doctest.py	Mon Oct 31 21:06:18 2011 +0900
@@ -36,3 +36,6 @@
 
 import mercurial.revset
 doctest.testmod(mercurial.revset)
+
+import mercurial.minirst
+doctest.testmod(mercurial.minirst)