Update win32mbcs extension
authorShun-ichi Goto <shunichi.goto@gmail.com>
Wed, 13 Aug 2008 20:18:40 -0500
changeset 6887 304484c7e0ba
parent 6886 41aaaa23745f
child 6888 7c36a4fb05a3
Update win32mbcs extension * Code cleanup by Matt. * Fix the issue with case-insensitive fs support by also wrapping util.fspath() and util.checkcase() * Abort program when path conversion fails.
hgext/win32mbcs.py
--- a/hgext/win32mbcs.py	Wed Aug 13 20:18:40 2008 -0500
+++ b/hgext/win32mbcs.py	Wed Aug 13 20:18:40 2008 -0500
@@ -1,35 +1,43 @@
-# win32mbcs.py -- MBCS filename support for Mercurial on Windows
+# win32mbcs.py -- MBCS filename support for Mercurial
 #
 # Copyright (c) 2008 Shun-ichi Goto <shunichi.goto@gmail.com>
 #
-# Version: 0.1
+# Version: 0.2
 # Author:  Shun-ichi Goto <shunichi.goto@gmail.com>
 #
 # This software may be used and distributed according to the terms
 # of the GNU General Public License, incorporated herein by reference.
 #
-"""Allow to use shift_jis/big5 filenames on Windows.
-
-There is a well known issue "0x5c problem" on Windows.  It is a
-trouble on handling path name as raw encoded byte sequence of
-problematic encodings like shift_jis or big5.  The primary intent
-of this extension is to allow using such a encoding on Mercurial
-without strange file operation error.
+"""Allow the use of MBCS paths with problematic encodings.
 
-By enabling this extension, hook mechanism is activated and some
-functions are altered.  Usually, this encoding is your local encoding
-on your system by default. So you can get benefit simply by enabling
-this extension.
-
-The encoding for filename is same one for terminal by default.  You
-can change the encoding by setting HGENCODING environment variable.
+Some MBCS encodings are not good for some path operations
+(e.g. splitting path, case conversion, etc.) with their encoded bytes.
+We call such an encoding (e.g. shift_jis and big5) a "problematic
+encoding".  This extension can be used to fix the issue with those
+encodings by wrapping some functions to convert to unicode strings
+before path operations.
 
 This extension is usefull for:
- * Japanese Windows user using shift_jis encoding.
- * Chinese Windows user using big5 encoding.
- * Users who want to use a repository created with such a encoding.
+ * Japanese Windows users using shift_jis encoding.
+ * Chinese Windows users using big5 encoding.
+ * All users who use a repository with one of the problematic
+   encodings on a case-insensitive file system.
+
+This extension is not needed for:
+ * Any user who uses only ASCII characters in paths.
+ * Any user who does not use any of the problematic encodings.
 
-Note: Unix people does not need to use this extension.
+Note that there are some limitations on using this extension:
+ * You should use a single encoding in one repository.
+ * You should set the same encoding for the repository via locale or HGENCODING.
+
+To use this extension, enable the extension in .hg/hgrc or ~/.hgrc:
+
+  [extensions]
+  hgext.win32mbcs =
+
+Path encoding conversion is done between unicode and util._encoding,
+which is determined by Mercurial from the current locale setting or HGENCODING.
 
 """
 
@@ -37,122 +45,78 @@
 from mercurial.i18n import _
 from mercurial import util
 
-__all__ = ['install', 'uninstall', 'reposetup']
+def decode(arg):
+   if isinstance(arg, str):
+       uarg = arg.decode(util._encoding)
+       if arg == uarg.encode(util._encoding):
+           return uarg
+       raise UnicodeError("Not local encoding")
+   elif isinstance(arg, tuple):
+       return tuple(map(decode, arg))
+   elif isinstance(arg, list):
+       return map(decode, arg)
+   return arg
+
+def encode(arg):
+   if isinstance(arg, unicode):
+       return arg.encode(util._encoding)
+   elif isinstance(arg, tuple):
+       return tuple(map(encode, arg))
+   elif isinstance(arg, list):
+       return map(encode, arg)
+   return arg
+
+def wrapper(func, args):
+   # check argument is unicode, then call original
+   for arg in args:
+       if isinstance(arg, unicode):
+           return func(*args)
 
+   try:
+       # convert arguments to unicode, call func, then convert back
+       return encode(func(*decode(args)))
+   except UnicodeError:
+       # If the arguments are not encoded with util._encoding, abort
+       # instead of falling back to calling the original function.
+      raise util.Abort(_("[win32mbcs] filename conversion fail with"
+                         " %s encoding\n") % (util._encoding))
+
+def wrapname(name):
+   idx = name.rfind('.')
+   module = name[:idx]
+   name = name[idx+1:]
+   module = eval(module)
+   func = getattr(module, name)
+   def f(*args):
+       return wrapper(func, args)
+   try:
+      f.__name__ = func.__name__                # fail with python23
+   except Exception:
+      pass
+   setattr(module, name, f)
+
+# List of functions to be wrapped.
+# NOTE: os.path.dirname() and os.path.basename() are safe because
+#       they use result of os.path.split()
+funcs = '''os.path.join os.path.split os.path.splitext
+ os.path.splitunc os.path.normpath os.path.normcase os.makedirs
+ util.endswithsep util.splitpath util.checkcase util.fspath'''
 
 # codec and alias names of sjis and big5 to be faked.
-_problematic_encodings = util.frozenset([
-        'big5', 'big5-tw', 'csbig5',
-        'big5hkscs', 'big5-hkscs', 'hkscs',
-        'cp932', '932', 'ms932', 'mskanji', 'ms-kanji',
-        'shift_jis', 'csshiftjis', 'shiftjis', 'sjis', 's_jis',
-        'shift_jis_2004', 'shiftjis2004', 'sjis_2004', 'sjis2004',
-        'shift_jisx0213', 'shiftjisx0213', 'sjisx0213', 's_jisx0213',
-        ])
-
-# attribute name to store original function
-_ORIGINAL = '_original'
-
-_ui = None
-
-def decode_with_check(arg):
-    if isinstance(arg, tuple):
-        return tuple(map(decode_with_check, arg))
-    elif isinstance(arg, list):
-        return map(decode_with_check, arg)
-    elif isinstance(arg, str):
-        uarg = arg.decode(util._encoding)
-        if arg == uarg.encode(util._encoding):
-            return uarg
-        else:
-            raise UnicodeError("Not local encoding")
-    else:
-        return arg
-
-def encode_with_check(arg):
-    if isinstance(arg, tuple):
-        return tuple(map(encode_with_check, arg))
-    elif isinstance(arg, list):
-        return map(encode_with_check, arg)
-    elif isinstance(arg, unicode):
-        ret = arg.encode(util._encoding)
-        return ret
-    else:
-        return arg
-
-def wrap(func):
-
-    def wrapped(*args):
-        # check argument is unicode, then call original
-        for arg in args:
-            if isinstance(arg, unicode):
-                return func(*args)
-        # make decoded argument list into uargs
-        try:
-            args = decode_with_check(args)
-        except UnicodeError, exc:
-            # If not encoded with _local_fs_encoding, report it then
-            # continue with calling original function.
-            _ui.warn(_("WARNING: [win32mbcs] filename conversion fail for" +
-                     " %s: '%s'\n") % (util._encoding, args))
-            return func(*args)
-        # call as unicode operation, then return with encoding
-        return encode_with_check(func(*args))
-
-    # fake is only for relevant environment.
-    if hasattr(func, _ORIGINAL) or \
-            util._encoding.lower() not in _problematic_encodings:
-        return func
-    else:
-        f = wrapped
-        f.__name__ = func.__name__
-        setattr(f, _ORIGINAL, func)   # hold original to restore
-        return f
-
-def unwrap(func):
-    return getattr(func, _ORIGINAL, func)
-
-def install():
-    # wrap some python functions and mercurial functions
-    # to handle raw bytes on Windows.
-    # NOTE: dirname and basename is safe because they use result
-    # of os.path.split()
-    global _ui
-    if not _ui:
-        from mercurial import ui
-        _ui = ui.ui()
-    os.path.join = wrap(os.path.join)
-    os.path.split = wrap(os.path.split)
-    os.path.splitext = wrap(os.path.splitext)
-    os.path.splitunc = wrap(os.path.splitunc)
-    os.path.normpath = wrap(os.path.normpath)
-    os.path.normcase = wrap(os.path.normcase)
-    os.makedirs = wrap(os.makedirs)
-    util.endswithsep = wrap(util.endswithsep)
-    util.splitpath = wrap(util.splitpath)
-
-def uninstall():
-    # restore original functions.
-    os.path.join = unwrap(os.path.join)
-    os.path.split = unwrap(os.path.split)
-    os.path.splitext = unwrap(os.path.splitext)
-    os.path.splitunc = unwrap(os.path.splitunc)
-    os.path.normpath = unwrap(os.path.normpath)
-    os.path.normcase = unwrap(os.path.normcase)
-    os.makedirs = unwrap(os.makedirs)
-    util.endswithsep = unwrap(util.endswithsep)
-    util.splitpath = unwrap(util.splitpath)
-
+problematic_encodings = '''big5 big5-tw csbig5 big5hkscs big5-hkscs
+ hkscs cp932 932 ms932 mskanji ms-kanji shift_jis csshiftjis shiftjis
+ sjis s_jis shift_jis_2004 shiftjis2004 sjis_2004 sjis2004
+ shift_jisx0213 shiftjisx0213 sjisx0213 s_jisx0213'''
 
 def reposetup(ui, repo):
-    # TODO: decide use of config section for this extension
-    global _ui
-    _ui = ui
-    if not os.path.supports_unicode_filenames:
-        ui.warn(_("[win32mbcs] cannot activate on this platform.\n"))
-        return
-    # install features of this extension
-    install()
-    ui.debug(_("[win32mbcs] activeted with encoding: %s\n") % util._encoding)
+   # TODO: decide use of config section for this extension
+   if not os.path.supports_unicode_filenames:
+       ui.warn(_("[win32mbcs] cannot activate on this platform.\n"))
+       return
 
-# win32mbcs.py ends here
+   # fake is only for relevant environment.
+   if util._encoding.lower() in problematic_encodings.split():
+       for f in funcs.split():
+           wrapname(f)
+       ui.debug(_("[win32mbcs] activated with encoding: %s\n") % util._encoding)
+