stringutil: move generic string helpers to new module
author Yuya Nishihara <yuya@tcha.org>
Thu, 22 Mar 2018 21:19:31 +0900
changeset 37083 f99d64e8a4e4
parent 37082 1a1d1c44b570
child 37084 f0b6fbea00cf
stringutil: move generic string helpers to new module. Per https://phab.mercurial-scm.org/D2903#46738 — URL and file path functions are left in place, since they are big enough to warrant separate modules of their own.
mercurial/util.py
mercurial/utils/stringutil.py
tests/test-doctest.py
--- a/mercurial/util.py	Thu Mar 22 21:32:19 2018 +0900
+++ b/mercurial/util.py	Thu Mar 22 21:19:31 2018 +0900
@@ -17,7 +17,6 @@
 
 import abc
 import bz2
-import codecs
 import collections
 import contextlib
 import errno
@@ -37,7 +36,6 @@
 import subprocess
 import sys
 import tempfile
-import textwrap
 import time
 import traceback
 import warnings
@@ -52,7 +50,10 @@
     pycompat,
     urllibcompat,
 )
-from .utils import dateutil
+from .utils import (
+    dateutil,
+    stringutil,
+)
 
 base85 = policy.importmod(r'base85')
 osutil = policy.importmod(r'osutil')
@@ -808,20 +809,6 @@
         return object.__getattribute__(self, r'_observedcall')(
             r'setsockopt', *args, **kwargs)
 
-_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
-_DATA_ESCAPE_MAP.update({
-    b'\\': b'\\\\',
-    b'\r': br'\r',
-    b'\n': br'\n',
-})
-_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
-
-def escapedata(s):
-    if isinstance(s, bytearray):
-        s = bytes(s)
-
-    return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s)
-
 class baseproxyobserver(object):
     def _writedata(self, data):
         if not self.logdata:
@@ -1567,10 +1554,6 @@
             return fn(s, cmd[len(name):].lstrip())
     return pipefilter(s, cmd)
 
-def binary(s):
-    """return true if a string is binary data"""
-    return bool(s and '\0' in s)
-
 def increasingchunks(source, min=1024, max=65536):
     '''return no less than min bytes per chunk while data remains,
     doubling min after each chunk until it reaches max'''
@@ -2571,102 +2554,6 @@
         b[0:len(res)] = res
         return len(res)
 
-def stringmatcher(pattern, casesensitive=True):
-    """
-    accepts a string, possibly starting with 're:' or 'literal:' prefix.
-    returns the matcher name, pattern, and matcher function.
-    missing or unknown prefixes are treated as literal matches.
-
-    helper for tests:
-    >>> def test(pattern, *tests):
-    ...     kind, pattern, matcher = stringmatcher(pattern)
-    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
-    >>> def itest(pattern, *tests):
-    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
-    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
-
-    exact matching (no prefix):
-    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
-    ('literal', 'abcdefg', [False, False, True])
-
-    regex matching ('re:' prefix)
-    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
-    ('re', 'a.+b', [False, False, True])
-
-    force exact matches ('literal:' prefix)
-    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
-    ('literal', 're:foobar', [False, True])
-
-    unknown prefixes are ignored and treated as literals
-    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
-    ('literal', 'foo:bar', [False, False, True])
-
-    case insensitive regex matches
-    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
-    ('re', 'A.+b', [False, False, True])
-
-    case insensitive literal matches
-    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
-    ('literal', 'ABCDEFG', [False, False, True])
-    """
-    if pattern.startswith('re:'):
-        pattern = pattern[3:]
-        try:
-            flags = 0
-            if not casesensitive:
-                flags = remod.I
-            regex = remod.compile(pattern, flags)
-        except remod.error as e:
-            raise error.ParseError(_('invalid regular expression: %s')
-                                   % e)
-        return 're', pattern, regex.search
-    elif pattern.startswith('literal:'):
-        pattern = pattern[8:]
-
-    match = pattern.__eq__
-
-    if not casesensitive:
-        ipat = encoding.lower(pattern)
-        match = lambda s: ipat == encoding.lower(s)
-    return 'literal', pattern, match
-
-def shortuser(user):
-    """Return a short representation of a user name or email address."""
-    f = user.find('@')
-    if f >= 0:
-        user = user[:f]
-    f = user.find('<')
-    if f >= 0:
-        user = user[f + 1:]
-    f = user.find(' ')
-    if f >= 0:
-        user = user[:f]
-    f = user.find('.')
-    if f >= 0:
-        user = user[:f]
-    return user
-
-def emailuser(user):
-    """Return the user portion of an email address."""
-    f = user.find('@')
-    if f >= 0:
-        user = user[:f]
-    f = user.find('<')
-    if f >= 0:
-        user = user[f + 1:]
-    return user
-
-def email(author):
-    '''get email of author.'''
-    r = author.find('>')
-    if r == -1:
-        r = None
-    return author[author.find('<') + 1:r]
-
-def ellipsis(text, maxlength=400):
-    """Trim string to at most maxlength (default: 400) columns in display."""
-    return encoding.trim(text, maxlength, ellipsis='...')
-
 def unitcountfn(*unittable):
     '''return a function that renders a readable count of some quantity'''
 
@@ -2751,147 +2638,6 @@
     fromnativeeol = pycompat.identity
     nativeeolwriter = pycompat.identity
 
-def escapestr(s):
-    # call underlying function of s.encode('string_escape') directly for
-    # Python 3 compatibility
-    return codecs.escape_encode(s)[0]
-
-def unescapestr(s):
-    return codecs.escape_decode(s)[0]
-
-def forcebytestr(obj):
-    """Portably format an arbitrary object (e.g. exception) into a byte
-    string."""
-    try:
-        return pycompat.bytestr(obj)
-    except UnicodeEncodeError:
-        # non-ascii string, may be lossy
-        return pycompat.bytestr(encoding.strtolocal(str(obj)))
-
-def uirepr(s):
-    # Avoid double backslash in Windows path repr()
-    return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\')
-
-# delay import of textwrap
-def _MBTextWrapper(**kwargs):
-    class tw(textwrap.TextWrapper):
-        """
-        Extend TextWrapper for width-awareness.
-
-        Neither number of 'bytes' in any encoding nor 'characters' is
-        appropriate to calculate terminal columns for specified string.
-
-        Original TextWrapper implementation uses built-in 'len()' directly,
-        so overriding is needed to use width information of each characters.
-
-        In addition, characters classified into 'ambiguous' width are
-        treated as wide in East Asian area, but as narrow in other.
-
-        This requires use decision to determine width of such characters.
-        """
-        def _cutdown(self, ucstr, space_left):
-            l = 0
-            colwidth = encoding.ucolwidth
-            for i in xrange(len(ucstr)):
-                l += colwidth(ucstr[i])
-                if space_left < l:
-                    return (ucstr[:i], ucstr[i:])
-            return ucstr, ''
-
-        # overriding of base class
-        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
-            space_left = max(width - cur_len, 1)
-
-            if self.break_long_words:
-                cut, res = self._cutdown(reversed_chunks[-1], space_left)
-                cur_line.append(cut)
-                reversed_chunks[-1] = res
-            elif not cur_line:
-                cur_line.append(reversed_chunks.pop())
-
-        # this overriding code is imported from TextWrapper of Python 2.6
-        # to calculate columns of string by 'encoding.ucolwidth()'
-        def _wrap_chunks(self, chunks):
-            colwidth = encoding.ucolwidth
-
-            lines = []
-            if self.width <= 0:
-                raise ValueError("invalid width %r (must be > 0)" % self.width)
-
-            # Arrange in reverse order so items can be efficiently popped
-            # from a stack of chucks.
-            chunks.reverse()
-
-            while chunks:
-
-                # Start the list of chunks that will make up the current line.
-                # cur_len is just the length of all the chunks in cur_line.
-                cur_line = []
-                cur_len = 0
-
-                # Figure out which static string will prefix this line.
-                if lines:
-                    indent = self.subsequent_indent
-                else:
-                    indent = self.initial_indent
-
-                # Maximum width for this line.
-                width = self.width - len(indent)
-
-                # First chunk on line is whitespace -- drop it, unless this
-                # is the very beginning of the text (i.e. no lines started yet).
-                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
-                    del chunks[-1]
-
-                while chunks:
-                    l = colwidth(chunks[-1])
-
-                    # Can at least squeeze this chunk onto the current line.
-                    if cur_len + l <= width:
-                        cur_line.append(chunks.pop())
-                        cur_len += l
-
-                    # Nope, this line is full.
-                    else:
-                        break
-
-                # The current line is full, and the next chunk is too big to
-                # fit on *any* line (not just this one).
-                if chunks and colwidth(chunks[-1]) > width:
-                    self._handle_long_word(chunks, cur_line, cur_len, width)
-
-                # If the last chunk on this line is all whitespace, drop it.
-                if (self.drop_whitespace and
-                    cur_line and cur_line[-1].strip() == r''):
-                    del cur_line[-1]
-
-                # Convert current line back to a string and store it in list
-                # of all lines (return value).
-                if cur_line:
-                    lines.append(indent + r''.join(cur_line))
-
-            return lines
-
-    global _MBTextWrapper
-    _MBTextWrapper = tw
-    return tw(**kwargs)
-
-def wrap(line, width, initindent='', hangindent=''):
-    maxindent = max(len(hangindent), len(initindent))
-    if width <= maxindent:
-        # adjust for weird terminal size
-        width = max(78, maxindent + 1)
-    line = line.decode(pycompat.sysstr(encoding.encoding),
-                       pycompat.sysstr(encoding.encodingmode))
-    initindent = initindent.decode(pycompat.sysstr(encoding.encoding),
-                                   pycompat.sysstr(encoding.encodingmode))
-    hangindent = hangindent.decode(pycompat.sysstr(encoding.encoding),
-                                   pycompat.sysstr(encoding.encodingmode))
-    wrapper = _MBTextWrapper(width=width,
-                             initial_indent=initindent,
-                             subsequent_indent=hangindent)
-    return wrapper.fill(line).encode(pycompat.sysstr(encoding.encoding))
-
 if (pyplatform.python_implementation() == 'CPython' and
     sys.version_info < (3, 0)):
     # There is an issue in CPython that some IO methods do not handle EINTR
@@ -3064,17 +2810,6 @@
     except socket.error:
         raise Abort(_("no port number associated with service '%s'") % port)
 
-_booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True,
-             '0': False, 'no': False, 'false': False, 'off': False,
-             'never': False}
-
-def parsebool(s):
-    """Parse s into a boolean.
-
-    If s is not a valid boolean, returns None.
-    """
-    return _booleans.get(s.lower(), None)
-
 class url(object):
     r"""Reliable URL parser.
 
@@ -4341,3 +4076,19 @@
 strdate = _deprecatedfunc(dateutil.strdate, '4.6')
 parsedate = _deprecatedfunc(dateutil.parsedate, '4.6')
 matchdate = _deprecatedfunc(dateutil.matchdate, '4.6')
+
+def _deprecatedfunc(func, version):  # TODO
+    return func
+escapedata = _deprecatedfunc(stringutil.escapedata, '4.6')
+binary = _deprecatedfunc(stringutil.binary, '4.6')
+stringmatcher = _deprecatedfunc(stringutil.stringmatcher, '4.6')
+shortuser = _deprecatedfunc(stringutil.shortuser, '4.6')
+emailuser = _deprecatedfunc(stringutil.emailuser, '4.6')
+email = _deprecatedfunc(stringutil.email, '4.6')
+ellipsis = _deprecatedfunc(stringutil.ellipsis, '4.6')
+escapestr = _deprecatedfunc(stringutil.escapestr, '4.6')
+unescapestr = _deprecatedfunc(stringutil.unescapestr, '4.6')
+forcebytestr = _deprecatedfunc(stringutil.forcebytestr, '4.6')
+uirepr = _deprecatedfunc(stringutil.uirepr, '4.6')
+wrap = _deprecatedfunc(stringutil.wrap, '4.6')
+parsebool = _deprecatedfunc(stringutil.parsebool, '4.6')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/utils/stringutil.py	Thu Mar 22 21:19:31 2018 +0900
@@ -0,0 +1,288 @@
+# stringutil.py - utility for generic string formatting, parsing, etc.
+#
+#  Copyright 2005 K. Thananchayan <thananck@yahoo.com>
+#  Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
+#  Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import codecs
+import re as remod
+import textwrap
+
+from ..i18n import _
+
+from .. import (
+    encoding,
+    error,
+    pycompat,
+)
+
+_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)}
+_DATA_ESCAPE_MAP.update({
+    b'\\': b'\\\\',
+    b'\r': br'\r',
+    b'\n': br'\n',
+})
+_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')
+
+def escapedata(s):
+    if isinstance(s, bytearray):
+        s = bytes(s)
+
+    return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s)
+
+def binary(s):
+    """return true if a string is binary data"""
+    return bool(s and '\0' in s)
+
+def stringmatcher(pattern, casesensitive=True):
+    """
+    accepts a string, possibly starting with 're:' or 'literal:' prefix.
+    returns the matcher name, pattern, and matcher function.
+    missing or unknown prefixes are treated as literal matches.
+
+    helper for tests:
+    >>> def test(pattern, *tests):
+    ...     kind, pattern, matcher = stringmatcher(pattern)
+    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
+    >>> def itest(pattern, *tests):
+    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
+    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
+
+    exact matching (no prefix):
+    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
+    ('literal', 'abcdefg', [False, False, True])
+
+    regex matching ('re:' prefix)
+    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
+    ('re', 'a.+b', [False, False, True])
+
+    force exact matches ('literal:' prefix)
+    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
+    ('literal', 're:foobar', [False, True])
+
+    unknown prefixes are ignored and treated as literals
+    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
+    ('literal', 'foo:bar', [False, False, True])
+
+    case insensitive regex matches
+    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
+    ('re', 'A.+b', [False, False, True])
+
+    case insensitive literal matches
+    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
+    ('literal', 'ABCDEFG', [False, False, True])
+    """
+    if pattern.startswith('re:'):
+        pattern = pattern[3:]
+        try:
+            flags = 0
+            if not casesensitive:
+                flags = remod.I
+            regex = remod.compile(pattern, flags)
+        except remod.error as e:
+            raise error.ParseError(_('invalid regular expression: %s')
+                                   % e)
+        return 're', pattern, regex.search
+    elif pattern.startswith('literal:'):
+        pattern = pattern[8:]
+
+    match = pattern.__eq__
+
+    if not casesensitive:
+        ipat = encoding.lower(pattern)
+        match = lambda s: ipat == encoding.lower(s)
+    return 'literal', pattern, match
+
+def shortuser(user):
+    """Return a short representation of a user name or email address."""
+    f = user.find('@')
+    if f >= 0:
+        user = user[:f]
+    f = user.find('<')
+    if f >= 0:
+        user = user[f + 1:]
+    f = user.find(' ')
+    if f >= 0:
+        user = user[:f]
+    f = user.find('.')
+    if f >= 0:
+        user = user[:f]
+    return user
+
+def emailuser(user):
+    """Return the user portion of an email address."""
+    f = user.find('@')
+    if f >= 0:
+        user = user[:f]
+    f = user.find('<')
+    if f >= 0:
+        user = user[f + 1:]
+    return user
+
+def email(author):
+    '''get email of author.'''
+    r = author.find('>')
+    if r == -1:
+        r = None
+    return author[author.find('<') + 1:r]
+
+def ellipsis(text, maxlength=400):
+    """Trim string to at most maxlength (default: 400) columns in display."""
+    return encoding.trim(text, maxlength, ellipsis='...')
+
+def escapestr(s):
+    # call underlying function of s.encode('string_escape') directly for
+    # Python 3 compatibility
+    return codecs.escape_encode(s)[0]
+
+def unescapestr(s):
+    return codecs.escape_decode(s)[0]
+
+def forcebytestr(obj):
+    """Portably format an arbitrary object (e.g. exception) into a byte
+    string."""
+    try:
+        return pycompat.bytestr(obj)
+    except UnicodeEncodeError:
+        # non-ascii string, may be lossy
+        return pycompat.bytestr(encoding.strtolocal(str(obj)))
+
+def uirepr(s):
+    # Avoid double backslash in Windows path repr()
+    return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\')
+
+# delay import of textwrap
+def _MBTextWrapper(**kwargs):
+    class tw(textwrap.TextWrapper):
+        """
+        Extend TextWrapper for width-awareness.
+
+        Neither number of 'bytes' in any encoding nor 'characters' is
+        appropriate to calculate terminal columns for specified string.
+
+        Original TextWrapper implementation uses built-in 'len()' directly,
+        so overriding is needed to use width information of each characters.
+
+        In addition, characters classified into 'ambiguous' width are
+        treated as wide in East Asian area, but as narrow in other.
+
+        This requires use decision to determine width of such characters.
+        """
+        def _cutdown(self, ucstr, space_left):
+            l = 0
+            colwidth = encoding.ucolwidth
+            for i in xrange(len(ucstr)):
+                l += colwidth(ucstr[i])
+                if space_left < l:
+                    return (ucstr[:i], ucstr[i:])
+            return ucstr, ''
+
+        # overriding of base class
+        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
+            space_left = max(width - cur_len, 1)
+
+            if self.break_long_words:
+                cut, res = self._cutdown(reversed_chunks[-1], space_left)
+                cur_line.append(cut)
+                reversed_chunks[-1] = res
+            elif not cur_line:
+                cur_line.append(reversed_chunks.pop())
+
+        # this overriding code is imported from TextWrapper of Python 2.6
+        # to calculate columns of string by 'encoding.ucolwidth()'
+        def _wrap_chunks(self, chunks):
+            colwidth = encoding.ucolwidth
+
+            lines = []
+            if self.width <= 0:
+                raise ValueError("invalid width %r (must be > 0)" % self.width)
+
+            # Arrange in reverse order so items can be efficiently popped
+            # from a stack of chucks.
+            chunks.reverse()
+
+            while chunks:
+
+                # Start the list of chunks that will make up the current line.
+                # cur_len is just the length of all the chunks in cur_line.
+                cur_line = []
+                cur_len = 0
+
+                # Figure out which static string will prefix this line.
+                if lines:
+                    indent = self.subsequent_indent
+                else:
+                    indent = self.initial_indent
+
+                # Maximum width for this line.
+                width = self.width - len(indent)
+
+                # First chunk on line is whitespace -- drop it, unless this
+                # is the very beginning of the text (i.e. no lines started yet).
+                if self.drop_whitespace and chunks[-1].strip() == r'' and lines:
+                    del chunks[-1]
+
+                while chunks:
+                    l = colwidth(chunks[-1])
+
+                    # Can at least squeeze this chunk onto the current line.
+                    if cur_len + l <= width:
+                        cur_line.append(chunks.pop())
+                        cur_len += l
+
+                    # Nope, this line is full.
+                    else:
+                        break
+
+                # The current line is full, and the next chunk is too big to
+                # fit on *any* line (not just this one).
+                if chunks and colwidth(chunks[-1]) > width:
+                    self._handle_long_word(chunks, cur_line, cur_len, width)
+
+                # If the last chunk on this line is all whitespace, drop it.
+                if (self.drop_whitespace and
+                    cur_line and cur_line[-1].strip() == r''):
+                    del cur_line[-1]
+
+                # Convert current line back to a string and store it in list
+                # of all lines (return value).
+                if cur_line:
+                    lines.append(indent + r''.join(cur_line))
+
+            return lines
+
+    global _MBTextWrapper
+    _MBTextWrapper = tw
+    return tw(**kwargs)
+
+def wrap(line, width, initindent='', hangindent=''):
+    maxindent = max(len(hangindent), len(initindent))
+    if width <= maxindent:
+        # adjust for weird terminal size
+        width = max(78, maxindent + 1)
+    line = line.decode(pycompat.sysstr(encoding.encoding),
+                       pycompat.sysstr(encoding.encodingmode))
+    initindent = initindent.decode(pycompat.sysstr(encoding.encoding),
+                                   pycompat.sysstr(encoding.encodingmode))
+    hangindent = hangindent.decode(pycompat.sysstr(encoding.encoding),
+                                   pycompat.sysstr(encoding.encodingmode))
+    wrapper = _MBTextWrapper(width=width,
+                             initial_indent=initindent,
+                             subsequent_indent=hangindent)
+    return wrapper.fill(line).encode(pycompat.sysstr(encoding.encoding))
+
+_booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True,
+             '0': False, 'no': False, 'false': False, 'off': False,
+             'never': False}
+
+def parsebool(s):
+    """Parse s into a boolean.
+
+    If s is not a valid boolean, returns None.
+    """
+    return _booleans.get(s.lower(), None)
--- a/tests/test-doctest.py	Thu Mar 22 21:32:19 2018 +0900
+++ b/tests/test-doctest.py	Thu Mar 22 21:19:31 2018 +0900
@@ -70,6 +70,7 @@
 testmod('mercurial.url')
 testmod('mercurial.util')
 testmod('mercurial.util', testtarget='platform')
+testmod('mercurial.utils.stringutil')
 testmod('hgext.convert.convcmd')
 testmod('hgext.convert.cvsps')
 testmod('hgext.convert.filemap')