mercurial/utils/stringutil.py
changeset 49575 bbbb5213d043
parent 49284 d44e3c45f0e4
child 49648 9be765b82a90
equal deleted inserted replaced
49574:2506c3ac73f4 49575:bbbb5213d043
    12 import codecs
    12 import codecs
    13 import re as remod
    13 import re as remod
    14 import textwrap
    14 import textwrap
    15 import types
    15 import types
    16 
    16 
       
    17 from typing import (
       
    18     Optional,
       
    19     overload,
       
    20 )
       
    21 
    17 from ..i18n import _
    22 from ..i18n import _
    18 from ..thirdparty import attr
    23 from ..thirdparty import attr
    19 
    24 
    20 from .. import (
    25 from .. import (
    21     encoding,
    26     encoding,
    26 # regex special chars pulled from https://bugs.python.org/issue29995
    31 # regex special chars pulled from https://bugs.python.org/issue29995
    27 # which was part of Python 3.7.
    32 # which was part of Python 3.7.
    28 _respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f')
    33 _respecial = pycompat.bytestr(b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f')
    29 _regexescapemap = {ord(i): (b'\\' + i).decode('latin1') for i in _respecial}
    34 _regexescapemap = {ord(i): (b'\\' + i).decode('latin1') for i in _respecial}
    30 regexbytesescapemap = {i: (b'\\' + i) for i in _respecial}
    35 regexbytesescapemap = {i: (b'\\' + i) for i in _respecial}
       
    36 
       
    37 
       
    38 @overload
       
    39 def reescape(pat: bytes) -> bytes:
       
    40     ...
       
    41 
       
    42 
       
    43 @overload
       
    44 def reescape(pat: str) -> str:
       
    45     ...
    31 
    46 
    32 
    47 
    33 def reescape(pat):
    48 def reescape(pat):
    34     """Drop-in replacement for re.escape."""
    49     """Drop-in replacement for re.escape."""
    35     # NOTE: it is intentional that this works on unicodes and not
    50     # NOTE: it is intentional that this works on unicodes and not
    43     if wantuni:
    58     if wantuni:
    44         return pat
    59         return pat
    45     return pat.encode('latin1')
    60     return pat.encode('latin1')
    46 
    61 
    47 
    62 
    48 def pprint(o, bprefix=False, indent=0, level=0):
    63 def pprint(o, bprefix: bool = False, indent: int = 0, level: int = 0) -> bytes:
    49     """Pretty print an object."""
    64     """Pretty print an object."""
    50     return b''.join(pprintgen(o, bprefix=bprefix, indent=indent, level=level))
    65     return b''.join(pprintgen(o, bprefix=bprefix, indent=indent, level=level))
    51 
    66 
    52 
    67 
    53 def pprintgen(o, bprefix=False, indent=0, level=0):
    68 def pprintgen(o, bprefix: bool = False, indent: int = 0, level: int = 0):
    54     """Pretty print an object to a generator of atoms.
    69     """Pretty print an object to a generator of atoms.
    55 
    70 
    56     ``bprefix`` is a flag influencing whether bytestrings are preferred with
    71     ``bprefix`` is a flag influencing whether bytestrings are preferred with
    57     a ``b''`` prefix.
    72     a ``b''`` prefix.
    58 
    73 
   248         yield b']'
   263         yield b']'
   249     else:
   264     else:
   250         yield pycompat.byterepr(o)
   265         yield pycompat.byterepr(o)
   251 
   266 
   252 
   267 
   253 def prettyrepr(o):
   268 def prettyrepr(o) -> bytes:
   254     """Pretty print a representation of a possibly-nested object"""
   269     """Pretty print a representation of a possibly-nested object"""
   255     lines = []
   270     lines = []
   256     rs = pycompat.byterepr(o)
   271     rs = pycompat.byterepr(o)
   257     p0 = p1 = 0
   272     p0 = p1 = 0
   258     while p0 < len(rs):
   273     while p0 < len(rs):
   279         lines.append((l, rs[p0:q0].rstrip()))
   294         lines.append((l, rs[p0:q0].rstrip()))
   280         p0, p1 = q0, q1
   295         p0, p1 = q0, q1
   281     return b'\n'.join(b'  ' * l + s for l, s in lines)
   296     return b'\n'.join(b'  ' * l + s for l, s in lines)
   282 
   297 
   283 
   298 
   284 def buildrepr(r):
   299 def buildrepr(r) -> bytes:
   285     """Format an optional printable representation from unexpanded bits
   300     """Format an optional printable representation from unexpanded bits
   286 
   301 
   287     ========  =================================
   302     ========  =================================
   288     type(r)   example
   303     type(r)   example
   289     ========  =================================
   304     ========  =================================
   303         return r()
   318         return r()
   304     else:
   319     else:
   305         return pprint(r)
   320         return pprint(r)
   306 
   321 
   307 
   322 
   308 def binary(s):
   323 def binary(s: bytes) -> bool:
   309     """return true if a string is binary data"""
   324     """return true if a string is binary data"""
   310     return bool(s and b'\0' in s)
   325     return bool(s and b'\0' in s)
   311 
   326 
   312 
   327 
   313 def _splitpattern(pattern):
   328 def _splitpattern(pattern: bytes):
   314     if pattern.startswith(b're:'):
   329     if pattern.startswith(b're:'):
   315         return b're', pattern[3:]
   330         return b're', pattern[3:]
   316     elif pattern.startswith(b'literal:'):
   331     elif pattern.startswith(b'literal:'):
   317         return b'literal', pattern[8:]
   332         return b'literal', pattern[8:]
   318     return b'literal', pattern
   333     return b'literal', pattern
   319 
   334 
   320 
   335 
   321 def stringmatcher(pattern, casesensitive=True):
   336 def stringmatcher(pattern: bytes, casesensitive: bool = True):
   322     """
   337     """
   323     accepts a string, possibly starting with 're:' or 'literal:' prefix.
   338     accepts a string, possibly starting with 're:' or 'literal:' prefix.
   324     returns the matcher name, pattern, and matcher function.
   339     returns the matcher name, pattern, and matcher function.
   325     missing or unknown prefixes are treated as literal matches.
   340     missing or unknown prefixes are treated as literal matches.
   326 
   341 
   377         return kind, pattern, match
   392         return kind, pattern, match
   378 
   393 
   379     raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind)
   394     raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind)
   380 
   395 
   381 
   396 
   382 def substringregexp(pattern, flags=0):
   397 def substringregexp(pattern: bytes, flags: int = 0):
   383     """Build a regexp object from a string pattern possibly starting with
   398     """Build a regexp object from a string pattern possibly starting with
   384     're:' or 'literal:' prefix.
   399     're:' or 'literal:' prefix.
   385 
   400 
   386     helper for tests:
   401     helper for tests:
   387     >>> def test(pattern, *tests):
   402     >>> def test(pattern, *tests):
   429         return remod.compile(remod.escape(pattern), flags)
   444         return remod.compile(remod.escape(pattern), flags)
   430 
   445 
   431     raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind)
   446     raise error.ProgrammingError(b'unhandled pattern kind: %s' % kind)
   432 
   447 
   433 
   448 
   434 def shortuser(user):
   449 def shortuser(user: bytes) -> bytes:
   435     """Return a short representation of a user name or email address."""
   450     """Return a short representation of a user name or email address."""
   436     f = user.find(b'@')
   451     f = user.find(b'@')
   437     if f >= 0:
   452     if f >= 0:
   438         user = user[:f]
   453         user = user[:f]
   439     f = user.find(b'<')
   454     f = user.find(b'<')
   446     if f >= 0:
   461     if f >= 0:
   447         user = user[:f]
   462         user = user[:f]
   448     return user
   463     return user
   449 
   464 
   450 
   465 
   451 def emailuser(user):
   466 def emailuser(user: bytes) -> bytes:
   452     """Return the user portion of an email address."""
   467     """Return the user portion of an email address."""
   453     f = user.find(b'@')
   468     f = user.find(b'@')
   454     if f >= 0:
   469     if f >= 0:
   455         user = user[:f]
   470         user = user[:f]
   456     f = user.find(b'<')
   471     f = user.find(b'<')
   457     if f >= 0:
   472     if f >= 0:
   458         user = user[f + 1 :]
   473         user = user[f + 1 :]
   459     return user
   474     return user
   460 
   475 
   461 
   476 
   462 def email(author):
   477 def email(author: bytes) -> bytes:
   463     '''get email of author.'''
   478     '''get email of author.'''
   464     r = author.find(b'>')
   479     r = author.find(b'>')
   465     if r == -1:
   480     if r == -1:
   466         r = None
   481         r = None
   467     return author[author.find(b'<') + 1 : r]
   482     return author[author.find(b'<') + 1 : r]
   468 
   483 
   469 
   484 
   470 def person(author):
   485 def person(author: bytes) -> bytes:
   471     """Returns the name before an email address,
   486     """Returns the name before an email address,
   472     interpreting it as per RFC 5322
   487     interpreting it as per RFC 5322
   473 
   488 
   474     >>> person(b'foo@bar')
   489     >>> person(b'foo@bar')
   475     'foo'
   490     'foo'
   610         )
   625         )
   611 
   626 
   612     return mailmap
   627     return mailmap
   613 
   628 
   614 
   629 
   615 def mapname(mailmap, author):
   630 def mapname(mailmap, author: bytes) -> bytes:
   616     """Returns the author field according to the mailmap cache, or
   631     """Returns the author field according to the mailmap cache, or
   617     the original author field.
   632     the original author field.
   618 
   633 
   619     >>> mmdata = b"\\n".join([
   634     >>> mmdata = b"\\n".join([
   620     ...     b'# Comment',
   635     ...     b'# Comment',
   661 
   676 
   662 
   677 
   663 _correctauthorformat = remod.compile(br'^[^<]+\s<[^<>]+@[^<>]+>$')
   678 _correctauthorformat = remod.compile(br'^[^<]+\s<[^<>]+@[^<>]+>$')
   664 
   679 
   665 
   680 
   666 def isauthorwellformed(author):
   681 def isauthorwellformed(author: bytes) -> bool:
   667     """Return True if the author field is well formed
   682     """Return True if the author field is well formed
   668     (ie "Contributor Name <contrib@email.dom>")
   683     (ie "Contributor Name <contrib@email.dom>")
   669 
   684 
   670     >>> isauthorwellformed(b'Good Author <good@author.com>')
   685     >>> isauthorwellformed(b'Good Author <good@author.com>')
   671     True
   686     True
   683     False
   698     False
   684     """
   699     """
   685     return _correctauthorformat.match(author) is not None
   700     return _correctauthorformat.match(author) is not None
   686 
   701 
   687 
   702 
   688 def firstline(text):
   703 def firstline(text: bytes) -> bytes:
   689     """Return the first line of the input"""
   704     """Return the first line of the input"""
   690     # Try to avoid running splitlines() on the whole string
   705     # Try to avoid running splitlines() on the whole string
   691     i = text.find(b'\n')
   706     i = text.find(b'\n')
   692     if i != -1:
   707     if i != -1:
   693         text = text[:i]
   708         text = text[:i]
   695         return text.splitlines()[0]
   710         return text.splitlines()[0]
   696     except IndexError:
   711     except IndexError:
   697         return b''
   712         return b''
   698 
   713 
   699 
   714 
   700 def ellipsis(text, maxlength=400):
   715 def ellipsis(text: bytes, maxlength: int = 400) -> bytes:
   701     """Trim string to at most maxlength (default: 400) columns in display."""
   716     """Trim string to at most maxlength (default: 400) columns in display."""
   702     return encoding.trim(text, maxlength, ellipsis=b'...')
   717     return encoding.trim(text, maxlength, ellipsis=b'...')
   703 
   718 
   704 
   719 
   705 def escapestr(s):
   720 def escapestr(s: bytes) -> bytes:
       
   721     # "bytes" is also a typing shortcut for bytes, bytearray, and memoryview
   706     if isinstance(s, memoryview):
   722     if isinstance(s, memoryview):
   707         s = bytes(s)
   723         s = bytes(s)
   708     # call underlying function of s.encode('string_escape') directly for
   724     # call underlying function of s.encode('string_escape') directly for
   709     # Python 3 compatibility
   725     # Python 3 compatibility
   710     return codecs.escape_encode(s)[0]  # pytype: disable=module-attr
   726     return codecs.escape_encode(s)[0]  # pytype: disable=module-attr
   711 
   727 
   712 
   728 
   713 def unescapestr(s):
   729 def unescapestr(s: bytes) -> bytes:
   714     return codecs.escape_decode(s)[0]  # pytype: disable=module-attr
   730     return codecs.escape_decode(s)[0]  # pytype: disable=module-attr
   715 
   731 
   716 
   732 
   717 def forcebytestr(obj):
   733 def forcebytestr(obj):
   718     """Portably format an arbitrary object (e.g. exception) into a byte
   734     """Portably format an arbitrary object (e.g. exception) into a byte
   722     except UnicodeEncodeError:
   738     except UnicodeEncodeError:
   723         # non-ascii string, may be lossy
   739         # non-ascii string, may be lossy
   724         return pycompat.bytestr(encoding.strtolocal(str(obj)))
   740         return pycompat.bytestr(encoding.strtolocal(str(obj)))
   725 
   741 
   726 
   742 
   727 def uirepr(s):
   743 def uirepr(s: bytes) -> bytes:
   728     # Avoid double backslash in Windows path repr()
   744     # Avoid double backslash in Windows path repr()
   729     return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\')
   745     return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\')
   730 
   746 
   731 
   747 
   732 # delay import of textwrap
   748 # delay import of textwrap
   836     global _MBTextWrapper
   852     global _MBTextWrapper
   837     _MBTextWrapper = tw
   853     _MBTextWrapper = tw
   838     return tw(**kwargs)
   854     return tw(**kwargs)
   839 
   855 
   840 
   856 
   841 def wrap(line, width, initindent=b'', hangindent=b''):
   857 def wrap(
       
   858     line: bytes, width: int, initindent: bytes = b'', hangindent: bytes = b''
       
   859 ) -> bytes:
   842     maxindent = max(len(hangindent), len(initindent))
   860     maxindent = max(len(hangindent), len(initindent))
   843     if width <= maxindent:
   861     if width <= maxindent:
   844         # adjust for weird terminal size
   862         # adjust for weird terminal size
   845         width = max(78, maxindent + 1)
   863         width = max(78, maxindent + 1)
   846     line = line.decode(
   864     line = line.decode(
   873     b'off': False,
   891     b'off': False,
   874     b'never': False,
   892     b'never': False,
   875 }
   893 }
   876 
   894 
   877 
   895 
   878 def parsebool(s):
   896 def parsebool(s: bytes) -> Optional[bool]:
   879     """Parse s into a boolean.
   897     """Parse s into a boolean.
   880 
   898 
   881     If s is not a valid boolean, returns None.
   899     If s is not a valid boolean, returns None.
   882     """
   900     """
   883     return _booleans.get(s.lower(), None)
   901     return _booleans.get(s.lower(), None)
   884 
   902 
   885 
   903 
   886 def parselist(value):
   904 # TODO: make arg mandatory (and fix code below?)
       
   905 def parselist(value: Optional[bytes]):
   887     """parse a configuration value as a list of comma/space separated strings
   906     """parse a configuration value as a list of comma/space separated strings
   888 
   907 
   889     >>> parselist(b'this,is "a small" ,test')
   908     >>> parselist(b'this,is "a small" ,test')
   890     ['this', 'is', 'a small', 'test']
   909     ['this', 'is', 'a small', 'test']
   891     """
   910     """
   971     else:
   990     else:
   972         result = value
   991         result = value
   973     return result or []
   992     return result or []
   974 
   993 
   975 
   994 
   976 def evalpythonliteral(s):
   995 def evalpythonliteral(s: bytes):
   977     """Evaluate a string containing a Python literal expression"""
   996     """Evaluate a string containing a Python literal expression"""
   978     # We could backport our tokenizer hack to rewrite '' to u'' if we want
   997     # We could backport our tokenizer hack to rewrite '' to u'' if we want
   979     return ast.literal_eval(s.decode('latin1'))
   998     return ast.literal_eval(s.decode('latin1'))