contrib/byteify-strings.py
changeset 38390:47dd23e6b116
parent    38389:1d68fd5f614a
child     38391:f77bbd34a1df
--- a/contrib/byteify-strings.py
+++ b/contrib/byteify-strings.py
@@ -15,10 +15,15 @@
 import os
 import sys
 import tempfile
 import token
 import tokenize
+
+def adjusttokenpos(t, ofs):
+    """Adjust start/end column of the given token"""
+    return t._replace(start=(t.start[0], t.start[1] + ofs),
+                      end=(t.end[0], t.end[1] + ofs))
 
 if True:
     def replacetokens(tokens, opts):
         """Transform a stream of tokens from raw to Python 3.
 
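The new adjusttokenpos helper relies on tokenize.TokenInfo being a namedtuple, so _replace returns a shifted copy without mutating the token stream. A minimal standalone sketch of that behavior (not part of the changeset):

import io
import token
import tokenize

# TokenInfo is a namedtuple; _replace yields a copy with shifted columns.
src = io.BytesIO(b"x = 'foo'\n").readline
for t in tokenize.tokenize(src):
    if t.type == token.STRING:
        shifted = t._replace(start=(t.start[0], t.start[1] + 1),
                             end=(t.end[0], t.end[1] + 1))
        print(t.start, '->', shifted.start)  # (1, 4) -> (1, 5)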
@@ -76,11 +81,39 @@
             """
             st = tokens[j]
             if st.type == token.STRING and st.string.startswith(("'", '"')):
                 sysstrtokens.add(st)
 
+        coldelta = 0  # column increment for new opening parens
+        coloffset = -1  # column offset for the current line (-1: TBD)
+        parens = [(0, 0, 0)]  # stack of (line, end-column, column-offset)
         for i, t in enumerate(tokens):
+            # Compute the column offset for the current line, such that
+            # the current line will be aligned to the last opening paren
+            # as before.
+            if coloffset < 0:
+                if t.start[1] == parens[-1][1]:
+                    coloffset = parens[-1][2]
+                elif t.start[1] + 1 == parens[-1][1]:
+                    # fix misaligned indent of s/util.Abort/error.Abort/
+                    coloffset = parens[-1][2] + (parens[-1][1] - t.start[1])
+                else:
+                    coloffset = 0
+
+            # Reset per-line attributes at EOL.
+            if t.type in (token.NEWLINE, tokenize.NL):
+                yield adjusttokenpos(t, coloffset)
+                coldelta = 0
+                coloffset = -1
+                continue
+
+            # Remember the last paren position.
+            if _isop(i, '(', '[', '{'):
+                parens.append(t.end + (coloffset + coldelta,))
+            elif _isop(i, ')', ']', '}'):
+                parens.pop()
+
             # Convert most string literals to byte literals. String literals
             # in Python 2 are bytes. String literals in Python 3 are unicode.
             # Most strings in Mercurial are bytes and unicode strings are rare.
             # Rather than rewrite all string literals to use ``b''`` to indicate
             # byte strings, we apply this token transformer to insert the ``b``
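The parens stack and the coloffset/coldelta bookkeeping exist because tokenize.untokenize reproduces a continuation line's indentation from each token's recorded start column. A small experiment (not from the script) showing the misalignment this hunk repairs when positions are left stale:

import io
import token
import tokenize

# b-prefix every string literal but keep the stale token positions.
src = b"x = ['a', ('b',\n           'c')]\n"
toks = []
for t in tokenize.tokenize(io.BytesIO(src).readline):
    if t.type == token.STRING:
        t = t._replace(string='b%s' % t.string)  # one char longer
    toks.append(t)
print(tokenize.untokenize(toks).decode('ascii'))
# x = [b'a', (b'b',
#            b'c')]   <- b'c' lands one column left of b'b'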
@@ -95,21 +128,23 @@
                 # is b''' prefixed, leading to a SyntaxError. We leave all
                 # docstrings as unprefixed to avoid this. This means Mercurial
                 # components touching docstrings need to handle unicode,
                 # unfortunately.
                 if s[0:3] in ("'''", '"""'):
-                    yield t
+                    yield adjusttokenpos(t, coloffset)
                     continue
 
                 # If the first character isn't a quote, it is likely a string
                 # prefixing character (such as 'b', 'u', or 'r'). Ignore.
                 if s[0] not in ("'", '"'):
-                    yield t
+                    yield adjusttokenpos(t, coloffset)
                     continue
 
                 # String literal. Prefix to make a b'' string.
-                yield t._replace(string='b%s' % t.string)
+                yield adjusttokenpos(t._replace(string='b%s' % t.string),
+                                     coloffset)
+                coldelta += 1
                 continue
 
             # This looks like a function call.
             if t.type == token.NAME and _isop(i + 1, '('):
                 fn = t.string
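The three branches above, restated as a tiny standalone check (byteify_literal is a hypothetical name for illustration, not the script's API): triple-quoted docstrings and already-prefixed literals pass through untouched, while bare literals gain the b prefix.

def byteify_literal(s):
    if s[0:3] in ("'''", '"""'):  # docstring: leave unprefixed
        return s
    if s[0] not in ("'", '"'):    # already b/u/r-prefixed: leave alone
        return s
    return 'b%s' % s              # bare literal: insert the prefix

for lit in ['"""doc"""', "r'raw'", "'plain'"]:
    print(byteify_literal(lit))   # """doc""", r'raw', b'plain'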
@@ -130,15 +165,15 @@
                             _ensuresysstr(argidx)
 
                 # It changes iteritems/values to items/values as they are not
                 # present in the Python 3 world.
                 elif opts['dictiter'] and fn in ('iteritems', 'itervalues'):
-                    yield t._replace(string=fn[4:])
+                    yield adjusttokenpos(t._replace(string=fn[4:]), coloffset)
                     continue
 
             # Emit unmodified token.
-            yield t
+            yield adjusttokenpos(t, coloffset)
 
 def process(fin, fout, opts):
     tokens = tokenize.tokenize(fin.readline)
     tokens = replacetokens(list(tokens), opts)
     fout.write(tokenize.untokenize(tokens))
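process() drives the whole transform: tokenize the raw bytes from fin, rewrite the token stream, and untokenize into fout. A hedged usage sketch, assuming the script's --dictiter option is the only opts key needed for this input:

import io

fin = io.BytesIO(b"for k, v in d.iteritems():\n    pass\n")
fout = io.BytesIO()
process(fin, fout, {'dictiter': True})
print(fout.getvalue().decode('utf-8'))
# for k, v in d.items():
#     pass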