# HG changeset patch
# User Jun Wu <quark@fb.com>
# Date 1521458910 25200
# Node ID 5471348921c1e569c15019105ec82f15c522c22d
# Parent  8d730f96e792f52f8dee5f02d80ff6b2e590371b
patch: buffer lines for a same hunk

Instead of yielding tokens directly, buffer them if they belong to a same
hunk. This makes it easier for the upcoming new worddiff algorithm to only
focus on the diff hunk, instead of having to worry about other contents.

This breaks how the existing experimental worddiff algorithm works, so the
algorithm was removed, and related tests are disabled for now. The next patch
will add a new worddiff algorithm.

Differential Revision: https://phab.mercurial-scm.org/D3211

diff -r 8d730f96e792 -r 5471348921c1 mercurial/patch.py
--- a/mercurial/patch.py	Mon Mar 19 04:28:29 2018 -0700
+++ b/mercurial/patch.py	Mon Mar 19 04:28:30 2018 -0700
@@ -11,7 +11,6 @@
 import collections
 import contextlib
 import copy
-import difflib
 import email
 import errno
 import hashlib
@@ -2481,11 +2480,32 @@
     else:
         return difffn(opts, None)
 
+def diffsinglehunk(hunklines):
+    """yield tokens for a list of lines in a single hunk"""
+    for line in hunklines:
+        # chomp
+        chompline = line.rstrip('\n')
+        # highlight tabs and trailing whitespace
+        stripline = chompline.rstrip()
+        if line[0] == '-':
+            label = 'diff.deleted'
+        elif line[0] == '+':
+            label = 'diff.inserted'
+        else:
+            raise error.ProgrammingError('unexpected hunk line: %s' % line)
+        for token in tabsplitter.findall(stripline):
+            if '\t' == token[0]:
+                yield (token, 'diff.tab')
+            else:
+                yield (token, label)
+
+        if chompline != stripline:
+            yield (chompline[len(stripline):], 'diff.trailingwhitespace')
+        if chompline != line:
+            yield (line[len(chompline):], '')
+
 def difflabel(func, *args, **kw):
     '''yields 2-tuples of (output, label) based on the output of func()'''
-    inlinecolor = False
-    if kw.get(r'opts'):
-        inlinecolor = kw[r'opts'].worddiff
     headprefixes = [('diff', 'diff.diffline'),
                     ('copy', 'diff.extended'),
                     ('rename', 'diff.extended'),
@@ -2497,14 +2517,20 @@
                     ('---', 'diff.file_a'),
                     ('+++', 'diff.file_b')]
     textprefixes = [('@', 'diff.hunk'),
-                    ('-', 'diff.deleted'),
-                    ('+', 'diff.inserted')]
+                    # - and + are handled by diffsinglehunk
+                   ]
     head = False
+
+    # buffers a hunk, i.e. adjacent "-", "+" lines without other changes.
+    hunkbuffer = []
+    def consumehunkbuffer():
+        if hunkbuffer:
+            for token in diffsinglehunk(hunkbuffer):
+                yield token
+            hunkbuffer[:] = []
+
     for chunk in func(*args, **kw):
         lines = chunk.split('\n')
-        matches = {}
-        if inlinecolor:
-            matches = _findmatches(lines)
         linecount = len(lines)
         for i, line in enumerate(lines):
             if head:
@@ -2513,109 +2539,37 @@
             else:
                 if line and not line.startswith((' ', '+', '-', '@', '\\')):
                     head = True
-            stripline = line
             diffline = False
             if not head and line and line.startswith(('+', '-')):
-                # highlight tabs and trailing whitespace, but only in
-                # changed lines
-                stripline = line.rstrip()
                 diffline = True
 
             prefixes = textprefixes
             if head:
                 prefixes = headprefixes
-            for prefix, label in prefixes:
-                if stripline.startswith(prefix):
-                    if diffline:
-                        if i in matches:
-                            for t, l in _inlinediff(lines[i].rstrip(),
-                                                    lines[matches[i]].rstrip(),
-                                                    label):
-                                yield (t, l)
-                        else:
-                            for token in tabsplitter.findall(stripline):
-                                if token.startswith('\t'):
-                                    yield (token, 'diff.tab')
-                                else:
-                                    yield (token, label)
-                    else:
-                        yield (stripline, label)
-                    break
+            if diffline:
+                # buffered
+                bufferedline = line
+                if i + 1 < linecount:
+                    bufferedline += "\n"
+                hunkbuffer.append(bufferedline)
             else:
-                yield (line, '')
-            if line != stripline:
-                yield (line[len(stripline):], 'diff.trailingwhitespace')
-            if i + 1 < linecount:
-                yield ('\n', '')
-
-def _findmatches(slist):
-    '''Look for insertion matches to deletion and returns a dict of
-    correspondences.
-    '''
-    lastmatch = 0
-    matches = {}
-    for i, line in enumerate(slist):
-        if line == '':
-            continue
-        if line.startswith('-'):
-            lastmatch = max(lastmatch, i)
-            newgroup = False
-            for j, newline in enumerate(slist[lastmatch + 1:]):
-                if newline == '':
-                    continue
-                if newline.startswith('-') and newgroup: # too far, no match
-                    break
-                if newline.startswith('+'): # potential match
-                    newgroup = True
-                    sim = difflib.SequenceMatcher(None, line, newline).ratio()
-                    if sim > 0.7:
-                        lastmatch = lastmatch + 1 + j
-                        matches[i] = lastmatch
-                        matches[lastmatch] = i
+                # unbuffered
+                for token in consumehunkbuffer():
+                    yield token
+                stripline = line.rstrip()
+                for prefix, label in prefixes:
+                    if stripline.startswith(prefix):
+                        yield (stripline, label)
+                        if line != stripline:
+                            yield (line[len(stripline):],
+                                   'diff.trailingwhitespace')
                         break
-    return matches
-
-def _inlinediff(s1, s2, operation):
-    '''Perform string diff to highlight specific changes.'''
-    operation_skip = ('+', '?') if operation == 'diff.deleted' else ('-', '?')
-    if operation == 'diff.deleted':
-        s2, s1 = s1, s2
-
-    buff = []
-    # we never want to higlight the leading +-
-    if operation == 'diff.deleted' and s2.startswith('-'):
-        label = operation
-        token = '-'
-        s2 = s2[1:]
-        s1 = s1[1:]
-    elif operation == 'diff.inserted' and s1.startswith('+'):
-        label = operation
-        token = '+'
-        s2 = s2[1:]
-        s1 = s1[1:]
-    else:
-        raise error.ProgrammingError("Case not expected, operation = %s" %
-                                     operation)
-
-    s = difflib.ndiff(_nonwordre.split(s2), _nonwordre.split(s1))
-    for part in s:
-        if part.startswith(operation_skip) or len(part) == 2:
-            continue
-        l = operation + '.highlight'
-        if part.startswith(' '):
-            l = operation
-        if part[2:] == '\t':
-            l = 'diff.tab'
-        if l == label: # contiguous token with same label
-            token += part[2:]
-            continue
-        else:
-            buff.append((token, label))
-            label = l
-            token = part[2:]
-    buff.append((token, label))
-
-    return buff
+                else:
+                    yield (line, '')
+                if i + 1 < linecount:
+                    yield ('\n', '')
+        for token in consumehunkbuffer():
+            yield token
 
 def diffui(*args, **kw):
     '''like diff(), but yields 2-tuples of (output, label) for ui.write()'''
diff -r 8d730f96e792 -r 5471348921c1 tests/test-diff-color.t
--- a/tests/test-diff-color.t	Mon Mar 19 04:28:29 2018 -0700
+++ b/tests/test-diff-color.t	Mon Mar 19 04:28:30 2018 -0700
@@ -337,6 +337,7 @@
   [diff.deleted|-(to see if it works)]
   [diff.inserted|+three of those lines have]
   [diff.inserted|+collapsed onto one]
+#if false
   $ hg diff --config experimental.worddiff=True --color=debug
   [diff.diffline|diff --git a/file1 b/file1]
   [diff.file_a|--- a/file1]
@@ -370,6 +371,7 @@
   [diff.deleted|-(to see if it works)]
   [diff.inserted|+three of those lines ][diff.inserted.highlight|have]
   [diff.inserted|+][diff.inserted.highlight|collapsed][diff.inserted| onto one]
+#endif
 
 multibyte character shouldn't be broken up in word diff:
 
@@ -383,6 +385,8 @@
   >     f.write(b"blah \xe3\x82\xa4 blah\n")
   > EOF
   $ hg ci -m 'slightly change utf8 char' utf8
+
+#if false
   $ hg diff --config experimental.worddiff=True --color=debug -c.
   [diff.diffline|diff --git a/utf8 b/utf8]
   [diff.file_a|--- a/utf8]
@@ -390,3 +394,4 @@
   [diff.hunk|@@ -1,1 +1,1 @@]
   [diff.deleted|-blah ][diff.deleted.highlight|\xe3\x82\xa2][diff.deleted| blah] (esc)
   [diff.inserted|+blah ][diff.inserted.highlight|\xe3\x82\xa4][diff.inserted| blah] (esc)
+#endif