contrib/hgdiff
author Matt Mackall <mpm@selenic.com>
Tue, 24 Jan 2006 14:49:19 +1300
changeset 1636 7da32bb3d1d3
child 1644 e7e6504c4989
permissions -rwxr-xr-x
contrib: add Chris Mason's stand-alone diff tool. This uses Mercurial's diff algorithm to generate unidiffs like the traditional diff tool.

#!/usr/bin/env python

import os, sys, struct, stat
import difflib
import re
from optparse import OptionParser
from mercurial.bdiff import bdiff, blocks

VERSION = "0.2"
usage = "usage: %prog [options] file1 file2"
parser = OptionParser(usage=usage)

# -d: produce the diff with difflib.unified_diff instead of the bdiff-based
#     bunidiff(); on that path -c, -p and -w are not passed along.
parser.add_option("-d", "--difflib", action="store_true", default=False)
# -x: accepted, but not referenced by any code visible in this file.
parser.add_option('-x', '--count', default=1)
# -c: number of context lines handed to bunidiff().
parser.add_option('-c', '--context', type="int", default=3)
# -p: annotate each hunk header with the nearest preceding line that starts
#     with a word character (diff -p style).
parser.add_option('-p', '--show-c-function', action="store_true", default=False)
# -w: drop changes that only differ in spaces and tabs.
parser.add_option('-w', '--ignore-all-space', action="store_true",
                  default=False)

(options, args) = parser.parse_args()

# Both operands are required: the script indexes args[0] and args[1] further
# down, so a single operand used to end in an IndexError traceback instead
# of the usage message.
if len(args) < 2:
    parser.print_help()
    sys.exit(1)

def bunidiff(t1, t2, l1, l2, header1, header2, context=3, showfunc=False,
             ignorews=False):
    """Generate unified diff lines from bdiff's matching blocks.

    A somewhat self-contained replacement for difflib.unified_diff.

    t1, t2           -- the two texts to be diffed, handed to blocks()
    l1, l2           -- the same texts broken up into lines; lines are
                        emitted as-is, so they should keep their line endings
    header1, header2 -- the filenames for the "---" / "+++" header lines
    context          -- the number of context lines around each change
    showfunc         -- enables "diff -p" style output: the hunk header is
                        followed by the closest preceding line that starts
                        with a word character
    ignorews         -- drop changes that are identical once spaces and
                        tabs are removed

    This is a generator.  The two file header lines are yielded once, right
    before the first hunk, so nothing at all is produced when no hunk is.
    """
    def contextend(l, length):
        # end (exclusive) of the trailing context, clamped to the file length
        return min(l + context, length)

    def contextstart(l):
        # first line of the leading context, clamped to the start of the file
        return max(l - context, 0)

    def yieldhunk(hunk, header):
        if header:
            for x in header:
                yield x
        (astart, a2, bstart, b2, delta) = hunk
        aend = contextend(a2, len(l1))
        alen = aend - astart
        # the new side gets the same number of trailing context lines
        blen = b2 - bstart + aend - a2

        func = ""
        if showfunc:
            # walk backwards from the start of the context to find a line
            # starting with a word character.  astart can equal len(l1)
            # (empty old file, or no context and a change at EOF), so clamp
            # the starting index to stay inside l1.
            for x in range(min(astart, len(l1) - 1), -1, -1):
                t = l1[x]
                if funcre.match(t):
                    # strip the line ending: short lines would otherwise put
                    # a stray newline in the middle of the hunk header
                    func = ' ' + t.rstrip()[:40]
                    break

        # Line numbers are 1-based, except that an empty range is reported
        # at the line just before it (0 at the start of the file).
        if alen:
            astart += 1
        if blen:
            bstart += 1
        yield "@@ -%d,%d +%d,%d @@%s\n" % (astart, alen, bstart, blen, func)
        for x in delta:
            yield x
        for x in range(a2, aend):
            yield ' ' + l1[x]

    header = ["--- %s\t\n" % header1, "+++ %s\t\n" % header2]

    funcre = re.compile(r'\w')
    wsre = re.compile('[ \t]')

    # bdiff.blocks gives us the matching sequences in the files.  The loop
    # below finds the spaces between those matching sequences and translates
    # them into diff output.
    #
    diff = blocks(t1, t2)
    hunk = None
    # The first match is special.
    # we've either found a match starting at line 0 or a match later
    # in the file.  If it starts at line 0, old and new below will both be
    # empty and we'll continue to the next match.
    s = [0, 0, 0, 0]
    for s1 in diff:
        # the gap between the end of the previous match (s) and the start
        # of this one (s1) is the changed region
        a1, b1 = s[1], s[3]
        a2, b2 = s1[0], s1[2]
        s = s1
        old = l1[a1:a2]
        new = l2[b1:b2]

        # bdiff sometimes gives huge matches past eof, this check eats them,
        # and deals with the special first match case described above
        if not old and not new:
            continue

        if ignorews:
            wsold = wsre.sub('', "".join(old))
            wsnew = wsre.sub('', "".join(new))
            if wsold == wsnew:
                continue

        astart = contextstart(a1)
        bstart = contextstart(b1)
        prev = None
        if hunk:
            # join with the previous hunk if it falls inside the context
            if astart < hunk[1] + context + 1:
                prev = hunk
                astart = hunk[1]
                bstart = hunk[3]
            else:
                for x in yieldhunk(hunk, header):
                    yield x
                # we only want to yield the header if the files differ, and
                # we only want to yield it once.
                header = None
        if prev:
            # we've joined the previous hunk, record the new ending points.
            hunk[1] = a2
            hunk[3] = b2
            delta = hunk[4]
        else:
            # create a new hunk
            delta = []
            hunk = [astart, a2, bstart, b2, delta]

        # leading context (or the lines between two joined changes), then
        # the removed and the added lines
        delta.extend([' ' + x for x in l1[astart:a1]])
        delta.extend(['-' + x for x in old])
        delta.extend(['+' + x for x in new])

    if hunk:
        for x in yieldhunk(hunk, header):
            yield x

# simple utility function to put all the
# files from a directory tree into a dict
def buildlist(names, top):
    """Record every regular file found under top in the dict names.

    Each key is the file's path relative to top and each value is the
    (st_dev, st_ino) pair from os.lstat().  Entries that lstat() does not
    report as regular files (symlinks, devices, ...) are left out.
    """
    # Strip "top plus one separator" from every directory os.walk reports.
    # top may already end in a separator ("dir/", "/"); always skipping one
    # extra character would then eat the first letter of the relative path.
    prefix = top
    if not prefix.endswith(os.sep) and not (os.altsep and
                                            prefix.endswith(os.altsep)):
        prefix += os.sep
    tlen = len(prefix)
    for root, dirs, files in os.walk(top):
        l = root[tlen:]
        for x in files:
            p = os.path.join(root, x)
            st = os.lstat(p)
            if stat.S_ISREG(st.st_mode):
                names[os.path.join(l, x)] = (st.st_dev, st.st_ino)

def diff_files(file1, file2):
    """Write a unified diff of file1 against file2 to standard output.

    When file1 is None every line of file2 is shown as added; when file2 is
    None every line of file1 is shown as removed.  Otherwise both files are
    read and compared, with difflib.unified_diff if options.difflib is set
    and with bunidiff() (using the context, show_c_function and
    ignore_all_space options) if it is not.

    Reads the module-level options object.
    """
    def readfile(path):
        # close the handle explicitly instead of waiting for it to be
        # garbage collected
        fp = open(path)
        try:
            return fp.read()
        finally:
            fp.close()

    if file1 is None:
        b = readfile(file2).splitlines(True)
        l1 = "--- %s\n" % (file2)
        l2 = "+++ %s\n" % (file2)
        l3 = "@@ -0,0 +1,%d @@\n" % len(b)
        l = [l1, l2, l3] + ["+" + e for e in b]
    elif file2 is None:
        a = readfile(file1).splitlines(True)
        l1 = "--- %s\n" % (file1)
        l2 = "+++ %s\n" % (file1)
        l3 = "@@ -1,%d +0,0 @@\n" % len(a)
        l = [l1, l2, l3] + ["-" + e for e in a]
    else:
        t1 = readfile(file1)
        t2 = readfile(file2)
        l1 = t1.splitlines(True)
        l2 = t2.splitlines(True)
        if options.difflib:
            l = difflib.unified_diff(l1, l2, file1, file2)
        else:
            l = bunidiff(t1, t2, l1, l2, file1, file2, context=options.context,
                         showfunc=options.show_c_function,
                         ignorews=options.ignore_all_space)
    for x in l:
        # only the last line of a file can lack its line ending; flag it
        # with the same marker line diff uses
        if not x.endswith('\n'):
            x += "\n\\ No newline at end of file\n"
        # every line ends in a newline at this point, so a plain write
        # produces what "print x," did
        sys.stdout.write(x)

file1 = args[0]
file2 = args[1]

if os.path.isfile(file1) and os.path.isfile(file2):
    # two regular files: a single diff
    diff_files(file1, file2)
elif os.path.isdir(file1):
    if not os.path.isdir(file2):
        sys.stderr.write("file types don't match\n")
        sys.exit(1)

    # two directory trees: map relative path -> (st_dev, st_ino) per side
    d1 = {}
    d2 = {}

    buildlist(d1, file1)
    buildlist(d2, file2)
    for x in sorted(d1):
        if x not in d2:
            # only present in the first tree: shown as removed
            f2 = None
        else:
            f2 = os.path.join(file2, x)
            # drop the entry so that d2 ends up holding only the files
            # that are missing from the first tree
            st2 = d2.pop(x)
            if d1[x] == st2:
                # same device and inode: both names are the same file
                sys.stderr.write("%s is a hard link\n" % x)
                continue
        diff_files(os.path.join(file1, x), f2)
    # whatever is left only exists in the second tree: shown as added
    for x in sorted(d2):
        diff_files(None, os.path.join(file2, x))
else:
    # Neither "two regular files" nor "directory first".  This used to fall
    # through silently with exit status 0; report the problem instead.
    for p in (file1, file2):
        if not os.path.exists(p):
            sys.stderr.write("%s: no such file or directory\n" % p)
            sys.exit(1)
    sys.stderr.write("file types don't match\n")
    sys.exit(1)