Add bdiff.blocks / minor performance tweaks
authormpm@selenic.com
Wed, 22 Jun 2005 11:27:50 -0800
changeset 433 79c694462294
parent 432 3b9e3d3d2810
child 434 08f00b6494f4
Add bdiff.blocks / minor performance tweaks -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Add bdiff.blocks / minor performance tweaks This refactors bdiff.bdiff so that we can get a list of matching blocks of line numbers for use by annotate/unidiff. Minor performance tweaks: - - add a field for equivalence so we can keep h around a bit longer for cmp - - mix len into the hash to reduce collisions - - move an operation into the slow path in longest_match manifest hash: b1aee590b6291b31069ea8a86b6aa8fb259ac244 -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.0 (GNU/Linux) iD8DBQFCubu2ywK+sNU5EO8RAm4FAJ9r10aJpT7qA96nqGYFHcuy4XcIHgCfeFx5 q0PyTXeZQc7Fw5kwEPcoykI= =QXSb -----END PGP SIGNATURE-----
mercurial/bdiff.c
--- a/mercurial/bdiff.c	Wed Jun 22 11:23:01 2005 -0800
+++ b/mercurial/bdiff.c	Wed Jun 22 11:27:50 2005 -0800
@@ -30,7 +30,7 @@
 #endif
 
 struct line {
-	int h, len, n;
+	int h, len, n, e;
 	const char *l;
 };
 
@@ -69,7 +69,7 @@
 		h = *p + rol32(h, 7); /* a simple hash from GNU diff */
 		if (*p == '\n' || p == a + len - 1) {
 			l->len = p - b + 1;
-			l->h = h;
+			l->h = h * l->len;
 			l->l = b;
 			l->n = -1;
 			l++;
@@ -86,7 +86,7 @@
 
 int inline cmp(struct line *a, struct line *b)
 {
-	return a->len != b->len || memcmp(a->l, b->l, a->len);
+	return a->h != b->h || a->len != b->len || memcmp(a->l, b->l, a->len);
 }
 
 static int equatelines(struct line *a, int an, struct line *b, int bn)
@@ -118,7 +118,7 @@
 
 		/* add to the head of the equivalence class */
 		b[i].n = h[j];
-		b[i].h = j;
+		b[i].e = j;
 		h[j] = i;
 		l[j]++; /* keep track of popularity */
 	}
@@ -133,7 +133,7 @@
 			if (!cmp(a + i, b + h[j]))
 				break;
 
-		a[i].h = j; /* use equivalence class for quick compare */
+		a[i].e = j; /* use equivalence class for quick compare */
 		if(l[j] <= t)
 			a[i].n = h[j]; /* point to head of match list */
 		else
@@ -159,11 +159,11 @@
 		/* loop through all lines match a[i] in b */
 		for (; j != -1 && j < b2; j = b[j].n) {
 			/* does this extend an earlier match? */
-			if (i > a1 && j > b1 && jpos[j - 1] == i)
+			if (i > a1 && j > b1 && jpos[j - 1] == i - 1)
 				k = jlen[j - 1] + 1;
 			else
 				k = 1;
-			jpos[j] = i + 1;
+			jpos[j] = i;
 			jlen[j] = k;
 
 			/* best match so far? */
@@ -182,10 +182,10 @@
 
 	/* expand match to include neighboring popular lines */
 	while (mi - mb > a1 && mj - mb > b1 &&
-	       a[mi - mb - 1].h == b[mj - mb - 1].h)
+	       a[mi - mb - 1].e == b[mj - mb - 1].e)
 		mb++;
 	while (mi + mk < a2 && mj + mk < b2 &&
-	       a[mi + mk].h == b[mj + mk].h)
+	       a[mi + mk].e == b[mj + mk].e)
 		mk++;
 
 	*omi = mi - mb;
@@ -213,33 +213,84 @@
 	recurse(a, b, jpos, jlen, i + k, a2, j + k, b2, l);
 }
 
-static PyObject *bdiff(PyObject *self, PyObject *args)
+static struct hunklist diff(struct line *a, int an, struct line *b, int bn)
 {
-	PyObject *sa, *sb, *result = NULL;
+	struct hunklist l;
+	int *jpos, *jlen, t;
+
+	/* allocate and fill arrays */
+	t = equatelines(a, an, b, bn);
+	jpos = calloc(bn, sizeof(int));
+	jlen = calloc(bn, sizeof(int));
+	l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2));
+
+	if (jpos && jlen && l.base && t) {
+		/* generate the matching block list */
+		recurse(a, b, jpos, jlen, 0, an, 0, bn, &l);
+		l.head->a1 = an;
+		l.head->b1 = bn;
+		l.head++;
+	}
+
+	free(jpos);
+	free(jlen);
+	return l;
+}
+
+static PyObject *blocks(PyObject *self, PyObject *args)
+{
+	PyObject *sa, *sb, *rl, *m;
+	struct line *a, *b;
 	struct hunklist l;
 	struct hunk *h;
-	struct line *al, *bl;
-	char encode[12], *rb;
-	int an, bn, len = 0, t, la = 0, lb = 0, *jpos, *jlen;
+	int an, bn, pos = 0;
 
 	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
 		return NULL;
 
-	/* allocate and fill arrays */
+	an = splitlines(PyString_AsString(sa), PyString_Size(sa), &a);
+	bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &b);
+	if (!a || !b)
+		goto nomem;
+
+	l = diff(a, an, b, bn);
+	rl = PyList_New(l.head - l.base);
+	if (!l.head || !rl)
+		goto nomem;
+
+	for(h = l.base; h != l.head; h++) {
+		m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2);
+		PyList_SetItem(rl, pos, m);
+		pos++;
+	}
+
+nomem:
+	free(a);
+	free(b);
+	free(l.base);
+	return rl ? rl : PyErr_NoMemory();
+}
+
+static PyObject *bdiff(PyObject *self, PyObject *args)
+{
+	PyObject *sa, *sb, *result = NULL;
+	struct line *al, *bl;
+	struct hunklist l;
+	struct hunk *h;
+	char encode[12], *rb;
+	int an, bn, len = 0, la = 0, lb = 0;
+
+	if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb))
+		return NULL;
+
 	an = splitlines(PyString_AsString(sa), PyString_Size(sa), &al);
 	bn = splitlines(PyString_AsString(sb), PyString_Size(sb), &bl);
-	t = equatelines(al, an, bl, bn);
-	jpos = calloc(bn, sizeof(int));
-	jlen = calloc(bn, sizeof(int));
-	l.head = l.base = malloc(sizeof(struct hunk) * ((an + bn) / 4 + 2));
-	if (!al || !bl || !jpos || !jlen || !l.base || !t)
+	if (!al || !bl)
 		goto nomem;
 
-	/* generate the matching block list */
-	recurse(al, bl, jpos, jlen, 0, an, 0, bn, &l);
-	l.head->a1 = an;
-	l.head->b1 = bn;
-	l.head++;
+	l = diff(al, an, bl, bn);
+	if (!l.head)
+		goto nomem;
 
 	/* calculate length of output */
 	for(h = l.base; h != l.head; h++) {
@@ -274,8 +325,6 @@
 nomem:
 	free(al);
 	free(bl);
-	free(jpos);
-	free(jlen);
 	free(l.base);
 	return result ? result : PyErr_NoMemory();
 }
@@ -284,6 +333,7 @@
 
 static PyMethodDef methods[] = {
 	{"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"},
+	{"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"},
 	{NULL, NULL}
 };