manifest: improve parsing performance by 8x via a new C extension
authorBryan O'Sullivan <bos@serpentine.com>
Wed, 26 Mar 2008 10:12:10 -0700
changeset 6389 0231f763ebc8
parent 6388 1e4ddc9ac9f7
child 6390 67784cb38d58
manifest: improve parsing performance by 8x via a new C extension
mercurial/manifest.py
mercurial/parsers.c
setup.py
--- a/mercurial/manifest.py	Mon Mar 24 18:05:30 2008 -0500
+++ b/mercurial/manifest.py	Wed Mar 26 10:12:10 2008 -0700
@@ -8,7 +8,7 @@
 from node import bin, hex, nullid
 from revlog import revlog, RevlogError
 from i18n import _
-import array, struct, mdiff
+import array, struct, mdiff, parsers
 
 class manifestdict(dict):
     def __init__(self, mapping=None, flags=None):
@@ -39,14 +39,7 @@
 
     def parse(self, lines):
         mfdict = manifestdict()
-        fdict = mfdict._flags
-        for l in lines.splitlines():
-            f, n = l.split('\0')
-            if len(n) > 40:
-                fdict[f] = n[40:]
-                mfdict[f] = bin(n[:40])
-            else:
-                mfdict[f] = bin(n)
+        parsers.parse_manifest(mfdict, mfdict._flags, lines)
         return mfdict
 
     def readdelta(self, node):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/parsers.c	Wed Mar 26 10:12:10 2008 -0700
@@ -0,0 +1,168 @@
+/*
+ parsers.c - efficient content parsing
+
+ Copyright 2008 Matt Mackall <mpm@selenic.com> and others
+
+ This software may be used and distributed according to the terms of
+ the GNU General Public License, incorporated herein by reference.
+*/
+
+#include <Python.h>
+#include <ctype.h>
+#include <string.h>
+
+static int hexdigit(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	
+	return -1;
+}
+
+/*
+ * Turn a hex-encoded string into binary.
+ */
+static PyObject *unhexlify(const char *str, int len)
+{
+	PyObject *ret = NULL;
+	char *c, *d;
+
+	if (len % 2) {
+		PyErr_SetString(PyExc_ValueError,
+				"input is not even in length");
+		goto bail;
+	}
+
+	ret = PyString_FromStringAndSize(NULL, len / 2);
+	if (!ret)
+		goto bail;
+
+	d = PyString_AsString(ret);
+	if (!d)
+		goto bail;
+
+	for (c = str; c < str + len;) {
+		int hi = hexdigit(*c++);
+		int lo = hexdigit(*c++);
+
+		if (hi == -1 || lo == -1) {
+			PyErr_SetString(PyExc_ValueError,
+					"input contains non-hex character");
+			goto bail;
+		}
+
+		*d++ = (hi << 4) | lo;
+	}
+	
+	goto done;
+	
+bail:
+	Py_XDECREF(ret);
+	ret = NULL;
+done:
+	return ret;
+}
+
+/*
+ * This code assumes that a manifest is stitched together with newline
+ * ('\n') characters.
+ */
+static PyObject *parse_manifest(PyObject *self, PyObject *args)
+{
+	PyObject *mfdict, *fdict;
+	char *str, *cur, *start, *zero;
+	int len;
+
+	if (!PyArg_ParseTuple(args, "O!O!s#:parse_manifest",
+			      &PyDict_Type, &mfdict,
+			      &PyDict_Type, &fdict,
+			      &str, &len))
+		goto quit;
+
+	for (start = cur = str, zero = NULL; cur < str + len; cur++) {
+		PyObject *file = NULL, *node = NULL;
+		PyObject *flags = NULL;
+		int nlen;
+
+		if (!*cur) {
+			zero = cur;
+			continue;
+		}
+		else if (*cur != '\n')
+			continue;
+
+		if (!zero) {
+			PyErr_SetString(PyExc_ValueError,
+					"manifest entry has no separator");
+			goto quit;
+		}
+
+		file = PyString_FromStringAndSize(start, zero - start);
+		if (!file)
+			goto bail;
+
+		nlen = cur - zero - 1;
+
+		node = unhexlify(zero + 1, nlen > 40 ? 40 : nlen);
+		if (!node)
+			goto bail;
+
+		if (nlen > 40) {
+			PyObject *flags;
+
+			flags = PyString_FromStringAndSize(zero + 41,
+							   nlen - 40);
+			if (!flags)
+				goto bail;
+
+			if (PyDict_SetItem(fdict, file, flags) == -1)
+				goto bail;
+		}
+
+		if (PyDict_SetItem(mfdict, file, node) == -1)
+			goto bail;
+
+		start = cur + 1;
+		zero = NULL;
+
+		Py_XDECREF(flags);
+		Py_XDECREF(node);
+		Py_XDECREF(file);
+		continue;
+	bail:
+		Py_XDECREF(flags);
+		Py_XDECREF(node);
+		Py_XDECREF(file);
+		goto quit;
+	}
+
+	if (len > 0 && *(cur - 1) != '\n') {
+		PyErr_SetString(PyExc_ValueError,
+				"manifest contains trailing garbage");
+		goto quit;
+	}
+
+	Py_INCREF(Py_None);
+	return Py_None;
+
+quit:
+	return NULL;
+}
+
+static char parsers_doc[] = "Efficient content parsing.";
+
+static PyMethodDef methods[] = {
+	{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
+	{NULL, NULL}
+};
+
+PyMODINIT_FUNC initparsers(void)
+{
+	Py_InitModule3("parsers", methods, parsers_doc);
+}
--- a/setup.py	Mon Mar 24 18:05:30 2008 -0500
+++ b/setup.py	Wed Mar 26 10:12:10 2008 -0700
@@ -88,10 +88,11 @@
 cmdclass = {'install_data': install_package_data}
 
 ext_modules=[
-    Extension('mercurial.mpatch', ['mercurial/mpatch.c']),
+    Extension('mercurial.base85', ['mercurial/base85.c']),
     Extension('mercurial.bdiff', ['mercurial/bdiff.c']),
-    Extension('mercurial.base85', ['mercurial/base85.c']),
-    Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c'])
+    Extension('mercurial.diffhelpers', ['mercurial/diffhelpers.c']),
+    Extension('mercurial.mpatch', ['mercurial/mpatch.c']),
+    Extension('mercurial.parsers', ['mercurial/parsers.c']),
     ]
 
 packages = ['mercurial', 'mercurial.hgweb', 'hgext', 'hgext.convert']