# HG changeset patch # User Augie Fackler # Date 1421188298 28800 # Node ID a5f1bccd2996e76d1d9de82c37129078151e240d # Parent e0c1328df87212fb0819a955919635aa80a58882 manifest.c: new extension code to lazily parse manifests This lets us iterate manifests in order, but do a _lot_ less work in the common case when we only care about a few manifest entries. Many thanks to Mike Edgar for reviewing this in advance of it going out to the list, which caught many things I missed. This version of the patch includes C89 fixes from Sean Farley and many correctness/efficiency cleanups from Martin von Zweigbergk. Thanks to both! diff -r e0c1328df872 -r a5f1bccd2996 mercurial/manifest.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/manifest.c Tue Jan 13 14:31:38 2015 -0800 @@ -0,0 +1,841 @@ +/* + * manifest.c - manifest type that does on-demand parsing. + * + * Copyright 2015, Google Inc. + * + * This software may be used and distributed according to the terms of + * the GNU General Public License, incorporated herein by reference. + */ +#include +#include +#include + +#include + +/* VC9 doesn't include bool and lacks stdbool.h based on my searching */ +#ifdef _MSC_VER +#define true 1 +#define false 0 +typedef unsigned char bool; +#else +#include +#endif + +#define DEFAULT_LINES 100000 + +typedef struct { + char *start; + Py_ssize_t len; /* length of line including terminal newline */ + char hash_suffix; + bool from_malloc; + bool deleted; +} line; + +typedef struct { + PyObject_HEAD + PyObject *pydata; + line *lines; + int numlines; /* number of line entries */ + int livelines; /* number of non-deleted lines */ + int maxlines; /* allocated number of lines */ + bool dirty; +} lazymanifest; + +#define MANIFEST_OOM -1 +#define MANIFEST_NOT_SORTED -2 +#define MANIFEST_MALFORMED -3 + +/* defined in parsers.c */ +PyObject *unhexlify(const char *str, int len); + +/* get the length of the path for a line */ +static size_t pathlen(line *l) { + return strlen(l->start); +} + +/* get the node value of a single line */ +static PyObject *nodeof(line *l) { + char *s = l->start; + ssize_t llen = pathlen(l); + PyObject *hash = unhexlify(s + llen + 1, 40); + if (!hash) { + return NULL; + } + if (l->hash_suffix != '\0') { + char newhash[21]; + memcpy(newhash, PyString_AsString(hash), 20); + Py_DECREF(hash); + newhash[20] = l->hash_suffix; + hash = PyString_FromStringAndSize(newhash, 21); + } + return hash; +} + +/* get the node hash and flags of a line as a tuple */ +static PyObject *hashflags(line *l) +{ + char *s = l->start; + size_t plen = pathlen(l); + PyObject *hash = nodeof(l); + + /* 40 for hash, 1 for null byte, 1 for newline */ + size_t hplen = plen + 42; + Py_ssize_t flen = l->len - hplen; + PyObject *flags; + PyObject *tup; + + if (!hash) + return NULL; + flags = PyString_FromStringAndSize(s + hplen - 1, flen); + if (!flags) { + Py_DECREF(hash); + return NULL; + } + tup = PyTuple_Pack(2, hash, flags); + Py_DECREF(flags); + Py_DECREF(hash); + return tup; +} + +/* if we're about to run out of space in the line index, add more */ +static bool realloc_if_full(lazymanifest *self) +{ + if (self->numlines == self->maxlines) { + self->maxlines *= 2; + self->lines = realloc(self->lines, self->maxlines * sizeof(line)); + } + return self->lines; +} + +/* + * Find the line boundaries in the manifest that 'data' points to and store + * information about each line in 'self'. + */ +static int find_lines(lazymanifest *self, char *data, Py_ssize_t len) +{ + char *prev = NULL; + while (len > 0) { + line *l; + char *next = memchr(data, '\n', len); + if (!next) { + return MANIFEST_MALFORMED; + } + next++; /* advance past newline */ + if (!realloc_if_full(self)) { + return MANIFEST_OOM; /* no memory */ + } + if (prev && strcmp(prev, data) > -1) { + /* This data isn't sorted, so we have to abort. */ + return MANIFEST_NOT_SORTED; + } + l = self->lines + ((self->numlines)++); + l->start = data; + l->len = next - data; + l->hash_suffix = '\0'; + l->from_malloc = false; + l->deleted = false; + len = len - l->len; + prev = data; + data = next; + } + self->livelines = self->numlines; + return 0; +} + +static int lazymanifest_init(lazymanifest *self, PyObject *args) +{ + char *data; + Py_ssize_t len; + int err, ret; + PyObject *pydata; + if (!PyArg_ParseTuple(args, "S", &pydata)) { + return -1; + } + err = PyString_AsStringAndSize(pydata, &data, &len); + + self->dirty = false; + if (err == -1) + return -1; + self->pydata = pydata; + Py_INCREF(self->pydata); + Py_BEGIN_ALLOW_THREADS + self->lines = malloc(DEFAULT_LINES * sizeof(line)); + self->maxlines = DEFAULT_LINES; + self->numlines = 0; + if (!self->lines) + ret = MANIFEST_OOM; + else + ret = find_lines(self, data, len); + Py_END_ALLOW_THREADS + switch (ret) { + case 0: + break; + case MANIFEST_OOM: + PyErr_NoMemory(); + break; + case MANIFEST_NOT_SORTED: + PyErr_Format(PyExc_ValueError, + "Manifest lines not in sorted order."); + break; + case MANIFEST_MALFORMED: + PyErr_Format(PyExc_ValueError, + "Manifest did not end in a newline."); + break; + default: + PyErr_Format(PyExc_ValueError, + "Unknown problem parsing manifest."); + } + return ret == 0 ? 0 : -1; +} + +static void lazymanifest_dealloc(lazymanifest *self) +{ + /* free any extra lines we had to allocate */ + int i; + for (i = 0; i < self->numlines; i++) { + if (self->lines[i].from_malloc) { + free(self->lines[i].start); + } + } + if (self->lines) { + free(self->lines); + self->lines = NULL; + } + if (self->pydata) { + Py_DECREF(self->pydata); + self->pydata = NULL; + } + PyObject_Del(self); +} + +/* iteration support */ + +typedef struct { + PyObject_HEAD lazymanifest *m; + Py_ssize_t pos; +} lmIter; + +static void lmiter_dealloc(PyObject *o) +{ + lmIter *self = (lmIter *)o; + Py_DECREF(self->m); + PyObject_Del(self); +} + +static PyObject *lmiter_iternext(PyObject *o) +{ + size_t pl; + line *l; + Py_ssize_t consumed; + PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL; + lmIter *self = (lmIter *)o; + do { + self->pos++; + if (self->pos >= self->m->numlines) { + goto bail; + } + /* skip over deleted manifest entries */ + } while (self->m->lines[self->pos].deleted); + l = self->m->lines + self->pos; + pl = pathlen(l); + path = PyString_FromStringAndSize(l->start, pl); + hash = nodeof(l); + consumed = pl + 41; + flags = PyString_FromStringAndSize(l->start + consumed, + l->len - consumed - 1); + if (!flags) { + goto bail; + } + ret = PyTuple_Pack(3, path, hash, flags); + bail: + Py_XDECREF(path); + Py_XDECREF(hash); + Py_XDECREF(flags); + return ret; +} + +static PyTypeObject lazymanifestIterator = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size */ + "parsers.lazymanifest.iterator", /*tp_name */ + sizeof(lmIter), /*tp_basicsize */ + 0, /*tp_itemsize */ + lmiter_dealloc, /*tp_dealloc */ + 0, /*tp_print */ + 0, /*tp_getattr */ + 0, /*tp_setattr */ + 0, /*tp_compare */ + 0, /*tp_repr */ + 0, /*tp_as_number */ + 0, /*tp_as_sequence */ + 0, /*tp_as_mapping */ + 0, /*tp_hash */ + 0, /*tp_call */ + 0, /*tp_str */ + 0, /*tp_getattro */ + 0, /*tp_setattro */ + 0, /*tp_as_buffer */ + /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to + use tp_iter and tp_iternext fields. */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER, + "Iterator for a lazymanifest.", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter: __iter__() method */ + lmiter_iternext, /* tp_iternext: next() method */ +}; + +static lazymanifest *lazymanifest_copy(lazymanifest *self); + +static PyObject *lazymanifest_getiter(lazymanifest *self) +{ + lmIter *i = NULL; + lazymanifest *t = lazymanifest_copy(self); + if (!t) { + PyErr_NoMemory(); + return NULL; + } + i = PyObject_New(lmIter, &lazymanifestIterator); + if (i) { + i->m = t; + i->pos = -1; + } else { + Py_DECREF(t); + PyErr_NoMemory(); + } + return (PyObject *)i; +} + +/* __getitem__ and __setitem__ support */ + +static Py_ssize_t lazymanifest_size(lazymanifest *self) +{ + return self->livelines; +} + +static int linecmp(const void *left, const void *right) +{ + return strcmp(((const line *)left)->start, + ((const line *)right)->start); +} + +static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key) +{ + line needle; + line *hit; + if (!PyString_Check(key)) { + PyErr_Format(PyExc_TypeError, + "getitem: manifest keys must be a string."); + return NULL; + } + needle.start = PyString_AsString(key); + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), + &linecmp); + if (!hit || hit->deleted) { + PyErr_Format(PyExc_KeyError, "No such manifest entry."); + return NULL; + } + return hashflags(hit); +} + +static int lazymanifest_delitem(lazymanifest *self, PyObject *key) +{ + line needle; + line *hit; + if (!PyString_Check(key)) { + PyErr_Format(PyExc_TypeError, + "delitem: manifest keys must be a string."); + return -1; + } + needle.start = PyString_AsString(key); + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), + &linecmp); + if (!hit || hit->deleted) { + PyErr_Format(PyExc_KeyError, + "Tried to delete nonexistent manifest entry."); + return -1; + } + self->dirty = true; + hit->deleted = true; + self->livelines--; + return 0; +} + +static int lazymanifest_setitem( + lazymanifest *self, PyObject *key, PyObject *value) +{ + char *path; + Py_ssize_t plen; + PyObject *pyhash; + Py_ssize_t hlen; + char *hash; + PyObject *pyflags; + char *flags; + Py_ssize_t flen; + size_t dlen; + char *dest; + int i; + line new; + line *hit; + if (!PyString_Check(key)) { + PyErr_Format(PyExc_TypeError, + "setitem: manifest keys must be a string."); + return -1; + } + if (!value) { + return lazymanifest_delitem(self, key); + } + if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) { + PyErr_Format(PyExc_TypeError, + "Manifest values must be a tuple of (node, flags)."); + return -1; + } + if (PyString_AsStringAndSize(key, &path, &plen) == -1) { + return -1; + } + + pyhash = PyTuple_GetItem(value, 0); + if (!PyString_Check(pyhash)) { + PyErr_Format(PyExc_TypeError, + "node must be a 20-byte string"); + return -1; + } + hlen = PyString_Size(pyhash); + /* Some parts of the codebase try and set 21 or 22 + * byte "hash" values in order to perturb things for + * status. We have to preserve at least the 21st + * byte. Sigh. If there's a 22nd byte, we drop it on + * the floor, which works fine. + */ + if (hlen != 20 && hlen != 21 && hlen != 22) { + PyErr_Format(PyExc_TypeError, + "node must be a 20-byte string"); + return -1; + } + hash = PyString_AsString(pyhash); + + pyflags = PyTuple_GetItem(value, 1); + if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) { + PyErr_Format(PyExc_TypeError, + "flags must a 0 or 1 byte string"); + return -1; + } + if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) { + return -1; + } + /* one null byte and one newline */ + dlen = plen + 41 + flen + 1; + dest = malloc(dlen); + if (!dest) { + PyErr_NoMemory(); + return -1; + } + memcpy(dest, path, plen + 1); + for (i = 0; i < 20; i++) { + sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]); + } + memcpy(dest + plen + 41, flags, flen); + dest[plen + 41 + flen] = '\n'; + new.start = dest; + new.len = dlen; + new.hash_suffix = '\0'; + if (hlen > 20) { + new.hash_suffix = hash[20]; + } + new.from_malloc = true; /* is `start` a pointer we allocated? */ + new.deleted = false; /* is this entry deleted? */ + hit = bsearch(&new, self->lines, self->numlines, + sizeof(line), &linecmp); + self->dirty = true; + if (hit) { + /* updating a line we already had */ + if (hit->from_malloc) { + free(hit->start); + } + if (hit->deleted) { + self->livelines++; + } + *hit = new; + } else { + /* new line */ + if (!realloc_if_full(self)) { + PyErr_NoMemory(); + return -1; + } + self->lines[self->numlines++] = new; + self->livelines++; + /* TODO: do a binary search and insert rather than + * append and qsort. Also offer a batch-insert + * interface, which should be a nice little + * performance win. + */ + qsort(self->lines, self->numlines, sizeof(line), &linecmp); + } + return 0; +} + +static PyMappingMethods lazymanifest_mapping_methods = { + (lenfunc)lazymanifest_size, /* mp_length */ + (binaryfunc)lazymanifest_getitem, /* mp_subscript */ + (objobjargproc)lazymanifest_setitem, /* mp_ass_subscript */ +}; + +/* sequence methods (important or __contains__ builds an iterator */ + +static int lazymanifest_contains(lazymanifest *self, PyObject *key) +{ + line needle; + line *hit; + if (!PyString_Check(key)) { + /* Our keys are always strings, so if the contains + * check is for a non-string, just return false. */ + return 0; + } + needle.start = PyString_AsString(key); + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), + &linecmp); + if (!hit || hit->deleted) { + return 0; + } + return 1; +} + +static PySequenceMethods lazymanifest_seq_meths = { + (lenfunc)lazymanifest_size, /* sq_length */ + 0, /* sq_concat */ + 0, /* sq_repeat */ + 0, /* sq_item */ + 0, /* sq_slice */ + 0, /* sq_ass_item */ + 0, /* sq_ass_slice */ + (objobjproc)lazymanifest_contains, /* sq_contains */ + 0, /* sq_inplace_concat */ + 0, /* sq_inplace_repeat */ +}; + + +/* Other methods (copy, diff, etc) */ +static PyTypeObject lazymanifestType; + +/* If the manifest has changes, build the new manifest text and reindex it. */ +static int compact(lazymanifest *self) { + int i; + ssize_t need = 0; + char *data; + line *src, *dst; + PyObject *pydata; + if (!self->dirty) + return 0; + for (i = 0; i < self->numlines; i++) { + if (!self->lines[i].deleted) { + need += self->lines[i].len; + } + } + pydata = PyString_FromStringAndSize(NULL, need); + if (!pydata) + return -1; + data = PyString_AsString(pydata); + if (!data) { + return -1; + } + src = self->lines; + dst = self->lines; + for (i = 0; i < self->numlines; i++, src++) { + char *tofree = NULL; + if (src->from_malloc) { + tofree = src->start; + } + if (!src->deleted) { + memcpy(data, src->start, src->len); + *dst = *src; + dst->start = data; + dst->from_malloc = false; + data += dst->len; + dst++; + } + free(tofree); + } + Py_DECREF(self->pydata); + self->pydata = pydata; + self->numlines = self->livelines; + self->dirty = false; + return 0; +} + +static PyObject *lazymanifest_text(lazymanifest *self) +{ + if (compact(self) != 0) { + PyErr_NoMemory(); + return NULL; + } + Py_INCREF(self->pydata); + return self->pydata; +} + +static lazymanifest *lazymanifest_copy(lazymanifest *self) +{ + lazymanifest *copy = NULL; + if (compact(self) != 0) { + goto nomem; + } + copy = PyObject_New(lazymanifest, &lazymanifestType); + if (!copy) { + goto nomem; + } + copy->numlines = self->numlines; + copy->livelines = self->livelines; + copy->dirty = false; + copy->lines = malloc(self->maxlines *sizeof(line)); + if (!copy->lines) { + goto nomem; + } + memcpy(copy->lines, self->lines, self->numlines * sizeof(line)); + copy->maxlines = self->maxlines; + copy->pydata = self->pydata; + Py_INCREF(copy->pydata); + return copy; + nomem: + PyErr_NoMemory(); + Py_XDECREF(copy); + return NULL; +} + +static lazymanifest *lazymanifest_filtercopy( + lazymanifest *self, PyObject *matchfn) +{ + lazymanifest *copy = NULL; + int i; + if (!PyCallable_Check(matchfn)) { + PyErr_SetString(PyExc_TypeError, "matchfn must be callable"); + return NULL; + } + /* compact ourselves first to avoid double-frees later when we + * compact tmp so that it doesn't have random pointers to our + * underlying from_malloc-data (self->pydata is safe) */ + if (compact(self) != 0) { + goto nomem; + } + copy = PyObject_New(lazymanifest, &lazymanifestType); + copy->dirty = true; + copy->lines = malloc(self->maxlines * sizeof(line)); + if (!copy->lines) { + goto nomem; + } + copy->maxlines = self->maxlines; + copy->numlines = 0; + copy->pydata = self->pydata; + Py_INCREF(self->pydata); + for (i = 0; i < self->numlines; i++) { + PyObject *arg = PyString_FromString(self->lines[i].start); + PyObject *arglist = PyTuple_Pack(1, arg); + PyObject *result = PyObject_CallObject(matchfn, arglist); + Py_DECREF(arglist); + Py_DECREF(arg); + /* if the callback raised an exception, just let it + * through and give up */ + if (!result) { + free(copy->lines); + Py_DECREF(self->pydata); + return NULL; + } + if (PyObject_IsTrue(result)) { + assert(!(self->lines[i].from_malloc)); + copy->lines[copy->numlines++] = self->lines[i]; + } + Py_DECREF(result); + } + copy->livelines = copy->numlines; + return copy; + nomem: + PyErr_NoMemory(); + Py_XDECREF(copy); + return NULL; +} + +static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args) +{ + lazymanifest *other; + PyObject *pyclean = NULL; + bool listclean; + PyObject *emptyTup = NULL, *ret = NULL; + PyObject *es; + int sneedle = 0, oneedle = 0; + if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other, &pyclean)) { + return NULL; + } + listclean = (!pyclean) ? false : PyObject_IsTrue(pyclean); + es = PyString_FromString(""); + if (!es) { + goto nomem; + } + emptyTup = PyTuple_Pack(2, Py_None, es); + Py_DECREF(es); + if (!emptyTup) { + goto nomem; + } + ret = PyDict_New(); + if (!ret) { + goto nomem; + } + while (sneedle != self->numlines || oneedle != other->numlines) { + line *left = self->lines + sneedle; + line *right = other->lines + oneedle; + int result; + PyObject *key; + PyObject *outer; + /* If we're looking at a deleted entry and it's not + * the end of the manifest, just skip it. */ + if (left->deleted && sneedle < self->numlines) { + sneedle++; + continue; + } + if (right->deleted && oneedle < other->numlines) { + oneedle++; + continue; + } + /* if we're at the end of either manifest, then we + * know the remaining items are adds so we can skip + * the strcmp. */ + if (sneedle == self->numlines) { + result = 1; + } else if (oneedle == other->numlines) { + result = -1; + } else { + result = linecmp(left, right); + } + key = result <= 0 ? + PyString_FromString(left->start) : + PyString_FromString(right->start); + if (!key) + goto nomem; + if (result < 0) { + PyObject *l = hashflags(left); + if (!l) { + goto nomem; + } + outer = PyTuple_Pack(2, l, emptyTup); + Py_DECREF(l); + if (!outer) { + goto nomem; + } + PyDict_SetItem(ret, key, outer); + Py_DECREF(outer); + sneedle++; + } else if (result > 0) { + PyObject *r = hashflags(right); + if (!r) { + goto nomem; + } + outer = PyTuple_Pack(2, emptyTup, r); + Py_DECREF(r); + if (!outer) { + goto nomem; + } + PyDict_SetItem(ret, key, outer); + Py_DECREF(outer); + oneedle++; + } else { + /* file exists in both manifests */ + if (left->len != right->len + || memcmp(left->start, right->start, left->len) + || left->hash_suffix != right->hash_suffix) { + PyObject *l = hashflags(left); + PyObject *r; + if (!l) { + goto nomem; + } + r = hashflags(right); + if (!r) { + Py_DECREF(l); + goto nomem; + } + outer = PyTuple_Pack(2, l, r); + Py_DECREF(l); + Py_DECREF(r); + if (!outer) { + goto nomem; + } + PyDict_SetItem(ret, key, outer); + Py_DECREF(outer); + } else if (listclean) { + PyDict_SetItem(ret, key, Py_None); + } + sneedle++; + oneedle++; + } + Py_DECREF(key); + } + Py_DECREF(emptyTup); + return ret; + nomem: + PyErr_NoMemory(); + Py_XDECREF(ret); + Py_XDECREF(emptyTup); + return NULL; +} + +static PyMethodDef lazymanifest_methods[] = { + {"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS, + "Make a copy of this lazymanifest."}, + {"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O, + "Make a copy of this manifest filtered by matchfn."}, + {"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS, + "Compare this lazymanifest to another one."}, + {"text", (PyCFunction)lazymanifest_text, METH_NOARGS, + "Encode this manifest to text."}, + {NULL}, +}; + +static PyTypeObject lazymanifestType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "parsers.lazymanifest", /* tp_name */ + sizeof(lazymanifest), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)lazymanifest_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &lazymanifest_seq_meths, /* tp_as_sequence */ + &lazymanifest_mapping_methods, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */ + "TODO(augie)", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + (getiterfunc)lazymanifest_getiter, /* tp_iter */ + 0, /* tp_iternext */ + lazymanifest_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)lazymanifest_init, /* tp_init */ + 0, /* tp_alloc */ +}; + +void manifest_module_init(PyObject * mod) +{ + lazymanifestType.tp_new = PyType_GenericNew; + if (PyType_Ready(&lazymanifestType) < 0) + return; + Py_INCREF(&lazymanifestType); + + PyModule_AddObject(mod, "lazymanifest", + (PyObject *)&lazymanifestType); +} diff -r e0c1328df872 -r a5f1bccd2996 mercurial/parsers.c --- a/mercurial/parsers.c Thu Mar 05 22:16:28 2015 -0800 +++ b/mercurial/parsers.c Tue Jan 13 14:31:38 2015 -0800 @@ -71,7 +71,7 @@ /* * Turn a hex-encoded string into binary. */ -static PyObject *unhexlify(const char *str, int len) +PyObject *unhexlify(const char *str, int len) { PyObject *ret; char *d; @@ -2318,6 +2318,7 @@ }; void dirs_module_init(PyObject *mod); +void manifest_module_init(PyObject *mod); static void module_init(PyObject *mod) { @@ -2332,6 +2333,7 @@ PyModule_AddStringConstant(mod, "versionerrortext", versionerrortext); dirs_module_init(mod); + manifest_module_init(mod); indexType.tp_new = PyType_GenericNew; if (PyType_Ready(&indexType) < 0 || diff -r e0c1328df872 -r a5f1bccd2996 setup.py --- a/setup.py Thu Mar 05 22:16:28 2015 -0800 +++ b/setup.py Tue Jan 13 14:31:38 2015 -0800 @@ -493,6 +493,7 @@ Extension('mercurial.mpatch', ['mercurial/mpatch.c'], depends=common_depends), Extension('mercurial.parsers', ['mercurial/dirs.c', + 'mercurial/manifest.c', 'mercurial/parsers.c', 'mercurial/pathencode.c'], depends=common_depends), diff -r e0c1328df872 -r a5f1bccd2996 tests/test-manifest.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-manifest.py Tue Jan 13 14:31:38 2015 -0800 @@ -0,0 +1,221 @@ +import binascii +import unittest +import itertools + +import silenttestrunner + +from mercurial import parsers + +HASH_1 = '1' * 40 +HASH_2 = 'f' * 40 +HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe' +A_SHORT_MANIFEST = ( + 'bar/baz/qux.py\0%(hash2)s%(flag2)s\n' + 'foo\0%(hash1)s%(flag1)s\n' + ) % {'hash1': HASH_1, + 'flag1': '', + 'hash2': HASH_2, + 'flag2': 'l', + } + +HUGE_MANIFEST_ENTRIES = 200001 + +A_HUGE_MANIFEST = ''.join(sorted( + 'file%d\0%s%s\n' % (i, h, f) for i, h, f in + itertools.izip(xrange(200001), + itertools.cycle((HASH_1, HASH_2)), + itertools.cycle(('', 'x', 'l'))))) + +class testmanifest(unittest.TestCase): + + def assertIn(self, thing, container, msg=None): + # assertIn new in 2.7, use it if available, otherwise polyfill + sup = getattr(unittest.TestCase, 'assertIn', False) + if sup: + return sup(self, thing, container, msg=msg) + if not msg: + msg = 'Expected %r in %r' % (thing, container) + self.assert_(thing in container, msg) + + def testEmptyManifest(self): + m = parsers.lazymanifest('') + self.assertEqual(0, len(m)) + self.assertEqual([], list(m)) + + def testManifest(self): + m = parsers.lazymanifest(A_SHORT_MANIFEST) + want = [ + ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), + ('foo', binascii.unhexlify(HASH_1), ''), + ] + self.assertEqual(len(want), len(m)) + self.assertEqual(want, list(m)) + self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo']) + self.assertRaises(KeyError, lambda : m['wat']) + self.assertEqual((binascii.unhexlify(HASH_2), 'l'), + m['bar/baz/qux.py']) + + def testSetItem(self): + want = binascii.unhexlify(HASH_1), '' + + m = parsers.lazymanifest('') + m['a'] = want + self.assertIn('a', m) + self.assertEqual(want, m['a']) + self.assertEqual('a\0' + HASH_1 + '\n', m.text()) + + m = parsers.lazymanifest(A_SHORT_MANIFEST) + m['a'] = want + self.assertEqual(want, m['a']) + self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST, + m.text()) + m2 = m.copy() + del m + del m2 # make sure we don't double free() anything + + def testCompaction(self): + unhex = binascii.unhexlify + h1, h2 = unhex(HASH_1), unhex(HASH_2) + m = parsers.lazymanifest(A_SHORT_MANIFEST) + m['alpha'] = h1, '' + m['beta'] = h2, '' + del m['foo'] + want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % ( + HASH_1, HASH_2, HASH_2) + self.assertEqual(want, m.text()) + self.assertEqual(3, len(m)) + self.assertEqual((h1, ''), m['alpha']) + self.assertEqual((h2, ''), m['beta']) + self.assertRaises(KeyError, lambda : m['foo']) + w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2, '')] + self.assertEqual(w, list(m)) + + def testSetGetNodeSuffix(self): + clean = parsers.lazymanifest(A_SHORT_MANIFEST) + m = parsers.lazymanifest(A_SHORT_MANIFEST) + h, f = m['foo'] + want = h + 'a', f + # Merge code wants to set 21-byte fake hashes at times + m['foo'] = want + self.assertEqual(want, m['foo']) + self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), + ('foo', binascii.unhexlify(HASH_1) + 'a', '')], + list(m)) + # Sometimes it even tries a 22-byte fake hash, but we can + # return 21 and it'll work out + m['foo'] = want[0] + '+', f + self.assertEqual(want, m['foo']) + # make sure the suffix survives a copy + m2 = m.filtercopy(lambda x: x == 'foo') + self.assertEqual(want, m2['foo']) + self.assertEqual(1, len(m2)) + self.assertEqual(('foo\0%s\n' % HASH_1), m2.text()) + m2 = m.copy() + self.assertEqual(want, m2['foo']) + # suffix with iteration + self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), + ('foo', want[0], '')], list(m)) + # shows up in diff + self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean)) + self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m)) + + def testFilterCopyException(self): + m = parsers.lazymanifest(A_SHORT_MANIFEST) + def filt(path): + if path == 'foo': + assert False + return True + self.assertRaises(AssertionError, m.filtercopy, filt) + + def testRemoveItem(self): + m = parsers.lazymanifest(A_SHORT_MANIFEST) + del m['foo'] + self.assertRaises(KeyError, lambda : m['foo']) + self.assertEqual(1, len(m)) + self.assertEqual(1, len(list(m))) + + def testManifestDiff(self): + MISSING = (None, '') + addl = 'z-only-in-left\0' + HASH_1 + '\n' + addr = 'z-only-in-right\0' + HASH_2 + 'x\n' + left = parsers.lazymanifest( + A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl) + right = parsers.lazymanifest(A_SHORT_MANIFEST + addr) + want = { + 'foo': ((binascii.unhexlify(HASH_3), 'x'), + (binascii.unhexlify(HASH_1), '')), + 'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING), + 'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2), 'x')), + } + self.assertEqual(want, left.diff(right)) + + want = { + 'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2), 'l')), + 'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')), + 'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')), + } + self.assertEqual(want, parsers.lazymanifest('').diff(left)) + + want = { + 'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), MISSING), + 'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING), + 'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING), + } + self.assertEqual(want, left.diff(parsers.lazymanifest(''))) + copy = right.copy() + del copy['z-only-in-right'] + del right['foo'] + want = { + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), + 'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'), MISSING), + } + self.assertEqual(want, right.diff(copy)) + + short = parsers.lazymanifest(A_SHORT_MANIFEST) + pruned = short.copy() + del pruned['foo'] + want = { + 'foo': ((binascii.unhexlify(HASH_1), ''), MISSING), + } + self.assertEqual(want, short.diff(pruned)) + want = { + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), + } + self.assertEqual(want, pruned.diff(short)) + want = { + 'bar/baz/qux.py': None, + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), + } + self.assertEqual(want, pruned.diff(short, True)) + + def testReversedLines(self): + backwards = ''.join( + l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if l) + try: + parsers.lazymanifest(backwards) + self.fail('Should have raised ValueError') + except ValueError, v: + self.assertIn('Manifest lines not in sorted order.', str(v)) + + def testNoTerminalNewline(self): + try: + parsers.lazymanifest(A_SHORT_MANIFEST + 'wat') + self.fail('Should have raised ValueError') + except ValueError, v: + self.assertIn('Manifest did not end in a newline.', str(v)) + + def testNoNewLineAtAll(self): + try: + parsers.lazymanifest('wat') + self.fail('Should have raised ValueError') + except ValueError, v: + self.assertIn('Manifest did not end in a newline.', str(v)) + + def testHugeManifest(self): + m = parsers.lazymanifest(A_HUGE_MANIFEST) + self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m)) + self.assertEqual(len(m), len(list(m))) + + +if __name__ == '__main__': + silenttestrunner.main(__name__)