# HG changeset patch # User Martin von Zweigbergk # Date 1428624875 25200 # Node ID 0de132d5328acc43a9f267b559c199cf20e0a362 # Parent eafa06e9edde4bb5254ef1f577440fe9c16222f5 treemanifest: lazily load manifests Most operations on treemanifests already visit only relevant submanifests. Notable examples include __getitem__, __contains__, walk/matches with matcher, diff. By making submanifests lazily loaded, we speed up all these operations. The lazy loading is achieved by adding a _load() method that gets defined where we currently eagerly parse the manifest. We make sure to call it before any access to _dirs, _files or _flags. Some timings on the Mozilla repo (with flat manifest timings for reference): hg cat -r . README.txt: 1.644s -> 0.096s (0.255s) hg diff -r .^ -r . : 1.746s -> 0.137s (0.431s) hg files -r . python : 1.508s -> 0.146s (0.335s) hg files -r . : 2.125s -> 2.203s (0.712s) diff -r eafa06e9edde -r 0de132d5328a mercurial/manifest.py --- a/mercurial/manifest.py Mon May 18 21:31:40 2015 -0700 +++ b/mercurial/manifest.py Thu Apr 09 17:14:35 2015 -0700 @@ -441,10 +441,13 @@ else: return '', f +_noop = lambda: None + class treemanifest(object): def __init__(self, dir='', text=''): self._dir = dir self._node = revlog.nullid + self._load = _noop self._dirty = False self._dirs = {} # Using _lazymanifest here is a little slower than plain old dicts @@ -461,18 +464,22 @@ return self._dir + path def __len__(self): + self._load() size = len(self._files) for m in self._dirs.values(): size += m.__len__() return size def _isempty(self): + self._load() # for consistency; already loaded by all callers return (not self._files and (not self._dirs or all(m._isempty() for m in self._dirs.values()))) def __str__(self): - return ('' % - (self._dir, revlog.hex(self._node), self._dirty)) + return ('' % + (self._dir, revlog.hex(self._node), + bool(self._load is _noop), + self._dirty)) def dir(self): '''The directory that this tree manifest represents, including a @@ -491,6 +498,7 @@ self._dirty = False def iteritems(self): + self._load() for p, n in sorted(self._dirs.items() + self._files.items()): if p in self._files: yield self._subpath(p), n @@ -499,6 +507,7 @@ yield f, sn def iterkeys(self): + self._load() for p in sorted(self._dirs.keys() + self._files.keys()): if p in self._files: yield self._subpath(p) @@ -515,6 +524,7 @@ def __contains__(self, f): if f is None: return False + self._load() dir, subpath = _splittopdir(f) if dir: if dir not in self._dirs: @@ -524,6 +534,7 @@ return f in self._files def get(self, f, default=None): + self._load() dir, subpath = _splittopdir(f) if dir: if dir not in self._dirs: @@ -533,6 +544,7 @@ return self._files.get(f, default) def __getitem__(self, f): + self._load() dir, subpath = _splittopdir(f) if dir: return self._dirs[dir].__getitem__(subpath) @@ -540,6 +552,7 @@ return self._files[f] def flags(self, f): + self._load() dir, subpath = _splittopdir(f) if dir: if dir not in self._dirs: @@ -551,6 +564,7 @@ return self._flags.get(f, '') def find(self, f): + self._load() dir, subpath = _splittopdir(f) if dir: return self._dirs[dir].find(subpath) @@ -558,6 +572,7 @@ return self._files[f], self._flags.get(f, '') def __delitem__(self, f): + self._load() dir, subpath = _splittopdir(f) if dir: self._dirs[dir].__delitem__(subpath) @@ -572,6 +587,7 @@ def __setitem__(self, f, n): assert n is not None + self._load() dir, subpath = _splittopdir(f) if dir: if dir not in self._dirs: @@ -584,6 +600,7 @@ def setflag(self, f, flags): """Set the flags (symlink, executable) for path f.""" assert 'd' not in flags + self._load() dir, subpath = _splittopdir(f) if dir: if dir not in self._dirs: @@ -597,10 +614,19 @@ copy = treemanifest(self._dir) copy._node = self._node copy._dirty = self._dirty - for d in self._dirs: - copy._dirs[d] = self._dirs[d].copy() - copy._files = dict.copy(self._files) - copy._flags = dict.copy(self._flags) + def _load(): + self._load() + for d in self._dirs: + copy._dirs[d] = self._dirs[d].copy() + copy._files = dict.copy(self._files) + copy._flags = dict.copy(self._flags) + copy._load = _noop + copy._load = _load + if self._load == _noop: + # Chaining _load if it's _noop is functionally correct, but the + # chain may end up excessively long (stack overflow), and + # will prevent garbage collection of 'self'. + copy._load() return copy def filesnotin(self, m2): @@ -609,6 +635,8 @@ def _filesnotin(t1, t2): if t1._node == t2._node and not t1._dirty and not t2._dirty: return + t1._load() + t2._load() for d, m1 in t1._dirs.iteritems(): if d in t2._dirs: m2 = t2._dirs[d] @@ -631,6 +659,7 @@ return self._alldirs def hasdir(self, dir): + self._load() topdir, subdir = _splittopdir(dir) if topdir: if topdir in self._dirs: @@ -673,6 +702,7 @@ return # yield this dir's files and walk its submanifests + self._load() for p in sorted(self._dirs.keys() + self._files.keys()): if p in self._files: fullp = self._subpath(p) @@ -697,6 +727,7 @@ if not match.visitdir(self._dir[:-1] or '.'): return ret + self._load() for fn in self._files: fullp = self._subpath(fn) if not match(fullp): @@ -734,6 +765,8 @@ def _diff(t1, t2): if t1._node == t2._node and not t1._dirty and not t2._dirty: return + t1._load() + t2._load() for d, m1 in t1._dirs.iteritems(): m2 = t2._dirs.get(d, emptytree) _diff(m1, m2) @@ -784,6 +817,7 @@ def text(self, usemanifestv2=False): """Get the full data of this manifest as a bytestring.""" + self._load() flags = self.flags return _text(((f, self[f], flags(f)) for f in self.keys()), usemanifestv2) @@ -792,12 +826,23 @@ """Get the full data of this directory as a bytestring. Make sure that any submanifests have been written first, so their nodeids are correct. """ + self._load() flags = self.flags dirs = [(d[:-1], self._dirs[d]._node, 'd') for d in self._dirs] files = [(f, self._files[f], flags(f)) for f in self._files] return _text(sorted(dirs + files), usemanifestv2) + def read(self, gettext, readsubtree): + def _load(): + # Mark as loaded already here, so __setitem__ and setflag() don't + # cause infinite loops when they try to load. + self._load = _noop + self.parse(gettext(), readsubtree) + self._dirty = False + self._load = _load + def writesubtrees(self, m1, m2, writesubtree): + self._load() # for consistency; should never have any effect here emptytree = treemanifest() for d, subm in self._dirs.iteritems(): subp1 = m1._dirs.get(d, emptytree)._node @@ -891,15 +936,17 @@ return self._newmanifest() # don't upset local cache if node in self._mancache: return self._mancache[node][0] - text = self.revision(node) if self._treeondisk: + def gettext(): + return self.revision(node) def readsubtree(dir, subm): return self.dirlog(dir).read(subm) m = self._newmanifest() - m.parse(text, readsubtree) + m.read(gettext, readsubtree) m.setnode(node) arraytext = None else: + text = self.revision(node) m = self._newmanifest(text) arraytext = array.array('c', text) self._mancache[node] = (m, arraytext) diff -r eafa06e9edde -r 0de132d5328a tests/test-treemanifest.t --- a/tests/test-treemanifest.t Mon May 18 21:31:40 2015 -0700 +++ b/tests/test-treemanifest.t Thu Apr 09 17:14:35 2015 -0700 @@ -78,9 +78,11 @@ $ rm before after Check that hg files (calls treemanifest.walk()) works +without loading all directory revlogs $ hg co 'desc("add dir2")' 2 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ mv .hg/store/meta/dir2 .hg/store/meta/dir2-backup $ hg files -r . dir1 dir1/a (glob) dir1/b (glob) @@ -90,12 +92,14 @@ dir1/dir2/b (glob) Check that status between revisions works (calls treemanifest.matches()) +without loading all directory revlogs $ hg status --rev 'desc("add dir1")' --rev . dir1 A dir1/dir1/a A dir1/dir1/b A dir1/dir2/a A dir1/dir2/b + $ mv .hg/store/meta/dir2-backup .hg/store/meta/dir2 Merge creates 2-parent revision of directory revlog