treemanifest: lazily load manifests
authorMartin von Zweigbergk <martinvonz@google.com>
Thu, 09 Apr 2015 17:14:35 -0700
changeset 25222 0de132d5328a
parent 25221 eafa06e9edde
child 25223 29be0450b667
treemanifest: lazily load manifests Most operations on treemanifests already visit only relevant submanifests. Notable examples include __getitem__, __contains__, walk/matches with matcher, diff. By making submanifests lazily loaded, we speed up all these operations. The lazy loading is achieved by adding a _load() method that gets defined where we currently eagerly parse the manifest. We make sure to call it before any access to _dirs, _files or _flags. Some timings on the Mozilla repo (with flat manifest timings for reference): hg cat -r . README.txt: 1.644s -> 0.096s (0.255s) hg diff -r .^ -r . : 1.746s -> 0.137s (0.431s) hg files -r . python : 1.508s -> 0.146s (0.335s) hg files -r . : 2.125s -> 2.203s (0.712s)
mercurial/manifest.py
tests/test-treemanifest.t
--- a/mercurial/manifest.py	Mon May 18 21:31:40 2015 -0700
+++ b/mercurial/manifest.py	Thu Apr 09 17:14:35 2015 -0700
@@ -441,10 +441,13 @@
     else:
         return '', f
 
+_noop = lambda: None
+
 class treemanifest(object):
     def __init__(self, dir='', text=''):
         self._dir = dir
         self._node = revlog.nullid
+        self._load = _noop
         self._dirty = False
         self._dirs = {}
         # Using _lazymanifest here is a little slower than plain old dicts
@@ -461,18 +464,22 @@
         return self._dir + path
 
     def __len__(self):
+        self._load()
         size = len(self._files)
         for m in self._dirs.values():
             size += m.__len__()
         return size
 
     def _isempty(self):
+        self._load() # for consistency; already loaded by all callers
         return (not self._files and (not self._dirs or
                 all(m._isempty() for m in self._dirs.values())))
 
     def __str__(self):
-        return ('<treemanifest dir=%s, node=%s, dirty=%s>' %
-                (self._dir, revlog.hex(self._node), self._dirty))
+        return ('<treemanifest dir=%s, node=%s, loaded=%s, dirty=%s>' %
+                (self._dir, revlog.hex(self._node),
+                 bool(self._load is _noop),
+                 self._dirty))
 
     def dir(self):
         '''The directory that this tree manifest represents, including a
@@ -491,6 +498,7 @@
         self._dirty = False
 
     def iteritems(self):
+        self._load()
         for p, n in sorted(self._dirs.items() + self._files.items()):
             if p in self._files:
                 yield self._subpath(p), n
@@ -499,6 +507,7 @@
                     yield f, sn
 
     def iterkeys(self):
+        self._load()
         for p in sorted(self._dirs.keys() + self._files.keys()):
             if p in self._files:
                 yield self._subpath(p)
@@ -515,6 +524,7 @@
     def __contains__(self, f):
         if f is None:
             return False
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
@@ -524,6 +534,7 @@
             return f in self._files
 
     def get(self, f, default=None):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
@@ -533,6 +544,7 @@
             return self._files.get(f, default)
 
     def __getitem__(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             return self._dirs[dir].__getitem__(subpath)
@@ -540,6 +552,7 @@
             return self._files[f]
 
     def flags(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
@@ -551,6 +564,7 @@
             return self._flags.get(f, '')
 
     def find(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             return self._dirs[dir].find(subpath)
@@ -558,6 +572,7 @@
             return self._files[f], self._flags.get(f, '')
 
     def __delitem__(self, f):
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             self._dirs[dir].__delitem__(subpath)
@@ -572,6 +587,7 @@
 
     def __setitem__(self, f, n):
         assert n is not None
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
@@ -584,6 +600,7 @@
     def setflag(self, f, flags):
         """Set the flags (symlink, executable) for path f."""
         assert 'd' not in flags
+        self._load()
         dir, subpath = _splittopdir(f)
         if dir:
             if dir not in self._dirs:
@@ -597,10 +614,19 @@
         copy = treemanifest(self._dir)
         copy._node = self._node
         copy._dirty = self._dirty
-        for d in self._dirs:
-            copy._dirs[d] = self._dirs[d].copy()
-        copy._files = dict.copy(self._files)
-        copy._flags = dict.copy(self._flags)
+        def _load():
+            self._load()
+            for d in self._dirs:
+                copy._dirs[d] = self._dirs[d].copy()
+            copy._files = dict.copy(self._files)
+            copy._flags = dict.copy(self._flags)
+            copy._load = _noop
+        copy._load = _load
+        if self._load == _noop:
+            # Chaining _load if it's _noop is functionally correct, but the
+            # chain may end up excessively long (stack overflow), and
+            # will prevent garbage collection of 'self'.
+            copy._load()
         return copy
 
     def filesnotin(self, m2):
@@ -609,6 +635,8 @@
         def _filesnotin(t1, t2):
             if t1._node == t2._node and not t1._dirty and not t2._dirty:
                 return
+            t1._load()
+            t2._load()
             for d, m1 in t1._dirs.iteritems():
                 if d in t2._dirs:
                     m2 = t2._dirs[d]
@@ -631,6 +659,7 @@
         return self._alldirs
 
     def hasdir(self, dir):
+        self._load()
         topdir, subdir = _splittopdir(dir)
         if topdir:
             if topdir in self._dirs:
@@ -673,6 +702,7 @@
             return
 
         # yield this dir's files and walk its submanifests
+        self._load()
         for p in sorted(self._dirs.keys() + self._files.keys()):
             if p in self._files:
                 fullp = self._subpath(p)
@@ -697,6 +727,7 @@
         if not match.visitdir(self._dir[:-1] or '.'):
             return ret
 
+        self._load()
         for fn in self._files:
             fullp = self._subpath(fn)
             if not match(fullp):
@@ -734,6 +765,8 @@
         def _diff(t1, t2):
             if t1._node == t2._node and not t1._dirty and not t2._dirty:
                 return
+            t1._load()
+            t2._load()
             for d, m1 in t1._dirs.iteritems():
                 m2 = t2._dirs.get(d, emptytree)
                 _diff(m1, m2)
@@ -784,6 +817,7 @@
 
     def text(self, usemanifestv2=False):
         """Get the full data of this manifest as a bytestring."""
+        self._load()
         flags = self.flags
         return _text(((f, self[f], flags(f)) for f in self.keys()),
                      usemanifestv2)
@@ -792,12 +826,23 @@
         """Get the full data of this directory as a bytestring. Make sure that
         any submanifests have been written first, so their nodeids are correct.
         """
+        self._load()
         flags = self.flags
         dirs = [(d[:-1], self._dirs[d]._node, 'd') for d in self._dirs]
         files = [(f, self._files[f], flags(f)) for f in self._files]
         return _text(sorted(dirs + files), usemanifestv2)
 
+    def read(self, gettext, readsubtree):
+        def _load():
+            # Mark as loaded already here, so __setitem__ and setflag() don't
+            # cause infinite loops when they try to load.
+            self._load = _noop
+            self.parse(gettext(), readsubtree)
+            self._dirty = False
+        self._load = _load
+
     def writesubtrees(self, m1, m2, writesubtree):
+        self._load() # for consistency; should never have any effect here
         emptytree = treemanifest()
         for d, subm in self._dirs.iteritems():
             subp1 = m1._dirs.get(d, emptytree)._node
@@ -891,15 +936,17 @@
             return self._newmanifest() # don't upset local cache
         if node in self._mancache:
             return self._mancache[node][0]
-        text = self.revision(node)
         if self._treeondisk:
+            def gettext():
+                return self.revision(node)
             def readsubtree(dir, subm):
                 return self.dirlog(dir).read(subm)
             m = self._newmanifest()
-            m.parse(text, readsubtree)
+            m.read(gettext, readsubtree)
             m.setnode(node)
             arraytext = None
         else:
+            text = self.revision(node)
             m = self._newmanifest(text)
             arraytext = array.array('c', text)
         self._mancache[node] = (m, arraytext)
--- a/tests/test-treemanifest.t	Mon May 18 21:31:40 2015 -0700
+++ b/tests/test-treemanifest.t	Thu Apr 09 17:14:35 2015 -0700
@@ -78,9 +78,11 @@
   $ rm before after
 
 Check that hg files (calls treemanifest.walk()) works
+without loading all directory revlogs
 
   $ hg co 'desc("add dir2")'
   2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  $ mv .hg/store/meta/dir2 .hg/store/meta/dir2-backup
   $ hg files -r . dir1
   dir1/a (glob)
   dir1/b (glob)
@@ -90,12 +92,14 @@
   dir1/dir2/b (glob)
 
 Check that status between revisions works (calls treemanifest.matches())
+without loading all directory revlogs
 
   $ hg status --rev 'desc("add dir1")' --rev . dir1
   A dir1/dir1/a
   A dir1/dir1/b
   A dir1/dir2/a
   A dir1/dir2/b
+  $ mv .hg/store/meta/dir2-backup .hg/store/meta/dir2
 
 Merge creates 2-parent revision of directory revlog