changegroup: avoid iterating the whole manifest
author Augie Fackler <augie@google.com>
Fri, 04 Dec 2015 10:34:58 -0500
changeset 27237 c08814b48ae5
parent 27236 b0d90fef16b6
child 27238 c3dc03109401
changegroup: avoid iterating the whole manifest

The old code gathered the list of all files that changed anywhere in history, and then walked the entirety of each manifest to be sent in order to collect the changed file nodes. That's going to be unfortunate for narrowhg, and it's already inefficient for medium-to-large repositories. The new code instead records, for each manifest node, the set of files touched by the changesets that reference that manifest, and looks up only those files in the manifest (skipping any that are absent from it).

Timings for bundle --all on my hg repo, tested with hgperf:

Before:
! wall 23.442445 comb 23.440000 user 23.250000 sys 0.190000 (best of 3)

After:
! wall 20.272187 comb 20.270000 user 20.190000 sys 0.080000 (best of 3)
mercurial/changegroup.py
--- a/mercurial/changegroup.py	Fri Dec 04 15:59:46 2015 -0500
+++ b/mercurial/changegroup.py	Fri Dec 04 10:34:58 2015 -0500
@@ -613,7 +613,8 @@
         clrevorder = {}
         mfs = {} # needed manifests
         fnodes = {} # needed file nodes
-        changedfiles = set()
+        # maps manifest node id -> set(changed files)
+        mfchangedfiles = {}
 
         # Callback for the changelog, used to collect changed files and manifest
         # nodes.
@@ -621,9 +622,12 @@
         def lookupcl(x):
             c = cl.read(x)
             clrevorder[x] = len(clrevorder)
-            changedfiles.update(c[3])
+            n = c[0]
             # record the first changeset introducing this manifest version
-            mfs.setdefault(c[0], x)
+            mfs.setdefault(n, x)
+            # Record a complete list of potentially-changed files in
+            # this manifest.
+            mfchangedfiles.setdefault(n, set()).update(c[3])
             return x
 
         self._verbosenote(_('uncompressed size of bundle content:\n'))
@@ -668,8 +672,12 @@
             clnode = mfs[x]
             if not fastpathlinkrev:
                 mdata = ml.readfast(x)
-                for f, n in mdata.iteritems():
-                    if f in changedfiles:
+                for f in mfchangedfiles[x]:
+                    if True:
+                        try:
+                            n = mdata[f]
+                        except KeyError:
+                            continue
                         # record the first changeset introducing this filelog
                         # version
                         fclnodes = fnodes.setdefault(f, {})
@@ -696,6 +704,9 @@
                 return dict(genfilenodes())
             return fnodes.get(fname, {})
 
+        changedfiles = set()
+        for x in mfchangedfiles.itervalues():
+            changedfiles.update(x)
         for chunk in self.generatefiles(changedfiles, linknodes, commonrevs,
                                         source):
             yield chunk