similar: use cheaper hash() function to test exact matches
authorYuya Nishihara <yuya@tcha.org>
Thu, 23 Mar 2017 20:57:27 +0900
changeset 31584 985a98c6bad0
parent 31583 2efd9771323e
child 31585 c6921568cd20
similar: use cheaper hash() function to test exact matches We just need a hash table {fctx.data(): fctx} which doesn't keep fctx.data() in memory. Let's simply use hash(fctx.data()) to put data out from memory, and manage collided fctx objects by list. This isn't significantly faster than using sha1, but is more correct as we know SHA-1 collision attack is getting practical. Benchmark with 50k added/removed files, on tmpfs: $ hg addremove --dry-run --time -q previous: real 12.420 secs (user 11.120+0.000 sys 1.280+0.000) this patch: real 12.350 secs (user 11.210+0.000 sys 1.140+0.000)
mercurial/similar.py
--- a/mercurial/similar.py	Thu Mar 23 20:52:41 2017 +0900
+++ b/mercurial/similar.py	Thu Mar 23 20:57:27 2017 +0900
@@ -7,8 +7,6 @@
 
 from __future__ import absolute_import
 
-import hashlib
-
 from .i18n import _
 from . import (
     bdiff,
@@ -23,25 +21,29 @@
     '''
     numfiles = len(added) + len(removed)
 
-    # Get hashes of removed files.
+    # Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
+    # We use hash() to discard fctx.data() from memory.
     hashes = {}
-    for i, fctx in enumerate(reversed(removed)):
+    for i, fctx in enumerate(removed):
         repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
                          unit=_('files'))
-        h = hashlib.sha1(fctx.data()).digest()
-        hashes[h] = fctx
+        h = hash(fctx.data())
+        if h not in hashes:
+            hashes[h] = [fctx]
+        else:
+            hashes[h].append(fctx)
 
     # For each added file, see if it corresponds to a removed file.
     for i, fctx in enumerate(added):
         repo.ui.progress(_('searching for exact renames'), i + len(removed),
                 total=numfiles, unit=_('files'))
         adata = fctx.data()
-        h = hashlib.sha1(adata).digest()
-        if h in hashes:
-            rfctx = hashes[h]
+        h = hash(adata)
+        for rfctx in hashes.get(h, []):
             # compare between actual file contents for exact identity
             if adata == rfctx.data():
                 yield (rfctx, fctx)
+                break
 
     # Done
     repo.ui.progress(_('searching for exact renames'), None)