mercurial/similar.py
changeset 11060 e6df01776e08
parent 11059 ef4aa90b1e58
child 11085 0c8646292ca4
--- a/mercurial/similar.py	Sat Apr 03 11:58:16 2010 +1100
+++ b/mercurial/similar.py	Sat Apr 03 11:58:16 2010 +1100
@@ -10,29 +10,49 @@
 import mdiff
 import bdiff
 
-def findrenames(repo, added, removed, threshold):
-    '''find renamed files -- yields (before, after, score) tuples'''
+def _findexactmatches(repo, added, removed):
+    '''find renamed files that have no changes
+
+    Takes a list of new filectxs and a list of removed filectxs, and yields
+    (before, after) tuples of exact matches.
+    '''
+    numfiles = len(added) + len(removed)
+
+    # Get hashes of removed files.
+    hashes = {}
+    for i, fctx in enumerate(removed):
+        repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
+        h = util.sha1(fctx.data()).digest()
+        hashes[h] = fctx
+
+    # For each added file, see if it corresponds to a removed file.
+    for i, fctx in enumerate(added):
+        repo.ui.progress(_('searching for exact renames'), i + len(removed),
+                total=numfiles)
+        h = util.sha1(fctx.data()).digest()
+        if h in hashes:
+            yield (hashes[h], fctx)
+
+    # Done
+    repo.ui.progress(_('searching for exact renames'), None)
+
+def _findsimilarmatches(repo, added, removed, threshold):
+    '''find potentially renamed files based on similar file content
+
+    Takes a list of new filectxs and a list of removed filectxs, and yields
+    (before, after, score) tuples of partial matches.
+    '''
     copies = {}
-    ctx = repo['.']
     for i, r in enumerate(removed):
-        repo.ui.progress(_('searching'), i, total=len(removed))
-        if r not in ctx:
-            continue
-        fctx = ctx.filectx(r)
+        repo.ui.progress(_('searching for similar files'), i, total=len(removed))
 
         # lazily load text
         @util.cachefunc
         def data():
-            orig = fctx.data()
+            orig = r.data()
             return orig, mdiff.splitnewlines(orig)
 
         def score(text):
-            if not len(text):
-                return 0.0
-            if not fctx.cmp(text):
-                return 1.0
-            if threshold == 1.0:
-                return 0.0
             orig, lines = data()
             # bdiff.blocks() returns blocks of matching lines
             # count the number of bytes in each
@@ -47,7 +67,7 @@
 
         for a in added:
             bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(repo.wread(a))
+            myscore = score(a.data())
             if myscore >= bestscore:
                 copies[a] = (r, myscore)
     repo.ui.progress(_('searching'), None)
@@ -56,4 +76,28 @@
         source, score = v
         yield source, dest, score
 
+def findrenames(repo, added, removed, threshold):
+    '''find renamed files -- yields (before, after, score) tuples'''
+    parentctx = repo['.']
+    workingctx = repo[None]
 
+    # Zero length files will be frequently unrelated to each other, and
+    # tracking the deletion/addition of such a file will probably cause more
+    # harm than good. We strip them out here to avoid matching them later on.
+    addedfiles = set([workingctx[fp] for fp in added
+            if workingctx[fp].size() > 0])
+    removedfiles = set([parentctx[fp] for fp in removed
+            if fp in parentctx and parentctx[fp].size() > 0])
+
+    # Find exact matches.
+    for (a, b) in _findexactmatches(repo,
+            sorted(addedfiles),sorted( removedfiles)):
+        addedfiles.remove(b)
+        yield (a.path(), b.path(), 1.0)
+
+    # If the user requested similar files to be matched, search for them also.
+    if threshold < 1.0:
+        for (a, b, score) in _findsimilarmatches(repo,
+                sorted(addedfiles), sorted(removedfiles), threshold):
+            yield (a.path(), b.path(), score)
+