mercurial: comparison mercurial/similar.py

equal deleted inserted replaced

-:ef4aa90b1e58
+:e6df01776e08
 from i18n import _
 import util
 import mdiff
 import bdiff
def _findexactmatches(repo, added, removed):
    '''find renamed files whose content did not change at all

    ``added`` is a list of new filectxs and ``removed`` a list of removed
    filectxs.  Yields (before, after) tuples, where ``before`` is a removed
    filectx and ``after`` an added filectx whose data has the same SHA-1
    digest.
    '''
    topic = _('searching for exact renames')
    removedcount = len(removed)
    total = len(added) + removedcount

    # Index the removed files by the digest of their content.  If several
    # removed files share a digest, the one seen last is kept.
    bydigest = {}
    for pos, oldfctx in enumerate(removed):
        repo.ui.progress(topic, pos, total=total)
        bydigest[util.sha1(oldfctx.data()).digest()] = oldfctx

    # An added file whose digest is already indexed is an exact match.
    # Progress positions continue where the first loop stopped.
    for pos, newfctx in enumerate(added):
        repo.ui.progress(topic, removedcount + pos, total=total)
        digest = util.sha1(newfctx.data()).digest()
        if digest in bydigest:
            yield (bydigest[digest], newfctx)

    # Done
    repo.ui.progress(topic, None)
def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.

    The score of a pair is twice the number of bytes on the lines that
    bdiff reports as matching, divided by the combined length of both
    files.  For each added file only one removed file is reported: the one
    with the highest score that is at least ``threshold`` (on ties the
    removed file seen last wins).
    '''
    # Use a single topic for every progress call so that the final call
    # closes the same topic the loop has been updating.
    topic = _('searching for similar files')
    total = len(removed)

    # maps added filectx -> (best removed filectx so far, its score)
    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(topic, i, total=total)

        # lazily load text
        @util.cachefunc
        def data():
            orig = r.data()
            return orig, mdiff.splitnewlines(orig)

        def score(text):
            orig, lines = data()
            # bdiff.blocks() returns blocks of matching lines
            # count the number of bytes in each
            equal = 0
            matches = bdiff.blocks(text, orig)
            # y1:y2 is the matching line range in the second argument
            # (orig), which is the file whose split lines we hold.
            for x1, x2, y1, y2 in matches:
                for line in lines[y1:y2]:
                    equal += len(line)

            lengths = len(text) + len(orig)
            return equal * 2.0 / lengths

        for a in added:
            bestscore = copies.get(a, (None, threshold))[1]
            myscore = score(a.data())
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(topic, None)

    # Named matchscore so it does not shadow the score() helper above.
    for dest, v in copies.iteritems():
        source, matchscore = v
        yield source, dest, matchscore
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    ``added`` names are looked up in repo[None] and ``removed`` names in
    repo['.'].  The yielded ``before`` and ``after`` are file paths.  Exact
    matches are reported first with a score of 1.0; similar (partial)
    matches are searched for only when ``threshold`` is below 1.0.
    '''
    parentctx = repo['.']
    workingctx = repo[None]

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    addedfiles = set()
    for fp in added:
        fctx = workingctx[fp]
        if fctx.size() > 0:
            addedfiles.add(fctx)

    removedfiles = set()
    for fp in removed:
        # names that are not present in the parent context are skipped
        if fp not in parentctx:
            continue
        fctx = parentctx[fp]
        if fctx.size() > 0:
            removedfiles.add(fctx)

    # Find exact matches.  An added file that matched exactly is dropped
    # from the candidate set so the similarity pass will not see it again.
    exact = _findexactmatches(repo, sorted(addedfiles), sorted(removedfiles))
    for before, after in exact:
        addedfiles.remove(after)
        yield (before.path(), after.path(), 1.0)

    # If the user requested similar files to be matched, search for them also.
    if threshold < 1.0:
        similar = _findsimilarmatches(repo, sorted(addedfiles),
                                      sorted(removedfiles), threshold)
        for before, after, score in similar:
            yield (before.path(), after.path(), score)

changeset 11060	e6df01776e08
parent 11059	ef4aa90b1e58
child 11085	0c8646292ca4