mercurial/similar.py
changeset 11060 e6df01776e08
parent 11059 ef4aa90b1e58
child 11085 0c8646292ca4
equal deleted inserted replaced
11059:ef4aa90b1e58 11060:e6df01776e08
     8 from i18n import _
     8 from i18n import _
     9 import util
     9 import util
    10 import mdiff
    10 import mdiff
    11 import bdiff
    11 import bdiff
    12 
    12 
    13 def findrenames(repo, added, removed, threshold):
    13 def _findexactmatches(repo, added, removed):
    14     '''find renamed files -- yields (before, after, score) tuples'''
    14     '''find renamed files that have no changes
       
    15 
       
    16     Takes a list of new filectxs and a list of removed filectxs, and yields
       
    17     (before, after) tuples of exact matches.
       
    18     '''
       
    19     numfiles = len(added) + len(removed)
       
    20 
       
    21     # Get hashes of removed files.
       
    22     hashes = {}
       
    23     for i, fctx in enumerate(removed):
       
    24         repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
       
    25         h = util.sha1(fctx.data()).digest()
       
    26         hashes[h] = fctx
       
    27 
       
    28     # For each added file, see if it corresponds to a removed file.
       
    29     for i, fctx in enumerate(added):
       
    30         repo.ui.progress(_('searching for exact renames'), i + len(removed),
       
    31                 total=numfiles)
       
    32         h = util.sha1(fctx.data()).digest()
       
    33         if h in hashes:
       
    34             yield (hashes[h], fctx)
       
    35 
       
    36     # Done
       
    37     repo.ui.progress(_('searching for exact renames'), None)
       
    38 
       
    39 def _findsimilarmatches(repo, added, removed, threshold):
       
    40     '''find potentially renamed files based on similar file content
       
    41 
       
    42     Takes a list of new filectxs and a list of removed filectxs, and yields
       
    43     (before, after, score) tuples of partial matches.
       
    44     '''
    15     copies = {}
    45     copies = {}
    16     ctx = repo['.']
       
    17     for i, r in enumerate(removed):
    46     for i, r in enumerate(removed):
    18         repo.ui.progress(_('searching'), i, total=len(removed))
    47         repo.ui.progress(_('searching for similar files'), i, total=len(removed))
    19         if r not in ctx:
       
    20             continue
       
    21         fctx = ctx.filectx(r)
       
    22 
    48 
    23         # lazily load text
    49         # lazily load text
    24         @util.cachefunc
    50         @util.cachefunc
    25         def data():
    51         def data():
    26             orig = fctx.data()
    52             orig = r.data()
    27             return orig, mdiff.splitnewlines(orig)
    53             return orig, mdiff.splitnewlines(orig)
    28 
    54 
    29         def score(text):
    55         def score(text):
    30             if not len(text):
       
    31                 return 0.0
       
    32             if not fctx.cmp(text):
       
    33                 return 1.0
       
    34             if threshold == 1.0:
       
    35                 return 0.0
       
    36             orig, lines = data()
    56             orig, lines = data()
    37             # bdiff.blocks() returns blocks of matching lines
    57             # bdiff.blocks() returns blocks of matching lines
    38             # count the number of bytes in each
    58             # count the number of bytes in each
    39             equal = 0
    59             equal = 0
    40             matches = bdiff.blocks(text, orig)
    60             matches = bdiff.blocks(text, orig)
    45             lengths = len(text) + len(orig)
    65             lengths = len(text) + len(orig)
    46             return equal * 2.0 / lengths
    66             return equal * 2.0 / lengths
    47 
    67 
    48         for a in added:
    68         for a in added:
    49             bestscore = copies.get(a, (None, threshold))[1]
    69             bestscore = copies.get(a, (None, threshold))[1]
    50             myscore = score(repo.wread(a))
    70             myscore = score(a.data())
    51             if myscore >= bestscore:
    71             if myscore >= bestscore:
    52                 copies[a] = (r, myscore)
    72                 copies[a] = (r, myscore)
    53     repo.ui.progress(_('searching'), None)
    73     repo.ui.progress(_('searching'), None)
    54 
    74 
    55     for dest, v in copies.iteritems():
    75     for dest, v in copies.iteritems():
    56         source, score = v
    76         source, score = v
    57         yield source, dest, score
    77         yield source, dest, score
    58 
    78 
       
    79 def findrenames(repo, added, removed, threshold):
       
    80     '''find renamed files -- yields (before, after, score) tuples'''
       
    81     parentctx = repo['.']
       
    82     workingctx = repo[None]
    59 
    83 
       
    84     # Zero length files will be frequently unrelated to each other, and
       
    85     # tracking the deletion/addition of such a file will probably cause more
       
    86     # harm than good. We strip them out here to avoid matching them later on.
       
    87     addedfiles = set([workingctx[fp] for fp in added
       
    88             if workingctx[fp].size() > 0])
       
    89     removedfiles = set([parentctx[fp] for fp in removed
       
    90             if fp in parentctx and parentctx[fp].size() > 0])
       
    91 
       
    92     # Find exact matches.
       
    93     for (a, b) in _findexactmatches(repo,
       
    94             sorted(addedfiles),sorted( removedfiles)):
       
    95         addedfiles.remove(b)
       
    96         yield (a.path(), b.path(), 1.0)
       
    97 
       
    98     # If the user requested similar files to be matched, search for them also.
       
    99     if threshold < 1.0:
       
   100         for (a, b, score) in _findsimilarmatches(repo,
       
   101                 sorted(addedfiles), sorted(removedfiles), threshold):
       
   102             yield (a.path(), b.path(), score)
       
   103