mercurial/pvec.py
changeset 16249 0d175ac527c1
child 17424 e7cfe3587ea4
equal deleted inserted replaced
16248:51e6f318bdf1 16249:0d175ac527c1
       
     1 # pvec.py - probabilistic vector clocks for Mercurial
       
     2 #
       
     3 # Copyright 2012 Matt Mackall <mpm@selenic.com>
       
     4 #
       
     5 # This software may be used and distributed according to the terms of the
       
     6 # GNU General Public License version 2 or any later version.
       
     7 
       
     8 '''
       
     9 A "pvec" is a changeset property based on the theory of vector clocks
       
    10 that can be compared to discover relatedness without consulting a
       
    11 graph. This can be useful for tasks like determining how a
       
    12 disconnected patch relates to a repository.
       
    13 
       
    14 Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
       
    15 remainder are a bit vector. It is represented as a 70-character base85
       
    16 string.
       
    17 
       
    18 Construction:
       
    19 
       
    20 - a root changeset has a depth of 0 and a bit vector based on its hash
       
    21 - a normal commit has a changeset where depth is increased by one and
       
    22   one bit vector bit is flipped based on its hash
       
    23 - a merge changeset pvec is constructed by copying changes from one pvec into
       
    24   the other to balance its depth
       
    25 
       
    26 Properties:
       
    27 
       
    28 - for linear changes, difference in depth is always <= hamming distance
       
    29 - otherwise, changes are probably divergent
       
    30 - when hamming distance is < 200, we can reliably detect when pvecs are near
       
    31 
       
    32 Issues:
       
    33 
       
    34 - hamming distance ceases to work over distances of ~ 200
       
    35 - detecting divergence is less accurate when the common ancestor is very close
       
    36   to either revision or total distance is high
       
    37 - this could probably be improved by modeling the relation between
       
    38   delta and hdist
       
    39 
       
    40 Uses:
       
    41 
       
    42 - a patch pvec can be used to locate the nearest available common ancestor for
       
    43   resolving conflicts
       
    44 - ordering of patches can be established without a DAG
       
    45 - two head pvecs can be compared to determine whether push/pull/merge is needed
       
    46   and approximately how many changesets are involved
       
    47 - can be used to find a heuristic divergence measure between changesets on
       
    48   different branches
       
    49 '''
       
    50 
       
    51 import base85, util
       
    52 from node import nullrev
       
    53 
       
    54 _size = 448 # 70 chars b85-encoded
       
    55 _bytes = _size / 8
       
    56 _depthbits = 24
       
    57 _depthbytes = _depthbits / 8
       
    58 _vecbytes = _bytes - _depthbytes
       
    59 _vecbits = _vecbytes * 8
       
    60 _radius = (_vecbits - 30) / 2 # high probability vecs are related
       
    61 
       
    62 def _bin(bs):
       
    63     '''convert a bytestring to a long'''
       
    64     v = 0
       
    65     for b in bs:
       
    66         v = v * 256 + ord(b)
       
    67     return v
       
    68 
       
    69 def _str(v, l):
       
    70     bs = ""
       
    71     for p in xrange(l):
       
    72         bs = chr(v & 255) + bs
       
    73         v >>= 8
       
    74     return bs
       
    75 
       
    76 def _split(b):
       
    77     '''depth and bitvec'''
       
    78     return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])
       
    79 
       
    80 def _join(depth, bitvec):
       
    81     return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)
       
    82 
       
    83 def _hweight(x):
       
    84     c = 0
       
    85     while x:
       
    86         if x & 1:
       
    87             c += 1
       
    88         x >>= 1
       
    89     return c
       
    90 _htab = [_hweight(x) for x in xrange(256)]
       
    91 
       
    92 def _hamming(a, b):
       
    93     '''find the hamming distance between two longs'''
       
    94     d = a ^ b
       
    95     c = 0
       
    96     while d:
       
    97         c += _htab[d & 0xff]
       
    98         d >>= 8
       
    99     return c
       
   100 
       
   101 def _mergevec(x, y, c):
       
   102     # Ideally, this function would be x ^ y ^ ancestor, but finding
       
   103     # ancestors is a nuisance. So instead we find the minimal number
       
   104     # of changes to balance the depth and hamming distance
       
   105 
       
   106     d1, v1 = x
       
   107     d2, v2 = y
       
   108     if d1 < d2:
       
   109         d1, d2, v1, v2 = d2, d1, v2, v1
       
   110 
       
   111     hdist = _hamming(v1, v2)
       
   112     ddist = d1 - d2
       
   113     v = v1
       
   114     m = v1 ^ v2 # mask of different bits
       
   115     i = 1
       
   116 
       
   117     if hdist > ddist:
       
   118         # if delta = 10 and hdist = 100, then we need to go up 55 steps
       
   119         # to the ancestor and down 45
       
   120         changes = (hdist - ddist + 1) / 2
       
   121     else:
       
   122         # must make at least one change
       
   123         changes = 1
       
   124     depth = d1 + changes
       
   125 
       
   126     # copy changes from v2
       
   127     if m:
       
   128         while changes:
       
   129             if m & i:
       
   130                 v ^= i
       
   131                 changes -= 1
       
   132             i <<= 1
       
   133     else:
       
   134         v = _flipbit(v, c)
       
   135 
       
   136     return depth, v
       
   137 
       
   138 def _flipbit(v, node):
       
   139     # converting bit strings to longs is slow
       
   140     bit = (hash(node) & 0xffffffff) % _vecbits
       
   141     return v ^ (1<<bit)
       
   142 
       
   143 def ctxpvec(ctx):
       
   144     '''construct a pvec for ctx while filling in the cache'''
       
   145     r = ctx._repo
       
   146     if not util.safehasattr(r, "_pveccache"):
       
   147         r._pveccache = {}
       
   148     pvc = r._pveccache
       
   149     if ctx.rev() not in pvc:
       
   150         cl = r.changelog
       
   151         for n in xrange(ctx.rev() + 1):
       
   152             if n not in pvc:
       
   153                 node = cl.node(n)
       
   154                 p1, p2 = cl.parentrevs(n)
       
   155                 if p1 == nullrev:
       
   156                     # start with a 'random' vector at root
       
   157                     pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
       
   158                 elif p2 == nullrev:
       
   159                     d, v = pvc[p1]
       
   160                     pvc[n] = (d + 1, _flipbit(v, node))
       
   161                 else:
       
   162                     pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
       
   163     bs = _join(*pvc[ctx.rev()])
       
   164     return pvec(base85.b85encode(bs))
       
   165 
       
   166 class pvec(object):
       
   167     def __init__(self, hashorctx):
       
   168         if isinstance(hashorctx, str):
       
   169             self._bs = hashorctx
       
   170             self._depth, self._vec = _split(base85.b85decode(hashorctx))
       
   171         else:
       
   172             self._vec = ctxpvec(ctx)
       
   173 
       
   174     def __str__(self):
       
   175         return self._bs
       
   176 
       
   177     def __eq__(self, b):
       
   178         return self._vec == b._vec and self._depth == b._depth
       
   179 
       
   180     def __lt__(self, b):
       
   181         delta = b._depth - self._depth
       
   182         if delta < 0:
       
   183             return False # always correct
       
   184         if _hamming(self._vec, b._vec) > delta:
       
   185             return False
       
   186         return True
       
   187 
       
   188     def __gt__(self, b):
       
   189         return b < self
       
   190 
       
   191     def __or__(self, b):
       
   192         delta = abs(b._depth - self._depth)
       
   193         if _hamming(self._vec, b._vec) <= delta:
       
   194             return False
       
   195         return True
       
   196 
       
   197     def __sub__(self, b):
       
   198         if self | b:
       
   199             raise ValueError("concurrent pvecs")
       
   200         return self._depth - b._depth
       
   201 
       
   202     def distance(self, b):
       
   203         d = abs(b._depth - self._depth)
       
   204         h = _hamming(self._vec, b._vec)
       
   205         return max(d, h)
       
   206 
       
   207     def near(self, b):
       
   208         dist = abs(b.depth - self._depth)
       
   209         if dist > _radius or _hamming(self._vec, b._vec) > _radius:
       
   210             return False