discovery: add new set-based discovery
author: Peter Arrenbrecht <peter.arrenbrecht@gmail.com>
Mon, 02 May 2011 19:21:30 +0200
changeset 14164 cb98fed52495
parent 14163 38184a72d793
child 14165 78bdfc756908
discovery: add new set-based discovery Adds a new discovery method based on repeatedly sampling the still undecided subset of the local node graph to determine the set of nodes common to both the client and the server. For small differences between client and server, it uses about the same or slightly fewer roundtrips than the old tree-based discovery. For larger differences, it typically reduces the number of roundtrips drastically (from 150 to 4, for instance). The old discovery code now lives in treediscovery.py, the new code is in setdiscovery.py. Still missing is a hook for extensions to contribute nodes to the initial sample. For instance, Augie's remotebranches could contribute the last known state of the server's heads. Credits for the actual sampler and computing common heads instead of bases go to Benoit Boissinot.
mercurial/commands.py
mercurial/dagutil.py
mercurial/discovery.py
mercurial/revlog.py
mercurial/setdiscovery.py
mercurial/treediscovery.py
tests/test-acl.t
tests/test-bookmarks-pushpull.t
tests/test-bundle.t
tests/test-debugcomplete.t
tests/test-hook.t
tests/test-push-warn.t
tests/test-schemes.t
tests/test-setdiscovery.t
tests/test-ssh.t
--- a/mercurial/commands.py	Mon May 02 19:21:30 2011 +0200
+++ b/mercurial/commands.py	Mon May 02 19:21:30 2011 +0200
@@ -15,6 +15,7 @@
 import merge as mergemod
 import minirst, revset, templatefilters
 import dagparser, context, simplemerge
+import random, setdiscovery, treediscovery, dagutil
 
 # Commands start here, listed alphabetically
 
@@ -1471,6 +1472,65 @@
     else:
         raise util.Abort(_("no ignore patterns found"))
 
+def debugdiscovery(ui, repo, remoteurl="default", **opts):
+    """runs the changeset discovery protocol in isolation"""
+    remoteurl, branches = hg.parseurl(ui.expandpath(remoteurl), opts.get('branch'))
+    remote = hg.repository(hg.remoteui(repo, opts), remoteurl)
+    ui.status(_('comparing with %s\n') % util.hidepassword(remoteurl))
+
+    # make sure tests are repeatable
+    random.seed(12323)
+
+    def doit(localheads, remoteheads):
+        if opts.get('old'):
+            if localheads:
+                raise util.Abort('cannot use localheads with old style discovery')
+            common, _in, hds = treediscovery.findcommonincoming(repo, remote,
+                                                                force=True)
+            common = set(common)
+            if not opts.get('nonheads'):
+                ui.write("unpruned common: %s\n" % " ".join([short(n)
+                                                            for n in common]))
+                dag = dagutil.revlogdag(repo.changelog)
+                all = dag.ancestorset(dag.internalizeall(common))
+                common = dag.externalizeall(dag.headsetofconnecteds(all))
+        else:
+            common, any, hds = setdiscovery.findcommonheads(ui, repo, remote)
+        common = set(common)
+        rheads = set(hds)
+        lheads = set(repo.heads())
+        ui.write("common heads: %s\n" % " ".join([short(n) for n in common]))
+        if lheads <= common:
+            ui.write("local is subset\n")
+        elif rheads <= common:
+            ui.write("remote is subset\n")
+
+    serverlogs = opts.get('serverlog')
+    if serverlogs:
+        for filename in serverlogs:
+            logfile = open(filename, 'r')
+            try:
+                line = logfile.readline()
+                while line:
+                    parts = line.strip().split(';')
+                    op = parts[1]
+                    if op == 'cg':
+                        pass
+                    elif op == 'cgss':
+                        doit(parts[2].split(' '), parts[3].split(' '))
+                    elif op == 'unb':
+                        doit(parts[3].split(' '), parts[2].split(' '))
+                    line = logfile.readline()
+            finally:
+                logfile.close()
+
+    else:
+        remoterevs, _checkout = hg.addbranchrevs(repo, remote, branches,
+                                                 opts.get('remote_head'))
+        localrevs = opts.get('local_head')
+        doit(localrevs, remoterevs)
+
+
 def debugindex(ui, repo, file_, **opts):
     """dump the contents of an index file"""
     r = None
@@ -4513,6 +4573,14 @@
          [('e', 'extended', None, _('try extended date formats'))],
          _('[-e] DATE [RANGE]')),
     "debugdata": (debugdata, [], _('FILE REV')),
+    "debugdiscovery": (debugdiscovery,
+         [('', 'old', None,
+           _('use old-style discovery')),
+          ('', 'nonheads', None,
+           _('use old-style discovery with non-heads included')),
+         ] + remoteopts,
+         _('[-l REV] [-r REV] [-b BRANCH]...'
+           ' [OTHER]')),
     "debugfsinfo": (debugfsinfo, [], _('[PATH]')),
     "debuggetbundle":
         (debuggetbundle,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/dagutil.py	Mon May 02 19:21:30 2011 +0200
@@ -0,0 +1,242 @@
+# dagutil.py - dag utilities for mercurial
+#
+# Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
+# and Peter Arrenbrecht <peter@arrenbrecht.ch>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from node import nullrev
+
+
+class basedag(object):
+    '''generic interface for DAGs
+
+    terms:
+    "ix" (short for index) identifies a nodes internally,
+    "id" identifies one externally.
+
+    All params are ixs unless explicitly suffixed otherwise.
+    Pluralized params are lists or sets.
+    '''
+
+    def __init__(self):
+        self._inverse = None
+
+    def nodeset(self):
+        '''set of all node idxs'''
+        raise NotImplementedError()
+
+    def heads(self):
+        '''list of head ixs'''
+        raise NotImplementedError()
+
+    def parents(self, ix):
+        '''list of parents ixs of ix'''
+        raise NotImplementedError()
+
+    def inverse(self):
+        '''inverse DAG, where parents becomes children, etc.'''
+        raise NotImplementedError()
+
+    def ancestorset(self, starts, stops=None):
+        '''set of all ancestors of starts (incl), but stop walk at stops (excl)'''
+        raise NotImplementedError()
+
+    def descendantset(self, starts, stops=None):
+        '''set of all descendants of starts (incl), but stop walk at stops (excl)'''
+        return self.inverse().ancestorset(starts, stops)
+
+    def headsetofconnecteds(self, ixs):
+        '''subset of connected list of ixs so that no node has a descendant in it
+
+        By "connected list" we mean that if an ancestor and a descendant are in
+        the list, then so is at least one path connecting them.'''
+        raise NotImplementedError()
+
+    def externalize(self, ix):
+        '''return the node id for the given ix'''
+        return self._externalize(ix)
+
+    def externalizeall(self, ixs):
+        '''return a list of (or set if given a set) of node ids'''
+        ids = self._externalizeall(ixs)
+        if isinstance(ixs, set):
+            return set(ids)
+        return list(ids)
+
+    def internalize(self, id):
+        '''return the node ix for the given id'''
+        return self._internalize(id)
+
+    def internalizeall(self, ids, filterunknown=False):
+        '''return a list of (or set if given a set) of node ixs'''
+        ixs = self._internalizeall(ids, filterunknown)
+        if isinstance(ids, set):
+            return set(ixs)
+        return list(ixs)
+
+
+class genericdag(basedag):
+    '''generic implementations for DAGs'''
+
+    def ancestorset(self, starts, stops=None):
+        stops = stops and set(stops) or set()
+        seen = set()
+        pending = list(starts)
+        while pending:
+            n = pending.pop()
+            if n not in seen and n not in stops:
+                seen.add(n)
+                pending.extend(self.parents(n))
+        return seen
+
+    def headsetofconnecteds(self, ixs):
+        hds = set(ixs)
+        if not hds:
+            return hds
+        for n in ixs:
+            for p in self.parents(n):
+                hds.discard(p)
+        assert hds
+        return hds
+
+
+class revlogbaseddag(basedag):
+    '''generic dag interface to a revlog'''
+
+    def __init__(self, revlog, nodeset):
+        basedag.__init__(self)
+        self._revlog = revlog
+        self._heads = None
+        self._nodeset = nodeset
+
+    def nodeset(self):
+        return self._nodeset
+
+    def heads(self):
+        if self._heads is None:
+            self._heads = self._getheads()
+        return self._heads
+
+    def _externalize(self, ix):
+        return self._revlog.index[ix][7]
+    def _externalizeall(self, ixs):
+        idx = self._revlog.index
+        return [idx[i][7] for i in ixs]
+
+    def _internalize(self, id):
+        ix = self._revlog.rev(id)
+        if ix == nullrev:
+            raise LookupError(id, self._revlog.indexfile, _('nullid'))
+        return ix
+    def _internalizeall(self, ids, filterunknown):
+        rl = self._revlog
+        if filterunknown:
+            return [r for r in map(rl.nodemap.get, ids)
+                    if r is not None and r != nullrev]
+        return map(self._internalize, ids)
+
+
+class revlogdag(revlogbaseddag):
+    '''dag interface to a revlog'''
+
+    def __init__(self, revlog):
+        revlogbaseddag.__init__(self, revlog, set(xrange(len(revlog))))
+
+    def _getheads(self):
+        return [r for r in self._revlog.headrevs() if r != nullrev]
+
+    def parents(self, ix):
+        rlog = self._revlog
+        idx = rlog.index
+        revdata = idx[ix]
+        prev = revdata[5]
+        if prev != nullrev:
+            prev2 = revdata[6]
+            if prev2 == nullrev:
+                return [prev]
+            return [prev, prev2]
+        prev2 = revdata[6]
+        if prev2 != nullrev:
+            return [prev2]
+        return []
+
+    def inverse(self):
+        if self._inverse is None:
+            self._inverse = inverserevlogdag(self)
+        return self._inverse
+
+    def ancestorset(self, starts, stops=None):
+        rlog = self._revlog
+        idx = rlog.index
+        stops = stops and set(stops) or set()
+        seen = set()
+        pending = list(starts)
+        while pending:
+            rev = pending.pop()
+            if rev not in seen and rev not in stops:
+                seen.add(rev)
+                revdata = idx[rev]
+                for i in [5, 6]:
+                    prev = revdata[i]
+                    if prev != nullrev:
+                        pending.append(prev)
+        return seen
+
+    def headsetofconnecteds(self, ixs):
+        if not ixs:
+            return set()
+        rlog = self._revlog
+        idx = rlog.index
+        headrevs = set(ixs)
+        for rev in ixs:
+            revdata = idx[rev]
+            for i in [5, 6]:
+                prev = revdata[i]
+                if prev != nullrev:
+                    headrevs.discard(prev)
+        assert headrevs
+        return headrevs
+
+
+class inverserevlogdag(revlogbaseddag, genericdag):
+    '''inverse of an existing revlog dag; see revlogdag.inverse()'''
+
+    def __init__(self, orig):
+        revlogbaseddag.__init__(self, orig._revlog, orig._nodeset)
+        self._orig = orig
+        self._children = {}
+        self._roots = []
+        self._walkfrom = len(self._revlog) - 1
+
+    def _walkto(self, walkto):
+        rev = self._walkfrom
+        cs = self._children
+        roots = self._roots
+        idx = self._revlog.index
+        while rev >= walkto:
+            data = idx[rev]
+            isroot = True
+            for prev in [data[5], data[6]]: # parent revs
+                if prev != nullrev:
+                    cs.setdefault(prev, []).append(rev)
+                    isroot = False
+            if isroot:
+                roots.append(rev)
+            rev -= 1
+        self._walkfrom = rev - 1
+
+    def _getheads(self):
+        self._walkto(nullrev)
+        return self._roots
+
+    def parents(self, ix):
+        if ix is None:
+            return []
+        if ix <= self._walkfrom:
+            self._walkto(ix)
+        return self._children.get(ix, [])
+
+    def inverse(self):
+        return self._orig
--- a/mercurial/discovery.py	Mon May 02 19:21:30 2011 +0200
+++ b/mercurial/discovery.py	Mon May 02 19:21:30 2011 +0200
@@ -7,7 +7,7 @@
 
 from node import nullid, short
 from i18n import _
-import util, error
+import util, error, setdiscovery, treediscovery
 
 def findcommonincoming(repo, remote, heads=None, force=False):
     """Return a tuple (common, anyincoming, heads) used to identify the common
@@ -20,145 +20,28 @@
       changegroupsubset. No code except for pull should be relying on this fact
       any longer.
     "heads" is either the supplied heads, or else the remote's heads.
+
+    If you pass heads and they are all known locally, the response lists just
+    these heads in "common" and in "heads".
     """
 
-    m = repo.changelog.nodemap
-    search = []
-    fetch = set()
-    seen = set()
-    seenbranch = set()
-    base = set()
-
-    if not heads:
-        heads = remote.heads()
-
-    if repo.changelog.tip() == nullid:
-        base.add(nullid)
-        if heads != [nullid]:
-            return [nullid], [nullid], list(heads)
-        return [nullid], [], []
-
-    # assume we're closer to the tip than the root
-    # and start by examining the heads
-    repo.ui.status(_("searching for changes\n"))
-
-    if remote.capable('getbundle'):
-        myheads = repo.heads()
-        known = remote.known(myheads)
-        if util.all(known):
-            hasincoming = set(heads).difference(set(myheads)) and True
-            return myheads, hasincoming, heads
-
-    unknown = []
-    for h in heads:
-        if h not in m:
-            unknown.append(h)
-        else:
-            base.add(h)
-
-    heads = unknown
-    if not unknown:
-        return list(base), [], []
-
-    req = set(unknown)
-    reqcnt = 0
-
-    # search through remote branches
-    # a 'branch' here is a linear segment of history, with four parts:
-    # head, root, first parent, second parent
-    # (a branch always has two parents (or none) by definition)
-    unknown = remote.branches(unknown)
-    while unknown:
-        r = []
-        while unknown:
-            n = unknown.pop(0)
-            if n[0] in seen:
-                continue
+    if not remote.capable('getbundle'):
+        return treediscovery.findcommonincoming(repo, remote, heads, force)
 
-            repo.ui.debug("examining %s:%s\n"
-                          % (short(n[0]), short(n[1])))
-            if n[0] == nullid: # found the end of the branch
-                pass
-            elif n in seenbranch:
-                repo.ui.debug("branch already found\n")
-                continue
-            elif n[1] and n[1] in m: # do we know the base?
-                repo.ui.debug("found incomplete branch %s:%s\n"
-                              % (short(n[0]), short(n[1])))
-                search.append(n[0:2]) # schedule branch range for scanning
-                seenbranch.add(n)
-            else:
-                if n[1] not in seen and n[1] not in fetch:
-                    if n[2] in m and n[3] in m:
-                        repo.ui.debug("found new changeset %s\n" %
-                                      short(n[1]))
-                        fetch.add(n[1]) # earliest unknown
-                    for p in n[2:4]:
-                        if p in m:
-                            base.add(p) # latest known
-
-                for p in n[2:4]:
-                    if p not in req and p not in m:
-                        r.append(p)
-                        req.add(p)
-            seen.add(n[0])
-
-        if r:
-            reqcnt += 1
-            repo.ui.progress(_('searching'), reqcnt, unit=_('queries'))
-            repo.ui.debug("request %d: %s\n" %
-                        (reqcnt, " ".join(map(short, r))))
-            for p in xrange(0, len(r), 10):
-                for b in remote.branches(r[p:p + 10]):
-                    repo.ui.debug("received %s:%s\n" %
-                                  (short(b[0]), short(b[1])))
-                    unknown.append(b)
+    if heads:
+        allknown = True
+        nm = repo.changelog.nodemap
+        for h in heads:
+            if nm.get(h) is None:
+                allknown = False
+                break
+        if allknown:
+            return (heads, False, heads)
 
-    # do binary search on the branches we found
-    while search:
-        newsearch = []
-        reqcnt += 1
-        repo.ui.progress(_('searching'), reqcnt, unit=_('queries'))
-        for n, l in zip(search, remote.between(search)):
-            l.append(n[1])
-            p = n[0]
-            f = 1
-            for i in l:
-                repo.ui.debug("narrowing %d:%d %s\n" % (f, len(l), short(i)))
-                if i in m:
-                    if f <= 2:
-                        repo.ui.debug("found new branch changeset %s\n" %
-                                          short(p))
-                        fetch.add(p)
-                        base.add(i)
-                    else:
-                        repo.ui.debug("narrowed branch search to %s:%s\n"
-                                      % (short(p), short(i)))
-                        newsearch.append((p, i))
-                    break
-                p, f = i, f * 2
-            search = newsearch
-
-    # sanity check our fetch list
-    for f in fetch:
-        if f in m:
-            raise error.RepoError(_("already have changeset ")
-                                  + short(f[:4]))
-
-    base = list(base)
-    if base == [nullid]:
-        if force:
-            repo.ui.warn(_("warning: repository is unrelated\n"))
-        else:
-            raise util.Abort(_("repository is unrelated"))
-
-    repo.ui.debug("found new changesets starting at " +
-                 " ".join([short(f) for f in fetch]) + "\n")
-
-    repo.ui.progress(_('searching'), None)
-    repo.ui.debug("%d total queries\n" % reqcnt)
-
-    return base, list(fetch), heads
+    res = setdiscovery.findcommonheads(repo.ui, repo, remote,
+                                       abortwhenunrelated=not force)
+    common, anyinc, srvheads = res
+    return (list(common), anyinc, heads or list(srvheads))
 
 def prepush(repo, remote, force, revs, newbranch):
     '''Analyze the local and remote repositories and determine which
@@ -174,9 +57,7 @@
     changegroup is a readable file-like object whose read() returns
     successive changegroup chunks ready to be sent over the wire and
     remoteheads is the list of remote heads.'''
-    remoteheads = remote.heads()
-    common, inc, _rheads = findcommonincoming(repo, remote, heads=remoteheads,
-                                              force=force)
+    common, inc, remoteheads = findcommonincoming(repo, remote, force=force)
 
     cl = repo.changelog
     outg = cl.findmissing(common, revs)
--- a/mercurial/revlog.py	Mon May 02 19:21:30 2011 +0200
+++ b/mercurial/revlog.py	Mon May 02 19:21:30 2011 +0200
@@ -617,6 +617,17 @@
         assert heads
         return (orderedout, roots, heads)
 
+    def headrevs(self):
+        count = len(self)
+        if not count:
+            return [nullrev]
+        ishead = [1] * (count + 1)
+        index = self.index
+        for r in xrange(count):
+            e = index[r]
+            ishead[e[5]] = ishead[e[6]] = 0
+        return [r for r in xrange(count) if ishead[r]]
+
     def heads(self, start=None, stop=None):
         """return the list of all nodes that have no children
 
@@ -626,15 +637,9 @@
         as if they had no children
         """
         if start is None and stop is None:
-            count = len(self)
-            if not count:
+            if not len(self):
                 return [nullid]
-            ishead = [1] * (count + 1)
-            index = self.index
-            for r in xrange(count):
-                e = index[r]
-                ishead[e[5]] = ishead[e[6]] = 0
-            return [self.node(r) for r in xrange(count) if ishead[r]]
+            return [self.node(r) for r in self.headrevs()]
 
         if start is None:
             start = nullid
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/setdiscovery.py	Mon May 02 19:21:30 2011 +0200
@@ -0,0 +1,178 @@
+# setdiscovery.py - improved discovery of common nodeset for mercurial
+#
+# Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
+# and Peter Arrenbrecht <peter@arrenbrecht.ch>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from node import nullid
+from i18n import _
+import random, collections, util, dagutil
+
+def _updatesample(dag, nodes, sample, always, quicksamplesize=0):
+    # if nodes is empty we scan the entire graph
+    if nodes:
+        heads = dag.headsetofconnecteds(nodes)
+    else:
+        heads = dag.heads()
+    dist = {}
+    visit = collections.deque(heads)
+    seen = set()
+    factor = 1
+    while visit:
+        curr = visit.popleft()
+        if curr in seen:
+            continue
+        d = dist.setdefault(curr, 1)
+        if d > factor:
+            factor *= 2
+        if d == factor:
+            if curr not in always: # need this check for the early exit below
+                sample.add(curr)
+                if quicksamplesize and (len(sample) >= quicksamplesize):
+                    return
+        seen.add(curr)
+        for p in dag.parents(curr):
+            if not nodes or p in nodes:
+                dist.setdefault(p, d + 1)
+                visit.append(p)
+
+def _setupsample(dag, nodes, size):
+    if len(nodes) <= size:
+        return set(nodes), None, 0
+    always = set(dag.heads())
+    desiredlen = size - len(always)
+    if desiredlen <= 0:
+        # This could be bad if there are very many heads, all unknown to the
+        # server. We're counting on long request support here.
+        return always, None, desiredlen
+    return always, set(), desiredlen
+
+def _takequicksample(dag, nodes, size, initial):
+    always, sample, desiredlen = _setupsample(dag, nodes, size)
+    if sample is None:
+        return always
+    if initial:
+        fromset = None
+    else:
+        fromset = nodes
+    _updatesample(dag, fromset, sample, always, quicksamplesize=desiredlen)
+    sample.update(always)
+    return sample
+
+def _takefullsample(dag, nodes, size):
+    always, sample, desiredlen = _setupsample(dag, nodes, size)
+    if sample is None:
+        return always
+    # update from heads
+    _updatesample(dag, nodes, sample, always)
+    # update from roots
+    _updatesample(dag.inverse(), nodes, sample, always)
+    assert sample
+    if len(sample) > desiredlen:
+        sample = set(random.sample(sample, desiredlen))
+    elif len(sample) < desiredlen:
+        more = desiredlen - len(sample)
+        sample.update(random.sample(list(nodes - sample - always), more))
+    sample.update(always)
+    return sample
+
+def findcommonheads(ui, local, remote,
+                    initialsamplesize=100,
+                    fullsamplesize=200,
+                    abortwhenunrelated=True):
+    '''Return a tuple (common, anyincoming, remoteheads) used to identify missing
+    nodes from or in remote.
+
+    shortcutlocal determines whether we try to use direct access to localrepo if
+    remote is actually local.
+    '''
+    roundtrips = 0
+    cl = local.changelog
+    dag = dagutil.revlogdag(cl)
+    nodes = dag.nodeset()
+
+    # early exit if we know all the specified server heads already
+    ui.debug("query 1; heads\n")
+    roundtrips += 1
+    srvheadhashes = remote.heads()
+
+    ## TODO We might want to request an additional random sample of the server's
+    ## nodes batched with the heads query here.
+
+    if cl.tip() == nullid:
+        if srvheadhashes != [nullid]:
+            return [nullid], True, srvheadhashes
+        return [nullid], False, []
+
+    # start actual discovery (we note this before the next "if" for compatibility
+    # reasons)
+    ui.status(_("searching for changes\n"))
+
+    srvheads = dag.internalizeall(srvheadhashes, filterunknown=True)
+    if len(srvheads) == len(srvheadhashes):
+        ui.note("all remote heads known locally\n")
+        return (srvheadhashes, False, srvheadhashes,)
+
+    # full blown discovery
+    undecided = nodes # own nodes where I don't know if the server knows them
+    common = set() # own nodes I know we both know
+    missing = set() # own nodes I know the server lacks
+
+    # treat remote heads as a first implicit sample response
+    common.update(dag.ancestorset(srvheads))
+    undecided.difference_update(common)
+    # use cheapish initial sample
+    if common:
+        ui.debug("taking initial sample\n")
+        sample = _takefullsample(dag, undecided, size=fullsamplesize)
+    else:
+        ui.debug("taking quick initial sample\n")
+        sample = _takequicksample(dag, nodes, size=initialsamplesize,
+                                  initial=True)
+
+    roundtrips += 1
+    ui.progress(_('searching'), roundtrips, unit=_('queries'))
+    ui.debug("query %i; still undecided: %i, sample size is: %i\n"
+             % (roundtrips, len(undecided), len(sample)))
+    # indices between sample and externalized version must match
+    sample = list(sample)
+    yesno = remote.known(dag.externalizeall(sample))
+
+    while undecided:
+        commoninsample = set(n for i, n in enumerate(sample) if yesno[i])
+        common.update(dag.ancestorset(commoninsample, common))
+
+        missinginsample = [n for i, n in enumerate(sample) if not yesno[i]]
+        missing.update(dag.descendantset(missinginsample, missing))
+
+        undecided.difference_update(missing)
+        undecided.difference_update(common)
+
+        if not undecided:
+            break
+
+        ui.note("sampling from both directions\n")
+        sample = _takefullsample(dag, undecided, size=fullsamplesize)
+
+        roundtrips += 1
+        ui.progress(_('searching'), roundtrips, unit=_('queries'))
+        ui.debug("query %i; still undecided: %i, sample size is: %i\n"
+                 % (roundtrips, len(undecided), len(sample)))
+        # indices between sample and externalized version must match
+        sample = list(sample)
+        yesno = remote.known(dag.externalizeall(sample))
+
+    result = dag.headsetofconnecteds(common)
+    ui.progress(_('searching'), None)
+    ui.debug("%d total queries\n" % roundtrips)
+
+    if not result and srvheadhashes != [nullid]:
+        if abortwhenunrelated:
+            raise util.Abort(_("repository is unrelated"))
+        else:
+            ui.warn(_("warning: repository is unrelated\n"))
+        return (set([nullid]), True, srvheadhashes,)
+
+    return (dag.externalizeall(result), True, srvheadhashes,)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/treediscovery.py	Mon May 02 19:21:30 2011 +0200
@@ -0,0 +1,151 @@
+# discovery.py - protocol changeset discovery functions
+#
+# Copyright 2010 Matt Mackall <mpm@selenic.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from node import nullid, short
+from i18n import _
+import util, error
+
+def findcommonincoming(repo, remote, heads=None, force=False):
+    """Return a tuple (common, fetch, heads) used to identify the common
+    subset of nodes between repo and remote.
+
+    "common" is a list of (at least) the heads of the common subset.
+    "fetch" is a list of roots of the nodes that would be incoming, to be
+      supplied to changegroupsubset.
+    "heads" is either the supplied heads, or else the remote's heads.
+    """
+
+    m = repo.changelog.nodemap
+    search = []
+    fetch = set()
+    seen = set()
+    seenbranch = set()
+    base = set()
+
+    if not heads:
+        heads = remote.heads()
+
+    if repo.changelog.tip() == nullid:
+        base.add(nullid)
+        if heads != [nullid]:
+            return [nullid], [nullid], list(heads)
+        return [nullid], [], []
+
+    # assume we're closer to the tip than the root
+    # and start by examining the heads
+    repo.ui.status(_("searching for changes\n"))
+
+    unknown = []
+    for h in heads:
+        if h not in m:
+            unknown.append(h)
+        else:
+            base.add(h)
+
+    heads = unknown
+    if not unknown:
+        return list(base), [], []
+
+    req = set(unknown)
+    reqcnt = 0
+
+    # search through remote branches
+    # a 'branch' here is a linear segment of history, with four parts:
+    # head, root, first parent, second parent
+    # (a branch always has two parents (or none) by definition)
+    unknown = remote.branches(unknown)
+    while unknown:
+        r = []
+        while unknown:
+            n = unknown.pop(0)
+            if n[0] in seen:
+                continue
+
+            repo.ui.debug("examining %s:%s\n"
+                          % (short(n[0]), short(n[1])))
+            if n[0] == nullid: # found the end of the branch
+                pass
+            elif n in seenbranch:
+                repo.ui.debug("branch already found\n")
+                continue
+            elif n[1] and n[1] in m: # do we know the base?
+                repo.ui.debug("found incomplete branch %s:%s\n"
+                              % (short(n[0]), short(n[1])))
+                search.append(n[0:2]) # schedule branch range for scanning
+                seenbranch.add(n)
+            else:
+                if n[1] not in seen and n[1] not in fetch:
+                    if n[2] in m and n[3] in m:
+                        repo.ui.debug("found new changeset %s\n" %
+                                      short(n[1]))
+                        fetch.add(n[1]) # earliest unknown
+                    for p in n[2:4]:
+                        if p in m:
+                            base.add(p) # latest known
+
+                for p in n[2:4]:
+                    if p not in req and p not in m:
+                        r.append(p)
+                        req.add(p)
+            seen.add(n[0])
+
+        if r:
+            reqcnt += 1
+            repo.ui.progress(_('searching'), reqcnt, unit=_('queries'))
+            repo.ui.debug("request %d: %s\n" %
+                        (reqcnt, " ".join(map(short, r))))
+            for p in xrange(0, len(r), 10):
+                for b in remote.branches(r[p:p + 10]):
+                    repo.ui.debug("received %s:%s\n" %
+                                  (short(b[0]), short(b[1])))
+                    unknown.append(b)
+
+    # do binary search on the branches we found
+    while search:
+        newsearch = []
+        reqcnt += 1
+        repo.ui.progress(_('searching'), reqcnt, unit=_('queries'))
+        for n, l in zip(search, remote.between(search)):
+            l.append(n[1])
+            p = n[0]
+            f = 1
+            for i in l:
+                repo.ui.debug("narrowing %d:%d %s\n" % (f, len(l), short(i)))
+                if i in m:
+                    if f <= 2:
+                        repo.ui.debug("found new branch changeset %s\n" %
+                                          short(p))
+                        fetch.add(p)
+                        base.add(i)
+                    else:
+                        repo.ui.debug("narrowed branch search to %s:%s\n"
+                                      % (short(p), short(i)))
+                        newsearch.append((p, i))
+                    break
+                p, f = i, f * 2
+            search = newsearch
+
+    # sanity check our fetch list
+    for f in fetch:
+        if f in m:
+            raise error.RepoError(_("already have changeset ")
+                                  + short(f[:4]))
+
+    base = list(base)
+    if base == [nullid]:
+        if force:
+            repo.ui.warn(_("warning: repository is unrelated\n"))
+        else:
+            raise util.Abort(_("repository is unrelated"))
+
+    repo.ui.debug("found new changesets starting at " +
+                 " ".join([short(f) for f in fetch]) + "\n")
+
+    repo.ui.progress(_('searching'), None)
+    repo.ui.debug("%d total queries\n" % reqcnt)
+
+    return base, list(fetch), heads
--- a/tests/test-acl.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-acl.t	Mon May 02 19:21:30 2011 +0200
@@ -82,7 +82,9 @@
   hgrc = """
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -135,7 +137,9 @@
   pretxnchangegroup.acl = python:hgext.acl.hook
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -192,7 +196,9 @@
   sources = push
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -258,7 +264,9 @@
   [acl.allow]
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -322,7 +330,9 @@
   foo/** = fred
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -390,7 +400,9 @@
   [acl.deny]
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -455,7 +467,9 @@
   foo/bar/** = fred
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -525,7 +539,9 @@
   foo/Bar/** = fred
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -592,7 +608,9 @@
   foo/Bar/** = fred
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -661,7 +679,9 @@
   ** = barney
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -733,7 +753,9 @@
   **/*.txt = wilma
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -810,7 +832,9 @@
   config = ../acl.config
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -880,7 +904,9 @@
   foo/** = betty
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -962,7 +988,9 @@
   changegroup.acl = false
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -1035,7 +1063,9 @@
   ** = fred
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -1105,7 +1135,9 @@
   foo/Bar/** = *
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -1178,7 +1210,9 @@
   ** = @group1
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   3 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -1248,7 +1282,9 @@
   foo/Bar/** = @group1
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   3 changesets found
   list of changesets:
@@ -1359,7 +1395,9 @@
   [extensions]
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   4 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -1436,7 +1474,9 @@
   foobar = *
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   4 changesets found
   list of changesets:
@@ -1512,7 +1552,9 @@
   [acl.allow.branches]
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   4 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -1583,7 +1625,9 @@
   * = george
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   4 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -1648,7 +1692,9 @@
   * = george
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   4 changesets found
   list of changesets:
   ef1ea85a6374b77d6da9dcda9541f498f2d17df7
@@ -1730,7 +1776,9 @@
   * = george
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   4 changesets found
   list of changesets:
@@ -1812,7 +1860,9 @@
   * = george
   """
   pushing to ../b
+  query 1; heads
   searching for changes
+  all remote heads known locally
   invalidating branch cache (tip differs)
   4 changesets found
   list of changesets:
--- a/tests/test-bookmarks-pushpull.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-bookmarks-pushpull.t	Mon May 02 19:21:30 2011 +0200
@@ -39,7 +39,6 @@
   Z	4e3505fd95835d721066b76e75dbb8cc554d7f77
   $ hg pull -B X ../a
   pulling from ../a
-  searching for changes
   no changes found
   importing bookmark X
   $ hg bookmark
@@ -173,7 +172,6 @@
      foobar                    000000000000
   $ hg pull -B Z http://localhost:$HGPORT/
   pulling from http://localhost:$HGPORT/
-  searching for changes
   no changes found
   not updating divergent bookmark X
   importing bookmark Z
--- a/tests/test-bundle.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-bundle.t	Mon May 02 19:21:30 2011 +0200
@@ -561,7 +561,9 @@
 == bundling
 
   $ hg bundle bundle.hg part --debug
+  query 1; heads
   searching for changes
+  all remote heads known locally
   2 changesets found
   list of changesets:
   d2ae7f538514cd87c17547b0de4cea71fe1af9fb
--- a/tests/test-debugcomplete.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-debugcomplete.t	Mon May 02 19:21:30 2011 +0200
@@ -75,6 +75,7 @@
   debugdag
   debugdata
   debugdate
+  debugdiscovery
   debugfsinfo
   debuggetbundle
   debugignore
@@ -219,6 +220,7 @@
   debugdag: tags, branches, dots, spaces
   debugdata: 
   debugdate: extended
+  debugdiscovery: old, nonheads, ssh, remotecmd, insecure
   debugfsinfo: 
   debuggetbundle: head, common, type
   debugignore: 
--- a/tests/test-hook.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-hook.t	Mon May 02 19:21:30 2011 +0200
@@ -189,7 +189,7 @@
   $ hg pull -B bar ../a
   pulling from ../a
   listkeys hook: HG_NAMESPACE=bookmarks HG_VALUES={'bar': '0000000000000000000000000000000000000000', 'foo': '0000000000000000000000000000000000000000'} 
-  searching for changes
+  no changes found
   listkeys hook: HG_NAMESPACE=bookmarks HG_VALUES={'bar': '0000000000000000000000000000000000000000', 'foo': '0000000000000000000000000000000000000000'} 
   importing bookmark bar
   $ cd ../a
--- a/tests/test-push-warn.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-push-warn.t	Mon May 02 19:21:30 2011 +0200
@@ -31,14 +31,12 @@
 
   $ hg push --debug ../a
   pushing to ../a
+  query 1; heads
   searching for changes
-  examining 1c9246a22a0a:d8d565842d04
-  found incomplete branch 1c9246a22a0a:d8d565842d04
-  searching: 1 queries
-  narrowing 1:1 d8d565842d04
-  found new branch changeset 1c9246a22a0a
-  found new changesets starting at 1c9246a22a0a
-  1 total queries
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 2, sample size is: 2
+  2 total queries
   new remote heads on branch 'default'
   new remote head 1e108cc5548c
   abort: push creates new remote heads on branch 'default'!
--- a/tests/test-schemes.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-schemes.t	Mon May 02 19:21:30 2011 +0200
@@ -27,9 +27,10 @@
   using http://localhost:$HGPORT/
   sending capabilities command
   comparing with parts://localhost/
+  query 1; heads
   sending heads command
   searching for changes
-  sending known command
+  all remote heads known locally
   no changes found
   [1]
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-setdiscovery.t	Mon May 02 19:21:30 2011 +0200
@@ -0,0 +1,271 @@
+
+Function to test discovery between two repos in both directions, using both the local shortcut
+(which is currently not activated by default) and the full remotable protocol:
+
+  $ testdesc() { # revs_a, revs_b, dagdesc
+  >     if [ -e foo ]; then rm -rf foo; fi
+  >     hg init foo
+  >     cd foo
+  >     hg debugbuilddag "$3"
+  >     hg clone . a $1 --quiet
+  >     hg clone . b $2 --quiet
+  >     echo
+  >     echo "% -- a -> b tree"
+  >     hg -R a debugdiscovery b --verbose --old
+  >     echo
+  >     echo "% -- a -> b set"
+  >     hg -R a debugdiscovery b --verbose --debug
+  >     echo
+  >     echo "% -- b -> a tree"
+  >     hg -R b debugdiscovery a --verbose --old
+  >     echo
+  >     echo "% -- b -> a set"
+  >     hg -R b debugdiscovery a --verbose --debug
+  >     cd ..
+  > }
+
+
+Small superset:
+
+  $ testdesc '-ra1 -ra2' '-rb1 -rb2 -rb3' '
+  > +2:f +1:a1:b1
+  > <f +4 :a2
+  > +5 :b2
+  > <f +3 :b3'
+  
+  % -- a -> b tree
+  comparing with b
+  searching for changes
+  unpruned common: b5714e113bc0 66f7d451a68b 01241442b3c2
+  common heads: b5714e113bc0 01241442b3c2
+  local is subset
+  
+  % -- a -> b set
+  comparing with b
+  query 1; heads
+  searching for changes
+  taking initial sample
+  searching: 2 queries
+  query 2; still undecided: 4, sample size is: 4
+  2 total queries
+  common heads: b5714e113bc0 01241442b3c2
+  local is subset
+  
+  % -- b -> a tree
+  comparing with a
+  searching for changes
+  unpruned common: b5714e113bc0 01241442b3c2
+  common heads: b5714e113bc0 01241442b3c2
+  remote is subset
+  
+  % -- b -> a set
+  comparing with a
+  query 1; heads
+  searching for changes
+  all remote heads known locally
+  common heads: b5714e113bc0 01241442b3c2
+  remote is subset
+
+
+Many new:
+
+  $ testdesc '-ra1 -ra2' '-rb' '
+  > +2:f +3:a1 +3:b
+  > <f +30 :a2'
+  
+  % -- a -> b tree
+  comparing with b
+  searching for changes
+  unpruned common: bebd167eb94d
+  common heads: bebd167eb94d
+  
+  % -- a -> b set
+  comparing with b
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 35, sample size is: 35
+  2 total queries
+  common heads: bebd167eb94d
+  
+  % -- b -> a tree
+  comparing with a
+  searching for changes
+  unpruned common: bebd167eb94d 66f7d451a68b
+  common heads: bebd167eb94d
+  
+  % -- b -> a set
+  comparing with a
+  query 1; heads
+  searching for changes
+  taking initial sample
+  searching: 2 queries
+  query 2; still undecided: 3, sample size is: 3
+  2 total queries
+  common heads: bebd167eb94d
+
+
+Both sides many new with stub:
+
+  $ testdesc '-ra1 -ra2' '-rb' '
+  > +2:f +2:a1 +30 :b
+  > <f +30 :a2'
+  
+  % -- a -> b tree
+  comparing with b
+  searching for changes
+  unpruned common: 2dc09a01254d
+  common heads: 2dc09a01254d
+  
+  % -- a -> b set
+  comparing with b
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 34, sample size is: 34
+  2 total queries
+  common heads: 2dc09a01254d
+  
+  % -- b -> a tree
+  comparing with a
+  searching for changes
+  unpruned common: 66f7d451a68b 2dc09a01254d
+  common heads: 2dc09a01254d
+  
+  % -- b -> a set
+  comparing with a
+  query 1; heads
+  searching for changes
+  taking initial sample
+  searching: 2 queries
+  query 2; still undecided: 30, sample size is: 30
+  2 total queries
+  common heads: 2dc09a01254d
+
+
+Both many new:
+
+  $ testdesc '-ra' '-rb' '
+  > +2:f +30 :b
+  > <f +30 :a'
+  
+  % -- a -> b tree
+  comparing with b
+  searching for changes
+  unpruned common: 66f7d451a68b
+  common heads: 66f7d451a68b
+  
+  % -- a -> b set
+  comparing with b
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 32, sample size is: 32
+  2 total queries
+  common heads: 66f7d451a68b
+  
+  % -- b -> a tree
+  comparing with a
+  searching for changes
+  unpruned common: 66f7d451a68b
+  common heads: 66f7d451a68b
+  
+  % -- b -> a set
+  comparing with a
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 32, sample size is: 32
+  2 total queries
+  common heads: 66f7d451a68b
+
+
+Both many new skewed:
+
+  $ testdesc '-ra' '-rb' '
+  > +2:f +30 :b
+  > <f +50 :a'
+  
+  % -- a -> b tree
+  comparing with b
+  searching for changes
+  unpruned common: 66f7d451a68b
+  common heads: 66f7d451a68b
+  
+  % -- a -> b set
+  comparing with b
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 52, sample size is: 52
+  2 total queries
+  common heads: 66f7d451a68b
+  
+  % -- b -> a tree
+  comparing with a
+  searching for changes
+  unpruned common: 66f7d451a68b
+  common heads: 66f7d451a68b
+  
+  % -- b -> a set
+  comparing with a
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 32, sample size is: 32
+  2 total queries
+  common heads: 66f7d451a68b
+
+
+Both many new on top of long history:
+
+  $ testdesc '-ra' '-rb' '
+  > +1000:f +30 :b
+  > <f +50 :a'
+  
+  % -- a -> b tree
+  comparing with b
+  searching for changes
+  unpruned common: 7ead0cba2838
+  common heads: 7ead0cba2838
+  
+  % -- a -> b set
+  comparing with b
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 1050, sample size is: 11
+  sampling from both directions
+  searching: 3 queries
+  query 3; still undecided: 31, sample size is: 31
+  3 total queries
+  common heads: 7ead0cba2838
+  
+  % -- b -> a tree
+  comparing with a
+  searching for changes
+  unpruned common: 7ead0cba2838
+  common heads: 7ead0cba2838
+  
+  % -- b -> a set
+  comparing with a
+  query 1; heads
+  searching for changes
+  taking quick initial sample
+  searching: 2 queries
+  query 2; still undecided: 1030, sample size is: 11
+  sampling from both directions
+  searching: 3 queries
+  query 3; still undecided: 16, sample size is: 16
+  3 total queries
+  common heads: 7ead0cba2838
+
+
+
--- a/tests/test-ssh.t	Mon May 02 19:21:30 2011 +0200
+++ b/tests/test-ssh.t	Mon May 02 19:21:30 2011 +0200
@@ -219,7 +219,6 @@
   $ hg book -f -r 0 foo
   $ hg pull -B foo
   pulling from ssh://user@dummy/remote
-  searching for changes
   no changes found
   updating bookmark foo
   importing bookmark foo