hgext/narrow/narrowchangegroup.py
changeset 36079 a2a6e724d61a
child 36088 8c31187b6717
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/narrow/narrowchangegroup.py	Mon Jan 29 16:19:33 2018 -0500
@@ -0,0 +1,385 @@
+# narrowchangegroup.py - narrow clone changegroup creation and consumption
+#
+# Copyright 2017 Google, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+from mercurial.i18n import _
+from mercurial import (
+    changegroup,
+    error,
+    extensions,
+    manifest,
+    mdiff,
+    node,
+    util,
+)
+
+from . import (
+    narrowrepo,
+    narrowrevlog,
+)
+
+def setup():
+
+    def supportedoutgoingversions(orig, repo):
+        versions = orig(repo)
+        if narrowrepo.requirement in repo.requirements:
+            versions.discard('01')
+            versions.discard('02')
+        return versions
+
+    extensions.wrapfunction(changegroup, 'supportedoutgoingversions',
+                            supportedoutgoingversions)
+
+    def prune(orig, self, revlog, missing, commonrevs):
+        if isinstance(revlog, manifest.manifestrevlog):
+            matcher = getattr(self._repo, 'narrowmatch',
+                              getattr(self, '_narrow_matcher', None))
+            if (matcher is not None and
+                not matcher().visitdir(revlog._dir[:-1] or '.')):
+                return []
+        return orig(self, revlog, missing, commonrevs)
+
+    extensions.wrapfunction(changegroup.cg1packer, 'prune', prune)
+
+    def generatefiles(orig, self, changedfiles, linknodes, commonrevs,
+                      source):
+        matcher = getattr(self._repo, 'narrowmatch',
+                          getattr(self, '_narrow_matcher', None))
+        if matcher is not None:
+            narrowmatch = matcher()
+            changedfiles = filter(narrowmatch, changedfiles)
+        if getattr(self, 'is_shallow', False):
+            # See comment in generate() for why this sadness is a thing.
+            mfdicts = self._mfdicts
+            del self._mfdicts
+            # In a shallow clone, the linknodes callback needs to also include
+            # those file nodes that are in the manifests we sent but weren't
+            # introduced by those manifests.
+            commonctxs = [self._repo[c] for c in commonrevs]
+            oldlinknodes = linknodes
+            clrev = self._repo.changelog.rev
+            def linknodes(flog, fname):
+                for c in commonctxs:
+                    try:
+                        fnode = c.filenode(fname)
+                        self.clrev_to_localrev[c.rev()] = flog.rev(fnode)
+                    except error.ManifestLookupError:
+                        pass
+                links = oldlinknodes(flog, fname)
+                if len(links) != len(mfdicts):
+                    for mf, lr in mfdicts:
+                        fnode = mf.get(fname, None)
+                        if fnode in links:
+                            links[fnode] = min(links[fnode], lr, key=clrev)
+                        elif fnode:
+                            links[fnode] = lr
+                return links
+        return orig(self, changedfiles, linknodes, commonrevs, source)
+    extensions.wrapfunction(
+        changegroup.cg1packer, 'generatefiles', generatefiles)
+
+    def ellipsisdata(packer, rev, revlog, p1, p2, data, linknode):
+        n = revlog.node(rev)
+        p1n, p2n = revlog.node(p1), revlog.node(p2)
+        flags = revlog.flags(rev)
+        flags |= narrowrevlog.ELLIPSIS_NODE_FLAG
+        meta = packer.builddeltaheader(
+            n, p1n, p2n, node.nullid, linknode, flags)
+        # TODO: try and actually send deltas for ellipsis data blocks
+        diffheader = mdiff.trivialdiffheader(len(data))
+        l = len(meta) + len(diffheader) + len(data)
+        return ''.join((changegroup.chunkheader(l),
+                        meta,
+                        diffheader,
+                        data))
+
+    def close(orig, self):
+        getattr(self, 'clrev_to_localrev', {}).clear()
+        if getattr(self, 'next_clrev_to_localrev', {}):
+            self.clrev_to_localrev = self.next_clrev_to_localrev
+            del self.next_clrev_to_localrev
+        self.changelog_done = True
+        return orig(self)
+    extensions.wrapfunction(changegroup.cg1packer, 'close', close)
+
+    # In a perfect world, we'd generate better ellipsis-ified graphs
+    # for non-changelog revlogs. In practice, we haven't started doing
+    # that yet, so the resulting DAGs for the manifestlog and filelogs
+    # are actually full of bogus parentage on all the ellipsis
+    # nodes. This has the side effect that, while the contents are
+    # correct, the individual DAGs might be completely out of whack in
+    # a case like 882681bc3166 and its ancestors (back about 10
+    # revisions or so) in the main hg repo.
+    #
+    # The one invariant we *know* holds is that the new (potentially
+    # bogus) DAG shape will be valid if we order the nodes in the
+    # order that they're introduced in dramatis personae by the
+    # changelog, so what we do is we sort the non-changelog histories
+    # by the order in which they are used by the changelog.
+    def _sortgroup(orig, self, revlog, nodelist, lookup):
+        if not util.safehasattr(self, 'full_nodes') or not self.clnode_to_rev:
+            return orig(self, revlog, nodelist, lookup)
+        key = lambda n: self.clnode_to_rev[lookup(n)]
+        return [revlog.rev(n) for n in sorted(nodelist, key=key)]
+
+    extensions.wrapfunction(changegroup.cg1packer, '_sortgroup', _sortgroup)
+
+    def generate(orig, self, commonrevs, clnodes, fastpathlinkrev, source):
+        '''yield a sequence of changegroup chunks (strings)'''
+        # Note: other than delegating to orig, the only deviation in
+        # logic from normal hg's generate is marked with BEGIN/END
+        # NARROW HACK.
+        if not util.safehasattr(self, 'full_nodes'):
+            # not sending a narrow bundle
+            for x in orig(self, commonrevs, clnodes, fastpathlinkrev, source):
+                yield x
+            return
+
+        repo = self._repo
+        cl = repo.changelog
+        mfl = repo.manifestlog
+        mfrevlog = mfl._revlog
+
+        clrevorder = {}
+        mfs = {} # needed manifests
+        fnodes = {} # needed file nodes
+        changedfiles = set()
+
+        # Callback for the changelog, used to collect changed files and manifest
+        # nodes.
+        # Returns the linkrev node (identity in the changelog case).
+        def lookupcl(x):
+            c = cl.read(x)
+            clrevorder[x] = len(clrevorder)
+            # BEGIN NARROW HACK
+            #
+            # Only update mfs if x is going to be sent. Otherwise we
+            # end up with bogus linkrevs specified for manifests and
+            # we skip some manifest nodes that we should otherwise
+            # have sent.
+            if x in self.full_nodes or cl.rev(x) in self.precomputed_ellipsis:
+                n = c[0]
+                # record the first changeset introducing this manifest version
+                mfs.setdefault(n, x)
+                # Set this narrow-specific dict so we have the lowest manifest
+                # revnum to look up for this cl revnum. (Part of mapping
+                # changelog ellipsis parents to manifest ellipsis parents)
+                self.next_clrev_to_localrev.setdefault(cl.rev(x),
+                                                       mfrevlog.rev(n))
+            # We can't trust the changed files list in the changeset if the
+            # client requested a shallow clone.
+            if self.is_shallow:
+                changedfiles.update(mfl[c[0]].read().keys())
+            else:
+                changedfiles.update(c[3])
+            # END NARROW HACK
+            # Record a complete list of potentially-changed files in
+            # this manifest.
+            return x
+
+        self._verbosenote(_('uncompressed size of bundle content:\n'))
+        size = 0
+        for chunk in self.group(clnodes, cl, lookupcl, units=_('changesets')):
+            size += len(chunk)
+            yield chunk
+        self._verbosenote(_('%8.i (changelog)\n') % size)
+
+        # We need to make sure that the linkrev in the changegroup refers to
+        # the first changeset that introduced the manifest or file revision.
+        # The fastpath is usually safer than the slowpath, because the filelogs
+        # are walked in revlog order.
+        #
+        # When taking the slowpath with reorder=None and the manifest revlog
+        # uses generaldelta, the manifest may be walked in the "wrong" order.
+        # Without 'clrevorder', we would get an incorrect linkrev (see fix in
+        # cc0ff93d0c0c).
+        #
+        # When taking the fastpath, we are only vulnerable to reordering
+        # of the changelog itself. The changelog never uses generaldelta, so
+        # it is only reordered when reorder=True. To handle this case, we
+        # simply take the slowpath, which already has the 'clrevorder' logic.
+        # This was also fixed in cc0ff93d0c0c.
+        fastpathlinkrev = fastpathlinkrev and not self._reorder
+        # Treemanifests don't work correctly with fastpathlinkrev
+        # either, because we don't discover which directory nodes to
+        # send along with files. This could probably be fixed.
+        fastpathlinkrev = fastpathlinkrev and (
+            'treemanifest' not in repo.requirements)
+        # Shallow clones also don't work correctly with fastpathlinkrev
+        # because file nodes may need to be sent for a manifest even if they
+        # weren't introduced by that manifest.
+        fastpathlinkrev = fastpathlinkrev and not self.is_shallow
+
+        moreargs = []
+        if self.generatemanifests.func_code.co_argcount == 7:
+            # The source argument was added to generatemanifests in hg in
+            # 75cc1f1e11f2 (2017/09/11).
+            moreargs.append(source)
+        for chunk in self.generatemanifests(commonrevs, clrevorder,
+                fastpathlinkrev, mfs, fnodes, *moreargs):
+            yield chunk
+        # BEGIN NARROW HACK
+        mfdicts = None
+        if self.is_shallow:
+            mfdicts = [(self._repo.manifestlog[n].read(), lr)
+                       for (n, lr) in mfs.iteritems()]
+        # END NARROW HACK
+        mfs.clear()
+        clrevs = set(cl.rev(x) for x in clnodes)
+
+        if not fastpathlinkrev:
+            def linknodes(unused, fname):
+                return fnodes.get(fname, {})
+        else:
+            cln = cl.node
+            def linknodes(filerevlog, fname):
+                llr = filerevlog.linkrev
+                fln = filerevlog.node
+                revs = ((r, llr(r)) for r in filerevlog)
+                return dict((fln(r), cln(lr)) for r, lr in revs if lr in clrevs)
+
+        # BEGIN NARROW HACK
+        #
+        # We need to pass the mfdicts variable down into
+        # generatefiles(), but more than one command might have
+        # wrapped generatefiles so we can't modify the function
+        # signature. Instead, we pass the data to ourselves using an
+        # instance attribute. I'm sorry.
+        self._mfdicts = mfdicts
+        # END NARROW HACK
+        for chunk in self.generatefiles(changedfiles, linknodes, commonrevs,
+                                        source):
+            yield chunk
+
+        yield self.close()
+
+        if clnodes:
+            repo.hook('outgoing', node=node.hex(clnodes[0]), source=source)
+    extensions.wrapfunction(changegroup.cg1packer, 'generate', generate)
+
+    def revchunk(orig, self, revlog, rev, prev, linknode):
+        if not util.safehasattr(self, 'full_nodes'):
+            # not sending a narrow changegroup
+            for x in orig(self, revlog, rev, prev, linknode):
+                yield x
+            return
+        # build up some mapping information that's useful later. See
+        # the local() nested function below.
+        if not self.changelog_done:
+            self.clnode_to_rev[linknode] = rev
+            linkrev = rev
+            self.clrev_to_localrev[linkrev] = rev
+        else:
+            linkrev = self.clnode_to_rev[linknode]
+            self.clrev_to_localrev[linkrev] = rev
+        # This is a node to send in full, because the changeset it
+        # corresponds to was a full changeset.
+        if linknode in self.full_nodes:
+            for x in orig(self, revlog, rev, prev, linknode):
+                yield x
+            return
+        # At this point, a node can either be one we should skip or an
+        # ellipsis. If it's not an ellipsis, bail immediately.
+        if linkrev not in self.precomputed_ellipsis:
+            return
+        linkparents = self.precomputed_ellipsis[linkrev]
+        def local(clrev):
+            """Turn a changelog revnum into a local revnum.
+
+            The ellipsis dag is stored as revnums on the changelog,
+            but when we're producing ellipsis entries for
+            non-changelog revlogs, we need to turn those numbers into
+            something local. This does that for us, and during the
+            changelog sending phase will also expand the stored
+            mappings as needed.
+            """
+            if clrev == node.nullrev:
+                return node.nullrev
+            if not self.changelog_done:
+                # If we're doing the changelog, it's possible that we
+                # have a parent that is already on the client, and we
+                # need to store some extra mapping information so that
+                # our contained ellipsis nodes will be able to resolve
+                # their parents.
+                if clrev not in self.clrev_to_localrev:
+                    clnode = revlog.node(clrev)
+                    self.clnode_to_rev[clnode] = clrev
+                return clrev
+            # Walk the ellipsis-ized changelog breadth-first looking for a
+            # change that has been linked from the current revlog.
+            #
+            # For a flat manifest revlog only a single step should be necessary
+            # as all relevant changelog entries are relevant to the flat
+            # manifest.
+            #
+            # For a filelog or tree manifest dirlog however not every changelog
+            # entry will have been relevant, so we need to skip some changelog
+            # nodes even after ellipsis-izing.
+            walk = [clrev]
+            while walk:
+                p = walk[0]
+                walk = walk[1:]
+                if p in self.clrev_to_localrev:
+                    return self.clrev_to_localrev[p]
+                elif p in self.full_nodes:
+                    walk.extend([pp for pp in self._repo.changelog.parentrevs(p)
+                                    if pp != node.nullrev])
+                elif p in self.precomputed_ellipsis:
+                    walk.extend([pp for pp in self.precomputed_ellipsis[p]
+                                    if pp != node.nullrev])
+                else:
+                    # In this case, we've got an ellipsis with parents
+                    # outside the current bundle (likely an
+                    # incremental pull). We "know" that we can use the
+                    # value of this same revlog at whatever revision
+                    # is pointed to by linknode. "Know" is in scare
+                    # quotes because I haven't done enough examination
+                    # of edge cases to convince myself this is really
+                    # a fact - it works for all the (admittedly
+                    # thorough) cases in our testsuite, but I would be
+                    # somewhat unsurprised to find a case in the wild
+                    # where this breaks down a bit. That said, I don't
+                    # know if it would hurt anything.
+                    for i in xrange(rev, 0, -1):
+                        if revlog.linkrev(i) == clrev:
+                            return i
+                    # We failed to resolve a parent for this node, so
+                    # we crash the changegroup construction.
+                    raise error.Abort(
+                        'unable to resolve parent while packing %r %r'
+                        ' for changeset %r' % (revlog.indexfile, rev, clrev))
+            return node.nullrev
+
+        if not linkparents or (
+            revlog.parentrevs(rev) == (node.nullrev, node.nullrev)):
+            p1, p2 = node.nullrev, node.nullrev
+        elif len(linkparents) == 1:
+            p1, = sorted(local(p) for p in linkparents)
+            p2 = node.nullrev
+        else:
+            p1, p2 = sorted(local(p) for p in linkparents)
+        yield ellipsisdata(
+            self, rev, revlog, p1, p2, revlog.revision(rev), linknode)
+    extensions.wrapfunction(changegroup.cg1packer, 'revchunk', revchunk)
+
+    def deltaparent(orig, self, revlog, rev, p1, p2, prev):
+        if util.safehasattr(self, 'full_nodes'):
+            # TODO: send better deltas when in narrow mode.
+            #
+            # changegroup.group() loops over revisions to send,
+            # including revisions we'll skip. What this means is that
+            # `prev` will be a potentially useless delta base for all
+            # ellipsis nodes, as the client likely won't have it. In
+            # the future we should do bookkeeping about which nodes
+            # have been sent to the client, and try to be
+            # significantly smarter about delta bases. This is
+            # slightly tricky because this same code has to work for
+            # all revlogs, and we don't have the linkrev/linknode here.
+            return p1
+        return orig(self, revlog, rev, p1, p2, prev)
+    extensions.wrapfunction(changegroup.cg2packer, 'deltaparent', deltaparent)