delta-find: use a smarter object for snapshot caching
authorPierre-Yves David <pierre-yves.david@octobus.net>
Sun, 06 Nov 2022 16:56:23 -0500
changeset 49678 efbbc2f9121e
parent 49677 05db41701ece
child 49679 b670eb3dd6c9
delta-find: use a smarter object for snapshot caching This open the way for a longer lived cache.
contrib/fuzz/revlog.cc
mercurial/cext/revlog.c
mercurial/revlogutils/deltas.py
tests/test-revlog-raw.py
--- a/contrib/fuzz/revlog.cc	Mon Nov 07 22:12:59 2022 -0500
+++ b/contrib/fuzz/revlog.cc	Sun Nov 06 16:56:23 2022 -0500
@@ -20,7 +20,7 @@
         index, cache = parsers.parse_index2(data, inline)
         index.slicechunktodensity(list(range(len(index))), 0.5, 262144)
         index.stats()
-        index.findsnapshots({}, 0)
+        index.findsnapshots({}, 0, len(index) - 1)
         10 in index
         for rev in range(len(index)):
             index.reachableroots(0, [len(index)-1], [rev])
--- a/mercurial/cext/revlog.c	Mon Nov 07 22:12:59 2022 -0500
+++ b/mercurial/cext/revlog.c	Sun Nov 06 16:56:23 2022 -0500
@@ -1446,16 +1446,25 @@
 static PyObject *index_findsnapshots(indexObject *self, PyObject *args)
 {
 	Py_ssize_t start_rev;
+	Py_ssize_t end_rev;
 	PyObject *cache;
 	Py_ssize_t base;
 	Py_ssize_t rev;
 	PyObject *key = NULL;
 	PyObject *value = NULL;
 	const Py_ssize_t length = index_length(self);
-	if (!PyArg_ParseTuple(args, "O!n", &PyDict_Type, &cache, &start_rev)) {
+	if (!PyArg_ParseTuple(args, "O!nn", &PyDict_Type, &cache, &start_rev,
+	                      &end_rev)) {
 		return NULL;
 	}
-	for (rev = start_rev; rev < length; rev++) {
+	end_rev += 1;
+	if (end_rev > length) {
+		end_rev = length;
+	}
+	if (start_rev < 0) {
+		start_rev = 0;
+	}
+	for (rev = start_rev; rev < end_rev; rev++) {
 		int issnap;
 		PyObject *allvalues = NULL;
 		issnap = index_issnapshotrev(self, rev);
--- a/mercurial/revlogutils/deltas.py	Mon Nov 07 22:12:59 2022 -0500
+++ b/mercurial/revlogutils/deltas.py	Sun Nov 06 16:56:23 2022 -0500
@@ -799,18 +799,6 @@
     yield None
 
 
-def _findsnapshots(revlog, cache, start_rev):
-    """find snapshot from start_rev to tip"""
-    if util.safehasattr(revlog.index, b'findsnapshots'):
-        revlog.index.findsnapshots(cache, start_rev)
-    else:
-        deltaparent = revlog.deltaparent
-        issnapshot = revlog.issnapshot
-        for rev in revlog.revs(start_rev):
-            if issnapshot(rev):
-                cache[deltaparent(rev)].append(rev)
-
-
 def _refinedgroups(revlog, p1, p2, cachedelta):
     good = None
     # First we try to reuse a the delta contained in the bundle.
@@ -832,13 +820,13 @@
             yield None
             return
     # XXX cache me higher
-    snapshots = collections.defaultdict(list)
+    snapshot_cache = SnapshotCache()
     groups = _rawgroups(
         revlog,
         p1,
         p2,
         cachedelta,
-        snapshots,
+        snapshot_cache,
     )
     for candidates in groups:
         good = yield candidates
@@ -861,12 +849,12 @@
                 break
             good = yield (base,)
         # refine snapshot up
-        if not snapshots:
-            _findsnapshots(revlog, snapshots, good + 1)
+        if not snapshot_cache.snapshots:
+            snapshot_cache.update(revlog, good + 1)
         previous = None
         while good != previous:
             previous = good
-            children = tuple(sorted(c for c in snapshots[good]))
+            children = tuple(sorted(c for c in snapshot_cache.snapshots[good]))
             good = yield children
 
     if debug_info is not None:
@@ -876,7 +864,7 @@
     yield None
 
 
-def _rawgroups(revlog, p1, p2, cachedelta, snapshots=None):
+def _rawgroups(revlog, p1, p2, cachedelta, snapshot_cache=None):
     """Provides group of revision to be tested as delta base
 
     This lower level function focus on emitting delta theorically interresting
@@ -907,9 +895,9 @@
             yield parents
 
     if sparse and parents:
-        if snapshots is None:
+        if snapshot_cache is None:
             # map: base-rev: [snapshot-revs]
-            snapshots = collections.defaultdict(list)
+            snapshot_cache = SnapshotCache()
         # See if we can use an existing snapshot in the parent chains to use as
         # a base for a new intermediate-snapshot
         #
@@ -923,7 +911,7 @@
                     break
                 parents_snaps[idx].add(s)
         snapfloor = min(parents_snaps[0]) + 1
-        _findsnapshots(revlog, snapshots, snapfloor)
+        snapshot_cache.update(revlog, snapfloor)
         # search for the highest "unrelated" revision
         #
         # Adding snapshots used by "unrelated" revision increase the odd we
@@ -961,7 +949,7 @@
         for idx, snaps in sorted(parents_snaps.items(), reverse=True):
             siblings = set()
             for s in snaps:
-                siblings.update(snapshots[s])
+                siblings.update(snapshot_cache.snapshots[s])
             # Before considering making a new intermediate snapshot, we check
             # if an existing snapshot, children of base we consider, would be
             # suitable.
@@ -989,7 +977,7 @@
         # revisions instead of starting our own. Without such re-use,
         # topological branches would keep reopening new full chains. Creating
         # more and more snapshot as the repository grow.
-        yield tuple(snapshots[nullrev])
+        yield tuple(snapshot_cache.snapshots[nullrev])
 
     if not sparse:
         # other approach failed try against prev to hopefully save us a
@@ -997,6 +985,62 @@
         yield (prev,)
 
 
+class SnapshotCache:
+    __slots__ = ('snapshots', '_start_rev', '_end_rev')
+
+    def __init__(self):
+        # XXX should probably be a set ?
+        self.snapshots = collections.defaultdict(list)
+        self._start_rev = None
+        self._end_rev = None
+
+    def update(self, revlog, start_rev=0):
+        """find snapshots from start_rev to tip"""
+        nb_revs = len(revlog)
+        end_rev = nb_revs - 1
+        if start_rev > end_rev:
+            return  # range is empty
+
+        if self._start_rev is None:
+            assert self._end_rev is None
+            self._update(revlog, start_rev, end_rev)
+        elif not (self._start_rev <= start_rev and end_rev <= self._end_rev):
+            if start_rev < self._start_rev:
+                self._update(revlog, start_rev, self._start_rev - 1)
+            if self._end_rev < end_rev:
+                self._update(revlog, self._end_rev + 1, end_rev)
+
+        if self._start_rev is None:
+            assert self._end_rev is None
+            self._end_rev = end_rev
+            self._start_rev = start_rev
+        else:
+            self._start_rev = min(self._start_rev, start_rev)
+            self._end_rev = max(self._end_rev, end_rev)
+        assert self._start_rev <= self._end_rev, (
+            self._start_rev,
+            self._end_rev,
+        )
+
+    def _update(self, revlog, start_rev, end_rev):
+        """internal method that actually do update content"""
+        assert self._start_rev is None or (
+            start_rev < self._start_rev or start_rev > self._end_rev
+        ), (self._start_rev, self._end_rev, start_rev, end_rev)
+        assert self._start_rev is None or (
+            end_rev < self._start_rev or end_rev > self._end_rev
+        ), (self._start_rev, self._end_rev, start_rev, end_rev)
+        cache = self.snapshots
+        if util.safehasattr(revlog.index, b'findsnapshots'):
+            revlog.index.findsnapshots(cache, start_rev, end_rev)
+        else:
+            deltaparent = revlog.deltaparent
+            issnapshot = revlog.issnapshot
+            for rev in revlog.revs(start_rev, end_rev):
+                if issnapshot(rev):
+                    cache[deltaparent(rev)].append(rev)
+
+
 class deltacomputer:
     def __init__(
         self,
--- a/tests/test-revlog-raw.py	Mon Nov 07 22:12:59 2022 -0500
+++ b/tests/test-revlog-raw.py	Sun Nov 06 16:56:23 2022 -0500
@@ -1,7 +1,6 @@
 # test revlog interaction about raw data (flagprocessor)
 
 
-import collections
 import hashlib
 import sys
 
@@ -472,16 +471,16 @@
 
 
 def findsnapshottest(rlog):
-    resultall = collections.defaultdict(list)
-    deltas._findsnapshots(rlog, resultall, 0)
-    resultall = dict(resultall.items())
+    cache = deltas.SnapshotCache()
+    cache.update(rlog)
+    resultall = dict(cache.snapshots)
     if resultall != snapshotmapall:
         print('snapshot map  differ:')
         print('  expected: %s' % snapshotmapall)
         print('  got:      %s' % resultall)
-    result15 = collections.defaultdict(list)
-    deltas._findsnapshots(rlog, result15, 15)
-    result15 = dict(result15.items())
+    cache15 = deltas.SnapshotCache()
+    cache15.update(rlog, 15)
+    result15 = dict(cache15.snapshots)
     if result15 != snapshotmap15:
         print('snapshot map  differ:')
         print('  expected: %s' % snapshotmap15)