revlog: add a small cache of unfiltered chunk
authorPierre-Yves David <pierre-yves.david@octobus.net>
Fri, 27 Oct 2023 08:54:41 +0200
changeset 51108 0250e45040f1
parent 51107 c2d2e5b65def
child 51109 687e192dae16
revlog: add a small cache of unfiltered chunk This can provide a massive boost to the reading of multiple revisions and the computation of a valid delta chain. This greatly helps operations like `hg log --patch`, delta computation (helping pull/unbundle), and linkrev adjustment (helping copy tracing). A first round of benchmarks for `hg log --patch --limit 1000` shows improvements in the 10-20% range on "small" repositories like pypy or mercurial and large improvements (about 33%) for more complex ones like netbeans and mozilla's. These speedups are consistent with the improvement to `hg pull` (from a server sending poor deltas) I saw benchmarking this last year. Further benchmarks will be run during the freeze. I added some configuration in the experimental space to be able to further test the effect of various tunings for now. This feature should fit well in the "usage/resource profile" configuration that we should land next cycle. When it does not provide a benefit, the overhead of the cache seems to be around 2%, a small price for the big improvement. In addition, I believe we could shave most of this overhead with a more efficient lru implementation.
mercurial/configitems.toml
mercurial/localrepo.py
mercurial/revlog.py
--- a/mercurial/configitems.toml	Fri Oct 27 02:57:09 2023 +0200
+++ b/mercurial/configitems.toml	Fri Oct 27 08:54:41 2023 +0200
@@ -1111,6 +1111,28 @@
 
 [[items]]
 section = "experimental"
+name = "revlog.uncompressed-cache.enabled"
+default = true
+experimental = true
+documentation = """Enable some caching of uncompressed chunk, greatly boosting
+performance at the cost of memory usage."""
+
+[[items]]
+section = "experimental"
+name = "revlog.uncompressed-cache.factor"
+default = 4
+experimental = true
+documentation = """The size of the cache compared to the largest revision seen."""
+
+[[items]]
+section = "experimental"
+name = "revlog.uncompressed-cache.count"
+default = 10000
+experimental = true
+documentation = """The number of chunk cached."""
+
+[[items]]
+section = "experimental"
 name = "stream-v3"
 default = false
 
--- a/mercurial/localrepo.py	Fri Oct 27 02:57:09 2023 +0200
+++ b/mercurial/localrepo.py	Fri Oct 27 08:54:41 2023 +0200
@@ -1089,6 +1089,16 @@
     if chunkcachesize is not None:
         data_config.chunk_cache_size = chunkcachesize
 
+    if ui.configbool(b'experimental', b'revlog.uncompressed-cache.enabled'):
+        factor = ui.configint(
+            b'experimental', b'revlog.uncompressed-cache.factor'
+        )
+        count = ui.configint(
+            b'experimental', b'revlog.uncompressed-cache.count'
+        )
+        data_config.uncompressed_cache_factor = factor
+        data_config.uncompressed_cache_count = count
+
     delta_config.delta_both_parents = ui.configbool(
         b'storage', b'revlog.optimize-delta-parent-choice'
     )
--- a/mercurial/revlog.py	Fri Oct 27 02:57:09 2023 +0200
+++ b/mercurial/revlog.py	Fri Oct 27 08:54:41 2023 +0200
@@ -295,6 +295,12 @@
     # How much data to read and cache into the raw revlog data cache.
     chunk_cache_size = attr.ib(default=65536)
 
+    # The size of the uncompressed cache compared to the largest revision seen.
+    uncompressed_cache_factor = attr.ib(default=None)
+
+    # The number of chunk cached
+    uncompressed_cache_count = attr.ib(default=None)
+
     # Allow sparse reading of the revlog data
     with_sparse_read = attr.ib(default=False)
     # minimal density of a sparse read chunk
@@ -396,6 +402,18 @@
         # 3-tuple of (node, rev, text) for a raw revision.
         self._revisioncache = None
 
+        # cache some uncompressed chunks
+        # rev → uncompressed_chunk
+        #
+        # the max cost is dynamically updated to be proportional to the
+        # size of the revisions we actually encounter.
+        self._uncompressed_chunk_cache = None
+        if self.data_config.uncompressed_cache_factor is not None:
+            self._uncompressed_chunk_cache = util.lrucachedict(
+                self.data_config.uncompressed_cache_count,
+                maxcost=65536,  # some arbitrary initial value
+            )
+
         self._delay_buffer = None
 
     @property
@@ -414,6 +432,8 @@
     def clear_cache(self):
         assert not self.is_delaying
         self._revisioncache = None
+        if self._uncompressed_chunk_cache is not None:
+            self._uncompressed_chunk_cache.clear()
         self._segmentfile.clear_cache()
         self._segmentfile_sidedata.clear_cache()
 
@@ -865,18 +885,26 @@
 
         Returns a str holding uncompressed data for the requested revision.
         """
+        if self._uncompressed_chunk_cache is not None:
+            uncomp = self._uncompressed_chunk_cache.get(rev)
+            if uncomp is not None:
+                return uncomp
+
         compression_mode = self.index[rev][10]
         data = self.get_segment_for_revs(rev, rev)[1]
         if compression_mode == COMP_MODE_PLAIN:
-            return data
+            uncomp = data
         elif compression_mode == COMP_MODE_DEFAULT:
-            return self._decompressor(data)
+            uncomp = self._decompressor(data)
         elif compression_mode == COMP_MODE_INLINE:
-            return self.decompress(data)
+            uncomp = self.decompress(data)
         else:
             msg = b'unknown compression mode %d'
             msg %= compression_mode
             raise error.RevlogError(msg)
+        if self._uncompressed_chunk_cache is not None:
+            self._uncompressed_chunk_cache.insert(rev, uncomp, cost=len(uncomp))
+        return uncomp
 
     def _chunks(self, revs, targetsize=None):
         """Obtain decompressed chunks for the specified revisions.
@@ -899,17 +927,30 @@
         iosize = self.index.entry_size
         buffer = util.buffer
 
-        l = []
-        ladd = l.append
+        fetched_revs = []
+        fadd = fetched_revs.append
+
         chunks = []
         ladd = chunks.append
 
-        if not self.data_config.with_sparse_read:
-            slicedchunks = (revs,)
+        if self._uncompressed_chunk_cache is None:
+            fetched_revs = revs
+        else:
+            for rev in revs:
+                cached_value = self._uncompressed_chunk_cache.get(rev)
+                if cached_value is None:
+                    fadd(rev)
+                else:
+                    ladd((rev, cached_value))
+
+        if not fetched_revs:
+            slicedchunks = ()
+        elif not self.data_config.with_sparse_read:
+            slicedchunks = (fetched_revs,)
         else:
             slicedchunks = deltautil.slicechunk(
                 self,
-                revs,
+                fetched_revs,
                 targetsize=targetsize,
             )
 
@@ -949,7 +990,10 @@
                     msg %= comp_mode
                     raise error.RevlogError(msg)
                 ladd((rev, c))
-
+                if self._uncompressed_chunk_cache is not None:
+                    self._uncompressed_chunk_cache.insert(rev, c, len(c))
+
+        chunks.sort()
         return [x[1] for x in chunks]
 
     def raw_text(self, node, rev):
@@ -981,6 +1025,14 @@
         if 0 <= rawsize:
             targetsize = 4 * rawsize
 
+        if self._uncompressed_chunk_cache is not None:
+            # dynamically update the uncompressed_chunk_cache size to the
+            # largest revision we saw in this revlog.
+            factor = self.data_config.uncompressed_cache_factor
+            candidate_size = rawsize * factor
+            if candidate_size > self._uncompressed_chunk_cache.maxcost:
+                self._uncompressed_chunk_cache.maxcost = candidate_size
+
         bins = self._chunks(chain, targetsize=targetsize)
         if basetext is None:
             basetext = bytes(bins[0])