revlog: move the compression/decompression logic onto the inner object
authorPierre-Yves David <pierre-yves.david@octobus.net>
Wed, 25 Oct 2023 02:13:18 +0200
changeset 51091 a82704902db8
parent 51090 de6a8cc24de3
child 51092 9c8df10ea6e0
revlog: move the compression/decompression logic onto the inner object

This is a necessary step before we can also move the logic that restores a revision's content there. For now, we apply a simple patch to the perf extension logic; when the implementation of the inner object changes, we will likely need some evolution of the API. However, this is true of many things in the perf extension, so we will deal with it later.
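As a rough illustration of the pattern this changeset introduces, here is a minimal, self-contained Python sketch (the InnerSketch/RevlogSketch names and the direct use of zlib are simplifying assumptions, not Mercurial's real _InnerRevlog API): the inner object owns the compressor and the header-based routing, while the outer object keeps thin forwarding methods so existing callers continue to work. The real _InnerRevlog additionally caches per-header decompressors in self._decompressors and honours the docket's default compression header, as the diff below shows.

import zlib


class InnerSketch:
    """Owns the compressor state and the header-based routing (sketch only)."""

    def compress(self, data):
        # Mirrors the (header, payload) contract: an empty header means the
        # payload already carries its own compression header.
        if not data:
            return b'', data
        compressed = zlib.compress(data)
        if len(compressed) < len(data):
            # zlib output starts with 'x', which doubles as the revlog header.
            return b'', compressed
        if data[0:1] == b'\0':
            return b'', data
        return b'u', data

    def decompress(self, data):
        if not data:
            return data
        t = data[0:1]
        if t == b'x':
            return zlib.decompress(data)
        elif t == b'\0':
            return data
        elif t == b'u':
            return data[1:]
        raise ValueError('unknown compression type %r' % t)


class RevlogSketch:
    """The outer object keeps thin forwarding methods for existing callers."""

    def __init__(self):
        self._inner = InnerSketch()

    def compress(self, data):
        return self._inner.compress(data)

    def decompress(self, data):
        return self._inner.decompress(data)


if __name__ == '__main__':
    rl = RevlogSketch()
    text = b'some revision data ' * 64
    header, payload = rl.compress(text)
    assert rl.decompress(header + payload) == text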
contrib/perf.py
mercurial/revlog.py
mercurial/revlogutils/deltas.py
--- a/contrib/perf.py	Mon Oct 23 14:27:07 2023 +0200
+++ b/contrib/perf.py	Wed Oct 25 02:13:18 2023 +0200
@@ -3833,14 +3833,16 @@
     def docompress(compressor):
         rl.clearcaches()
 
+        compressor_holder = getattr(rl, '_inner', rl)
+
         try:
             # Swap in the requested compression engine.
-            oldcompressor = rl._compressor
-            rl._compressor = compressor
+            oldcompressor = compressor_holder._compressor
+            compressor_holder._compressor = compressor
             for chunk in chunks[0]:
                 rl.compress(chunk)
         finally:
-            rl._compressor = oldcompressor
+            compressor_holder._compressor = oldcompressor
 
     benches = [
         (lambda: doread(), b'read'),
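The perf.py hunk above has to keep working against Mercurial versions both with and without the new inner object, hence the getattr(rl, '_inner', rl) lookup to find whichever object actually carries _compressor. Restated as a context manager (a hedged sketch with a hypothetical helper name, not the actual perf.py code), the swap looks like this:

import contextlib


@contextlib.contextmanager
def swapped_compressor(rl, compressor):
    """Temporarily install `compressor` on a revlog, restoring the old one."""
    # Newer revlogs keep the compressor on rl._inner; older ones keep it on
    # the revlog itself, so fall back to rl when _inner does not exist.
    holder = getattr(rl, '_inner', rl)
    old = holder._compressor
    holder._compressor = compressor
    try:
        yield rl
    finally:
        holder._compressor = old

A caller would run its benchmark loop inside `with swapped_compressor(rl, compressor):`, which matches the try/finally in docompress above.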
--- a/mercurial/revlog.py	Mon Oct 23 14:27:07 2023 +0200
+++ b/mercurial/revlog.py	Wed Oct 25 02:13:18 2023 +0200
@@ -353,7 +353,9 @@
         sidedata_file,
         inline,
         data_config,
+        feature_config,
         chunk_cache,
+        default_compression_header,
     ):
         self.opener = opener
         self.index = index
@@ -363,6 +365,9 @@
         self.sidedata_file = sidedata_file
         self.inline = inline
         self.data_config = data_config
+        self.feature_config = feature_config
+
+        self._default_compression_header = default_compression_header
 
         # index
 
@@ -381,6 +386,9 @@
             self.data_config.chunk_cache_size,
         )
 
+        # revlog header -> revlog compressor
+        self._decompressors = {}
+
     @property
     def index_file(self):
         return self.__index_file
@@ -405,6 +413,103 @@
         """the end of the data chunk for this revision"""
         return self.start(rev) + self.length(rev)
 
+    @util.propertycache
+    def _compressor(self):
+        engine = util.compengines[self.feature_config.compression_engine]
+        return engine.revlogcompressor(
+            self.feature_config.compression_engine_options
+        )
+
+    @util.propertycache
+    def _decompressor(self):
+        """the default decompressor"""
+        if self._default_compression_header is None:
+            return None
+        t = self._default_compression_header
+        c = self._get_decompressor(t)
+        return c.decompress
+
+    def _get_decompressor(self, t):
+        try:
+            compressor = self._decompressors[t]
+        except KeyError:
+            try:
+                engine = util.compengines.forrevlogheader(t)
+                compressor = engine.revlogcompressor(
+                    self.feature_config.compression_engine_options
+                )
+                self._decompressors[t] = compressor
+            except KeyError:
+                raise error.RevlogError(
+                    _(b'unknown compression type %s') % binascii.hexlify(t)
+                )
+        return compressor
+
+    def compress(self, data):
+        """Generate a possibly-compressed representation of data."""
+        if not data:
+            return b'', data
+
+        compressed = self._compressor.compress(data)
+
+        if compressed:
+            # The revlog compressor added the header in the returned data.
+            return b'', compressed
+
+        if data[0:1] == b'\0':
+            return b'', data
+        return b'u', data
+
+    def decompress(self, data):
+        """Decompress a revlog chunk.
+
+        The chunk is expected to begin with a header identifying the
+        format type so it can be routed to an appropriate decompressor.
+        """
+        if not data:
+            return data
+
+        # Revlogs are read much more frequently than they are written and many
+        # chunks only take microseconds to decompress, so performance is
+        # important here.
+        #
+        # We can make a few assumptions about revlogs:
+        #
+        # 1) the majority of chunks will be compressed (as opposed to inline
+        #    raw data).
+        # 2) decompressing *any* data will likely be at least 10x slower than
+        #    returning raw inline data.
+        # 3) we want to prioritize common and officially supported compression
+        #    engines
+        #
+        # It follows that we want to optimize for "decompress compressed data
+        # when encoded with common and officially supported compression engines"
+        # case over "raw data" and "data encoded by less common or non-official
+        # compression engines." That is why we have the inline lookup first
+        # followed by the compengines lookup.
+        #
+        # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
+        # compressed chunks. And this matters for changelog and manifest reads.
+        t = data[0:1]
+
+        if t == b'x':
+            try:
+                return _zlibdecompress(data)
+            except zlib.error as e:
+                raise error.RevlogError(
+                    _(b'revlog decompress error: %s')
+                    % stringutil.forcebytestr(e)
+                )
+        # '\0' is more common than 'u' so it goes first.
+        elif t == b'\0':
+            return data
+        elif t == b'u':
+            return util.buffer(data, 1)
+
+        compressor = self._get_decompressor(t)
+
+        return compressor.decompress(data)
+
     @contextlib.contextmanager
     def reading(self):
         """Context manager that keeps data and sidedata files open for reading"""
@@ -1284,12 +1389,15 @@
         self.index = index
         # revnum -> (chain-length, sum-delta-length)
         self._chaininfocache = util.lrucachedict(500)
-        # revlog header -> revlog compressor
-        self._decompressors = {}
 
         return chunkcache
 
     def _load_inner(self, chunk_cache):
+        if self._docket is None:
+            default_compression_header = None
+        else:
+            default_compression_header = self._docket.default_compression_header
+
         self._inner = _InnerRevlog(
             opener=self.opener,
             index=self.index,
@@ -1298,7 +1406,9 @@
             sidedata_file=self._sidedatafile,
             inline=self._inline,
             data_config=self.data_config,
+            feature_config=self.feature_config,
             chunk_cache=chunk_cache,
+            default_compression_header=default_compression_header,
         )
 
     def get_revlog(self):
@@ -1319,38 +1429,6 @@
         else:
             return self.radix
 
-    def _get_decompressor(self, t):
-        try:
-            compressor = self._decompressors[t]
-        except KeyError:
-            try:
-                engine = util.compengines.forrevlogheader(t)
-                compressor = engine.revlogcompressor(
-                    self.feature_config.compression_engine_options
-                )
-                self._decompressors[t] = compressor
-            except KeyError:
-                raise error.RevlogError(
-                    _(b'unknown compression type %s') % binascii.hexlify(t)
-                )
-        return compressor
-
-    @util.propertycache
-    def _compressor(self):
-        engine = util.compengines[self.feature_config.compression_engine]
-        return engine.revlogcompressor(
-            self.feature_config.compression_engine_options
-        )
-
-    @util.propertycache
-    def _decompressor(self):
-        """the default decompressor"""
-        if self._docket is None:
-            return None
-        t = self._docket.default_compression_header
-        c = self._get_decompressor(t)
-        return c.decompress
-
     def _datafp(self, mode=b'r'):
         """file object for the revlog's data file"""
         return self.opener(self._datafile, mode=mode)
@@ -2272,9 +2350,9 @@
         if compression_mode == COMP_MODE_PLAIN:
             return data
         elif compression_mode == COMP_MODE_DEFAULT:
-            return self._decompressor(data)
+            return self._inner._decompressor(data)
         elif compression_mode == COMP_MODE_INLINE:
-            return self.decompress(data)
+            return self._inner.decompress(data)
         else:
             msg = b'unknown compression mode %d'
             msg %= compression_mode
@@ -2328,9 +2406,9 @@
                 # 2G on Windows
                 return [self._chunk(rev) for rev in revschunk]
 
-            decomp = self.decompress
+            decomp = self._inner.decompress
             # self._decompressor might be None, but will not be used in that case
-            def_decomp = self._decompressor
+            def_decomp = self._inner._decompressor
             for rev in revschunk:
                 chunkstart = start(rev)
                 if inline:
@@ -2544,9 +2622,9 @@
         if comp == COMP_MODE_PLAIN:
             segment = comp_segment
         elif comp == COMP_MODE_DEFAULT:
-            segment = self._decompressor(comp_segment)
+            segment = self._inner._decompressor(comp_segment)
         elif comp == COMP_MODE_INLINE:
-            segment = self.decompress(comp_segment)
+            segment = self._inner.decompress(comp_segment)
         else:
             msg = b'unknown compression mode %d'
             msg %= comp
@@ -2842,69 +2920,10 @@
             )
 
     def compress(self, data):
-        """Generate a possibly-compressed representation of data."""
-        if not data:
-            return b'', data
-
-        compressed = self._compressor.compress(data)
-
-        if compressed:
-            # The revlog compressor added the header in the returned data.
-            return b'', compressed
-
-        if data[0:1] == b'\0':
-            return b'', data
-        return b'u', data
+        return self._inner.compress(data)
 
     def decompress(self, data):
-        """Decompress a revlog chunk.
-
-        The chunk is expected to begin with a header identifying the
-        format type so it can be routed to an appropriate decompressor.
-        """
-        if not data:
-            return data
-
-        # Revlogs are read much more frequently than they are written and many
-        # chunks only take microseconds to decompress, so performance is
-        # important here.
-        #
-        # We can make a few assumptions about revlogs:
-        #
-        # 1) the majority of chunks will be compressed (as opposed to inline
-        #    raw data).
-        # 2) decompressing *any* data will likely be at least 10x slower than
-        #    returning raw inline data.
-        # 3) we want to prioritize common and officially supported compression
-        #    engines
-        #
-        # It follows that we want to optimize for "decompress compressed data
-        # when encoded with common and officially supported compression engines"
-        # case over "raw data" and "data encoded by less common or non-official
-        # compression engines." That is why we have the inline lookup first
-        # followed by the compengines lookup.
-        #
-        # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
-        # compressed chunks. And this matters for changelog and manifest reads.
-        t = data[0:1]
-
-        if t == b'x':
-            try:
-                return _zlibdecompress(data)
-            except zlib.error as e:
-                raise error.RevlogError(
-                    _(b'revlog decompress error: %s')
-                    % stringutil.forcebytestr(e)
-                )
-        # '\0' is more common than 'u' so it goes first.
-        elif t == b'\0':
-            return data
-        elif t == b'u':
-            return util.buffer(data, 1)
-
-        compressor = self._get_decompressor(t)
-
-        return compressor.decompress(data)
+        return self._inner.decompress(data)
 
     def _addrevision(
         self,
@@ -3029,7 +3048,7 @@
             sidedata_compression_mode = COMP_MODE_PLAIN
             serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
             sidedata_offset = self._docket.sidedata_end
-            h, comp_sidedata = self.compress(serialized_sidedata)
+            h, comp_sidedata = self._inner.compress(serialized_sidedata)
             if (
                 h != b'u'
                 and comp_sidedata[0:1] != b'\0'
@@ -3876,7 +3895,7 @@
                 sidedata_compression_mode = COMP_MODE_INLINE
                 if serialized_sidedata and self.feature_config.has_side_data:
                     sidedata_compression_mode = COMP_MODE_PLAIN
-                    h, comp_sidedata = self.compress(serialized_sidedata)
+                    h, comp_sidedata = self._inner.compress(serialized_sidedata)
                     if (
                         h != b'u'
                         and comp_sidedata[0] != b'\0'
--- a/mercurial/revlogutils/deltas.py	Mon Oct 23 14:27:07 2023 +0200
+++ b/mercurial/revlogutils/deltas.py	Wed Oct 25 02:13:18 2023 +0200
@@ -1205,7 +1205,7 @@
                     msg = b"DBG-DELTAS-SEARCH:     DISCARDED (prev size)\n"
                     self._write_debug(msg)
                 return None
-        header, data = revlog.compress(delta)
+        header, data = revlog._inner.compress(delta)
         deltalen = len(header) + len(data)
         offset = revlog.end(len(revlog) - 1)
         dist = deltalen + offset - revlog.start(chainbase)
@@ -1226,7 +1226,7 @@
 
     def _fullsnapshotinfo(self, revinfo, curr):
         rawtext = self.buildtext(revinfo)
-        data = self.revlog.compress(rawtext)
+        data = self.revlog._inner.compress(rawtext)
         compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0])
         deltabase = chainbase = curr
         snapshotdepth = 0