revlog: add a "data compression mode" entry in the index tuple
author Pierre-Yves David <pierre-yves.david@octobus.net>
Mon, 03 May 2021 18:19:16 +0200
changeset 47249 130c9f7ed914
parent 47248 013c645dd28c
child 47250 4dca422d3907
revlog: add a "data compression mode" entry in the index tuple. This will make it possible to keep track of compression information in the revlog index, opening the way to more efficient revision restoration (in native code, but the Python usage is already defeating performance work). We start by adding a new entry to the index tuple, using a value matching the current behavior. We will introduce storage and other values in later changesets. Differential Revision: https://phab.mercurial-scm.org/D10646
mercurial/bundlerepo.py
mercurial/cext/parsers.c
mercurial/cext/revlog.c
mercurial/policy.py
mercurial/pure/parsers.py
mercurial/revlog.py
mercurial/revlogutils/constants.py
mercurial/revlogutils/revlogv0.py
mercurial/unionrepo.py
tests/test-parseindex2.py
--- a/mercurial/bundlerepo.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/bundlerepo.py	Mon May 03 18:19:16 2021 +0200
@@ -105,6 +105,7 @@
                 node,
                 0,
                 0,
+                revlog_constants.COMP_MODE_INLINE,
             )
             self.index.append(e)
             self.bundlerevs.add(n)
--- a/mercurial/cext/parsers.c	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/cext/parsers.c	Mon May 03 18:19:16 2021 +0200
@@ -668,7 +668,7 @@
 void manifest_module_init(PyObject *mod);
 void revlog_module_init(PyObject *mod);
 
-static const int version = 18;
+static const int version = 19;
 
 static void module_init(PyObject *mod)
 {
--- a/mercurial/cext/revlog.c	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/cext/revlog.c	Mon May 03 18:19:16 2021 +0200
@@ -118,9 +118,9 @@
 static int index_find_node(indexObject *self, const char *node);
 
 #if LONG_MAX == 0x7fffffffL
-static const char *const tuple_format = PY23("Kiiiiiis#Ki", "Kiiiiiiy#Ki");
+static const char *const tuple_format = PY23("Kiiiiiis#KiB", "Kiiiiiiy#KiB");
 #else
-static const char *const tuple_format = PY23("kiiiiiis#ki", "kiiiiiiy#ki");
+static const char *const tuple_format = PY23("kiiiiiis#kiB", "kiiiiiiy#kiB");
 #endif
 
 /* A RevlogNG v1 index entry is 64 bytes long. */
@@ -132,6 +132,8 @@
 static const long format_v1 = 1; /* Internal only, could be any number */
 static const long format_v2 = 2; /* Internal only, could be any number */
 
+static const char comp_mode_inline = 2;
+
 static void raise_revlog_error(void)
 {
 	PyObject *mod = NULL, *dict = NULL, *errclass = NULL;
@@ -294,6 +296,7 @@
 	uint64_t offset_flags, sidedata_offset;
 	int comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2,
 	    sidedata_comp_len;
+	char data_comp_mode;
 	const char *c_node_id;
 	const char *data;
 	Py_ssize_t length = index_length(self);
@@ -340,9 +343,11 @@
 		sidedata_comp_len = getbe32(data + 72);
 	}
 
+	data_comp_mode = comp_mode_inline;
 	return Py_BuildValue(tuple_format, offset_flags, comp_len, uncomp_len,
 	                     base_rev, link_rev, parent_1, parent_2, c_node_id,
-	                     self->nodelen, sidedata_offset, sidedata_comp_len);
+	                     self->nodelen, sidedata_offset, sidedata_comp_len,
+	                     data_comp_mode);
 }
 /*
  * Pack header information in binary
@@ -443,6 +448,7 @@
 {
 	uint64_t offset_flags, sidedata_offset;
 	int rev, comp_len, uncomp_len, base_rev, link_rev, parent_1, parent_2;
+	char data_comp_mode;
 	Py_ssize_t c_node_id_len, sidedata_comp_len;
 	const char *c_node_id;
 	char *data;
@@ -450,8 +456,9 @@
 	if (!PyArg_ParseTuple(obj, tuple_format, &offset_flags, &comp_len,
 	                      &uncomp_len, &base_rev, &link_rev, &parent_1,
 	                      &parent_2, &c_node_id, &c_node_id_len,
-	                      &sidedata_offset, &sidedata_comp_len)) {
-		PyErr_SetString(PyExc_TypeError, "10-tuple required");
+	                      &sidedata_offset, &sidedata_comp_len,
+	                      &data_comp_mode)) {
+		PyErr_SetString(PyExc_TypeError, "11-tuple required");
 		return NULL;
 	}
 
@@ -459,6 +466,12 @@
 		PyErr_SetString(PyExc_TypeError, "invalid node");
 		return NULL;
 	}
+	if (data_comp_mode != comp_mode_inline) {
+		PyErr_Format(PyExc_ValueError,
+		             "invalid data compression mode: %i",
+		             data_comp_mode);
+		return NULL;
+	}
 
 	if (self->new_length == self->added_length) {
 		size_t new_added_length =
@@ -2761,9 +2774,9 @@
 		self->entry_size = v1_entry_size;
 	}
 
-	self->nullentry =
-	    Py_BuildValue(PY23("iiiiiiis#ii", "iiiiiiiy#ii"), 0, 0, 0, -1, -1,
-	                  -1, -1, nullid, self->nodelen, 0, 0);
+	self->nullentry = Py_BuildValue(PY23("iiiiiiis#iiB", "iiiiiiiy#iiB"), 0,
+	                                0, 0, -1, -1, -1, -1, nullid,
+	                                self->nodelen, 0, 0, comp_mode_inline);
 
 	if (!self->nullentry)
 		return -1;
--- a/mercurial/policy.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/policy.py	Mon May 03 18:19:16 2021 +0200
@@ -80,7 +80,7 @@
     ('cext', 'bdiff'): 3,
     ('cext', 'mpatch'): 1,
     ('cext', 'osutil'): 4,
-    ('cext', 'parsers'): 18,
+    ('cext', 'parsers'): 19,
 }
 
 # map import request to other package or module
--- a/mercurial/pure/parsers.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/pure/parsers.py	Mon May 03 18:19:16 2021 +0200
@@ -54,7 +54,19 @@
     # Size of a C long int, platform independent
     int_size = struct.calcsize(b'>i')
     # An empty index entry, used as a default value to be overridden, or nullrev
-    null_item = (0, 0, 0, -1, -1, -1, -1, sha1nodeconstants.nullid, 0, 0)
+    null_item = (
+        0,
+        0,
+        0,
+        -1,
+        -1,
+        -1,
+        -1,
+        sha1nodeconstants.nullid,
+        0,
+        0,
+        revlog_constants.COMP_MODE_INLINE,
+    )
 
     @util.propertycache
     def entry_size(self):
@@ -135,7 +147,7 @@
 
     def _unpack_entry(self, data):
         r = self.index_format.unpack(data)
-        r = r + (0, 0)
+        r = r + (0, 0, revlog_constants.COMP_MODE_INLINE)
         return r
 
     def pack_header(self, header):
@@ -303,16 +315,17 @@
             self._extra[rev - self._lgt] = new
 
     def _unpack_entry(self, data):
-        return self.index_format.unpack(data)
+        return self.index_format.unpack(data) + (
+            revlog_constants.COMP_MODE_INLINE,
+        )
 
     def _pack_entry(self, entry):
-        return self.index_format.pack(*entry)
+        return self.index_format.pack(*entry[:10])
 
     def entry_binary(self, rev):
         """return the raw binary string representing a revision"""
         entry = self[rev]
-        p = revlog_constants.INDEX_ENTRY_V2.pack(*entry)
-        return p
+        return self._pack_entry(entry)
 
     def pack_header(self, header):
         """pack header information as binary"""
--- a/mercurial/revlog.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/revlog.py	Mon May 03 18:19:16 2021 +0200
@@ -35,6 +35,7 @@
 from .pycompat import getattr
 from .revlogutils.constants import (
     ALL_KINDS,
+    COMP_MODE_INLINE,
     FEATURES_BY_VERSION,
     FLAG_GENERALDELTA,
     FLAG_INLINE_DATA,
@@ -336,6 +337,12 @@
 
     [9] sidedata chunk length:
             The size, in bytes, of the revision's side-data chunk.
+
+    [10] data compression mode:
+            two bits that detail the way the data chunk is compressed on disk.
+            (see "COMP_MODE_*" constants for details). For revlog version 0 and
+            1 this will always be COMP_MODE_INLINE.
+
     """
 
     _flagserrorclass = error.RevlogError
@@ -2474,6 +2481,7 @@
             node,
             sidedata_offset,
             len(serialized_sidedata),
+            COMP_MODE_INLINE,
         )
 
         self.index.append(e)
--- a/mercurial/revlogutils/constants.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/revlogutils/constants.py	Mon May 03 18:19:16 2021 +0200
@@ -1,4 +1,4 @@
-# revlogdeltas.py - constant used for revlog logic
+# constants.py - constants used for revlog logic
 #
 # Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
 # Copyright 2018 Octobus <contact@octobus.net>
@@ -114,6 +114,14 @@
 # bitmark for flags that could cause rawdata content change
 REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED
 
+## chunk compression mode constants:
+# These constants are used in revlog version >=2 to denote the compression used
+# for a chunk.
+
+# The chunk uses a compression mode stored "inline" at the start of the chunk
+# itself.  This is the mode always used for revlog versions "0" and "1".
+COMP_MODE_INLINE = 2
+
 SUPPORTED_FLAGS = {
     REVLOGV0: REVLOGV0_FLAGS,
     REVLOGV1: REVLOGV1_FLAGS,
@@ -152,4 +160,5 @@
     },
 }
 
+
 SPARSE_REVLOG_MAX_CHAIN_LENGTH = 1000
--- a/mercurial/revlogutils/revlogv0.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/revlogutils/revlogv0.py	Mon May 03 18:19:16 2021 +0200
@@ -9,6 +9,7 @@
 
 from ..node import sha1nodeconstants
 from .constants import (
+    COMP_MODE_INLINE,
     INDEX_ENTRY_V0,
 )
 from ..i18n import _
@@ -42,7 +43,19 @@
 
 class revlogoldindex(list):
     entry_size = INDEX_ENTRY_V0.size
-    null_item = (0, 0, 0, -1, -1, -1, -1, sha1nodeconstants.nullid, 0, 0)
+    null_item = (
+        0,
+        0,
+        0,
+        -1,
+        -1,
+        -1,
+        -1,
+        sha1nodeconstants.nullid,
+        0,
+        0,
+        COMP_MODE_INLINE,
+    )
 
     @property
     def nodemap(self):
@@ -138,6 +151,7 @@
             e[6],
             0,  # no side data support
             0,  # no side data support
+            COMP_MODE_INLINE,
         )
         index.append(e2)
         nodemap[e[6]] = n
--- a/mercurial/unionrepo.py	Tue May 04 01:15:03 2021 +0200
+++ b/mercurial/unionrepo.py	Mon May 03 18:19:16 2021 +0200
@@ -31,6 +31,10 @@
     vfs as vfsmod,
 )
 
+from .revlogutils import (
+    constants as revlog_constants,
+)
+
 
 class unionrevlog(revlog.revlog):
     def __init__(self, opener, radix, revlog2, linkmapper):
@@ -65,6 +69,7 @@
                 node,
                 _sdo,
                 _sds,
+                _dcm,
             ) = rev
             flags = _start & 0xFFFF
 
@@ -99,6 +104,7 @@
                 node,
                 0,  # sidedata offset
                 0,  # sidedata size
+                revlog_constants.COMP_MODE_INLINE,
             )
             self.index.append(e)
             self.bundlerevs.add(n)
--- a/tests/test-parseindex2.py	Tue May 04 01:15:03 2021 +0200
+++ b/tests/test-parseindex2.py	Mon May 03 18:19:16 2021 +0200
@@ -21,6 +21,9 @@
     policy,
     pycompat,
 )
+from mercurial.revlogutils import (
+    constants,
+)
 
 parsers = policy.importmod('parsers')
 
@@ -49,7 +52,7 @@
         cache = (0, data)
         while off <= l:
             e = struct.unpack(indexformatng, data[off : off + s])
-            e = e + (0, 0)
+            e = e + (0, 0, constants.COMP_MODE_INLINE)
             nodemap[e[7]] = n
             append(e)
             n += 1
@@ -59,7 +62,7 @@
     else:
         while off <= l:
             e = struct.unpack(indexformatng, data[off : off + s])
-            e = e + (0, 0)
+            e = e + (0, 0, constants.COMP_MODE_INLINE)
             nodemap[e[7]] = n
             append(e)
             n += 1
@@ -242,7 +245,19 @@
                 break
 
     def testminusone(self):
-        want = (0, 0, 0, -1, -1, -1, -1, sha1nodeconstants.nullid, 0, 0)
+        want = (
+            0,
+            0,
+            0,
+            -1,
+            -1,
+            -1,
+            -1,
+            sha1nodeconstants.nullid,
+            0,
+            0,
+            constants.COMP_MODE_INLINE,
+        )
         index, junk = parsers.parse_index2(data_inlined, True)
         got = index[-1]
         self.assertEqual(want, got)  # inline data
@@ -264,7 +279,20 @@
             # node won't matter for this test, let's just make sure
             # they don't collide. Other data don't matter either.
             node = hexrev(p1) + hexrev(p2) + b'.' * 12
-            index.append((0, 0, 12, 1, 34, p1, p2, node, 0, 0))
+            e = (
+                0,
+                0,
+                12,
+                1,
+                34,
+                p1,
+                p2,
+                node,
+                0,
+                0,
+                constants.COMP_MODE_INLINE,
+            )
+            index.append(e)
 
         appendrev(4)
         appendrev(5)