revlogv2: introduce a very basic docket file
author Pierre-Yves David <pierre-yves.david@octobus.net>
Mon, 03 May 2021 12:34:11 +0200
changeset 47234 616b8f412676
parent 47233 bcafcd779d2e
child 47235 6b1eae313b2f
revlogv2: introduce a very basic docket file This is the first stone toward using a docket file in revlogv2. Right now the docket is very basic and only stores the version number (which is -also- stored in the index file…) and the other files have fixed names. This new implementation breaks transactionality… but there are no tests checking transactionality for revlogv2… So I take this as an opportunity to start small. There is no usage of revlogv2 outside of tests anyway. The docket keeps the `.i` naming used by the previous version's index to preserve a unique entry point. We could decide to use a different name and look it up first, or to fully rework this in a future "store" version. However that does not seem necessary right now. We will re-introduce transactionality (and associated testing…) in later changesets. A long list of TODOs has been added to the relevant comment. Differential Revision: https://phab.mercurial-scm.org/D10624
mercurial/changelog.py
mercurial/configitems.py
mercurial/revlog.py
mercurial/revlogutils/constants.py
mercurial/revlogutils/docket.py
mercurial/store.py
--- a/mercurial/changelog.py	Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/changelog.py	Mon May 03 12:34:11 2021 +0200
@@ -445,6 +445,8 @@
 
     def delayupdate(self, tr):
         """delay visibility of index updates to other readers"""
+        if self._docket is not None:
+            return
 
         if not self._delayed:
             if len(self) == 0:
--- a/mercurial/configitems.py	Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/configitems.py	Mon May 03 12:34:11 2021 +0200
@@ -1150,14 +1150,27 @@
 )
 # "out of experimental" todo list.
 #
-# * to grow a docket file to at least store the last offset of the data
-#   file when rewriting sidedata.
-# * need a way of dealing with garbage data if we allow rewriting
-#   *existing* sidedata.
+# * stop storing version information in the index (it is already in the docket)
+# * properly hide uncommitted content from other processes
+# * expose transaction content hooks during pre-commit validation
+# * include management of a persistent nodemap in the main docket
+# * enforce a "no-truncate" policy for mmap safety
+#      - for censoring operation
+#      - for stripping operation
+#      - for rollback operation
+# * store the data size in the docket to simplify sidedata rewrite.
+# * track garbage data to eventually allow rewriting -existing- sidedata.
 # * Exchange-wise, we will also need to do something more efficient than
 #   keeping references to the affected revlogs, especially memory-wise when
 #   rewriting sidedata.
-# * Also... compress the sidedata? (this should be coming very soon)
+# * sidedata compression
+# * introduce a proper solution to reduce the number of filelog related files.
+# * Improvement to consider
+#   - track compression mode in the index entries instead of the chunks
+#   - split the data offset and flag field (the 2 bytes save are mostly trouble)
+#   - keep track of uncompressed -chunk- size (to preallocate memory better)
+#   - keep track of chain base or size (probably not that useful anymore)
+#   - store data and sidedata in different files
 coreconfigitem(
     b'experimental',
     b'revlogv2',
--- a/mercurial/revlog.py	Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/revlog.py	Mon May 03 12:34:11 2021 +0200
@@ -75,6 +75,7 @@
 )
 from .revlogutils import (
     deltas as deltautil,
+    docket as docketutil,
     flagutil,
     nodemap as nodemaputil,
     revlogv0,
@@ -317,6 +318,7 @@
 
         self.radix = radix
 
+        self._docket_file = None
         self._indexfile = None
         self._datafile = None
         self._nodemap_file = None
@@ -344,6 +346,7 @@
         self._maxchainlen = None
         self._deltabothparents = True
         self.index = None
+        self._docket = None
         self._nodemap_docket = None
         # Mapping of partial identifiers to full nodes.
         self._pcache = {}
@@ -505,8 +508,23 @@
         self._generaldelta = features[b'generaldelta'](self._format_flags)
         self.hassidedata = features[b'sidedata']
 
-        index_data = entry_data
-        self._indexfile = entry_point
+        if not features[b'docket']:
+            self._indexfile = entry_point
+            index_data = entry_data
+        else:
+            self._docket_file = entry_point
+            if self._initempty:
+                self._docket = docketutil.default_docket(self, header)
+            else:
+                self._docket = docketutil.parse_docket(self, entry_data)
+            self._indexfile = self._docket.index_filepath()
+            index_data = self._get_data(self._indexfile, mmapindexthreshold)
+            self._inline = False
+            # generaldelta implied by version 2 revlogs.
+            self._generaldelta = True
+            # the logic for persistent nodemap will be dealt with within the
+            # main docket, so disable it for now.
+            self._nodemap_file = None
 
         if self.postfix is None or self.postfix == b'a':
             self._datafile = b'%s.d' % self.radix
@@ -2053,6 +2071,8 @@
                     self._writinghandles = (ifh, dfh)
                     try:
                         yield
+                        if self._docket is not None:
+                            self._docket.write(transaction)
                     finally:
                         self._writinghandles = None
                 finally:
@@ -3126,9 +3146,7 @@
     def rewrite_sidedata(self, transaction, helpers, startrev, endrev):
         if not self.hassidedata:
             return
-        # inline are not yet supported because they suffer from an issue when
-        # rewriting them (since it's not an append-only operation).
-        # See issue6485.
+        # revlog formats with sidedata support do not support inline
         assert not self._inline
         if not helpers[1] and not helpers[2]:
             # Nothing to generate or remove
--- a/mercurial/revlogutils/constants.py	Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/revlogutils/constants.py	Mon May 03 12:34:11 2021 +0200
@@ -133,20 +133,22 @@
         b'inline': _no,
         b'generaldelta': _no,
         b'sidedata': False,
+        b'docket': False,
     },
     REVLOGV1: {
         b'inline': _from_flag(FLAG_INLINE_DATA),
         b'generaldelta': _from_flag(FLAG_GENERALDELTA),
         b'sidedata': False,
+        b'docket': False,
     },
     REVLOGV2: {
-        # There is a bug in the transaction handling when going from an
-        # inline revlog to a separate index and data file. Turn it off until
-        # it's fixed, since v2 revlogs sometimes get rewritten on exchange.
-        # See issue6485
+        # The point of inline-revlog is to reduce the number of files used in
+        # the store. Using a docket defeats this purpose. So we need other
+        # means to reduce the number of files for revlogv2.
         b'inline': _no,
         b'generaldelta': _yes,
         b'sidedata': True,
+        b'docket': True,
     },
 }
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/revlogutils/docket.py	Mon May 03 12:34:11 2021 +0200
@@ -0,0 +1,80 @@
+# docket - code related to revlog "docket"
+#
+# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+### Revlog docket file
+#
+# The revlog is stored on disk using multiple files:
+#
+# * a small docket file, containing metadata and a pointer,
+#
+# * an index file, containing fixed width information about revisions,
+#
+# * a data file, containing variable width data for these revisions,
+
+from __future__ import absolute_import
+
+import struct
+
+from . import (
+    constants,
+)
+
+# Docket format
+#
+# * 4 bytes: revlog version
+#          |   This is mandatory as docket must be compatible with the previous
+#          |   revlog index header.
+S_HEADER = struct.Struct(constants.INDEX_HEADER.format)
+
+
+class RevlogDocket(object):
+    """metadata associated with revlog"""
+
+    def __init__(self, revlog, version_header=None):
+        self._version_header = version_header
+        self._dirty = False
+        self._radix = revlog.radix
+        self._path = revlog._docket_file
+        self._opener = revlog.opener
+
+    def index_filepath(self):
+        """file path to the current index file associated to this docket"""
+        # very simplistic version at first
+        return b"%s.idx" % self._radix
+
+    def write(self, transaction):
+        """write the modifications to disk if any
+
+        This makes the new content visible to all processes"""
+        if self._dirty:
+            transaction.addbackup(self._path, location=b'store')
+            with self._opener(self._path, mode=b'w', atomictemp=True) as f:
+                f.write(self._serialize())
+            self._dirty = False
+
+    def _serialize(self):
+        return S_HEADER.pack(self._version_header)
+
+
+def default_docket(revlog, version_header):
+    """given a revlog version return a new docket object for the given revlog"""
+    if (version_header & 0xFFFF) != constants.REVLOGV2:
+        return None
+    docket = RevlogDocket(revlog, version_header=version_header)
+    docket._dirty = True
+    return docket
+
+
+def parse_docket(revlog, data):
+    """given some docket data return a docket object for the given revlog"""
+    header = S_HEADER.unpack(data[: S_HEADER.size])
+    (version_header,) = header
+    docket = RevlogDocket(
+        revlog,
+        version_header=version_header,
+    )
+    return docket
--- a/mercurial/store.py	Mon May 17 15:05:24 2021 +0200
+++ b/mercurial/store.py	Mon May 03 12:34:11 2021 +0200
@@ -389,7 +389,7 @@
 ]
 
 REVLOG_FILES_MAIN_EXT = (b'.i', b'i.tmpcensored')
-REVLOG_FILES_OTHER_EXT = (b'.d', b'.n', b'.nd', b'd.tmpcensored')
+REVLOG_FILES_OTHER_EXT = (b'.idx', b'.d', b'.n', b'.nd', b'd.tmpcensored')
 # files that are "volatile" and might change between listing and streaming
 #
 # note: the ".nd" file are nodemap data and won't "change" but they might be
@@ -397,7 +397,7 @@
 REVLOG_FILES_VOLATILE_EXT = (b'.n', b'.nd')
 
 # some exception to the above matching
-EXCLUDED = re.compile(b'.*undo\.[^/]+\.nd?$')
+EXCLUDED = re.compile(b'.*undo\.[^/]+\.(nd?|i)$')
 
 
 def is_revlog(f, kind, st):
@@ -407,7 +407,7 @@
 
 
 def revlog_type(f):
-    if f.endswith(REVLOG_FILES_MAIN_EXT):
+    if f.endswith(REVLOG_FILES_MAIN_EXT) and EXCLUDED.match(f) is None:
         return FILEFLAGS_REVLOG_MAIN
     elif f.endswith(REVLOG_FILES_OTHER_EXT) and EXCLUDED.match(f) is None:
         t = FILETYPE_FILELOG_OTHER