sidedata: move to new sidedata storage in revlogv2
author Raphaël Gomès <rgomes@octobus.net>
Mon, 18 Jan 2021 11:44:51 +0100
changeset 46709 3d740058b467
parent 46708 358737abeeef
child 46710 4cd214c9948d
sidedata: move to new sidedata storage in revlogv2

The current (experimental) sidedata system uses flagprocessors to signal the presence of sidedata and to store/retrieve it from the raw revlog data. This proved to be quite fragile from an exchange perspective and a lot more complex than simply having a dedicated space in the new revlog format. This change does not handle exchange (ironically), so the test for amend - which uses a bundle - is broken. That functionality is split into the next patches.

Differential Revision: https://phab.mercurial-scm.org/D9993
hgext/lfs/wrapper.py
hgext/remotefilelog/remotefilelog.py
mercurial/revlog.py
mercurial/revlogutils/flagutil.py
mercurial/revlogutils/sidedata.py
tests/flagprocessorext.py
tests/simplestorerepo.py
tests/test-copies-in-changeset.t
tests/test-revlog-raw.py
tests/testlib/ext-sidedata.py
--- a/hgext/lfs/wrapper.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/hgext/lfs/wrapper.py	Mon Jan 18 11:44:51 2021 +0100
@@ -116,10 +116,10 @@
     if hgmeta or text.startswith(b'\1\n'):
         text = storageutil.packmeta(hgmeta, text)
 
-    return (text, True, {})
+    return (text, True)
 
 
-def writetostore(self, text, sidedata):
+def writetostore(self, text):
     # hg filelog metadata (includes rename, etc)
     hgmeta, offset = storageutil.parsemeta(text)
     if offset and offset > 0:
--- a/hgext/remotefilelog/remotefilelog.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/hgext/remotefilelog/remotefilelog.py	Mon Jan 18 11:44:51 2021 +0100
@@ -155,12 +155,12 @@
         # text passed to "addrevision" includes hg filelog metadata header
         if node is None:
             node = storageutil.hashrevisionsha1(text, p1, p2)
-        if sidedata is None:
-            sidedata = {}
 
         meta, metaoffset = storageutil.parsemeta(text)
         rawtext, validatehash = flagutil.processflagswrite(
-            self, text, flags, sidedata=sidedata
+            self,
+            text,
+            flags,
         )
         return self.addrawrevision(
             rawtext,
--- a/mercurial/revlog.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/mercurial/revlog.py	Mon Jan 18 11:44:51 2021 +0100
@@ -120,10 +120,10 @@
 
 # Flag processors for REVIDX_ELLIPSIS.
 def ellipsisreadprocessor(rl, text):
-    return text, False, {}
-
-
-def ellipsiswriteprocessor(rl, text, sidedata):
+    return text, False
+
+
+def ellipsiswriteprocessor(rl, text):
     return text, False
 
 
@@ -554,8 +554,6 @@
         if self._mmaplargeindex and b'mmapindexthreshold' in opts:
             mmapindexthreshold = opts[b'mmapindexthreshold']
         self.hassidedata = bool(opts.get(b'side-data', False))
-        if self.hassidedata:
-            self._flagprocessors[REVIDX_SIDEDATA] = sidedatautil.processors
         self._sparserevlog = bool(opts.get(b'sparse-revlog', False))
         withsparseread = bool(opts.get(b'with-sparse-read', False))
         # sparse-revlog forces sparse-read
@@ -856,6 +854,11 @@
     def length(self, rev):
         return self.index[rev][1]
 
+    def sidedata_length(self, rev):
+        if self.version & 0xFFFF != REVLOGV2:
+            return 0
+        return self.index[rev][9]
+
     def rawsize(self, rev):
         """return the length of the uncompressed text for a given revision"""
         l = self.index[rev][2]
@@ -917,7 +920,7 @@
     # Derived from index values.
 
     def end(self, rev):
-        return self.start(rev) + self.length(rev)
+        return self.start(rev) + self.length(rev) + self.sidedata_length(rev)
 
     def parents(self, node):
         i = self.index
@@ -1853,7 +1856,7 @@
         elif operation == b'read':
             return flagutil.processflagsread(self, text, flags)
         else:  # write operation
-            return flagutil.processflagswrite(self, text, flags, None)
+            return flagutil.processflagswrite(self, text, flags)
 
     def revision(self, nodeorrev, _df=None, raw=False):
         """return an uncompressed revision of a given node or revision
@@ -1898,10 +1901,17 @@
         # revision or might need to be processed to retrieve the revision.
         rev, rawtext, validated = self._rawtext(node, rev, _df=_df)
 
+        if self.version & 0xFFFF == REVLOGV2:
+            if rev is None:
+                rev = self.rev(node)
+            sidedata = self._sidedata(rev)
+        else:
+            sidedata = {}
+
         if raw and validated:
             # if we don't want to process the raw text and that raw
             # text is cached, we can exit early.
-            return rawtext, {}
+            return rawtext, sidedata
         if rev is None:
             rev = self.rev(node)
         # the revlog's flag for this revision
@@ -1910,20 +1920,14 @@
 
         if validated and flags == REVIDX_DEFAULT_FLAGS:
             # no extra flags set, no flag processor runs, text = rawtext
-            return rawtext, {}
-
-        sidedata = {}
+            return rawtext, sidedata
+
         if raw:
             validatehash = flagutil.processflagsraw(self, rawtext, flags)
             text = rawtext
         else:
-            try:
-                r = flagutil.processflagsread(self, rawtext, flags)
-            except error.SidedataHashError as exc:
-                msg = _(b"integrity check failed on %s:%s sidedata key %d")
-                msg %= (self.indexfile, pycompat.bytestr(rev), exc.sidedatakey)
-                raise error.RevlogError(msg)
-            text, validatehash, sidedata = r
+            r = flagutil.processflagsread(self, rawtext, flags)
+            text, validatehash = r
         if validatehash:
             self.checkhash(text, node, rev=rev)
         if not validated:
@@ -1974,6 +1978,21 @@
         del basetext  # let us have a chance to free memory early
         return (rev, rawtext, False)
 
+    def _sidedata(self, rev):
+        """Return the sidedata for a given revision number."""
+        index_entry = self.index[rev]
+        sidedata_offset = index_entry[8]
+        sidedata_size = index_entry[9]
+
+        if self._inline:
+            sidedata_offset += self._io.size * (1 + rev)
+        if sidedata_size == 0:
+            return {}
+
+        segment = self._getsegment(sidedata_offset, sidedata_size)
+        sidedata = sidedatautil.deserialize_sidedata(segment)
+        return sidedata
+
     def rawdata(self, nodeorrev, _df=None):
         """return an uncompressed raw data of a given node or revision number.
 
@@ -2107,20 +2126,15 @@
 
         if sidedata is None:
             sidedata = {}
-            flags = flags & ~REVIDX_SIDEDATA
         elif not self.hassidedata:
             raise error.ProgrammingError(
                 _(b"trying to add sidedata to a revlog who don't support them")
             )
-        else:
-            flags |= REVIDX_SIDEDATA
 
         if flags:
             node = node or self.hash(text, p1, p2)
 
-        rawtext, validatehash = flagutil.processflagswrite(
-            self, text, flags, sidedata=sidedata
-        )
+        rawtext, validatehash = flagutil.processflagswrite(self, text, flags)
 
         # If the flag processor modifies the revision data, ignore any provided
         # cachedelta.
@@ -2153,6 +2167,7 @@
             flags,
             cachedelta=cachedelta,
             deltacomputer=deltacomputer,
+            sidedata=sidedata,
         )
 
     def addrawrevision(
@@ -2166,6 +2181,7 @@
         flags,
         cachedelta=None,
         deltacomputer=None,
+        sidedata=None,
     ):
         """add a raw revision with known flags, node and parents
         useful when reusing a revision not stored in this revlog (ex: received
@@ -2188,6 +2204,7 @@
                 ifh,
                 dfh,
                 deltacomputer=deltacomputer,
+                sidedata=sidedata,
             )
         finally:
             if dfh:
@@ -2281,6 +2298,7 @@
         dfh,
         alwayscache=False,
         deltacomputer=None,
+        sidedata=None,
     ):
         """internal function to add revisions to the log
 
@@ -2350,6 +2368,16 @@
 
         deltainfo = deltacomputer.finddeltainfo(revinfo, fh)
 
+        if sidedata:
+            serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
+            sidedata_offset = offset + deltainfo.deltalen
+        else:
+            serialized_sidedata = b""
+            # Don't store the offset if the sidedata is empty, that way
+            # we can easily detect empty sidedata and they will be no different
+            # than ones we manually add.
+            sidedata_offset = 0
+
         e = (
             offset_type(offset, flags),
             deltainfo.deltalen,
@@ -2359,18 +2387,24 @@
             p1r,
             p2r,
             node,
-            0,
-            0,
+            sidedata_offset,
+            len(serialized_sidedata),
         )
 
         if self.version & 0xFFFF != REVLOGV2:
             e = e[:8]
 
         self.index.append(e)
-
         entry = self._io.packentry(e, self.node, self.version, curr)
         self._writeentry(
-            transaction, ifh, dfh, entry, deltainfo.data, link, offset
+            transaction,
+            ifh,
+            dfh,
+            entry,
+            deltainfo.data,
+            link,
+            offset,
+            serialized_sidedata,
         )
 
         rawtext = btext[0]
@@ -2383,7 +2417,9 @@
         self._chainbasecache[curr] = deltainfo.chainbase
         return curr
 
-    def _writeentry(self, transaction, ifh, dfh, entry, data, link, offset):
+    def _writeentry(
+        self, transaction, ifh, dfh, entry, data, link, offset, sidedata
+    ):
         # Files opened in a+ mode have inconsistent behavior on various
         # platforms. Windows requires that a file positioning call be made
         # when the file handle transitions between reads and writes. See
@@ -2407,6 +2443,8 @@
             if data[0]:
                 dfh.write(data[0])
             dfh.write(data[1])
+            if sidedata:
+                dfh.write(sidedata)
             ifh.write(entry)
         else:
             offset += curr * self._io.size
@@ -2414,6 +2452,8 @@
             ifh.write(entry)
             ifh.write(data[0])
             ifh.write(data[1])
+            if sidedata:
+                ifh.write(sidedata)
             self._enforceinlinesize(transaction, ifh)
         nodemaputil.setup_persistent_nodemap(transaction, self)
 
--- a/mercurial/revlogutils/flagutil.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/mercurial/revlogutils/flagutil.py	Mon Jan 18 11:44:51 2021 +0100
@@ -84,7 +84,7 @@
     flagprocessors[flag] = processor
 
 
-def processflagswrite(revlog, text, flags, sidedata):
+def processflagswrite(revlog, text, flags):
     """Inspect revision data flags and applies write transformations defined
     by registered flag processors.
 
@@ -100,9 +100,12 @@
     processed text and ``validatehash`` is a bool indicating whether the
     returned text should be checked for hash integrity.
     """
-    return _processflagsfunc(revlog, text, flags, b'write', sidedata=sidedata)[
-        :2
-    ]
+    return _processflagsfunc(
+        revlog,
+        text,
+        flags,
+        b'write',
+    )[:2]
 
 
 def processflagsread(revlog, text, flags):
@@ -145,14 +148,14 @@
     return _processflagsfunc(revlog, text, flags, b'raw')[1]
 
 
-def _processflagsfunc(revlog, text, flags, operation, sidedata=None):
+def _processflagsfunc(revlog, text, flags, operation):
     """internal function to process flag on a revlog
 
     This function is private to this module, code should never needs to call it
     directly."""
     # fast path: no flag processors will run
     if flags == 0:
-        return text, True, {}
+        return text, True
     if operation not in (b'read', b'write', b'raw'):
         raise error.ProgrammingError(_(b"invalid '%s' operation") % operation)
     # Check all flags are known.
@@ -168,7 +171,6 @@
     if operation == b'write':
         orderedflags = reversed(orderedflags)
 
-    outsidedata = {}
     for flag in orderedflags:
         # If a flagprocessor has been registered for a known flag, apply the
         # related operation transform and update result tuple.
@@ -186,10 +188,9 @@
                 if operation == b'raw':
                     vhash = rawtransform(revlog, text)
                 elif operation == b'read':
-                    text, vhash, s = readtransform(revlog, text)
-                    outsidedata.update(s)
+                    text, vhash = readtransform(revlog, text)
                 else:  # write operation
-                    text, vhash = writetransform(revlog, text, sidedata)
+                    text, vhash = writetransform(revlog, text)
             validatehash = validatehash and vhash
 
-    return text, validatehash, outsidedata
+    return text, validatehash
--- a/mercurial/revlogutils/sidedata.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/mercurial/revlogutils/sidedata.py	Mon Jan 18 11:44:51 2021 +0100
@@ -13,9 +13,8 @@
 The current implementation is experimental and subject to changes. Do not rely
 on it in production.
 
-Sidedata are stored in the revlog itself, within the revision rawtext. They
-are inserted and removed from it using the flagprocessors mechanism. The following
-format is currently used::
+Sidedata are stored in the revlog itself, thanks to a new version of the
+revlog. The following format is currently used::
 
     initial header:
         <number of sidedata; 2 bytes>
@@ -60,48 +59,35 @@
 SIDEDATA_ENTRY = struct.Struct('>HL20s')
 
 
-def sidedatawriteprocessor(rl, text, sidedata):
+def serialize_sidedata(sidedata):
     sidedata = list(sidedata.items())
     sidedata.sort()
-    rawtext = [SIDEDATA_HEADER.pack(len(sidedata))]
+    buf = [SIDEDATA_HEADER.pack(len(sidedata))]
     for key, value in sidedata:
         digest = hashutil.sha1(value).digest()
-        rawtext.append(SIDEDATA_ENTRY.pack(key, len(value), digest))
+        buf.append(SIDEDATA_ENTRY.pack(key, len(value), digest))
     for key, value in sidedata:
-        rawtext.append(value)
-    rawtext.append(bytes(text))
-    return b''.join(rawtext), False
+        buf.append(value)
+    buf = b''.join(buf)
+    return buf
 
 
-def sidedatareadprocessor(rl, text):
+def deserialize_sidedata(blob):
     sidedata = {}
     offset = 0
-    (nbentry,) = SIDEDATA_HEADER.unpack(text[: SIDEDATA_HEADER.size])
+    (nbentry,) = SIDEDATA_HEADER.unpack(blob[: SIDEDATA_HEADER.size])
     offset += SIDEDATA_HEADER.size
     dataoffset = SIDEDATA_HEADER.size + (SIDEDATA_ENTRY.size * nbentry)
     for i in range(nbentry):
         nextoffset = offset + SIDEDATA_ENTRY.size
-        key, size, storeddigest = SIDEDATA_ENTRY.unpack(text[offset:nextoffset])
+        key, size, storeddigest = SIDEDATA_ENTRY.unpack(blob[offset:nextoffset])
         offset = nextoffset
         # read the data associated with that entry
         nextdataoffset = dataoffset + size
-        entrytext = text[dataoffset:nextdataoffset]
+        entrytext = bytes(blob[dataoffset:nextdataoffset])
         readdigest = hashutil.sha1(entrytext).digest()
         if storeddigest != readdigest:
             raise error.SidedataHashError(key, storeddigest, readdigest)
         sidedata[key] = entrytext
         dataoffset = nextdataoffset
-    text = text[dataoffset:]
-    return text, True, sidedata
-
-
-def sidedatarawprocessor(rl, text):
-    # side data modifies rawtext and prevent rawtext hash validation
-    return False
-
-
-processors = (
-    sidedatareadprocessor,
-    sidedatawriteprocessor,
-    sidedatarawprocessor,
-)
+    return sidedata
--- a/tests/flagprocessorext.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/tests/flagprocessorext.py	Mon Jan 18 11:44:51 2021 +0100
@@ -31,28 +31,28 @@
     return False
 
 
-def noopdonothing(self, text, sidedata):
+def noopdonothing(self, text):
     return (text, True)
 
 
 def noopdonothingread(self, text):
-    return (text, True, {})
+    return (text, True)
 
 
-def b64encode(self, text, sidedata):
+def b64encode(self, text):
     return (base64.b64encode(text), False)
 
 
 def b64decode(self, text):
-    return (base64.b64decode(text), True, {})
+    return (base64.b64decode(text), True)
 
 
-def gzipcompress(self, text, sidedata):
+def gzipcompress(self, text):
     return (zlib.compress(text), False)
 
 
 def gzipdecompress(self, text):
-    return (zlib.decompress(text), True, {})
+    return (zlib.decompress(text), True)
 
 
 def supportedoutgoingversions(orig, repo):
--- a/tests/simplestorerepo.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/tests/simplestorerepo.py	Mon Jan 18 11:44:51 2021 +0100
@@ -300,7 +300,7 @@
             text = rawtext
         else:
             r = flagutil.processflagsread(self, rawtext, flags)
-            text, validatehash, sidedata = r
+            text, validatehash = r
         if validatehash:
             self.checkhash(text, node, rev=rev)
 
--- a/tests/test-copies-in-changeset.t	Wed Jan 20 18:35:12 2021 +0100
+++ b/tests/test-copies-in-changeset.t	Mon Jan 18 11:44:51 2021 +0100
@@ -271,12 +271,13 @@
   $ hg ci --amend -m 'copy a to j, v2'
   saved backup bundle to $TESTTMP/repo/.hg/strip-backup/*-*-amend.hg (glob)
   $ hg debugsidedata -c -v -- -1
-  1 sidedata entries
-   entry-0014 size 24
-    '\x00\x00\x00\x02\x00\x00\x00\x00\x01\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00aj'
+  1 sidedata entries (missing-correct-output !)
+   entry-0014 size 24 (missing-correct-output !)
+    '\x00\x00\x00\x02\x00\x00\x00\x00\x01\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00aj' (missing-correct-output !)
 #endif
   $ hg showcopies --config experimental.copies.read-from=filelog-only
-  a -> j
+  a -> j (sidedata missing-correct-output !)
+  a -> j (no-sidedata !)
 The entries should be written to extras even if they're empty (so the client
 won't have to fall back to reading from filelogs)
   $ echo x >> j
@@ -354,7 +355,8 @@
   saved backup bundle to $TESTTMP/rebase-rename/.hg/strip-backup/*-*-rebase.hg (glob)
   $ hg st --change . --copies
   A b
-    a
+    a (sidedata missing-correct-output !)
+    a (no-sidedata !)
   R a
   $ cd ..
 
--- a/tests/test-revlog-raw.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/tests/test-revlog-raw.py	Mon Jan 18 11:44:51 2021 +0100
@@ -51,10 +51,10 @@
 def readprocessor(self, rawtext):
     # True: the returned text could be used to verify hash
     text = rawtext[len(_extheader) :].replace(b'i', b'1')
-    return text, True, {}
+    return text, True
 
 
-def writeprocessor(self, text, sidedata):
+def writeprocessor(self, text):
     # False: the returned rawtext shouldn't be used to verify hash
     rawtext = _extheader + text.replace(b'1', b'i')
     return rawtext, False
@@ -293,7 +293,7 @@
 
         # Verify text, rawtext, and rawsize
         if isext:
-            rawtext = writeprocessor(None, text, {})[0]
+            rawtext = writeprocessor(None, text)[0]
         else:
             rawtext = text
         if rlog.rawsize(rev) != len(rawtext):
--- a/tests/testlib/ext-sidedata.py	Wed Jan 20 18:35:12 2021 +0100
+++ b/tests/testlib/ext-sidedata.py	Mon Jan 18 11:44:51 2021 +0100
@@ -40,19 +40,20 @@
     return orig(self, text, transaction, link, p1, p2, *args, **kwargs)
 
 
-def wraprevision(orig, self, nodeorrev, *args, **kwargs):
-    text = orig(self, nodeorrev, *args, **kwargs)
+def wrap_revisiondata(orig, self, nodeorrev, *args, **kwargs):
+    text, sd = orig(self, nodeorrev, *args, **kwargs)
     if getattr(self, 'sidedatanocheck', False):
-        return text
+        return text, sd
+    if self.version & 0xFFFF != 2:
+        return text, sd
     if nodeorrev != nullrev and nodeorrev != nullid:
-        sd = self.sidedata(nodeorrev)
         if len(text) != struct.unpack('>I', sd[sidedata.SD_TEST1])[0]:
             raise RuntimeError('text size mismatch')
         expected = sd[sidedata.SD_TEST2]
         got = hashlib.sha256(text).digest()
         if got != expected:
             raise RuntimeError('sha256 mismatch')
-    return text
+    return text, sd
 
 
 def wrapgetsidedatacompanion(orig, srcrepo, dstrepo):
@@ -81,7 +82,7 @@
 
 def extsetup(ui):
     extensions.wrapfunction(revlog.revlog, 'addrevision', wrapaddrevision)
-    extensions.wrapfunction(revlog.revlog, 'revision', wraprevision)
+    extensions.wrapfunction(revlog.revlog, '_revisiondata', wrap_revisiondata)
     extensions.wrapfunction(
         upgrade_engine, 'getsidedatacompanion', wrapgetsidedatacompanion
     )