dirstate-v2: Move fixed-size tree metadata into the docket file
author Simon Sapin <simon.sapin@octobus.net>
Thu, 15 Jul 2021 23:02:17 +0200
changeset 47682 78f7f0d490ee
parent 47681 d94118365ec5
child 47683 284a20269a97
dirstate-v2: Move fixed-size tree metadata into the docket file

Before this changeset, the dirstate-v2 data file contained not only nodes and paths that may be reused when appending to an existing file, but also some fixed-size metadata that applies to the entire tree and was added at the end of the data file for every append.

This moves that metadata into the docket file, so that repeated "append" operations without meaningful changes don’t actually need to grow any file.

Differential Revision: https://phab.mercurial-scm.org/D11098
mercurial/debugcommands.py
mercurial/dirstatemap.py
mercurial/dirstateutils/docket.py
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/dispatch.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-core/src/operations/list_tracked_files.rs
rust/hg-cpython/src/dirstate/dirstate_map.rs
rust/hg-cpython/src/dirstate/dispatch.rs
rust/hg-cpython/src/dirstate/owning.rs
rust/rhg/src/commands/status.rs
--- a/mercurial/debugcommands.py	Thu Jul 08 19:23:44 2021 +0200
+++ b/mercurial/debugcommands.py	Thu Jul 15 23:02:17 2021 +0200
@@ -999,11 +999,7 @@
     if repo.dirstate._use_dirstate_v2:
         docket = repo.dirstate._map.docket
         hash_len = 20  # 160 bits for SHA-1
-        hash_offset = docket.data_size - hash_len  # hash is at the end
-        data_filename = docket.data_filename()
-        with repo.vfs(data_filename) as f:
-            f.seek(hash_offset)
-            hash_bytes = f.read(hash_len)
+        hash_bytes = docket.tree_metadata[-hash_len:]
         ui.write(binascii.hexlify(hash_bytes) + b'\n')
 
 
--- a/mercurial/dirstatemap.py	Thu Jul 08 19:23:44 2021 +0200
+++ b/mercurial/dirstatemap.py	Thu Jul 15 23:02:17 2021 +0200
@@ -638,7 +638,7 @@
                 else:
                     data = b''
                 self._rustmap = rustmod.DirstateMap.new_v2(
-                    data, self.docket.data_size
+                    data, self.docket.data_size, self.docket.tree_metadata
                 )
                 parents = self.docket.parents
             else:
@@ -665,7 +665,7 @@
 
             # We can only append to an existing data file if there is one
             can_append = self.docket.uuid is not None
-            packed, append = self._rustmap.write_v2(now, can_append)
+            packed, meta, append = self._rustmap.write_v2(now, can_append)
             if append:
                 docket = self.docket
                 data_filename = docket.data_filename()
@@ -679,12 +679,13 @@
                         assert written == len(packed), (written, len(packed))
                 docket.data_size += len(packed)
                 docket.parents = self.parents()
+                docket.tree_metadata = meta
                 st.write(docket.serialize())
                 st.close()
             else:
                 old_docket = self.docket
                 new_docket = docketmod.DirstateDocket.with_new_uuid(
-                    self.parents(), len(packed)
+                    self.parents(), len(packed), meta
                 )
                 data_filename = new_docket.data_filename()
                 if tr:
--- a/mercurial/dirstateutils/docket.py	Thu Jul 08 19:23:44 2021 +0200
+++ b/mercurial/dirstateutils/docket.py	Thu Jul 15 23:02:17 2021 +0200
@@ -14,47 +14,60 @@
 
 V2_FORMAT_MARKER = b"dirstate-v2\n"
 
+# Must match the constant of the same name in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`
+TREE_METADATA_SIZE = 40
+
 # * 12 bytes: format marker
 # * 32 bytes: node ID of the working directory's first parent
 # * 32 bytes: node ID of the working directory's second parent
 # * 4 bytes: big-endian used size of the data file
+# * {TREE_METADATA_SIZE} bytes: tree metadata, parsed separately
 # * 1 byte: length of the data file's UUID
 # * variable: data file's UUID
 #
 # Node IDs are null-padded if shorter than 32 bytes.
 # A data file shorter than the specified used size is corrupted (truncated)
-HEADER = struct.Struct(">{}s32s32sLB".format(len(V2_FORMAT_MARKER)))
+HEADER = struct.Struct(
+    ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
+)
 
 
 class DirstateDocket(object):
     data_filename_pattern = b'dirstate.%s.d'
 
-    def __init__(self, parents, data_size, uuid):
+    def __init__(self, parents, data_size, tree_metadata, uuid):
         self.parents = parents
         self.data_size = data_size
+        self.tree_metadata = tree_metadata
         self.uuid = uuid
 
     @classmethod
-    def with_new_uuid(cls, parents, data):
-        return cls(parents, data, docket_mod.make_uid())
+    def with_new_uuid(cls, parents, data_size, tree_metadata):
+        return cls(parents, data_size, tree_metadata, docket_mod.make_uid())
 
     @classmethod
     def parse(cls, data, nodeconstants):
         if not data:
             parents = (nodeconstants.nullid, nodeconstants.nullid)
-            return cls(parents, 0, None)
-        marker, p1, p2, data_size, uuid_size = HEADER.unpack_from(data)
+            return cls(parents, 0, b'', None)
+        marker, p1, p2, data_size, meta, uuid_size = HEADER.unpack_from(data)
         if marker != V2_FORMAT_MARKER:
             raise ValueError("expected dirstate-v2 marker")
         uuid = data[HEADER.size : HEADER.size + uuid_size]
         p1 = p1[: nodeconstants.nodelen]
         p2 = p2[: nodeconstants.nodelen]
-        return cls((p1, p2), data_size, uuid)
+        return cls((p1, p2), data_size, meta, uuid)
 
     def serialize(self):
         p1, p2 = self.parents
         header = HEADER.pack(
-            V2_FORMAT_MARKER, p1, p2, self.data_size, len(self.uuid)
+            V2_FORMAT_MARKER,
+            p1,
+            p2,
+            self.data_size,
+            self.tree_metadata,
+            len(self.uuid),
         )
         return header + self.uuid
 
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -424,9 +424,10 @@
     pub fn new_v2(
         on_disk: &'on_disk [u8],
         data_size: usize,
+        metadata: &[u8],
     ) -> Result<Self, DirstateError> {
         if let Some(data) = on_disk.get(..data_size) {
-            Ok(on_disk::read(data)?)
+            Ok(on_disk::read(data, metadata)?)
         } else {
             Err(DirstateV2ParseError.into())
         }
@@ -1094,15 +1095,16 @@
         Ok(packed)
     }
 
-    /// Returns new data together with whether that data should be appended to
-    /// the existing data file whose content is at `self.on_disk` (true),
-    /// instead of written to a new data file (false).
+    /// Returns new data and metadata together with whether that data should be
+    /// appended to the existing data file whose content is at
+    /// `self.on_disk` (true), instead of written to a new data file
+    /// (false).
     #[timed]
     fn pack_v2(
         &mut self,
         now: Timestamp,
         can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError> {
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
         // TODO: how do we want to handle this in 2038?
         let now: i32 = now.0.try_into().expect("time overflow");
         let mut paths = Vec::new();
--- a/rust/hg-core/src/dirstate_tree/dispatch.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dispatch.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -182,16 +182,17 @@
     /// serialize bytes to write a dirstate data file to disk in dirstate-v2
     /// format.
     ///
-    /// Returns new data together with whether that data should be appended to
-    /// the existing data file whose content is at `self.on_disk` (true),
-    /// instead of written to a new data file (false).
+    /// Returns new data and metadata together with whether that data should be
+    /// appended to the existing data file whose content is at
+    /// `self.on_disk` (true), instead of written to a new data file
+    /// (false).
     ///
     /// Note: this is only supported by the tree dirstate map.
     fn pack_v2(
         &mut self,
         now: Timestamp,
         can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError>;
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError>;
 
     /// Run the status algorithm.
     ///
@@ -395,7 +396,7 @@
         &mut self,
         _now: Timestamp,
         _can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError> {
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
         panic!(
             "should have used dirstate_tree::DirstateMap to use the v2 format"
         )
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -47,6 +47,18 @@
 pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
 pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
 
+/// Must match the constant of the same name in
+/// `mercurial/dirstateutils/docket.py`
+const TREE_METADATA_SIZE: usize = 40;
+
+/// Make sure that size-affecting changes are made knowingly
+#[allow(unused)]
+fn static_assert_size_of() {
+    let _ = std::mem::transmute::<DocketHeader, [u8; 121]>;
+    let _ = std::mem::transmute::<TreeMetadata, [u8; TREE_METADATA_SIZE]>;
+    let _ = std::mem::transmute::<Node, [u8; 43]>;
+}
+
 // Must match `HEADER` in `mercurial/dirstateutils/docket.py`
 #[derive(BytesCast)]
 #[repr(C)]
@@ -58,6 +70,8 @@
     /// Counted in bytes
     data_size: Size,
 
+    metadata: TreeMetadata,
+
     uuid_size: u8,
 }
 
@@ -68,7 +82,7 @@
 
 #[derive(BytesCast)]
 #[repr(C)]
-struct Root {
+struct TreeMetadata {
     root_nodes: ChildNodes,
     nodes_with_entry_count: Size,
     nodes_with_copy_source_count: Size,
@@ -134,7 +148,7 @@
     ///   - All direct children of this directory (as returned by
     ///     `std::fs::read_dir`) either have a corresponding dirstate node, or
     ///     are ignored by ignore patterns whose hash is in
-    ///     `Root::ignore_patterns_hash`.
+    ///     `TreeMetadata::ignore_patterns_hash`.
     ///
     ///   This means that if `std::fs::symlink_metadata` later reports the
     ///   same modification time and ignored patterns haven’t changed, a run
@@ -205,13 +219,6 @@
 /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
 type OptPathSlice = PathSlice;
 
-/// Make sure that size-affecting changes are made knowingly
-fn _static_assert_size_of() {
-    let _ = std::mem::transmute::<DocketHeader, [u8; 81]>;
-    let _ = std::mem::transmute::<Root, [u8; 40]>;
-    let _ = std::mem::transmute::<Node, [u8; 43]>;
-}
-
 /// Unexpected file format found in `.hg/dirstate` with the "v2" format.
 ///
 /// This should only happen if Mercurial is buggy or a repository is corrupted.
@@ -242,6 +249,10 @@
         DirstateParents { p1, p2 }
     }
 
+    pub fn tree_metadata(&self) -> &[u8] {
+        self.header.metadata.as_bytes()
+    }
+
     pub fn data_size(&self) -> usize {
         // This `unwrap` could only panic on a 16-bit CPU
         self.header.data_size.get().try_into().unwrap()
@@ -265,40 +276,25 @@
     }
 }
 
-fn read_root<'on_disk>(
-    on_disk: &'on_disk [u8],
-) -> Result<&'on_disk Root, DirstateV2ParseError> {
-    // Find the `Root` at the end of the given slice
-    let root_offset = on_disk
-        .len()
-        .checked_sub(std::mem::size_of::<Root>())
-        // A non-empty slice too short is an error
-        .ok_or(DirstateV2ParseError)?;
-    let (root, _) = Root::from_bytes(&on_disk[root_offset..])
-        .map_err(|_| DirstateV2ParseError)?;
-    Ok(root)
-}
-
 pub(super) fn read<'on_disk>(
     on_disk: &'on_disk [u8],
+    metadata: &[u8],
 ) -> Result<DirstateMap<'on_disk>, DirstateV2ParseError> {
     if on_disk.is_empty() {
         return Ok(DirstateMap::empty(on_disk));
     }
-    let root = read_root(on_disk)?;
-    let mut unreachable_bytes = root.unreachable_bytes.get();
-    // Each append writes a new `Root`, so it’s never reused
-    unreachable_bytes += std::mem::size_of::<Root>() as u32;
+    let (meta, _) = TreeMetadata::from_bytes(metadata)
+        .map_err(|_| DirstateV2ParseError)?;
     let dirstate_map = DirstateMap {
         on_disk,
         root: dirstate_map::ChildNodes::OnDisk(read_nodes(
             on_disk,
-            root.root_nodes,
+            meta.root_nodes,
         )?),
-        nodes_with_entry_count: root.nodes_with_entry_count.get(),
-        nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(),
-        ignore_patterns_hash: root.ignore_patterns_hash,
-        unreachable_bytes,
+        nodes_with_entry_count: meta.nodes_with_entry_count.get(),
+        nodes_with_copy_source_count: meta.nodes_with_copy_source_count.get(),
+        ignore_patterns_hash: meta.ignore_patterns_hash,
+        unreachable_bytes: meta.unreachable_bytes.get(),
     };
     Ok(dirstate_map)
 }
@@ -530,9 +526,11 @@
 
 pub(crate) fn for_each_tracked_path<'on_disk>(
     on_disk: &'on_disk [u8],
+    metadata: &[u8],
     mut f: impl FnMut(&'on_disk HgPath),
 ) -> Result<(), DirstateV2ParseError> {
-    let root = read_root(on_disk)?;
+    let (meta, _) = TreeMetadata::from_bytes(metadata)
+        .map_err(|_| DirstateV2ParseError)?;
     fn recur<'on_disk>(
         on_disk: &'on_disk [u8],
         nodes: ChildNodes,
@@ -548,23 +546,23 @@
         }
         Ok(())
     }
-    recur(on_disk, root.root_nodes, &mut f)
+    recur(on_disk, meta.root_nodes, &mut f)
 }
 
-/// Returns new data together with whether that data should be appended to the
-/// existing data file whose content is at `dirstate_map.on_disk` (true),
-/// instead of written to a new data file (false).
+/// Returns new data and metadata, together with whether that data should be
+/// appended to the existing data file whose content is at
+/// `dirstate_map.on_disk` (true), instead of written to a new data file
+/// (false).
 pub(super) fn write(
     dirstate_map: &mut DirstateMap,
     can_append: bool,
-) -> Result<(Vec<u8>, bool), DirstateError> {
+) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
     let append = can_append && dirstate_map.write_should_append();
 
     // This ignores the space for paths, and for nodes without an entry.
     // TODO: better estimate? Skip the `Vec` and write to a file directly?
-    let size_guess = std::mem::size_of::<Root>()
-        + std::mem::size_of::<Node>()
-            * dirstate_map.nodes_with_entry_count as usize;
+    let size_guess = std::mem::size_of::<Node>()
+        * dirstate_map.nodes_with_entry_count as usize;
 
     let mut writer = Writer {
         dirstate_map,
@@ -574,7 +572,7 @@
 
     let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
 
-    let root = Root {
+    let meta = TreeMetadata {
         root_nodes,
         nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
         nodes_with_copy_source_count: dirstate_map
@@ -583,8 +581,7 @@
         unreachable_bytes: dirstate_map.unreachable_bytes.into(),
         ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
     };
-    writer.out.extend(root.as_bytes());
-    Ok((writer.out, append))
+    Ok((writer.out, meta.as_bytes().to_vec(), append))
 }
 
 struct Writer<'dmap, 'on_disk> {
--- a/rust/hg-core/src/operations/list_tracked_files.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-core/src/operations/list_tracked_files.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -22,27 +22,33 @@
 pub struct Dirstate {
     /// The `dirstate` content.
     content: Vec<u8>,
-    dirstate_v2: bool,
+    v2_metadata: Option<Vec<u8>>,
 }
 
 impl Dirstate {
     pub fn new(repo: &Repo) -> Result<Self, HgError> {
         let mut content = repo.hg_vfs().read("dirstate")?;
-        if repo.has_dirstate_v2() {
+        let v2_metadata = if repo.has_dirstate_v2() {
             let docket = read_docket(&content)?;
+            let meta = docket.tree_metadata().to_vec();
             content = repo.hg_vfs().read(docket.data_filename())?;
-        }
+            Some(meta)
+        } else {
+            None
+        };
         Ok(Self {
             content,
-            dirstate_v2: repo.has_dirstate_v2(),
+            v2_metadata,
         })
     }
 
     pub fn tracked_files(&self) -> Result<Vec<&HgPath>, DirstateError> {
         let mut files = Vec::new();
         if !self.content.is_empty() {
-            if self.dirstate_v2 {
-                for_each_tracked_path(&self.content, |path| files.push(path))?
+            if let Some(meta) = &self.v2_metadata {
+                for_each_tracked_path(&self.content, meta, |path| {
+                    files.push(path)
+                })?
             } else {
                 let _parents = parse_dirstate_entries(
                     &self.content,
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -84,12 +84,14 @@
     def new_v2(
         on_disk: PyBytes,
         data_size: usize,
+        tree_metadata: PyBytes,
     ) -> PyResult<PyObject> {
         let dirstate_error = |e: DirstateError| {
             PyErr::new::<exc::OSError, _>(py, format!("Dirstate error: {:?}", e))
         };
-        let inner = OwningDirstateMap::new_v2(py, on_disk, data_size)
-                .map_err(dirstate_error)?;
+        let inner = OwningDirstateMap::new_v2(
+            py, on_disk, data_size, tree_metadata,
+        ).map_err(dirstate_error)?;
         let map = Self::create_instance(py, Box::new(inner))?;
         Ok(map.into_object())
     }
@@ -353,9 +355,11 @@
         let mut inner = self.inner(py).borrow_mut();
         let result = inner.pack_v2(now, can_append);
         match result {
-            Ok((packed, append)) => {
+            Ok((packed, tree_metadata, append)) => {
                 let packed = PyBytes::new(py, &packed);
-                Ok((packed, append).to_py_object(py).into_object())
+                let tree_metadata = PyBytes::new(py, &tree_metadata);
+                let tuple = (packed, tree_metadata, append);
+                Ok(tuple.to_py_object(py).into_object())
             },
             Err(_) => Err(PyErr::new::<exc::OSError, _>(
                 py,
--- a/rust/hg-cpython/src/dirstate/dispatch.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-cpython/src/dirstate/dispatch.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -128,7 +128,7 @@
         &mut self,
         now: Timestamp,
         can_append: bool,
-    ) -> Result<(Vec<u8>, bool), DirstateError> {
+    ) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
         self.get_mut().pack_v2(now, can_append)
     }
 
--- a/rust/hg-cpython/src/dirstate/owning.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-cpython/src/dirstate/owning.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -49,9 +49,11 @@
         py: Python,
         on_disk: PyBytes,
         data_size: usize,
+        tree_metadata: PyBytes,
     ) -> Result<Self, DirstateError> {
         let bytes: &'_ [u8] = on_disk.data(py);
-        let map = DirstateMap::new_v2(bytes, data_size)?;
+        let map =
+            DirstateMap::new_v2(bytes, data_size, tree_metadata.data(py))?;
 
         // Like in `bytes` above, this `'_` lifetime parameter borrows from
         // the bytes buffer owned by `on_disk`.
--- a/rust/rhg/src/commands/status.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/rhg/src/commands/status.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -168,13 +168,16 @@
     let repo = invocation.repo?;
     let dirstate_data_mmap;
     let (mut dmap, parents) = if repo.has_dirstate_v2() {
+        let docket_data =
+            repo.hg_vfs().read("dirstate").io_not_found_as_none()?;
         let parents;
         let dirstate_data;
         let data_size;
-        if let Some(docket_data) =
-            repo.hg_vfs().read("dirstate").io_not_found_as_none()?
-        {
-            let docket = on_disk::read_docket(&docket_data)?;
+        let docket;
+        let tree_metadata;
+        if let Some(docket_data) = &docket_data {
+            docket = on_disk::read_docket(docket_data)?;
+            tree_metadata = docket.tree_metadata();
             parents = Some(docket.parents());
             data_size = docket.data_size();
             dirstate_data_mmap = repo
@@ -184,10 +187,12 @@
             dirstate_data = dirstate_data_mmap.as_deref().unwrap_or(b"");
         } else {
             parents = None;
+            tree_metadata = b"";
             data_size = 0;
             dirstate_data = b"";
         }
-        let dmap = DirstateMap::new_v2(dirstate_data, data_size)?;
+        let dmap =
+            DirstateMap::new_v2(dirstate_data, data_size, tree_metadata)?;
         (dmap, parents)
     } else {
         dirstate_data_mmap =