dirstate-v2: Support appending to the same data file
authorSimon Sapin <simon.sapin@octobus.net>
Tue, 13 Jul 2021 17:18:23 +0200
changeset 47678 065e61628980
parent 47677 da1c0cd68d53
child 47679 731286dc5321
dirstate-v2: Support appending to the same data file For now we’re still writing the entire data every time, so appending is not useful yet. Later we’ll have new nodes pointing to some existing data for nodes and paths that haven’t changed. The decision whether to append is pseudo-random in order to make tests exercise both code paths. This will be replaced by a heuristic based on the amount of unused existing data. Differential Revision: https://phab.mercurial-scm.org/D11094
mercurial/dirstatemap.py
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/dispatch.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-cpython/src/dirstate/dirstate_map.rs
rust/hg-cpython/src/dirstate/dispatch.rs
--- a/mercurial/dirstatemap.py	Tue Jul 13 09:44:44 2021 +0200
+++ b/mercurial/dirstatemap.py	Tue Jul 13 17:18:23 2021 +0200
@@ -655,13 +655,41 @@
             return self._rustmap
 
         def write(self, tr, st, now):
-            if self._use_dirstate_v2:
-                packed = self._rustmap.write_v2(now)
+            if not self._use_dirstate_v2:
+                p1, p2 = self.parents()
+                packed = self._rustmap.write_v1(p1, p2, now)
+                st.write(packed)
+                st.close()
+                self._dirtyparents = False
+                return
+
+            # We can only append to an existing data file if there is one
+            can_append = self.docket.uuid is not None
+            packed, append = self._rustmap.write_v2(now, can_append)
+            if append:
+                docket = self.docket
+                data_filename = docket.data_filename()
+                if tr:
+                    tr.add(data_filename, docket.data_size)
+                with self._opener(data_filename, b'r+b') as fp:
+                    fp.seek(docket.data_size)
+                    assert fp.tell() == docket.data_size
+                    written = fp.write(packed)
+                    if written is not None:  # py2 may return None
+                        assert written == len(packed), (written, len(packed))
+                docket.data_size += len(packed)
+                docket.parents = self.parents()
+                st.write(docket.serialize())
+                st.close()
+            else:
                 old_docket = self.docket
                 new_docket = docketmod.DirstateDocket.with_new_uuid(
                     self.parents(), len(packed)
                 )
-                self._opener.write(new_docket.data_filename(), packed)
+                data_filename = new_docket.data_filename()
+                if tr:
+                    tr.add(data_filename, 0)
+                self._opener.write(data_filename, packed)
                 # Write the new docket after the new data file has been
                 # written. Because `st` was opened with `atomictemp=True`,
                 # the actual `.hg/dirstate` file is only affected on close.
@@ -670,13 +698,16 @@
                 # Remove the old data file after the new docket pointing to
                 # the new data file was written.
                 if old_docket.uuid:
-                    self._opener.unlink(old_docket.data_filename())
+                    data_filename = old_docket.data_filename()
+                    unlink = lambda _tr=None: self._opener.unlink(data_filename)
+                    if tr:
+                        category = b"dirstate-v2-clean-" + old_docket.uuid
+                        tr.addpostclose(category, unlink)
+                    else:
+                        unlink()
                 self._docket = new_docket
-            else:
-                p1, p2 = self.parents()
-                packed = self._rustmap.write_v1(p1, p2, now)
-                st.write(packed)
-                st.close()
+            # Reload from the newly-written file
+            util.clearcachedproperty(self, b"_rustmap")
             self._dirtyparents = False
 
         @propertycache
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Tue Jul 13 09:44:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Tue Jul 13 17:18:23 2021 +0200
@@ -468,6 +468,24 @@
         Ok((map, parents))
     }
 
+    /// Assuming dirstate-v2 format, returns whether the next write should
+    /// append to the existing data file that contains `self.on_disk` (true),
+    /// or create a new data file from scratch (false).
+    pub(super) fn write_should_append(&self) -> bool {
+        // Soon this will be a heuristic based on the amount of unreachable
+        // data. For now it’s pseudo-random in order to make tests exercise
+        // both code paths.
+
+        fn bad_rng() -> u32 {
+            std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap()
+                .subsec_millis()
+        }
+
+        bad_rng() % 2 == 0
+    }
+
     fn get_node<'tree>(
         &'tree self,
         path: &HgPath,
@@ -1043,8 +1061,15 @@
         Ok(packed)
     }
 
+    /// Returns new data together with whether that data should be appended to
+    /// the existing data file whose content is at `self.on_disk` (true),
+    /// instead of written to a new data file (false).
     #[timed]
-    fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError> {
+    fn pack_v2(
+        &mut self,
+        now: Timestamp,
+        can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError> {
         // TODO: how do we want to handle this in 2038?
         let now: i32 = now.0.try_into().expect("time overflow");
         let mut paths = Vec::new();
@@ -1063,7 +1088,7 @@
 
         self.clear_known_ambiguous_mtimes(&paths)?;
 
-        on_disk::write(self)
+        on_disk::write(self, can_append)
     }
 
     fn status<'a>(
--- a/rust/hg-core/src/dirstate_tree/dispatch.rs	Tue Jul 13 09:44:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dispatch.rs	Tue Jul 13 17:18:23 2021 +0200
@@ -179,11 +179,19 @@
 
     /// Clear mtimes that are ambigous with `now` (similar to
     /// `clear_ambiguous_times` but for all files in the dirstate map), and
-    /// serialize bytes to write the `.hg/dirstate` file to disk in dirstate-v2
+    /// serialize bytes to write a dirstate data file to disk in dirstate-v2
     /// format.
     ///
+    /// Returns new data together with whether that data should be appended to
+    /// the existing data file whose content is at `self.on_disk` (true),
+    /// instead of written to a new data file (false).
+    ///
     /// Note: this is only supported by the tree dirstate map.
-    fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError>;
+    fn pack_v2(
+        &mut self,
+        now: Timestamp,
+        can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError>;
 
     /// Run the status algorithm.
     ///
@@ -383,7 +391,11 @@
         self.pack(parents, now)
     }
 
-    fn pack_v2(&mut self, _now: Timestamp) -> Result<Vec<u8>, DirstateError> {
+    fn pack_v2(
+        &mut self,
+        _now: Timestamp,
+        _can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError> {
         panic!(
             "should have used dirstate_tree::DirstateMap to use the v2 format"
         )
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Tue Jul 13 09:44:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Tue Jul 13 17:18:23 2021 +0200
@@ -544,20 +544,28 @@
     recur(on_disk, root.root_nodes, &mut f)
 }
 
+/// Returns new data together with whether that data should be appended to the
+/// existing data file whose content is at `dirstate_map.on_disk` (true),
+/// instead of written to a new data file (false).
 pub(super) fn write(
     dirstate_map: &mut DirstateMap,
-) -> Result<Vec<u8>, DirstateError> {
-    let root_len = std::mem::size_of::<Root>();
+    can_append: bool,
+) -> Result<(Vec<u8>, bool), DirstateError> {
+    let append = can_append && dirstate_map.write_should_append();
 
     // This ignores the space for paths, and for nodes without an entry.
     // TODO: better estimate? Skip the `Vec` and write to a file directly?
-    let size_guess = root_len
+    let size_guess = std::mem::size_of::<Root>()
         + std::mem::size_of::<Node>()
             * dirstate_map.nodes_with_entry_count as usize;
-    let mut out = Vec::with_capacity(size_guess);
 
-    let root_nodes =
-        write_nodes(dirstate_map, dirstate_map.root.as_ref(), &mut out)?;
+    let mut writer = Writer {
+        dirstate_map,
+        append,
+        out: Vec::with_capacity(size_guess),
+    };
+
+    let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
 
     let root = Root {
         root_nodes,
@@ -567,112 +575,121 @@
             .into(),
         ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
     };
-    out.extend(root.as_bytes());
-    Ok(out)
+    writer.out.extend(root.as_bytes());
+    Ok((writer.out, append))
+}
+
+struct Writer<'dmap, 'on_disk> {
+    dirstate_map: &'dmap DirstateMap<'on_disk>,
+    append: bool,
+    out: Vec<u8>,
 }
 
-fn write_nodes(
-    dirstate_map: &DirstateMap,
-    nodes: dirstate_map::ChildNodesRef,
-    out: &mut Vec<u8>,
-) -> Result<ChildNodes, DirstateError> {
-    // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
-    // order. Sort to enable binary search in the written file.
-    let nodes = nodes.sorted();
-    let nodes_len = nodes.len();
+impl Writer<'_, '_> {
+    fn write_nodes(
+        &mut self,
+        nodes: dirstate_map::ChildNodesRef,
+    ) -> Result<ChildNodes, DirstateError> {
+        // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
+        // order. Sort to enable binary search in the written file.
+        let nodes = nodes.sorted();
+        let nodes_len = nodes.len();
 
-    // First accumulate serialized nodes in a `Vec`
-    let mut on_disk_nodes = Vec::with_capacity(nodes_len);
-    for node in nodes {
-        let children = write_nodes(
-            dirstate_map,
-            node.children(dirstate_map.on_disk)?,
-            out,
-        )?;
-        let full_path = node.full_path(dirstate_map.on_disk)?;
-        let full_path = write_path(full_path.as_bytes(), out);
-        let copy_source =
-            if let Some(source) = node.copy_source(dirstate_map.on_disk)? {
-                write_path(source.as_bytes(), out)
+        // First accumulate serialized nodes in a `Vec`
+        let mut on_disk_nodes = Vec::with_capacity(nodes_len);
+        for node in nodes {
+            let children =
+                self.write_nodes(node.children(self.dirstate_map.on_disk)?)?;
+            let full_path = node.full_path(self.dirstate_map.on_disk)?;
+            let full_path = self.write_path(full_path.as_bytes());
+            let copy_source = if let Some(source) =
+                node.copy_source(self.dirstate_map.on_disk)?
+            {
+                self.write_path(source.as_bytes())
             } else {
                 PathSlice {
                     start: 0.into(),
                     len: 0.into(),
                 }
             };
-        on_disk_nodes.push(match node {
-            NodeRef::InMemory(path, node) => {
-                let (state, data) = match &node.data {
-                    dirstate_map::NodeData::Entry(entry) => (
-                        entry.state.into(),
-                        Entry {
-                            mode: entry.mode.into(),
-                            mtime: entry.mtime.into(),
-                            size: entry.size.into(),
-                        },
-                    ),
-                    dirstate_map::NodeData::CachedDirectory { mtime } => {
-                        (b'd', Entry::from_timestamp(*mtime))
+            on_disk_nodes.push(match node {
+                NodeRef::InMemory(path, node) => {
+                    let (state, data) = match &node.data {
+                        dirstate_map::NodeData::Entry(entry) => (
+                            entry.state.into(),
+                            Entry {
+                                mode: entry.mode.into(),
+                                mtime: entry.mtime.into(),
+                                size: entry.size.into(),
+                            },
+                        ),
+                        dirstate_map::NodeData::CachedDirectory { mtime } => {
+                            (b'd', Entry::from_timestamp(*mtime))
+                        }
+                        dirstate_map::NodeData::None => (
+                            b'\0',
+                            Entry {
+                                mode: 0.into(),
+                                mtime: 0.into(),
+                                size: 0.into(),
+                            },
+                        ),
+                    };
+                    Node {
+                        children,
+                        copy_source,
+                        full_path,
+                        base_name_start: u16::try_from(path.base_name_start())
+                            // Could only panic for paths over 64 KiB
+                            .expect("dirstate-v2 path length overflow")
+                            .into(),
+                        descendants_with_entry_count: node
+                            .descendants_with_entry_count
+                            .into(),
+                        tracked_descendants_count: node
+                            .tracked_descendants_count
+                            .into(),
+                        state,
+                        data,
                     }
-                    dirstate_map::NodeData::None => (
-                        b'\0',
-                        Entry {
-                            mode: 0.into(),
-                            mtime: 0.into(),
-                            size: 0.into(),
-                        },
-                    ),
-                };
-                Node {
+                }
+                NodeRef::OnDisk(node) => Node {
                     children,
                     copy_source,
                     full_path,
-                    base_name_start: u16::try_from(path.base_name_start())
-                        // Could only panic for paths over 64 KiB
-                        .expect("dirstate-v2 path length overflow")
-                        .into(),
-                    descendants_with_entry_count: node
-                        .descendants_with_entry_count
-                        .into(),
-                    tracked_descendants_count: node
-                        .tracked_descendants_count
-                        .into(),
-                    state,
-                    data,
-                }
-            }
-            NodeRef::OnDisk(node) => Node {
-                children,
-                copy_source,
-                full_path,
-                ..*node
-            },
-        })
+                    ..*node
+                },
+            })
+        }
+        // … so we can write them contiguously, after writing everything else
+        // they refer to.
+        let start = self.current_offset();
+        let len = u32::try_from(nodes_len)
+            // Could only panic with over 4 billion nodes
+            .expect("dirstate-v2 path length overflow")
+            .into();
+        self.out.extend(on_disk_nodes.as_bytes());
+        Ok(ChildNodes { start, len })
     }
-    // … so we can write them contiguously, after writing everything else they
-    // refer to.
-    let start = current_offset(out);
-    let len = u32::try_from(nodes_len)
-        // Could only panic with over 4 billion nodes
-        .expect("dirstate-v2 path length overflow")
-        .into();
-    out.extend(on_disk_nodes.as_bytes());
-    Ok(ChildNodes { start, len })
-}
 
-fn current_offset(out: &Vec<u8>) -> Offset {
-    u32::try_from(out.len())
-        // Could only panic for a dirstate file larger than 4 GiB
-        .expect("dirstate-v2 offset overflow")
-        .into()
-}
+    fn current_offset(&mut self) -> Offset {
+        let mut offset = self.out.len();
+        if self.append {
+            offset += self.dirstate_map.on_disk.len()
+        }
+        u32::try_from(offset)
+            // Could only panic for a dirstate file larger than 4 GiB
+            .expect("dirstate-v2 offset overflow")
+            .into()
+    }
 
-fn write_path(slice: &[u8], out: &mut Vec<u8>) -> PathSlice {
-    let start = current_offset(out);
-    let len = u16::try_from(slice.len())
-        // Could only panic for paths over 64 KiB
-        .expect("dirstate-v2 path length overflow")
-        .into();
-    out.extend(slice.as_bytes());
-    PathSlice { start, len }
+    fn write_path(&mut self, slice: &[u8]) -> PathSlice {
+        let start = self.current_offset();
+        let len = u16::try_from(slice.len())
+            // Could only panic for paths over 64 KiB
+            .expect("dirstate-v2 path length overflow")
+            .into();
+        self.out.extend(slice.as_bytes());
+        PathSlice { start, len }
+    }
 }
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs	Tue Jul 13 09:44:44 2021 +0200
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs	Tue Jul 13 17:18:23 2021 +0200
@@ -340,16 +340,23 @@
         }
     }
 
+    /// Returns new data together with whether that data should be appended to
+    /// the existing data file whose content is at `self.on_disk` (True),
+    /// instead of written to a new data file (False).
     def write_v2(
         &self,
-        now: PyObject
-    ) -> PyResult<PyBytes> {
+        now: PyObject,
+        can_append: bool,
+    ) -> PyResult<PyObject> {
         let now = Timestamp(now.extract(py)?);
 
         let mut inner = self.inner(py).borrow_mut();
-        let result = inner.pack_v2(now);
+        let result = inner.pack_v2(now, can_append);
         match result {
-            Ok(packed) => Ok(PyBytes::new(py, &packed)),
+            Ok((packed, append)) => {
+                let packed = PyBytes::new(py, &packed);
+                Ok((packed, append).to_py_object(py).into_object())
+            },
             Err(_) => Err(PyErr::new::<exc::OSError, _>(
                 py,
                 "Dirstate error".to_string(),
--- a/rust/hg-cpython/src/dirstate/dispatch.rs	Tue Jul 13 09:44:44 2021 +0200
+++ b/rust/hg-cpython/src/dirstate/dispatch.rs	Tue Jul 13 17:18:23 2021 +0200
@@ -124,8 +124,12 @@
         self.get_mut().pack_v1(parents, now)
     }
 
-    fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError> {
-        self.get_mut().pack_v2(now)
+    fn pack_v2(
+        &mut self,
+        now: Timestamp,
+        can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError> {
+        self.get_mut().pack_v2(now, can_append)
     }
 
     fn status<'a>(