rhg: remember the inode of .hg/dirstate stable
authorRaphaël Gomès <rgomes@octobus.net>
Wed, 01 Mar 2023 16:48:09 +0100
branchstable
changeset 50245 dbe09fb038fc
parent 50244 07d030b38097
child 50246 a6b497872b97
rhg: remember the inode of .hg/dirstate This allows us to detect changes of `.hg/dirstate`, which is either the full dirstate (in dirstate-v1) or the docket file (v2) without relying on data inside the file. It only works on UNIX systems. This fixes a race condition for dirstate-v1 (as demonstrated by the test changes) and adds a confortable layer of sanity for dirstate-v2.
mercurial/dirstatemap.py
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-core/src/dirstate_tree/owning.rs
rust/hg-core/src/repo.rs
rust/hg-cpython/src/dirstate/dirstate_map.rs
tests/test-dirstate-status-write-race.t
--- a/mercurial/dirstatemap.py	Tue Feb 28 17:58:15 2023 +0100
+++ b/mercurial/dirstatemap.py	Wed Mar 01 16:48:09 2023 +0100
@@ -570,6 +570,12 @@
             testing.wait_on_cfg(self._ui, b'dirstate.pre-read-file')
             if self._use_dirstate_v2:
                 self.docket  # load the data if needed
+                inode = (
+                    self.identity.stat.st_ino
+                    if self.identity is not None
+                    and self.identity.stat is not None
+                    else None
+                )
                 testing.wait_on_cfg(self._ui, b'dirstate.post-docket-read-file')
                 if not self.docket.uuid:
                     data = b''
@@ -581,12 +587,19 @@
                         self.docket.data_size,
                         self.docket.tree_metadata,
                         self.docket.uuid,
+                        inode,
                     )
                 parents = self.docket.parents
             else:
                 self._set_identity()
+                inode = (
+                    self.identity.stat.st_ino
+                    if self.identity is not None
+                    and self.identity.stat is not None
+                    else None
+                )
                 self._map, parents = rustmod.DirstateMap.new_v1(
-                    self._readdirstatefile()
+                    self._readdirstatefile(), inode
                 )
 
             if parents and not self._dirtyparents:
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Tue Feb 28 17:58:15 2023 +0100
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Wed Mar 01 16:48:09 2023 +0100
@@ -76,6 +76,14 @@
     /// Can be `None` if using dirstate v1 or if it's a brand new dirstate.
     pub(super) old_uuid: Option<Vec<u8>>,
 
+    /// Identity of the dirstate file (for dirstate-v1) or the docket file
+    /// (v2). Used to detect if the file has changed from another process.
+    /// Since it's always written atomically, we can compare the inode to
+    /// check the file identity.
+    ///
+    /// TODO On non-Unix systems, something like hashing is a possibility?
+    pub(super) identity: Option<u64>,
+
     pub(super) dirstate_version: DirstateVersion,
 
     /// Controlled by config option `devel.dirstate.v2.data_update_mode`
@@ -468,6 +476,7 @@
             unreachable_bytes: 0,
             old_data_size: 0,
             old_uuid: None,
+            identity: None,
             dirstate_version: DirstateVersion::V1,
             write_mode: DirstateMapWriteMode::Auto,
         }
@@ -479,9 +488,10 @@
         data_size: usize,
         metadata: &[u8],
         uuid: Vec<u8>,
+        identity: Option<u64>,
     ) -> Result<Self, DirstateError> {
         if let Some(data) = on_disk.get(..data_size) {
-            Ok(on_disk::read(data, metadata, uuid)?)
+            Ok(on_disk::read(data, metadata, uuid, identity)?)
         } else {
             Err(DirstateV2ParseError::new("not enough bytes on disk").into())
         }
@@ -490,6 +500,7 @@
     #[timed]
     pub fn new_v1(
         on_disk: &'on_disk [u8],
+        identity: Option<u64>,
     ) -> Result<(Self, Option<DirstateParents>), DirstateError> {
         let mut map = Self::empty(on_disk);
         if map.on_disk.is_empty() {
@@ -531,6 +542,7 @@
             },
         )?;
         let parents = Some(parents.clone());
+        map.identity = identity;
 
         Ok((map, parents))
     }
@@ -1853,6 +1865,7 @@
             packed_len,
             metadata.as_bytes(),
             vec![],
+            None,
         )?;
 
         // Check that everything is accounted for
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Tue Feb 28 17:58:15 2023 +0100
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Wed Mar 01 16:48:09 2023 +0100
@@ -291,6 +291,7 @@
     on_disk: &'on_disk [u8],
     metadata: &[u8],
     uuid: Vec<u8>,
+    identity: Option<u64>,
 ) -> Result<DirstateMap<'on_disk>, DirstateV2ParseError> {
     if on_disk.is_empty() {
         let mut map = DirstateMap::empty(on_disk);
@@ -314,6 +315,7 @@
         unreachable_bytes: meta.unreachable_bytes.get(),
         old_data_size: on_disk.len(),
         old_uuid: Some(uuid),
+        identity,
         dirstate_version: DirstateVersion::V2,
         write_mode: DirstateMapWriteMode::Auto,
     };
--- a/rust/hg-core/src/dirstate_tree/owning.rs	Tue Feb 28 17:58:15 2023 +0100
+++ b/rust/hg-core/src/dirstate_tree/owning.rs	Wed Mar 01 16:48:09 2023 +0100
@@ -31,6 +31,7 @@
 
     pub fn new_v1<OnDisk>(
         on_disk: OnDisk,
+        identity: Option<u64>,
     ) -> Result<(Self, DirstateParents), DirstateError>
     where
         OnDisk: Deref<Target = [u8]> + Send + 'static,
@@ -42,7 +43,7 @@
             OwningDirstateMapTryBuilder {
                 on_disk,
                 map_builder: |bytes| {
-                    DirstateMap::new_v1(&bytes).map(|(dmap, p)| {
+                    DirstateMap::new_v1(&bytes, identity).map(|(dmap, p)| {
                         parents = p.unwrap_or(DirstateParents::NULL);
                         dmap
                     })
@@ -58,6 +59,7 @@
         data_size: usize,
         metadata: &[u8],
         uuid: Vec<u8>,
+        identity: Option<u64>,
     ) -> Result<Self, DirstateError>
     where
         OnDisk: Deref<Target = [u8]> + Send + 'static,
@@ -67,7 +69,9 @@
         OwningDirstateMapTryBuilder {
             on_disk,
             map_builder: |bytes| {
-                DirstateMap::new_v2(&bytes, data_size, metadata, uuid)
+                DirstateMap::new_v2(
+                    &bytes, data_size, metadata, uuid, identity,
+                )
             },
         }
         .try_build()
@@ -92,6 +96,10 @@
         self.get_map().old_uuid.as_deref()
     }
 
+    pub fn old_identity(&self) -> Option<u64> {
+        self.get_map().identity
+    }
+
     pub fn old_data_size(&self) -> usize {
         self.get_map().old_data_size
     }
--- a/rust/hg-core/src/repo.rs	Tue Feb 28 17:58:15 2023 +0100
+++ b/rust/hg-core/src/repo.rs	Wed Mar 01 16:48:09 2023 +0100
@@ -259,6 +259,15 @@
             .unwrap_or(Vec::new()))
     }
 
+    fn dirstate_identity(&self) -> Result<Option<u64>, HgError> {
+        use std::os::unix::fs::MetadataExt;
+        Ok(self
+            .hg_vfs()
+            .symlink_metadata("dirstate")
+            .io_not_found_as_none()?
+            .map(|meta| meta.ino()))
+    }
+
     pub fn dirstate_parents(&self) -> Result<DirstateParents, HgError> {
         Ok(*self
             .dirstate_parents
@@ -284,23 +293,27 @@
     /// Returns the information read from the dirstate docket necessary to
     /// check if the data file has been updated/deleted by another process
     /// since we last read the dirstate.
-    /// Namely, the data file uuid and the data size.
+    /// Namely, the inode, data file uuid and the data size.
     fn get_dirstate_data_file_integrity(
         &self,
-    ) -> Result<(Option<Vec<u8>>, usize), HgError> {
+    ) -> Result<(Option<u64>, Option<Vec<u8>>, usize), HgError> {
         assert!(
             self.has_dirstate_v2(),
             "accessing dirstate data file ID without dirstate-v2"
         );
+        // Get the identity before the contents since we could have a race
+        // between the two. Having an identity that is too old is fine, but
+        // one that is younger than the content change is bad.
+        let identity = self.dirstate_identity()?;
         let dirstate = self.dirstate_file_contents()?;
         if dirstate.is_empty() {
             self.dirstate_parents.set(DirstateParents::NULL);
-            Ok((None, 0))
+            Ok((identity, None, 0))
         } else {
             let docket =
                 crate::dirstate_tree::on_disk::read_docket(&dirstate)?;
             self.dirstate_parents.set(docket.parents());
-            Ok((Some(docket.uuid.to_owned()), docket.data_size()))
+            Ok((identity, Some(docket.uuid.to_owned()), docket.data_size()))
         }
     }
 
@@ -347,13 +360,16 @@
                 self.config(),
                 "dirstate.pre-read-file",
             );
+            let identity = self.dirstate_identity()?;
             let dirstate_file_contents = self.dirstate_file_contents()?;
             return if dirstate_file_contents.is_empty() {
                 self.dirstate_parents.set(DirstateParents::NULL);
                 Ok(OwningDirstateMap::new_empty(Vec::new()))
             } else {
-                let (map, parents) =
-                    OwningDirstateMap::new_v1(dirstate_file_contents)?;
+                let (map, parents) = OwningDirstateMap::new_v1(
+                    dirstate_file_contents,
+                    identity,
+                )?;
                 self.dirstate_parents.set(parents);
                 Ok(map)
             };
@@ -365,6 +381,7 @@
     ) -> Result<OwningDirstateMap, DirstateError> {
         debug_wait_for_file_or_print(self.config(), "dirstate.pre-read-file");
         let dirstate_file_contents = self.dirstate_file_contents()?;
+        let identity = self.dirstate_identity()?;
         if dirstate_file_contents.is_empty() {
             self.dirstate_parents.set(DirstateParents::NULL);
             return Ok(OwningDirstateMap::new_empty(Vec::new()));
@@ -410,7 +427,9 @@
                 }
                 Err(e) => return Err(e.into()),
             };
-            OwningDirstateMap::new_v2(contents, data_size, metadata, uuid)
+            OwningDirstateMap::new_v2(
+                contents, data_size, metadata, uuid, identity,
+            )
         } else {
             match self
                 .hg_vfs()
@@ -418,7 +437,7 @@
                 .io_not_found_as_none()
             {
                 Ok(Some(data_mmap)) => OwningDirstateMap::new_v2(
-                    data_mmap, data_size, metadata, uuid,
+                    data_mmap, data_size, metadata, uuid, identity,
                 ),
                 Ok(None) => {
                     // Race where the data file was deleted right after we
@@ -534,12 +553,15 @@
         // it’s unset
         let parents = self.dirstate_parents()?;
         let (packed_dirstate, old_uuid_to_remove) = if self.has_dirstate_v2() {
-            let (uuid, data_size) = self.get_dirstate_data_file_integrity()?;
+            let (identity, uuid, data_size) =
+                self.get_dirstate_data_file_integrity()?;
+            let identity_changed = identity != map.old_identity();
             let uuid_changed = uuid.as_deref() != map.old_uuid();
             let data_length_changed = data_size != map.old_data_size();
 
-            if uuid_changed || data_length_changed {
-                // If uuid or length changed since last disk read, don't write.
+            if identity_changed || uuid_changed || data_length_changed {
+                // If any of identity, uuid or length have changed since
+                // last disk read, don't write.
                 // This is fine because either we're in a command that doesn't
                 // write anything too important (like `hg status`), or we're in
                 // `hg add` and we're supposed to have taken the lock before
@@ -636,6 +658,22 @@
 
             (packed_dirstate, old_uuid)
         } else {
+            let identity = self.dirstate_identity()?;
+            if identity != map.old_identity() {
+                // If identity changed since last disk read, don't write.
+                // This is fine because either we're in a command that doesn't
+                // write anything too important (like `hg status`), or we're in
+                // `hg add` and we're supposed to have taken the lock before
+                // reading anyway.
+                //
+                // TODO complain loudly if we've changed anything important
+                // without taking the lock.
+                // (see `hg help config.format.use-dirstate-tracked-hint`)
+                log::debug!(
+                    "dirstate has changed since last read, not updating."
+                );
+                return Ok(());
+            }
             (map.pack_v1(parents)?, None)
         };
 
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs	Tue Feb 28 17:58:15 2023 +0100
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs	Wed Mar 01 16:48:09 2023 +0100
@@ -49,9 +49,10 @@
     @staticmethod
     def new_v1(
         on_disk: PyBytes,
+        identity: Option<u64>,
     ) -> PyResult<PyObject> {
         let on_disk = PyBytesDeref::new(py, on_disk);
-        let (map, parents) = OwningDirstateMap::new_v1(on_disk)
+        let (map, parents) = OwningDirstateMap::new_v1(on_disk, identity)
             .map_err(|e| dirstate_error(py, e))?;
         let map = Self::create_instance(py, map)?;
         let p1 = PyBytes::new(py, parents.p1.as_bytes());
@@ -67,6 +68,7 @@
         data_size: usize,
         tree_metadata: PyBytes,
         uuid: PyBytes,
+        identity: Option<u64>,
     ) -> PyResult<PyObject> {
         let dirstate_error = |e: DirstateError| {
             PyErr::new::<exc::OSError, _>(py, format!("Dirstate error: {:?}", e))
@@ -74,7 +76,11 @@
         let on_disk = PyBytesDeref::new(py, on_disk);
         let uuid = uuid.data(py);
         let map = OwningDirstateMap::new_v2(
-            on_disk, data_size, tree_metadata.data(py), uuid.to_owned(),
+            on_disk,
+            data_size,
+            tree_metadata.data(py),
+            uuid.to_owned(),
+            identity,
         ).map_err(dirstate_error)?;
         let map = Self::create_instance(py, map)?;
         Ok(map.into_object())
--- a/tests/test-dirstate-status-write-race.t	Tue Feb 28 17:58:15 2023 +0100
+++ b/tests/test-dirstate-status-write-race.t	Wed Mar 01 16:48:09 2023 +0100
@@ -242,12 +242,9 @@
 The file should in a "added" state
 
   $ hg status
-  A dir/n (no-rhg dirstate-v1 !)
-  A dir/n (no-dirstate-v1 !)
-  A dir/n (missing-correct-output rhg dirstate-v1 !)
+  A dir/n
   A dir/o
   R dir/nested/m
-  ? dir/n (known-bad-output rhg dirstate-v1 !)
   ? p
   ? q
 
@@ -289,22 +286,6 @@
 
 The parent must change and the status should be clean
 
-# XXX rhg misbehaves here
-#if rhg dirstate-v1
-  $ hg summary
-  parent: 1:c349430a1631 
-   more files to have two commits
-  branch: default
-  commit: 1 added, 1 removed, 3 unknown (new branch head)
-  update: 1 new changesets (update)
-  phases: 3 draft
-  $ hg status
-  A dir/o
-  R dir/nested/m
-  ? dir/n
-  ? p
-  ? q
-#else
   $ hg summary
   parent: 2:2e3b442a2fd4 tip
    created-during-status
@@ -317,7 +298,6 @@
   ? dir/n
   ? p
   ? q
-#endif
 
 The status process should return a consistent result and not crash.
 
@@ -416,9 +396,7 @@
 the first update should be on disk
 
   $ hg debugstate --all | grep "g"
-  n 644          0 2000-01-01 00:10:00 g (known-bad-output rhg dirstate-v1 !)
-  n 644          0 2000-01-01 00:25:00 g (rhg no-dirstate-v1 !)
-  n 644          0 2000-01-01 00:25:00 g (no-rhg !)
+  n 644          0 2000-01-01 00:25:00 g
 
 The status process should return a consistent result and not crash.