dirstate-v2: Truncate directory mtimes to 31 bits of seconds
authorSimon Sapin <simon.sapin@octobus.net>
Tue, 12 Oct 2021 16:38:13 +0200
changeset 48193 320de901896a
parent 48192 d2f760c2c91c
child 48194 1000db4a71f1
dirstate-v2: Truncate directory mtimes to 31 bits of seconds … instead of 64 bits, while keeping the sub-second presision. This brings the size of one timestamp from 12 bytes to 8 bytes. 31 bits is chosen instead of 32 because that’s already what happens for the mtime of files and symlinks, because dirstate-v1 uses negative i32 values as markers. Later we’ll add sub-second precision for file/symlink mtimes, making their dirstate-v2 representation the same as for directories. Differential Revision: https://phab.mercurial-scm.org/D11633
mercurial/helptext/internals/dirstate-v2.txt
rust/hg-core/src/dirstate/entry.rs
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-core/src/dirstate_tree/status.rs
--- a/mercurial/helptext/internals/dirstate-v2.txt	Tue Oct 12 16:20:05 2021 +0200
+++ b/mercurial/helptext/internals/dirstate-v2.txt	Tue Oct 12 16:38:13 2021 +0200
@@ -439,23 +439,24 @@
   If an untracked node `HAS_MTIME` *unset*, this space is unused:
 
   * Offset 31:
-    12 bytes set to zero
+    12 unused bytes, set to zero
 
   If an untracked node `HAS_MTIME` *set*,
   what follows is the modification time of a directory
   represented similarly to the C `timespec` struct:
 
   * Offset 31:
+    4 unused bytes, set to zero
+
+  * Offset 35:
     The number of seconds elapsed since the Unix epoch,
-    as a signed (two’s complement) 64-bit integer.
+    truncated to its lower 31 bits,
+    as a 32-bit integer.
 
   * Offset 39:
-    The number of nanoseconds elapsed since
-    the instant specified by the previous field alone,
+    The sub-second number of nanoseconds elapsed since the Unix epoch,
     as 32-bit integer.
     Always greater than or equal to zero, and strictly less than a billion.
-    Increasing this component makes the modification time
-    go forward in time regardless of the sign of the seconds component.
 
   The presence of a directory modification time means that at some point,
   this path in the working directory was observed:
--- a/rust/hg-core/src/dirstate/entry.rs	Tue Oct 12 16:20:05 2021 +0200
+++ b/rust/hg-core/src/dirstate/entry.rs	Tue Oct 12 16:38:13 2021 +0200
@@ -1,3 +1,4 @@
+use crate::dirstate_tree::on_disk::DirstateV2ParseError;
 use crate::errors::HgError;
 use bitflags::bitflags;
 use std::convert::TryFrom;
@@ -29,34 +30,76 @@
     }
 }
 
-#[derive(Copy, Clone, PartialEq)]
-pub struct Timestamp {
-    seconds: i64,
-
-    /// In `0 .. 1_000_000_000`.
-    ///
-    /// This timestamp is after `(seconds, 0)` by this many nanoseconds.
+/// A Unix timestamp with nanoseconds precision
+#[derive(Copy, Clone)]
+pub struct TruncatedTimestamp {
+    truncated_seconds: u32,
+    /// Always in the `0 .. 1_000_000_000` range.
     nanoseconds: u32,
 }
 
-impl Timestamp {
-    pub fn new(seconds: i64, nanoseconds: u32) -> Self {
+impl TruncatedTimestamp {
+    /// Constructs from a timestamp potentially outside of the supported range,
+    /// and truncate the seconds components to its lower 31 bits.
+    ///
+    /// Panics if the nanoseconds components is not in the expected range.
+    pub fn new_truncate(seconds: i64, nanoseconds: u32) -> Self {
+        assert!(nanoseconds < NSEC_PER_SEC);
         Self {
-            seconds,
+            truncated_seconds: seconds as u32 & RANGE_MASK_31BIT,
             nanoseconds,
         }
     }
 
-    pub fn seconds(&self) -> i64 {
-        self.seconds
+    /// Construct from components. Returns an error if they are not in the
+    /// expcted range.
+    pub fn from_already_truncated(
+        truncated_seconds: u32,
+        nanoseconds: u32,
+    ) -> Result<Self, DirstateV2ParseError> {
+        if truncated_seconds & !RANGE_MASK_31BIT == 0
+            && nanoseconds < NSEC_PER_SEC
+        {
+            Ok(Self {
+                truncated_seconds,
+                nanoseconds,
+            })
+        } else {
+            Err(DirstateV2ParseError)
+        }
     }
 
+    /// The lower 31 bits of the number of seconds since the epoch.
+    pub fn truncated_seconds(&self) -> u32 {
+        self.truncated_seconds
+    }
+
+    /// The sub-second component of this timestamp, in nanoseconds.
+    /// Always in the `0 .. 1_000_000_000` range.
+    ///
+    /// This timestamp is after `(seconds, 0)` by this many nanoseconds.
     pub fn nanoseconds(&self) -> u32 {
         self.nanoseconds
     }
+
+    /// Returns whether two timestamps are equal modulo 2**31 seconds.
+    ///
+    /// If this returns `true`, the original values converted from `SystemTime`
+    /// or given to `new_truncate` were very likely equal. A false positive is
+    /// possible if they were exactly a multiple of 2**31 seconds apart (around
+    /// 68 years). This is deemed very unlikely to happen by chance, especially
+    /// on filesystems that support sub-second precision.
+    ///
+    /// If someone is manipulating the modification times of some files to
+    /// intentionally make `hg status` return incorrect results, not truncating
+    /// wouldn’t help much since they can set exactly the expected timestamp.
+    pub fn very_likely_equal(&self, other: &Self) -> bool {
+        self.truncated_seconds == other.truncated_seconds
+            && self.nanoseconds == other.nanoseconds
+    }
 }
 
-impl From<SystemTime> for Timestamp {
+impl From<SystemTime> for TruncatedTimestamp {
     fn from(system_time: SystemTime) -> Self {
         // On Unix, `SystemTime` is a wrapper for the `timespec` C struct:
         // https://www.gnu.org/software/libc/manual/html_node/Time-Types.html#index-struct-timespec
@@ -83,20 +126,17 @@
                     // For example if `system_time` was 4.3 seconds before
                     // the Unix epoch we get a Duration that represents
                     // `(-4, -0.3)` but we want `(-5, +0.7)`:
-                    const NSEC_PER_SEC: u32 = 1_000_000_000;
                     seconds = -1 - negative_secs;
                     nanoseconds = NSEC_PER_SEC - negative_nanos;
                 }
             }
         };
-        Self {
-            seconds,
-            nanoseconds,
-        }
+        Self::new_truncate(seconds, nanoseconds)
     }
 }
 
-pub const V1_RANGEMASK: i32 = 0x7FFFFFFF;
+const NSEC_PER_SEC: u32 = 1_000_000_000;
+const RANGE_MASK_31BIT: u32 = 0x7FFF_FFFF;
 
 pub const MTIME_UNSET: i32 = -1;
 
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Tue Oct 12 16:20:05 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Tue Oct 12 16:38:13 2021 +0200
@@ -14,6 +14,7 @@
 use crate::dirstate::parsers::Timestamp;
 use crate::dirstate::CopyMapIter;
 use crate::dirstate::StateMapIter;
+use crate::dirstate::TruncatedTimestamp;
 use crate::dirstate::SIZE_FROM_OTHER_PARENT;
 use crate::dirstate::SIZE_NON_NORMAL;
 use crate::matchers::Matcher;
@@ -330,12 +331,12 @@
 
     pub(super) fn cached_directory_mtime(
         &self,
-    ) -> Option<crate::dirstate::Timestamp> {
+    ) -> Result<Option<TruncatedTimestamp>, DirstateV2ParseError> {
         match self {
-            NodeRef::InMemory(_path, node) => match node.data {
+            NodeRef::InMemory(_path, node) => Ok(match node.data {
                 NodeData::CachedDirectory { mtime } => Some(mtime),
                 _ => None,
-            },
+            }),
             NodeRef::OnDisk(node) => node.cached_directory_mtime(),
         }
     }
@@ -376,7 +377,7 @@
 
 pub(super) enum NodeData {
     Entry(DirstateEntry),
-    CachedDirectory { mtime: crate::dirstate::Timestamp },
+    CachedDirectory { mtime: TruncatedTimestamp },
     None,
 }
 
@@ -1177,8 +1178,8 @@
                 entry.debug_tuple()
             } else if !all {
                 return Ok(None);
-            } else if let Some(mtime) = node.cached_directory_mtime() {
-                (b' ', 0, -1, mtime.seconds() as i32)
+            } else if let Some(mtime) = node.cached_directory_mtime()? {
+                (b' ', 0, -1, mtime.truncated_seconds() as i32)
             } else {
                 (b' ', 0, -1, -1)
             };
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Tue Oct 12 16:20:05 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Tue Oct 12 16:38:13 2021 +0200
@@ -2,7 +2,7 @@
 //!
 //! See `mercurial/helptext/internals/dirstate-v2.txt`
 
-use crate::dirstate::Timestamp;
+use crate::dirstate::TruncatedTimestamp;
 use crate::dirstate_tree::dirstate_map::{self, DirstateMap, NodeRef};
 use crate::dirstate_tree::path_with_basename::WithBasename;
 use crate::errors::HgError;
@@ -11,7 +11,7 @@
 use crate::DirstateError;
 use crate::DirstateParents;
 use bitflags::bitflags;
-use bytes_cast::unaligned::{I32Be, I64Be, U16Be, U32Be};
+use bytes_cast::unaligned::{I32Be, U16Be, U32Be};
 use bytes_cast::BytesCast;
 use format_bytes::format_bytes;
 use std::borrow::Cow;
@@ -122,11 +122,8 @@
 #[derive(BytesCast, Copy, Clone)]
 #[repr(C)]
 struct PackedTimestamp {
-    seconds: I64Be,
-
-    /// In `0 .. 1_000_000_000`.
-    ///
-    /// This timestamp is after `(seconds, 0)` by this many nanoseconds.
+    _padding: U32Be,
+    truncated_seconds: U32Be,
     nanoseconds: U32Be,
 }
 
@@ -316,19 +313,23 @@
     ) -> Result<dirstate_map::NodeData, DirstateV2ParseError> {
         if self.has_entry() {
             Ok(dirstate_map::NodeData::Entry(self.assume_entry()))
-        } else if let Some(mtime) = self.cached_directory_mtime() {
+        } else if let Some(mtime) = self.cached_directory_mtime()? {
             Ok(dirstate_map::NodeData::CachedDirectory { mtime })
         } else {
             Ok(dirstate_map::NodeData::None)
         }
     }
 
-    pub(super) fn cached_directory_mtime(&self) -> Option<Timestamp> {
-        if self.flags.contains(Flags::HAS_MTIME) && !self.has_entry() {
-            Some(self.data.as_timestamp())
-        } else {
-            None
-        }
+    pub(super) fn cached_directory_mtime(
+        &self,
+    ) -> Result<Option<TruncatedTimestamp>, DirstateV2ParseError> {
+        Ok(
+            if self.flags.contains(Flags::HAS_MTIME) && !self.has_entry() {
+                Some(self.data.as_timestamp()?)
+            } else {
+                None
+            },
+        )
     }
 
     fn assume_entry(&self) -> DirstateEntry {
@@ -422,9 +423,10 @@
         (flags, raw_entry)
     }
 
-    fn from_timestamp(timestamp: Timestamp) -> Self {
+    fn from_timestamp(timestamp: TruncatedTimestamp) -> Self {
         let packed = PackedTimestamp {
-            seconds: timestamp.seconds().into(),
+            _padding: 0.into(),
+            truncated_seconds: timestamp.truncated_seconds().into(),
             nanoseconds: timestamp.nanoseconds().into(),
         };
         // Safety: both types implement the `ByteCast` trait, so we could
@@ -435,11 +437,14 @@
         unsafe { std::mem::transmute::<PackedTimestamp, Entry>(packed) }
     }
 
-    fn as_timestamp(self) -> Timestamp {
+    fn as_timestamp(self) -> Result<TruncatedTimestamp, DirstateV2ParseError> {
         // Safety: same as above in `from_timestamp`
         let packed =
             unsafe { std::mem::transmute::<Entry, PackedTimestamp>(self) };
-        Timestamp::new(packed.seconds.get(), packed.nanoseconds.get())
+        TruncatedTimestamp::from_already_truncated(
+            packed.truncated_seconds.get(),
+            packed.nanoseconds.get(),
+        )
     }
 }
 
--- a/rust/hg-core/src/dirstate_tree/status.rs	Tue Oct 12 16:20:05 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/status.rs	Tue Oct 12 16:38:13 2021 +0200
@@ -1,4 +1,4 @@
-use crate::dirstate::entry::Timestamp;
+use crate::dirstate::entry::TruncatedTimestamp;
 use crate::dirstate::status::IgnoreFnType;
 use crate::dirstate_tree::dirstate_map::BorrowedPath;
 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
@@ -126,7 +126,8 @@
     matcher: &'a (dyn Matcher + Sync),
     ignore_fn: IgnoreFnType<'a>,
     outcome: Mutex<DirstateStatus<'on_disk>>,
-    new_cachable_directories: Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+    new_cachable_directories:
+        Mutex<Vec<(Cow<'on_disk, HgPath>, TruncatedTimestamp)>>,
     outated_cached_directories: Mutex<Vec<Cow<'on_disk, HgPath>>>,
 
     /// Whether ignore files like `.hgignore` have changed since the previous
@@ -165,7 +166,7 @@
         dirstate_node: &NodeRef<'tree, 'on_disk>,
     ) -> Result<(), DirstateV2ParseError> {
         if self.ignore_patterns_have_changed == Some(true)
-            && dirstate_node.cached_directory_mtime().is_some()
+            && dirstate_node.cached_directory_mtime()?.is_some()
         {
             self.outated_cached_directories.lock().unwrap().push(
                 dirstate_node
@@ -182,7 +183,7 @@
     fn can_skip_fs_readdir(
         &self,
         directory_metadata: Option<&std::fs::Metadata>,
-        cached_directory_mtime: Option<Timestamp>,
+        cached_directory_mtime: Option<TruncatedTimestamp>,
     ) -> bool {
         if !self.options.list_unknown && !self.options.list_ignored {
             // All states that we care about listing have corresponding
@@ -199,8 +200,9 @@
                 // directory eligible for `read_dir` caching.
                 if let Some(meta) = directory_metadata {
                     if let Ok(current_mtime) = meta.modified() {
-                        let current_mtime = Timestamp::from(current_mtime);
-                        if current_mtime == cached_mtime {
+                        let truncated =
+                            TruncatedTimestamp::from(current_mtime);
+                        if truncated.very_likely_equal(&cached_mtime) {
                             // The mtime of that directory has not changed
                             // since then, which means that the results of
                             // `read_dir` should also be unchanged.
@@ -222,7 +224,7 @@
         directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
         directory_fs_path: &Path,
         directory_metadata: Option<&std::fs::Metadata>,
-        cached_directory_mtime: Option<Timestamp>,
+        cached_directory_mtime: Option<TruncatedTimestamp>,
         is_at_repo_root: bool,
     ) -> Result<bool, DirstateV2ParseError> {
         if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
@@ -363,7 +365,7 @@
                     hg_path,
                     fs_path,
                     Some(fs_metadata),
-                    dirstate_node.cached_directory_mtime(),
+                    dirstate_node.cached_directory_mtime()?,
                     is_at_repo_root,
                 )?;
             self.maybe_save_directory_mtime(
@@ -466,16 +468,22 @@
                     //
                     // We deem this scenario (unlike the previous one) to be
                     // unlikely enough in practice.
-                    let timestamp = directory_mtime.into();
-                    let cached = dirstate_node.cached_directory_mtime();
-                    if cached != Some(timestamp) {
+                    let truncated = TruncatedTimestamp::from(directory_mtime);
+                    let is_up_to_date = if let Some(cached) =
+                        dirstate_node.cached_directory_mtime()?
+                    {
+                        cached.very_likely_equal(&truncated)
+                    } else {
+                        false
+                    };
+                    if !is_up_to_date {
                         let hg_path = dirstate_node
                             .full_path_borrowed(self.dmap.on_disk)?
                             .detach_from_tree();
                         self.new_cachable_directories
                             .lock()
                             .unwrap()
-                            .push((hg_path, timestamp))
+                            .push((hg_path, truncated))
                     }
                 }
             }