dirstate-tree: Add the new `status()` algorithm
authorSimon Sapin <simon.sapin@octobus.net>
Fri, 16 Apr 2021 12:12:41 +0200
changeset 47113 be579775c2d9
parent 47112 d5956136d19d
child 47114 aeb03758f37a
dirstate-tree: Add the new `status()` algorithm With the dirstate organized in a tree that mirrors the structure of the filesystem tree, we can traverse both trees at the same time in order to compare them. This is hopefully more efficient that building multiple big hashmaps for all of the repository’s contents. Differential Revision: https://phab.mercurial-scm.org/D10547
rust/Cargo.lock
rust/hg-core/Cargo.toml
rust/hg-core/src/dirstate.rs
rust/hg-core/src/dirstate/status.rs
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/status.rs
rust/hg-core/src/utils/files.rs
rust/hg-core/src/utils/hg_path.rs
--- a/rust/Cargo.lock	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/Cargo.lock	Fri Apr 16 12:12:41 2021 +0200
@@ -358,6 +358,7 @@
  "format-bytes",
  "home",
  "im-rc",
+ "itertools",
  "lazy_static",
  "log",
  "memmap",
--- a/rust/hg-core/Cargo.toml	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/Cargo.toml	Fri Apr 16 12:12:41 2021 +0200
@@ -14,6 +14,7 @@
 derive_more = "0.99"
 home = "0.5"
 im-rc = "15.0.*"
+itertools = "0.9"
 lazy_static = "1.4.0"
 rand = "0.7.3"
 rand_pcg = "0.2.1"
--- a/rust/hg-core/src/dirstate.rs	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/src/dirstate.rs	Fri Apr 16 12:12:41 2021 +0200
@@ -42,6 +42,19 @@
     pub fn is_from_other_parent(&self) -> bool {
         self.state == EntryState::Normal && self.size == SIZE_FROM_OTHER_PARENT
     }
+
+    // TODO: other platforms
+    #[cfg(unix)]
+    pub fn mode_changed(
+        &self,
+        filesystem_metadata: &std::fs::Metadata,
+    ) -> bool {
+        use std::os::unix::fs::MetadataExt;
+        const EXEC_BIT_MASK: u32 = 0o100;
+        let dirstate_exec_bit = (self.mode as u32) & EXEC_BIT_MASK;
+        let fs_exec_bit = filesystem_metadata.mode() & EXEC_BIT_MASK;
+        dirstate_exec_bit != fs_exec_bit
+    }
 }
 
 #[derive(BytesCast)]
--- a/rust/hg-core/src/dirstate/status.rs	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/src/dirstate/status.rs	Fri Apr 16 12:12:41 2021 +0200
@@ -95,7 +95,7 @@
 
 type IoResult<T> = std::io::Result<T>;
 
-/// `Box<dyn Trait>` is syntactic sugar for `Box<dyn Trait, 'static>`, so add
+/// `Box<dyn Trait>` is syntactic sugar for `Box<dyn Trait + 'static>`, so add
 /// an explicit lifetime here to not fight `'static` bounds "out of nowhere".
 pub type IgnoreFnType<'a> =
     Box<dyn for<'r> Fn(&'r HgPath) -> bool + Sync + 'a>;
@@ -255,7 +255,7 @@
     pub collect_traversed_dirs: bool,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct DirstateStatus<'a> {
     /// Tracked files whose contents have changed since the parent revision
     pub modified: Vec<HgPathCow<'a>>,
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Fri Apr 16 12:12:41 2021 +0200
@@ -27,7 +27,7 @@
 pub struct DirstateMap {
     parents: Option<DirstateParents>,
     dirty_parents: bool,
-    root: ChildNodes,
+    pub(super) root: ChildNodes,
 
     /// Number of nodes anywhere in the tree that have `.entry.is_some()`.
     nodes_with_entry_count: usize,
@@ -42,17 +42,17 @@
 /// path, so comparing full paths gives the same result as comparing base
 /// names. However `BTreeMap` would waste time always re-comparing the same
 /// string prefix.
-type ChildNodes = BTreeMap<WithBasename<HgPathBuf>, Node>;
+pub(super) type ChildNodes = BTreeMap<WithBasename<HgPathBuf>, Node>;
 
 /// Represents a file or a directory
 #[derive(Default)]
-struct Node {
+pub(super) struct Node {
     /// `None` for directories
-    entry: Option<DirstateEntry>,
+    pub(super) entry: Option<DirstateEntry>,
 
-    copy_source: Option<HgPathBuf>,
+    pub(super) copy_source: Option<HgPathBuf>,
 
-    children: ChildNodes,
+    pub(super) children: ChildNodes,
 
     /// How many (non-inclusive) descendants of this node are tracked files
     tracked_descendants_count: usize,
@@ -67,6 +67,10 @@
             false
         }
     }
+
+    pub(super) fn state(&self) -> Option<EntryState> {
+        self.entry.as_ref().map(|entry| entry.state)
+    }
 }
 
 /// `(full_path, entry, copy_source)`
--- a/rust/hg-core/src/dirstate_tree/status.rs	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/status.rs	Fri Apr 16 12:12:41 2021 +0200
@@ -1,17 +1,379 @@
+use crate::dirstate::status::IgnoreFnType;
+use crate::dirstate_tree::dirstate_map::ChildNodes;
 use crate::dirstate_tree::dirstate_map::DirstateMap;
+use crate::dirstate_tree::dirstate_map::Node;
+use crate::matchers::get_ignore_function;
 use crate::matchers::Matcher;
+use crate::utils::files::get_bytes_from_os_string;
+use crate::utils::hg_path::HgPath;
 use crate::DirstateStatus;
+use crate::EntryState;
+use crate::HgPathBuf;
 use crate::PatternFileWarning;
 use crate::StatusError;
 use crate::StatusOptions;
+use std::borrow::Cow;
+use std::io;
+use std::path::Path;
 use std::path::PathBuf;
 
-pub fn status<'a>(
-    _dmap: &'a mut DirstateMap,
-    _matcher: &'a (dyn Matcher + Sync),
-    _root_dir: PathBuf,
-    _ignore_files: Vec<PathBuf>,
-    _options: StatusOptions,
-) -> Result<(DirstateStatus<'a>, Vec<PatternFileWarning>), StatusError> {
-    todo!()
+/// Returns the status of the working directory compared to its parent
+/// changeset.
+///
+/// This algorithm is based on traversing the filesystem tree (`fs` in function
+/// and variable names) and dirstate tree at the same time. The core of this
+/// traversal is the recursive `traverse_fs_directory_and_dirstate` function
+/// and its use of `itertools::merge_join_by`. When reaching a path that only
+/// exists in one of the two trees, depending on information requested by
+/// `options` we may need to traverse the remaining subtree.
+pub fn status<'tree>(
+    dmap: &'tree mut DirstateMap,
+    matcher: &(dyn Matcher + Sync),
+    root_dir: PathBuf,
+    ignore_files: Vec<PathBuf>,
+    options: StatusOptions,
+) -> Result<(DirstateStatus<'tree>, Vec<PatternFileWarning>), StatusError> {
+    let (ignore_fn, warnings): (IgnoreFnType, _) =
+        if options.list_ignored || options.list_unknown {
+            get_ignore_function(ignore_files, &root_dir)?
+        } else {
+            (Box::new(|&_| true), vec![])
+        };
+
+    let mut common = StatusCommon {
+        options,
+        matcher,
+        ignore_fn,
+        outcome: DirstateStatus::default(),
+    };
+    let is_at_repo_root = true;
+    let hg_path = HgPath::new("");
+    let has_ignored_ancestor = false;
+    common.traverse_fs_directory_and_dirstate(
+        has_ignored_ancestor,
+        &mut dmap.root,
+        hg_path,
+        &root_dir,
+        is_at_repo_root,
+    );
+    Ok((common.outcome, warnings))
+}
+
+/// Bag of random things needed by various parts of the algorithm. Reduces the
+/// number of parameters passed to functions.
+struct StatusCommon<'tree, 'a> {
+    options: StatusOptions,
+    matcher: &'a (dyn Matcher + Sync),
+    ignore_fn: IgnoreFnType<'a>,
+    outcome: DirstateStatus<'tree>,
 }
+
+impl<'tree, 'a> StatusCommon<'tree, 'a> {
+    fn traverse_fs_directory_and_dirstate(
+        &mut self,
+        has_ignored_ancestor: bool,
+        dirstate_nodes: &'tree mut ChildNodes,
+        directory_hg_path: &HgPath,
+        fs_path: &Path,
+        is_at_repo_root: bool,
+    ) {
+        // TODO: handle I/O errors
+        let mut fs_entries =
+            DirEntry::read_dir(fs_path, is_at_repo_root).unwrap();
+
+        // `merge_join_by` requires both its input iterators to be sorted:
+
+        // * `BTreeMap` iterates according to keys’ ordering by definition
+
+        // `sort_unstable_by_key` doesn’t allow keys borrowing from the value:
+        // https://github.com/rust-lang/rust/issues/34162
+        fs_entries.sort_unstable_by(|e1, e2| e1.base_name.cmp(&e2.base_name));
+
+        for pair in itertools::merge_join_by(
+            dirstate_nodes,
+            &fs_entries,
+            |(full_path, _node), fs_entry| {
+                full_path.base_name().cmp(&fs_entry.base_name)
+            },
+        ) {
+            use itertools::EitherOrBoth::*;
+            match pair {
+                Both((hg_path, dirstate_node), fs_entry) => {
+                    self.traverse_fs_and_dirstate(
+                        fs_entry,
+                        hg_path.full_path(),
+                        dirstate_node,
+                        has_ignored_ancestor,
+                    );
+                }
+                Left((hg_path, dirstate_node)) => self.traverse_dirstate_only(
+                    hg_path.full_path(),
+                    dirstate_node,
+                ),
+                Right(fs_entry) => self.traverse_fs_only(
+                    has_ignored_ancestor,
+                    directory_hg_path,
+                    fs_entry,
+                ),
+            }
+        }
+    }
+
+    fn traverse_fs_and_dirstate(
+        &mut self,
+        fs_entry: &DirEntry,
+        hg_path: &'tree HgPath,
+        dirstate_node: &'tree mut Node,
+        has_ignored_ancestor: bool,
+    ) {
+        if fs_entry.metadata.is_dir() {
+            if self.options.collect_traversed_dirs {
+                self.outcome.traversed.push(hg_path.into())
+            }
+            // If we previously had a file here, it was removed (with
+            // `hg rm` or similar) or deleted before it could be
+            // replaced by a directory.
+            self.mark_removed_or_deleted_if_file(
+                hg_path,
+                dirstate_node.state(),
+            );
+            let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
+            let is_at_repo_root = false;
+            self.traverse_fs_directory_and_dirstate(
+                is_ignored,
+                &mut dirstate_node.children,
+                hg_path,
+                &fs_entry.full_path,
+                is_at_repo_root,
+            );
+        } else {
+            if self.matcher.matches(hg_path) {
+                let full_path = Cow::from(hg_path);
+                if let Some(entry) = &dirstate_node.entry {
+                    match entry.state {
+                        EntryState::Added => {
+                            self.outcome.added.push(full_path)
+                        }
+                        EntryState::Removed => {
+                            self.outcome.removed.push(full_path)
+                        }
+                        EntryState::Merged => {
+                            self.outcome.modified.push(full_path)
+                        }
+                        EntryState::Normal => {
+                            self.handle_normal_file(
+                                full_path,
+                                dirstate_node,
+                                entry,
+                                fs_entry,
+                            );
+                        }
+                        // This variant is not used in DirstateMap
+                        // nodes
+                        EntryState::Unknown => unreachable!(),
+                    }
+                } else {
+                    // `node.entry.is_none()` indicates a "directory"
+                    // node, but the filesystem has a file
+                    self.mark_unknown_or_ignored(
+                        has_ignored_ancestor,
+                        full_path,
+                    )
+                }
+            }
+
+            for (child_hg_path, child_node) in &mut dirstate_node.children {
+                self.traverse_dirstate_only(
+                    child_hg_path.full_path(),
+                    child_node,
+                )
+            }
+        }
+    }
+
+    /// A file with `EntryState::Normal` in the dirstate was found in the
+    /// filesystem
+    fn handle_normal_file(
+        &mut self,
+        full_path: Cow<'tree, HgPath>,
+        dirstate_node: &Node,
+        entry: &crate::DirstateEntry,
+        fs_entry: &DirEntry,
+    ) {
+        // Keep the low 31 bits
+        fn truncate_u64(value: u64) -> i32 {
+            (value & 0x7FFF_FFFF) as i32
+        }
+        fn truncate_i64(value: i64) -> i32 {
+            (value & 0x7FFF_FFFF) as i32
+        }
+
+        let mode_changed = || {
+            self.options.check_exec && entry.mode_changed(&fs_entry.metadata)
+        };
+        let size_changed = entry.size != truncate_u64(fs_entry.metadata.len());
+        if entry.size >= 0
+            && size_changed
+            && fs_entry.metadata.file_type().is_symlink()
+        {
+            // issue6456: Size returned may be longer due to encryption
+            // on EXT-4 fscrypt. TODO maybe only do it on EXT4?
+            self.outcome.unsure.push(full_path)
+        } else if dirstate_node.copy_source.is_some()
+            || entry.is_from_other_parent()
+            || (entry.size >= 0 && (size_changed || mode_changed()))
+        {
+            self.outcome.modified.push(full_path)
+        } else {
+            let mtime = mtime_seconds(&fs_entry.metadata);
+            if truncate_i64(mtime) != entry.mtime
+                || mtime == self.options.last_normal_time
+            {
+                self.outcome.unsure.push(full_path)
+            } else if self.options.list_clean {
+                self.outcome.clean.push(full_path)
+            }
+        }
+    }
+
+    /// A node in the dirstate tree has no corresponding filesystem entry
+    fn traverse_dirstate_only(
+        &mut self,
+        hg_path: &'tree HgPath,
+        dirstate_node: &'tree mut Node,
+    ) {
+        self.mark_removed_or_deleted_if_file(hg_path, dirstate_node.state());
+        for (child_hg_path, child_node) in &mut dirstate_node.children {
+            self.traverse_dirstate_only(child_hg_path.full_path(), child_node)
+        }
+    }
+
+    /// A node in the dirstate tree has no corresponding *file* on the
+    /// filesystem
+    ///
+    /// Does nothing on a "directory" node
+    fn mark_removed_or_deleted_if_file(
+        &mut self,
+        hg_path: &'tree HgPath,
+        dirstate_node_state: Option<EntryState>,
+    ) {
+        if let Some(state) = dirstate_node_state {
+            if self.matcher.matches(hg_path) {
+                if let EntryState::Removed = state {
+                    self.outcome.removed.push(hg_path.into())
+                } else {
+                    self.outcome.deleted.push(hg_path.into())
+                }
+            }
+        }
+    }
+
+    /// Something in the filesystem has no corresponding dirstate node
+    fn traverse_fs_only(
+        &mut self,
+        has_ignored_ancestor: bool,
+        directory_hg_path: &HgPath,
+        fs_entry: &DirEntry,
+    ) {
+        let hg_path = directory_hg_path.join(&fs_entry.base_name);
+        if fs_entry.metadata.is_dir() {
+            let is_ignored =
+                has_ignored_ancestor || (self.ignore_fn)(&hg_path);
+            let traverse_children = if is_ignored {
+                // Descendants of an ignored directory are all ignored
+                self.options.list_ignored
+            } else {
+                // Descendants of an unknown directory may be either unknown or
+                // ignored
+                self.options.list_unknown || self.options.list_ignored
+            };
+            if traverse_children {
+                let is_at_repo_root = false;
+                // TODO: handle I/O errors
+                let children_fs_entries =
+                    DirEntry::read_dir(&fs_entry.full_path, is_at_repo_root)
+                        .unwrap();
+                for child_fs_entry in children_fs_entries {
+                    self.traverse_fs_only(
+                        is_ignored,
+                        &hg_path,
+                        &child_fs_entry,
+                    )
+                }
+            }
+            if self.options.collect_traversed_dirs {
+                self.outcome.traversed.push(hg_path.into())
+            }
+        } else if self.matcher.matches(&hg_path) {
+            self.mark_unknown_or_ignored(has_ignored_ancestor, hg_path.into())
+        }
+    }
+
+    fn mark_unknown_or_ignored(
+        &mut self,
+        has_ignored_ancestor: bool,
+        hg_path: Cow<'tree, HgPath>,
+    ) {
+        let is_ignored = has_ignored_ancestor || (self.ignore_fn)(&hg_path);
+        if is_ignored {
+            if self.options.list_ignored {
+                self.outcome.ignored.push(hg_path)
+            }
+        } else {
+            if self.options.list_unknown {
+                self.outcome.unknown.push(hg_path)
+            }
+        }
+    }
+}
+
+#[cfg(unix)] // TODO
+fn mtime_seconds(metadata: &std::fs::Metadata) -> i64 {
+    // Going through `Metadata::modified()` would be portable, but would take
+    // care to construct a `SystemTime` value with sub-second precision just
+    // for us to throw that away here.
+    use std::os::unix::fs::MetadataExt;
+    metadata.mtime()
+}
+
+struct DirEntry {
+    base_name: HgPathBuf,
+    full_path: PathBuf,
+    metadata: std::fs::Metadata,
+}
+
+impl DirEntry {
+    /// Returns **unsorted** entries in the given directory, with name and
+    /// metadata.
+    ///
+    /// If a `.hg` sub-directory is encountered:
+    ///
+    /// * At the repository root, ignore that sub-directory
+    /// * Elsewhere, we’re listing the content of a sub-repo. Return an empty
+    ///   list instead.
+    fn read_dir(path: &Path, is_at_repo_root: bool) -> io::Result<Vec<Self>> {
+        let mut results = Vec::new();
+        for entry in path.read_dir()? {
+            let entry = entry?;
+            let metadata = entry.metadata()?;
+            let name = get_bytes_from_os_string(entry.file_name());
+            // FIXME don't do this when cached
+            if name == b".hg" {
+                if is_at_repo_root {
+                    // Skip the repo’s own .hg (might be a symlink)
+                    continue;
+                } else if metadata.is_dir() {
+                    // A .hg sub-directory at another location means a subrepo,
+                    // skip it entirely.
+                    return Ok(Vec::new());
+                }
+            }
+            results.push(DirEntry {
+                base_name: name.into(),
+                full_path: entry.path(),
+                metadata,
+            })
+        }
+        Ok(results)
+    }
+}
--- a/rust/hg-core/src/utils/files.rs	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/src/utils/files.rs	Fri Apr 16 12:12:41 2021 +0200
@@ -17,7 +17,7 @@
 use lazy_static::lazy_static;
 use same_file::is_same_file;
 use std::borrow::{Cow, ToOwned};
-use std::ffi::OsStr;
+use std::ffi::{OsStr, OsString};
 use std::fs::Metadata;
 use std::iter::FusedIterator;
 use std::ops::Deref;
@@ -53,6 +53,12 @@
     str.as_ref().as_bytes().to_vec()
 }
 
+#[cfg(unix)]
+pub fn get_bytes_from_os_string(str: OsString) -> Vec<u8> {
+    use std::os::unix::ffi::OsStringExt;
+    str.into_vec()
+}
+
 /// An iterator over repository path yielding itself and its ancestors.
 #[derive(Copy, Clone, Debug)]
 pub struct Ancestors<'a> {
--- a/rust/hg-core/src/utils/hg_path.rs	Fri Apr 16 12:12:04 2021 +0200
+++ b/rust/hg-core/src/utils/hg_path.rs	Fri Apr 16 12:12:41 2021 +0200
@@ -6,6 +6,7 @@
 // GNU General Public License version 2 or any later version.
 
 use std::borrow::Borrow;
+use std::borrow::Cow;
 use std::convert::TryFrom;
 use std::ffi::{OsStr, OsString};
 use std::fmt;
@@ -535,6 +536,24 @@
     }
 }
 
+impl From<HgPathBuf> for Cow<'_, HgPath> {
+    fn from(path: HgPathBuf) -> Self {
+        Cow::Owned(path)
+    }
+}
+
+impl<'a> From<&'a HgPath> for Cow<'a, HgPath> {
+    fn from(path: &'a HgPath) -> Self {
+        Cow::Borrowed(path)
+    }
+}
+
+impl<'a> From<&'a HgPathBuf> for Cow<'a, HgPath> {
+    fn from(path: &'a HgPathBuf) -> Self {
+        Cow::Borrowed(&**path)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;