rust/hg-core/src/dirstate_tree/status.rs
author Simon Sapin <simon.sapin@octobus.net>
Fri, 16 Apr 2021 12:12:41 +0200
changeset 47113 be579775c2d9
parent 47112 d5956136d19d
child 47114 aeb03758f37a
permissions -rw-r--r--
dirstate-tree: Add the new `status()` algorithm With the dirstate organized in a tree that mirrors the structure of the filesystem tree, we can traverse both trees at the same time in order to compare them. This is hopefully more efficient that building multiple big hashmaps for all of the repository’s contents. Differential Revision: https://phab.mercurial-scm.org/D10547

use crate::dirstate::status::IgnoreFnType;
use crate::dirstate_tree::dirstate_map::ChildNodes;
use crate::dirstate_tree::dirstate_map::DirstateMap;
use crate::dirstate_tree::dirstate_map::Node;
use crate::matchers::get_ignore_function;
use crate::matchers::Matcher;
use crate::utils::files::get_bytes_from_os_string;
use crate::utils::hg_path::HgPath;
use crate::DirstateStatus;
use crate::EntryState;
use crate::HgPathBuf;
use crate::PatternFileWarning;
use crate::StatusError;
use crate::StatusOptions;
use std::borrow::Cow;
use std::io;
use std::path::Path;
use std::path::PathBuf;

/// Returns the status of the working directory compared to its parent
/// changeset.
///
/// This algorithm is based on traversing the filesystem tree (`fs` in function
/// and variable names) and dirstate tree at the same time. The core of this
/// traversal is the recursive `traverse_fs_directory_and_dirstate` function
/// and its use of `itertools::merge_join_by`. When reaching a path that only
/// exists in one of the two trees, depending on information requested by
/// `options` we may need to traverse the remaining subtree.
pub fn status<'tree>(
    dmap: &'tree mut DirstateMap,
    matcher: &(dyn Matcher + Sync),
    root_dir: PathBuf,
    ignore_files: Vec<PathBuf>,
    options: StatusOptions,
) -> Result<(DirstateStatus<'tree>, Vec<PatternFileWarning>), StatusError> {
    let (ignore_fn, warnings): (IgnoreFnType, _) =
        if options.list_ignored || options.list_unknown {
            get_ignore_function(ignore_files, &root_dir)?
        } else {
            (Box::new(|&_| true), vec![])
        };

    let mut common = StatusCommon {
        options,
        matcher,
        ignore_fn,
        outcome: DirstateStatus::default(),
    };
    let is_at_repo_root = true;
    let hg_path = HgPath::new("");
    let has_ignored_ancestor = false;
    common.traverse_fs_directory_and_dirstate(
        has_ignored_ancestor,
        &mut dmap.root,
        hg_path,
        &root_dir,
        is_at_repo_root,
    );
    Ok((common.outcome, warnings))
}

/// Bag of random things needed by various parts of the algorithm. Reduces the
/// number of parameters passed to functions.
struct StatusCommon<'tree, 'a> {
    options: StatusOptions,
    matcher: &'a (dyn Matcher + Sync),
    ignore_fn: IgnoreFnType<'a>,
    outcome: DirstateStatus<'tree>,
}

impl<'tree, 'a> StatusCommon<'tree, 'a> {
    fn traverse_fs_directory_and_dirstate(
        &mut self,
        has_ignored_ancestor: bool,
        dirstate_nodes: &'tree mut ChildNodes,
        directory_hg_path: &HgPath,
        fs_path: &Path,
        is_at_repo_root: bool,
    ) {
        // TODO: handle I/O errors
        let mut fs_entries =
            DirEntry::read_dir(fs_path, is_at_repo_root).unwrap();

        // `merge_join_by` requires both its input iterators to be sorted:

        // * `BTreeMap` iterates according to keys’ ordering by definition

        // `sort_unstable_by_key` doesn’t allow keys borrowing from the value:
        // https://github.com/rust-lang/rust/issues/34162
        fs_entries.sort_unstable_by(|e1, e2| e1.base_name.cmp(&e2.base_name));

        for pair in itertools::merge_join_by(
            dirstate_nodes,
            &fs_entries,
            |(full_path, _node), fs_entry| {
                full_path.base_name().cmp(&fs_entry.base_name)
            },
        ) {
            use itertools::EitherOrBoth::*;
            match pair {
                Both((hg_path, dirstate_node), fs_entry) => {
                    self.traverse_fs_and_dirstate(
                        fs_entry,
                        hg_path.full_path(),
                        dirstate_node,
                        has_ignored_ancestor,
                    );
                }
                Left((hg_path, dirstate_node)) => self.traverse_dirstate_only(
                    hg_path.full_path(),
                    dirstate_node,
                ),
                Right(fs_entry) => self.traverse_fs_only(
                    has_ignored_ancestor,
                    directory_hg_path,
                    fs_entry,
                ),
            }
        }
    }

    fn traverse_fs_and_dirstate(
        &mut self,
        fs_entry: &DirEntry,
        hg_path: &'tree HgPath,
        dirstate_node: &'tree mut Node,
        has_ignored_ancestor: bool,
    ) {
        if fs_entry.metadata.is_dir() {
            if self.options.collect_traversed_dirs {
                self.outcome.traversed.push(hg_path.into())
            }
            // If we previously had a file here, it was removed (with
            // `hg rm` or similar) or deleted before it could be
            // replaced by a directory.
            self.mark_removed_or_deleted_if_file(
                hg_path,
                dirstate_node.state(),
            );
            let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
            let is_at_repo_root = false;
            self.traverse_fs_directory_and_dirstate(
                is_ignored,
                &mut dirstate_node.children,
                hg_path,
                &fs_entry.full_path,
                is_at_repo_root,
            );
        } else {
            if self.matcher.matches(hg_path) {
                let full_path = Cow::from(hg_path);
                if let Some(entry) = &dirstate_node.entry {
                    match entry.state {
                        EntryState::Added => {
                            self.outcome.added.push(full_path)
                        }
                        EntryState::Removed => {
                            self.outcome.removed.push(full_path)
                        }
                        EntryState::Merged => {
                            self.outcome.modified.push(full_path)
                        }
                        EntryState::Normal => {
                            self.handle_normal_file(
                                full_path,
                                dirstate_node,
                                entry,
                                fs_entry,
                            );
                        }
                        // This variant is not used in DirstateMap
                        // nodes
                        EntryState::Unknown => unreachable!(),
                    }
                } else {
                    // `node.entry.is_none()` indicates a "directory"
                    // node, but the filesystem has a file
                    self.mark_unknown_or_ignored(
                        has_ignored_ancestor,
                        full_path,
                    )
                }
            }

            for (child_hg_path, child_node) in &mut dirstate_node.children {
                self.traverse_dirstate_only(
                    child_hg_path.full_path(),
                    child_node,
                )
            }
        }
    }

    /// A file with `EntryState::Normal` in the dirstate was found in the
    /// filesystem
    fn handle_normal_file(
        &mut self,
        full_path: Cow<'tree, HgPath>,
        dirstate_node: &Node,
        entry: &crate::DirstateEntry,
        fs_entry: &DirEntry,
    ) {
        // Keep the low 31 bits
        fn truncate_u64(value: u64) -> i32 {
            (value & 0x7FFF_FFFF) as i32
        }
        fn truncate_i64(value: i64) -> i32 {
            (value & 0x7FFF_FFFF) as i32
        }

        let mode_changed = || {
            self.options.check_exec && entry.mode_changed(&fs_entry.metadata)
        };
        let size_changed = entry.size != truncate_u64(fs_entry.metadata.len());
        if entry.size >= 0
            && size_changed
            && fs_entry.metadata.file_type().is_symlink()
        {
            // issue6456: Size returned may be longer due to encryption
            // on EXT-4 fscrypt. TODO maybe only do it on EXT4?
            self.outcome.unsure.push(full_path)
        } else if dirstate_node.copy_source.is_some()
            || entry.is_from_other_parent()
            || (entry.size >= 0 && (size_changed || mode_changed()))
        {
            self.outcome.modified.push(full_path)
        } else {
            let mtime = mtime_seconds(&fs_entry.metadata);
            if truncate_i64(mtime) != entry.mtime
                || mtime == self.options.last_normal_time
            {
                self.outcome.unsure.push(full_path)
            } else if self.options.list_clean {
                self.outcome.clean.push(full_path)
            }
        }
    }

    /// A node in the dirstate tree has no corresponding filesystem entry
    fn traverse_dirstate_only(
        &mut self,
        hg_path: &'tree HgPath,
        dirstate_node: &'tree mut Node,
    ) {
        self.mark_removed_or_deleted_if_file(hg_path, dirstate_node.state());
        for (child_hg_path, child_node) in &mut dirstate_node.children {
            self.traverse_dirstate_only(child_hg_path.full_path(), child_node)
        }
    }

    /// A node in the dirstate tree has no corresponding *file* on the
    /// filesystem
    ///
    /// Does nothing on a "directory" node
    fn mark_removed_or_deleted_if_file(
        &mut self,
        hg_path: &'tree HgPath,
        dirstate_node_state: Option<EntryState>,
    ) {
        if let Some(state) = dirstate_node_state {
            if self.matcher.matches(hg_path) {
                if let EntryState::Removed = state {
                    self.outcome.removed.push(hg_path.into())
                } else {
                    self.outcome.deleted.push(hg_path.into())
                }
            }
        }
    }

    /// Something in the filesystem has no corresponding dirstate node
    fn traverse_fs_only(
        &mut self,
        has_ignored_ancestor: bool,
        directory_hg_path: &HgPath,
        fs_entry: &DirEntry,
    ) {
        let hg_path = directory_hg_path.join(&fs_entry.base_name);
        if fs_entry.metadata.is_dir() {
            let is_ignored =
                has_ignored_ancestor || (self.ignore_fn)(&hg_path);
            let traverse_children = if is_ignored {
                // Descendants of an ignored directory are all ignored
                self.options.list_ignored
            } else {
                // Descendants of an unknown directory may be either unknown or
                // ignored
                self.options.list_unknown || self.options.list_ignored
            };
            if traverse_children {
                let is_at_repo_root = false;
                // TODO: handle I/O errors
                let children_fs_entries =
                    DirEntry::read_dir(&fs_entry.full_path, is_at_repo_root)
                        .unwrap();
                for child_fs_entry in children_fs_entries {
                    self.traverse_fs_only(
                        is_ignored,
                        &hg_path,
                        &child_fs_entry,
                    )
                }
            }
            if self.options.collect_traversed_dirs {
                self.outcome.traversed.push(hg_path.into())
            }
        } else if self.matcher.matches(&hg_path) {
            self.mark_unknown_or_ignored(has_ignored_ancestor, hg_path.into())
        }
    }

    fn mark_unknown_or_ignored(
        &mut self,
        has_ignored_ancestor: bool,
        hg_path: Cow<'tree, HgPath>,
    ) {
        let is_ignored = has_ignored_ancestor || (self.ignore_fn)(&hg_path);
        if is_ignored {
            if self.options.list_ignored {
                self.outcome.ignored.push(hg_path)
            }
        } else {
            if self.options.list_unknown {
                self.outcome.unknown.push(hg_path)
            }
        }
    }
}

#[cfg(unix)] // TODO
fn mtime_seconds(metadata: &std::fs::Metadata) -> i64 {
    // Going through `Metadata::modified()` would be portable, but would take
    // care to construct a `SystemTime` value with sub-second precision just
    // for us to throw that away here.
    use std::os::unix::fs::MetadataExt;
    metadata.mtime()
}

struct DirEntry {
    base_name: HgPathBuf,
    full_path: PathBuf,
    metadata: std::fs::Metadata,
}

impl DirEntry {
    /// Returns **unsorted** entries in the given directory, with name and
    /// metadata.
    ///
    /// If a `.hg` sub-directory is encountered:
    ///
    /// * At the repository root, ignore that sub-directory
    /// * Elsewhere, we’re listing the content of a sub-repo. Return an empty
    ///   list instead.
    fn read_dir(path: &Path, is_at_repo_root: bool) -> io::Result<Vec<Self>> {
        let mut results = Vec::new();
        for entry in path.read_dir()? {
            let entry = entry?;
            let metadata = entry.metadata()?;
            let name = get_bytes_from_os_string(entry.file_name());
            // FIXME don't do this when cached
            if name == b".hg" {
                if is_at_repo_root {
                    // Skip the repo’s own .hg (might be a symlink)
                    continue;
                } else if metadata.is_dir() {
                    // A .hg sub-directory at another location means a subrepo,
                    // skip it entirely.
                    return Ok(Vec::new());
                }
            }
            results.push(DirEntry {
                base_name: name.into(),
                full_path: entry.path(),
                metadata,
            })
        }
        Ok(results)
    }
}