rust/hg-core/src/dirstate/dirstate_tree/tree.rs
author Raphaël Gomès <rgomes@octobus.net>
Fri, 25 Sep 2020 17:51:34 +0200
changeset 45562 b51167d70f5a
child 45588 c35db907363d
permissions -rw-r--r--
rust: add `dirstate_tree` module Mercurial needs to represent the filesystem hierarchy on which it operates, for example in the dirstate. Its current on-disk representation is an unsorted, flat structure that gets transformed in the current Rust code into a `HashMap`. This loses the hierarchical information of the dirstate, leading to some unfortunate performance and algorithmic compromises. This module adds an implementation of a radix tree that is specialized for representing the dirstate: its unit is the path component. I have made no efforts to optimize either its memory footprint or its insertion speed: they're pretty bad for now. Following will be a few patches that modify the dirstate.status logic to use that new hierarchical information, fixing issue 6335 in the same swing. Differential Revision: https://phab.mercurial-scm.org/D9085

// tree.rs
//
// Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.

use super::iter::Iter;
use super::node::{Directory, Node, NodeKind};
use crate::dirstate::dirstate_tree::iter::FsIter;
use crate::dirstate::dirstate_tree::node::{InsertResult, RemoveResult};
use crate::utils::hg_path::{HgPath, HgPathBuf};
use crate::DirstateEntry;
use std::path::PathBuf;

/// A specialized tree to represent the Mercurial dirstate.
///
/// # Advantages over a flat structure
///
/// The dirstate is inherently hierarchical, since it's a representation of the
/// file structure of the project. The current dirstate format is flat, and
/// while that affords us potentially great (unordered) iteration speeds, the
/// need to retrieve a given path is great enough that you need some kind of
/// hashmap or tree in a lot of cases anyway.
///
/// Going with a tree allows us to be smarter:
///   - Skipping an ignored directory means we don't visit its entire subtree
///   - Security auditing does not need to reconstruct paths backwards to check
///     for symlinked directories, this can be done during the iteration in a
///     very efficient fashion
///   - We don't need to build the directory information in another struct,
///     simplifying the code a lot, reducing the memory footprint and
///     potentially going faster depending on the implementation.
///   - We can use it to store a (platform-dependent) caching mechanism [1]
///   - And probably other types of optimizations.
///
/// Only the first two items in this list are implemented as of this commit.
///
/// [1]: https://www.mercurial-scm.org/wiki/DirsCachePlan
///       
///
/// # Structure
///
/// It's a prefix (radix) tree with no fixed arity, with a granularity of a
/// folder, allowing it to mimic a filesystem hierarchy:
///
/// ```text
/// foo/bar
/// foo/baz
/// test
/// ```
/// Will be represented (simplified) by:
///
/// ```text
/// Directory(root):
///   - File("test")
///   - Directory("foo"):
///     - File("bar")
///     - File("baz")
/// ```
///
/// Moreover, it is special-cased for storing the dirstate and as such handles
/// cases that a simple `HashMap` would handle, but while preserving the
/// hierarchy.
/// For example:
///
/// ```shell
/// $ touch foo
/// $ hg add foo
/// $ hg commit -m "foo"
/// $ hg remove foo
/// $ rm foo
/// $ mkdir foo
/// $ touch foo/a
/// $ hg add foo/a
/// $ hg status
///   R foo
///   A foo/a
/// ```
/// To represent this in a tree, one needs to keep track of whether any given
/// file was a directory and whether any given directory was a file at the last
/// dirstate update. This tree stores that information, but only in the right
/// circumstances by respecting the high-level rules that prevent nonsensical
/// structures to exist:
///     - a file can only be added as a child of another file if the latter is
///       marked as `Removed`
///     - a file cannot replace a folder unless all its descendents are removed
///
/// This second rule is not checked by the tree for performance reasons, and
/// because high-level logic already prevents that state from happening.
///
/// # Ordering
///
/// It makes no guarantee of ordering for now.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct Tree {
    pub root: Node,
    files_count: usize,
}

impl Tree {
    pub fn new() -> Self {
        Self {
            root: Node {
                kind: NodeKind::Directory(Directory {
                    was_file: None,
                    children: Default::default(),
                }),
            },
            files_count: 0,
        }
    }

    /// How many files (not directories) are stored in the tree, including ones
    /// marked as `Removed`.
    pub fn len(&self) -> usize {
        self.files_count
    }

    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Inserts a file in the tree and returns the previous entry if any.
    pub fn insert(
        &mut self,
        path: impl AsRef<HgPath>,
        kind: DirstateEntry,
    ) -> Option<DirstateEntry> {
        let old = self.insert_node(path, kind);
        match old?.kind {
            NodeKind::Directory(_) => None,
            NodeKind::File(f) => Some(f.entry),
        }
    }

    /// Low-level insertion method that returns the previous node (directories
    /// included).
    fn insert_node(&mut self, path: impl AsRef<HgPath>, kind: DirstateEntry) -> Option<Node> {
        let InsertResult {
            did_insert,
            old_entry,
        } = self.root.insert(path.as_ref().as_bytes(), kind);
        self.files_count += if did_insert { 1 } else { 0 };
        old_entry
    }

    /// Returns a reference to a node if it exists.
    pub fn get_node(&self, path: impl AsRef<HgPath>) -> Option<&Node> {
        self.root.get(path.as_ref().as_bytes())
    }

    /// Returns a reference to the entry corresponding to `path` if it exists.
    pub fn get(&self, path: impl AsRef<HgPath>) -> Option<&DirstateEntry> {
        if let Some(node) = self.get_node(&path) {
            return match &node.kind {
                NodeKind::Directory(d) => d.was_file.as_ref().map(|f| &f.entry),
                NodeKind::File(f) => Some(&f.entry),
            };
        }
        None
    }

    /// Returns `true` if an entry is found for the given `path`.
    pub fn contains_key(&self, path: impl AsRef<HgPath>) -> bool {
        self.get(path).is_some()
    }

    /// Returns a mutable reference to the entry corresponding to `path` if it
    /// exists.
    pub fn get_mut(&mut self, path: impl AsRef<HgPath>) -> Option<&mut DirstateEntry> {
        if let Some(kind) = self.root.get_mut(path.as_ref().as_bytes()) {
            return match kind {
                NodeKind::Directory(d) => d.was_file.as_mut().map(|f| &mut f.entry),
                NodeKind::File(f) => Some(&mut f.entry),
            };
        }
        None
    }

    /// Returns an iterator over the paths and corresponding entries in the
    /// tree.
    pub fn iter(&self) -> Iter {
        Iter::new(&self.root)
    }

    /// Returns an iterator of all entries in the tree, with a special
    /// filesystem handling for the directories containing said entries. See
    /// the documentation of `FsIter` for more.
    pub fn fs_iter(&self, root_dir: PathBuf) -> FsIter {
        FsIter::new(&self.root, root_dir)
    }

    /// Remove the entry at `path` and returns it, if it exists.
    pub fn remove(&mut self, path: impl AsRef<HgPath>) -> Option<DirstateEntry> {
        let RemoveResult { old_entry, .. } = self.root.remove(path.as_ref().as_bytes());
        self.files_count = self
            .files_count
            .checked_sub(if old_entry.is_some() { 1 } else { 0 })
            .expect("removed too many files");
        old_entry
    }
}

impl<P: AsRef<HgPath>> Extend<(P, DirstateEntry)> for Tree {
    fn extend<T: IntoIterator<Item = (P, DirstateEntry)>>(&mut self, iter: T) {
        for (path, entry) in iter {
            self.insert(path, entry);
        }
    }
}

impl<'a> IntoIterator for &'a Tree {
    type Item = (HgPathBuf, DirstateEntry);
    type IntoIter = Iter<'a>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dirstate::dirstate_tree::node::File;
    use crate::{EntryState, FastHashMap};
    use pretty_assertions::assert_eq;

    impl Node {
        /// Shortcut for getting children of a node in tests.
        fn children(&self) -> Option<&FastHashMap<Vec<u8>, Node>> {
            match &self.kind {
                NodeKind::Directory(d) => Some(&d.children),
                NodeKind::File(_) => None,
            }
        }
    }

    #[test]
    fn test_dirstate_tree() {
        let mut tree = Tree::new();

        assert_eq!(
            tree.insert_node(
                HgPath::new(b"we/p"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 0,
                    mtime: 0,
                    size: 0
                }
            ),
            None
        );
        dbg!(&tree);
        assert!(tree.get_node(HgPath::new(b"we")).is_some());
        let entry = DirstateEntry {
            state: EntryState::Merged,
            mode: 41,
            mtime: 42,
            size: 43,
        };
        assert_eq!(tree.insert_node(HgPath::new(b"foo/bar"), entry), None);
        assert_eq!(
            tree.get_node(HgPath::new(b"foo/bar")),
            Some(&Node {
                kind: NodeKind::File(File {
                    was_directory: None,
                    entry
                })
            })
        );
        // We didn't override the first entry we made
        assert!(tree.get_node(HgPath::new(b"we")).is_some(),);
        // Inserting the same key again
        assert_eq!(
            tree.insert_node(HgPath::new(b"foo/bar"), entry),
            Some(Node {
                kind: NodeKind::File(File {
                    was_directory: None,
                    entry
                }),
            })
        );
        // Inserting the two levels deep
        assert_eq!(tree.insert_node(HgPath::new(b"foo/bar/baz"), entry), None);
        // Getting a file "inside a file" should return `None`
        assert_eq!(tree.get_node(HgPath::new(b"foo/bar/baz/bap"),), None);

        assert_eq!(
            tree.insert_node(HgPath::new(b"wasdir/subfile"), entry),
            None,
        );
        let removed_entry = DirstateEntry {
            state: EntryState::Removed,
            mode: 0,
            mtime: 0,
            size: 0,
        };
        assert!(tree
            .insert_node(HgPath::new(b"wasdir"), removed_entry)
            .is_some());

        assert_eq!(
            tree.get_node(HgPath::new(b"wasdir")),
            Some(&Node {
                kind: NodeKind::File(File {
                    was_directory: Some(Box::new(Directory {
                        was_file: None,
                        children: [(
                            b"subfile".to_vec(),
                            Node {
                                kind: NodeKind::File(File {
                                    was_directory: None,
                                    entry,
                                })
                            }
                        )]
                        .to_vec()
                        .into_iter()
                        .collect()
                    })),
                    entry: removed_entry
                })
            })
        );

        assert!(tree.get(HgPath::new(b"wasdir/subfile")).is_some())
    }

    #[test]
    fn test_insert_removed() {
        let mut tree = Tree::new();
        let entry = DirstateEntry {
            state: EntryState::Merged,
            mode: 1,
            mtime: 2,
            size: 3,
        };
        let removed_entry = DirstateEntry {
            state: EntryState::Removed,
            mode: 10,
            mtime: 20,
            size: 30,
        };
        assert_eq!(tree.insert_node(HgPath::new(b"foo"), entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"foo/a"), removed_entry), None);
        // The insert should not turn `foo` into a directory as `foo` is not
        // `Removed`.
        match tree.get_node(HgPath::new(b"foo")).unwrap().kind {
            NodeKind::Directory(_) => panic!("should be a file"),
            NodeKind::File(_) => {}
        }

        let mut tree = Tree::new();
        let entry = DirstateEntry {
            state: EntryState::Merged,
            mode: 1,
            mtime: 2,
            size: 3,
        };
        let removed_entry = DirstateEntry {
            state: EntryState::Removed,
            mode: 10,
            mtime: 20,
            size: 30,
        };
        // The insert *should* turn `foo` into a directory as it is `Removed`.
        assert_eq!(tree.insert_node(HgPath::new(b"foo"), removed_entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"foo/a"), entry), None);
        match tree.get_node(HgPath::new(b"foo")).unwrap().kind {
            NodeKind::Directory(_) => {}
            NodeKind::File(_) => panic!("should be a directory"),
        }
    }

    #[test]
    fn test_get() {
        let mut tree = Tree::new();
        let entry = DirstateEntry {
            state: EntryState::Merged,
            mode: 1,
            mtime: 2,
            size: 3,
        };
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
        assert_eq!(tree.files_count, 1);
        assert_eq!(tree.get(HgPath::new(b"a/b/c")), Some(&entry));
        assert_eq!(tree.get(HgPath::new(b"a/b")), None);
        assert_eq!(tree.get(HgPath::new(b"a")), None);
        assert_eq!(tree.get(HgPath::new(b"a/b/c/d")), None);
        let entry2 = DirstateEntry {
            state: EntryState::Removed,
            mode: 0,
            mtime: 5,
            size: 1,
        };
        // was_directory
        assert_eq!(tree.insert(HgPath::new(b"a/b"), entry2), None);
        assert_eq!(tree.files_count, 2);
        assert_eq!(tree.get(HgPath::new(b"a/b")), Some(&entry2));
        assert_eq!(tree.get(HgPath::new(b"a/b/c")), Some(&entry));

        let mut tree = Tree::new();

        // was_file
        assert_eq!(tree.insert_node(HgPath::new(b"a"), entry), None);
        assert_eq!(tree.files_count, 1);
        assert_eq!(tree.insert_node(HgPath::new(b"a/b"), entry2), None);
        assert_eq!(tree.files_count, 2);
        assert_eq!(tree.get(HgPath::new(b"a/b")), Some(&entry2));
    }

    #[test]
    fn test_get_mut() {
        let mut tree = Tree::new();
        let mut entry = DirstateEntry {
            state: EntryState::Merged,
            mode: 1,
            mtime: 2,
            size: 3,
        };
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
        assert_eq!(tree.files_count, 1);
        assert_eq!(tree.get_mut(HgPath::new(b"a/b/c")), Some(&mut entry));
        assert_eq!(tree.get_mut(HgPath::new(b"a/b")), None);
        assert_eq!(tree.get_mut(HgPath::new(b"a")), None);
        assert_eq!(tree.get_mut(HgPath::new(b"a/b/c/d")), None);
        let mut entry2 = DirstateEntry {
            state: EntryState::Removed,
            mode: 0,
            mtime: 5,
            size: 1,
        };
        // was_directory
        assert_eq!(tree.insert(HgPath::new(b"a/b"), entry2), None);
        assert_eq!(tree.files_count, 2);
        assert_eq!(tree.get_mut(HgPath::new(b"a/b")), Some(&mut entry2));
        assert_eq!(tree.get_mut(HgPath::new(b"a/b/c")), Some(&mut entry));

        let mut tree = Tree::new();

        // was_file
        assert_eq!(tree.insert_node(HgPath::new(b"a"), entry), None);
        assert_eq!(tree.files_count, 1);
        assert_eq!(tree.insert_node(HgPath::new(b"a/b"), entry2), None);
        assert_eq!(tree.files_count, 2);
        assert_eq!(tree.get_mut(HgPath::new(b"a/b")), Some(&mut entry2));
    }

    #[test]
    fn test_remove() {
        let mut tree = Tree::new();
        assert_eq!(tree.files_count, 0);
        assert_eq!(tree.remove(HgPath::new(b"foo")), None);
        assert_eq!(tree.files_count, 0);

        let entry = DirstateEntry {
            state: EntryState::Normal,
            mode: 0,
            mtime: 0,
            size: 0,
        };
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
        assert_eq!(tree.files_count, 1);

        assert_eq!(tree.remove(HgPath::new(b"a/b/c")), Some(entry));
        assert_eq!(tree.files_count, 0);

        assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/y"), entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/z"), entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"x"), entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"y"), entry), None);
        assert_eq!(tree.files_count, 5);

        assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(entry));
        assert_eq!(tree.files_count, 4);
        assert_eq!(tree.remove(HgPath::new(b"a/b/x")), None);
        assert_eq!(tree.files_count, 4);
        assert_eq!(tree.remove(HgPath::new(b"a/b/y")), Some(entry));
        assert_eq!(tree.files_count, 3);
        assert_eq!(tree.remove(HgPath::new(b"a/b/z")), Some(entry));
        assert_eq!(tree.files_count, 2);

        assert_eq!(tree.remove(HgPath::new(b"x")), Some(entry));
        assert_eq!(tree.files_count, 1);
        assert_eq!(tree.remove(HgPath::new(b"y")), Some(entry));
        assert_eq!(tree.files_count, 0);

        // `a` should have been cleaned up, no more files anywhere in its
        // descendents
        assert_eq!(tree.get_node(HgPath::new(b"a")), None);
        assert_eq!(tree.root.children().unwrap().len(), 0);

        let removed_entry = DirstateEntry {
            state: EntryState::Removed,
            ..entry
        };
        assert_eq!(tree.insert(HgPath::new(b"a"), removed_entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), entry), None);
        assert_eq!(tree.files_count, 2);
        dbg!(&tree);
        assert_eq!(tree.remove(HgPath::new(b"a")), Some(removed_entry));
        assert_eq!(tree.files_count, 1);
        dbg!(&tree);
        assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(entry));
        assert_eq!(tree.files_count, 0);

        // The entire tree should have been cleaned up, no more files anywhere
        // in its descendents
        assert_eq!(tree.root.children().unwrap().len(), 0);

        let removed_entry = DirstateEntry {
            state: EntryState::Removed,
            ..entry
        };
        assert_eq!(tree.insert(HgPath::new(b"a"), entry), None);
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), removed_entry), None);
        assert_eq!(tree.files_count, 2);
        dbg!(&tree);
        assert_eq!(tree.remove(HgPath::new(b"a")), Some(entry));
        assert_eq!(tree.files_count, 1);
        dbg!(&tree);
        assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(removed_entry));
        assert_eq!(tree.files_count, 0);

        dbg!(&tree);
        // The entire tree should have been cleaned up, no more files anywhere
        // in its descendents
        assert_eq!(tree.root.children().unwrap().len(), 0);

        assert_eq!(tree.insert(HgPath::new(b"d"), entry), None);
        assert_eq!(tree.insert(HgPath::new(b"d/d/d"), entry), None);
        assert_eq!(tree.files_count, 2);

        // Deleting the nested file should not delete the top directory as it
        // used to be a file
        assert_eq!(tree.remove(HgPath::new(b"d/d/d")), Some(entry));
        assert_eq!(tree.files_count, 1);
        assert!(tree.get_node(HgPath::new(b"d")).is_some());
        assert!(tree.remove(HgPath::new(b"d")).is_some());
        assert_eq!(tree.files_count, 0);

        // Deleting the nested file should not delete the top file (other way
        // around from the last case)
        assert_eq!(tree.insert(HgPath::new(b"a/a"), entry), None);
        assert_eq!(tree.files_count, 1);
        assert_eq!(tree.insert(HgPath::new(b"a"), entry), None);
        assert_eq!(tree.files_count, 2);
        dbg!(&tree);
        assert_eq!(tree.remove(HgPath::new(b"a/a")), Some(entry));
        assert_eq!(tree.files_count, 1);
        dbg!(&tree);
        assert!(tree.get_node(HgPath::new(b"a")).is_some());
        assert!(tree.get_node(HgPath::new(b"a/a")).is_none());
    }

    #[test]
    fn test_was_directory() {
        let mut tree = Tree::new();

        let entry = DirstateEntry {
            state: EntryState::Removed,
            mode: 0,
            mtime: 0,
            size: 0,
        };
        assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
        assert_eq!(tree.files_count, 1);

        assert!(tree.insert_node(HgPath::new(b"a"), entry).is_some());
        let new_a = tree.root.children().unwrap().get(&b"a".to_vec()).unwrap();

        match &new_a.kind {
            NodeKind::Directory(_) => panic!(),
            NodeKind::File(f) => {
                let dir = f.was_directory.clone().unwrap();
                let c = dir
                    .children
                    .get(&b"b".to_vec())
                    .unwrap()
                    .children()
                    .unwrap()
                    .get(&b"c".to_vec())
                    .unwrap();

                assert_eq!(
                    match &c.kind {
                        NodeKind::Directory(_) => panic!(),
                        NodeKind::File(f) => f.entry,
                    },
                    entry
                );
            }
        }
        assert_eq!(tree.files_count, 2);
        dbg!(&tree);
        assert_eq!(tree.remove(HgPath::new(b"a/b/c")), Some(entry));
        assert_eq!(tree.files_count, 1);
        dbg!(&tree);
        let a = tree.get_node(HgPath::new(b"a")).unwrap();
        match &a.kind {
            NodeKind::Directory(_) => panic!(),
            NodeKind::File(f) => {
                // Directory in `was_directory` was emptied, should be removed
                assert_eq!(f.was_directory, None);
            }
        }
    }
    #[test]
    fn test_extend() {
        let insertions = [
            (
                HgPathBuf::from_bytes(b"d"),
                DirstateEntry {
                    state: EntryState::Added,
                    mode: 0,
                    mtime: -1,
                    size: -1,
                },
            ),
            (
                HgPathBuf::from_bytes(b"b"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 33188,
                    mtime: 1599647984,
                    size: 2,
                },
            ),
            (
                HgPathBuf::from_bytes(b"a/a"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 33188,
                    mtime: 1599647984,
                    size: 2,
                },
            ),
            (
                HgPathBuf::from_bytes(b"d/d/d"),
                DirstateEntry {
                    state: EntryState::Removed,
                    mode: 0,
                    mtime: 0,
                    size: 0,
                },
            ),
        ]
        .to_vec();
        let mut tree = Tree::new();

        tree.extend(insertions.clone().into_iter());

        for (path, _) in &insertions {
            assert!(tree.contains_key(path), true);
        }
        assert_eq!(tree.files_count, 4);
    }
}