rust: pure Rust lazyancestors iterator
authorGeorges Racinet <gracinet@anybox.fr>
Thu, 27 Sep 2018 17:03:16 +0200
changeset 40271 dbc28c91f7ff
parent 40270 8783710b1d58
child 40272 a36c5e23c055
rust: pure Rust lazyancestors iterator This is the first of a patch series aiming to provide an alternative implementation in the Rust programming language of the _lazyancestorsiter from the ancestor module. This iterator has been brought to our attention by the people at Octobus, as a potential good candidate for incremental "oxydation" (rewriting in Rust), because it has shown performance issues lately and it merely deals with ints (revision numbers) obtained by calling the index, whih should be directly callable from Rust code, being itself implemented as a C extension. The idea behind this series is to provide a minimal example of Rust code collaborating with existing C and Python code. To open the way to gradually rewriting more of Mercurial's Python code in Rust, without being forced to pay a large initial cost of rewriting the existing fast core into Rust. This patch does not introduce any bindings to other Mercurial code yet. Instead, it introduces the necessary abstractions to address the problem independently, and unit-test it. Since this is the first use of Rust as a Python module within Mercurial, the hg-core crate gets created within this patch. See its Cargo.toml for more details. Someone with a rustc/cargo installation may chdir into rust/hg-core and run the tests by issuing: cargo test --lib The algorithm is a bit simplified (see details in docstrings), and at its simplest becomes rather trivial, showcasing that Rust has batteries included too: BinaryHeap, the Rust analog of Python's heapq does actually all the work. The implementation can be further optimized and probably be made more idiomatic Rust.
rust/Cargo.lock
rust/Cargo.toml
rust/hg-core/Cargo.toml
rust/hg-core/rustfmt.toml
rust/hg-core/src/ancestors.rs
rust/hg-core/src/lib.rs
--- a/rust/Cargo.lock	Sat Oct 13 23:08:29 2018 -0400
+++ b/rust/Cargo.lock	Thu Sep 27 17:03:16 2018 +0200
@@ -17,6 +17,10 @@
 ]
 
 [[package]]
+name = "hg-core"
+version = "0.1.0"
+
+[[package]]
 name = "hgcli"
 version = "0.1.0"
 dependencies = [
--- a/rust/Cargo.toml	Sat Oct 13 23:08:29 2018 -0400
+++ b/rust/Cargo.toml	Thu Sep 27 17:03:16 2018 +0200
@@ -1,3 +1,3 @@
 [workspace]
-members = ["hgcli"]
+members = ["hgcli", "hg-core"]
 exclude = ["chg"]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/Cargo.toml	Thu Sep 27 17:03:16 2018 +0200
@@ -0,0 +1,8 @@
+[package]
+name = "hg-core"
+version = "0.1.0"
+authors = ["Georges Racinet <gracinet@anybox.fr>"]
+description = "Mercurial pure Rust core library, with no assumption on Python bindings (FFI)"
+
+[lib]
+name = "hg"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/rustfmt.toml	Thu Sep 27 17:03:16 2018 +0200
@@ -0,0 +1,3 @@
+max_width = 79
+wrap_comments = true
+error_on_line_overflow = true
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/ancestors.rs	Thu Sep 27 17:03:16 2018 +0200
@@ -0,0 +1,238 @@
+// ancestors.rs
+//
+// Copyright 2018 Georges Racinet <gracinet@anybox.fr>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+//! Rust versions of generic DAG ancestors algorithms for Mercurial
+
+use super::{Graph, GraphError, Revision, NULL_REVISION};
+use std::collections::{BinaryHeap, HashSet};
+
+/// Iterator over the ancestors of a given list of revisions
+/// This is a generic type, defined and implemented for any Graph, so that
+/// it's easy to
+///
+/// - unit test in pure Rust
+/// - bind to main Mercurial code, potentially in several ways and have these
+///   bindings evolve over time
+pub struct AncestorsIterator<G: Graph> {
+    graph: G,
+    visit: BinaryHeap<Revision>,
+    seen: HashSet<Revision>,
+    stoprev: Revision,
+}
+
+impl<G: Graph> AncestorsIterator<G> {
+    /// Constructor.
+    ///
+    /// if `inclusive` is true, then the init revisions are emitted in
+    /// particular, otherwise iteration starts from their parents.
+    pub fn new<I>(
+        graph: G,
+        initrevs: I,
+        stoprev: Revision,
+        inclusive: bool,
+    ) -> Result<Self, GraphError>
+    where
+        I: IntoIterator<Item = Revision>,
+    {
+        let filtered_initrevs = initrevs.into_iter().filter(|&r| r >= stoprev);
+        if inclusive {
+            let visit: BinaryHeap<Revision> = filtered_initrevs.collect();
+            let seen = visit.iter().map(|&x| x).collect();
+            return Ok(AncestorsIterator {
+                visit: visit,
+                seen: seen,
+                stoprev: stoprev,
+                graph: graph,
+            });
+        }
+        let mut this = AncestorsIterator {
+            visit: BinaryHeap::new(),
+            seen: HashSet::new(),
+            stoprev: stoprev,
+            graph: graph,
+        };
+        this.seen.insert(NULL_REVISION);
+        for rev in filtered_initrevs {
+            this.conditionally_push_parents(rev)?;
+        }
+        Ok(this)
+    }
+
+    #[inline]
+    fn conditionally_push_rev(&mut self, rev: Revision) {
+        if self.stoprev <= rev && !self.seen.contains(&rev) {
+            self.seen.insert(rev);
+            self.visit.push(rev);
+        }
+    }
+
+    #[inline]
+    fn conditionally_push_parents(
+        &mut self,
+        rev: Revision,
+    ) -> Result<(), GraphError> {
+        let parents = self.graph.parents(rev)?;
+        self.conditionally_push_rev(parents.0);
+        self.conditionally_push_rev(parents.1);
+        Ok(())
+    }
+}
+
+/// Main implementation.
+///
+/// The algorithm is the same as in `_lazyancestorsiter()` from `ancestors.py`
+/// with a few non crucial differences:
+///
+/// - there's no filtering of invalid parent revisions. Actually, it should be
+///   consistent and more efficient to filter them from the end caller.
+/// - we don't use the equivalent of `heapq.heapreplace()`, but we should, for
+///   the same reasons (using `peek_mut`)
+/// - we don't have the optimization for adjacent revs (case where p1 == rev-1)
+/// - we save a few pushes by comparing with `stoprev` before pushing
+///
+/// Error treatment:
+/// We swallow the possible GraphError of conditionally_push_parents() to
+/// respect the Iterator trait in a simple manner: never emitting parents
+/// for the returned revision. We finds this good enough for now, because:
+///
+/// - there's a good chance that invalid revisionss are fed from the start,
+///   and `new()` doesn't swallow the error result.
+/// - this is probably what the Python implementation produces anyway, due
+///   to filtering at each step, and Python code is currently the only
+///   concrete caller we target, so we shouldn't need a finer error treatment
+///   for the time being.
+impl<G: Graph> Iterator for AncestorsIterator<G> {
+    type Item = Revision;
+
+    fn next(&mut self) -> Option<Revision> {
+        let current = match self.visit.pop() {
+            None => {
+                return None;
+            }
+            Some(i) => i,
+        };
+        self.conditionally_push_parents(current).unwrap_or(());
+        Some(current)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[derive(Clone, Debug)]
+    struct Stub;
+
+    /// This is the same as the dict from test-ancestors.py
+    impl Graph for Stub {
+        fn parents(
+            &self,
+            rev: Revision,
+        ) -> Result<(Revision, Revision), GraphError> {
+            match rev {
+                0 => Ok((-1, -1)),
+                1 => Ok((0, -1)),
+                2 => Ok((1, -1)),
+                3 => Ok((1, -1)),
+                4 => Ok((2, -1)),
+                5 => Ok((4, -1)),
+                6 => Ok((4, -1)),
+                7 => Ok((4, -1)),
+                8 => Ok((-1, -1)),
+                9 => Ok((6, 7)),
+                10 => Ok((5, -1)),
+                11 => Ok((3, 7)),
+                12 => Ok((9, -1)),
+                13 => Ok((8, -1)),
+                r => Err(GraphError::ParentOutOfRange(r)),
+            }
+        }
+    }
+
+    fn list_ancestors<G: Graph>(
+        graph: G,
+        initrevs: Vec<Revision>,
+        stoprev: Revision,
+        inclusive: bool,
+    ) -> Vec<Revision> {
+        AncestorsIterator::new(graph, initrevs, stoprev, inclusive)
+            .unwrap()
+            .collect()
+    }
+
+    #[test]
+    /// Same tests as test-ancestor.py, without membership
+    /// (see also test-ancestor.py.out)
+    fn test_list_ancestor() {
+        assert_eq!(list_ancestors(Stub, vec![], 0, false), vec![]);
+        assert_eq!(
+            list_ancestors(Stub, vec![11, 13], 0, false),
+            vec![8, 7, 4, 3, 2, 1, 0]
+        );
+        assert_eq!(list_ancestors(Stub, vec![1, 3], 0, false), vec![1, 0]);
+        assert_eq!(
+            list_ancestors(Stub, vec![11, 13], 0, true),
+            vec![13, 11, 8, 7, 4, 3, 2, 1, 0]
+        );
+        assert_eq!(list_ancestors(Stub, vec![11, 13], 6, false), vec![8, 7]);
+        assert_eq!(
+            list_ancestors(Stub, vec![11, 13], 6, true),
+            vec![13, 11, 8, 7]
+        );
+        assert_eq!(list_ancestors(Stub, vec![11, 13], 11, true), vec![13, 11]);
+        assert_eq!(list_ancestors(Stub, vec![11, 13], 12, true), vec![13]);
+        assert_eq!(
+            list_ancestors(Stub, vec![10, 1], 0, true),
+            vec![10, 5, 4, 2, 1, 0]
+        );
+    }
+
+    #[test]
+    /// Corner case that's not directly in test-ancestors.py, but
+    /// that happens quite often, as demonstrated by running the whole
+    /// suite.
+    /// For instance, run tests/test-obsolete-checkheads.t
+    fn test_nullrev_input() {
+        let mut iter =
+            AncestorsIterator::new(Stub, vec![-1], 0, false).unwrap();
+        assert_eq!(iter.next(), None)
+    }
+
+    /// A corrupted Graph, supporting error handling tests
+    struct Corrupted;
+
+    impl Graph for Corrupted {
+        fn parents(
+            &self,
+            rev: Revision,
+        ) -> Result<(Revision, Revision), GraphError> {
+            match rev {
+                1 => Ok((0, -1)),
+                r => Err(GraphError::ParentOutOfRange(r)),
+            }
+        }
+    }
+
+    #[test]
+    fn test_initrev_out_of_range() {
+        // inclusive=false looks up initrev's parents right away
+        match AncestorsIterator::new(Stub, vec![25], 0, false) {
+            Ok(_) => panic!("Should have been ParentOutOfRange"),
+            Err(e) => assert_eq!(e, GraphError::ParentOutOfRange(25)),
+        }
+    }
+
+    #[test]
+    fn test_next_out_of_range() {
+        // inclusive=false looks up initrev's parents right away
+        let mut iter =
+            AncestorsIterator::new(Corrupted, vec![1], 0, false).unwrap();
+        assert_eq!(iter.next(), Some(0));
+        assert_eq!(iter.next(), None);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/lib.rs	Thu Sep 27 17:03:16 2018 +0200
@@ -0,0 +1,24 @@
+// Copyright 2018 Georges Racinet <gracinet@anybox.fr>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+mod ancestors;
+pub use ancestors::AncestorsIterator;
+
+/// Mercurial revision numbers
+///
+/// As noted in revlog.c, revision numbers are actually encoded in
+/// 4 bytes, and are liberally converted to ints, whence the i32
+pub type Revision = i32;
+
+pub const NULL_REVISION: Revision = -1;
+
+/// The simplest expression of what we need of Mercurial DAGs.
+pub trait Graph {
+    fn parents(&self, Revision) -> Result<(Revision, Revision), GraphError>;
+}
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum GraphError {
+    ParentOutOfRange(Revision),
+}