rhg: stop manifest traversal when no more files are needed
authorArseniy Alekseyev <aalekseyev@janestreet.com>
Tue, 05 Oct 2021 15:10:42 +0100
changeset 48225 0cc69017d47f
parent 48224 6b5773f89183
child 48226 5e77bdc29d56
rhg: stop manifest traversal when no more files are needed Stopping the traversal early can skip a significant part of the manifest traversal, to avoid some of its cost. The worst-case benchmarks are favorable, as well. Running [hg cat] on the last file in the manifest of a large repo, I'm seeing a ~4ms improvement (150ms -> 146ms), so this time is now almost indistinguishable from the baseline ("brute force") implementation. Running [hg cat] on ~220 files together with the last file of the repo is further improved by ~5ms or so. I suspect the raw performance improvements are caused by splitting the manifest search and the file data access into separate phases, instead of interleaving them. Differential Revision: https://phab.mercurial-scm.org/D11616
rust/hg-core/src/operations/cat.rs
--- a/rust/hg-core/src/operations/cat.rs	Mon Oct 04 19:06:45 2021 +0100
+++ b/rust/hg-core/src/operations/cat.rs	Tue Oct 05 15:10:42 2021 +0100
@@ -9,10 +9,12 @@
 use crate::revlog::revlog::RevlogError;
 use crate::revlog::Node;
 
+use crate::utils::hg_path::HgPath;
 use crate::utils::hg_path::HgPathBuf;
 
-use itertools::EitherOrBoth::{Both, Left, Right};
-use itertools::Itertools;
+use itertools::put_back;
+use itertools::PutBack;
+use std::cmp::Ordering;
 
 pub struct CatOutput {
     /// Whether any file in the manifest matched the paths given as CLI
@@ -26,6 +28,49 @@
     pub node: Node,
 }
 
+// Find an item in an iterator over a sorted collection.
+fn find_item<'a, 'b, 'c, D, I: Iterator<Item = (&'a HgPath, D)>>(
+    i: &mut PutBack<I>,
+    needle: &'b HgPath,
+) -> Option<I::Item> {
+    loop {
+        match i.next() {
+            None => return None,
+            Some(val) => match needle.as_bytes().cmp(val.0.as_bytes()) {
+                Ordering::Less => {
+                    i.put_back(val);
+                    return None;
+                }
+                Ordering::Greater => continue,
+                Ordering::Equal => return Some(val),
+            },
+        }
+    }
+}
+
+fn find_files_in_manifest<
+    'a,
+    'b,
+    D,
+    I: Iterator<Item = (&'a HgPath, D)>,
+    J: Iterator<Item = &'b HgPath>,
+>(
+    manifest: I,
+    files: J,
+) -> (Vec<(&'a HgPath, D)>, Vec<&'b HgPath>) {
+    let mut manifest = put_back(manifest);
+    let mut res = vec![];
+    let mut missing = vec![];
+
+    for file in files {
+        match find_item(&mut manifest, file) {
+            None => missing.push(file),
+            Some(item) => res.push(item),
+        }
+    }
+    return (res, missing);
+}
+
 /// Output the given revision of files
 ///
 /// * `root`: Repository root
@@ -42,34 +87,26 @@
         .changelog()?
         .node_from_rev(rev)
         .expect("should succeed when repo.manifest did");
-    let mut bytes = vec![];
+    let mut bytes: Vec<u8> = vec![];
     let mut found_any = false;
+
     files.sort_unstable();
 
-    let mut missing = vec![];
+    let (found, missing) = find_files_in_manifest(
+        manifest.files_with_nodes(),
+        files.iter().map(|f| f.as_ref()),
+    );
 
-    for entry in manifest
-        .files_with_nodes()
-        .merge_join_by(files.iter(), |(manifest_file, _), file| {
-            manifest_file.cmp(&file.as_ref())
-        })
-    {
-        match entry {
-            Left(_) => (),
-            Right(path) => missing.push(path),
-            Both((manifest_file, node_bytes), _) => {
-                found_any = true;
-                let file_log = repo.filelog(manifest_file)?;
-                let file_node = Node::from_hex_for_repo(node_bytes)?;
-                let entry = file_log.data_for_node(file_node)?;
-                bytes.extend(entry.data()?)
-            }
-        }
+    for (manifest_file, node_bytes) in found {
+        found_any = true;
+        let file_log = repo.filelog(manifest_file)?;
+        let file_node = Node::from_hex_for_repo(node_bytes)?;
+        bytes.extend(file_log.data_for_node(file_node)?.data()?);
     }
 
     let missing: Vec<HgPathBuf> = missing
         .iter()
-        .map(|file| (*(file.as_ref())).to_owned())
+        .map(|file| (*file).to_owned())
         .collect();
     Ok(CatOutput {
         found_any,