rhg: Use binary search in manifest lookup
authorSimon Sapin <simon.sapin@octobus.net>
Thu, 16 Dec 2021 17:34:51 +0100
changeset 48495 e293ff808a05
parent 48492 d3ec82016104
child 48496 30741bbea550
rhg: Use binary search in manifest lookup … instead of linear scan, when looking for a single entry based on its path. Manifest entries are sorted by path, but are variable-size so we can’t use the standard library’s `[T]::binary_search`. We can still jump to a byte index and then look around for entry boundaries. Differential Revision: https://phab.mercurial-scm.org/D11932
rust/hg-core/src/revlog/manifest.rs
rust/rhg/src/commands/status.rs
--- a/rust/hg-core/src/revlog/manifest.rs	Fri Dec 17 11:46:30 2021 +0100
+++ b/rust/hg-core/src/revlog/manifest.rs	Thu Dec 16 17:34:51 2021 +0100
@@ -52,6 +52,15 @@
 /// `Manifestlog` entry which knows how to interpret the `manifest` data bytes.
 #[derive(Debug)]
 pub struct Manifest {
+    /// Format for a manifest: flat sequence of variable-size entries,
+    /// sorted by path, each as:
+    ///
+    /// ```text
+    /// <path> \0 <hex_node_id> <flags> \n
+    /// ```
+    ///
+    /// The last entry is also terminated by a newline character.
+    /// Flags is one of `b""` (the empty string), `b"x"`, `b"l"`, or `b"t"`.
     bytes: Vec<u8>,
 }
 
@@ -62,44 +71,84 @@
         self.bytes
             .split(|b| b == &b'\n')
             .filter(|line| !line.is_empty())
-            .map(|line| {
-                let (path, rest) = line.split_2(b'\0').ok_or_else(|| {
-                    HgError::corrupted("manifest line should contain \\0")
-                })?;
-                let path = HgPath::new(path);
-                let (hex_node_id, flags) = match rest.split_last() {
-                    Some((&b'x', rest)) => (rest, Some(b'x')),
-                    Some((&b'l', rest)) => (rest, Some(b'l')),
-                    Some((&b't', rest)) => (rest, Some(b't')),
-                    _ => (rest, None),
-                };
-                Ok(ManifestEntry {
-                    path,
-                    hex_node_id,
-                    flags,
-                })
-            })
+            .map(ManifestEntry::from_raw)
     }
 
     /// If the given path is in this manifest, return its filelog node ID
-    pub fn find_file(
+    pub fn find_by_path(
         &self,
         path: &HgPath,
     ) -> Result<Option<ManifestEntry>, HgError> {
-        // TODO: use binary search instead of linear scan. This may involve
-        // building (and caching) an index of the byte indicex of each manifest
-        // line.
+        use std::cmp::Ordering::*;
+        let path = path.as_bytes();
+        // Both boundaries of this `&[u8]` slice are always at the boundary of
+        // an entry
+        let mut bytes = &*self.bytes;
 
-        // TODO: use try_find when available (if still using linear scan)
-        // https://github.com/rust-lang/rust/issues/63178
-        for entry in self.iter() {
-            let entry = entry?;
-            if entry.path == path {
-                return Ok(Some(entry));
+        // Binary search algorithm derived from `[T]::binary_search_by`
+        // <https://github.com/rust-lang/rust/blob/1.57.0/library/core/src/slice/mod.rs#L2221>
+        // except we don’t have a slice of entries. Instead we jump to the
+        // middle of the byte slice and look around for entry delimiters
+        // (newlines).
+        while let Some(entry_range) = Self::find_entry_near_middle_of(bytes)? {
+            let (entry_path, rest) =
+                ManifestEntry::split_path(&bytes[entry_range.clone()])?;
+            let cmp = entry_path.cmp(path);
+            if cmp == Less {
+                let after_newline = entry_range.end + 1;
+                bytes = &bytes[after_newline..];
+            } else if cmp == Greater {
+                bytes = &bytes[..entry_range.start];
+            } else {
+                return Ok(Some(ManifestEntry::from_path_and_rest(
+                    entry_path, rest,
+                )));
             }
         }
         Ok(None)
     }
+
+    /// If there is at least one, return the byte range of an entry *excluding*
+    /// the final newline.
+    fn find_entry_near_middle_of(
+        bytes: &[u8],
+    ) -> Result<Option<std::ops::Range<usize>>, HgError> {
+        let len = bytes.len();
+        if len > 0 {
+            let middle = bytes.len() / 2;
+            // Integer division rounds down, so `middle < len`.
+            let (before, after) = bytes.split_at(middle);
+            let is_newline = |&byte: &u8| byte == b'\n';
+            let entry_start = match before.iter().rposition(is_newline) {
+                Some(i) => i + 1,
+                None => 0, // We choose the first entry in `bytes`
+            };
+            let entry_end = match after.iter().position(is_newline) {
+                Some(i) => {
+                    // No `+ 1` here to exclude this newline from the range
+                    middle + i
+                }
+                None => {
+                    // In a well-formed manifest:
+                    //
+                    // * Since `len > 0`, `bytes` contains at least one entry
+                    // * Every entry ends with a newline
+                    // * Since `middle < len`, `after` contains at least the
+                    //   newline at the end of the last entry of `bytes`.
+                    //
+                    // We didn’t find a newline, so this manifest is not
+                    // well-formed.
+                    return Err(HgError::corrupted(
+                        "manifest entry without \\n delimiter",
+                    ));
+                }
+            };
+            Ok(Some(entry_start..entry_end))
+        } else {
+            // len == 0
+            Ok(None)
+        }
+    }
 }
 
 /// `Manifestlog` entry which knows how to interpret the `manifest` data bytes.
@@ -112,7 +161,32 @@
     pub flags: Option<u8>,
 }
 
-impl ManifestEntry<'_> {
+impl<'a> ManifestEntry<'a> {
+    fn split_path(bytes: &[u8]) -> Result<(&[u8], &[u8]), HgError> {
+        bytes.split_2(b'\0').ok_or_else(|| {
+            HgError::corrupted("manifest entry without \\0 delimiter")
+        })
+    }
+
+    fn from_path_and_rest(path: &'a [u8], rest: &'a [u8]) -> Self {
+        let (hex_node_id, flags) = match rest.split_last() {
+            Some((&b'x', rest)) => (rest, Some(b'x')),
+            Some((&b'l', rest)) => (rest, Some(b'l')),
+            Some((&b't', rest)) => (rest, Some(b't')),
+            _ => (rest, None),
+        };
+        Self {
+            path: HgPath::new(path),
+            hex_node_id,
+            flags,
+        }
+    }
+
+    fn from_raw(bytes: &'a [u8]) -> Result<Self, HgError> {
+        let (path, rest) = Self::split_path(bytes)?;
+        Ok(Self::from_path_and_rest(path, rest))
+    }
+
     pub fn node_id(&self) -> Result<Node, HgError> {
         Node::from_hex_for_repo(self.hex_node_id)
     }
--- a/rust/rhg/src/commands/status.rs	Fri Dec 17 11:46:30 2021 +0100
+++ b/rust/rhg/src/commands/status.rs	Thu Dec 16 17:34:51 2021 +0100
@@ -473,7 +473,7 @@
     };
 
     let entry = manifest
-        .find_file(hg_path)?
+        .find_by_path(hg_path)?
         .expect("ambgious file not in p1");
     if entry.flags != fs_flags {
         return Ok(true);