rhg: desambiguate status without decompressing filelog if possible
authorSimon Sapin <simon.sapin@octobus.net>
Fri, 07 Jan 2022 14:40:21 +0100
changeset 48546 e91aa800ae5b
parent 48545 5026a0d37526
child 48547 374bf34c9ffd
rhg: desambiguate status without decompressing filelog if possible When status is unsure based on `stat()` and the dirstate if a file is clean or modified, we need to compare it against the filelog. This comparison can skip looking at contents if the lengths differ. This changeset optimize this further to deduce what we can about the length if the filelog without decompressing it or resolving deltas. Differential Revision: https://phab.mercurial-scm.org/D11965
mercurial/revlogutils/flagutil.py
rust/hg-core/src/revlog/filelog.rs
rust/hg-core/src/revlog/index.rs
rust/hg-core/src/revlog/revlog.rs
rust/rhg/src/commands/status.rs
tests/test-issue6528.t
--- a/mercurial/revlogutils/flagutil.py	Thu Jan 06 12:46:10 2022 +0100
+++ b/mercurial/revlogutils/flagutil.py	Fri Jan 07 14:40:21 2022 +0100
@@ -32,6 +32,7 @@
 REVIDX_FLAGS_ORDER
 REVIDX_RAWTEXT_CHANGING_FLAGS
 
+# Keep this in sync with REVIDX_KNOWN_FLAGS in rust/hg-core/src/revlog/revlog.rs
 REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
 
 # Store flag processors (cf. 'addflagprocessor()' to register)
--- a/rust/hg-core/src/revlog/filelog.rs	Thu Jan 06 12:46:10 2022 +0100
+++ b/rust/hg-core/src/revlog/filelog.rs	Fri Jan 07 14:40:21 2022 +0100
@@ -73,6 +73,89 @@
 pub struct FilelogEntry<'a>(RevlogEntry<'a>);
 
 impl FilelogEntry<'_> {
+    /// `self.data()` can be expensive, with decompression and delta
+    /// resolution.
+    ///
+    /// *Without* paying this cost, based on revlog index information
+    /// including `RevlogEntry::uncompressed_len`:
+    ///
+    /// * Returns `true` if the length that `self.data().file_data().len()`
+    ///   would return is definitely **not equal** to `other_len`.
+    /// * Returns `false` if available information is inconclusive.
+    pub fn file_data_len_not_equal_to(&self, other_len: u64) -> bool {
+        // Relevant code that implement this behavior in Python code:
+        // basefilectx.cmp, filelog.size, storageutil.filerevisioncopied,
+        // revlog.size, revlog.rawsize
+
+        // Let’s call `file_data_len` what would be returned by
+        // `self.data().file_data().len()`.
+
+        if self.0.is_cencored() {
+            let file_data_len = 0;
+            return other_len != file_data_len;
+        }
+
+        if self.0.has_length_affecting_flag_processor() {
+            // We can’t conclude anything about `file_data_len`.
+            return false;
+        }
+
+        // Revlog revisions (usually) have metadata for the size of
+        // their data after decompression and delta resolution
+        // as would be returned by `Revlog::get_rev_data`.
+        //
+        // For filelogs this is the file’s contents preceded by an optional
+        // metadata block.
+        let uncompressed_len = if let Some(l) = self.0.uncompressed_len() {
+            l as u64
+        } else {
+            // The field was set to -1, the actual uncompressed len is unknown.
+            // We need to decompress to say more.
+            return false;
+        };
+        // `uncompressed_len = file_data_len + optional_metadata_len`,
+        // so `file_data_len <= uncompressed_len`.
+        if uncompressed_len < other_len {
+            // Transitively, `file_data_len < other_len`.
+            // So `other_len != file_data_len` definitely.
+            return true;
+        }
+
+        if uncompressed_len == other_len + 4 {
+            // It’s possible that `file_data_len == other_len` with an empty
+            // metadata block (2 start marker bytes + 2 end marker bytes).
+            // This happens when there wouldn’t otherwise be metadata, but
+            // the first 2 bytes of file data happen to match a start marker
+            // and would be ambiguous.
+            return false;
+        }
+
+        if !self.0.has_p1() {
+            // There may or may not be copy metadata, so we can’t deduce more
+            // about `file_data_len` without computing file data.
+            return false;
+        }
+
+        // Filelog ancestry is not meaningful in the way changelog ancestry is.
+        // It only provides hints to delta generation.
+        // p1 and p2 are set to null when making a copy or rename since
+        // contents are likely unrelatedto what might have previously existed
+        // at the destination path.
+        //
+        // Conversely, since here p1 is non-null, there is no copy metadata.
+        // Note that this reasoning may be invalidated in the presence of
+        // merges made by some previous versions of Mercurial that
+        // swapped p1 and p2. See <https://bz.mercurial-scm.org/show_bug.cgi?id=6528>
+        // and `tests/test-issue6528.t`.
+        //
+        // Since copy metadata is currently the only kind of metadata
+        // kept in revlog data of filelogs,
+        // this `FilelogEntry` does not have such metadata:
+        let file_data_len = uncompressed_len;
+
+        return file_data_len != other_len;
+    }
+
     pub fn data(&self) -> Result<FilelogRevisionData, HgError> {
         Ok(FilelogRevisionData(self.0.data()?.into_owned()))
     }
--- a/rust/hg-core/src/revlog/index.rs	Thu Jan 06 12:46:10 2022 +0100
+++ b/rust/hg-core/src/revlog/index.rs	Fri Jan 07 14:40:21 2022 +0100
@@ -260,6 +260,10 @@
         }
     }
 
+    pub fn flags(&self) -> u16 {
+        BigEndian::read_u16(&self.bytes[6..=7])
+    }
+
     /// Return the compressed length of the data.
     pub fn compressed_len(&self) -> u32 {
         BigEndian::read_u32(&self.bytes[8..=11])
--- a/rust/hg-core/src/revlog/revlog.rs	Thu Jan 06 12:46:10 2022 +0100
+++ b/rust/hg-core/src/revlog/revlog.rs	Fri Jan 07 14:40:21 2022 +0100
@@ -20,6 +20,18 @@
 use crate::revlog::Revision;
 use crate::{Node, NULL_REVISION};
 
+const REVISION_FLAG_CENSORED: u16 = 1 << 15;
+const REVISION_FLAG_ELLIPSIS: u16 = 1 << 14;
+const REVISION_FLAG_EXTSTORED: u16 = 1 << 13;
+const REVISION_FLAG_HASCOPIESINFO: u16 = 1 << 12;
+
+// Keep this in sync with REVIDX_KNOWN_FLAGS in
+// mercurial/revlogutils/flagutil.py
+const REVIDX_KNOWN_FLAGS: u16 = REVISION_FLAG_CENSORED
+    | REVISION_FLAG_ELLIPSIS
+    | REVISION_FLAG_EXTSTORED
+    | REVISION_FLAG_HASCOPIESINFO;
+
 #[derive(derive_more::From)]
 pub enum RevlogError {
     InvalidRevision,
@@ -282,6 +294,7 @@
             },
             p1: index_entry.p1(),
             p2: index_entry.p2(),
+            flags: index_entry.flags(),
             hash: *index_entry.hash(),
         };
         Ok(entry)
@@ -309,6 +322,7 @@
     base_rev_or_base_of_delta_chain: Option<Revision>,
     p1: Revision,
     p2: Revision,
+    flags: u16,
     hash: Node,
 }
 
@@ -321,6 +335,20 @@
         u32::try_from(self.uncompressed_len).ok()
     }
 
+    pub fn has_p1(&self) -> bool {
+        self.p1 != NULL_REVISION
+    }
+
+    pub fn is_cencored(&self) -> bool {
+        (self.flags & REVISION_FLAG_CENSORED) != 0
+    }
+
+    pub fn has_length_affecting_flag_processor(&self) -> bool {
+        // Relevant Python code: revlog.size()
+        // note: ELLIPSIS is known to not change the content
+        (self.flags & (REVIDX_KNOWN_FLAGS ^ REVISION_FLAG_ELLIPSIS)) != 0
+    }
+
     /// The data for this entry, after resolving deltas if any.
     pub fn data(&self) -> Result<Cow<'a, [u8]>, HgError> {
         let mut entry = self.clone();
--- a/rust/rhg/src/commands/status.rs	Thu Jan 06 12:46:10 2022 +0100
+++ b/rust/rhg/src/commands/status.rs	Fri Jan 07 14:40:21 2022 +0100
@@ -516,16 +516,16 @@
         filelog.entry_for_node(entry.node_id()?).map_err(|_| {
             HgError::corrupted("filelog missing node from manifest")
         })?;
-    // TODO: check `fs_len` here like below, but based on
-    // `RevlogEntry::uncompressed_len` without decompressing the full filelog
-    // contents where possible. This is only valid if the revlog data does not
-    // contain metadata. See how Python’s `revlog.rawsize` calls
-    // `storageutil.filerevisioncopied`.
-    // (Maybe also check for content-modifying flags? See `revlog.size`.)
-    let filelog_data = filelog_entry.data()?;
-    let contents_in_p1 = filelog_data.file_data()?;
-    if contents_in_p1.len() as u64 != fs_len {
-        // No need to read the file contents:
+    if filelog_entry.file_data_len_not_equal_to(fs_len) {
+        // No need to read file contents:
+        // it cannot be equal if it has a different length.
+        return Ok(true);
+    }
+
+    let p1_filelog_data = filelog_entry.data()?;
+    let p1_contents = p1_filelog_data.file_data()?;
+    if p1_contents.len() as u64 != fs_len {
+        // No need to read file contents:
         // it cannot be equal if it has a different length.
         return Ok(true);
     }
@@ -535,5 +535,5 @@
     } else {
         vfs.read(fs_path)?
     };
-    Ok(contents_in_p1 != &*fs_contents)
+    Ok(p1_contents != &*fs_contents)
 }
--- a/tests/test-issue6528.t	Thu Jan 06 12:46:10 2022 +0100
+++ b/tests/test-issue6528.t	Fri Jan 07 14:40:21 2022 +0100
@@ -193,8 +193,8 @@
 deltas where possible.)
 
   $ hg st
-  M D.txt (no-rhg !)
-  M b.txt (no-rhg !)
+  M D.txt
+  M b.txt
   $ hg debugrevlogindex b.txt
      rev linkrev nodeid       p1           p2
        0       2 05b806ebe5ea 000000000000 000000000000
@@ -212,8 +212,8 @@
   found affected revision 1 for filelog 'data/b.txt.i'
   found affected revision 3 for filelog 'data/b.txt.i'
   $ hg st
-  M D.txt (no-rhg !)
-  M b.txt (no-rhg !)
+  M D.txt
+  M b.txt
   $ hg debugrevlogindex b.txt
      rev linkrev nodeid       p1           p2
        0       2 05b806ebe5ea 000000000000 000000000000
@@ -231,8 +231,8 @@
   found affected revision 1 for filelog 'data/b.txt.i'
   found affected revision 3 for filelog 'data/b.txt.i'
   $ hg st
-  M D.txt (no-rhg !)
-  M b.txt (no-rhg !)
+  M D.txt
+  M b.txt
   $ hg debugrevlogindex b.txt
      rev linkrev nodeid       p1           p2
        0       2 05b806ebe5ea 000000000000 000000000000
@@ -308,8 +308,8 @@
   found affected revision 1 for filelog 'b.txt'
   found affected revision 3 for filelog 'b.txt'
   $ hg st
-  M D.txt (no-rhg !)
-  M b.txt (no-rhg !)
+  M D.txt
+  M b.txt
   $ hg debugrevlogindex b.txt
      rev linkrev nodeid       p1           p2
        0       2 05b806ebe5ea 000000000000 000000000000