rust/hg-core/src/revlog/filelog.rs
author Arseniy Alekseyev <aalekseyev@janestreet.com>
Wed, 05 Oct 2022 15:45:05 -0400
changeset 49517 52464a20add0
parent 49174 3f86ee422095
child 49930 e98fd81bb151
permissions -rw-r--r--
rhg: parallelize computation of [unsure_is_modified] [unsure_is_modified] is called for every file for which we can't determine its status based on its size and mtime alone. In particular, this happens if the mtime of the file changes without its contents changing. Parallelizing this improves performance significantly when we have many of these files. Here's an example run (on a repo with ~400k files after dropping FS caches) ``` before: real 0m53.901s user 0m27.806s sys 0m31.325s after: real 0m32.017s user 0m34.277s sys 1m26.250s ``` Another example run (a different FS): ``` before: real 3m28.479s user 0m31.800s sys 0m25.324s after: real 0m29.751s user 0m41.814s sys 1m15.387s ```

use crate::errors::HgError;
use crate::repo::Repo;
use crate::revlog::path_encode::path_encode;
use crate::revlog::revlog::RevlogEntry;
use crate::revlog::revlog::{Revlog, RevlogError};
use crate::revlog::NodePrefix;
use crate::revlog::Revision;
use crate::utils::files::get_path_from_bytes;
use crate::utils::hg_path::HgPath;
use crate::utils::SliceExt;
use std::path::PathBuf;

/// A specialized `Revlog` to work with file data logs.
///
/// This is a thin wrapper: it owns one generic `Revlog` and layers
/// filelog-specific result types on top of it.
pub struct Filelog {
    /// The generic `revlog` format.
    ///
    /// This is the underlying storage that holds the revisions of the
    /// tracked file.
    revlog: Revlog,
}

impl Filelog {
    /// Opens the filelog of `file_path` through the given store VFS.
    ///
    /// The index (`.i`) and data (`.d`) paths are both derived from
    /// `file_path` with `store_path`.
    ///
    /// # Errors
    ///
    /// Propagates any `HgError` returned by `Revlog::open`.
    pub fn open_vfs(
        store_vfs: &crate::vfs::Vfs<'_>,
        file_path: &HgPath,
    ) -> Result<Self, HgError> {
        let index_path = store_path(file_path, b".i");
        let data_path = store_path(file_path, b".d");
        let revlog =
            Revlog::open(store_vfs, index_path, Some(&data_path), false)?;
        Ok(Self { revlog })
    }

    /// Opens the filelog of `file_path` using the store VFS of `repo`.
    ///
    /// Convenience wrapper around [`Filelog::open_vfs`].
    pub fn open(repo: &Repo, file_path: &HgPath) -> Result<Self, HgError> {
        Self::open_vfs(&repo.store_vfs(), file_path)
    }

    /// Returns the revision data identified by `file_node`.
    ///
    /// The given node ID is that of the file as found in a filelog, not of a
    /// changeset.
    ///
    /// # Errors
    ///
    /// Fails if the node cannot be resolved to a revision, or if
    /// [`Filelog::data_for_rev`] fails for that revision.
    pub fn data_for_node(
        &self,
        file_node: impl Into<NodePrefix>,
    ) -> Result<FilelogRevisionData, RevlogError> {
        let file_rev = self.revlog.rev_from_node(file_node.into())?;
        self.data_for_rev(file_rev)
    }

    /// Returns the revision data stored at `file_rev`.
    ///
    /// The given revision is that of the file as found in a filelog, not of a
    /// changeset.
    ///
    /// # Errors
    ///
    /// Propagates any `RevlogError` returned by `Revlog::get_rev_data`.
    pub fn data_for_rev(
        &self,
        file_rev: Revision,
    ) -> Result<FilelogRevisionData, RevlogError> {
        // `into_owned` already yields the `Vec<u8>` the wrapper stores, so
        // no further conversion is needed.
        let data = self.revlog.get_rev_data(file_rev)?.into_owned();
        Ok(FilelogRevisionData(data))
    }

    /// Returns the filelog entry identified by `file_node`.
    ///
    /// The given node ID is that of the file as found in a filelog, not of a
    /// changeset.
    ///
    /// # Errors
    ///
    /// Fails if the node cannot be resolved to a revision, or if
    /// [`Filelog::entry_for_rev`] fails for that revision.
    pub fn entry_for_node(
        &self,
        file_node: impl Into<NodePrefix>,
    ) -> Result<FilelogEntry, RevlogError> {
        let file_rev = self.revlog.rev_from_node(file_node.into())?;
        self.entry_for_rev(file_rev)
    }

    /// Returns the filelog entry stored at `file_rev`.
    ///
    /// The given revision is that of the file as found in a filelog, not of a
    /// changeset.
    ///
    /// # Errors
    ///
    /// Propagates any `RevlogError` returned by `Revlog::get_entry`.
    pub fn entry_for_rev(
        &self,
        file_rev: Revision,
    ) -> Result<FilelogEntry, RevlogError> {
        Ok(FilelogEntry(self.revlog.get_entry(file_rev)?))
    }
}

/// Builds the path, relative to the store, of the revlog file that holds
/// `hg_path` with the given `suffix` (for example `b".i"` or `b".d"`).
///
/// The raw name is `data/` + the tracked path + `suffix`; it is passed
/// through `path_encode` before being turned into a `PathBuf`.
fn store_path(hg_path: &HgPath, suffix: &[u8]) -> PathBuf {
    let mut raw_name = b"data/".to_vec();
    raw_name.extend_from_slice(hg_path.as_bytes());
    raw_name.extend_from_slice(suffix);

    let encoded_name = path_encode(&raw_name);
    PathBuf::from(get_path_from_bytes(&encoded_name))
}

/// One revision of a filelog, wrapping the generic `RevlogEntry` for it.
///
/// Values of this type are produced by `Filelog::entry_for_node` and
/// `Filelog::entry_for_rev`.
pub struct FilelogEntry<'a>(RevlogEntry<'a>);

impl FilelogEntry<'_> {
    /// Cheap length pre-check that avoids calling `self.data()`, which can
    /// be expensive because of decompression and delta resolution.
    ///
    /// Using only revlog index information (notably
    /// `RevlogEntry::uncompressed_len`):
    ///
    /// * Returns `true` when the length `self.data().file_data().len()`
    ///   would report is definitely **not equal** to `other_len`.
    /// * Returns `false` when the available information is inconclusive.
    pub fn file_data_len_not_equal_to(&self, other_len: u64) -> bool {
        // The Python code implementing this behavior lives in:
        // basefilectx.cmp, filelog.size, storageutil.filerevisioncopied,
        // revlog.size, revlog.rawsize
        //
        // Below, "file data length" means what
        // `self.data().file_data().len()` would return.
        let entry = &self.0;

        if entry.is_censored() {
            // The file data length of a censored revision is taken to be
            // zero.
            return other_len != 0;
        }

        if entry.has_length_affecting_flag_processor() {
            // Nothing can be concluded about the file data length.
            return false;
        }

        // Revlog revisions (usually) record the size of their data after
        // decompression and delta resolution, i.e. the size of what
        // `Revlog::get_rev_data` would return.
        //
        // For a filelog that is the file's contents, preceded by an
        // optional metadata block.
        let stored_len = match entry.uncompressed_len() {
            Some(len) => len as u64,
            // The field was set to -1: the real length is unknown, and
            // only decompressing could tell us more.
            None => return false,
        };

        // `stored_len` is the file data length plus the length of the
        // optional metadata block, so the file data length is at most
        // `stored_len`.
        if stored_len < other_len {
            // The file data length is then strictly below `other_len`.
            return true;
        }

        // The file data length may equal `other_len` if the metadata block
        // is empty (2 start marker bytes + 2 end marker bytes). Such a
        // block is written when there would otherwise be no metadata but
        // the first 2 bytes of the file data look like a start marker and
        // would be ambiguous.
        let maybe_empty_metadata = stored_len == other_len + 4;

        // Without a p1 there may or may not be copy metadata, so nothing
        // more can be deduced without computing the file data.
        if maybe_empty_metadata || !entry.has_p1() {
            return false;
        }

        // Filelog ancestry is not meaningful in the way changelog ancestry
        // is: it only provides hints to delta generation. p1 and p2 are set
        // to null when making a copy or rename, since the contents are
        // likely unrelated to what might have previously existed at the
        // destination path.
        //
        // Conversely, p1 is non-null here, so there is no copy metadata.
        // Note that this reasoning may be invalidated in the presence of
        // merges made by some previous versions of Mercurial that swapped
        // p1 and p2.
        // See <https://bz.mercurial-scm.org/show_bug.cgi?id=6528>
        // and `tests/test-issue6528.t`.
        //
        // Copy metadata is currently the only kind of metadata kept in the
        // revlog data of filelogs, so this entry has none and `stored_len`
        // is exactly the file data length.
        stored_len != other_len
    }

    /// Returns the full data of this revision, wrapped in a
    /// `FilelogRevisionData`.
    ///
    /// # Errors
    ///
    /// Propagates any `HgError` returned by `RevlogEntry::data`.
    pub fn data(&self) -> Result<FilelogRevisionData, HgError> {
        let raw = self.0.data()?;
        Ok(FilelogRevisionData(raw.into_owned()))
    }
}

/// The data for one revision in a filelog, uncompressed and delta-resolved.
///
/// The wrapped bytes are either the file contents alone, or a metadata
/// block enclosed between two `\x01\n` delimiters followed by the file
/// contents. Use `split` to separate the two parts.
pub struct FilelogRevisionData(Vec<u8>);

impl FilelogRevisionData {
    /// Separates the optional metadata block from the file contents.
    ///
    /// Returns `(Some(metadata), data)` when the revision starts with the
    /// `\x01\n` delimiter, and `(None, data)` with the whole buffer as
    /// `data` otherwise.
    ///
    /// # Errors
    ///
    /// Returns a corruption error when the opening delimiter is present
    /// but no closing delimiter follows it.
    pub fn split(&self) -> Result<(Option<&[u8]>, &[u8]), HgError> {
        const DELIMITER: &[u8; 2] = b"\x01\n";

        // No opening delimiter: the whole buffer is file data.
        let after_opening = match self.0.drop_prefix(DELIMITER) {
            Some(rest) => rest,
            None => return Ok((None, self.0.as_slice())),
        };

        match after_opening.split_2_by_slice(DELIMITER) {
            Some((metadata, data)) => Ok((Some(metadata), data)),
            None => Err(HgError::corrupted(
                "Missing metadata end delimiter in filelog entry",
            )),
        }
    }

    /// Borrows the file contents at this revision, without any metadata.
    ///
    /// # Errors
    ///
    /// Fails in the same cases as `split`.
    pub fn file_data(&self) -> Result<&[u8], HgError> {
        self.split().map(|(_metadata, data)| data)
    }

    /// Consumes `self` and returns the file contents, dropping the
    /// metadata block if there is one.
    ///
    /// # Errors
    ///
    /// Fails in the same cases as `split`.
    pub fn into_file_data(self) -> Result<Vec<u8>, HgError> {
        match self.split()? {
            // Metadata present: copy out only the part that follows it.
            (Some(_metadata), data) => Ok(data.to_owned()),
            // No metadata: the buffer already is the file data.
            (None, _) => Ok(self.0),
        }
    }
}