rust/hg-core/src/revlog/revlog.rs
author Antoine Cezar <antoine.cezar@octobus.net>
Fri, 04 Sep 2020 11:55:07 +0200
changeset 45526 26c53ee51c68
child 45531 b0d6309ff50c
permissions -rw-r--r--
hg-core: Add a limited read only `revlog` implementation Only covers the needs of the upcoming `rhg debugdata` command. Differential Revision: https://phab.mercurial-scm.org/D8958

use std::borrow::Cow;
use std::fs::File;
use std::io::Read;
use std::ops::Deref;
use std::path::Path;

use byteorder::{BigEndian, ByteOrder};
use flate2::read::ZlibDecoder;
use memmap::{Mmap, MmapOptions};
use micro_timer::timed;
use zstd;

use super::index::Index;
use super::patch;
use crate::revlog::Revision;

pub enum RevlogError {
    IoError(std::io::Error),
    UnsuportedVersion(u16),
    InvalidRevision,
    Corrupted,
    UnknowDataFormat(u8),
}

fn mmap_open(path: &Path) -> Result<Mmap, std::io::Error> {
    let file = File::open(path)?;
    let mmap = unsafe { MmapOptions::new().map(&file) }?;
    Ok(mmap)
}

/// Read only implementation of revlog.
pub struct Revlog {
    /// When index and data are not interleaved: bytes of the revlog index.
    /// When index and data are interleaved: bytes of the revlog index and
    /// data.
    index_bytes: Box<dyn Deref<Target = [u8]> + Send>,
    /// When index and data are not interleaved: bytes of the revlog data
    data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>>,
}

impl Revlog {
    /// Open a revlog index file.
    ///
    /// It will also open the associated data file if index and data are not
    /// interleaved.
    #[timed]
    pub fn open(index_path: &Path) -> Result<Self, RevlogError> {
        let index_mmap =
            mmap_open(&index_path).map_err(RevlogError::IoError)?;

        let version = get_version(&index_mmap);
        if version != 1 {
            return Err(RevlogError::UnsuportedVersion(version));
        }

        let is_inline = is_inline(&index_mmap);

        let index_bytes = Box::new(index_mmap);

        // TODO load data only when needed //
        // type annotation required
        // won't recognize Mmap as Deref<Target = [u8]>
        let data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>> =
            if is_inline {
                None
            } else {
                let data_path = index_path.with_extension("d");
                let data_mmap =
                    mmap_open(&data_path).map_err(RevlogError::IoError)?;
                Some(Box::new(data_mmap))
            };

        Ok(Revlog {
            index_bytes,
            data_bytes,
        })
    }

    /// Return the full data associated to a revision.
    ///
    /// All entries required to build the final data out of deltas will be
    /// retrieved as needed, and the deltas will be applied to the inital
    /// snapshot to rebuild the final data.
    #[timed]
    pub fn get_rev_data(&self, rev: Revision) -> Result<Vec<u8>, RevlogError> {
        // Todo return -> Cow
        let mut entry = self.get_entry(rev)?;
        let mut delta_chain = vec![];
        while let Some(base_rev) = entry.base_rev {
            delta_chain.push(entry);
            entry = self
                .get_entry(base_rev)
                .map_err(|_| RevlogError::Corrupted)?;
        }

        if delta_chain.is_empty() {
            Ok(entry.data()?.into())
        } else {
            Revlog::build_data_from_deltas(entry, &delta_chain)
        }
    }

    /// Build the full data of a revision out its snapshot
    /// and its deltas.
    #[timed]
    fn build_data_from_deltas(
        snapshot: RevlogEntry,
        deltas: &[RevlogEntry],
    ) -> Result<Vec<u8>, RevlogError> {
        let snapshot = snapshot.data()?;
        let deltas = deltas
            .iter()
            .rev()
            .map(RevlogEntry::data)
            .collect::<Result<Vec<Cow<'_, [u8]>>, RevlogError>>()?;
        let patches: Vec<_> =
            deltas.iter().map(|d| patch::PatchList::new(d)).collect();
        let patch = patch::fold_patch_lists(&patches);
        Ok(patch.apply(&snapshot))
    }

    /// Return the revlog index.
    fn index(&self) -> Index {
        let is_inline = self.data_bytes.is_none();
        Index::new(&self.index_bytes, is_inline)
    }

    /// Return the revlog data.
    fn data(&self) -> &[u8] {
        match self.data_bytes {
            Some(ref data_bytes) => &data_bytes,
            None => &self.index_bytes,
        }
    }

    /// Get an entry of the revlog.
    fn get_entry(&self, rev: Revision) -> Result<RevlogEntry, RevlogError> {
        let index = self.index();
        let index_entry =
            index.get_entry(rev).ok_or(RevlogError::InvalidRevision)?;
        let start = index_entry.offset();
        let end = start + index_entry.compressed_len();
        let entry = RevlogEntry {
            rev,
            bytes: &self.data()[start..end],
            compressed_len: index_entry.compressed_len(),
            uncompressed_len: index_entry.uncompressed_len(),
            base_rev: if index_entry.base_revision() == rev {
                None
            } else {
                Some(index_entry.base_revision())
            },
        };
        Ok(entry)
    }
}

/// The revlog entry's bytes and the necessary informations to extract
/// the entry's data.
#[derive(Debug)]
pub struct RevlogEntry<'a> {
    rev: Revision,
    bytes: &'a [u8],
    compressed_len: usize,
    uncompressed_len: usize,
    base_rev: Option<Revision>,
}

impl<'a> RevlogEntry<'a> {
    /// Extract the data contained in the entry.
    pub fn data(&self) -> Result<Cow<'_, [u8]>, RevlogError> {
        if self.bytes.is_empty() {
            return Ok(Cow::Borrowed(&[]));
        }
        match self.bytes[0] {
            // Revision data is the entirety of the entry, including this
            // header.
            b'\0' => Ok(Cow::Borrowed(self.bytes)),
            // Raw revision data follows.
            b'u' => Ok(Cow::Borrowed(&self.bytes[1..])),
            // zlib (RFC 1950) data.
            b'x' => Ok(Cow::Owned(self.uncompressed_zlib_data())),
            // zstd data.
            b'\x28' => Ok(Cow::Owned(self.uncompressed_zstd_data())),
            format_type => Err(RevlogError::UnknowDataFormat(format_type)),
        }
    }

    fn uncompressed_zlib_data(&self) -> Vec<u8> {
        let mut decoder = ZlibDecoder::new(self.bytes);
        if self.is_delta() {
            let mut buf = Vec::with_capacity(self.compressed_len);
            decoder.read_to_end(&mut buf).expect("corrupted zlib data");
            buf
        } else {
            let mut buf = vec![0; self.uncompressed_len];
            decoder.read_exact(&mut buf).expect("corrupted zlib data");
            buf
        }
    }

    fn uncompressed_zstd_data(&self) -> Vec<u8> {
        if self.is_delta() {
            let mut buf = Vec::with_capacity(self.compressed_len);
            zstd::stream::copy_decode(self.bytes, &mut buf)
                .expect("corrupted zstd data");
            buf
        } else {
            let mut buf = vec![0; self.uncompressed_len];
            let len = zstd::block::decompress_to_buffer(self.bytes, &mut buf)
                .expect("corrupted zstd data");
            assert_eq!(len, self.uncompressed_len, "corrupted zstd data");
            buf
        }
    }

    /// Tell if the entry is a snapshot or a delta
    /// (influences on decompression).
    fn is_delta(&self) -> bool {
        self.base_rev.is_some()
    }
}

/// Value of the inline flag.
pub fn is_inline(index_bytes: &[u8]) -> bool {
    match &index_bytes[0..=1] {
        [0, 0] | [0, 2] => false,
        _ => true,
    }
}

/// Format version of the revlog.
pub fn get_version(index_bytes: &[u8]) -> u16 {
    BigEndian::read_u16(&index_bytes[2..=3])
}

#[cfg(test)]
mod tests {
    use super::*;

    use super::super::index::IndexEntryBuilder;

    #[cfg(test)]
    pub struct RevlogBuilder {
        version: u16,
        is_general_delta: bool,
        is_inline: bool,
        offset: usize,
        index: Vec<Vec<u8>>,
        data: Vec<Vec<u8>>,
    }

    #[cfg(test)]
    impl RevlogBuilder {
        pub fn new() -> Self {
            Self {
                version: 2,
                is_inline: false,
                is_general_delta: true,
                offset: 0,
                index: vec![],
                data: vec![],
            }
        }

        pub fn with_inline(&mut self, value: bool) -> &mut Self {
            self.is_inline = value;
            self
        }

        pub fn with_general_delta(&mut self, value: bool) -> &mut Self {
            self.is_general_delta = value;
            self
        }

        pub fn with_version(&mut self, value: u16) -> &mut Self {
            self.version = value;
            self
        }

        pub fn push(
            &mut self,
            mut index: IndexEntryBuilder,
            data: Vec<u8>,
        ) -> &mut Self {
            if self.index.is_empty() {
                index.is_first(true);
                index.with_general_delta(self.is_general_delta);
                index.with_inline(self.is_inline);
                index.with_version(self.version);
            } else {
                index.with_offset(self.offset);
            }
            self.index.push(index.build());
            self.offset += data.len();
            self.data.push(data);
            self
        }

        pub fn build_inline(&self) -> Vec<u8> {
            let mut bytes =
                Vec::with_capacity(self.index.len() + self.data.len());
            for (index, data) in self.index.iter().zip(self.data.iter()) {
                bytes.extend(index);
                bytes.extend(data);
            }
            bytes
        }
    }

    #[test]
    fn is_not_inline_when_no_inline_flag_test() {
        let bytes = RevlogBuilder::new()
            .with_general_delta(false)
            .with_inline(false)
            .push(IndexEntryBuilder::new(), vec![])
            .build_inline();

        assert_eq!(is_inline(&bytes), false)
    }

    #[test]
    fn is_inline_when_inline_flag_test() {
        let bytes = RevlogBuilder::new()
            .with_general_delta(false)
            .with_inline(true)
            .push(IndexEntryBuilder::new(), vec![])
            .build_inline();

        assert_eq!(is_inline(&bytes), true)
    }

    #[test]
    fn is_inline_when_inline_and_generaldelta_flags_test() {
        let bytes = RevlogBuilder::new()
            .with_general_delta(true)
            .with_inline(true)
            .push(IndexEntryBuilder::new(), vec![])
            .build_inline();

        assert_eq!(is_inline(&bytes), true)
    }

    #[test]
    fn version_test() {
        let bytes = RevlogBuilder::new()
            .with_version(1)
            .push(IndexEntryBuilder::new(), vec![])
            .build_inline();

        assert_eq!(get_version(&bytes), 1)
    }
}