rust/hg-core/src/dirstate_tree/on_disk.rs
changeset 47283 2a9ddc8094c7
parent 47280 1766130fe9ba
child 47330 73f23e7610f8
equal deleted inserted replaced
47282:ce41ee53263f 47283:2a9ddc8094c7
       
     1 //! The "version 2" disk representation of the dirstate
       
     2 //!
       
     3 //! # File format
       
     4 //!
       
     5 //! The file starts with a fixed-sized header, whose layout is defined by the
       
     6 //! `Header` struct. Its `root` field contains the slice (offset and length) to
       
     7 //! the nodes representing the files and directories at the root of the
       
     8 //! repository. Each node is also fixed-size, defined by the `Node` struct.
       
     9 //! Nodes in turn contain slices to variable-size paths, and to their own child
       
    10 //! nodes (if any) for nested files and directories.
       
    11 
       
    12 use crate::dirstate::parsers::clear_ambiguous_mtime;
       
    13 use crate::dirstate::parsers::Timestamp;
       
    14 use crate::dirstate_tree::dirstate_map::{self, DirstateMap};
       
    15 use crate::dirstate_tree::path_with_basename::WithBasename;
       
    16 use crate::errors::HgError;
       
    17 use crate::utils::hg_path::HgPath;
       
    18 use crate::DirstateEntry;
       
    19 use crate::DirstateError;
       
    20 use crate::DirstateParents;
       
    21 use bytes_cast::unaligned::{I32Be, U32Be, U64Be};
       
    22 use bytes_cast::BytesCast;
       
    23 use std::borrow::Cow;
       
    24 use std::convert::{TryFrom, TryInto};
       
    25 
     1 /// Added at the start of `.hg/dirstate` when the "v2" format is used.
    26 /// Added at the start of `.hg/dirstate` when the "v2" format is used.
     2 /// Acts like a "magic number". This is a sanity check, not strictly necessary
    27 /// This a redundant sanity check more than an actual "magic number" since
     3 /// since `.hg/requires` already governs which format should be used.
    28 /// `.hg/requires` already governs which format should be used.
     4 pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
    29 pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
       
    30 
       
    31 #[derive(BytesCast)]
       
    32 #[repr(C)]
       
    33 struct Header {
       
    34     marker: [u8; V2_FORMAT_MARKER.len()],
       
    35 
       
    36     /// `dirstatemap.parents()` in `mercurial/dirstate.py` relies on this
       
    37     /// `parents` field being at this offset, immediately after `marker`.
       
    38     parents: DirstateParents,
       
    39 
       
    40     root: ChildNodes,
       
    41     nodes_with_entry_count: Size,
       
    42     nodes_with_copy_source_count: Size,
       
    43 }
       
    44 
       
    45 #[derive(BytesCast)]
       
    46 #[repr(C)]
       
    47 struct Node {
       
    48     full_path: PathSlice,
       
    49 
       
    50     /// In bytes from `self.full_path.start`
       
    51     base_name_start: Size,
       
    52 
       
    53     copy_source: OptPathSlice,
       
    54     entry: OptEntry,
       
    55     children: ChildNodes,
       
    56     tracked_descendants_count: Size,
       
    57 }
       
    58 
       
    59 /// Either nothing if `state == b'\0'`, or a dirstate entry like in the v1
       
    60 /// format
       
    61 #[derive(BytesCast)]
       
    62 #[repr(C)]
       
    63 struct OptEntry {
       
    64     state: u8,
       
    65     mode: I32Be,
       
    66     mtime: I32Be,
       
    67     size: I32Be,
       
    68 }
       
    69 
       
    70 /// Counted in bytes from the start of the file
       
    71 ///
       
    72 /// NOTE: If we decide to never support `.hg/dirstate` files larger than 4 GiB
       
    73 /// we could save space by using `U32Be` instead.
       
    74 type Offset = U64Be;
       
    75 
       
    76 /// Counted in number of items
       
    77 ///
       
    78 /// NOTE: not supporting directories with more than 4 billion direct children,
       
    79 /// or filenames more than 4 GiB.
       
    80 type Size = U32Be;
       
    81 
       
    82 /// Location of consecutive, fixed-size items.
       
    83 ///
       
    84 /// An item can be a single byte for paths, or a struct with
       
    85 /// `derive(BytesCast)`.
       
    86 #[derive(BytesCast, Copy, Clone)]
       
    87 #[repr(C)]
       
    88 struct Slice {
       
    89     start: Offset,
       
    90     len: Size,
       
    91 }
       
    92 
       
    93 /// A contiguous sequence of `len` times `Node`, representing the child nodes
       
    94 /// of either some other node or of the repository root.
       
    95 ///
       
    96 /// Always sorted by ascending `full_path`, to allow binary search.
       
    97 /// Since nodes with the same parent nodes also have the same parent path,
       
    98 /// only the `base_name`s need to be compared during binary search.
       
    99 type ChildNodes = Slice;
       
   100 
       
   101 /// A `HgPath` of `len` bytes
       
   102 type PathSlice = Slice;
       
   103 
       
   104 /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
       
   105 type OptPathSlice = Slice;
       
   106 
       
   107 /// Make sure that size-affecting changes are made knowingly
       
   108 fn _static_assert_size_of() {
       
   109     let _ = std::mem::transmute::<Header, [u8; 72]>;
       
   110     let _ = std::mem::transmute::<Node, [u8; 57]>;
       
   111 }
       
   112 
       
   113 pub(super) fn read<'on_disk>(
       
   114     on_disk: &'on_disk [u8],
       
   115 ) -> Result<(DirstateMap<'on_disk>, Option<DirstateParents>), DirstateError> {
       
   116     if on_disk.is_empty() {
       
   117         return Ok((DirstateMap::empty(on_disk), None));
       
   118     }
       
   119     let (header, _) = Header::from_bytes(on_disk)
       
   120         .map_err(|_| HgError::corrupted("truncated dirstate-v2"))?;
       
   121     let Header {
       
   122         marker,
       
   123         parents,
       
   124         root,
       
   125         nodes_with_entry_count,
       
   126         nodes_with_copy_source_count,
       
   127     } = header;
       
   128     if marker != V2_FORMAT_MARKER {
       
   129         return Err(HgError::corrupted("missing dirstated-v2 marker").into());
       
   130     }
       
   131     let dirstate_map = DirstateMap {
       
   132         on_disk,
       
   133         root: read_nodes(on_disk, *root)?,
       
   134         nodes_with_entry_count: nodes_with_entry_count.get(),
       
   135         nodes_with_copy_source_count: nodes_with_copy_source_count.get(),
       
   136     };
       
   137     let parents = Some(parents.clone());
       
   138     Ok((dirstate_map, parents))
       
   139 }
       
   140 
       
   141 impl Node {
       
   142     pub(super) fn path<'on_disk>(
       
   143         &self,
       
   144         on_disk: &'on_disk [u8],
       
   145     ) -> Result<dirstate_map::NodeKey<'on_disk>, HgError> {
       
   146         let full_path = read_hg_path(on_disk, self.full_path)?;
       
   147         let base_name_start = usize::try_from(self.base_name_start.get())
       
   148             // u32 -> usize, could only panic on a 16-bit CPU
       
   149             .expect("dirstate-v2 base_name_start out of bounds");
       
   150         if base_name_start < full_path.len() {
       
   151             Ok(WithBasename::from_raw_parts(full_path, base_name_start))
       
   152         } else {
       
   153             Err(HgError::corrupted(
       
   154                 "dirstate-v2 base_name_start out of bounds",
       
   155             ))
       
   156         }
       
   157     }
       
   158 
       
   159     pub(super) fn copy_source<'on_disk>(
       
   160         &self,
       
   161         on_disk: &'on_disk [u8],
       
   162     ) -> Result<Option<Cow<'on_disk, HgPath>>, HgError> {
       
   163         Ok(if self.copy_source.start.get() != 0 {
       
   164             Some(read_hg_path(on_disk, self.copy_source)?)
       
   165         } else {
       
   166             None
       
   167         })
       
   168     }
       
   169 
       
   170     pub(super) fn entry(&self) -> Result<Option<DirstateEntry>, HgError> {
       
   171         Ok(if self.entry.state != b'\0' {
       
   172             Some(DirstateEntry {
       
   173                 state: self.entry.state.try_into()?,
       
   174                 mode: self.entry.mode.get(),
       
   175                 mtime: self.entry.mtime.get(),
       
   176                 size: self.entry.size.get(),
       
   177             })
       
   178         } else {
       
   179             None
       
   180         })
       
   181     }
       
   182 
       
   183     pub(super) fn to_in_memory_node<'on_disk>(
       
   184         &self,
       
   185         on_disk: &'on_disk [u8],
       
   186     ) -> Result<dirstate_map::Node<'on_disk>, HgError> {
       
   187         Ok(dirstate_map::Node {
       
   188             children: read_nodes(on_disk, self.children)?,
       
   189             copy_source: self.copy_source(on_disk)?,
       
   190             entry: self.entry()?,
       
   191             tracked_descendants_count: self.tracked_descendants_count.get(),
       
   192         })
       
   193     }
       
   194 }
       
   195 
       
   196 fn read_nodes(
       
   197     on_disk: &[u8],
       
   198     slice: ChildNodes,
       
   199 ) -> Result<dirstate_map::ChildNodes, HgError> {
       
   200     read_slice::<Node>(on_disk, slice)?
       
   201         .iter()
       
   202         .map(|node| {
       
   203             Ok((node.path(on_disk)?, node.to_in_memory_node(on_disk)?))
       
   204         })
       
   205         .collect()
       
   206 }
       
   207 
       
   208 fn read_hg_path(on_disk: &[u8], slice: Slice) -> Result<Cow<HgPath>, HgError> {
       
   209     let bytes = read_slice::<u8>(on_disk, slice)?;
       
   210     Ok(Cow::Borrowed(HgPath::new(bytes)))
       
   211 }
       
   212 
       
   213 fn read_slice<T>(on_disk: &[u8], slice: Slice) -> Result<&[T], HgError>
       
   214 where
       
   215     T: BytesCast,
       
   216 {
       
   217     // Either `usize::MAX` would result in "out of bounds" error since a single
       
   218     // `&[u8]` cannot occupy the entire addess space.
       
   219     let start = usize::try_from(slice.start.get()).unwrap_or(std::usize::MAX);
       
   220     let len = usize::try_from(slice.len.get()).unwrap_or(std::usize::MAX);
       
   221     on_disk
       
   222         .get(start..)
       
   223         .and_then(|bytes| T::slice_from_bytes(bytes, len).ok())
       
   224         .map(|(slice, _rest)| slice)
       
   225         .ok_or_else(|| {
       
   226             HgError::corrupted("dirstate v2 slice is out of bounds")
       
   227         })
       
   228 }
       
   229 
       
   230 pub(super) fn write(
       
   231     dirstate_map: &mut DirstateMap,
       
   232     parents: DirstateParents,
       
   233     now: Timestamp,
       
   234 ) -> Result<Vec<u8>, DirstateError> {
       
   235     // TODO: how do we want to handle this in 2038?
       
   236     let now: i32 = now.0.try_into().expect("time overflow");
       
   237 
       
   238     let header_len = std::mem::size_of::<Header>();
       
   239 
       
   240     // This ignores the space for paths, and for nodes without an entry.
       
   241     // TODO: better estimate? Skip the `Vec` and write to a file directly?
       
   242     let size_guess = header_len
       
   243         + std::mem::size_of::<Node>()
       
   244             * dirstate_map.nodes_with_entry_count as usize;
       
   245     let mut out = Vec::with_capacity(size_guess);
       
   246 
       
   247     // Keep space for the header. We’ll fill it out at the end when we know the
       
   248     // actual offset for the root nodes.
       
   249     out.resize(header_len, 0_u8);
       
   250 
       
   251     let root = write_nodes(&mut dirstate_map.root, now, &mut out)?;
       
   252 
       
   253     let header = Header {
       
   254         marker: *V2_FORMAT_MARKER,
       
   255         parents: parents,
       
   256         root,
       
   257         nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
       
   258         nodes_with_copy_source_count: dirstate_map
       
   259             .nodes_with_copy_source_count
       
   260             .into(),
       
   261     };
       
   262     out[..header_len].copy_from_slice(header.as_bytes());
       
   263     Ok(out)
       
   264 }
       
   265 
       
   266 /// Serialize the dirstate to the `v2` format after clearing ambigous `mtime`s.
       
   267 fn write_nodes(
       
   268     nodes: &mut dirstate_map::ChildNodes,
       
   269     now: i32,
       
   270     out: &mut Vec<u8>,
       
   271 ) -> Result<ChildNodes, DirstateError> {
       
   272     // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
       
   273     // order. Sort to enable binary search in the written file.
       
   274     let nodes = dirstate_map::Node::sorted(nodes);
       
   275 
       
   276     // First accumulate serialized nodes in a `Vec`
       
   277     let mut on_disk_nodes = Vec::with_capacity(nodes.len());
       
   278     for (full_path, node) in nodes {
       
   279         on_disk_nodes.push(Node {
       
   280             children: write_nodes(&mut node.children, now, out)?,
       
   281             tracked_descendants_count: node.tracked_descendants_count.into(),
       
   282             full_path: write_slice::<u8>(
       
   283                 full_path.full_path().as_bytes(),
       
   284                 out,
       
   285             ),
       
   286             base_name_start: u32::try_from(full_path.base_name_start())
       
   287                 // Could only panic for paths over 4 GiB
       
   288                 .expect("dirstate-v2 offset overflow")
       
   289                 .into(),
       
   290             copy_source: if let Some(source) = &node.copy_source {
       
   291                 write_slice::<u8>(source.as_bytes(), out)
       
   292             } else {
       
   293                 Slice {
       
   294                     start: 0.into(),
       
   295                     len: 0.into(),
       
   296                 }
       
   297             },
       
   298             entry: if let Some(entry) = &mut node.entry {
       
   299                 clear_ambiguous_mtime(entry, now);
       
   300                 OptEntry {
       
   301                     state: entry.state.into(),
       
   302                     mode: entry.mode.into(),
       
   303                     mtime: entry.mtime.into(),
       
   304                     size: entry.size.into(),
       
   305                 }
       
   306             } else {
       
   307                 OptEntry {
       
   308                     state: b'\0',
       
   309                     mode: 0.into(),
       
   310                     mtime: 0.into(),
       
   311                     size: 0.into(),
       
   312                 }
       
   313             },
       
   314         })
       
   315     }
       
   316     // … so we can write them contiguously
       
   317     Ok(write_slice::<Node>(&on_disk_nodes, out))
       
   318 }
       
   319 
       
   320 fn write_slice<T>(slice: &[T], out: &mut Vec<u8>) -> Slice
       
   321 where
       
   322     T: BytesCast,
       
   323 {
       
   324     let start = u64::try_from(out.len())
       
   325         // Could only panic on a 128-bit CPU with a dirstate over 16 EiB
       
   326         .expect("dirstate-v2 offset overflow")
       
   327         .into();
       
   328     let len = u32::try_from(slice.len())
       
   329         // Could only panic for paths over 4 GiB or nodes with over 4 billions
       
   330         // child nodes
       
   331         .expect("dirstate-v2 offset overflow")
       
   332         .into();
       
   333     out.extend(slice.as_bytes());
       
   334     Slice { start, len }
       
   335 }