rust-index: implementation of __getitem__
authorRaphaël Gomès <rgomes@octobus.net>
Thu, 02 Nov 2023 11:19:54 +0100
changeset 51205 002b49905aac
parent 51204 297fa956b6c4
child 51206 952e3cd7568f
rust-index: implementation of __getitem__ Although the removed panic tends to prove if the full test suite did pass that the case when the input is a node id does not happen, it is best not to remove it right now. Raising IndexError is crucial for iteration on the index to stop, given the default CPython sequence iterator, see for instance https://github.com/zpoint/CPython-Internals/blobs/master/BasicObject/iter/iter.md This was spotted by `test-rust-ancestors.py`, which does simple interations on indexes (as preflight checks). In `revlog.c`, `index_getitem` defaults to `index_get` when called on revision numbers, which does raise `IndexError` with the same message as the one we are introducing here.
rust/hg-core/src/revlog/index.rs
rust/hg-cpython/src/revlog.rs
--- a/rust/hg-core/src/revlog/index.rs	Wed Sep 27 11:34:52 2023 +0200
+++ b/rust/hg-core/src/revlog/index.rs	Thu Nov 02 11:19:54 2023 +0100
@@ -123,6 +123,10 @@
         }
         Ok(())
     }
+
+    fn is_new(&self) -> bool {
+        self.bytes.is_empty()
+    }
 }
 
 impl std::ops::Index<std::ops::Range<usize>> for IndexData {
@@ -146,6 +150,7 @@
     }
 }
 
+#[derive(Debug, PartialEq, Eq)]
 pub struct RevisionDataParams {
     pub flags: u16,
     pub data_offset: u64,
@@ -163,6 +168,27 @@
     pub _rank: i32,
 }
 
+impl Default for RevisionDataParams {
+    fn default() -> Self {
+        Self {
+            flags: 0,
+            data_offset: 0,
+            data_compressed_length: 0,
+            data_uncompressed_length: 0,
+            data_delta_base: -1,
+            link_rev: -1,
+            parent_rev_1: -1,
+            parent_rev_2: -1,
+            node_id: [0; NODE_BYTES_LENGTH],
+            _sidedata_offset: 0,
+            _sidedata_compressed_length: 0,
+            data_compression_mode: COMPRESSION_MODE_INLINE,
+            _sidedata_compression_mode: COMPRESSION_MODE_INLINE,
+            _rank: -1,
+        }
+    }
+}
+
 #[derive(BytesCast)]
 #[repr(C)]
 pub struct RevisionDataV1 {
@@ -397,6 +423,29 @@
         })
     }
 
+    pub fn entry_as_params(
+        &self,
+        rev: UncheckedRevision,
+    ) -> Option<RevisionDataParams> {
+        let rev = self.check_revision(rev)?;
+        self.get_entry(rev).map(|e| RevisionDataParams {
+            flags: e.flags(),
+            data_offset: if rev.0 == 0 && !self.bytes.is_new() {
+                e.flags() as u64
+            } else {
+                e.raw_offset()
+            },
+            data_compressed_length: e.compressed_len().try_into().unwrap(),
+            data_uncompressed_length: e.uncompressed_len(),
+            data_delta_base: e.base_revision_or_base_of_delta_chain().0,
+            link_rev: e.link_revision().0,
+            parent_rev_1: e.p1().0,
+            parent_rev_2: e.p2().0,
+            node_id: e.hash().as_bytes().try_into().unwrap(),
+            ..Default::default()
+        })
+    }
+
     fn get_entry_inline(
         &self,
         rev: Revision,
@@ -519,6 +568,9 @@
             BigEndian::read_u64(&bytes[..]) as usize
         }
     }
+    pub fn raw_offset(&self) -> u64 {
+        BigEndian::read_u64(&self.bytes[0..8])
+    }
 
     pub fn flags(&self) -> u16 {
         BigEndian::read_u16(&self.bytes[6..=7])
--- a/rust/hg-cpython/src/revlog.rs	Wed Sep 27 11:34:52 2023 +0200
+++ b/rust/hg-cpython/src/revlog.rs	Thu Nov 02 11:19:54 2023 +0100
@@ -17,11 +17,10 @@
     PyObject, PyResult, PyString, PyTuple, Python, PythonObject, ToPyObject,
 };
 use hg::{
-    index::IndexHeader,
-    index::{RevisionDataParams, COMPRESSION_MODE_INLINE},
+    index::{IndexHeader, RevisionDataParams},
     nodemap::{Block, NodeMapError, NodeTree},
     revlog::{nodemap::NodeMap, NodePrefix, RevlogIndex},
-    BaseRevision, Revision, UncheckedRevision,
+    BaseRevision, Revision, UncheckedRevision, NULL_REVISION,
 };
 use std::cell::RefCell;
 
@@ -237,11 +236,6 @@
         Ok(rust_res)
     }
 
-    /// get an index entry
-    def get(&self, *args, **kw) -> PyResult<PyObject> {
-        self.call_cindex(py, "get", args, kw)
-    }
-
     /// compute phases
     def computephasesmapsets(&self, *args, **kw) -> PyResult<PyObject> {
         self.call_cindex(py, "computephasesmapsets", args, kw)
@@ -292,23 +286,32 @@
     // Since we call back through the high level Python API,
     // there's no point making a distinction between index_get
     // and index_getitem.
+    // gracinet 2023: this above is no longer true for the pure Rust impl
 
     def __len__(&self) -> PyResult<usize> {
         self.len(py)
     }
 
     def __getitem__(&self, key: PyObject) -> PyResult<PyObject> {
+        let rust_res = self.inner_getitem(py, key.clone_ref(py))?;
+
         // this conversion seems needless, but that's actually because
         // `index_getitem` does not handle conversion from PyLong,
         // which expressions such as [e for e in index] internally use.
         // Note that we don't seem to have a direct way to call
         // PySequence_GetItem (does the job), which would possibly be better
         // for performance
-        let key = match key.extract::<i32>(py) {
+        // gracinet 2023: the above comment can be removed when we use
+        // the pure Rust impl only. Note also that `key` can be a binary
+        // node id.
+        let c_key = match key.extract::<BaseRevision>(py) {
             Ok(rev) => rev.to_py_object(py).into_object(),
             Err(_) => key,
         };
-        self.cindex(py).borrow().inner().get_item(py, key)
+        let c_res = self.cindex(py).borrow().inner().get_item(py, c_key)?;
+
+        assert_py_eq(py, "__getitem__", &rust_res, &c_res)?;
+        Ok(rust_res)
     }
 
     def __contains__(&self, item: PyObject) -> PyResult<bool> {
@@ -426,13 +429,49 @@
         parent_rev_1: tuple.get_item(py, 5).extract(py)?,
         parent_rev_2: tuple.get_item(py, 6).extract(py)?,
         node_id,
-        _sidedata_offset: 0,
-        _sidedata_compressed_length: 0,
-        data_compression_mode: COMPRESSION_MODE_INLINE,
-        _sidedata_compression_mode: COMPRESSION_MODE_INLINE,
-        _rank: -1,
+        ..Default::default()
     })
 }
+fn revision_data_params_to_py_tuple(
+    py: Python,
+    params: RevisionDataParams,
+) -> PyTuple {
+    PyTuple::new(
+        py,
+        &[
+            params.data_offset.into_py_object(py).into_object(),
+            params
+                .data_compressed_length
+                .into_py_object(py)
+                .into_object(),
+            params
+                .data_uncompressed_length
+                .into_py_object(py)
+                .into_object(),
+            params.data_delta_base.into_py_object(py).into_object(),
+            params.link_rev.into_py_object(py).into_object(),
+            params.parent_rev_1.into_py_object(py).into_object(),
+            params.parent_rev_2.into_py_object(py).into_object(),
+            PyBytes::new(py, &params.node_id)
+                .into_py_object(py)
+                .into_object(),
+            params._sidedata_offset.into_py_object(py).into_object(),
+            params
+                ._sidedata_compressed_length
+                .into_py_object(py)
+                .into_object(),
+            params
+                .data_compression_mode
+                .into_py_object(py)
+                .into_object(),
+            params
+                ._sidedata_compression_mode
+                .into_py_object(py)
+                .into_object(),
+            params._rank.into_py_object(py).into_object(),
+        ],
+    )
+}
 
 impl MixedIndex {
     fn new(
@@ -601,6 +640,34 @@
 
         Ok(py.None())
     }
+
+    fn inner_getitem(&self, py: Python, key: PyObject) -> PyResult<PyObject> {
+        let idx = self.index(py).borrow();
+        Ok(match key.extract::<BaseRevision>(py) {
+            Ok(key_as_int) => {
+                let entry_params = if key_as_int == NULL_REVISION.0 {
+                    RevisionDataParams::default()
+                } else {
+                    let rev = UncheckedRevision(key_as_int);
+                    match idx.entry_as_params(rev) {
+                        Some(e) => e,
+                        None => {
+                            return Err(PyErr::new::<IndexError, _>(
+                                py,
+                                "revlog index out of range",
+                            ));
+                        }
+                    }
+                };
+                revision_data_params_to_py_tuple(py, entry_params)
+                    .into_object()
+            }
+            _ => self.get_rev(py, key.extract::<PyBytes>(py)?)?.map_or_else(
+                || py.None(),
+                |py_rev| py_rev.into_py_object(py).into_object(),
+            ),
+        })
+    }
 }
 
 fn revlog_error(py: Python) -> PyErr {