hg-core: implement timestamp line parsing
authorArun Kulshreshtha <akulshreshtha@janestreet.com>
Thu, 15 Feb 2024 11:39:18 -0500
changeset 51391 a96ed440450e
parent 51390 92c7765931e0
child 51393 6603a1448f18
hg-core: implement timestamp line parsing
rust/Cargo.lock
rust/hg-core/Cargo.toml
rust/hg-core/src/revlog/changelog.rs
--- a/rust/Cargo.lock	Wed Feb 14 15:21:44 2024 -0500
+++ b/rust/Cargo.lock	Thu Feb 15 11:39:18 2024 -0500
@@ -29,6 +29,12 @@
 ]
 
 [[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
+[[package]]
 name = "android_system_properties"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -148,17 +154,16 @@
 
 [[package]]
 name = "chrono"
-version = "0.4.23"
+version = "0.4.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
+checksum = "5bc015644b92d5890fab7489e49d21f879d5c990186827d42ec511919404f38b"
 dependencies = [
+ "android-tzdata",
  "iana-time-zone",
  "js-sys",
- "num-integer",
  "num-traits",
- "time",
  "wasm-bindgen",
- "winapi",
+ "windows-targets",
 ]
 
 [[package]]
@@ -537,6 +542,7 @@
  "bitvec",
  "byteorder",
  "bytes-cast",
+ "chrono",
  "clap",
  "crossbeam-channel",
  "derive_more",
@@ -795,16 +801,6 @@
 ]
 
 [[package]]
-name = "num-integer"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
-dependencies = [
- "autocfg",
- "num-traits",
-]
-
-[[package]]
 name = "num-traits"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1289,17 +1285,6 @@
 ]
 
 [[package]]
-name = "time"
-version = "0.1.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
-dependencies = [
- "libc",
- "wasi 0.10.0+wasi-snapshot-preview1",
- "winapi",
-]
-
-[[package]]
 name = "toml"
 version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1393,12 +1378,6 @@
 
 [[package]]
 name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
-
-[[package]]
-name = "wasi"
 version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
@@ -1520,6 +1499,63 @@
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
 [[package]]
+name = "windows-targets"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
+
+[[package]]
 name = "wyz"
 version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
--- a/rust/hg-core/Cargo.toml	Wed Feb 14 15:21:44 2024 -0500
+++ b/rust/hg-core/Cargo.toml	Thu Feb 15 11:39:18 2024 -0500
@@ -40,6 +40,7 @@
 format-bytes = "0.3.0"
 once_cell = "1.16.0"
 bitvec = "1.0.1"
+chrono = "0.4.34"
 
 # We don't use the `miniz-oxide` backend to not change rhg benchmarks and until
 # we have a clearer view of which backend is the fastest.
--- a/rust/hg-core/src/revlog/changelog.rs	Wed Feb 14 15:21:44 2024 -0500
+++ b/rust/hg-core/src/revlog/changelog.rs	Thu Feb 15 11:39:18 2024 -0500
@@ -1,3 +1,12 @@
+use std::ascii::escape_default;
+use std::borrow::Cow;
+use std::collections::BTreeMap;
+use std::fmt::{Debug, Formatter};
+use std::{iter, str};
+
+use chrono::{DateTime, FixedOffset, NaiveDateTime};
+use itertools::{Either, Itertools};
+
 use crate::errors::HgError;
 use crate::revlog::Revision;
 use crate::revlog::{Node, NodePrefix};
@@ -5,11 +14,6 @@
 use crate::utils::hg_path::HgPath;
 use crate::vfs::Vfs;
 use crate::{Graph, GraphError, RevlogOpenOptions, UncheckedRevision};
-use itertools::{Either, Itertools};
-use std::ascii::escape_default;
-use std::borrow::Cow;
-use std::fmt::{Debug, Formatter};
-use std::iter;
 
 /// A specialized `Revlog` to work with changelog data format.
 pub struct Changelog {
@@ -229,6 +233,11 @@
         &self.bytes[self.user_end + 1..self.timestamp_end]
     }
 
+    /// Parsed timestamp line, including optional extras.
+    pub fn parsed_timestamp(&self) -> Result<TimestampAndExtra, HgError> {
+        TimestampAndExtra::from_bytes(self.timestamp_line())
+    }
+
     /// The files changed in this revision.
     pub fn files(&self) -> impl Iterator<Item = &HgPath> {
         if self.timestamp_end == self.files_end {
@@ -286,6 +295,203 @@
     .to_string()
 }
 
+/// Parsed timestamp line, including the timestamp and optional extras.
+#[derive(Clone, Debug)]
+pub struct TimestampAndExtra {
+    pub timestamp: DateTime<FixedOffset>,
+    pub extra: BTreeMap<String, Vec<u8>>,
+}
+
+impl TimestampAndExtra {
+    /// Parse the raw bytes of the timestamp line from a changelog entry.
+    ///
+    /// According to the documentation in `hg help dates` and the
+    /// implementation in `changelog.py`, the format of the timestamp line
+    /// is `time tz extra\n` where:
+    ///
+    /// - `time` is an ASCII-encoded signed int or float denoting a UTC
+    ///   timestamp as seconds since the UNIX epoch.
+    ///
+    /// - `tz` is the timezone offset as an ASCII-encoded signed integer
+    ///   denoting seconds WEST of UTC (so negative for timezones east of UTC,
+    ///   which is the opposite of the sign in ISO 8601 timestamps).
+    ///
+    /// - `extra` is an optional set of NUL-delimited key-value pairs, with the
+    ///   key and value in each pair separated by an ASCII colon. Keys are
+    ///   limited to ASCII letters, digits, hyphens, and underscores, whereas
+    ///   values can be arbitrary bytes.
+    fn from_bytes(line: &[u8]) -> Result<Self, HgError> {
+        let mut parts = line.splitn(3, |c| *c == b' ');
+
+        let timestamp_bytes = parts
+            .next()
+            .ok_or_else(|| HgError::corrupted("missing timestamp"))?;
+        let timestamp_str = str::from_utf8(timestamp_bytes).map_err(|e| {
+            HgError::corrupted(format!("timestamp is not valid UTF-8: {e}"))
+        })?;
+        let timestamp_utc = timestamp_str
+            .parse()
+            .map_err(|e| {
+                HgError::corrupted(format!("failed to parse timestamp: {e}"))
+            })
+            .and_then(|secs| {
+                NaiveDateTime::from_timestamp_opt(secs, 0).ok_or_else(|| {
+                    HgError::corrupted(format!(
+                        "integer timestamp out of valid range: {secs}"
+                    ))
+                })
+            })
+            // Attempt to parse the timestamp as a float if we can't parse
+            // it as an int. It doesn't seem like float timestamps are actually
+            // used in practice, but the Python code supports them.
+            .or_else(|_| parse_float_timestamp(timestamp_str))?;
+
+        let timezone_bytes = parts
+            .next()
+            .ok_or_else(|| HgError::corrupted("missing timezone"))?;
+        let timezone_secs: i32 = str::from_utf8(timezone_bytes)
+            .map_err(|e| {
+                HgError::corrupted(format!("timezone is not valid UTF-8: {e}"))
+            })?
+            .parse()
+            .map_err(|e| {
+                HgError::corrupted(format!("timezone is not an integer: {e}"))
+            })?;
+        let timezone =
+            FixedOffset::west_opt(timezone_secs).ok_or_else(|| {
+                HgError::corrupted("timezone offset out of bounds")
+            })?;
+
+        let timestamp =
+            DateTime::from_naive_utc_and_offset(timestamp_utc, timezone);
+        let extra = parts
+            .next()
+            .map(parse_extra)
+            .transpose()?
+            .unwrap_or_default();
+
+        Ok(Self { timestamp, extra })
+    }
+}
+
+/// Attempt to parse the given string as floating-point timestamp, and
+/// convert the result into a `chrono::NaiveDateTime`.
+fn parse_float_timestamp(
+    timestamp_str: &str,
+) -> Result<NaiveDateTime, HgError> {
+    let timestamp = timestamp_str.parse::<f64>().map_err(|e| {
+        HgError::corrupted(format!("failed to parse timestamp: {e}"))
+    })?;
+
+    // To construct a `NaiveDateTime` we'll need to convert the float
+    // into signed integer seconds and unsigned integer nanoseconds.
+    let mut secs = timestamp.trunc() as i64;
+    let mut subsecs = timestamp.fract();
+
+    // If the timestamp is negative, we need to express the fractional
+    // component as positive nanoseconds since the previous second.
+    if timestamp < 0.0 {
+        secs -= 1;
+        subsecs += 1.0;
+    }
+
+    // This cast should be safe because the fractional component is
+    // by definition less than 1.0, so this value should not exceed
+    // 1 billion, which is representable as an f64 without loss of
+    // precision and should fit into a u32 without overflowing.
+    //
+    // (Any loss of precision in the fractional component will have
+    // already happened at the time of initial parsing; in general,
+    // f64s are insufficiently precise to provide nanosecond-level
+    // precision with present-day timestamps.)
+    let nsecs = (subsecs * 1_000_000_000.0) as u32;
+
+    NaiveDateTime::from_timestamp_opt(secs, nsecs).ok_or_else(|| {
+        HgError::corrupted(format!(
+            "float timestamp out of valid range: {timestamp}"
+        ))
+    })
+}
+
+/// Parse the "extra" fields from a changeset's timestamp line.
+///
+/// Extras are null-delimited key-value pairs where the key consists of ASCII
+/// alphanumeric characters plus hyphens and underscores, and the value can
+/// contain arbitrary bytes.
+fn parse_extra(extra: &[u8]) -> Result<BTreeMap<String, Vec<u8>>, HgError> {
+    extra
+        .split(|c| *c == b'\0')
+        .map(|pair| {
+            let pair = unescape_extra(pair);
+            let mut iter = pair.splitn(2, |c| *c == b':');
+
+            let key_bytes =
+                iter.next().filter(|k| !k.is_empty()).ok_or_else(|| {
+                    HgError::corrupted("empty key in changeset extras")
+                })?;
+
+            let key = str::from_utf8(key_bytes)
+                .ok()
+                .filter(|k| {
+                    k.chars().all(|c| {
+                        c.is_ascii_alphanumeric() || c == '_' || c == '-'
+                    })
+                })
+                .ok_or_else(|| {
+                    let key = String::from_utf8_lossy(key_bytes);
+                    HgError::corrupted(format!(
+                        "invalid key in changeset extras: {key}",
+                    ))
+                })?
+                .to_string();
+
+            let value = iter.next().map(Into::into).ok_or_else(|| {
+                HgError::corrupted(format!(
+                    "missing value for changeset extra: {key}"
+                ))
+            })?;
+
+            Ok((key, value))
+        })
+        .collect()
+}
+
+/// Decode Mercurial's escaping for changelog extras.
+///
+/// The `_string_escape` function in `changelog.py` only escapes 4 characters
+/// (null, backslash, newline, and carriage return) so we only decode those.
+///
+/// The Python code also includes a workaround for decoding escaped nuls
+/// that are followed by an ASCII octal digit, since Python's built-in
+/// `string_escape` codec will interpret that as an escaped octal byte value.
+/// That workaround is omitted here since we don't support decoding octal.
+fn unescape_extra(bytes: &[u8]) -> Vec<u8> {
+    let mut output = Vec::with_capacity(bytes.len());
+    let mut input = bytes.iter().copied();
+
+    while let Some(c) = input.next() {
+        if c != b'\\' {
+            output.push(c);
+            continue;
+        }
+
+        match input.next() {
+            Some(b'0') => output.push(b'\0'),
+            Some(b'\\') => output.push(b'\\'),
+            Some(b'n') => output.push(b'\n'),
+            Some(b'r') => output.push(b'\r'),
+            // The following cases should never occur in theory because any
+            // backslashes in the original input should have been escaped
+            // with another backslash, so it should not be possible to
+            // observe an escape sequence other than the 4 above.
+            Some(c) => output.extend_from_slice(&[b'\\', c]),
+            None => output.push(b'\\'),
+        }
+    }
+
+    output
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -375,4 +581,166 @@
             .collect_vec()
             .is_empty());
     }
+
+    #[test]
+    fn test_unescape_basic() {
+        // '\0', '\\', '\n', and '\r' are correctly unescaped.
+        let expected = b"AAA\0BBB\\CCC\nDDD\rEEE";
+        let escaped = br"AAA\0BBB\\CCC\nDDD\rEEE";
+        let unescaped = unescape_extra(escaped);
+        assert_eq!(&expected[..], &unescaped[..]);
+    }
+
+    #[test]
+    fn test_unescape_unsupported_sequence() {
+        // Other escape sequences are left unaltered.
+        for c in 0u8..255 {
+            match c {
+                b'0' | b'\\' | b'n' | b'r' => continue,
+                c => {
+                    let expected = &[b'\\', c][..];
+                    let unescaped = unescape_extra(expected);
+                    assert_eq!(expected, &unescaped[..]);
+                }
+            }
+        }
+    }
+
+    #[test]
+    fn test_unescape_trailing_backslash() {
+        // Trailing backslashes are OK.
+        let expected = br"hi\";
+        let unescaped = unescape_extra(expected);
+        assert_eq!(&expected[..], &unescaped[..]);
+    }
+
+    #[test]
+    fn test_unescape_nul_followed_by_octal() {
+        // Escaped NUL chars followed by octal digits are decoded correctly.
+        let expected = b"\012";
+        let escaped = br"\012";
+        let unescaped = unescape_extra(escaped);
+        assert_eq!(&expected[..], &unescaped[..]);
+    }
+
+    #[test]
+    fn test_parse_float_timestamp() {
+        let test_cases = [
+            // Zero should map to the UNIX epoch.
+            ("0.0", "1970-01-01 00:00:00"),
+            // Negative zero should be the same as positive zero.
+            ("-0.0", "1970-01-01 00:00:00"),
+            // Values without fractional components should work like integers.
+            // (Assuming the timestamp is within the limits of f64 precision.)
+            ("1115154970.0", "2005-05-03 21:16:10"),
+            // We expect some loss of precision in the fractional component
+            // when parsing arbitrary floating-point values.
+            ("1115154970.123456789", "2005-05-03 21:16:10.123456716"),
+            // But representable f64 values should parse losslessly.
+            ("1115154970.123456716", "2005-05-03 21:16:10.123456716"),
+            // Negative fractional components are subtracted from the epoch.
+            ("-1.333", "1969-12-31 23:59:58.667"),
+        ];
+
+        for (input, expected) in test_cases {
+            let res = parse_float_timestamp(input).unwrap().to_string();
+            assert_eq!(res, expected);
+        }
+    }
+
+    fn escape_extra(bytes: &[u8]) -> Vec<u8> {
+        let mut output = Vec::with_capacity(bytes.len());
+
+        for c in bytes.iter().copied() {
+            output.extend_from_slice(match c {
+                b'\0' => &b"\\0"[..],
+                b'\\' => &b"\\\\"[..],
+                b'\n' => &b"\\n"[..],
+                b'\r' => &b"\\r"[..],
+                _ => {
+                    output.push(c);
+                    continue;
+                }
+            });
+        }
+
+        output
+    }
+
+    fn encode_extra<K, V>(pairs: impl IntoIterator<Item = (K, V)>) -> Vec<u8>
+    where
+        K: AsRef<[u8]>,
+        V: AsRef<[u8]>,
+    {
+        let extras = pairs.into_iter().map(|(k, v)| {
+            escape_extra(&[k.as_ref(), b":", v.as_ref()].concat())
+        });
+        // Use fully-qualified syntax to avoid a future naming conflict with
+        // the standard library: https://github.com/rust-lang/rust/issues/79524
+        Itertools::intersperse(extras, b"\0".to_vec()).concat()
+    }
+
+    #[test]
+    fn test_parse_extra() {
+        let extra = [
+            ("branch".into(), b"default".to_vec()),
+            ("key-with-hyphens".into(), b"value1".to_vec()),
+            ("key_with_underscores".into(), b"value2".to_vec()),
+            ("empty-value".into(), b"".to_vec()),
+            ("binary-value".into(), (0u8..=255).collect::<Vec<_>>()),
+        ]
+        .into_iter()
+        .collect::<BTreeMap<String, Vec<u8>>>();
+
+        let encoded = encode_extra(&extra);
+        let parsed = parse_extra(&encoded).unwrap();
+
+        assert_eq!(extra, parsed);
+    }
+
+    #[test]
+    fn test_corrupt_extra() {
+        let test_cases = [
+            (&b""[..], "empty input"),
+            (&b"\0"[..], "unexpected null byte"),
+            (&b":empty-key"[..], "empty key"),
+            (&b"\0leading-null:"[..], "leading null"),
+            (&b"trailing-null:\0"[..], "trailing null"),
+            (&b"missing-value"[..], "missing value"),
+            (&b"$!@# non-alphanum-key:"[..], "non-alphanumeric key"),
+            (&b"\xF0\x9F\xA6\x80 non-ascii-key:"[..], "non-ASCII key"),
+        ];
+
+        for (extra, msg) in test_cases {
+            assert!(
+                parse_extra(&extra).is_err(),
+                "corrupt extra should have failed to parse: {}",
+                msg
+            );
+        }
+    }
+
+    #[test]
+    fn test_parse_timestamp_line() {
+        let extra = [
+            ("branch".into(), b"default".to_vec()),
+            ("key-with-hyphens".into(), b"value1".to_vec()),
+            ("key_with_underscores".into(), b"value2".to_vec()),
+            ("empty-value".into(), b"".to_vec()),
+            ("binary-value".into(), (0u8..=255).collect::<Vec<_>>()),
+        ]
+        .into_iter()
+        .collect::<BTreeMap<String, Vec<u8>>>();
+
+        let mut line: Vec<u8> = b"1115154970 28800 ".to_vec();
+        line.extend_from_slice(&encode_extra(&extra));
+
+        let parsed = TimestampAndExtra::from_bytes(&line).unwrap();
+
+        assert_eq!(
+            &parsed.timestamp.to_rfc3339(),
+            "2005-05-03T13:16:10-08:00"
+        );
+        assert_eq!(extra, parsed.extra);
+    }
 }