rust: create wrapper struct to reduce `regex` contention issues stable 6.3
authorRaphaël Gomès <rgomes@octobus.net>
Wed, 09 Nov 2022 23:28:01 -0500
branchstable
changeset 49581 04f1dba53c96
parent 49580 08fe5c4d4471
child 49582 b8f84d2c698a
rust: create wrapper struct to reduce `regex` contention issues Explanations inline.
rust/Cargo.lock
rust/hg-core/Cargo.toml
rust/hg-core/src/matchers.rs
--- a/rust/Cargo.lock	Sat Nov 12 02:38:53 2022 +0100
+++ b/rust/Cargo.lock	Wed Nov 09 23:28:01 2022 -0500
@@ -479,6 +479,7 @@
  "same-file",
  "sha-1 0.10.0",
  "tempfile",
+ "thread_local",
  "twox-hash",
  "zstd",
 ]
@@ -1120,6 +1121,15 @@
 ]
 
 [[package]]
+name = "thread_local"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
 name = "time"
 version = "0.1.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
--- a/rust/hg-core/Cargo.toml	Sat Nov 12 02:38:53 2022 +0100
+++ b/rust/hg-core/Cargo.toml	Wed Nov 09 23:28:01 2022 -0500
@@ -29,13 +29,14 @@
 twox-hash = "1.6.2"
 same-file = "1.0.6"
 tempfile = "3.1.0"
+thread_local = "1.1.4"
 crossbeam-channel = "0.5.0"
 micro-timer = "0.4.0"
 log = "0.4.8"
 memmap2 = { version = "0.5.3", features = ["stable_deref_trait"] }
 zstd = "0.5.3"
 format-bytes = "0.3.0"
-# once_cell 1.15 uses edition 2021, while the heptapod CI 
+# once_cell 1.15 uses edition 2021, while the heptapod CI
 # uses an old version of Cargo that doesn't support it.
 once_cell = "=1.14.0"
 
--- a/rust/hg-core/src/matchers.rs	Sat Nov 12 02:38:53 2022 +0100
+++ b/rust/hg-core/src/matchers.rs	Wed Nov 09 23:28:01 2022 -0500
@@ -573,6 +573,39 @@
     }
 }
 
+/// Wraps [`regex::bytes::Regex`] to improve performance in multithreaded
+/// contexts.
+///
+/// The `status` algorithm makes heavy use of threads, and calling `is_match`
+/// from many threads at once is prone to contention, probably within the
+/// scratch space needed as the regex DFA is built lazily.
+///
+/// We are in the process of raising the issue upstream, but for now
+/// the workaround used here is to store the `Regex` in a lazily populated
+/// thread-local variable, sharing the initial read-only compilation, but
+/// not the lazy dfa scratch space mentioned above.
+///
+/// This reduces the contention observed with 16+ threads, but does not
+/// completely remove it. Hopefully this can be addressed upstream.
+struct RegexMatcher {
+    /// Compiled at the start of the status algorithm, used as a base for
+    /// cloning in each thread-local `self.local`, thus sharing the expensive
+    /// first compilation.
+    base: regex::bytes::Regex,
+    /// Thread-local variable that holds the `Regex` that is actually queried
+    /// from each thread.
+    local: thread_local::ThreadLocal<regex::bytes::Regex>,
+}
+
+impl RegexMatcher {
+    /// Returns whether the path matches the stored `Regex`.
+    pub fn is_match(&self, path: &HgPath) -> bool {
+        self.local
+            .get_or(|| self.base.clone())
+            .is_match(path.as_bytes())
+    }
+}
+
 /// Returns a function that matches an `HgPath` against the given regex
 /// pattern.
 ///
@@ -580,9 +613,7 @@
 /// underlying engine (the `regex` crate), for instance anything with
 /// back-references.
 #[timed]
-fn re_matcher(
-    pattern: &[u8],
-) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
+fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> {
     use std::io::Write;
 
     // The `regex` crate adds `.*` to the start and end of expressions if there
@@ -611,7 +642,10 @@
         .build()
         .map_err(|e| PatternError::UnsupportedSyntax(e.to_string()))?;
 
-    Ok(move |path: &HgPath| re.is_match(path.as_bytes()))
+    Ok(RegexMatcher {
+        base: re,
+        local: Default::default(),
+    })
 }
 
 /// Returns the regex pattern and a function that matches an `HgPath` against
@@ -638,7 +672,7 @@
     let func = if !(regexps.is_empty()) {
         let matcher = re_matcher(&full_regex)?;
         let func = move |filename: &HgPath| {
-            exact_set.contains(filename) || matcher(filename)
+            exact_set.contains(filename) || matcher.is_match(filename)
         };
         Box::new(func) as IgnoreFnType
     } else {