rust-matchers: raw regular expression builder stable
authorGeorges Racinet <georges.racinet@octobus.net>
Mon, 11 Mar 2024 13:36:25 +0100
branchstable
changeset 51471 5633de951d34
parent 51470 406b413e3cf2
child 51497 9da3fcc5f70f
rust-matchers: raw regular expression builder Extracting this `re_bytes_builder()` from `re_matcher()` makes it reusable in more general cases than matching `HgPath` instances and would help reduce code duplication in RHGitaly.
rust/hg-core/src/matchers.rs
--- a/rust/hg-core/src/matchers.rs	Mon Mar 11 13:23:18 2024 +0100
+++ b/rust/hg-core/src/matchers.rs	Mon Mar 11 13:36:25 2024 +0100
@@ -737,14 +737,11 @@
     }
 }
 
-/// Returns a function that matches an `HgPath` against the given regex
-/// pattern.
+/// Return a `RegexBuilder` from a bytes pattern
 ///
-/// This can fail when the pattern is invalid or not supported by the
-/// underlying engine (the `regex` crate), for instance anything with
-/// back-references.
-#[logging_timer::time("trace")]
-fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> {
+/// This works around the fact that even if it works on byte haystacks,
+/// [`regex::bytes::Regex`] still uses UTF-8 patterns.
+pub fn re_bytes_builder(pattern: &[u8]) -> regex::bytes::RegexBuilder {
     use std::io::Write;
 
     // The `regex` crate adds `.*` to the start and end of expressions if there
@@ -764,7 +761,18 @@
     // # Safety
     // This is safe because we escaped all non-ASCII bytes.
     let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
-    let re = regex::bytes::RegexBuilder::new(&pattern_string)
+    regex::bytes::RegexBuilder::new(&pattern_string)
+}
+
+/// Returns a function that matches an `HgPath` against the given regex
+/// pattern.
+///
+/// This can fail when the pattern is invalid or not supported by the
+/// underlying engine (the `regex` crate), for instance anything with
+/// back-references.
+#[logging_timer::time("trace")]
+fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> {
+    let re = re_bytes_builder(pattern)
         .unicode(false)
         // Big repos with big `.hgignore` will hit the default limit and
         // incur a significant performance hit. One repo's `hg status` hit