fileset: combine union of basic patterns into single matcher
authorYuya Nishihara <yuya@tcha.org>
Sat, 21 Jul 2018 17:19:12 +0900
changeset 38865 899b4c74209c
parent 38864 73731fa8d1bd
child 38866 aa33988ad8ab
fileset: combine union of basic patterns into single matcher This appears to improve query performance in a big repository than I thought. Writing less Python in a hot loop, faster computation we gain. $ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*' (orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000) (new) time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000)
mercurial/fileset.py
mercurial/filesetlang.py
mercurial/minifileset.py
tests/test-fileset.t
--- a/mercurial/fileset.py	Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/fileset.py	Sat Jul 21 17:19:12 2018 +0900
@@ -50,6 +50,12 @@
     return stringmatch(mctx, _getkindpat(x, y, matchmod.allpatternkinds,
                                          _("pattern must be a string")))
 
+def patternsmatch(mctx, *xs):
+    allkinds = matchmod.allpatternkinds
+    patterns = [getpattern(x, allkinds, _("pattern must be a string"))
+                for x in xs]
+    return mctx.matcher(patterns)
+
 def andmatch(mctx, x, y):
     xm = getmatch(mctx, x)
     ym = getmatch(mctx, y)
@@ -436,6 +442,7 @@
     'string': stringmatch,
     'symbol': stringmatch,
     'kindpat': kindpatmatch,
+    'patterns': patternsmatch,
     'and': andmatch,
     'or': ormatch,
     'minus': minusmatch,
--- a/mercurial/filesetlang.py	Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/filesetlang.py	Sat Jul 21 17:19:12 2018 +0900
@@ -185,6 +185,21 @@
         return ('minus', ta, tb[1])
     return (op, ta, tb)
 
+def _optimizeunion(xs):
+    # collect string patterns so they can be compiled into a single regexp
+    ws, ts, ss = [], [], []
+    for x in xs:
+        w, t = _optimize(x)
+        if t is not None and t[0] in {'string', 'symbol', 'kindpat'}:
+            ss.append(t)
+            continue
+        ws.append(w)
+        ts.append(t)
+    if ss:
+        ws.append(WEIGHT_CHECK_FILENAME)
+        ts.append(('patterns',) + tuple(ss))
+    return ws, ts
+
 def _optimize(x):
     if x is None:
         return 0, x
@@ -206,7 +221,9 @@
         else:
             return wb, _optimizeandops(op, tb, ta)
     if op == 'or':
-        ws, ts = zip(*(_optimize(y) for y in x[1:]))
+        ws, ts = _optimizeunion(x[1:])
+        if len(ts) == 1:
+            return ws[0], ts[0] # 'or' operation is fully optimized out
         ts = tuple(it[1] for it in sorted(enumerate(ts),
                                           key=lambda it: ws[it[0]]))
         return max(ws), (op,) + ts
--- a/mercurial/minifileset.py	Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/minifileset.py	Sat Jul 21 17:19:12 2018 +0900
@@ -40,7 +40,7 @@
             return f
         raise error.ParseError(_("unsupported file pattern: %s") % name,
                                hint=_('paths must be prefixed with "path:"'))
-    elif op == 'or':
+    elif op in {'or', 'patterns'}:
         funcs = [_compile(x) for x in tree[1:]]
         return lambda n, s: any(f(n, s) for f in funcs)
     elif op == 'and':
--- a/tests/test-fileset.t	Sat Jul 21 17:13:34 2018 +0900
+++ b/tests/test-fileset.t	Sat Jul 21 17:19:12 2018 +0900
@@ -53,9 +53,7 @@
       (symbol 'glob')
       (symbol 'b?')))
   * matcher:
-  <unionmatcher matchers=[
-    <patternmatcher patterns='(?:a1(?:/|$))'>,
-    <patternmatcher patterns='(?:b.$)'>]>
+  <patternmatcher patterns='(?:a1(?:/|$)|b.$)'>
   a1
   b1
   b2
@@ -182,8 +180,9 @@
         None)))
   * optimized:
   (or
-    (symbol 'a1')
-    (symbol 'a2')
+    (patterns
+      (symbol 'a1')
+      (symbol 'a2'))
     (and
       (func
         (symbol 'clean')
@@ -193,8 +192,7 @@
         (string 'b'))))
   * matcher:
   <unionmatcher matchers=[
-    <patternmatcher patterns='(?:a1$)'>,
-    <patternmatcher patterns='(?:a2$)'>,
+    <patternmatcher patterns='(?:a1$|a2$)'>,
     <intersectionmatcher
       m1=<predicatenmatcher pred=clean>,
       m2=<predicatenmatcher pred=grep('b')>>]>
@@ -203,13 +201,30 @@
   b1
   b2
 
+Union of basic patterns:
+
+  $ fileset -p optimized -s -r. 'a1 or a2 or path:b1'
+  * optimized:
+  (patterns
+    (symbol 'a1')
+    (symbol 'a2')
+    (kindpat
+      (symbol 'path')
+      (symbol 'b1')))
+  * matcher:
+  <patternmatcher patterns='(?:a1$|a2$|b1(?:/|$))'>
+  a1
+  a2
+  b1
+
 OR expression should be reordered by weight:
 
   $ fileset -p optimized -s -r. 'grep("a") or a1 or grep("b") or b2'
   * optimized:
   (or
-    (symbol 'a1')
-    (symbol 'b2')
+    (patterns
+      (symbol 'a1')
+      (symbol 'b2'))
     (func
       (symbol 'grep')
       (string 'a'))
@@ -218,8 +233,7 @@
       (string 'b')))
   * matcher:
   <unionmatcher matchers=[
-    <patternmatcher patterns='(?:a1$)'>,
-    <patternmatcher patterns='(?:b2$)'>,
+    <patternmatcher patterns='(?:a1$|b2$)'>,
     <predicatenmatcher pred=grep('a')>,
     <predicatenmatcher pred=grep('b')>]>
   a1