revset: make sort() do dumb multi-pass sorting for multiple keys (issue5218) stable
authorYuya Nishihara <yuya@tcha.org>
Sat, 23 Apr 2016 16:09:30 +0900
branchstable
changeset 29001 923fa9e06ea0
parent 29000 2d3837a4bded
child 29002 ea794f2eb19d
revset: make sort() do dumb multi-pass sorting for multiple keys (issue5218) Our invert() function was too clever to not take length into account. I could fix the problem by appending '\xff' as a terminator (opposite to '\0'), but it turned out to be slower than simple multi-pass sorting. New implementation is pretty straightforward, which just calls sort() from the last key. We can do that since Python sort() is guaranteed to be stable. It doesn't sound nice to call sort() multiple times, but actually it is faster. That's probably because we have fewer Python codes in hot loop, and can avoid heavy string and list manipulation. revset #0: sort(0:10000, 'branch') 0) 0.412753 1) 0.393254 revset #1: sort(0:10000, '-branch') 0) 0.455377 1) 0.389191 85% revset #2: sort(0:10000, 'date') 0) 0.408082 1) 0.376332 92% revset #3: sort(0:10000, '-date') 0) 0.406910 1) 0.380498 93% revset #4: sort(0:10000, 'desc branch user date rev') 0) 0.542996 1) 0.486397 89% revset #5: sort(0:10000, '-desc -branch -user -date -rev') 0) 0.965032 1) 0.518426 53%
mercurial/revset.py
tests/test-revset.t
--- a/mercurial/revset.py	Thu Mar 24 22:55:56 2016 +0900
+++ b/mercurial/revset.py	Sat Apr 23 16:09:30 2016 +0900
@@ -1859,9 +1859,6 @@
 
     s = l[0]
     keys = keys.split()
-    l = []
-    def invert(s):
-        return "".join(chr(255 - ord(c)) for c in s)
     revs = getset(repo, subset, s)
     if keys == ["rev"]:
         revs.sort()
@@ -1869,36 +1866,33 @@
     elif keys == ["-rev"]:
         revs.sort(reverse=True)
         return revs
-    for r in revs:
-        c = repo[r]
-        e = []
-        for k in keys:
+    # sort() is guaranteed to be stable
+    ctxs = [repo[r] for r in revs]
+    if True:
+        for k in reversed(keys):
             if k == 'rev':
-                e.append(r)
+                ctxs.sort(key=lambda c: c.rev())
             elif k == '-rev':
-                e.append(-r)
+                ctxs.sort(key=lambda c: c.rev(), reverse=True)
             elif k == 'branch':
-                e.append(c.branch())
+                ctxs.sort(key=lambda c: c.branch())
             elif k == '-branch':
-                e.append(invert(c.branch()))
+                ctxs.sort(key=lambda c: c.branch(), reverse=True)
             elif k == 'desc':
-                e.append(c.description())
+                ctxs.sort(key=lambda c: c.description())
             elif k == '-desc':
-                e.append(invert(c.description()))
+                ctxs.sort(key=lambda c: c.description(), reverse=True)
             elif k in 'user author':
-                e.append(c.user())
+                ctxs.sort(key=lambda c: c.user())
             elif k in '-user -author':
-                e.append(invert(c.user()))
+                ctxs.sort(key=lambda c: c.user(), reverse=True)
             elif k == 'date':
-                e.append(c.date()[0])
+                ctxs.sort(key=lambda c: c.date()[0])
             elif k == '-date':
-                e.append(-c.date()[0])
+                ctxs.sort(key=lambda c: c.date()[0], reverse=True)
             else:
                 raise error.ParseError(_("unknown sort key %r") % k)
-        e.append(r)
-        l.append(e)
-    l.sort()
-    return baseset([e[-1] for e in l])
+    return baseset([c.rev() for c in ctxs])
 
 @predicate('subrepo([pattern])')
 def subrepo(repo, subset, x):
--- a/tests/test-revset.t	Thu Mar 24 22:55:56 2016 +0900
+++ b/tests/test-revset.t	Sat Apr 23 16:09:30 2016 +0900
@@ -952,6 +952,147 @@
   6
   2
 
+  $ cd ..
+
+test sorting by multiple keys including variable-length strings
+
+  $ hg init sorting
+  $ cd sorting
+  $ cat <<EOF >> .hg/hgrc
+  > [ui]
+  > logtemplate = '{rev} {branch|p5}{desc|p5}{author|p5}{date|hgdate}\n'
+  > [templatealias]
+  > p5(s) = pad(s, 5)
+  > EOF
+  $ hg branch -qf b12
+  $ hg ci -m m111 -u u112 -d '111 10800'
+  $ hg branch -qf b11
+  $ hg ci -m m12 -u u111 -d '112 7200'
+  $ hg branch -qf b111
+  $ hg ci -m m11 -u u12 -d '111 3600'
+  $ hg branch -qf b112
+  $ hg ci -m m111 -u u11 -d '120 0'
+  $ hg branch -qf b111
+  $ hg ci -m m112 -u u111 -d '110 14400'
+  created new head
+
+ compare revisions (has fast path):
+
+  $ hg log -r 'sort(all(), rev)'
+  0 b12  m111 u112 111 10800
+  1 b11  m12  u111 112 7200
+  2 b111 m11  u12  111 3600
+  3 b112 m111 u11  120 0
+  4 b111 m112 u111 110 14400
+
+  $ hg log -r 'sort(all(), -rev)'
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+  2 b111 m11  u12  111 3600
+  1 b11  m12  u111 112 7200
+  0 b12  m111 u112 111 10800
+
+ compare variable-length strings (issue5218):
+
+  $ hg log -r 'sort(all(), branch)'
+  1 b11  m12  u111 112 7200
+  2 b111 m11  u12  111 3600
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+  0 b12  m111 u112 111 10800
+
+  $ hg log -r 'sort(all(), -branch)'
+  0 b12  m111 u112 111 10800
+  3 b112 m111 u11  120 0
+  2 b111 m11  u12  111 3600
+  4 b111 m112 u111 110 14400
+  1 b11  m12  u111 112 7200
+
+  $ hg log -r 'sort(all(), desc)'
+  2 b111 m11  u12  111 3600
+  0 b12  m111 u112 111 10800
+  3 b112 m111 u11  120 0
+  4 b111 m112 u111 110 14400
+  1 b11  m12  u111 112 7200
+
+  $ hg log -r 'sort(all(), -desc)'
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  0 b12  m111 u112 111 10800
+  3 b112 m111 u11  120 0
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(all(), user)'
+  3 b112 m111 u11  120 0
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(all(), -user)'
+  2 b111 m11  u12  111 3600
+  0 b12  m111 u112 111 10800
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+
+ compare dates (tz offset should have no effect):
+
+  $ hg log -r 'sort(all(), date)'
+  4 b111 m112 u111 110 14400
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+  1 b11  m12  u111 112 7200
+  3 b112 m111 u11  120 0
+
+  $ hg log -r 'sort(all(), -date)'
+  3 b112 m111 u11  120 0
+  1 b11  m12  u111 112 7200
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+  4 b111 m112 u111 110 14400
+
+ be aware that 'sort(x, -k)' is not exactly the same as 'reverse(sort(x, k))'
+ because '-k' reverses the comparison, not the list itself:
+
+  $ hg log -r 'sort(0 + 2, date)'
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(0 + 2, -date)'
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'reverse(sort(0 + 2, date))'
+  2 b111 m11  u12  111 3600
+  0 b12  m111 u112 111 10800
+
+ sort by multiple keys:
+
+  $ hg log -r 'sort(all(), "branch -rev")'
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  2 b111 m11  u12  111 3600
+  3 b112 m111 u11  120 0
+  0 b12  m111 u112 111 10800
+
+  $ hg log -r 'sort(all(), "-desc -date")'
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(all(), "user -branch date rev")'
+  3 b112 m111 u11  120 0
+  4 b111 m112 u111 110 14400
+  1 b11  m12  u111 112 7200
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ cd ..
+  $ cd repo
+
 test subtracting something from an addset
 
   $ log '(outgoing() or removes(a)) - removes(a)'