tests: add a randomized test for pathencode
authorBryan O'Sullivan <bryano@fb.com>
Thu, 15 Nov 2012 10:04:29 -0800
changeset 17934 736f1c09f321
parent 17933 8243dd66e0e3
child 17935 9c888b945b65
tests: add a randomized test for pathencode This is a probabilistic test - it generates different test cases on every run, unless invoked from the command line with a specific seed. The default number of tests run should make the tests take about a second to complete on a semi-modern laptop.
tests/test-pathencode.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-pathencode.py	Thu Nov 15 10:04:29 2012 -0800
@@ -0,0 +1,190 @@
+# This is a randomized test that generates different pathnames every
+# time it is invoked, and tests the encoding of those pathnames.
+#
+# It uses a simple probabilistic model to generate valid pathnames
+# that have proven likely to expose bugs and divergent behaviour in
+# different encoding implementations.
+
+from mercurial import parsers
+from mercurial import store
+import binascii, itertools, math, os, random, sys, time
+from collections import defaultdict
+
+def hybridencode(path):
+    return store._hybridencode(path, True)
+
+validchars = set(map(chr, range(0, 256)))
+alphanum = range(ord('A'), ord('Z'))
+
+for c in '\0/':
+    validchars.remove(c)
+
+winreserved = ('aux con prn nul'.split() +
+               ['com%d' % i for i in xrange(1, 10)] +
+               ['lpt%d' % i for i in xrange(1, 10)])
+
+def casecombinations(names):
+    '''Build all case-diddled combinations of names.'''
+
+    combos = set()
+
+    for r in names:
+        for i in xrange(len(r) + 1):
+            for c in itertools.combinations(xrange(len(r)), i):
+                d = r
+                for j in c:
+                    d = ''.join((d[:j], d[j].upper(), d[j + 1:]))
+                combos.add(d)
+    return sorted(combos)
+
+def buildprobtable(fp, cmd='hg manifest tip'):
+    '''Construct and print a table of probabilities for path name
+    components.  The numbers are percentages.'''
+
+    counts = defaultdict(lambda: 0)
+    for line in os.popen(cmd).read().splitlines():
+        if line[-2:] in ('.i', '.d'):
+            line = line[:-2]
+        if line.startswith('data/'):
+            line = line[5:]
+        for c in line:
+            counts[c] += 1
+    for c in '\r/\n':
+        counts.pop(c, None)
+    t = sum(counts.itervalues()) / 100.0
+    fp.write('probtable = (')
+    for i, (k, v) in enumerate(sorted(counts.iteritems(), key=lambda x: x[1],
+                                      reverse=True)):
+        if (i % 5) == 0:
+            fp.write('\n    ')
+        vt = v / t
+        if vt < 0.0005:
+            break
+        fp.write('(%r, %.03f), ' % (k, vt))
+    fp.write('\n    )\n')
+
+# A table of character frequencies (as percentages), gleaned by
+# looking at filelog names from a real-world, very large repo.
+
+probtable = (
+    ('t', 9.828), ('e', 9.042), ('s', 8.011), ('a', 6.801), ('i', 6.618),
+    ('g', 5.053), ('r', 5.030), ('o', 4.887), ('p', 4.363), ('n', 4.258),
+    ('l', 3.830), ('h', 3.693), ('_', 3.659), ('.', 3.377), ('m', 3.194),
+    ('u', 2.364), ('d', 2.296), ('c', 2.163), ('b', 1.739), ('f', 1.625),
+    ('6', 0.666), ('j', 0.610), ('y', 0.554), ('x', 0.487), ('w', 0.477),
+    ('k', 0.476), ('v', 0.473), ('3', 0.336), ('1', 0.335), ('2', 0.326),
+    ('4', 0.310), ('5', 0.305), ('9', 0.302), ('8', 0.300), ('7', 0.299),
+    ('q', 0.298), ('0', 0.250), ('z', 0.223), ('-', 0.118), ('C', 0.095),
+    ('T', 0.087), ('F', 0.085), ('B', 0.077), ('S', 0.076), ('P', 0.076),
+    ('L', 0.059), ('A', 0.058), ('N', 0.051), ('D', 0.049), ('M', 0.046),
+    ('E', 0.039), ('I', 0.035), ('R', 0.035), ('G', 0.028), ('U', 0.026),
+    ('W', 0.025), ('O', 0.017), ('V', 0.015), ('H', 0.013), ('Q', 0.011),
+    ('J', 0.007), ('K', 0.005), ('+', 0.004), ('X', 0.003), ('Y', 0.001),
+    )
+
+for c, _ in probtable:
+    validchars.remove(c)
+validchars = list(validchars)
+
+def pickfrom(rng, table):
+    c = 0
+    r = rng.random() * sum(i[1] for i in table)
+    for i, p in table:
+        c += p
+        if c >= r:
+            return i
+
+reservedcombos = casecombinations(winreserved)
+
+# The first component of a name following a slash.
+
+firsttable = (
+    (lambda rng: pickfrom(rng, probtable), 90),
+    (lambda rng: rng.choice(validchars), 5),
+    (lambda rng: rng.choice(reservedcombos), 5),
+    )
+
+# Components of a name following the first.
+
+resttable = firsttable[:-1]
+
+# Special suffixes.
+
+internalsuffixcombos = casecombinations('.hg .i .d'.split())
+
+# The last component of a path, before a slash or at the end of a name.
+
+lasttable = resttable + (
+    (lambda rng: '', 95),
+    (lambda rng: rng.choice(internalsuffixcombos), 5),
+    )
+
+def makepart(rng, k):
+    '''Construct a part of a pathname, without slashes.'''
+
+    p = pickfrom(rng, firsttable)(rng)
+    l = len(p)
+    ps = [p]
+    while l <= k:
+        p = pickfrom(rng, resttable)(rng)
+        l += len(p)
+        ps.append(p)
+    ps.append(pickfrom(rng, lasttable)(rng))
+    return ''.join(ps)
+
+def makepath(rng, j, k):
+    '''Construct a complete pathname.'''
+
+    return ('data/' + '/'.join(makepart(rng, k) for _ in xrange(j)) +
+            rng.choice(['.d', '.i']))
+
+def genpath(rng, count):
+    '''Generate random pathnames with gradually increasing lengths.'''
+
+    mink, maxk = 1, 4096
+    def steps():
+        x, k = 0, mink
+        for i in xrange(count):
+            yield mink + int(round(math.sqrt((maxk - mink) * float(i) / count)))
+    for k in steps():
+        x = rng.randint(1, k)
+        y = rng.randint(1, k)
+        yield makepath(rng, x, y)
+
+def runtests(rng, seed, count):
+    nerrs = 0
+    for p in genpath(rng, count):
+        hybridencode(p)
+    return nerrs
+
+def main():
+    import getopt
+
+    # Empirically observed to take about a second to run
+    count = 100
+    seed = None
+    opts, args = getopt.getopt(sys.argv[1:], 'c:s:',
+                               ['build', 'count=', 'seed='])
+    for o, a in opts:
+        if o in ('-c', '--count'):
+            count = int(a)
+        elif o in ('-s', '--seed'):
+            seed = long(a)
+        elif o == '--build':
+            buildprobtable(sys.stdout,
+                           'find .hg/store/data -type f && '
+                           'cat .hg/store/fncache 2>/dev/null')
+            sys.exit(0)
+
+    if seed is None:
+        try:
+            seed = long(binascii.hexlify(os.urandom(16)), 16)
+        except AttributeError:
+            seed = long(time.time() * 1000)
+
+    rng = random.Random(seed)
+    if runtests(rng, seed, count):
+        sys.exit(1)
+
+if __name__ == '__main__' and sys.version_info[:2] >= (2, 6):
+    main()