New lazy index code for revlogs.
authormason@suse.com
Tue, 04 Apr 2006 16:47:12 -0400
changeset 2079 ee96ca273f32
parent 2078 441ea218414e
child 2080 1cbb14c048cb
New lazy index code for revlogs. This tunes for large repositories. It does not read the whole index file in one big chunk, but tries to buffer reads in more reasonable chunks instead. Search speeds are improved in two ways. When trying to find a specific sha hash, it searches from the end of the file backward. More recent entries are more likely to be relevant, especially the tip. Also, this can load only the mapping of nodes to revlog index number. Loading the map uses less cpu (no struct.unpack) and much less memory than loading both the map and the index. This cuts down the time for hg tip on the 80,000 changeset kernel repo from 1.8s to 3.69s. Most commands the pull a single rev out of a big index get roughly the same benefit. Commands that read the whole index are not slower.
mercurial/revlog.py
--- a/mercurial/revlog.py	Tue Apr 04 16:38:44 2006 -0400
+++ b/mercurial/revlog.py	Tue Apr 04 16:47:12 2006 -0400
@@ -64,6 +64,7 @@
     raise RevlogError(_("unknown compression type %r") % t)
 
 indexformatv0 = ">4l20s20s20s"
+v0shaoffset = 56
 # index ng:
 # 6 bytes offset
 # 2 bytes flags
@@ -75,48 +76,135 @@
 # 4 bytes parent 2 rev
 # 32 bytes: nodeid
 indexformatng = ">Qiiiiii20s12x"
+ngshaoffset = 32
 versionformat = ">i"
 
 class lazyparser(object):
     """
     this class avoids the need to parse the entirety of large indices
-
-    By default we parse and load 1000 entries at a time.
-
-    If no position is specified, we load the whole index, and replace
-    the lazy objects in revlog with the underlying objects for
-    efficiency in cases where we look at most of the nodes.
     """
-    def __init__(self, data, revlog, indexformat):
-        self.data = data
+    def __init__(self, dataf, size, indexformat, shaoffset):
+        self.dataf = dataf
+        self.format = indexformat
         self.s = struct.calcsize(indexformat)
         self.indexformat = indexformat
-        self.l = len(data)/self.s
+        self.datasize = size
+        self.l = size/self.s
         self.index = [None] * self.l
         self.map = {nullid: -1}
+        self.allmap = 0
         self.all = 0
-        self.revlog = revlog
+        self.mapfind_count = 0
+        self.shaoffset = shaoffset
 
-    def load(self, pos=None):
+    def loadmap(self):
+        """
+        during a commit, we need to make sure the rev being added is
+        not a duplicate.  This requires loading the entire index,
+        which is fairly slow.  loadmap can load up just the node map,
+        which takes much less time.
+        """
+        if self.allmap: return
+        start = 0
+        end = self.datasize
+        self.allmap = 1
+        cur = 0
+        count = 0
+        blocksize = self.s * 256
+        self.dataf.seek(0)
+        while cur < end:
+            data = self.dataf.read(blocksize)
+            off = 0
+            for x in xrange(256):
+                n = data[off + self.shaoffset:off + self.shaoffset + 20]
+                self.map[n] = count
+                count += 1
+                if count >= self.l:
+                    break
+                off += self.s
+            cur += blocksize
+
+    def loadblock(self, blockstart, blocksize, data=None):
         if self.all: return
-        if pos is not None:
-            block = pos / 1000
-            i = block * 1000
-            end = min(self.l, i + 1000)
+        if data is None:
+            self.dataf.seek(blockstart)
+            data = self.dataf.read(blocksize)
+        lend = len(data) / self.s
+        i = blockstart / self.s
+        off = 0
+        for x in xrange(lend):
+            if self.index[i + x] == None:
+                b = data[off : off + self.s]
+                e = struct.unpack(self.format, b)
+                self.index[i + x] = e
+                self.map[e[-1]] = i + x
+            off += self.s
+
+    def findnode(self, node):
+        """search backwards through the index file for a specific node"""
+        if self.allmap: return None
+
+        # hg log will cause many many searches for the manifest
+        # nodes.  After we get called a few times, just load the whole
+        # thing.
+        if self.mapfind_count > 8:
+            self.loadmap()
+            if node in self.map:
+                return node
+            return None
+        self.mapfind_count += 1
+        last = self.l - 1
+        while self.index[last] != None:
+            if last == 0:
+                self.all = 1
+                self.allmap = 1
+                return None
+            last -= 1
+        end = (last + 1) * self.s
+        blocksize = self.s * 256
+        while end >= 0:
+            start = max(end - blocksize, 0)
+            self.dataf.seek(start)
+            data = self.dataf.read(end - start)
+            findend = end - start
+            while True:
+                # we're searching backwards, so weh have to make sure
+                # we don't find a changeset where this node is a parent
+                off = data.rfind(node, 0, findend)
+                findend = off
+                if off >= 0:
+                    i = off / self.s
+                    off = i * self.s
+                    n = data[off + self.shaoffset:off + self.shaoffset + 20]
+                    if n == node:
+                        self.map[n] = i + start / self.s
+                        return node
+                else:
+                    break
+            end -= blocksize
+        return None
+
+    def loadindex(self, i=None, end=None):
+        if self.all: return
+        all = False
+        if i == None:
+            blockstart = 0
+            blocksize = (512 / self.s) * self.s
+            end = self.datasize
+            all = True
         else:
-            self.all = 1
-            i = 0
-            end = self.l
-            self.revlog.index = self.index
-            self.revlog.nodemap = self.map
-
-        while i < end:
-            if not self.index[i]:
-                d = self.data[i * self.s: (i + 1) * self.s]
-                e = struct.unpack(self.indexformat, d)
-                self.index[i] = e
-                self.map[e[-1]] = i
-            i += 1
+            if end:
+                blockstart = i * self.s
+                end = end * self.s
+                blocksize = end - blockstart
+            else:
+                blockstart = (i & ~(32)) * self.s
+                blocksize = self.s * 64
+                end = blockstart + blocksize
+        while blockstart < end:
+            self.loadblock(blockstart, blocksize)
+            blockstart += blocksize
+        if all: self.all = True
 
 class lazyindex(object):
     """a lazy version of the index array"""
@@ -127,7 +215,7 @@
     def load(self, pos):
         if pos < 0:
             pos += len(self.p.index)
-        self.p.load(pos)
+        self.p.loadindex(pos)
         return self.p.index[pos]
     def __getitem__(self, pos):
         return self.p.index[pos] or self.load(pos)
@@ -143,14 +231,13 @@
     def __init__(self, parser):
         self.p = parser
     def load(self, key):
-        if self.p.all: return
-        n = self.p.data.find(key)
-        if n < 0:
+        n = self.p.findnode(key)
+        if n == None:
             raise KeyError(key)
-        pos = n / self.p.s
-        self.p.load(pos)
     def __contains__(self, key):
-        self.p.load()
+        if key in self.p.map:
+            return True
+        self.p.loadmap()
         return key in self.p.map
     def __iter__(self):
         yield nullid
@@ -158,7 +245,7 @@
             try:
                 yield self.p.index[i][-1]
             except:
-                self.p.load(i)
+                self.p.loadindex(i)
                 yield self.p.index[i][-1]
     def __getitem__(self, key):
         try:
@@ -222,7 +309,8 @@
         v = self.defversion
         try:
             f = self.opener(self.indexfile)
-            i = f.read()
+            i = f.read(4)
+            f.seek(0)
         except IOError, inst:
             if inst.errno != errno.ENOENT:
                 raise
@@ -241,7 +329,7 @@
                     return
                 self.indexstat = st
                 if len(i) > 0:
-                    v = struct.unpack(versionformat, i[:4])[0]
+                    v = struct.unpack(versionformat, i)[0]
         flags = v & ~0xFFFF
         fmt = v & 0xFFFF
         if fmt == 0:
@@ -258,16 +346,19 @@
         self.version = v
         if v == 0:
             self.indexformat = indexformatv0
+            shaoffset = v0shaoffset
         else:
             self.indexformat = indexformatng
+            shaoffset = ngshaoffset
 
         if i:
             if not self.inlinedata() and st and st.st_size > 10000:
                 # big index, let's parse it on demand
-                parser = lazyparser(i, self, self.indexformat)
+                parser = lazyparser(f, st.st_size, self.indexformat, shaoffset)
                 self.index = lazyindex(parser)
                 self.nodemap = lazymap(parser)
             else:
+                i = f.read()
                 self.parseindex(i)
             if self.inlinedata():
                 # we've already got the entire data file read in, save it
@@ -312,11 +403,23 @@
     def offset_type(self, offset, type):
         return long(long(offset) << 16 | type)
 
+    def loadindex(self, start, end):
+        """load a block of indexes all at once from the lazy parser"""
+        if isinstance(self.index, lazyindex):
+            self.index.p.loadindex(start, end)
+
     def loadindexmap(self):
         """loads both the map and the index from the lazy parser"""
         if isinstance(self.index, lazyindex):
             p = self.index.p
-            p.load()
+            p.loadindex()
+            self.nodemap = p.map
+
+    def loadmap(self):
+        """loads the map from the lazy parser"""
+        if isinstance(self.nodemap, lazymap):
+            self.nodemap.p.loadmap()
+            self.nodemap = self.nodemap.p.map
 
     def inlinedata(self): return self.version & REVLOGNGINLINEDATA
     def tip(self): return self.node(len(self.index) - 1)
@@ -690,7 +793,9 @@
         if self.cache and self.cache[1] >= base and self.cache[1] < rev:
             base = self.cache[1]
             text = self.cache[2]
+            self.loadindex(base, rev + 1)
         else:
+            self.loadindex(base, rev + 1)
             text = self.chunk(base, df=df)
 
         bins = []