diff -r 9aeb9e2d28a7 -r 3a333a582d7b hgext/remotefilelog/historypack.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/remotefilelog/historypack.py Thu Sep 27 13:03:19 2018 -0400 @@ -0,0 +1,545 @@ +from __future__ import absolute_import + +import hashlib +import struct + +from mercurial.node import hex, nullid +from mercurial import ( + pycompat, + util, +) +from . import ( + basepack, + constants, + shallowutil, +) + +# (filename hash, offset, size) +INDEXFORMAT0 = '!20sQQ' +INDEXENTRYLENGTH0 = struct.calcsize(INDEXFORMAT0) +INDEXFORMAT1 = '!20sQQII' +INDEXENTRYLENGTH1 = struct.calcsize(INDEXFORMAT1) +NODELENGTH = 20 + +NODEINDEXFORMAT = '!20sQ' +NODEINDEXENTRYLENGTH = struct.calcsize(NODEINDEXFORMAT) + +# (node, p1, p2, linknode) +PACKFORMAT = "!20s20s20s20sH" +PACKENTRYLENGTH = 82 + +ENTRYCOUNTSIZE = 4 + +INDEXSUFFIX = '.histidx' +PACKSUFFIX = '.histpack' + +ANC_NODE = 0 +ANC_P1NODE = 1 +ANC_P2NODE = 2 +ANC_LINKNODE = 3 +ANC_COPYFROM = 4 + +class historypackstore(basepack.basepackstore): + INDEXSUFFIX = INDEXSUFFIX + PACKSUFFIX = PACKSUFFIX + + def getpack(self, path): + return historypack(path) + + def getancestors(self, name, node, known=None): + for pack in self.packs: + try: + return pack.getancestors(name, node, known=known) + except KeyError: + pass + + for pack in self.refresh(): + try: + return pack.getancestors(name, node, known=known) + except KeyError: + pass + + raise KeyError((name, node)) + + def getnodeinfo(self, name, node): + for pack in self.packs: + try: + return pack.getnodeinfo(name, node) + except KeyError: + pass + + for pack in self.refresh(): + try: + return pack.getnodeinfo(name, node) + except KeyError: + pass + + raise KeyError((name, node)) + + def add(self, filename, node, p1, p2, linknode, copyfrom): + raise RuntimeError("cannot add to historypackstore (%s:%s)" + % (filename, hex(node))) + +class historypack(basepack.basepack): + INDEXSUFFIX = INDEXSUFFIX + PACKSUFFIX = PACKSUFFIX + + SUPPORTED_VERSIONS = [0, 1] + + def __init__(self, path): + super(historypack, self).__init__(path) + + if self.VERSION == 0: + self.INDEXFORMAT = INDEXFORMAT0 + self.INDEXENTRYLENGTH = INDEXENTRYLENGTH0 + else: + self.INDEXFORMAT = INDEXFORMAT1 + self.INDEXENTRYLENGTH = INDEXENTRYLENGTH1 + + def getmissing(self, keys): + missing = [] + for name, node in keys: + try: + self._findnode(name, node) + except KeyError: + missing.append((name, node)) + + return missing + + def getancestors(self, name, node, known=None): + """Returns as many ancestors as we're aware of. + + return value: { + node: (p1, p2, linknode, copyfrom), + ... + } + """ + if known and node in known: + return [] + + ancestors = self._getancestors(name, node, known=known) + results = {} + for ancnode, p1, p2, linknode, copyfrom in ancestors: + results[ancnode] = (p1, p2, linknode, copyfrom) + + if not results: + raise KeyError((name, node)) + return results + + def getnodeinfo(self, name, node): + # Drop the node from the tuple before returning, since the result should + # just be (p1, p2, linknode, copyfrom) + return self._findnode(name, node)[1:] + + def _getancestors(self, name, node, known=None): + if known is None: + known = set() + section = self._findsection(name) + filename, offset, size, nodeindexoffset, nodeindexsize = section + pending = set((node,)) + o = 0 + while o < size: + if not pending: + break + entry, copyfrom = self._readentry(offset + o) + o += PACKENTRYLENGTH + if copyfrom: + o += len(copyfrom) + + ancnode = entry[ANC_NODE] + if ancnode in pending: + pending.remove(ancnode) + p1node = entry[ANC_P1NODE] + p2node = entry[ANC_P2NODE] + if p1node != nullid and p1node not in known: + pending.add(p1node) + if p2node != nullid and p2node not in known: + pending.add(p2node) + + yield (ancnode, p1node, p2node, entry[ANC_LINKNODE], copyfrom) + + def _readentry(self, offset): + data = self._data + entry = struct.unpack(PACKFORMAT, data[offset:offset + PACKENTRYLENGTH]) + copyfrom = None + copyfromlen = entry[ANC_COPYFROM] + if copyfromlen != 0: + offset += PACKENTRYLENGTH + copyfrom = data[offset:offset + copyfromlen] + return entry, copyfrom + + def add(self, filename, node, p1, p2, linknode, copyfrom): + raise RuntimeError("cannot add to historypack (%s:%s)" % + (filename, hex(node))) + + def _findnode(self, name, node): + if self.VERSION == 0: + ancestors = self._getancestors(name, node) + for ancnode, p1node, p2node, linknode, copyfrom in ancestors: + if ancnode == node: + return (ancnode, p1node, p2node, linknode, copyfrom) + else: + section = self._findsection(name) + nodeindexoffset, nodeindexsize = section[3:] + entry = self._bisect(node, nodeindexoffset, + nodeindexoffset + nodeindexsize, + NODEINDEXENTRYLENGTH) + if entry is not None: + node, offset = struct.unpack(NODEINDEXFORMAT, entry) + entry, copyfrom = self._readentry(offset) + # Drop the copyfromlen from the end of entry, and replace it + # with the copyfrom string. + return entry[:4] + (copyfrom,) + + raise KeyError("unable to find history for %s:%s" % (name, hex(node))) + + def _findsection(self, name): + params = self.params + namehash = hashlib.sha1(name).digest() + fanoutkey = struct.unpack(params.fanoutstruct, + namehash[:params.fanoutprefix])[0] + fanout = self._fanouttable + + start = fanout[fanoutkey] + params.indexstart + indexend = self._indexend + + for i in pycompat.xrange(fanoutkey + 1, params.fanoutcount): + end = fanout[i] + params.indexstart + if end != start: + break + else: + end = indexend + + entry = self._bisect(namehash, start, end, self.INDEXENTRYLENGTH) + if not entry: + raise KeyError(name) + + rawentry = struct.unpack(self.INDEXFORMAT, entry) + if self.VERSION == 0: + x, offset, size = rawentry + nodeindexoffset = None + nodeindexsize = None + else: + x, offset, size, nodeindexoffset, nodeindexsize = rawentry + rawnamelen = self._index[nodeindexoffset:nodeindexoffset + + constants.FILENAMESIZE] + actualnamelen = struct.unpack('!H', rawnamelen)[0] + nodeindexoffset += constants.FILENAMESIZE + actualname = self._index[nodeindexoffset:nodeindexoffset + + actualnamelen] + if actualname != name: + raise KeyError("found file name %s when looking for %s" % + (actualname, name)) + nodeindexoffset += actualnamelen + + filenamelength = struct.unpack('!H', self._data[offset:offset + + constants.FILENAMESIZE])[0] + offset += constants.FILENAMESIZE + + actualname = self._data[offset:offset + filenamelength] + offset += filenamelength + + if name != actualname: + raise KeyError("found file name %s when looking for %s" % + (actualname, name)) + + # Skip entry list size + offset += ENTRYCOUNTSIZE + + nodelistoffset = offset + nodelistsize = (size - constants.FILENAMESIZE - filenamelength - + ENTRYCOUNTSIZE) + return (name, nodelistoffset, nodelistsize, + nodeindexoffset, nodeindexsize) + + def _bisect(self, node, start, end, entrylen): + # Bisect between start and end to find node + origstart = start + startnode = self._index[start:start + NODELENGTH] + endnode = self._index[end:end + NODELENGTH] + + if startnode == node: + return self._index[start:start + entrylen] + elif endnode == node: + return self._index[end:end + entrylen] + else: + while start < end - entrylen: + mid = start + (end - start) / 2 + mid = mid - ((mid - origstart) % entrylen) + midnode = self._index[mid:mid + NODELENGTH] + if midnode == node: + return self._index[mid:mid + entrylen] + if node > midnode: + start = mid + startnode = midnode + elif node < midnode: + end = mid + endnode = midnode + return None + + def markledger(self, ledger, options=None): + for filename, node in self: + ledger.markhistoryentry(self, filename, node) + + def cleanup(self, ledger): + entries = ledger.sources.get(self, []) + allkeys = set(self) + repackedkeys = set((e.filename, e.node) for e in entries if + e.historyrepacked) + + if len(allkeys - repackedkeys) == 0: + if self.path not in ledger.created: + util.unlinkpath(self.indexpath, ignoremissing=True) + util.unlinkpath(self.packpath, ignoremissing=True) + + def __iter__(self): + for f, n, x, x, x, x in self.iterentries(): + yield f, n + + def iterentries(self): + # Start at 1 to skip the header + offset = 1 + while offset < self.datasize: + data = self._data + # <2 byte len> + + filenamelen = struct.unpack('!H', data[offset:offset + + constants.FILENAMESIZE])[0] + offset += constants.FILENAMESIZE + filename = data[offset:offset + filenamelen] + offset += filenamelen + + revcount = struct.unpack('!I', data[offset:offset + + ENTRYCOUNTSIZE])[0] + offset += ENTRYCOUNTSIZE + + for i in pycompat.xrange(revcount): + entry = struct.unpack(PACKFORMAT, data[offset:offset + + PACKENTRYLENGTH]) + offset += PACKENTRYLENGTH + + copyfrom = data[offset:offset + entry[ANC_COPYFROM]] + offset += entry[ANC_COPYFROM] + + yield (filename, entry[ANC_NODE], entry[ANC_P1NODE], + entry[ANC_P2NODE], entry[ANC_LINKNODE], copyfrom) + + self._pagedin += PACKENTRYLENGTH + + # If we've read a lot of data from the mmap, free some memory. + self.freememory() + +class mutablehistorypack(basepack.mutablebasepack): + """A class for constructing and serializing a histpack file and index. + + A history pack is a pair of files that contain the revision history for + various file revisions in Mercurial. It contains only revision history (like + parent pointers and linknodes), not any revision content information. + + It consists of two files, with the following format: + + .histpack + The pack itself is a series of file revisions with some basic header + information on each. + + datapack = + [,...] + filesection = + + + [,...] + revision = + + + + + + + The revisions within each filesection are stored in topological order + (newest first). If a given entry has a parent from another file (a copy) + then p1node is the node from the other file, and copyfrom is the + filepath of the other file. + + .histidx + The index file provides a mapping from filename to the file section in + the histpack. In V1 it also contains sub-indexes for specific nodes + within each file. It consists of three parts, the fanout, the file index + and the node indexes. + + The file index is a list of index entries, sorted by filename hash (one + per file section in the pack). Each entry has: + + - node (The 20 byte hash of the filename) + - pack entry offset (The location of this file section in the histpack) + - pack content size (The on-disk length of this file section's pack + data) + - node index offset (The location of the file's node index in the index + file) [1] + - node index size (the on-disk length of this file's node index) [1] + + The fanout is a quick lookup table to reduce the number of steps for + bisecting the index. It is a series of 4 byte pointers to positions + within the index. It has 2^16 entries, which corresponds to hash + prefixes [00, 01, 02,..., FD, FE, FF]. Example: the pointer in slot 4F + points to the index position of the first revision whose node starts + with 4F. This saves log(2^16) bisect steps. + + dataidx = + [1] + + [1] + [,...] [1] + fanouttable = [,...] (2^16 entries) + + fileindex = [,...] + fileindexentry = + + + [1] + [1] + nodeindex = [,...] [1] + filename = [1] + nodeindexentry = [1] + [1] + + [1]: new in version 1. + """ + INDEXSUFFIX = INDEXSUFFIX + PACKSUFFIX = PACKSUFFIX + + SUPPORTED_VERSIONS = [0, 1] + + def __init__(self, ui, packpath, version=0): + # internal config: remotefilelog.historypackv1 + if version == 0 and ui.configbool('remotefilelog', 'historypackv1'): + version = 1 + + super(mutablehistorypack, self).__init__(ui, packpath, version=version) + self.files = {} + self.entrylocations = {} + self.fileentries = {} + + if version == 0: + self.INDEXFORMAT = INDEXFORMAT0 + self.INDEXENTRYLENGTH = INDEXENTRYLENGTH0 + else: + self.INDEXFORMAT = INDEXFORMAT1 + self.INDEXENTRYLENGTH = INDEXENTRYLENGTH1 + + self.NODEINDEXFORMAT = NODEINDEXFORMAT + self.NODEINDEXENTRYLENGTH = NODEINDEXENTRYLENGTH + + def add(self, filename, node, p1, p2, linknode, copyfrom): + copyfrom = copyfrom or '' + copyfromlen = struct.pack('!H', len(copyfrom)) + self.fileentries.setdefault(filename, []).append((node, p1, p2, + linknode, + copyfromlen, + copyfrom)) + + def _write(self): + for filename in sorted(self.fileentries): + entries = self.fileentries[filename] + sectionstart = self.packfp.tell() + + # Write the file section content + entrymap = dict((e[0], e) for e in entries) + def parentfunc(node): + x, p1, p2, x, x, x = entrymap[node] + parents = [] + if p1 != nullid: + parents.append(p1) + if p2 != nullid: + parents.append(p2) + return parents + + sortednodes = list(reversed(shallowutil.sortnodes( + (e[0] for e in entries), + parentfunc))) + + # Write the file section header + self.writeraw("%s%s%s" % ( + struct.pack('!H', len(filename)), + filename, + struct.pack('!I', len(sortednodes)), + )) + + sectionlen = constants.FILENAMESIZE + len(filename) + 4 + + rawstrings = [] + + # Record the node locations for the index + locations = self.entrylocations.setdefault(filename, {}) + offset = sectionstart + sectionlen + for node in sortednodes: + locations[node] = offset + raw = '%s%s%s%s%s%s' % entrymap[node] + rawstrings.append(raw) + offset += len(raw) + + rawdata = ''.join(rawstrings) + sectionlen += len(rawdata) + + self.writeraw(rawdata) + + # Record metadata for the index + self.files[filename] = (sectionstart, sectionlen) + node = hashlib.sha1(filename).digest() + self.entries[node] = node + + def close(self, ledger=None): + if self._closed: + return + + self._write() + + return super(mutablehistorypack, self).close(ledger=ledger) + + def createindex(self, nodelocations, indexoffset): + fileindexformat = self.INDEXFORMAT + fileindexlength = self.INDEXENTRYLENGTH + nodeindexformat = self.NODEINDEXFORMAT + nodeindexlength = self.NODEINDEXENTRYLENGTH + version = self.VERSION + + files = ((hashlib.sha1(filename).digest(), filename, offset, size) + for filename, (offset, size) in self.files.iteritems()) + files = sorted(files) + + # node index is after file index size, file index, and node index size + indexlensize = struct.calcsize('!Q') + nodeindexoffset = (indexoffset + indexlensize + + (len(files) * fileindexlength) + indexlensize) + + fileindexentries = [] + nodeindexentries = [] + nodecount = 0 + for namehash, filename, offset, size in files: + # File section index + if version == 0: + rawentry = struct.pack(fileindexformat, namehash, offset, size) + else: + nodelocations = self.entrylocations[filename] + + nodeindexsize = len(nodelocations) * nodeindexlength + + rawentry = struct.pack(fileindexformat, namehash, offset, size, + nodeindexoffset, nodeindexsize) + # Node index + nodeindexentries.append(struct.pack(constants.FILENAMESTRUCT, + len(filename)) + filename) + nodeindexoffset += constants.FILENAMESIZE + len(filename) + + for node, location in sorted(nodelocations.iteritems()): + nodeindexentries.append(struct.pack(nodeindexformat, node, + location)) + nodecount += 1 + + nodeindexoffset += len(nodelocations) * nodeindexlength + + fileindexentries.append(rawentry) + + nodecountraw = '' + if version == 1: + nodecountraw = struct.pack('!Q', nodecount) + return (''.join(fileindexentries) + nodecountraw + + ''.join(nodeindexentries))