hgext/remotefilelog/datapack.py
from __future__ import absolute_import

import struct

from mercurial.node import hex, nullid
from mercurial.i18n import _
from mercurial import (
    error,
    pycompat,
    util,
)
from . import (
    basepack,
    constants,
    lz4wrapper,
    shallowutil,
)

NODELENGTH = 20

# The indicator value in the index for a fulltext entry.
FULLTEXTINDEXMARK = -1
NOBASEINDEXMARK = -2

INDEXSUFFIX = '.dataidx'
PACKSUFFIX = '.datapack'

class datapackstore(basepack.basepackstore):
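    """A store that serves revision deltas out of a collection of datapack
    files on disk.

    Illustrative usage sketch (not part of the original source; assumes a
    ``ui`` object and a directory of pack files at ``packdir``):

        store = datapackstore(ui, packdir)
        chain = store.getdeltachain(filename, node)
        meta = store.getmeta(filename, node)
    """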
       
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    def __init__(self, ui, path):
        super(datapackstore, self).__init__(ui, path)

    def getpack(self, path):
        return datapack(path)

    def get(self, name, node):
        raise RuntimeError("must use getdeltachain with datapackstore")

    def getmeta(self, name, node):
        for pack in self.packs:
            try:
                return pack.getmeta(name, node)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return pack.getmeta(name, node)
            except KeyError:
                pass

        raise KeyError((name, hex(node)))

    def getdelta(self, name, node):
        for pack in self.packs:
            try:
                return pack.getdelta(name, node)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return pack.getdelta(name, node)
            except KeyError:
                pass

        raise KeyError((name, hex(node)))

    def getdeltachain(self, name, node):
        for pack in self.packs:
            try:
                return pack.getdeltachain(name, node)
            except KeyError:
                pass

        for pack in self.refresh():
            try:
                return pack.getdeltachain(name, node)
            except KeyError:
                pass

        raise KeyError((name, hex(node)))

    def add(self, name, node, data):
        raise RuntimeError("cannot add to datapackstore")

class datapack(basepack.basepack):
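    """Read-only access to a single datapack/dataidx file pair.

    A minimal read sketch (illustrative, not part of the original source;
    assumes ``packpath`` names a pack file without its suffix):

        pack = datapack(packpath)
        for filename, node in pack:
            chain = pack.getdeltachain(filename, node)
            meta = pack.getmeta(filename, node)
    """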
       
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    # Format is <node><delta offset><pack data offset><pack data size>
    # See the mutabledatapack doccomment for more details.
    INDEXFORMAT = '!20siQQ'
    INDEXENTRYLENGTH = 40
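    # '!20siQQ' unpacks to (node, deltabase index offset, pack data offset,
    # pack data size): a 20 byte node, a 4 byte signed int and two 8 byte
    # unsigned ints, i.e. 40 bytes per entry (INDEXENTRYLENGTH). The deltabase
    # index offset is FULLTEXTINDEXMARK for a fulltext entry and
    # NOBASEINDEXMARK when the delta base is not present in this pack.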
       
    SUPPORTED_VERSIONS = [0, 1]

    def getmissing(self, keys):
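        """Return the subset of (name, node) pairs from ``keys`` that are not
        present in this pack."""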
       
        missing = []
        for name, node in keys:
            value = self._find(node)
            if not value:
                missing.append((name, node))

        return missing

    def get(self, name, node):
        raise RuntimeError("must use getdeltachain with datapack (%s:%s)"
                           % (name, hex(node)))

    def getmeta(self, name, node):
        value = self._find(node)
        if value is None:
            raise KeyError((name, hex(node)))

        # version 0 does not support metadata
        if self.VERSION == 0:
            return {}

        node, deltabaseoffset, offset, size = value
        rawentry = self._data[offset:offset + size]

        # see docstring of mutabledatapack for the format
        offset = 0
        offset += struct.unpack_from('!H', rawentry, offset)[0] + 2 # filename
        offset += 40 # node, deltabase node
        offset += struct.unpack_from('!Q', rawentry, offset)[0] + 8 # delta

        metalen = struct.unpack_from('!I', rawentry, offset)[0]
        offset += 4

        meta = shallowutil.parsepackmeta(rawentry[offset:offset + metalen])

        return meta

    def getdelta(self, name, node):
        value = self._find(node)
        if value is None:
            raise KeyError((name, hex(node)))

        node, deltabaseoffset, offset, size = value
        entry = self._readentry(offset, size, getmeta=True)
        filename, node, deltabasenode, delta, meta = entry

        # If we've read a lot of data from the mmap, free some memory.
        self.freememory()

        return delta, filename, deltabasenode, meta

    def getdeltachain(self, name, node):
        value = self._find(node)
        if value is None:
            raise KeyError((name, hex(node)))

        params = self.params

        # Precompute chains
        chain = [value]
        deltabaseoffset = value[1]
        entrylen = self.INDEXENTRYLENGTH
        while (deltabaseoffset != FULLTEXTINDEXMARK
               and deltabaseoffset != NOBASEINDEXMARK):
            loc = params.indexstart + deltabaseoffset
            value = struct.unpack(self.INDEXFORMAT,
                                  self._index[loc:loc + entrylen])
            deltabaseoffset = value[1]
            chain.append(value)

        # Read chain data
        deltachain = []
        for node, deltabaseoffset, offset, size in chain:
            filename, node, deltabasenode, delta = self._readentry(offset, size)
            deltachain.append((filename, node, filename, deltabasenode, delta))

        # If we've read a lot of data from the mmap, free some memory.
        self.freememory()

        return deltachain

    def _readentry(self, offset, size, getmeta=False):
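        """Parse the pack entry of on-disk length ``size`` stored at byte
        ``offset``. Returns (filename, node, deltabasenode, delta), plus the
        parsed metadata dict as a fifth element when ``getmeta`` is True."""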
       
        rawentry = self._data[offset:offset + size]
        self._pagedin += len(rawentry)

        # <2 byte len> + <filename>
        lengthsize = 2
        filenamelen = struct.unpack('!H', rawentry[:2])[0]
        filename = rawentry[lengthsize:lengthsize + filenamelen]

        # <20 byte node> + <20 byte deltabase>
        nodestart = lengthsize + filenamelen
        deltabasestart = nodestart + NODELENGTH
        node = rawentry[nodestart:deltabasestart]
        deltabasenode = rawentry[deltabasestart:deltabasestart + NODELENGTH]

        # <8 byte len> + <delta>
        deltastart = deltabasestart + NODELENGTH
        rawdeltalen = rawentry[deltastart:deltastart + 8]
        deltalen = struct.unpack('!Q', rawdeltalen)[0]

        delta = rawentry[deltastart + 8:deltastart + 8 + deltalen]
        delta = lz4wrapper.lz4decompress(delta)

        if getmeta:
            if self.VERSION == 0:
                meta = {}
            else:
                metastart = deltastart + 8 + deltalen
                metalen = struct.unpack_from('!I', rawentry, metastart)[0]

                rawmeta = rawentry[metastart + 4:metastart + 4 + metalen]
                meta = shallowutil.parsepackmeta(rawmeta)
            return filename, node, deltabasenode, delta, meta
        else:
            return filename, node, deltabasenode, delta

    def add(self, name, node, data):
        raise RuntimeError("cannot add to datapack (%s:%s)" % (name, node))

    def _find(self, node):
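        """Return the unpacked index entry for ``node``, or None if the node
        is not in this pack.

        The fanout table narrows the search to a slice of the index, which is
        then bisected over its fixed-width entries.
        """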
       
        params = self.params
        fanoutkey = struct.unpack(params.fanoutstruct,
                                  node[:params.fanoutprefix])[0]
        fanout = self._fanouttable

        start = fanout[fanoutkey] + params.indexstart
        indexend = self._indexend

        # Scan forward to find the first non-same entry, which is the upper
        # bound.
        for i in pycompat.xrange(fanoutkey + 1, params.fanoutcount):
            end = fanout[i] + params.indexstart
            if end != start:
                break
        else:
            end = indexend

        # Bisect between start and end to find node
        index = self._index
        startnode = index[start:start + NODELENGTH]
        endnode = index[end:end + NODELENGTH]
        entrylen = self.INDEXENTRYLENGTH
        if startnode == node:
            entry = index[start:start + entrylen]
        elif endnode == node:
            entry = index[end:end + entrylen]
        else:
            while start < end - entrylen:
                mid = start + (end - start) // 2
                mid = mid - ((mid - params.indexstart) % entrylen)
                midnode = index[mid:mid + NODELENGTH]
                if midnode == node:
                    entry = index[mid:mid + entrylen]
                    break
                if node > midnode:
                    start = mid
                    startnode = midnode
                elif node < midnode:
                    end = mid
                    endnode = midnode
            else:
                return None

        return struct.unpack(self.INDEXFORMAT, entry)

    def markledger(self, ledger, options=None):
        for filename, node in self:
            ledger.markdataentry(self, filename, node)

    def cleanup(self, ledger):
        entries = ledger.sources.get(self, [])
        allkeys = set(self)
        repackedkeys = set((e.filename, e.node) for e in entries if
                           e.datarepacked or e.gced)

        if len(allkeys - repackedkeys) == 0:
            if self.path not in ledger.created:
                util.unlinkpath(self.indexpath, ignoremissing=True)
                util.unlinkpath(self.packpath, ignoremissing=True)

    def __iter__(self):
        for f, n, deltabase, deltalen in self.iterentries():
            yield f, n

    def iterentries(self):
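        """Scan the pack sequentially, yielding (filename, node, deltabase,
        uncompressed delta length) for every revision it contains."""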
       
        # Start at 1 to skip the header
        offset = 1
        data = self._data
        while offset < self.datasize:
            oldoffset = offset

            # <2 byte len> + <filename>
            filenamelen = struct.unpack('!H', data[offset:offset + 2])[0]
            offset += 2
            filename = data[offset:offset + filenamelen]
            offset += filenamelen

            # <20 byte node>
            node = data[offset:offset + constants.NODESIZE]
            offset += constants.NODESIZE
            # <20 byte deltabase>
            deltabase = data[offset:offset + constants.NODESIZE]
            offset += constants.NODESIZE

            # <8 byte len> + <delta>
            rawdeltalen = data[offset:offset + 8]
            deltalen = struct.unpack('!Q', rawdeltalen)[0]
            offset += 8

            # it has to be at least long enough for the lz4 header.
            assert deltalen >= 4

            # python-lz4 stores the length of the uncompressed field as a
            # little-endian 32-bit integer at the start of the data.
            uncompressedlen = struct.unpack('<I', data[offset:offset + 4])[0]
            offset += deltalen

            if self.VERSION == 1:
                # <4 byte len> + <metadata-list>
                metalen = struct.unpack_from('!I', data, offset)[0]
                offset += 4 + metalen

            yield (filename, node, deltabase, uncompressedlen)

            # If we've read a lot of data from the mmap, free some memory.
            self._pagedin += offset - oldoffset
            if self.freememory():
                data = self._data

class mutabledatapack(basepack.mutablebasepack):
    """A class for constructing and serializing a datapack file and index.

    A datapack is a pair of files that contain the revision contents for various
    file revisions in Mercurial. It contains only revision contents (like file
    contents), not any history information.

    It consists of two files, with the following format. All bytes are in
    network byte order (big endian).

    .datapack
        The pack itself is a series of revision deltas with some basic header
        information on each. A revision delta may be a fulltext, represented by
        a deltabasenode equal to the nullid.

        datapack = <version: 1 byte>
                   [<revision>,...]
        revision = <filename len: 2 byte unsigned int>
                   <filename>
                   <node: 20 byte>
                   <deltabasenode: 20 byte>
                   <delta len: 8 byte unsigned int>
                   <delta>
                   <metadata-list len: 4 byte unsigned int> [1]
                   <metadata-list>                          [1]
        metadata-list = [<metadata-item>, ...]
        metadata-item = <metadata-key: 1 byte>
                        <metadata-value len: 2 byte unsigned>
                        <metadata-value>

        metadata-key could be METAKEYFLAG or METAKEYSIZE or, in the future,
        another single byte value.

    .dataidx
        The index file consists of two parts, the fanout and the index.

        The index is a list of index entries, sorted by node (one per revision
        in the pack). Each entry has:

        - node (The 20 byte node of the entry; i.e. the commit hash, file node
                hash, etc)
        - deltabase index offset (The location in the index of the deltabase for
                                  this entry. The deltabase is the next delta in
                                  the chain, with the chain eventually
                                  terminating in a full-text, represented by a
                                  deltabase offset of -1. This lets us compute
                                  delta chains from the index, then do
                                  sequential reads from the pack if the
                                  revisions are nearby on disk.)
        - pack entry offset (The location of this entry in the datapack)
        - pack content size (The on-disk length of this entry's pack data)

        The fanout is a quick lookup table to reduce the number of steps for
        bisecting the index. It is a series of 4 byte pointers to positions
        within the index. It has 2^16 entries, which corresponds to hash
        prefixes [0000, 0001,..., FFFE, FFFF]. Example: the pointer in slot
        4F0A points to the index position of the first revision whose node
        starts with 4F0A. This saves log(2^16)=16 bisect steps.

        dataidx = <fanouttable>
                  <index>
        fanouttable = [<index offset: 4 byte unsigned int>,...] (2^16 entries)
        index = [<index entry>,...]
        indexentry = <node: 20 byte>
                     <deltabase location: 4 byte signed int>
                     <pack entry offset: 8 byte unsigned int>
                     <pack entry size: 8 byte unsigned int>

    [1]: new in version 1.
    """
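    # Illustrative only (not part of the original source): a single index
    # entry can be built and decoded with INDEXFORMAT ('!20siQQ'), e.g.
    #
    #     entry = struct.pack('!20siQQ', node, FULLTEXTINDEXMARK, offset, size)
    #     node, deltabaseloc, offset, size = struct.unpack('!20siQQ', entry)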
       
    INDEXSUFFIX = INDEXSUFFIX
    PACKSUFFIX = PACKSUFFIX

    # v[01] index format: <node><delta offset><pack data offset><pack data size>
    INDEXFORMAT = datapack.INDEXFORMAT
    INDEXENTRYLENGTH = datapack.INDEXENTRYLENGTH

    # v1 has metadata support
    SUPPORTED_VERSIONS = [0, 1]

    def add(self, name, node, deltabasenode, delta, metadata=None):
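        """Write a revision to the pack being built.

        ``delta`` is the raw delta text (it is lz4-compressed before being
        written); a fulltext is indicated by a ``deltabasenode`` of nullid.
        ``metadata`` is an optional dict; only version 1 packs can store it.
        """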
       
        # metadata is a dict, ex. {METAKEYFLAG: flag}
        if len(name) > 2**16:
            raise RuntimeError(_("name too long %s") % name)
        if len(node) != 20:
            raise RuntimeError(_("node should be 20 bytes %s") % node)

        if node in self.entries:
            # The revision has already been added
            return

        # TODO: allow configurable compression
        delta = lz4wrapper.lz4compress(delta)

        rawdata = ''.join((
            struct.pack('!H', len(name)), # unsigned 2 byte int
            name,
            node,
            deltabasenode,
            struct.pack('!Q', len(delta)), # unsigned 8 byte int
            delta,
        ))

        if self.VERSION == 1:
            # v1 supports metadata
            rawmeta = shallowutil.buildpackmeta(metadata)
            rawdata += struct.pack('!I', len(rawmeta)) # unsigned 4 byte
            rawdata += rawmeta
        else:
            # v0 cannot store metadata, raise if metadata contains flag
            if metadata and metadata.get(constants.METAKEYFLAG, 0) != 0:
                raise error.ProgrammingError('v0 pack cannot store flags')

        offset = self.packfp.tell()

        size = len(rawdata)

        self.entries[node] = (deltabasenode, offset, size)

        self.writeraw(rawdata)

    def createindex(self, nodelocations, indexoffset):
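        """Serialize the index for the entries written so far, sorted by node.

        ``nodelocations`` maps a node to the byte offset of its entry within
        the index, so each delta base is recorded as a direct pointer to its
        index entry (or FULLTEXTINDEXMARK / NOBASEINDEXMARK).
        """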
       
        entries = sorted((n, db, o, s) for n, (db, o, s)
                         in self.entries.iteritems())

        rawindex = ''
        fmt = self.INDEXFORMAT
        for node, deltabase, offset, size in entries:
            if deltabase == nullid:
                deltabaselocation = FULLTEXTINDEXMARK
            else:
                # Instead of storing the deltabase node in the index, let's
                # store a pointer directly to the index entry for the deltabase.
                deltabaselocation = nodelocations.get(deltabase,
                                                      NOBASEINDEXMARK)

            entry = struct.pack(fmt, node, deltabaselocation, offset, size)
            rawindex += entry

        return rawindex