hgext/fastannotate/revmap.py
author Manuel Jacob <me@manueljacob.de>
Sun, 29 May 2022 15:17:27 +0200
changeset 49284 d44e3c45f0e4
parent 48946 642e31cb55f0
permissions -rw-r--r--
py3: replace `pycompat.xrange` by `range`

# Copyright 2016-present Facebook. All Rights Reserved.
#
# revmap: trivial hg hash - linelog rev bidirectional map
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.


import bisect
import io
import os
import struct

from mercurial.node import hex
from mercurial.pycompat import open
from mercurial import (
    error as hgerror,
)
from . import error

# the revmap file format is straightforward:
#
#    8 bytes: header
#    1 byte : flag for linelog revision 1
#    ? bytes: (optional) '\0'-terminated path string
#             only exists if (flag & renameflag) != 0
#   20 bytes: hg hash for linelog revision 1
#    1 byte : flag for linelog revision 2
#    ? bytes: (optional) '\0'-terminated path string
#   20 bytes: hg hash for linelog revision 2
#   ....
#
# the implementation is kinda stupid: __init__ loads the whole revmap.
# no laziness. benchmark shows loading 10000 revisions is about 0.015
# seconds, which looks enough for our use-case. if this implementation
# becomes a bottleneck, we can change it to lazily read the file
# from the end.

# whether the changeset is in the side branch. i.e. not in the linear main
# branch but only got referenced by lines in merge changesets.
sidebranchflag = 1

# whether the changeset changes the file path (ie. is a rename)
renameflag = 2

# len(mercurial.node.nullid)
_hshlen = 20


class revmap:
    """trivial hg bin hash - linelog rev bidirectional map

    also stores a flag (uint8) for each revision, and track renames.
    """

    HEADER = b'REVMAP1\0'

    def __init__(self, path=None):
        """create or load the revmap, optionally associate to a file

        if path is None, the revmap is entirely in-memory. the caller is
        responsible for locking. concurrent writes to a same file is unsafe.
        the caller needs to make sure one file is associated to at most one
        revmap object at a time."""
        self.path = path
        self._rev2hsh = [None]
        self._rev2flag = [None]
        self._hsh2rev = {}
        # since rename does not happen frequently, do not store path for every
        # revision. self._renamerevs can be used for bisecting.
        self._renamerevs = [0]
        self._renamepaths = [b'']
        self._lastmaxrev = -1
        if path:
            if os.path.exists(path):
                self._load()
            else:
                # write the header so "append" can do incremental updates
                self.flush()

    def copyfrom(self, rhs):
        """copy the map data from another revmap. do not affect self.path"""
        self._rev2hsh = rhs._rev2hsh[:]
        self._rev2flag = rhs._rev2flag[:]
        self._hsh2rev = rhs._hsh2rev.copy()
        self._renamerevs = rhs._renamerevs[:]
        self._renamepaths = rhs._renamepaths[:]
        self._lastmaxrev = -1

    @property
    def maxrev(self):
        """return max linelog revision number"""
        return len(self._rev2hsh) - 1

    def append(self, hsh, sidebranch=False, path=None, flush=False):
        """add a binary hg hash and return the mapped linelog revision.
        if flush is True, incrementally update the file.
        """
        if hsh in self._hsh2rev:
            raise error.CorruptedFileError(
                b'%r is in revmap already' % hex(hsh)
            )
        if len(hsh) != _hshlen:
            raise hgerror.ProgrammingError(
                b'hsh must be %d-char long' % _hshlen
            )
        idx = len(self._rev2hsh)
        flag = 0
        if sidebranch:
            flag |= sidebranchflag
        if path is not None and path != self._renamepaths[-1]:
            flag |= renameflag
            self._renamerevs.append(idx)
            self._renamepaths.append(path)
        self._rev2hsh.append(hsh)
        self._rev2flag.append(flag)
        self._hsh2rev[hsh] = idx
        if flush:
            self.flush()
        return idx

    def rev2hsh(self, rev):
        """convert linelog revision to hg hash. return None if not found."""
        if rev > self.maxrev or rev < 0:
            return None
        return self._rev2hsh[rev]

    def rev2flag(self, rev):
        """get the flag (uint8) for a given linelog revision.
        return None if revision does not exist.
        """
        if rev > self.maxrev or rev < 0:
            return None
        return self._rev2flag[rev]

    def rev2path(self, rev):
        """get the path for a given linelog revision.
        return None if revision does not exist.
        """
        if rev > self.maxrev or rev < 0:
            return None
        idx = bisect.bisect_right(self._renamerevs, rev) - 1
        return self._renamepaths[idx]

    def hsh2rev(self, hsh):
        """convert hg hash to linelog revision. return None if not found."""
        return self._hsh2rev.get(hsh)

    def clear(self, flush=False):
        """make the map empty. if flush is True, write to disk"""
        # rev 0 is reserved, real rev starts from 1
        self._rev2hsh = [None]
        self._rev2flag = [None]
        self._hsh2rev = {}
        self._rev2path = [b'']
        self._lastmaxrev = -1
        if flush:
            self.flush()

    def flush(self):
        """write the state down to the file"""
        if not self.path:
            return
        if self._lastmaxrev == -1:  # write the entire file
            with open(self.path, b'wb') as f:
                f.write(self.HEADER)
                for i in range(1, len(self._rev2hsh)):
                    self._writerev(i, f)
        else:  # append incrementally
            with open(self.path, b'ab') as f:
                for i in range(self._lastmaxrev + 1, len(self._rev2hsh)):
                    self._writerev(i, f)
        self._lastmaxrev = self.maxrev

    def _load(self):
        """load state from file"""
        if not self.path:
            return
        # use local variables in a loop. CPython uses LOAD_FAST for them,
        # which is faster than both LOAD_CONST and LOAD_GLOBAL.
        flaglen = 1
        hshlen = _hshlen
        with open(self.path, b'rb') as f:
            if f.read(len(self.HEADER)) != self.HEADER:
                raise error.CorruptedFileError()
            self.clear(flush=False)
            while True:
                buf = f.read(flaglen)
                if not buf:
                    break
                flag = ord(buf)
                rev = len(self._rev2hsh)
                if flag & renameflag:
                    path = self._readcstr(f)
                    self._renamerevs.append(rev)
                    self._renamepaths.append(path)
                hsh = f.read(hshlen)
                if len(hsh) != hshlen:
                    raise error.CorruptedFileError()
                self._hsh2rev[hsh] = rev
                self._rev2flag.append(flag)
                self._rev2hsh.append(hsh)
        self._lastmaxrev = self.maxrev

    def _writerev(self, rev, f):
        """append a revision data to file"""
        flag = self._rev2flag[rev]
        hsh = self._rev2hsh[rev]
        f.write(struct.pack(b'B', flag))
        if flag & renameflag:
            path = self.rev2path(rev)
            if path is None:
                raise error.CorruptedFileError(b'cannot find path for %s' % rev)
            f.write(path + b'\0')
        f.write(hsh)

    @staticmethod
    def _readcstr(f):
        """read a C-language-like '\0'-terminated string"""
        buf = b''
        while True:
            ch = f.read(1)
            if not ch:  # unexpected eof
                raise error.CorruptedFileError()
            if ch == b'\0':
                break
            buf += ch
        return buf

    def __contains__(self, f):
        """(fctx or (node, path)) -> bool.
        test if (node, path) is in the map, and is not in a side branch.
        f can be either a tuple of (node, path), or a fctx.
        """
        if isinstance(f, tuple):  # f: (node, path)
            hsh, path = f
        else:  # f: fctx
            hsh, path = f.node(), f.path()
        rev = self.hsh2rev(hsh)
        if rev is None:
            return False
        if path is not None and path != self.rev2path(rev):
            return False
        return (self.rev2flag(rev) & sidebranchflag) == 0


def getlastnode(path):
    """return the last hash in a revmap, without loading its full content.
    this is equivalent to `m = revmap(path); m.rev2hsh(m.maxrev)`, but faster.
    """
    hsh = None
    try:
        with open(path, b'rb') as f:
            f.seek(-_hshlen, io.SEEK_END)
            if f.tell() > len(revmap.HEADER):
                hsh = f.read(_hshlen)
    except IOError:
        pass
    return hsh