hgext/largefiles/lfcommands.py
author Na'Tosha Bard <natosha@unity3d.com>
Fri, 13 Jul 2012 14:49:16 +0200
changeset 17155 88ff28bcd980
parent 17127 9e1616307c4c
child 17299 e51d4aedace9
permissions -rw-r--r--
largefiles: optimize status by synchronizing lfdirstate with the largefile on update This speeds up status on a largefiles repo by synchronizing the largefiles dirstate to the largefile's mtime upon update, preventing the files from coming back as "unsure" later, requiring a check of the SHA1 sum.

# Copyright 2009-2010 Gregory P. Ward
# Copyright 2009-2010 Intelerad Medical Systems Incorporated
# Copyright 2010-2011 Fog Creek Software
# Copyright 2010-2011 Unity Technologies
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

'''High-level command function for lfconvert, plus the cmdtable.'''

import os
import shutil

from mercurial import util, match as match_, hg, node, context, error, \
    cmdutil, scmutil
from mercurial.i18n import _
from mercurial.lock import release

import lfutil
import basestore

# -- Commands ----------------------------------------------------------

def lfconvert(ui, src, dest, *pats, **opts):
    '''convert a normal repository to a largefiles repository

    Convert repository SOURCE to a new repository DEST, identical to
    SOURCE except that certain files will be converted as largefiles:
    specifically, any file that matches any PATTERN *or* whose size is
    above the minimum size threshold is converted as a largefile. The
    size used to determine whether or not to track a file as a
    largefile is the size of the first version of the file. The
    minimum size can be specified either with --size or in
    configuration as ``largefiles.size``.

    After running this command you will need to make sure that
    largefiles is enabled anywhere you intend to push the new
    repository.

    Use --to-normal to convert largefiles back to normal files; after
    this, the DEST repository can be used without largefiles at all.'''

    if opts['to_normal']:
        tolfile = False
    else:
        tolfile = True
        size = lfutil.getminsize(ui, True, opts.get('size'), default=None)

    if not hg.islocal(src):
        raise util.Abort(_('%s is not a local Mercurial repo') % src)
    if not hg.islocal(dest):
        raise util.Abort(_('%s is not a local Mercurial repo') % dest)

    rsrc = hg.repository(ui, src)
    ui.status(_('initializing destination %s\n') % dest)
    rdst = hg.repository(ui, dest, create=True)

    success = False
    dstwlock = dstlock = None
    try:
        # Lock destination to prevent modification while it is converted to.
        # Don't need to lock src because we are just reading from its history
        # which can't change.
        dstwlock = rdst.wlock()
        dstlock = rdst.lock()

        # Get a list of all changesets in the source.  The easy way to do this
        # is to simply walk the changelog, using changelog.nodesbewteen().
        # Take a look at mercurial/revlog.py:639 for more details.
        # Use a generator instead of a list to decrease memory usage
        ctxs = (rsrc[ctx] for ctx in rsrc.changelog.nodesbetween(None,
            rsrc.heads())[0])
        revmap = {node.nullid: node.nullid}
        if tolfile:
            lfiles = set()
            normalfiles = set()
            if not pats:
                pats = ui.configlist(lfutil.longname, 'patterns', default=[])
            if pats:
                matcher = match_.match(rsrc.root, '', list(pats))
            else:
                matcher = None

            lfiletohash = {}
            for ctx in ctxs:
                ui.progress(_('converting revisions'), ctx.rev(),
                    unit=_('revision'), total=rsrc['tip'].rev())
                _lfconvert_addchangeset(rsrc, rdst, ctx, revmap,
                    lfiles, normalfiles, matcher, size, lfiletohash)
            ui.progress(_('converting revisions'), None)

            if os.path.exists(rdst.wjoin(lfutil.shortname)):
                shutil.rmtree(rdst.wjoin(lfutil.shortname))

            for f in lfiletohash.keys():
                if os.path.isfile(rdst.wjoin(f)):
                    os.unlink(rdst.wjoin(f))
                try:
                    os.removedirs(os.path.dirname(rdst.wjoin(f)))
                except OSError:
                    pass

            # If there were any files converted to largefiles, add largefiles
            # to the destination repository's requirements.
            if lfiles:
                rdst.requirements.add('largefiles')
                rdst._writerequirements()
        else:
            for ctx in ctxs:
                ui.progress(_('converting revisions'), ctx.rev(),
                    unit=_('revision'), total=rsrc['tip'].rev())
                _addchangeset(ui, rsrc, rdst, ctx, revmap)

            ui.progress(_('converting revisions'), None)
        success = True
    finally:
        rdst.dirstate.clear()
        release(dstlock, dstwlock)
        if not success:
            # we failed, remove the new directory
            shutil.rmtree(rdst.root)

def _addchangeset(ui, rsrc, rdst, ctx, revmap):
 # Convert src parents to dst parents
    parents = _convertparents(ctx, revmap)

    # Generate list of changed files
    files = _getchangedfiles(ctx, parents)

    def getfilectx(repo, memctx, f):
        if lfutil.standin(f) in files:
            # if the file isn't in the manifest then it was removed
            # or renamed, raise IOError to indicate this
            try:
                fctx = ctx.filectx(lfutil.standin(f))
            except error.LookupError:
                raise IOError
            renamed = fctx.renamed()
            if renamed:
                renamed = lfutil.splitstandin(renamed[0])

            hash = fctx.data().strip()
            path = lfutil.findfile(rsrc, hash)
            ### TODO: What if the file is not cached?
            data = ''
            fd = None
            try:
                fd = open(path, 'rb')
                data = fd.read()
            finally:
                if fd:
                    fd.close()
            return context.memfilectx(f, data, 'l' in fctx.flags(),
                                      'x' in fctx.flags(), renamed)
        else:
            return _getnormalcontext(repo.ui, ctx, f, revmap)

    dstfiles = []
    for file in files:
        if lfutil.isstandin(file):
            dstfiles.append(lfutil.splitstandin(file))
        else:
            dstfiles.append(file)
    # Commit
    _commitcontext(rdst, parents, ctx, dstfiles, getfilectx, revmap)

def _lfconvert_addchangeset(rsrc, rdst, ctx, revmap, lfiles, normalfiles,
        matcher, size, lfiletohash):
    # Convert src parents to dst parents
    parents = _convertparents(ctx, revmap)

    # Generate list of changed files
    files = _getchangedfiles(ctx, parents)

    dstfiles = []
    for f in files:
        if f not in lfiles and f not in normalfiles:
            islfile = _islfile(f, ctx, matcher, size)
            # If this file was renamed or copied then copy
            # the lfileness of its predecessor
            if f in ctx.manifest():
                fctx = ctx.filectx(f)
                renamed = fctx.renamed()
                renamedlfile = renamed and renamed[0] in lfiles
                islfile |= renamedlfile
                if 'l' in fctx.flags():
                    if renamedlfile:
                        raise util.Abort(
                            _('renamed/copied largefile %s becomes symlink')
                            % f)
                    islfile = False
            if islfile:
                lfiles.add(f)
            else:
                normalfiles.add(f)

        if f in lfiles:
            dstfiles.append(lfutil.standin(f))
            # largefile in manifest if it has not been removed/renamed
            if f in ctx.manifest():
                fctx = ctx.filectx(f)
                if 'l' in fctx.flags():
                    renamed = fctx.renamed()
                    if renamed and renamed[0] in lfiles:
                        raise util.Abort(_('largefile %s becomes symlink') % f)

                # largefile was modified, update standins
                fullpath = rdst.wjoin(f)
                util.makedirs(os.path.dirname(fullpath))
                m = util.sha1('')
                m.update(ctx[f].data())
                hash = m.hexdigest()
                if f not in lfiletohash or lfiletohash[f] != hash:
                    try:
                        fd = open(fullpath, 'wb')
                        fd.write(ctx[f].data())
                    finally:
                        if fd:
                            fd.close()
                    executable = 'x' in ctx[f].flags()
                    os.chmod(fullpath, lfutil.getmode(executable))
                    lfutil.writestandin(rdst, lfutil.standin(f), hash,
                        executable)
                    lfiletohash[f] = hash
        else:
            # normal file
            dstfiles.append(f)

    def getfilectx(repo, memctx, f):
        if lfutil.isstandin(f):
            # if the file isn't in the manifest then it was removed
            # or renamed, raise IOError to indicate this
            srcfname = lfutil.splitstandin(f)
            try:
                fctx = ctx.filectx(srcfname)
            except error.LookupError:
                raise IOError
            renamed = fctx.renamed()
            if renamed:
                # standin is always a largefile because largefile-ness
                # doesn't change after rename or copy
                renamed = lfutil.standin(renamed[0])

            return context.memfilectx(f, lfiletohash[srcfname] + '\n', 'l' in
                fctx.flags(), 'x' in fctx.flags(), renamed)
        else:
            return _getnormalcontext(repo.ui, ctx, f, revmap)

    # Commit
    _commitcontext(rdst, parents, ctx, dstfiles, getfilectx, revmap)

def _commitcontext(rdst, parents, ctx, dstfiles, getfilectx, revmap):
    mctx = context.memctx(rdst, parents, ctx.description(), dstfiles,
                          getfilectx, ctx.user(), ctx.date(), ctx.extra())
    ret = rdst.commitctx(mctx)
    rdst.setparents(ret)
    revmap[ctx.node()] = rdst.changelog.tip()

# Generate list of changed files
def _getchangedfiles(ctx, parents):
    files = set(ctx.files())
    if node.nullid not in parents:
        mc = ctx.manifest()
        mp1 = ctx.parents()[0].manifest()
        mp2 = ctx.parents()[1].manifest()
        files |= (set(mp1) | set(mp2)) - set(mc)
        for f in mc:
            if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
                files.add(f)
    return files

# Convert src parents to dst parents
def _convertparents(ctx, revmap):
    parents = []
    for p in ctx.parents():
        parents.append(revmap[p.node()])
    while len(parents) < 2:
        parents.append(node.nullid)
    return parents

# Get memfilectx for a normal file
def _getnormalcontext(ui, ctx, f, revmap):
    try:
        fctx = ctx.filectx(f)
    except error.LookupError:
        raise IOError
    renamed = fctx.renamed()
    if renamed:
        renamed = renamed[0]

    data = fctx.data()
    if f == '.hgtags':
        data = _converttags (ui, revmap, data)
    return context.memfilectx(f, data, 'l' in fctx.flags(),
                              'x' in fctx.flags(), renamed)

# Remap tag data using a revision map
def _converttags(ui, revmap, data):
    newdata = []
    for line in data.splitlines():
        try:
            id, name = line.split(' ', 1)
        except ValueError:
            ui.warn(_('skipping incorrectly formatted tag %s\n'
                % line))
            continue
        try:
            newid = node.bin(id)
        except TypeError:
            ui.warn(_('skipping incorrectly formatted id %s\n'
                % id))
            continue
        try:
            newdata.append('%s %s\n' % (node.hex(revmap[newid]),
                name))
        except KeyError:
            ui.warn(_('no mapping for id %s\n') % id)
            continue
    return ''.join(newdata)

def _islfile(file, ctx, matcher, size):
    '''Return true if file should be considered a largefile, i.e.
    matcher matches it or it is larger than size.'''
    # never store special .hg* files as largefiles
    if file == '.hgtags' or file == '.hgignore' or file == '.hgsigs':
        return False
    if matcher and matcher(file):
        return True
    try:
        return ctx.filectx(file).size() >= size * 1024 * 1024
    except error.LookupError:
        return False

def uploadlfiles(ui, rsrc, rdst, files):
    '''upload largefiles to the central store'''

    if not files:
        return

    store = basestore._openstore(rsrc, rdst, put=True)

    at = 0
    ui.debug("sending statlfile command for %d largefiles\n" % len(files))
    retval = store.exists(files)
    files = filter(lambda h: not retval[h], files)
    ui.debug("%d largefiles need to be uploaded\n" % len(files))

    for hash in files:
        ui.progress(_('uploading largefiles'), at, unit='largefile',
                    total=len(files))
        source = lfutil.findfile(rsrc, hash)
        if not source:
            raise util.Abort(_('largefile %s missing from store'
                               ' (needs to be uploaded)') % hash)
        # XXX check for errors here
        store.put(source, hash)
        at += 1
    ui.progress(_('uploading largefiles'), None)

def verifylfiles(ui, repo, all=False, contents=False):
    '''Verify that every big file revision in the current changeset
    exists in the central store.  With --contents, also verify that
    the contents of each big file revision are correct (SHA-1 hash
    matches the revision ID).  With --all, check every changeset in
    this repository.'''
    if all:
        # Pass a list to the function rather than an iterator because we know a
        # list will work.
        revs = range(len(repo))
    else:
        revs = ['.']

    store = basestore._openstore(repo)
    return store.verify(revs, contents=contents)

def cachelfiles(ui, repo, node, filelist=None):
    '''cachelfiles ensures that all largefiles needed by the specified revision
    are present in the repository's largefile cache.

    returns a tuple (cached, missing).  cached is the list of files downloaded
    by this operation; missing is the list of files that were needed but could
    not be found.'''
    lfiles = lfutil.listlfiles(repo, node)
    if filelist:
        lfiles = set(lfiles) & set(filelist)
    toget = []

    for lfile in lfiles:
        # If we are mid-merge, then we have to trust the standin that is in the
        # working copy to have the correct hashvalue.  This is because the
        # original hg.merge() already updated the standin as part of the normal
        # merge process -- we just have to udpate the largefile to match.
        if (getattr(repo, "_ismerging", False) and
             os.path.exists(repo.wjoin(lfutil.standin(lfile)))):
            expectedhash = lfutil.readstandin(repo, lfile)
        else:
            expectedhash = repo[node][lfutil.standin(lfile)].data().strip()

        # if it exists and its hash matches, it might have been locally
        # modified before updating and the user chose 'local'.  in this case,
        # it will not be in any store, so don't look for it.
        if ((not os.path.exists(repo.wjoin(lfile)) or
             expectedhash != lfutil.hashfile(repo.wjoin(lfile))) and
            not lfutil.findfile(repo, expectedhash)):
            toget.append((lfile, expectedhash))

    if toget:
        store = basestore._openstore(repo)
        ret = store.get(toget)
        return ret

    return ([], [])

def downloadlfiles(ui, repo, rev=None):
    matchfn = scmutil.match(repo[None],
                            [repo.wjoin(lfutil.shortname)], {})
    def prepare(ctx, fns):
        pass
    totalsuccess = 0
    totalmissing = 0
    for ctx in cmdutil.walkchangerevs(repo, matchfn, {'rev' : rev},
                                      prepare):
        success, missing = cachelfiles(ui, repo, ctx.node())
        totalsuccess += len(success)
        totalmissing += len(missing)
    ui.status(_("%d additional largefiles cached\n") % totalsuccess)
    if totalmissing > 0:
        ui.status(_("%d largefiles failed to download\n") % totalmissing)
    return totalsuccess, totalmissing

def updatelfiles(ui, repo, filelist=None, printmessage=True):
    wlock = repo.wlock()
    try:
        lfdirstate = lfutil.openlfdirstate(ui, repo)
        lfiles = set(lfutil.listlfiles(repo)) | set(lfdirstate)

        if filelist is not None:
            lfiles = [f for f in lfiles if f in filelist]

        printed = False
        if printmessage and lfiles:
            ui.status(_('getting changed largefiles\n'))
            printed = True
            cachelfiles(ui, repo, '.', lfiles)

        updated, removed = 0, 0
        for i in map(lambda f: _updatelfile(repo, lfdirstate, f), lfiles):
            # increment the appropriate counter according to _updatelfile's
            # return value
            updated += i > 0 and i or 0
            removed -= i < 0 and i or 0
            if printmessage and (removed or updated) and not printed:
                ui.status(_('getting changed largefiles\n'))
                printed = True

        lfdirstate.write()
        if printed and printmessage:
            ui.status(_('%d largefiles updated, %d removed\n') % (updated,
                removed))
    finally:
        wlock.release()

def _updatelfile(repo, lfdirstate, lfile):
    '''updates a single largefile and copies the state of its standin from
    the repository's dirstate to its state in the lfdirstate.

    returns 1 if the file was modified, -1 if the file was removed, 0 if the
    file was unchanged, and None if the needed largefile was missing from the
    cache.'''
    ret = 0
    abslfile = repo.wjoin(lfile)
    absstandin = repo.wjoin(lfutil.standin(lfile))
    if os.path.exists(absstandin):
        if os.path.exists(absstandin+'.orig'):
            shutil.copyfile(abslfile, abslfile+'.orig')
        expecthash = lfutil.readstandin(repo, lfile)
        if (expecthash != '' and
            (not os.path.exists(abslfile) or
             expecthash != lfutil.hashfile(abslfile))):
            if not lfutil.copyfromcache(repo, expecthash, lfile):
                # use normallookup() to allocate entry in largefiles dirstate,
                # because lack of it misleads lfilesrepo.status() into
                # recognition that such cache missing files are REMOVED.
                lfdirstate.normallookup(lfile)
                return None # don't try to set the mode
            else:
                # Synchronize largefile dirstate to the last modified time of
                # the file
                lfdirstate.normal(lfile)
            ret = 1
        mode = os.stat(absstandin).st_mode
        if mode != os.stat(abslfile).st_mode:
            os.chmod(abslfile, mode)
            ret = 1
    else:
        # Remove lfiles for which the standin is deleted, unless the
        # lfile is added to the repository again. This happens when a
        # largefile is converted back to a normal file: the standin
        # disappears, but a new (normal) file appears as the lfile.
        if os.path.exists(abslfile) and lfile not in repo[None]:
            util.unlinkpath(abslfile)
            ret = -1
    state = repo.dirstate[lfutil.standin(lfile)]
    if state == 'n':
        # When rebasing, we need to synchronize the standin and the largefile,
        # because otherwise the largefile will get reverted.  But for commit's
        # sake, we have to mark the file as unclean.
        if getattr(repo, "_isrebasing", False):
           lfdirstate.normallookup(lfile)
        else:
            lfdirstate.normal(lfile)
    elif state == 'r':
        lfdirstate.remove(lfile)
    elif state == 'a':
        lfdirstate.add(lfile)
    elif state == '?':
        lfdirstate.drop(lfile)
    return ret

def catlfile(repo, lfile, rev, filename):
    hash = lfutil.readstandin(repo, lfile, rev)
    if not lfutil.inusercache(repo.ui, hash):
        store = basestore._openstore(repo)
        success, missing = store.get([(lfile, hash)])
        if len(success) != 1:
            raise util.Abort(
                _('largefile %s is not in cache and could not be downloaded')
                    % lfile)
    path = lfutil.usercachepath(repo.ui, hash)
    fpout = cmdutil.makefileobj(repo, filename)
    fpin = open(path, "rb")
    fpout.write(fpin.read())
    fpout.close()
    fpin.close()
    return 0

# -- hg commands declarations ------------------------------------------------

cmdtable = {
    'lfconvert': (lfconvert,
                  [('s', 'size', '',
                    _('minimum size (MB) for files to be converted '
                      'as largefiles'),
                    'SIZE'),
                  ('', 'to-normal', False,
                   _('convert from a largefiles repo to a normal repo')),
                  ],
                  _('hg lfconvert SOURCE DEST [FILE ...]')),
    }