lfs: verify lfs object content when transferring to and from the remote store
authorMatt Harbison <matt_harbison@yahoo.com>
Fri, 17 Nov 2017 00:06:45 -0500
changeset 35476 417e8e040102
parent 35475 b0c01a5ee35c
child 35477 bb6a80fc969a
lfs: verify lfs object content when transferring to and from the remote store This avoids inserting corrupt files into the usercache, and local and remote stores. One down side is that the bad file won't be available locally for forensic purposes after a remote download. I'm thinking about adding an 'incoming' directory to the local lfs store to handle the download, and then move it to the 'objects' directory after it passes verification. That would have the additional benefit of not concatenating each transfer chunk in memory until the full file is transferred. Verification isn't needed when the data is passed back through the revlog interface or when the oid was just calculated, but otherwise it is on by default. The additional overhead should be well worth avoiding problems with file based remote stores, or buggy lfs servers. Having two different verify functions is a little sad, but the full data of the blob is mostly passed around in memory, because that's what the revlog interface wants. The upload function, however, chunks up the data. It would be ideal if that was how the content is always handled, but that's probably a huge project. I don't really like printing the long hash, but `hg debugdata` isn't a public interface, and is the only way to get it. The filelog and revision info is nowhere near this area, so recommending `hg verify` is the easiest thing to do.
hgext/lfs/blobstore.py
hgext/lfs/wrapper.py
tests/test-lfs-test-server.t
tests/test-lfs.t
--- a/hgext/lfs/blobstore.py	Mon Dec 04 21:41:04 2017 -0500
+++ b/hgext/lfs/blobstore.py	Fri Nov 17 00:06:45 2017 -0500
@@ -7,6 +7,7 @@
 
 from __future__ import absolute_import
 
+import hashlib
 import json
 import os
 import re
@@ -99,8 +100,11 @@
         self.cachevfs = lfsvfs(usercache)
         self.ui = repo.ui
 
-    def write(self, oid, data):
+    def write(self, oid, data, verify=True):
         """Write blob to local blobstore."""
+        if verify:
+            _verify(oid, data)
+
         with self.vfs(oid, 'wb', atomictemp=True) as fp:
             fp.write(data)
 
@@ -110,14 +114,23 @@
             self.ui.note(_('lfs: adding %s to the usercache\n') % oid)
             lfutil.link(self.vfs.join(oid), self.cachevfs.join(oid))
 
-    def read(self, oid):
+    def read(self, oid, verify=True):
         """Read blob from local blobstore."""
         if not self.vfs.exists(oid):
+            blob = self._read(self.cachevfs, oid, verify)
+            self.ui.note(_('lfs: found %s in the usercache\n') % oid)
             lfutil.link(self.cachevfs.join(oid), self.vfs.join(oid))
-            self.ui.note(_('lfs: found %s in the usercache\n') % oid)
         else:
             self.ui.note(_('lfs: found %s in the local lfs store\n') % oid)
-        return self.vfs.read(oid)
+            blob = self._read(self.vfs, oid, verify)
+        return blob
+
+    def _read(self, vfs, oid, verify):
+        """Read blob (after verifying) from the given store"""
+        blob = vfs.read(oid)
+        if verify:
+            _verify(oid, blob)
+        return blob
 
     def has(self, oid):
         """Returns True if the local blobstore contains the requested blob,
@@ -233,6 +246,8 @@
         request = util.urlreq.request(href)
         if action == 'upload':
             # If uploading blobs, read data from local blobstore.
+            with localstore.vfs(oid) as fp:
+                _verifyfile(oid, fp)
             request.data = filewithprogress(localstore.vfs(oid), None)
             request.get_method = lambda: 'PUT'
 
@@ -253,7 +268,7 @@
 
         if action == 'download':
             # If downloading blobs, store downloaded data to local blobstore
-            localstore.write(oid, response)
+            localstore.write(oid, response, verify=True)
 
     def _batch(self, pointers, localstore, action):
         if action not in ['upload', 'download']:
@@ -324,14 +339,14 @@
 
     def writebatch(self, pointers, fromstore):
         for p in pointers:
-            content = fromstore.read(p.oid())
+            content = fromstore.read(p.oid(), verify=True)
             with self.vfs(p.oid(), 'wb', atomictemp=True) as fp:
                 fp.write(content)
 
     def readbatch(self, pointers, tostore):
         for p in pointers:
             content = self.vfs.read(p.oid())
-            tostore.write(p.oid(), content)
+            tostore.write(p.oid(), content, verify=True)
 
 class _nullremote(object):
     """Null store storing blobs to /dev/null."""
@@ -368,6 +383,24 @@
     None: _promptremote,
 }
 
+def _verify(oid, content):
+    realoid = hashlib.sha256(content).hexdigest()
+    if realoid != oid:
+        raise error.Abort(_('detected corrupt lfs object: %s') % oid,
+                          hint=_('run hg verify'))
+
+def _verifyfile(oid, fp):
+    sha256 = hashlib.sha256()
+    while True:
+        data = fp.read(1024 * 1024)
+        if not data:
+            break
+        sha256.update(data)
+    realoid = sha256.hexdigest()
+    if realoid != oid:
+        raise error.Abort(_('detected corrupt lfs object: %s') % oid,
+                          hint=_('run hg verify'))
+
 def remote(repo):
     """remotestore factory. return a store in _storemap depending on config"""
     defaulturl = ''
--- a/hgext/lfs/wrapper.py	Mon Dec 04 21:41:04 2017 -0500
+++ b/hgext/lfs/wrapper.py	Fri Nov 17 00:06:45 2017 -0500
@@ -54,7 +54,9 @@
     if not store.has(oid):
         p.filename = getattr(self, 'indexfile', None)
         self.opener.lfsremoteblobstore.readbatch([p], store)
-    text = store.read(oid)
+
+    # The caller will validate the content
+    text = store.read(oid, verify=False)
 
     # pack hg filelog metadata
     hgmeta = {}
@@ -76,7 +78,7 @@
 
     # git-lfs only supports sha256
     oid = hashlib.sha256(text).hexdigest()
-    self.opener.lfslocalblobstore.write(oid, text)
+    self.opener.lfslocalblobstore.write(oid, text, verify=False)
 
     # replace contents with metadata
     longoid = 'sha256:%s' % oid
--- a/tests/test-lfs-test-server.t	Mon Dec 04 21:41:04 2017 -0500
+++ b/tests/test-lfs-test-server.t	Fri Nov 17 00:06:45 2017 -0500
@@ -105,16 +105,15 @@
   lfs: found 37a65ab78d5ecda767e8622c248b5dbff1e68b1678ab0e730d5eb8601ec8ad19 in the local lfs store
   3 files updated, 0 files merged, 0 files removed, 0 files unresolved
 
-Test a corrupt file download, but clear the cache first to force a download
-
-XXX: ideally, the validation would occur before polluting the usercache and
-local store, with a clearer error message.
+Test a corrupt file download, but clear the cache first to force a download.
 
   $ rm -rf `hg config lfs.usercache`
   $ cp $TESTTMP/lfs-content/d1/1e/1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 blob
   $ echo 'damage' > $TESTTMP/lfs-content/d1/1e/1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
   $ rm ../repo1/.hg/store/lfs/objects/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
   $ rm ../repo1/*
+
+XXX: suggesting `hg verify` won't help with a corrupt file on the lfs server.
   $ hg --repo ../repo1 update -C tip -v
   resolving manifests
   getting a
@@ -123,18 +122,16 @@
   lfs: found 31cf46fbc4ecd458a0943c5b4881f1f5a6dd36c53d6167d5b69ac45149b38e5b in the local lfs store
   getting c
   lfs: downloading d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 (19 bytes)
-  lfs: adding d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 to the usercache
-  lfs: processed: d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
-  lfs: found d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 in the local lfs store
-  abort: integrity check failed on data/c.i:0!
+  abort: detected corrupt lfs object: d11e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
+  (run hg verify)
   [255]
 
-BUG: the corrupted blob was added to the usercache and local store
+The corrupted blob is not added to the usercache or local store
 
-  $ cat ../repo1/.hg/store/lfs/objects/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 | $TESTDIR/f --sha256
-  sha256=fa82ca222fc9813afad3559637960bf311170cdd80ed35287f4623eb2320a660
-  $ cat `hg config lfs.usercache`/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998 | $TESTDIR/f --sha256
-  sha256=fa82ca222fc9813afad3559637960bf311170cdd80ed35287f4623eb2320a660
+  $ test -f ../repo1/.hg/store/lfs/objects/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
+  [1]
+  $ test -f `hg config lfs.usercache`/d1/1e1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
+  [1]
   $ cp blob $TESTTMP/lfs-content/d1/1e/1a642b60813aee592094109b406089b8dff4cb157157f753418ec7857998
 
 Test a corrupted file upload
@@ -146,7 +143,8 @@
   pushing to ../repo1
   searching for changes
   lfs: uploading e659058e26b07b39d2a9c7145b3f99b41f797b6621c8076600e9cb7ee88291f0 (17 bytes)
-  abort: HTTP error: HTTP Error 500: Internal Server Error (oid=e659058e26b07b39d2a9c7145b3f99b41f797b6621c8076600e9cb7ee88291f0, action=upload)!
+  abort: detected corrupt lfs object: e659058e26b07b39d2a9c7145b3f99b41f797b6621c8076600e9cb7ee88291f0
+  (run hg verify)
   [255]
 
 Check error message when the remote missed a blob:
--- a/tests/test-lfs.t	Mon Dec 04 21:41:04 2017 -0500
+++ b/tests/test-lfs.t	Fri Nov 17 00:06:45 2017 -0500
@@ -704,18 +704,17 @@
   updating to branch default
   resolving manifests
   getting l
-  lfs: adding 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b to the usercache
-  lfs: found 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b in the local lfs store
-  abort: integrity check failed on data/l.i:3!
+  abort: detected corrupt lfs object: 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b
+  (run hg verify)
   [255]
 
-BUG: A corrupted lfs blob either shouldn't be created after a transfer from a
-file://remotestore, or it shouldn't be left behind.
+A corrupted lfs blob is not transferred from a file://remotestore to the
+usercache or local store.
 
-  $ cat emptycache/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256
-  sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff
-  $ cat fromcorrupt2/.hg/store/lfs/objects/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256
-  sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff
+  $ test -f emptycache/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b
+  [1]
+  $ test -f fromcorrupt2/.hg/store/lfs/objects/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b
+  [1]
 
   $ hg -R fromcorrupt2 verify
   checking changesets
@@ -723,14 +722,13 @@
   crosschecking files in changesets and manifests
   checking files
    l@1: unpacking 46a2f24864bc: integrity check failed on data/l.i:0
-   l@4: unpacking 6f1ff1f39c11: integrity check failed on data/l.i:3
    large@0: unpacking 2c531e0992ff: integrity check failed on data/large.i:0
   4 files, 5 changesets, 10 total revisions
-  3 integrity errors encountered!
+  2 integrity errors encountered!
   (first damaged changeset appears to be 0)
   [1]
 
-BUG: push will happily send corrupt files upstream.  (The alternate dummy remote
+Corrupt local files are not sent upstream.  (The alternate dummy remote
 avoids the corrupt lfs object in the original remote.)
 
   $ mkdir $TESTTMP/dummy-remote2
@@ -742,18 +740,9 @@
   lfs: found 89b6070915a3d573ff3599d1cda305bc5e38549b15c4847ab034169da66e1ca8 in the local lfs store
   lfs: found b1a6ea88da0017a0e77db139a54618986e9a2489bee24af9fe596de9daac498c in the local lfs store
   lfs: found 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e in the local lfs store
-  5 changesets found
-  uncompressed size of bundle content:
-       997 (changelog)
-      1032 (manifests)
-       841  l
-       272  large
-       788  s
-       139  small
-  adding changesets
-  adding manifests
-  adding file changes
-  added 5 changesets with 10 changes to 4 files
+  abort: detected corrupt lfs object: 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e
+  (run hg verify)
+  [255]
 
   $ hg -R fromcorrupt2 --config lfs.url=file:///$TESTTMP/dummy-remote2 verify -v
   repository uses revlog format 1
@@ -764,27 +753,21 @@
   lfs: found 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e in the local lfs store
    l@1: unpacking 46a2f24864bc: integrity check failed on data/l.i:0
   lfs: found 22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b in the local lfs store
-   l@4: unpacking 6f1ff1f39c11: integrity check failed on data/l.i:3
   lfs: found 66100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e in the local lfs store
    large@0: unpacking 2c531e0992ff: integrity check failed on data/large.i:0
   lfs: found 89b6070915a3d573ff3599d1cda305bc5e38549b15c4847ab034169da66e1ca8 in the local lfs store
   lfs: found b1a6ea88da0017a0e77db139a54618986e9a2489bee24af9fe596de9daac498c in the local lfs store
   4 files, 5 changesets, 10 total revisions
-  3 integrity errors encountered!
+  2 integrity errors encountered!
   (first damaged changeset appears to be 0)
   [1]
 
   $ cat $TESTTMP/dummy-remote2/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256
-  sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff
+  sha256=22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b
   $ cat fromcorrupt2/.hg/store/lfs/objects/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b | $TESTDIR/f --sha256
-  sha256=40f67c7e91d554db4bc500f8f62c2e40f9f61daa5b62388e577bbae26f5396ff
-
-  $ cat $TESTTMP/dummy-remote2/66/100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e
-  LONGER-THAN-TEN-BYTES-WILL-TRIGGER-LFS
-  damage
-  $ cat $TESTTMP/dummy-remote2/22/f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b
-  RESTORE-TO-BE-LARGE
-  damage
+  sha256=22f66a3fc0b9bf3f012c814303995ec07099b3a9ce02a7af84b5970811074a3b
+  $ test -f $TESTTMP/dummy-remote2/66/100b384bf761271b407d79fc30cdd0554f3b2c5d944836e936d584b88ce88e
+  [1]
 
 Accessing a corrupt file will complain