Schedule checksum computations so that we reduce hard drive seeks
author Mikael Berthe <mikael@lilotux.net>
date   Sun, 29 Jun 2014 01:41:33 +0200
changeset 8 25ad96511395
parent 7 68375cc98f98
child 9 5b58342459eb
Schedule checksum computations so that we reduce hard drive seeks

The previous approach was not working.
goduf.go
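
For context, the idea behind this changeset can be illustrated with a small, self-contained Go program. This is a sketch only, not goduf code: the names item, inodeOf and computeAll, the SHA-1 choice, and the Unix-only inode lookup are all assumptions made for the example. Checksum requests are first collected without computing anything, the batch is sorted by inode number so that reads roughly follow the on-disk layout, and all hashes are then computed in one sequential pass:

package main

import (
	"crypto/sha1"
	"fmt"
	"io"
	"os"
	"sort"
	"syscall"
)

// item stands in for a file object: a path, its inode number, and the
// checksum once computed.
type item struct {
	path  string
	inode uint64
	hash  []byte
}

// byInode sorts a batch by inode number, like goduf's ByInode.
type byInode []*item

func (a byInode) Len() int           { return len(a) }
func (a byInode) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a byInode) Less(i, j int) bool { return a[i].inode < a[j].inode }

// inodeOf returns a file's inode number (Unix-specific).
func inodeOf(path string) uint64 {
	var st syscall.Stat_t
	if err := syscall.Stat(path, &st); err != nil {
		return 0
	}
	return st.Ino
}

// computeAll hashes every scheduled file in a single pass, in inode
// order, so reads roughly follow the on-disk layout.
func computeAll(batch []*item) {
	sort.Sort(byInode(batch))
	for _, it := range batch {
		f, err := os.Open(it.path)
		if err != nil {
			continue // skip unreadable files
		}
		h := sha1.New()
		io.Copy(h, f) // error handling elided for brevity
		f.Close()
		it.hash = h.Sum(nil)
	}
}

func main() {
	// Schedule: collect everything first, compute nothing yet.
	var batch []*item
	for _, p := range os.Args[1:] {
		batch = append(batch, &item{path: p, inode: inodeOf(p)})
	}
	// Compute: one sorted, sequential pass over the whole batch.
	computeAll(batch)
	for _, it := range batch {
		fmt.Printf("%x  %s\n", it.hash, it.path)
	}
}

Inode order is only a proxy for physical block order, but on typical Unix filesystems it avoids the worst back-and-forth seeking when hashing files scattered across many size groups, which is what the changeset below arranges with scheduleChecksum and computeScheduledChecksums.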
--- a/goduf.go	Sat Jun 28 22:23:28 2014 +0200
+++ b/goduf.go	Sun Jun 29 01:41:33 2014 +0200
@@ -41,7 +41,8 @@
 type sumType int
 
 const (
-	fullChecksum sumType = iota
+	noChecksum sumType = iota
+	fullChecksum
 	partialChecksum
 )
 
@@ -51,6 +52,7 @@
 	os.FileInfo
 	PartialHash []byte
 	Hash        []byte
+	needHash    sumType
 }
 
 // FileObjList is only exported so that we can have a sort interface on inodes.
@@ -209,6 +211,8 @@
 		return fo.MedSum()
 	} else if sType == fullChecksum {
 		return fo.CheckSum()
+	} else if sType == noChecksum {
+		return nil
 	}
 	panic("Internal error: Invalid sType")
 }
@@ -240,8 +244,57 @@
 	return
 }
 
+// checksum returns the requested checksum as a string.
+// If the checksum has not been pre-computed, it is calculated now.
+func (fo *fileObj) checksum(sType sumType) (string, error) {
+	var hbytes []byte
+	if sType == partialChecksum {
+		hbytes = fo.PartialHash
+	} else if sType == fullChecksum {
+		hbytes = fo.Hash
+	} else {
+		panic("Internal error: Invalid sType")
+	}
+	if hbytes == nil {
+		if err := fo.Sum(sType); err != nil {
+			return "", err
+		}
+		if sType == partialChecksum {
+			hbytes = fo.PartialHash
+		} else if sType == fullChecksum {
+			hbytes = fo.Hash
+		}
+	}
+	return hex.EncodeToString(hbytes), nil
+}
+
+// computeScheduledChecksums computes all the scheduled checksums in one
+// batch; the list is sorted by inode number first to reduce disk seeks.
+func (fileList FileObjList) computeScheduledChecksums() {
+	// Sort the list for better efficiency
+	sort.Sort(ByInode(fileList))
+
+	myLog.Printf(6, "  . will compute %d checksums\n", len(fileList))
+
+	// Compute checksums
+	for _, fo := range fileList {
+		if err := fo.Sum(fo.needHash); err != nil {
+			myLog.Println(0, "Error:", err)
+		}
+	}
+}
+
+// scheduleChecksum flags each file of the list as needing a checksum of
+// the given type, to be computed later in a single batch.
+func (fileList FileObjList) scheduleChecksum(sType sumType) {
+	for _, fo := range fileList {
+		fo.needHash = sType
+	}
+}
+
 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
 	var dupeList []FileObjList
+	var scheduleFull []FileObjList
 	hashes := make(map[string]FileObjList)
 
 	// Sort the list for better efficiency
@@ -249,22 +302,12 @@
 
 	// Compute checksums
 	for _, fo := range fileList {
-		if err := fo.Sum(sType); err != nil {
+		hash, err := fo.checksum(sType)
+		if err != nil {
 			myLog.Println(0, "Error:", err)
 			continue
 		}
-		var hbytes []byte
-		if sType == partialChecksum {
-			hbytes = fo.PartialHash
-		} else if sType == fullChecksum {
-			hbytes = fo.Hash
-		} else {
-			panic("Internal error: Invalid sType")
-		}
-		if hbytes != nil {
-			hash := hex.EncodeToString(hbytes)
-			hashes[hash] = append(hashes[hash], fo)
-		}
+		hashes[hash] = append(hashes[hash], fo)
 	}
 
 	// Let's de-dupe now...
@@ -273,11 +316,26 @@
 			continue
 		}
 		if sType == partialChecksum {
-			dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...)
-		} else { // full checksums -> we’re done
+			scheduleFull = append(scheduleFull, l)
+		} else { // full checksums -> we're done
 			dupeList = append(dupeList, l)
+			// TODO: sort by increasing size
+			myLog.Printf(5, "  . found %d new duplicates\n", len(l))
 		}
-		// TODO: sort by increasing size
+	}
+	if sType == partialChecksum {
+		var csList FileObjList
+		for _, fol := range scheduleFull {
+			fol.scheduleChecksum(fullChecksum) // we need full checksums now
+			csList = append(csList, fol...)
+		}
+		myLog.Printf(6, "  .. findDupesChecksums: computing %d "+
+			"full checksums\n", len(csList)) // DBG
+	csList.computeScheduledChecksums()
+		for _, l := range scheduleFull {
+			r := l.findDupesChecksums(fullChecksum)
+			dupeList = append(dupeList, r...)
+		}
 	}
 
 	return dupeList
@@ -286,17 +344,40 @@
 // findDupes() uses checksums to find file duplicates
 func (data *dataT) findDupes(skipPartial bool) []FileObjList {
 	var dupeList []FileObjList
+	var schedulePartial []FileObjList
+	var scheduleFull []FileObjList
 
 	for size, sizeGroup := range data.sizeGroups {
-		var r []FileObjList
 		// We skip partial checksums for small files or if requested
 		if size > minSizePartialChecksum && !skipPartial {
-			r = sizeGroup.files.findDupesChecksums(partialChecksum)
+			sizeGroup.files.scheduleChecksum(partialChecksum)
+			schedulePartial = append(schedulePartial, sizeGroup.files)
 		} else {
-			r = sizeGroup.files.findDupesChecksums(fullChecksum)
+			sizeGroup.files.scheduleChecksum(fullChecksum)
+			scheduleFull = append(scheduleFull, sizeGroup.files)
 		}
+	}
+
+	var csList FileObjList
+	for _, fol := range schedulePartial {
+		csList = append(csList, fol...)
+	}
+	for _, fol := range scheduleFull {
+		csList = append(csList, fol...)
+	}
+	myLog.Printf(6, "  .. findDupes: computing %d misc checksums\n",
+		len(csList)) // DBG
+	csList.computeScheduledChecksums()
+
+	for _, l := range schedulePartial {
+		r := l.findDupesChecksums(partialChecksum)
 		dupeList = append(dupeList, r...)
 	}
+	for _, l := range scheduleFull {
+		r := l.findDupesChecksums(fullChecksum)
+		dupeList = append(dupeList, r...)
+	}
+	// TODO: sort by increasing size
 	return dupeList
 }