Refactor checksum functions to reduce code duplication
author Mikael Berthe <mikael@lilotux.net>
date Sat, 28 Jun 2014 22:23:28 +0200
changeset 7 68375cc98f98
parent 6 6740350569d3
child 8 25ad96511395
goduf.go
--- a/goduf.go	Sat Jun 28 21:57:58 2014 +0200
+++ b/goduf.go	Sat Jun 28 22:23:28 2014 +0200
@@ -56,7 +56,7 @@
 // FileObjList is only exported so that we can have a sort interface on inodes.
 type FileObjList []*fileObj
 
-type sizeClass struct {
+type sizeClass struct { // XXX still useful?
 	files    FileObjList
 	medsums  map[string]FileObjList
 	fullsums map[string]FileObjList
@@ -240,21 +240,31 @@
 	return
 }
 
-func findDupesFullChecksums(fileList FileObjList) []FileObjList {
+func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
 	var dupeList []FileObjList
 	hashes := make(map[string]FileObjList)
 
 	// Sort the list for better efficiency
 	sort.Sort(ByInode(fileList))
 
-	// Compute full checksums
+	// Compute checksums
 	for _, fo := range fileList {
-		if err := fo.Sum(fullChecksum); err != nil {
+		if err := fo.Sum(sType); err != nil {
 			myLog.Println(0, "Error:", err)
 			continue
 		}
-		hash := hex.EncodeToString(fo.Hash)
-		hashes[hash] = append(hashes[hash], fo)
+		var hbytes []byte
+		if sType == partialChecksum {
+			hbytes = fo.PartialHash
+		} else if sType == fullChecksum {
+			hbytes = fo.Hash
+		} else {
+			panic("Internal error: Invalid sType")
+		}
+		if hbytes != nil {
+			hash := hex.EncodeToString(hbytes)
+			hashes[hash] = append(hashes[hash], fo)
+		}
 	}
 
 	// Let's de-dupe now...
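
Note on the hunk above: the new branch selects which stored digest to hex-encode based on the requested sum type. Restated as a standalone sketch (fileObj is trimmed to the two digest fields this patch touches; hashKey is a hypothetical helper name, not a function from goduf):

package main

import (
	"encoding/hex"
	"fmt"
)

type sumType int

const (
	partialChecksum sumType = iota + 1
	fullChecksum
)

// fileObj is a trimmed stand-in for goduf's file object; only the two
// digest fields used by the patch are reproduced here.
type fileObj struct {
	PartialHash []byte
	Hash        []byte
}

// hashKey returns the hex-encoded digest selected by sType, or "" when
// that digest has not been computed yet.
func (fo *fileObj) hashKey(sType sumType) string {
	var hbytes []byte
	switch sType {
	case partialChecksum:
		hbytes = fo.PartialHash
	case fullChecksum:
		hbytes = fo.Hash
	default:
		panic("Internal error: Invalid sType")
	}
	if hbytes == nil {
		return ""
	}
	return hex.EncodeToString(hbytes)
}

func main() {
	fo := &fileObj{Hash: []byte{0xde, 0xad, 0xbe, 0xef}}
	fmt.Println(fo.hashKey(fullChecksum))    // deadbeef
	fmt.Println(fo.hashKey(partialChecksum)) // empty: partial digest not computed
}

Keying the map on the hex string rather than the raw bytes also keeps the digest usable as a Go map key, since []byte is not a valid key type but string is.
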
@@ -262,39 +272,12 @@
 		if len(l) < 2 {
 			continue
 		}
-		dupeList = append(dupeList, l)
-		// TODO sort by increasing size
-		myLog.Printf(5, "  . found %d new duplicates\n", len(l))
-	}
-
-	return dupeList
-}
-
-// TODO: refactor to avoid code duplication
-func findDupesPartialChecksums(fileList FileObjList) []FileObjList {
-	var dupeList []FileObjList
-	hashes := make(map[string]FileObjList)
-
-	// Sort the list for better efficiency
-	sort.Sort(ByInode(fileList))
-
-	// Compute partial checksums
-	for _, fo := range fileList {
-		if err := fo.Sum(partialChecksum); err != nil {
-			myLog.Println(0, "Error:", err)
-			continue
+		if sType == partialChecksum {
+			dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...)
+		} else { // full checksums -> we're done
+			dupeList = append(dupeList, l)
 		}
-		hash := hex.EncodeToString(fo.PartialHash)
-		hashes[hash] = append(hashes[hash], fo)
-	}
-
-	// Let's de-dupe now...
-	for _, l := range hashes {
-		if len(l) < 2 {
-			continue
-		}
-		dupeList = append(dupeList, findDupesFullChecksums(l)...)
-		// TODO sort by increasing size
+		// TODO: sort by increasing size
 	}
 
 	return dupeList
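
The recursion introduced in this hunk is a general narrowing pattern: bucket candidates by a cheap key first, then re-check each surviving bucket with an expensive key. A minimal sketch of that shape under simplified types (plain strings instead of fileObj; groupByKey and findDupes are illustrative names, not goduf functions):

package main

import "fmt"

// groupByKey buckets items by key and keeps only buckets holding at
// least two entries, i.e. potential duplicates.
func groupByKey(items []string, key func(string) string) [][]string {
	buckets := make(map[string][]string)
	for _, it := range items {
		k := key(it)
		buckets[k] = append(buckets[k], it)
	}
	var groups [][]string
	for _, g := range buckets {
		if len(g) >= 2 {
			groups = append(groups, g)
		}
	}
	return groups
}

// findDupes narrows candidates with the cheap key, then confirms each
// surviving bucket with the expensive key: the same two-pass shape as
// the partialChecksum to fullChecksum recursion in the patch.
func findDupes(items []string, cheap, expensive func(string) string) [][]string {
	var dupes [][]string
	for _, g := range groupByKey(items, cheap) {
		dupes = append(dupes, groupByKey(g, expensive)...)
	}
	return dupes
}

func main() {
	items := []string{"aa", "ab", "aa", "bc"}
	firstByte := func(s string) string { return s[:1] }
	whole := func(s string) string { return s }
	fmt.Println(findDupes(items, firstByte, whole)) // [[aa aa]]
}

In the patch itself the cheap key is the partial checksum and the expensive key is the full checksum; the second pass is reached by the method calling itself with fullChecksum.
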
@@ -308,9 +291,9 @@
 		var r []FileObjList
 		// We skip partial checksums for small files or if requested
 		if size > minSizePartialChecksum && !skipPartial {
-			r = findDupesPartialChecksums(sizeGroup.files)
+			r = sizeGroup.files.findDupesChecksums(partialChecksum)
 		} else {
-			r = findDupesFullChecksums(sizeGroup.files)
+			r = sizeGroup.files.findDupesChecksums(fullChecksum)
 		}
 		dupeList = append(dupeList, r...)
 	}
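
The final hunk shows the dispatch policy at the call site: groups of small files, or runs where partial checksums are disabled, go straight to full checksums, since hashing the first block of a small file reads most of it anyway. A sketch of that decision, with an assumed threshold value (the real minSizePartialChecksum is defined elsewhere in goduf.go):

package main

import "fmt"

type sumType int

const (
	partialChecksum sumType = iota + 1
	fullChecksum
)

// Assumed threshold for this sketch only; the real
// minSizePartialChecksum constant lives elsewhere in goduf.go.
const minSizePartialChecksum = 64 * 1024

// initialSumType picks the first checksum pass for a size group. Below
// the threshold a partial checksum would read most of the file anyway,
// so the two-pass scheme saves nothing and the file is hashed in full.
func initialSumType(size int64, skipPartial bool) sumType {
	if size > minSizePartialChecksum && !skipPartial {
		return partialChecksum
	}
	return fullChecksum
}

func main() {
	fmt.Println(initialSumType(4096, false) == fullChecksum)      // true: small file
	fmt.Println(initialSumType(10<<20, false) == partialChecksum) // true: large file
	fmt.Println(initialSumType(10<<20, true) == fullChecksum)     // true: partial pass skipped
}
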