# HG changeset patch
# User Mikael Berthe
# Date 1403998893 -7200
# Node ID 25ad96511395855f3d1d8eb21b7eb1c5033ddb80
# Parent  68375cc98f98e23266c604b1445bde1f2a720748
Schedule checksum computations so that we reduce hard drive seeks

The previous approach was not working.
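The idea behind the change: instead of hashing each candidate group as soon as it is found (which makes the disk head jump back and forth between directories), every file that needs a checksum is first marked with the sum type it needs, all marked files are gathered into a single list, and that list is hashed in one pass after being sorted by inode. A minimal standalone sketch of this batching, independent of goduf's types — fileToHash, inodeOf and hashBatch are illustrative names, SHA-1 is only a stand-in digest, and it is Unix-only since inode numbers come from syscall.Stat_t:

package main

import (
	"crypto/sha1"
	"fmt"
	"io"
	"os"
	"sort"
	"syscall"
)

type fileToHash struct {
	path string
	ino  uint64
}

// inodeOf returns the file's inode number, or 0 if it cannot be obtained.
func inodeOf(path string) uint64 {
	fi, err := os.Stat(path)
	if err != nil {
		return 0
	}
	if st, ok := fi.Sys().(*syscall.Stat_t); ok {
		return st.Ino
	}
	return 0 // non-Unix stat: fall back to input order
}

// hashBatch checksums all paths in one inode-ordered pass, so a rotating
// disk reads the files in (roughly) on-disk order.
func hashBatch(paths []string) map[string][]byte {
	batch := make([]fileToHash, 0, len(paths))
	for _, p := range paths {
		batch = append(batch, fileToHash{p, inodeOf(p)})
	}
	// One sort for the whole batch, rather than one per small group.
	sort.Slice(batch, func(i, j int) bool { return batch[i].ino < batch[j].ino })

	sums := make(map[string][]byte, len(batch))
	for _, f := range batch {
		fh, err := os.Open(f.path)
		if err != nil {
			continue
		}
		h := sha1.New()
		if _, err := io.Copy(h, fh); err == nil {
			sums[f.path] = h.Sum(nil)
		}
		fh.Close()
	}
	return sums
}

func main() {
	for p, sum := range hashBatch(os.Args[1:]) {
		fmt.Printf("%x  %s\n", sum, p)
	}
}

In the patch itself, scheduleChecksum() does the marking, computeSheduledChecksums() does the single inode-sorted pass, and findDupesChecksums()/findDupes() are reorganized around those two helpers.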
diff -r 68375cc98f98 -r 25ad96511395 goduf.go
--- a/goduf.go	Sat Jun 28 22:23:28 2014 +0200
+++ b/goduf.go	Sun Jun 29 01:41:33 2014 +0200
@@ -41,7 +41,8 @@
 type sumType int
 
 const (
-	fullChecksum sumType = iota
+	noChecksum sumType = iota
+	fullChecksum
 	partialChecksum
 )
 
@@ -51,6 +52,7 @@
 	os.FileInfo
 	PartialHash []byte
 	Hash        []byte
+	needHash    sumType
 }
 
 // FileObjList is only exported so that we can have a sort interface on inodes.
@@ -209,6 +211,8 @@
 		return fo.MedSum()
 	} else if sType == fullChecksum {
 		return fo.CheckSum()
+	} else if sType == noChecksum {
+		return nil
 	}
 	panic("Internal error: Invalid sType")
 }
@@ -240,8 +244,53 @@
 	return
 }
 
+// checksum returns the requested checksum as a string.
+// If the checksum has not been pre-computed, it is calculated now.
+func (fo fileObj) checksum(sType sumType) (string, error) {
+	var hbytes []byte
+	if sType == partialChecksum {
+		hbytes = fo.PartialHash
+	} else if sType == fullChecksum {
+		hbytes = fo.Hash
+	} else {
+		panic("Internal error: Invalid sType")
+	}
+	if hbytes == nil {
+		if err := fo.Sum(sType); err != nil {
+			return "", err
+		}
+		if sType == partialChecksum {
+			hbytes = fo.PartialHash
+		} else if sType == fullChecksum {
+			hbytes = fo.Hash
+		}
+	}
+	return hex.EncodeToString(hbytes), nil
+}
+
+func (fileList FileObjList) computeSheduledChecksums() {
+	// Sort the list for better efficiency
+	sort.Sort(ByInode(fileList))
+
+	myLog.Printf(6, " . will compute %d checksums\n", len(fileList))
+
+	// Compute checksums
+	for _, fo := range fileList {
+		if err := fo.Sum(fo.needHash); err != nil {
+			myLog.Println(0, "Error:", err)
+		}
+	}
+}
+
+func (fileList FileObjList) scheduleChecksum(sType sumType) {
+	for _, fo := range fileList {
+		fo.needHash = sType
+	}
+}
+
 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
 	var dupeList []FileObjList
+	var scheduleFull []FileObjList
 	hashes := make(map[string]FileObjList)
 
 	// Sort the list for better efficiency
@@ -249,22 +298,12 @@
 
 	// Compute checksums
 	for _, fo := range fileList {
-		if err := fo.Sum(sType); err != nil {
+		hash, err := fo.checksum(sType)
+		if err != nil {
 			myLog.Println(0, "Error:", err)
 			continue
 		}
-		var hbytes []byte
-		if sType == partialChecksum {
-			hbytes = fo.PartialHash
-		} else if sType == fullChecksum {
-			hbytes = fo.Hash
-		} else {
-			panic("Internal error: Invalid sType")
-		}
-		if hbytes != nil {
-			hash := hex.EncodeToString(hbytes)
-			hashes[hash] = append(hashes[hash], fo)
-		}
+		hashes[hash] = append(hashes[hash], fo)
 	}
 
 	// Let's de-dupe now...
@@ -273,11 +312,25 @@
 			continue
 		}
 		if sType == partialChecksum {
-			dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...)
-		} else { // full checksums -> we’re done
+			scheduleFull = append(scheduleFull, l)
+		} else { // full checksums -> we're done
 			dupeList = append(dupeList, l)
+			// TODO: sort by increasing size
+			myLog.Printf(5, " . found %d new duplicates\n", len(l))
 		}
-		// TODO: sort by increasing size
+	}
+	if sType == partialChecksum {
+		var csList FileObjList
+		for _, fol := range scheduleFull {
+			csList = append(csList, fol...)
+		}
+		myLog.Printf(6, " .. findDupesChecksums: computing %d "+
+			"full checksums\n", len(csList)) // DBG
+		csList.computeSheduledChecksums()
+		for _, l := range scheduleFull {
+			r := l.findDupesChecksums(fullChecksum)
+			dupeList = append(dupeList, r...)
+		}
 	}
 
 	return dupeList
@@ -286,17 +339,40 @@
 // findDupes() uses checksums to find file duplicates
 func (data *dataT) findDupes(skipPartial bool) []FileObjList {
 	var dupeList []FileObjList
+	var schedulePartial []FileObjList
+	var scheduleFull []FileObjList
 	for size, sizeGroup := range data.sizeGroups {
-		var r []FileObjList
 		// We skip partial checksums for small files or if requested
 		if size > minSizePartialChecksum && !skipPartial {
-			r = sizeGroup.files.findDupesChecksums(partialChecksum)
+			sizeGroup.files.scheduleChecksum(partialChecksum)
+			schedulePartial = append(schedulePartial, sizeGroup.files)
 		} else {
-			r = sizeGroup.files.findDupesChecksums(fullChecksum)
+			sizeGroup.files.scheduleChecksum(fullChecksum)
+			scheduleFull = append(scheduleFull, sizeGroup.files)
 		}
+	}
+
+	var csList FileObjList
+	for _, fol := range schedulePartial {
+		csList = append(csList, fol...)
+	}
+	for _, fol := range scheduleFull {
+		csList = append(csList, fol...)
+	}
+	myLog.Printf(6, " .. findDupes: computing %d misc checksums\n",
+		len(csList)) // DBG
+	csList.computeSheduledChecksums()
+
+	for _, l := range schedulePartial {
+		r := l.findDupesChecksums(partialChecksum)
 		dupeList = append(dupeList, r...)
 	}
+	for _, l := range scheduleFull {
+		r := l.findDupesChecksums(fullChecksum)
+		dupeList = append(dupeList, r...)
+	}
+	// TODO: sort by increasing size
 
 	return dupeList
 }
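computeSheduledChecksums() gets its seek reduction from sort.Sort(ByInode(fileList)), but ByInode itself is defined outside this diff (the comment kept above notes that FileObjList is exported so that inodes can have a sort interface). A plausible self-contained sketch of such an ordering, assuming a Unix platform and types mirroring the patch's fileObj struct; the actual goduf implementation may differ:

package main

import (
	"fmt"
	"os"
	"sort"
	"syscall"
)

type sumType int

// fileObj mirrors the struct from the patch.
type fileObj struct {
	os.FileInfo
	PartialHash []byte
	Hash        []byte
	needHash    sumType
}

// FileObjList is a list of files; elements expose os.FileInfo by embedding.
type FileObjList []*fileObj

// ByInode wraps FileObjList with a sort.Interface that orders files by
// device id, then inode number, approximating on-disk layout.
type ByInode FileObjList

func (a ByInode) Len() int      { return len(a) }
func (a ByInode) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByInode) Less(i, j int) bool {
	iSt, iOk := a[i].Sys().(*syscall.Stat_t)
	jSt, jOk := a[j].Sys().(*syscall.Stat_t)
	if !iOk || !jOk {
		return false // non-Unix stat: leave the order unchanged
	}
	if iSt.Dev != jSt.Dev {
		return iSt.Dev < jSt.Dev
	}
	return iSt.Ino < jSt.Ino
}

func main() {
	var list FileObjList
	for _, path := range os.Args[1:] {
		if fi, err := os.Stat(path); err == nil {
			list = append(list, &fileObj{FileInfo: fi})
		}
	}
	sort.Sort(ByInode(list))
	for _, fo := range list {
		fmt.Println(fo.Name())
	}
}

Sorting by (device, inode) is only a heuristic for physical layout, but for the common case of many small files spread across directories it turns a random-access hashing workload into a mostly sequential one.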