# HG changeset patch
# User Mikael Berthe
# Date 1403980800 -7200
# Node ID 887c21c26cc865cf1e2d7d895617eff18f71cea3
# Parent bcc0975496bcbce66963f96c6f2005c301923951
Refactor the checksum part

Instead of doing #1 partial checksums, then #2 full checksums when needed,
we now do the full checksums as soon as the partial checksums match.
This should reduce disk seeks.

diff -r bcc0975496bc -r 887c21c26cc8 goduf.go
--- a/goduf.go	Sat Jun 28 20:36:37 2014 +0200
+++ b/goduf.go	Sat Jun 28 20:40:00 2014 +0200
@@ -53,6 +53,7 @@
     Hash []byte
 }
 
+// FileObjList is only exported so that we can have a sort interface on inodes.
 type FileObjList []*fileObj
 
 type sizeClass struct {
@@ -122,7 +123,7 @@
 
     if mode := f.Mode(); mode&os.ModeType != 0 {
         if mode&os.ModeSymlink != 0 {
-            myLog.Println(5, "Ignoring symbolic link", path)
+            myLog.Println(6, "Ignoring symbolic link", path)
         } else {
             myLog.Println(0, "Ignoring special file", path)
         }
@@ -212,27 +213,22 @@
     panic("Internal error: Invalid sType")
 }
 
-func (data *dataT) dispCount() {
+func (data *dataT) dispCount() { // FIXME rather useless
     if myLog.verbosity < 4 {
         return
     }
-    var c1, c2, c3, c4 int
-    var s4 string
+    var c1, c1b, c2 int
+    var s1 string
     for _, sc := range data.sizeGroups {
         c1 += len(sc.files)
-        for _, fol := range sc.medsums {
-            c2 += len(fol)
-        }
-        for _, fol := range sc.fullsums {
-            c3 += len(fol)
-        }
+        c2++
     }
-    c4 = len(data.emptyFiles)
-    if c4 > 0 {
-        s4 = fmt.Sprintf("+%d", c4)
+    c1b = len(data.emptyFiles)
+    if c1b > 0 {
+        s1 = fmt.Sprintf("+%d", c1b)
     }
-    myLog.Printf(4, " Current countdown: %d [%d%s/%d/%d]\n",
-        c1+c2+c3+c4, c1, s4, c2, c3)
+    myLog.Printf(4, " Current countdown: %d [%d%s/%d]\n",
+        c1+c1b, c1, s1, c2)
 }
 
 func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
@@ -244,153 +240,64 @@
     return
 }
 
-func createHashFromSum(sType sumType, folist FileObjList, hashes *map[string]FileObjList) {
-    for _, fo := range folist {
-        var hash string
-        var hbytes []byte
-        if sType == partialChecksum {
-            hbytes = fo.PartialHash
-        } else if sType == fullChecksum {
-            hbytes = fo.Hash
-        } else {
-            panic("Internal error: Invalid sType")
-        }
-        if hbytes == nil {
-            // Could happen if the file was not readable,
-            // We skip it (already shown as dropped)
+func findDupesFullChecksums(fileList FileObjList) []FileObjList {
+    var dupeList []FileObjList
+    hashes := make(map[string]FileObjList)
+
+    // Sort the list for better efficiency
+    sort.Sort(ByInode(fileList))
+
+    // Compute full checksums
+    for _, fo := range fileList {
+        if err := fo.Sum(fullChecksum); err != nil {
+            myLog.Println(0, "Error:", err)
             continue
         }
-        hash = hex.EncodeToString(hbytes)
-
-        (*hashes)[hash] = append((*hashes)[hash], fo)
-    }
-}
-
-func (data *dataT) calculateMedSums() {
-    var bigList FileObjList
-    var droppedGroups int
-    var droppedFiles int
-
-    // Create a unique list of files to be partially checksummed
-    for size, sizeGroup := range data.sizeGroups {
-        if size < minSizePartialChecksum {
-            // We do not use MedSums for small files
-            continue
-        }
-        bigList = append(bigList, sizeGroup.files...)
+        hash := hex.EncodeToString(fo.Hash)
+        hashes[hash] = append(hashes[hash], fo)
     }
-    // Sort the list for better efficiency -- that's the whole point of
-    // the unique big list.
-    sort.Sort(ByInode(bigList))
-
-    // Compute partial checksums
-    for _, fo := range bigList {
-        if err := fo.Sum(partialChecksum); err != nil {
-            myLog.Println(0, "Error:", err)
-            droppedFiles++
-            // The hash part will be nil and the file will
-            // be dropped in createHashFromSum() below.
-        }
-    }
-
-    // Reparse size-grouped hashes and use the new hash information
-    // to build lists of files with the same partial checksum.
-    for size, sizeGroup := range data.sizeGroups {
-        if size < minSizePartialChecksum {
-            // We do not use MedSums for small files
+    // Let's de-dupe now...
+    for _, l := range hashes {
+        if len(l) < 2 {
             continue
         }
-        hashes := make(map[string]FileObjList)
-        createHashFromSum(partialChecksum, sizeGroup.files, &hashes)
-
-        // Let's de-dupe now...
-        for h, l := range hashes {
-            if len(l) < 2 {
-                droppedGroups++
-                droppedFiles += len(l)
-                delete(hashes, h)
-            }
-        }
-
-        // Done, save the result
-        data.sizeGroups[size].medsums = hashes
-
-        // We remove the items from "files" since they are in the hash
-        // tree now.
-        sizeGroup.files = nil
-
-        if len(hashes) == 0 { // We're done with this size group
-            delete(data.sizeGroups, size)
-        }
+        dupeList = append(dupeList, l)
+        // TODO sort by increasing size
+        myLog.Printf(5, " . found %d new duplicates\n", len(l))
     }
-    if droppedFiles > 0 {
-        myLog.Printf(3, " Dropped %d files in %d groups\n",
-            droppedFiles, droppedGroups)
-    }
-    return
+    return dupeList
 }
 
-func (data *dataT) calculateChecksums() {
-    var bigList FileObjList
-    var droppedGroups int
-    var droppedFiles int
+// TODO: refactor to avoid code duplication
+func findDupesPartialChecksums(fileList FileObjList) []FileObjList {
+    var dupeList []FileObjList
+    hashes := make(map[string]FileObjList)
 
-    // Create a unique list of files to be fully checksummed
-    for _, sizeGroup := range data.sizeGroups {
-        // #1: small files
-        bigList = append(bigList, sizeGroup.files...)
-        // #2: files with same partial checksum
-        for _, l := range sizeGroup.medsums {
-            bigList = append(bigList, l...)
+    // Sort the list for better efficiency
+    sort.Sort(ByInode(fileList))
+
+    // Compute partial checksums
+    for _, fo := range fileList {
+        if err := fo.Sum(partialChecksum); err != nil {
+            myLog.Println(0, "Error:", err)
+            continue
         }
-    }
-
-    // Sort the list for better efficiency -- that's the whole point of
-    // the unique big list.
-    sort.Sort(ByInode(bigList))
-
-    // Compute full checksums
-    for _, fo := range bigList {
-        if err := fo.Sum(fullChecksum); err != nil {
-            myLog.Println(0, "Error:", err)
-            droppedFiles++
-            // The hash part will be nil and the file will
-            // be dropped in createHashFromSum() below.
-        }
+        hash := hex.EncodeToString(fo.PartialHash)
+        hashes[hash] = append(hashes[hash], fo)
     }
 
-    // Reparse size-grouped hashes and use the new hash information
-    // to build lists of files with the same checksum.
-    for size, sizeGroup := range data.sizeGroups {
-        hashes := make(map[string]FileObjList)
-        // #1: small files
-        createHashFromSum(fullChecksum, sizeGroup.files, &hashes)
-        // #2: files with same partial checksum
-        for _, l := range sizeGroup.medsums {
-            createHashFromSum(fullChecksum, l, &hashes)
+    // Let's de-dupe now...
+    for _, l := range hashes {
+        if len(l) < 2 {
+            continue
         }
+        dupeList = append(dupeList, findDupesFullChecksums(l)...)
+        // TODO sort by increasing size
+    }
 
-        // Let's de-dupe now...
-        for h, l := range hashes {
-            if len(l) < 2 {
-                droppedGroups++
-                droppedFiles += len(l)
-                delete(hashes, h)
-            }
-        }
-
-        // Done, save the result
-        data.sizeGroups[size].fullsums = hashes
-        data.sizeGroups[size].medsums = nil
-        sizeGroup.files = nil
-    }
-    if droppedFiles > 0 {
-        myLog.Printf(3, " Dropped %d files in %d groups\n",
-            droppedFiles, droppedGroups)
-    }
-    return
+    return dupeList
 }
 
 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
@@ -426,7 +333,6 @@
         // Instead of this loop, another way would be to use the field
         // "Unique" of the fileObj to mark them to be discarded
         // and remove them all at the end.
-        // TODO: what about symlinks?
         for {
             if !OSHasInodes() {
                 break
@@ -434,10 +340,8 @@
             var hardLinkIndex int
             fo := sizeGroup.files[0]
            prevDev, prevIno := GetDevIno(fo)
-            //fmt.Printf(" - FO: %#v\n", *fo)
 
             for i, fo := range sizeGroup.files[1:] {
-                //fmt.Printf(" - FO %d : %#v\n", i, *fo)
                 dev, ino := GetDevIno(fo)
                 if dev == prevDev && ino == prevIno {
                     hardLinkIndex = i + 1
@@ -471,6 +375,23 @@
     return
 }
 
+// findDupes() uses checksums to find file duplicates
+func (data *dataT) findDupes(skipPartial bool) []FileObjList {
+    var dupeList []FileObjList
+
+    for size, sizeGroup := range data.sizeGroups {
+        var r []FileObjList
+        // We skip partial checksums for small files or if requested
+        if size > minSizePartialChecksum && !skipPartial {
+            r = findDupesPartialChecksums(sizeGroup.files)
+        } else {
+            r = findDupesFullChecksums(sizeGroup.files)
+        }
+        dupeList = append(dupeList, r...)
+    }
+    return dupeList
+}
+
 func formatSize(sizeBytes uint64, short bool) string {
     var units = map[int]string{
         0: "B",
@@ -510,7 +431,7 @@
     flag.BoolVar(&summary, "s", false, "See --summary")
     flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
     flag.IntVar(&myLog.verbosity, "verbosity", 0,
-        "Set verbosity level (1-5)")
+        "Set verbosity level (1-6)")
     flag.IntVar(&myLog.verbosity, "vl", 0, "See verbosity")
     timings := flag.Bool("timings", false, "Set detailed log timings")
     flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
@@ -557,40 +478,29 @@
             myLog.Printf(1, " %d empty files were ignored\n",
                 emptyCount)
         }
-        data.dispCount()
+        data.dispCount() // XXX
         myLog.Println(3, "* Number of size groups:",
             len(data.sizeGroups))
     }
 
     // Remove unique sizes
-    myLog.Println(1, "* Removing files with unique size, sorting file lists...")
+    myLog.Println(1, "* Removing files with unique size and hard links...")
     hardLinkCount, uniqueSizeCount := data.initialCleanup()
     if verbose {
         myLog.Printf(2, " Dropped %d files with unique size\n", uniqueSizeCount)
         myLog.Printf(2, " Dropped %d hard links\n", hardLinkCount)
         myLog.Println(3, "* Number of size groups:",
             len(data.sizeGroups))
-        data.dispCount()
+        data.dispCount() // XXX
     }
 
-    // Calculate medsums
-    if !skipPartial {
-        myLog.Println(1, "* Calculating partial checksums...")
-        data.calculateMedSums()
-        data.dispCount()
-        myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
-    }
-
-    // Calculate checksums
-    myLog.Println(1, "* Calculating checksums...")
-    data.calculateChecksums()
-    data.dispCount()
-
     // Get list of dupes
+    myLog.Println(1, "* Computing checksums...")
     var result []FileObjList
     if len(data.emptyFiles) > 0 {
         result = append(result, data.emptyFiles)
     }
-    result = append(result, data.filesWithSameHash()...)
+    result = append(result, data.findDupes(skipPartial)...)
+    myLog.Println(3, "* Number of match groups:", len(result))
 
     // Done! Dump dupes
@@ -603,7 +513,7 @@
         size := uint64(l[0].Size())
         // We do not count the size of the 1st item
         // so we get only duplicate size.
-        dupeSize += size * uint64(len(l) - 1)
+        dupeSize += size * uint64(len(l)-1)
         if !summary {
             fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
                 len(l), formatSize(size, true))
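
Illustrative sketch (not part of the patch, and not goduf's actual code): the two-pass idea that
findDupes(), findDupesPartialChecksums() and findDupesFullChecksums() implement above can be
summarized as "hash only the first few kilobytes of each same-size file, then compute full
checksums only inside the groups whose partial sums collide". The standalone program below
assumes a 4 KiB partial-read size and SHA-256, both arbitrary choices for the example;
goduf's own types, hash function and size threshold differ.

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"os"
)

// Assumption for the sketch: read only the first 4 KiB for the partial sum.
const partialSize = 4096

// hashFile returns the hex checksum of a file, reading at most limit
// bytes when limit > 0, or the whole file when limit <= 0.
func hashFile(path string, limit int64) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()
	h := sha256.New()
	var r io.Reader = f
	if limit > 0 {
		r = io.LimitReader(f, limit)
	}
	if _, err := io.Copy(h, r); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}

// groupBy buckets paths by their checksum and keeps only buckets with
// at least two members; unreadable files are reported and skipped.
func groupBy(paths []string, limit int64) [][]string {
	buckets := make(map[string][]string)
	for _, p := range paths {
		sum, err := hashFile(p, limit)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error:", err)
			continue
		}
		buckets[sum] = append(buckets[sum], p)
	}
	var groups [][]string
	for _, g := range buckets {
		if len(g) >= 2 {
			groups = append(groups, g)
		}
	}
	return groups
}

// findDupes takes files that already share the same size and returns
// groups of confirmed duplicates: partial checksums narrow down the
// candidates, full checksums are computed only inside matching groups.
func findDupes(sameSizeFiles []string) [][]string {
	var dupes [][]string
	for _, candidates := range groupBy(sameSizeFiles, partialSize) {
		dupes = append(dupes, groupBy(candidates, 0)...)
	}
	return dupes
}

func main() {
	// Usage example: pass files assumed to have the same size on the command line.
	for i, g := range findDupes(os.Args[1:]) {
		fmt.Printf("Group #%d: %v\n", i+1, g)
	}
}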