# HG changeset patch
# User Mikael Berthe
# Date 1403956988 -7200
# Node ID a5642cd03cef75b3df99ea354f68e777e546678c
Goduf - initial version-controlled revision

diff -r 000000000000 -r a5642cd03cef .hgignore
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,6 @@
+syntax: glob
+
+goduf
+goduf_*
+
+*.swp
diff -r 000000000000 -r a5642cd03cef TODO.md
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TODO.md	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,7 @@
+# TODO:
+
+* Clean up, fix variable names
+* Add version number
+* Sort results?
+* Possibility to accept file list from stdin
+* Check Windows portability
diff -r 000000000000 -r a5642cd03cef goduf-byinode_fallback.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goduf-byinode_fallback.go	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,23 @@
+
+// +build plan9 windows
+
+package main
+
+import "os"
+
+// ByInode is a FileObjList type with a sort interface
+type ByInode FileObjList
+
+func (a ByInode) Len() int      { return len(a) }
+func (a ByInode) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByInode) Less(i, j int) bool {
+	return false // no inode information here; all files compare as equal
+}
+
+func OSHasInodes() bool {
+	return false
+}
+
+func GetDevIno(fi os.FileInfo) (uint64, uint64) {
+	return 0, 0 // Not supported
+}
diff -r 000000000000 -r a5642cd03cef goduf-byinode_unix.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goduf-byinode_unix.go	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,37 @@
+
+// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+
+package main
+
+import "os"
+import "syscall"
+
+// ByInode is a FileObjList type with a sort interface
+type ByInode FileObjList
+
+func (a ByInode) Len() int      { return len(a) }
+func (a ByInode) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByInode) Less(i, j int) bool {
+	// Sort by device id first, then by inode number
+	iDevice := a[i].Sys().(*syscall.Stat_t).Dev
+	jDevice := a[j].Sys().(*syscall.Stat_t).Dev
+	switch {
+	case iDevice < jDevice:
+		return true
+	case iDevice > jDevice:
+		return false
+	}
+	iInode := a[i].Sys().(*syscall.Stat_t).Ino
+	jInode := a[j].Sys().(*syscall.Stat_t).Ino
+	return iInode < jInode
+}
+
+func OSHasInodes() bool {
+	return true
+}
+
+func GetDevIno(fi os.FileInfo) (uint64, uint64) {
+	dev := fi.Sys().(*syscall.Stat_t).Dev
+	ino := fi.Sys().(*syscall.Stat_t).Ino
+	return uint64(dev), uint64(ino)
+}
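
The two build-tagged files above expose the same three hooks per platform: a sort.Interface ordering a FileObjList by (device, inode), plus the OSHasInodes and GetDevIno helpers. Sorting by inode lets the checksum passes in goduf.go below read files roughly in on-disk order, which should reduce seeking on rotating disks. A minimal stand-alone sketch, not part of the patch, showing the same *syscall.Stat_t assertion the Unix Less method relies on (Unix only, hypothetical demo):

    package main

    import (
        "fmt"
        "os"
        "syscall"
    )

    func main() {
        for _, path := range os.Args[1:] {
            fi, err := os.Stat(path)
            if err != nil {
                fmt.Fprintln(os.Stderr, err)
                continue
            }
            // Same type assertion ByInode.Less uses on Unix-like systems
            st := fi.Sys().(*syscall.Stat_t)
            fmt.Printf("%s: dev=%d ino=%d\n", path, st.Dev, st.Ino)
        }
    }
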
diff -r 000000000000 -r a5642cd03cef goduf.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goduf.go	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,635 @@
+/*
+ * Copyright (C) 2014 Mikael Berthe
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ */
+
+// This program (Goduf) is a fast duplicate file finder.
+// Use goduf --help to get the list of available options.
+
+package main
+
+import (
+	"crypto/sha1"
+	"encoding/hex"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path/filepath"
+	"sort"
+)
+
+const medsumBytes = 128
+const minSizePartialChecksum = 49152 // Should be > 3*medsumBytes
+
+type sumType int
+
+const (
+	fullChecksum sumType = iota
+	partialChecksum
+)
+
+type fileObj struct {
+	//Unique bool
+	FilePath string
+	os.FileInfo
+	PartialHash []byte
+	Hash        []byte
+}
+
+// FileObjList is a list of file objects
+type FileObjList []*fileObj
+
+type sizeClass struct {
+	files    FileObjList
+	medsums  map[string]FileObjList
+	fullsums map[string]FileObjList
+}
+
+type dataT struct {
+	totalSize   uint64
+	cmpt        uint
+	sizeGroups  map[int64]*sizeClass
+	emptyFiles  FileObjList
+	ignoreCount int
+}
+
+var data dataT
+
+type myLogT struct {
+	verbosity int
+}
+
+var myLog myLogT
+
+func (l *myLogT) Printf(level int, format string, args ...interface{}) {
+	if level > l.verbosity {
+		return
+	}
+	if level >= 0 {
+		log.Printf(format, args...)
+		return
+	}
+	// Error message without timestamp
+	fmt.Fprintf(os.Stderr, format, args...)
+}
+
+func (l *myLogT) Println(level int, args ...interface{}) {
+	if level > l.verbosity {
+		return
+	}
+	if level >= 0 {
+		log.Println(args...)
+		return
+	}
+	// Error message without timestamp
+	fmt.Fprintln(os.Stderr, args...)
+}
+
+// visit is the filepath.Walk callback; it records every regular file
+// into data.sizeGroups, keyed by file size.
+func visit(path string, f os.FileInfo, err error) error {
+	if err != nil {
+		if f == nil {
+			return err
+		}
+		if f.IsDir() {
+			myLog.Println(-1, "Warning: cannot process directory:",
+				path)
+			return filepath.SkipDir
+		}
+
+		myLog.Println(-1, "Ignoring", path, "-", err)
+		data.ignoreCount++
+		return nil
+	}
+	if f.IsDir() {
+		return nil
+	}
+
+	if mode := f.Mode(); mode&os.ModeType != 0 {
+		if mode&os.ModeSymlink != 0 {
+			myLog.Println(5, "Ignoring symbolic link", path)
+		} else {
+			myLog.Println(0, "Ignoring special file", path)
+		}
+		data.ignoreCount++
+		return nil
+	}
+
+	data.cmpt++
+	data.totalSize += uint64(f.Size())
+	fo := &fileObj{FilePath: path, FileInfo: f}
+	if _, ok := data.sizeGroups[f.Size()]; !ok {
+		data.sizeGroups[f.Size()] = &sizeClass{}
+	}
+	data.sizeGroups[f.Size()].files =
+		append(data.sizeGroups[f.Size()].files, fo)
+	return nil
+}
+
+// CheckSum computes the SHA-1 checksum of the whole file.
+func (fo *fileObj) CheckSum() error {
+	file, err := os.Open(fo.FilePath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	hash := sha1.New()
+	if size, err := io.Copy(hash, file); size != fo.Size() || err != nil {
+		if err == nil {
+			return errors.New("failed to read the whole file: " +
+				fo.FilePath)
+		}
+		return err
+	}
+
+	fo.Hash = hash.Sum(nil)
+
+	return nil
+}
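
visit() above funnels every regular file into data.sizeGroups so that all later work happens on same-size groups only. A self-contained sketch of that first stage, not part of the patch (hypothetical demo names):

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
    )

    func main() {
        if len(os.Args) < 2 {
            fmt.Fprintln(os.Stderr, "usage: walkdemo directory")
            os.Exit(1)
        }
        bySize := make(map[int64][]string)
        filepath.Walk(os.Args[1], func(path string, f os.FileInfo, err error) error {
            if err != nil || f.IsDir() || f.Mode()&os.ModeType != 0 {
                return nil // skip unreadable entries, directories and special files
            }
            bySize[f.Size()] = append(bySize[f.Size()], path)
            return nil
        })
        for size, paths := range bySize {
            if len(paths) > 1 { // only same-size files can be duplicates
                fmt.Println(size, paths)
            }
        }
    }
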
+// MedSum computes a partial SHA-1 checksum of the file, sampling a few
+// bytes at the beginning and at the end.
+func (fo *fileObj) MedSum() error {
+	file, err := os.Open(fo.FilePath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	hash := sha1.New()
+	// BOF: hash the first medsumBytes bytes.
+	// io.CopyN only returns a nil error when it has copied exactly
+	// medsumBytes bytes, so checking err alone is sufficient here.
+	if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
+		return err
+	}
+	/*
+		// MOF: hash medsumBytes bytes in the middle of the file
+		file.Seek((fo.Size()-medsumBytes)/2, 0)
+		if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
+			return err
+		}
+	*/
+	// EOF: hash the last medsumBytes bytes (whence 2 = from end of file)
+	file.Seek(0-medsumBytes, 2)
+	if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
+		return err
+	}
+
+	fo.PartialHash = hash.Sum(nil)
+
+	return nil
+}
+
+func (fo *fileObj) Sum(sType sumType) error {
+	if sType == partialChecksum {
+		return fo.MedSum()
+	} else if sType == fullChecksum {
+		return fo.CheckSum()
+	}
+	panic("Internal error: Invalid sType")
+}
+
+func (data *dataT) dispCount() {
+	if myLog.verbosity < 4 {
+		return
+	}
+	var c1, c2, c3, c4 int
+	var s4 string
+	for _, sc := range data.sizeGroups {
+		c1 += len(sc.files)
+		for _, fol := range sc.medsums {
+			c2 += len(fol)
+		}
+		for _, fol := range sc.fullsums {
+			c3 += len(fol)
+		}
+	}
+	c4 = len(data.emptyFiles)
+	if c4 > 0 {
+		s4 = fmt.Sprintf("+%d", c4)
+	}
+	myLog.Printf(4, "  Current countdown: %d [%d%s/%d/%d]\n",
+		c1+c2+c3+c4, c1, s4, c2, c3)
+}
+
+func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
+	for _, sizeGroup := range data.sizeGroups {
+		for _, l := range sizeGroup.fullsums {
+			hgroups = append(hgroups, l)
+		}
+	}
+	return
+}
+
+// createHashFromSum groups the files of folist into the hashes map,
+// keyed by their (partial or full) checksum. Maps are reference types,
+// so hashes can be passed by value.
+func createHashFromSum(sType sumType, folist FileObjList, hashes map[string]FileObjList) {
+	for _, fo := range folist {
+		var hash string
+		var hbytes []byte
+		if sType == partialChecksum {
+			hbytes = fo.PartialHash
+		} else if sType == fullChecksum {
+			hbytes = fo.Hash
+		} else {
+			panic("Internal error: Invalid sType")
+		}
+		if hbytes == nil {
+			// Could happen if the file was not readable;
+			// we skip it (already shown as dropped).
+			continue
+		}
+		hash = hex.EncodeToString(hbytes)
+
+		hashes[hash] = append(hashes[hash], fo)
+	}
+}
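
MedSum samples only the first and last medsumBytes of each file, so a partial checksum costs two short reads regardless of file size; minSizePartialChecksum guarantees the sampled ranges cannot overlap. The same sampling as a stand-alone sketch, not part of the patch (hypothetical demo; assumes the file is at least 128 bytes long):

    package main

    import (
        "crypto/sha1"
        "encoding/hex"
        "fmt"
        "io"
        "log"
        "os"
    )

    func main() {
        const n = 128
        if len(os.Args) < 2 {
            log.Fatal("usage: medsumdemo file")
        }
        f, err := os.Open(os.Args[1])
        if err != nil {
            log.Fatal(err)
        }
        defer f.Close()
        h := sha1.New()
        if _, err := io.CopyN(h, f, n); err != nil { // first n bytes
            log.Fatal(err)
        }
        if _, err := f.Seek(-n, 2); err != nil { // whence 2 = from end of file
            log.Fatal(err)
        }
        if _, err := io.CopyN(h, f, n); err != nil { // last n bytes
            log.Fatal(err)
        }
        fmt.Println(hex.EncodeToString(h.Sum(nil)))
    }
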
+func (data *dataT) calculateMedSums() {
+	var bigList FileObjList
+	var droppedGroups int
+	var droppedFiles int
+
+	// Create a unique list of files to be partially checksummed
+	for size, sizeGroup := range data.sizeGroups {
+		if size < minSizePartialChecksum {
+			// We do not use MedSums for small files
+			continue
+		}
+		bigList = append(bigList, sizeGroup.files...)
+	}
+
+	// Sort the list for better efficiency -- that's the whole point of
+	// the unique big list.
+	sort.Sort(ByInode(bigList))
+
+	// Compute partial checksums
+	for _, fo := range bigList {
+		if err := fo.Sum(partialChecksum); err != nil {
+			myLog.Println(0, "Error:", err)
+			droppedFiles++
+			// The hash part will be nil and the file will
+			// be dropped in createHashFromSum() below.
+		}
+	}
+
+	// Reparse the size groups and use the new hash information
+	// to build lists of files with the same partial checksum.
+	for size, sizeGroup := range data.sizeGroups {
+		if size < minSizePartialChecksum {
+			// We do not use MedSums for small files
+			continue
+		}
+		hashes := make(map[string]FileObjList)
+		createHashFromSum(partialChecksum, sizeGroup.files, hashes)
+
+		// Let's de-dupe now...
+		for h, l := range hashes {
+			if len(l) < 2 {
+				droppedGroups++
+				droppedFiles += len(l)
+				delete(hashes, h)
+			}
+		}
+
+		// Done, save the result
+		data.sizeGroups[size].medsums = hashes
+
+		// We remove the items from "files" since they are in the hash
+		// tree now.
+		sizeGroup.files = nil
+
+		if len(hashes) == 0 { // We're done with this size group
+			delete(data.sizeGroups, size)
+		}
+	}
+
+	if droppedFiles > 0 {
+		myLog.Printf(3, "  Dropped %d files in %d groups\n",
+			droppedFiles, droppedGroups)
+	}
+}
+
+func (data *dataT) calculateChecksums() {
+	var bigList FileObjList
+	var droppedGroups int
+	var droppedFiles int
+
+	// Create a unique list of files to be fully checksummed
+	for _, sizeGroup := range data.sizeGroups {
+		// #1: small files
+		bigList = append(bigList, sizeGroup.files...)
+		// #2: files with the same partial checksum
+		for _, l := range sizeGroup.medsums {
+			bigList = append(bigList, l...)
+		}
+	}
+
+	// Sort the list for better efficiency -- that's the whole point of
+	// the unique big list.
+	sort.Sort(ByInode(bigList))
+
+	// Compute full checksums
+	for _, fo := range bigList {
+		if err := fo.Sum(fullChecksum); err != nil {
+			myLog.Println(0, "Error:", err)
+			droppedFiles++
+			// The hash part will be nil and the file will
+			// be dropped in createHashFromSum() below.
+		}
+	}
+
+	// Reparse the size groups and use the new hash information
+	// to build lists of files with the same checksum.
+	for size, sizeGroup := range data.sizeGroups {
+		hashes := make(map[string]FileObjList)
+		// #1: small files
+		createHashFromSum(fullChecksum, sizeGroup.files, hashes)
+		// #2: files with the same partial checksum
+		for _, l := range sizeGroup.medsums {
+			createHashFromSum(fullChecksum, l, hashes)
+		}
+
+		// Let's de-dupe now...
+		for h, l := range hashes {
+			if len(l) < 2 {
+				droppedGroups++
+				droppedFiles += len(l)
+				delete(hashes, h)
+			}
+		}
+
+		// Done, save the result
+		data.sizeGroups[size].fullsums = hashes
+		data.sizeGroups[size].medsums = nil
+		sizeGroup.files = nil
+	}
+	if droppedFiles > 0 {
+		myLog.Printf(3, "  Dropped %d files in %d groups\n",
+			droppedFiles, droppedGroups)
+	}
+}
+
+// dropEmptyFiles removes the empty files from the size groups. If
+// ignoreEmpty is false, they are kept aside in data.emptyFiles (when
+// there are at least two of them); if it is true, they are counted
+// and dropped.
+func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
+	sc, ok := data.sizeGroups[0]
+	if !ok {
+		return // no empty files
+	}
+	if !ignoreEmpty {
+		if len(sc.files) > 1 {
+			data.emptyFiles = sc.files
+		}
+		delete(data.sizeGroups, 0)
+		return
+	}
+	emptyCount = len(sc.files)
+	delete(data.sizeGroups, 0)
+	return
+}
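
Both calculate* passes above share the same shape: bucket files into a map keyed by hex digest, then delete every bucket holding fewer than two entries, since a singleton has no duplicate. Deleting entries while ranging over a Go map is well defined, which is what makes the in-place pruning safe. In miniature (hypothetical demo values):

    package main

    import "fmt"

    func main() {
        groups := map[string][]string{
            "hashA": {"f1", "f2"},
            "hashB": {"f3"},
        }
        for h, l := range groups {
            if len(l) < 2 { // a singleton cannot be a duplicate
                delete(groups, h) // deleting during range is safe in Go
            }
        }
        fmt.Println(groups) // map[hashA:[f1 f2]]
    }
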
+// createSizeHash removes the files with a unique size and drops hard
+// links, returning the number of hard links skipped and the number of
+// files dropped because of their unique size.
+func (data *dataT) createSizeHash() (hardLinkCount, uniqueSizeCount int) {
+	for s, sizeGroup := range data.sizeGroups {
+		if len(sizeGroup.files) < 2 {
+			delete(data.sizeGroups, s)
+			uniqueSizeCount++
+			continue
+		}
+
+		var hardlinksFound bool
+
+		// Sort by device/inodes
+		sort.Sort(ByInode(sizeGroup.files))
+
+		// Check for hardlinks
+		// TODO: what about symlinks?
+		// Remove unique dev/inodes
+		// Instead of this loop, another way would be to use the field
+		// "Unique" of the fileObj to mark them to be discarded
+		// and remove them all at the end.
+		for {
+			if !OSHasInodes() {
+				break
+			}
+			var hardLinkIndex int
+			fo := sizeGroup.files[0]
+			prevDev, prevIno := GetDevIno(fo)
+			//fmt.Printf(" - FO: %#v\n", *fo)
+
+			for i, fo := range sizeGroup.files[1:] {
+				//fmt.Printf(" - FO %d : %#v\n", i, *fo)
+				dev, ino := GetDevIno(fo)
+				if dev == prevDev && ino == prevIno {
+					hardLinkIndex = i + 1
+					hardLinkCount++
+					hardlinksFound = true
+					break
+				}
+				prevDev = dev
+				prevIno = ino
+			}
+
+			if hardLinkIndex == 0 {
+				break
+			}
+			i := hardLinkIndex
+			// Remove the hardlink
+			copy(sizeGroup.files[i:], sizeGroup.files[i+1:])
+			sizeGroup.files[len(sizeGroup.files)-1] = nil
+			sizeGroup.files = sizeGroup.files[:len(sizeGroup.files)-1]
+		}
+		// We have found hard links in this size group;
+		// maybe we can remove it now
+		if hardlinksFound {
+			if len(sizeGroup.files) < 2 {
+				delete(data.sizeGroups, s)
+				uniqueSizeCount++
+				continue
+			}
+		}
+	}
+	return
+}
+
+// formatSize returns a human-readable representation of a size in bytes.
+func formatSize(sizeBytes uint64, short bool) string {
+	var units = map[int]string{
+		0: "B",
+		1: "KiB",
+		2: "MiB",
+		3: "GiB",
+		4: "TiB",
+		5: "PiB",
+	}
+	humanSize := sizeBytes
+	var n int
+	for n < len(units)-1 {
+		if humanSize < 10000 {
+			break
+		}
+		humanSize /= 1024
+		n++
+	}
+	if n < 1 {
+		return fmt.Sprintf("%d bytes", sizeBytes)
+	}
+	if short {
+		return fmt.Sprintf("%d %s", humanSize, units[n])
+	}
+	return fmt.Sprintf("%d bytes (%d %s)", sizeBytes, humanSize, units[n])
+}
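
formatSize keeps the scaled value below 10000 before moving to the next unit, so small sizes stay in plain bytes. A few illustrative calls (hypothetical snippet, assuming the package above; the expected outputs follow from the 10000 threshold):

    fmt.Println(formatSize(1234, false))  // 1234 bytes
    fmt.Println(formatSize(49152, false)) // 49152 bytes (48 KiB)
    fmt.Println(formatSize(49152, true))  // 48 KiB
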
+func main() {
+	var verbose bool
+	var summary bool
+	var skipPartial bool
+	var ignoreEmpty bool
+
+	flag.BoolVar(&verbose, "verbose", false, "Be verbose (verbosity=1)")
+	flag.BoolVar(&verbose, "v", false, "See --verbose")
+	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
+	flag.BoolVar(&summary, "s", false, "See --summary")
+	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
+	flag.IntVar(&myLog.verbosity, "verbosity", 0,
+		"Set verbosity level (1-5)")
+	flag.IntVar(&myLog.verbosity, "vl", 0, "See --verbosity")
+	timings := flag.Bool("timings", false, "Set detailed log timings")
+	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
+
+	flag.Parse()
+
+	if myLog.verbosity > 0 {
+		verbose = true
+	} else if verbose {
+		myLog.verbosity = 1
+	}
+
+	if len(flag.Args()) == 0 {
+		// TODO: more helpful usage statement
+		myLog.Println(-1, "Usage:", os.Args[0],
+			"[options] base_directory")
+		os.Exit(0)
+	}
+
+	if *timings {
+		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+	}
+
+	data.sizeGroups = make(map[int64]*sizeClass)
+	myLog.Println(1, "* Reading file metadata")
+
+	for _, root := range flag.Args() {
+		if err := filepath.Walk(root, visit); err != nil {
+			myLog.Printf(-1, "* Error: could not read file tree:\n")
+			myLog.Printf(-1, "> %v\n", err)
+			os.Exit(1)
+		}
+	}
+	emptyCount := data.dropEmptyFiles(ignoreEmpty)
+	if verbose {
+		if data.ignoreCount > 0 {
+			myLog.Printf(1, "  %d special files were ignored\n",
+				data.ignoreCount)
+		}
+		myLog.Println(2, "  Initial counter:", data.cmpt, "files")
+		myLog.Println(2, "  Total size:", formatSize(data.totalSize,
+			false))
+		if emptyCount > 0 {
+			myLog.Printf(1, "  %d empty files were ignored\n",
+				emptyCount)
+		}
+		data.dispCount()
+		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
+	}
+
+	// Remove unique sizes
+	myLog.Println(1, "* Removing files with a unique size, sorting file lists...")
+	hardLinkCount, uniqueSizeCount := data.createSizeHash()
+	if verbose {
+		myLog.Printf(2, "  Dropped %d files with a unique size\n",
+			uniqueSizeCount)
+		myLog.Printf(2, "  Dropped %d hard links\n", hardLinkCount)
+		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
+		data.dispCount()
+	}
+
+	// Calculate medsums
+	if !skipPartial {
+		myLog.Println(1, "* Calculating partial checksums...")
+		data.calculateMedSums()
+		data.dispCount()
+		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
+	}
+
+	// Calculate checksums
+	myLog.Println(1, "* Calculating checksums...")
+	data.calculateChecksums()
+	data.dispCount()
+
+	// Get the list of dupes
+	var result []FileObjList
+	if len(data.emptyFiles) > 0 {
+		result = append(result, data.emptyFiles)
+	}
+	result = append(result, data.filesWithSameHash()...)
+	myLog.Println(3, "* Number of match groups:", len(result))
+
+	// Done! Dump dupes
+	if len(result) > 0 && !summary {
+		myLog.Println(1, "* Dupes:")
+	}
+	var dupeSize uint64
+	data.cmpt = 0
+	for i, l := range result {
+		size := uint64(l[0].Size())
+		// We do not count the size of the first item,
+		// so we get only the duplicate size.
+		dupeSize += size * uint64(len(l)-1)
+		if !summary {
+			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
+				len(l), formatSize(size, true))
+		}
+		for _, f := range l {
+			if !summary {
+				fmt.Println(f.FilePath)
+			}
+			data.cmpt++
+		}
+	}
+	summaryLevel := 1 // Default verbosity for the summary line
+	if !summary {
+		// Add a trailing newline
+		if len(result) > 0 {
+			fmt.Println("")
+		}
+	} else {
+		// The summary is requested, so we lower the verbosity level
+		summaryLevel = 0
+	}
+
+	myLog.Println(summaryLevel, "Final count:", data.cmpt,
+		"duplicate files in", len(result), "sets")
+	myLog.Println(summaryLevel, "Redundant data size:",
+		formatSize(dupeSize, false))
+}
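
For illustration, with two identical 48 KiB files (hypothetical paths and timestamps), a default run prints only the group listing; the final count lines are logged at verbosity 1, so they appear with -v, or at level 0 with --summary:

    $ ./goduf ~/Photos

    Group #1 (2 files * 48 KiB):
    /home/user/Photos/a.jpg
    /home/user/Photos/b.jpg

    $ ./goduf -s ~/Photos
    2014/06/28 14:03:08 Final count: 2 duplicate files in 1 sets
    2014/06/28 14:03:08 Redundant data size: 49152 bytes (48 KiB)
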