--- a/goduf.go Sat Jun 28 20:36:37 2014 +0200
+++ b/goduf.go Sat Jun 28 20:40:00 2014 +0200
@@ -53,6 +53,7 @@
Hash []byte
}
+// FileObjList is exported only so that we can have a sort interface on inodes.
type FileObjList []*fileObj
type sizeClass struct {
@@ -122,7 +123,7 @@
if mode := f.Mode(); mode&os.ModeType != 0 {
if mode&os.ModeSymlink != 0 {
- myLog.Println(5, "Ignoring symbolic link", path)
+ myLog.Println(6, "Ignoring symbolic link", path)
} else {
myLog.Println(0, "Ignoring special file", path)
}
@@ -212,27 +213,22 @@
panic("Internal error: Invalid sType")
}
-func (data *dataT) dispCount() {
+func (data *dataT) dispCount() { // FIXME rather useless
if myLog.verbosity < 4 {
return
}
- var c1, c2, c3, c4 int
- var s4 string
+ var c1, c1b, c2 int
+ var s1 string
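+	// c1 counts files in size groups, c1b the empty files, c2 the size groups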
for _, sc := range data.sizeGroups {
c1 += len(sc.files)
- for _, fol := range sc.medsums {
- c2 += len(fol)
- }
- for _, fol := range sc.fullsums {
- c3 += len(fol)
- }
+ c2++
}
- c4 = len(data.emptyFiles)
- if c4 > 0 {
- s4 = fmt.Sprintf("+%d", c4)
+ c1b = len(data.emptyFiles)
+ if c1b > 0 {
+ s1 = fmt.Sprintf("+%d", c1b)
}
- myLog.Printf(4, " Current countdown: %d [%d%s/%d/%d]\n",
- c1+c2+c3+c4, c1, s4, c2, c3)
+ myLog.Printf(4, " Current countdown: %d [%d%s/%d]\n",
+ c1+c1b, c1, s1, c2)
}
func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
@@ -244,153 +240,64 @@
return
}
-func createHashFromSum(sType sumType, folist FileObjList, hashes *map[string]FileObjList) {
- for _, fo := range folist {
- var hash string
- var hbytes []byte
- if sType == partialChecksum {
- hbytes = fo.PartialHash
- } else if sType == fullChecksum {
- hbytes = fo.Hash
- } else {
- panic("Internal error: Invalid sType")
- }
- if hbytes == nil {
- // Could happen if the file was not readable,
- // We skip it (already shown as dropped)
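+// findDupesFullChecksums computes full checksums for the given files and
+// returns the groups of two or more files sharing the same hash.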
+func findDupesFullChecksums(fileList FileObjList) []FileObjList {
+ var dupeList []FileObjList
+ hashes := make(map[string]FileObjList)
+
+	// Sort the list by inode for better efficiency when reading from disk
+ sort.Sort(ByInode(fileList))
+
+ // Compute full checksums
+ for _, fo := range fileList {
+ if err := fo.Sum(fullChecksum); err != nil {
+ myLog.Println(0, "Error:", err)
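+			// Unreadable file: skip it (it cannot be part of a dupe group)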
continue
}
- hash = hex.EncodeToString(hbytes)
-
- (*hashes)[hash] = append((*hashes)[hash], fo)
- }
-}
-
-func (data *dataT) calculateMedSums() {
- var bigList FileObjList
- var droppedGroups int
- var droppedFiles int
-
- // Create a unique list of files to be partially checksummed
- for size, sizeGroup := range data.sizeGroups {
- if size < minSizePartialChecksum {
- // We do not use MedSums for small files
- continue
- }
- bigList = append(bigList, sizeGroup.files...)
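+		// Group files by their full checksum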
+ hash := hex.EncodeToString(fo.Hash)
+ hashes[hash] = append(hashes[hash], fo)
}
- // Sort the list for better efficiency -- that's the whole point of
- // the unique big list.
- sort.Sort(ByInode(bigList))
-
- // Compute partial checksums
- for _, fo := range bigList {
- if err := fo.Sum(partialChecksum); err != nil {
- myLog.Println(0, "Error:", err)
- droppedFiles++
- // The hash part will be nil and the file will
- // be dropped in createHashFromSum() below.
- }
- }
-
- // Reparse size-grouped hashes and use the new hash information
- // to build lists of files with the same partial checksum.
- for size, sizeGroup := range data.sizeGroups {
- if size < minSizePartialChecksum {
- // We do not use MedSums for small files
+ // Let's de-dupe now...
+ for _, l := range hashes {
+ if len(l) < 2 {
continue
}
- hashes := make(map[string]FileObjList)
- createHashFromSum(partialChecksum, sizeGroup.files, &hashes)
-
- // Let's de-dupe now...
- for h, l := range hashes {
- if len(l) < 2 {
- droppedGroups++
- droppedFiles += len(l)
- delete(hashes, h)
- }
- }
-
- // Done, save the result
- data.sizeGroups[size].medsums = hashes
-
- // We remove the items from "files" since they are in the hash
- // tree now.
- sizeGroup.files = nil
-
- if len(hashes) == 0 { // We're done with this size group
- delete(data.sizeGroups, size)
- }
+ dupeList = append(dupeList, l)
+ // TODO sort by increasing size
+		myLog.Printf(5, " . found a group of %d duplicates\n", len(l))
}
- if droppedFiles > 0 {
- myLog.Printf(3, " Dropped %d files in %d groups\n",
- droppedFiles, droppedGroups)
- }
- return
+ return dupeList
}
-func (data *dataT) calculateChecksums() {
- var bigList FileObjList
- var droppedGroups int
- var droppedFiles int
+// TODO: refactor to avoid code duplication
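+// findDupesPartialChecksums groups files by a cheap partial checksum and
+// confirms each group of candidates with findDupesFullChecksums.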
+func findDupesPartialChecksums(fileList FileObjList) []FileObjList {
+ var dupeList []FileObjList
+ hashes := make(map[string]FileObjList)
- // Create a unique list of files to be fully checksummed
- for _, sizeGroup := range data.sizeGroups {
- // #1: small files
- bigList = append(bigList, sizeGroup.files...)
- // #2: files with same partial checksum
- for _, l := range sizeGroup.medsums {
- bigList = append(bigList, l...)
+	// Sort the list by inode for better efficiency when reading from disk
+ sort.Sort(ByInode(fileList))
+
+ // Compute partial checksums
+ for _, fo := range fileList {
+ if err := fo.Sum(partialChecksum); err != nil {
+ myLog.Println(0, "Error:", err)
+ continue
}
- }
-
- // Sort the list for better efficiency -- that's the whole point of
- // the unique big list.
- sort.Sort(ByInode(bigList))
-
- // Compute full checksums
- for _, fo := range bigList {
- if err := fo.Sum(fullChecksum); err != nil {
- myLog.Println(0, "Error:", err)
- droppedFiles++
- // The hash part will be nil and the file will
- // be dropped in createHashFromSum() below.
- }
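+		// Group files by their partial checksum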
+ hash := hex.EncodeToString(fo.PartialHash)
+ hashes[hash] = append(hashes[hash], fo)
}
- // Reparse size-grouped hashes and use the new hash information
- // to build lists of files with the same checksum.
- for size, sizeGroup := range data.sizeGroups {
- hashes := make(map[string]FileObjList)
- // #1: small files
- createHashFromSum(fullChecksum, sizeGroup.files, &hashes)
- // #2: files with same partial checksum
- for _, l := range sizeGroup.medsums {
- createHashFromSum(fullChecksum, l, &hashes)
+ // Let's de-dupe now...
+ for _, l := range hashes {
+ if len(l) < 2 {
+ continue
}
+ dupeList = append(dupeList, findDupesFullChecksums(l)...)
+ // TODO sort by increasing size
+ }
- // Let's de-dupe now...
- for h, l := range hashes {
- if len(l) < 2 {
- droppedGroups++
- droppedFiles += len(l)
- delete(hashes, h)
- }
- }
-
- // Done, save the result
- data.sizeGroups[size].fullsums = hashes
- data.sizeGroups[size].medsums = nil
- sizeGroup.files = nil
- }
- if droppedFiles > 0 {
- myLog.Printf(3, " Dropped %d files in %d groups\n",
- droppedFiles, droppedGroups)
- }
- return
+ return dupeList
}
func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
@@ -426,7 +333,6 @@
// Instead of this loop, another way would be to use the field
// "Unique" of the fileObj to mark them to be discarded
// and remove them all at the end.
- // TODO: what about symlinks?
for {
if !OSHasInodes() {
break
@@ -434,10 +340,8 @@
var hardLinkIndex int
fo := sizeGroup.files[0]
prevDev, prevIno := GetDevIno(fo)
- //fmt.Printf(" - FO: %#v\n", *fo)
for i, fo := range sizeGroup.files[1:] {
- //fmt.Printf(" - FO %d : %#v\n", i, *fo)
dev, ino := GetDevIno(fo)
if dev == prevDev && ino == prevIno {
hardLinkIndex = i + 1
@@ -471,6 +375,23 @@
return
}
+// findDupes uses checksums to find file duplicates.
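+// Unless skipPartial is set, big files are first compared by partial checksum.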
+func (data *dataT) findDupes(skipPartial bool) []FileObjList {
+ var dupeList []FileObjList
+
+ for size, sizeGroup := range data.sizeGroups {
+ var r []FileObjList
+ // We skip partial checksums for small files or if requested
+ if size > minSizePartialChecksum && !skipPartial {
+ r = findDupesPartialChecksums(sizeGroup.files)
+ } else {
+ r = findDupesFullChecksums(sizeGroup.files)
+ }
+ dupeList = append(dupeList, r...)
+ }
+ return dupeList
+}
+
func formatSize(sizeBytes uint64, short bool) string {
var units = map[int]string{
0: "B",
@@ -510,7 +431,7 @@
flag.BoolVar(&summary, "s", false, "See --summary")
flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
flag.IntVar(&myLog.verbosity, "verbosity", 0,
- "Set verbosity level (1-5)")
+ "Set verbosity level (1-6)")
flag.IntVar(&myLog.verbosity, "vl", 0, "See verbosity")
timings := flag.Bool("timings", false, "Set detailed log timings")
flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
@@ -557,40 +478,29 @@
myLog.Printf(1, " %d empty files were ignored\n",
emptyCount)
}
- data.dispCount()
+ data.dispCount() // XXX
myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
}
// Remove unique sizes
- myLog.Println(1, "* Removing files with unique size, sorting file lists...")
+ myLog.Println(1, "* Removing files with unique size and hard links...")
hardLinkCount, uniqueSizeCount := data.initialCleanup()
if verbose {
myLog.Printf(2, " Dropped %d files with unique size\n",
uniqueSizeCount)
myLog.Printf(2, " Dropped %d hard links\n", hardLinkCount)
myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
- data.dispCount()
+ data.dispCount() // XXX
}
- // Calculate medsums
- if !skipPartial {
- myLog.Println(1, "* Calculating partial checksums...")
- data.calculateMedSums()
- data.dispCount()
- myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
- }
-
- // Calculate checksums
- myLog.Println(1, "* Calculating checksums...")
- data.calculateChecksums()
- data.dispCount()
-
// Get list of dupes
+ myLog.Println(1, "* Computing checksums...")
var result []FileObjList
if len(data.emptyFiles) > 0 {
result = append(result, data.emptyFiles)
}
- result = append(result, data.filesWithSameHash()...)
+ result = append(result, data.findDupes(skipPartial)...)
+
myLog.Println(3, "* Number of match groups:", len(result))
// Done! Dump dupes
@@ -603,7 +513,7 @@
size := uint64(l[0].Size())
// We do not count the size of the 1st item
// so we get only duplicate size.
- dupeSize += size * uint64(len(l) - 1)
+ dupeSize += size * uint64(len(l)-1)
if !summary {
fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
len(l), formatSize(size, true))