263 if err := fo.Sum(fo.needHash); err != nil { |
264 if err := fo.Sum(fo.needHash); err != nil { |
264 myLog.Println(0, "Error:", err) |
265 myLog.Println(0, "Error:", err) |
265 } |
266 } |
266 } |
267 } |
267 } |
268 } |
268 func computeSheduledChecksums(fileLists ...[]FileObjList) { |
269 func computeSheduledChecksums(fileLists ...foListList) { |
269 var bigFileList FileObjList |
270 var bigFileList FileObjList |
270 // Merge the lists of FileObjList lists and create a unique list |
271 // Merge the lists of FileObjList lists and create a unique list |
271 // of file objects. |
272 // of file objects. |
272 for _, foll := range fileLists { |
273 for _, foll := range fileLists { |
273 for _, fol := range foll { |
274 for _, fol := range foll { |
290 for _, fo := range fileList { |
291 for _, fo := range fileList { |
291 fo.needHash = sType |
292 fo.needHash = sType |
292 } |
293 } |
293 } |
294 } |
294 |
295 |
295 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList { |
296 func (fileList FileObjList) findDupesChecksums(sType sumType) foListList { |
296 var dupeList []FileObjList |
297 var dupeList foListList |
297 var scheduleFull []FileObjList |
298 var scheduleFull foListList |
298 hashes := make(map[string]FileObjList) |
299 hashes := make(map[string]FileObjList) |
299 |
300 |
300 // Sort the list for better efficiency |
301 // Sort the list for better efficiency |
301 sort.Sort(ByInode(fileList)) |
302 sort.Sort(ByInode(fileList)) |
302 |
303 |
332 |
333 |
333 return dupeList |
334 return dupeList |
334 } |
335 } |
335 |
336 |
336 // findDupes() uses checksums to find file duplicates |
337 // findDupes() uses checksums to find file duplicates |
337 func (data *dataT) findDupes(skipPartial bool) []FileObjList { |
338 func (data *dataT) findDupes(skipPartial bool) foListList { |
338 var dupeList []FileObjList |
339 var dupeList foListList |
339 var schedulePartial []FileObjList |
340 var schedulePartial foListList |
340 var scheduleFull []FileObjList |
341 var scheduleFull foListList |
341 |
342 |
342 for size, sgListP := range data.sizeGroups { |
343 for size, sgListP := range data.sizeGroups { |
343 // We skip partial checksums for small files or if requested |
344 // We skip partial checksums for small files or if requested |
344 if size > minSizePartialChecksum && !skipPartial { |
345 if size > minSizePartialChecksum && !skipPartial { |
345 sgListP.scheduleChecksum(partialChecksum) |
346 sgListP.scheduleChecksum(partialChecksum) |
543 data.dispCount() // XXX |
544 data.dispCount() // XXX |
544 } |
545 } |
545 |
546 |
546 // Get list of dupes |
547 // Get list of dupes |
547 myLog.Println(1, "* Computing checksums...") |
548 myLog.Println(1, "* Computing checksums...") |
548 var result []FileObjList |
549 var result foListList |
549 if len(data.emptyFiles) > 0 { |
550 if len(data.emptyFiles) > 0 { |
550 result = append(result, data.emptyFiles) |
551 result = append(result, data.emptyFiles) |
551 } |
552 } |
552 result = append(result, data.findDupes(skipPartial)...) |
553 result = append(result, data.findDupes(skipPartial)...) |
553 |
554 |
555 |
556 |
556 // Done! Dump dupes |
557 // Done! Dump dupes |
557 if len(result) > 0 && !summary { |
558 if len(result) > 0 && !summary { |
558 myLog.Println(1, "* Dupes:") |
559 myLog.Println(1, "* Dupes:") |
559 } |
560 } |
560 // TODO: sort by increasing size |
561 // Sort by increasing size (of the files, not groups) |
|
562 sort.Sort(byGroupFileSize(result)) |
|
563 |
561 var dupeSize uint64 |
564 var dupeSize uint64 |
562 data.cmpt = 0 |
565 data.cmpt = 0 |
563 for i, l := range result { |
566 for i, l := range result { |
564 size := uint64(l[0].Size()) |
567 size := uint64(l[0].Size()) |
565 // We do not count the size of the 1st item |
568 // We do not count the size of the 1st item |
567 dupeSize += size * uint64(len(l)-1) |
570 dupeSize += size * uint64(len(l)-1) |
568 if !summary { |
571 if !summary { |
569 fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1, |
572 fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1, |
570 len(l), formatSize(size, true)) |
573 len(l), formatSize(size, true)) |
571 } |
574 } |
|
575 sort.Sort(byFilePathName(l)) |
572 for _, f := range l { |
576 for _, f := range l { |
573 if !summary { |
577 if !summary { |
574 fmt.Println(f.FilePath) |
578 fmt.Println(f.FilePath) |
575 } |
579 } |
576 data.cmpt++ |
580 data.cmpt++ |
590 myLog.Println(summaryLevel, "Final count:", data.cmpt, |
594 myLog.Println(summaryLevel, "Final count:", data.cmpt, |
591 "duplicate files in", len(result), "sets") |
595 "duplicate files in", len(result), "sets") |
592 myLog.Println(summaryLevel, "Redundant data size:", |
596 myLog.Println(summaryLevel, "Redundant data size:", |
593 formatSize(dupeSize, false)) |
597 formatSize(dupeSize, false)) |
594 } |
598 } |
|
599 |
|
600 // Implement a sort interface for the list of duplicate groups |
|
601 type byGroupFileSize foListList |
|
602 |
|
603 func (a byGroupFileSize) Len() int { return len(a) } |
|
604 func (a byGroupFileSize) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
|
605 func (a byGroupFileSize) Less(i, j int) bool { |
|
606 // Since this is supposed to be used for duplicate lists, |
|
607 // we use the size of the first file of the group. |
|
608 return a[i][0].Size() < a[j][0].Size() |
|
609 } |
|
610 |
|
611 // Implement a sort interface for a slice of files |
|
612 type byFilePathName FileObjList |
|
613 |
|
614 func (a byFilePathName) Len() int { return len(a) } |
|
615 func (a byFilePathName) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
|
616 func (a byFilePathName) Less(i, j int) bool { |
|
617 return a[i].Name() < a[j].Name() |
|
618 } |