goduf.go
changeset 16 cc0ee80cf216
parent 15 4e3a67dc70a0
child 19 3389a17fc0d2
equal deleted inserted replaced
15:4e3a67dc70a0 16:cc0ee80cf216
    55 	needHash    sumType
    55 	needHash    sumType
    56 }
    56 }
    57 
    57 
    58 // FileObjList is only exported so that we can have a sort interface on inodes.
    58 // FileObjList is only exported so that we can have a sort interface on inodes.
    59 type FileObjList []*fileObj
    59 type FileObjList []*fileObj
       
    60 type foListList []FileObjList
    60 
    61 
    61 type dataT struct {
    62 type dataT struct {
    62 	totalSize   uint64
    63 	totalSize   uint64
    63 	cmpt        uint
    64 	cmpt        uint
    64 	sizeGroups  map[int64]*FileObjList
    65 	sizeGroups  map[int64]*FileObjList
   263 		if err := fo.Sum(fo.needHash); err != nil {
   264 		if err := fo.Sum(fo.needHash); err != nil {
   264 			myLog.Println(0, "Error:", err)
   265 			myLog.Println(0, "Error:", err)
   265 		}
   266 		}
   266 	}
   267 	}
   267 }
   268 }
   268 func computeSheduledChecksums(fileLists ...[]FileObjList) {
   269 func computeSheduledChecksums(fileLists ...foListList) {
   269 	var bigFileList FileObjList
   270 	var bigFileList FileObjList
   270 	// Merge the lists of FileObjList lists and create a unique list
   271 	// Merge the lists of FileObjList lists and create a unique list
   271 	// of file objects.
   272 	// of file objects.
   272 	for _, foll := range fileLists {
   273 	for _, foll := range fileLists {
   273 		for _, fol := range foll {
   274 		for _, fol := range foll {
   290 	for _, fo := range fileList {
   291 	for _, fo := range fileList {
   291 		fo.needHash = sType
   292 		fo.needHash = sType
   292 	}
   293 	}
   293 }
   294 }
   294 
   295 
   295 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
   296 func (fileList FileObjList) findDupesChecksums(sType sumType) foListList {
   296 	var dupeList []FileObjList
   297 	var dupeList foListList
   297 	var scheduleFull []FileObjList
   298 	var scheduleFull foListList
   298 	hashes := make(map[string]FileObjList)
   299 	hashes := make(map[string]FileObjList)
   299 
   300 
   300 	// Sort the list for better efficiency
   301 	// Sort the list for better efficiency
   301 	sort.Sort(ByInode(fileList))
   302 	sort.Sort(ByInode(fileList))
   302 
   303 
   332 
   333 
   333 	return dupeList
   334 	return dupeList
   334 }
   335 }
   335 
   336 
   336 // findDupes() uses checksums to find file duplicates
   337 // findDupes() uses checksums to find file duplicates
   337 func (data *dataT) findDupes(skipPartial bool) []FileObjList {
   338 func (data *dataT) findDupes(skipPartial bool) foListList {
   338 	var dupeList []FileObjList
   339 	var dupeList foListList
   339 	var schedulePartial []FileObjList
   340 	var schedulePartial foListList
   340 	var scheduleFull []FileObjList
   341 	var scheduleFull foListList
   341 
   342 
   342 	for size, sgListP := range data.sizeGroups {
   343 	for size, sgListP := range data.sizeGroups {
   343 		// We skip partial checksums for small files or if requested
   344 		// We skip partial checksums for small files or if requested
   344 		if size > minSizePartialChecksum && !skipPartial {
   345 		if size > minSizePartialChecksum && !skipPartial {
   345 			sgListP.scheduleChecksum(partialChecksum)
   346 			sgListP.scheduleChecksum(partialChecksum)
   543 		data.dispCount() // XXX
   544 		data.dispCount() // XXX
   544 	}
   545 	}
   545 
   546 
   546 	// Get list of dupes
   547 	// Get list of dupes
   547 	myLog.Println(1, "* Computing checksums...")
   548 	myLog.Println(1, "* Computing checksums...")
   548 	var result []FileObjList
   549 	var result foListList
   549 	if len(data.emptyFiles) > 0 {
   550 	if len(data.emptyFiles) > 0 {
   550 		result = append(result, data.emptyFiles)
   551 		result = append(result, data.emptyFiles)
   551 	}
   552 	}
   552 	result = append(result, data.findDupes(skipPartial)...)
   553 	result = append(result, data.findDupes(skipPartial)...)
   553 
   554 
   555 
   556 
   556 	// Done!  Dump dupes
   557 	// Done!  Dump dupes
   557 	if len(result) > 0 && !summary {
   558 	if len(result) > 0 && !summary {
   558 		myLog.Println(1, "* Dupes:")
   559 		myLog.Println(1, "* Dupes:")
   559 	}
   560 	}
   560 	// TODO: sort by increasing size
   561 	// Sort by increasing size (of the files, not groups)
       
   562 	sort.Sort(byGroupFileSize(result))
       
   563 
   561 	var dupeSize uint64
   564 	var dupeSize uint64
   562 	data.cmpt = 0
   565 	data.cmpt = 0
   563 	for i, l := range result {
   566 	for i, l := range result {
   564 		size := uint64(l[0].Size())
   567 		size := uint64(l[0].Size())
   565 		// We do not count the size of the 1st item
   568 		// We do not count the size of the 1st item
   567 		dupeSize += size * uint64(len(l)-1)
   570 		dupeSize += size * uint64(len(l)-1)
   568 		if !summary {
   571 		if !summary {
   569 			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
   572 			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
   570 				len(l), formatSize(size, true))
   573 				len(l), formatSize(size, true))
   571 		}
   574 		}
       
   575 		sort.Sort(byFilePathName(l))
   572 		for _, f := range l {
   576 		for _, f := range l {
   573 			if !summary {
   577 			if !summary {
   574 				fmt.Println(f.FilePath)
   578 				fmt.Println(f.FilePath)
   575 			}
   579 			}
   576 			data.cmpt++
   580 			data.cmpt++
   590 	myLog.Println(summaryLevel, "Final count:", data.cmpt,
   594 	myLog.Println(summaryLevel, "Final count:", data.cmpt,
   591 		"duplicate files in", len(result), "sets")
   595 		"duplicate files in", len(result), "sets")
   592 	myLog.Println(summaryLevel, "Redundant data size:",
   596 	myLog.Println(summaryLevel, "Redundant data size:",
   593 		formatSize(dupeSize, false))
   597 		formatSize(dupeSize, false))
   594 }
   598 }
       
   599 
       
   600 // Implement a sort interface for the list of duplicate groups
       
   601 type byGroupFileSize foListList
       
   602 
       
   603 func (a byGroupFileSize) Len() int      { return len(a) }
       
   604 func (a byGroupFileSize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
       
   605 func (a byGroupFileSize) Less(i, j int) bool {
       
   606 	// Since this is supposed to be used for duplicate lists,
       
   607 	// we use the size of the first file of the group.
       
   608 	return a[i][0].Size() < a[j][0].Size()
       
   609 }
       
   610 
       
   611 // Implement a sort interface for a slice of files
       
   612 type byFilePathName FileObjList
       
   613 
       
   614 func (a byFilePathName) Len() int      { return len(a) }
       
   615 func (a byFilePathName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
       
   616 func (a byFilePathName) Less(i, j int) bool {
       
   617 	return a[i].Name() < a[j].Name()
       
   618 }