goduf.go
changeset 5:887c21c26cc8
parent    2:55098d552ae2
child     6:6740350569d3

--- goduf.go	4:bcc0975496bc
+++ goduf.go	5:887c21c26cc8
@@ -51,10 +51,11 @@
 	os.FileInfo
 	PartialHash []byte
 	Hash        []byte
 }
 
+// FileObjList is only exported so that we can have a sort interface on inodes.
 type FileObjList []*fileObj
 
 type sizeClass struct {
 	files    FileObjList
 	medsums  map[string]FileObjList
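
Reviewer note: the new comment explains why FileObjList is exported — the code sorts these lists by inode (see the sort.Sort(ByInode(...)) calls later in this diff). The ByInode implementation itself is not part of this changeset; a minimal sketch of such a sort.Interface wrapper, assuming the GetDevIno helper used further down, might look like this (hypothetical, not the repository's actual code):

type ByInode FileObjList

func (a ByInode) Len() int      { return len(a) }
func (a ByInode) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Ordering by (device, inode) groups hard links together and makes the
// later checksum reads roughly sequential on disk.
func (a ByInode) Less(i, j int) bool {
	iDev, iIno := GetDevIno(a[i]) // GetDevIno is assumed from this file
	jDev, jIno := GetDevIno(a[j])
	if iDev != jDev {
		return iDev < jDev
	}
	return iIno < jIno
}
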
@@ -120,11 +121,11 @@
 		return nil
 	}
 
 	if mode := f.Mode(); mode&os.ModeType != 0 {
 		if mode&os.ModeSymlink != 0 {
-			myLog.Println(5, "Ignoring symbolic link", path)
+			myLog.Println(6, "Ignoring symbolic link", path)
 		} else {
 			myLog.Println(0, "Ignoring special file", path)
 		}
 		data.ignoreCount++
 		return nil
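
Reviewer note: mode&os.ModeType is non-zero for anything that is not a regular file; the symlink bit is tested separately only so it can be logged more quietly (now at level 6). A standalone illustration of the same standard-library test:

package main

import (
	"fmt"
	"os"
)

func main() {
	// Lstat, not Stat, so a symlink is reported as a symlink rather
	// than being followed to its target.
	fi, err := os.Lstat("/tmp")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	switch mode := fi.Mode(); {
	case mode&os.ModeSymlink != 0:
		fmt.Println("symbolic link")
	case mode&os.ModeType != 0:
		fmt.Println("special file (directory, device, socket, ...)")
	default:
		fmt.Println("regular file")
	}
}
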
@@ -210,189 +211,95 @@
 		return fo.CheckSum()
 	}
 	panic("Internal error: Invalid sType")
 }
 
-func (data *dataT) dispCount() {
+func (data *dataT) dispCount() { // FIXME rather useless
 	if myLog.verbosity < 4 {
 		return
 	}
-	var c1, c2, c3, c4 int
-	var s4 string
+	var c1, c1b, c2 int
+	var s1 string
 	for _, sc := range data.sizeGroups {
 		c1 += len(sc.files)
-		for _, fol := range sc.medsums {
-			c2 += len(fol)
-		}
-		for _, fol := range sc.fullsums {
-			c3 += len(fol)
-		}
+		c2++
 	}
-	c4 = len(data.emptyFiles)
-	if c4 > 0 {
-		s4 = fmt.Sprintf("+%d", c4)
+	c1b = len(data.emptyFiles)
+	if c1b > 0 {
+		s1 = fmt.Sprintf("+%d", c1b)
 	}
-	myLog.Printf(4, "  Current countdown: %d  [%d%s/%d/%d]\n",
-		c1+c2+c3+c4, c1, s4, c2, c3)
+	myLog.Printf(4, "  Current countdown: %d  [%d%s/%d]\n",
+		c1+c1b, c1, s1, c2)
 }
 
 func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
 	for _, sizeGroup := range data.sizeGroups {
 		for _, l := range sizeGroup.fullsums {
 			hgroups = append(hgroups, l)
 		}
 	}
 	return
 }
 
-func createHashFromSum(sType sumType, folist FileObjList, hashes *map[string]FileObjList) {
-	for _, fo := range folist {
-		var hash string
-		var hbytes []byte
-		if sType == partialChecksum {
-			hbytes = fo.PartialHash
-		} else if sType == fullChecksum {
-			hbytes = fo.Hash
-		} else {
-			panic("Internal error: Invalid sType")
-		}
-		if hbytes == nil {
-			// Could happen if the file was not readable,
-			// We skip it (already shown as dropped)
-			continue
-		}
-		hash = hex.EncodeToString(hbytes)
-
-		(*hashes)[hash] = append((*hashes)[hash], fo)
-	}
-}
-
-func (data *dataT) calculateMedSums() {
-	var bigList FileObjList
-	var droppedGroups int
-	var droppedFiles int
-
-	// Create a unique list of files to be partially checksummed
-	for size, sizeGroup := range data.sizeGroups {
-		if size < minSizePartialChecksum {
-			// We do not use MedSums for small files
-			continue
-		}
-		bigList = append(bigList, sizeGroup.files...)
-	}
-
-	// Sort the list for better efficiency -- that's the whole point of
-	// the unique big list.
-	sort.Sort(ByInode(bigList))
-
-	// Compute partial checksums
-	for _, fo := range bigList {
-		if err := fo.Sum(partialChecksum); err != nil {
-			myLog.Println(0, "Error:", err)
-			droppedFiles++
-			// The hash part will be nil and the file will
-			// be dropped in createHashFromSum() below.
-		}
-	}
-
-	// Reparse size-grouped hashes and use the new hash information
-	// to build lists of files with the same partial checksum.
-	for size, sizeGroup := range data.sizeGroups {
-		if size < minSizePartialChecksum {
-			// We do not use MedSums for small files
-			continue
-		}
-		hashes := make(map[string]FileObjList)
-		createHashFromSum(partialChecksum, sizeGroup.files, &hashes)
-
-		// Let's de-dupe now...
-		for h, l := range hashes {
-			if len(l) < 2 {
-				droppedGroups++
-				droppedFiles += len(l)
-				delete(hashes, h)
-			}
-		}
-
-		// Done, save the result
-		data.sizeGroups[size].medsums = hashes
-
-		// We remove the items from "files" since they are in the hash
-		// tree now.
-		sizeGroup.files = nil
-
-		if len(hashes) == 0 { // We're done with this size group
-			delete(data.sizeGroups, size)
-		}
-	}
-
-	if droppedFiles > 0 {
-		myLog.Printf(3, "  Dropped %d files in %d groups\n",
-			droppedFiles, droppedGroups)
-	}
-	return
-}
-
-func (data *dataT) calculateChecksums() {
-	var bigList FileObjList
-	var droppedGroups int
-	var droppedFiles int
-
-	// Create a unique list of files to be fully checksummed
-	for _, sizeGroup := range data.sizeGroups {
-		// #1: small files
-		bigList = append(bigList, sizeGroup.files...)
-		// #2: files with same partial checksum
-		for _, l := range sizeGroup.medsums {
-			bigList = append(bigList, l...)
-		}
-	}
-
-	// Sort the list for better efficiency -- that's the whole point of
-	// the unique big list.
-	sort.Sort(ByInode(bigList))
-
-	// Compute full checksums
-	for _, fo := range bigList {
-		if err := fo.Sum(fullChecksum); err != nil {
-			myLog.Println(0, "Error:", err)
-			droppedFiles++
-			// The hash part will be nil and the file will
-			// be dropped in createHashFromSum() below.
-		}
-	}
-
-	// Reparse size-grouped hashes and use the new hash information
-	// to build lists of files with the same checksum.
-	for size, sizeGroup := range data.sizeGroups {
-		hashes := make(map[string]FileObjList)
-		// #1: small files
-		createHashFromSum(fullChecksum, sizeGroup.files, &hashes)
-		// #2: files with same partial checksum
-		for _, l := range sizeGroup.medsums {
-			createHashFromSum(fullChecksum, l, &hashes)
-		}
-
-		// Let's de-dupe now...
-		for h, l := range hashes {
-			if len(l) < 2 {
-				droppedGroups++
-				droppedFiles += len(l)
-				delete(hashes, h)
-			}
-		}
-
-		// Done, save the result
-		data.sizeGroups[size].fullsums = hashes
-		data.sizeGroups[size].medsums = nil
-		sizeGroup.files = nil
-	}
-	if droppedFiles > 0 {
-		myLog.Printf(3, "  Dropped %d files in %d groups\n",
-			droppedFiles, droppedGroups)
-	}
-	return
-}
+func findDupesFullChecksums(fileList FileObjList) []FileObjList {
+	var dupeList []FileObjList
+	hashes := make(map[string]FileObjList)
+
+	// Sort the list for better efficiency
+	sort.Sort(ByInode(fileList))
+
+	// Compute full checksums
+	for _, fo := range fileList {
+		if err := fo.Sum(fullChecksum); err != nil {
+			myLog.Println(0, "Error:", err)
+			continue
+		}
+		hash := hex.EncodeToString(fo.Hash)
+		hashes[hash] = append(hashes[hash], fo)
+	}
+
+	// Let's de-dupe now...
+	for _, l := range hashes {
+		if len(l) < 2 {
+			continue
+		}
+		dupeList = append(dupeList, l)
+		// TODO sort by increasing size
+		myLog.Printf(5, "  . found %d new duplicates\n", len(l))
+	}
+
+	return dupeList
+}
+
+// TODO: refactor to avoid code duplication
+func findDupesPartialChecksums(fileList FileObjList) []FileObjList {
+	var dupeList []FileObjList
+	hashes := make(map[string]FileObjList)
+
+	// Sort the list for better efficiency
+	sort.Sort(ByInode(fileList))
+
+	// Compute partial checksums
+	for _, fo := range fileList {
+		if err := fo.Sum(partialChecksum); err != nil {
+			myLog.Println(0, "Error:", err)
+			continue
+		}
+		hash := hex.EncodeToString(fo.PartialHash)
+		hashes[hash] = append(hashes[hash], fo)
+	}
+
+	// Let's de-dupe now...
+	for _, l := range hashes {
+		if len(l) < 2 {
+			continue
+		}
+		dupeList = append(dupeList, findDupesFullChecksums(l)...)
+		// TODO sort by increasing size
+	}
+
+	return dupeList
+}
 
 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
 	sc, ok := data.sizeGroups[0]
 	if ok == false {
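
Reviewer note: both the removed calculate*()/createHashFromSum() path and the new findDupes*Checksums() functions rely on the same core idiom — hex-encode each digest into a map key so files with identical content accumulate in the same bucket, then drop buckets with fewer than two entries. A self-contained sketch of that grouping step (SHA-1 is only a stand-in here; the actual checksum algorithm is defined elsewhere in the file):

package main

import (
	"crypto/sha1"
	"encoding/hex"
	"fmt"
)

func main() {
	contents := [][]byte{
		[]byte("same bytes"),
		[]byte("other bytes"),
		[]byte("same bytes"),
	}

	// Mirrors hashes[hash] = append(hashes[hash], fo) in the diff;
	// indices stand in for *fileObj here.
	groups := make(map[string][]int)
	for i, c := range contents {
		sum := sha1.Sum(c)
		key := hex.EncodeToString(sum[:])
		groups[key] = append(groups[key], i)
	}

	// A bucket with a single entry cannot be a duplicate group.
	for key, l := range groups {
		if len(l) < 2 {
			delete(groups, key) // deleting while ranging is safe in Go
			continue
		}
		fmt.Printf("%s... -> files %v\n", key[:8], l)
	}
}
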
@@ -424,22 +331,19 @@
 		// Check for hardlinks
 		// Remove unique dev/inodes
 		// Instead of this loop, another way would be to use the field
 		// "Unique" of the fileObj to mark them to be discarded
 		// and remove them all at the end.
-		// TODO: what about symlinks?
 		for {
 			if !OSHasInodes() {
 				break
 			}
 			var hardLinkIndex int
 			fo := sizeGroup.files[0]
 			prevDev, prevIno := GetDevIno(fo)
-			//fmt.Printf(" - FO: %#v\n", *fo)
 
 			for i, fo := range sizeGroup.files[1:] {
-				//fmt.Printf(" - FO %d : %#v\n", i, *fo)
 				dev, ino := GetDevIno(fo)
 				if dev == prevDev && ino == prevIno {
 					hardLinkIndex = i + 1
 					hardLinkCount++
 					hardlinksFound = true
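
Reviewer note: OSHasInodes() and GetDevIno() are platform helpers defined outside this changeset. On Unix-like systems the (device, inode) pair normally comes from the syscall-level stat structure; a rough sketch of what such a helper can look like (Unix-only, and the Stat_t field types vary across platforms):

package main

import (
	"fmt"
	"os"
	"syscall"
)

// devIno extracts the (device, inode) pair that identifies a file on a
// filesystem; two directory entries with the same pair are hard links.
func devIno(fi os.FileInfo) (dev, ino uint64, ok bool) {
	st, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return 0, 0, false // no Unix stat info available
	}
	return uint64(st.Dev), uint64(st.Ino), true
}

func main() {
	fi, err := os.Lstat("/etc/hosts")
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	if dev, ino, ok := devIno(fi); ok {
		fmt.Printf("dev=%d ino=%d\n", dev, ino)
	}
}
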
@@ -469,10 +373,27 @@
 		}
 	}
 	return
 }
 
+// findDupes() uses checksums to find file duplicates
+func (data *dataT) findDupes(skipPartial bool) []FileObjList {
+	var dupeList []FileObjList
+
+	for size, sizeGroup := range data.sizeGroups {
+		var r []FileObjList
+		// We skip partial checksums for small files or if requested
+		if size > minSizePartialChecksum && !skipPartial {
+			r = findDupesPartialChecksums(sizeGroup.files)
+		} else {
+			r = findDupesFullChecksums(sizeGroup.files)
+		}
+		dupeList = append(dupeList, r...)
+	}
+	return dupeList
+}
+
 func formatSize(sizeBytes uint64, short bool) string {
 	var units = map[int]string{
 		0: "B",
 		1: "KiB",
 		2: "MiB",
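
Reviewer note: formatSize() is truncated in this excerpt, but the units map shows the intent: binary (IEC) units. The usual shape of such a helper is to divide by 1024 until the value fits its unit; a sketch only, with the real function's short parameter (presumably a compact output form) omitted:

package main

import "fmt"

// iecSize renders a byte count using the B/KiB/MiB/... units from the
// diff's units map. Illustrative only; not the repository's formatSize.
func iecSize(sizeBytes uint64) string {
	units := []string{"B", "KiB", "MiB", "GiB", "TiB"}
	size := float64(sizeBytes)
	i := 0
	for size >= 1024 && i < len(units)-1 {
		size /= 1024
		i++
	}
	return fmt.Sprintf("%.1f %s", size, units[i])
}

func main() {
	fmt.Println(iecSize(5))         // 5.0 B
	fmt.Println(iecSize(123456789)) // 117.7 MiB
}
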
@@ -508,11 +429,11 @@
 	flag.BoolVar(&verbose, "v", false, "See --verbose")
 	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
 	flag.BoolVar(&summary, "s", false, "See --summary")
 	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
 	flag.IntVar(&myLog.verbosity, "verbosity", 0,
-		"Set verbosity level (1-5)")
+		"Set verbosity level (1-6)")
 	flag.IntVar(&myLog.verbosity, "vl", 0, "See verbosity")
 	timings := flag.Bool("timings", false, "Set detailed log timings")
 	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
 
 	flag.Parse()
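
Reviewer note: the verbosity range in the help text is bumped to (1-6) to match the symlink message moving to level 6 earlier in the diff. Also worth pointing out is the aliasing pattern used throughout this block: a long and a short flag name are registered against the same variable, so either spelling sets the same value. In isolation:

package main

import (
	"flag"
	"fmt"
)

func main() {
	var summary bool
	// Two flag names, one destination: either spelling sets the value,
	// mirroring the --summary / -s pair in the diff.
	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
	flag.BoolVar(&summary, "s", false, "See --summary")
	flag.Parse()
	fmt.Println("summary:", summary)
}
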
@@ -555,44 +476,33 @@
 			false))
 		if emptyCount > 0 {
 			myLog.Printf(1, "  %d empty files were ignored\n",
 				emptyCount)
 		}
-		data.dispCount()
+		data.dispCount() // XXX
 		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
 	}
 
 	// Remove unique sizes
-	myLog.Println(1, "* Removing files with unique size, sorting file lists...")
+	myLog.Println(1, "* Removing files with unique size and hard links...")
 	hardLinkCount, uniqueSizeCount := data.initialCleanup()
 	if verbose {
 		myLog.Printf(2, "  Dropped %d files with unique size\n",
 			uniqueSizeCount)
 		myLog.Printf(2, "  Dropped %d hard links\n", hardLinkCount)
 		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
-		data.dispCount()
+		data.dispCount() // XXX
 	}
-
-	// Calculate medsums
-	if !skipPartial {
-		myLog.Println(1, "* Calculating partial checksums...")
-		data.calculateMedSums()
-		data.dispCount()
-		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
-	}
-
-	// Calculate checksums
-	myLog.Println(1, "* Calculating checksums...")
-	data.calculateChecksums()
-	data.dispCount()
 
 	// Get list of dupes
+	myLog.Println(1, "* Computing checksums...")
 	var result []FileObjList
 	if len(data.emptyFiles) > 0 {
 		result = append(result, data.emptyFiles)
 	}
-	result = append(result, data.filesWithSameHash()...)
+	result = append(result, data.findDupes(skipPartial)...)
+
 	myLog.Println(3, "* Number of match groups:", len(result))
 
 	// Done!  Dump dupes
 	if len(result) > 0 && !summary {
 		myLog.Println(1, "* Dupes:")
@@ -601,11 +511,11 @@
 	data.cmpt = 0
 	for i, l := range result {
 		size := uint64(l[0].Size())
 		// We do not count the size of the 1st item
 		// so we get only duplicate size.
-		dupeSize += size * uint64(len(l) - 1)
+		dupeSize += size * uint64(len(l)-1)
 		if !summary {
 			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
 				len(l), formatSize(size, true))
 		}
 		for _, f := range l {
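
Reviewer note: the last change is a pure gofmt fix and does not alter the accounting: each group of n identical files of size s contributes (n-1)*s bytes to dupeSize, since one copy must be kept anyway. Worked in isolation with hypothetical group sizes:

package main

import "fmt"

func main() {
	// Two hypothetical duplicate groups: 4 x 10 MiB and 2 x 512 B.
	groups := []struct {
		files int
		size  uint64
	}{
		{4, 10 << 20},
		{2, 512},
	}
	var dupeSize uint64
	for _, g := range groups {
		// Skip the first file of each group, as the diff does.
		dupeSize += g.size * uint64(g.files-1)
	}
	fmt.Println(dupeSize) // 31457792 bytes = 30 MiB + 512 B
}
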