goduf.go
changeset 20 f7ce9d750e83
parent 19 3389a17fc0d2
child 21 dee0e0c1ad10
equal deleted inserted replaced
19:3389a17fc0d2 20:f7ce9d750e83
    71 
    71 
    72 type myLogT struct {
    72 type myLogT struct {
    73 	verbosity int
    73 	verbosity int
    74 }
    74 }
    75 
    75 
       
    76 // Implement my own logger
    76 var myLog myLogT
    77 var myLog myLogT
    77 
    78 
    78 func (l *myLogT) Printf(level int, format string, args ...interface{}) {
    79 func (l *myLogT) Printf(level int, format string, args ...interface{}) {
    79 	if level > l.verbosity {
    80 	if level > l.verbosity {
    80 		return
    81 		return
    97 	}
    98 	}
    98 	// Error message without timestamp
    99 	// Error message without timestamp
    99 	fmt.Fprintln(os.Stderr, args...)
   100 	fmt.Fprintln(os.Stderr, args...)
   100 }
   101 }
   101 
   102 
       
   103 // visit is called for every file and directory.
       
   104 // We check the file object is correct (regular, readable...) and add
       
   105 // it to the data.sizeGroups hash.
   102 func visit(path string, f os.FileInfo, err error) error {
   106 func visit(path string, f os.FileInfo, err error) error {
   103 	if err != nil {
   107 	if err != nil {
   104 		if f == nil {
   108 		if f == nil {
   105 			return err
   109 			return err
   106 		}
   110 		}
   136 	}
   140 	}
   137 	*data.sizeGroups[f.Size()] = append(*data.sizeGroups[f.Size()], fo)
   141 	*data.sizeGroups[f.Size()] = append(*data.sizeGroups[f.Size()], fo)
   138 	return nil
   142 	return nil
   139 }
   143 }
   140 
   144 
   141 func (fo *fileObj) CheckSum() error {
   145 // Checksum computes the file's complete SHA1 hash.
       
   146 func (fo *fileObj) Checksum() error {
   142 	file, err := os.Open(fo.FilePath)
   147 	file, err := os.Open(fo.FilePath)
   143 	if err != nil {
   148 	if err != nil {
   144 		return err
   149 		return err
   145 	}
   150 	}
   146 	defer file.Close()
   151 	defer file.Close()
   156 	fo.Hash = hash.Sum(nil)
   161 	fo.Hash = hash.Sum(nil)
   157 
   162 
   158 	return nil
   163 	return nil
   159 }
   164 }
   160 
   165 
   161 func (fo *fileObj) MedSum() error {
   166 // partialChecksum computes the file's partial SHA1 hash (first and last bytes).
       
   167 func (fo *fileObj) partialChecksum() error {
   162 	file, err := os.Open(fo.FilePath)
   168 	file, err := os.Open(fo.FilePath)
   163 	if err != nil {
   169 	if err != nil {
   164 		return err
   170 		return err
   165 	}
   171 	}
   166 	defer file.Close()
   172 	defer file.Close()
   183 	fo.PartialHash = hash.Sum(nil)
   189 	fo.PartialHash = hash.Sum(nil)
   184 
   190 
   185 	return nil
   191 	return nil
   186 }
   192 }
   187 
   193 
       
   194 // Sum computes the file's SHA1 hash, partial or full according to sType.
   188 func (fo *fileObj) Sum(sType sumType) error {
   195 func (fo *fileObj) Sum(sType sumType) error {
   189 	if sType == partialChecksum {
   196 	if sType == partialChecksum {
   190 		return fo.MedSum()
   197 		return fo.partialChecksum()
   191 	} else if sType == fullChecksum {
   198 	} else if sType == fullChecksum {
   192 		return fo.CheckSum()
   199 		return fo.Checksum()
   193 	} else if sType == noChecksum {
   200 	} else if sType == noChecksum {
   194 		return nil
   201 		return nil
   195 	}
   202 	}
   196 	panic("Internal error: Invalid sType")
   203 	panic("Internal error: Invalid sType")
   197 }
   204 }
   198 
   205 
   199 func (data *dataT) dispCount() { // FIXME rather useless
   206 // dispCount displays statistics to the user.
       
   207 func (data *dataT) dispCount() { // Is this still useful?
   200 	if myLog.verbosity < 4 {
   208 	if myLog.verbosity < 4 {
   201 		return
   209 		return
   202 	}
   210 	}
   203 	var c1, c1b, c2 int
   211 	var c1, c1b, c2 int
   204 	var s1 string
   212 	var s1 string
   236 		}
   244 		}
   237 	}
   245 	}
   238 	return hex.EncodeToString(hbytes), nil
   246 	return hex.EncodeToString(hbytes), nil
   239 }
   247 }
   240 
   248 
   241 func (fileList FileObjList) computeSheduledChecksums() {
   249 // computeSheduledChecksums calculates the checksums for all the files
   242 	// Sort the list for better efficiency
   250 // from the fileLists slice items (the kind of hash is taken from the
   243 	sort.Sort(ByInode(fileList))
   251 // needHash field).
   244 
       
   245 	//myLog.Printf(6, "  . will compute %d checksums\n", len(fileList))
       
   246 
       
   247 	// Compute checksums
       
   248 	for _, fo := range fileList {
       
   249 		if err := fo.Sum(fo.needHash); err != nil {
       
   250 			myLog.Println(0, "Error:", err)
       
   251 		}
       
   252 	}
       
   253 }
       
   254 func computeSheduledChecksums(fileLists ...foListList) {
   252 func computeSheduledChecksums(fileLists ...foListList) {
   255 	var bigFileList FileObjList
   253 	var bigFileList FileObjList
   256 	// Merge the lists of FileObjList lists and create a unique list
   254 	// Merge the lists of FileObjList lists and create a unique list
   257 	// of file objects.
   255 	// of file objects.
   258 	for _, foll := range fileLists {
   256 	for _, foll := range fileLists {
   276 	for _, fo := range fileList {
   274 	for _, fo := range fileList {
   277 		fo.needHash = sType
   275 		fo.needHash = sType
   278 	}
   276 	}
   279 }
   277 }
   280 
   278 
       
   279 // findDupesChecksums splits the fileObj list into several lists with the
       
   280 // same sType hash.
   281 func (fileList FileObjList) findDupesChecksums(sType sumType) foListList {
   281 func (fileList FileObjList) findDupesChecksums(sType sumType) foListList {
   282 	var dupeList foListList
   282 	var dupeList foListList
   283 	var scheduleFull foListList
   283 	var scheduleFull foListList
   284 	hashes := make(map[string]FileObjList)
   284 	hashes := make(map[string]FileObjList)
   285 
   285 
   347 		dupeList = append(dupeList, r...)
   347 		dupeList = append(dupeList, r...)
   348 	}
   348 	}
   349 	return dupeList
   349 	return dupeList
   350 }
   350 }
   351 
   351 
       
   352 // dropEmptyFiles removes the empty files from the main map, since we don't
       
   353 // have to do any processing on them.
       
   354 // If ignoreEmpty is false, the empty file list is saved in data.emptyFiles.
   352 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
   355 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
   353 	sgListP, ok := data.sizeGroups[0]
   356 	sgListP, ok := data.sizeGroups[0]
   354 	if ok == false {
   357 	if ok == false {
   355 		return // no empty files
   358 		return // no empty files
   356 	}
   359 	}
   386 		// Check for hard links
   389 		// Check for hard links
   387 		// Remove unique dev/inodes
   390 		// Remove unique dev/inodes
   388 		// Instead of this loop, another way would be to use the field
   391 		// Instead of this loop, another way would be to use the field
   389 		// "Unique" of the fileObj to mark them to be discarded
   392 		// "Unique" of the fileObj to mark them to be discarded
   390 		// and remove them all at the end.
   393 		// and remove them all at the end.
       
   394 		// TODO: Should we also check for duplicate paths?
   391 		for {
   395 		for {
   392 			type devinode struct{ dev, ino uint64 }
   396 			type devinode struct{ dev, ino uint64 }
   393 			devinodes := make(map[devinode]bool)
   397 			devinodes := make(map[devinode]bool)
   394 			var hardLinkIndex int
   398 			var hardLinkIndex int
   395 
   399 
   426 		}
   430 		}
   427 	}
   431 	}
   428 	return
   432 	return
   429 }
   433 }
   430 
   434 
       
   435 // formatSize returns the size in a string with a human-readable format.
   431 func formatSize(sizeBytes uint64, short bool) string {
   436 func formatSize(sizeBytes uint64, short bool) string {
   432 	var units = map[int]string{
   437 	var units = map[int]string{
   433 		0: "B",
   438 		0: "B",
   434 		1: "KiB",
   439 		1: "KiB",
   435 		2: "MiB",
   440 		2: "MiB",
   453 		return fmt.Sprintf("%d %s", humanSize, units[n])
   458 		return fmt.Sprintf("%d %s", humanSize, units[n])
   454 	}
   459 	}
   455 	return fmt.Sprintf("%d bytes (%d %s)", sizeBytes, humanSize, units[n])
   460 	return fmt.Sprintf("%d bytes (%d %s)", sizeBytes, humanSize, units[n])
   456 }
   461 }
   457 
   462 
       
   463 // It all starts here.
   458 func main() {
   464 func main() {
   459 	var verbose bool
   465 	var verbose bool
   460 	var summary bool
   466 	var summary bool
   461 	var skipPartial bool
   467 	var skipPartial bool
   462 	var ignoreEmpty bool
   468 	var ignoreEmpty bool
   463 
   469 
       
   470 	// Command line parameters parsing
   464 	flag.BoolVar(&verbose, "verbose", false, "Be verbose (verbosity=1)")
   471 	flag.BoolVar(&verbose, "verbose", false, "Be verbose (verbosity=1)")
   465 	flag.BoolVar(&verbose, "v", false, "See --verbose")
   472 	flag.BoolVar(&verbose, "v", false, "See --verbose")
   466 	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
   473 	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
   467 	flag.BoolVar(&summary, "s", false, "See --summary")
   474 	flag.BoolVar(&summary, "s", false, "See --summary")
   468 	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
   475 	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
   472 	timings := flag.Bool("timings", false, "Set detailed log timings")
   479 	timings := flag.Bool("timings", false, "Set detailed log timings")
   473 	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
   480 	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
   474 
   481 
   475 	flag.Parse()
   482 	flag.Parse()
   476 
   483 
       
   484 	// Set verbosity: --verbose=true == --verbosity=1
   477 	if myLog.verbosity > 0 {
   485 	if myLog.verbosity > 0 {
   478 		verbose = true
   486 		verbose = true
   479 	} else if verbose == true {
   487 	} else if verbose == true {
   480 		myLog.verbosity = 1
   488 		myLog.verbosity = 1
   481 	}
   489 	}
   482 
   490 
   483 	if len(flag.Args()) == 0 {
   491 	if len(flag.Args()) == 0 {
   484 		// TODO: more helpful usage statement
   492 		// TODO: more helpful usage statement
   485 		myLog.Println(-1, "Usage:", os.Args[0],
   493 		myLog.Println(-1, "Usage:", os.Args[0],
   486 			"[options] base_directory")
   494 			"[options] base_directory|file...")
   487 		os.Exit(0)
   495 		os.Exit(0)
   488 	}
   496 	}
   489 
   497 
       
   498 	// Change log format for benchmarking
   490 	if *timings {
   499 	if *timings {
   491 		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
   500 		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
   492 	}
   501 	}
   493 
   502 
   494 	data.sizeGroups = make(map[int64]*FileObjList)
   503 	data.sizeGroups = make(map[int64]*FileObjList)
   499 			myLog.Printf(-1, "* Error: could not read file tree:\n")
   508 			myLog.Printf(-1, "* Error: could not read file tree:\n")
   500 			myLog.Printf(-1, "> %v\n", err)
   509 			myLog.Printf(-1, "> %v\n", err)
   501 			os.Exit(1)
   510 			os.Exit(1)
   502 		}
   511 		}
   503 	}
   512 	}
       
   513 
       
   514 	// Count empty files and drop them if they should be ignored
   504 	emptyCount := data.dropEmptyFiles(ignoreEmpty)
   515 	emptyCount := data.dropEmptyFiles(ignoreEmpty)
       
   516 
       
   517 	// Display a small report
   505 	if verbose {
   518 	if verbose {
   506 		if data.ignoreCount > 0 {
   519 		if data.ignoreCount > 0 {
   507 			myLog.Printf(1, "  %d special files were ignored\n",
   520 			myLog.Printf(1, "  %d special files were ignored\n",
   508 				data.ignoreCount)
   521 				data.ignoreCount)
   509 		}
   522 		}
   512 			false))
   525 			false))
   513 		if emptyCount > 0 {
   526 		if emptyCount > 0 {
   514 			myLog.Printf(1, "  %d empty files were ignored\n",
   527 			myLog.Printf(1, "  %d empty files were ignored\n",
   515 				emptyCount)
   528 				emptyCount)
   516 		}
   529 		}
   517 		data.dispCount() // XXX
   530 		data.dispCount()
   518 		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
   531 		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
   519 	}
   532 	}
   520 
   533 
   521 	// Remove unique sizes
   534 	// Remove unique sizes and hard links
   522 	myLog.Println(1, "* Removing files with unique size and hard links...")
   535 	myLog.Println(1, "* Removing files with unique size and hard links...")
   523 	hardLinkCount, uniqueSizeCount := data.initialCleanup()
   536 	hardLinkCount, uniqueSizeCount := data.initialCleanup()
   524 	if verbose {
   537 	if verbose {
   525 		myLog.Printf(2, "  Dropped %d files with unique size\n",
   538 		myLog.Printf(2, "  Dropped %d files with unique size\n",
   526 			uniqueSizeCount)
   539 			uniqueSizeCount)
   527 		myLog.Printf(2, "  Dropped %d hard links\n", hardLinkCount)
   540 		myLog.Printf(2, "  Dropped %d hard links\n", hardLinkCount)
   528 		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
   541 		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
   529 		data.dispCount() // XXX
   542 		data.dispCount()
   530 	}
   543 	}
   531 
   544 
   532 	// Get list of dupes
   545 	// Get the final list of dupes, using checksums
   533 	myLog.Println(1, "* Computing checksums...")
   546 	myLog.Println(1, "* Computing checksums...")
   534 	var result foListList
   547 	var result foListList
   535 	if len(data.emptyFiles) > 0 {
   548 	if len(data.emptyFiles) > 0 {
   536 		result = append(result, data.emptyFiles)
   549 		result = append(result, data.emptyFiles)
   537 	}
   550 	}