golang/mikael/goduf: goduf.go@55098d552ae2


/*
 * Copyright (C) 2014 Mikael Berthe <mikael@lilotux.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

// This program (Goduf) is a fast duplicate file finder.
// Use goduf --help to get the list of available options.

package main

import (
	"crypto/sha1"
	"encoding/hex"
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
)

const medsumBytes = 128
const minSizePartialChecksum = 49152 // Should be > 3*medsumBytes

type sumType int

const (
	fullChecksum sumType = iota
	partialChecksum
)

type fileObj struct {
	//Unique   bool
	FilePath string
	os.FileInfo
	PartialHash []byte
	Hash        []byte
}

type FileObjList []*fileObj

type sizeClass struct {
	files    FileObjList
	medsums  map[string]FileObjList
	fullsums map[string]FileObjList
}

type dataT struct {
	totalSize   uint64
	cmpt        uint
	sizeGroups  map[int64]*sizeClass
	emptyFiles  FileObjList
	ignoreCount int
}

var data dataT

type myLogT struct {
	verbosity int
}

var myLog myLogT

func (l *myLogT) Printf(level int, format string, args ...interface{}) {
	if level > l.verbosity {
		return
	}
	if level >= 0 {
		log.Printf(format, args...)
		return
	}
	// Error message without timestamp
	fmt.Fprintf(os.Stderr, format, args...)
}

func (l *myLogT) Println(level int, args ...interface{}) {
	if level > l.verbosity {
		return
	}
	if level >= 0 {
		log.Println(args...)
		return
	}
	// Error message without timestamp
	fmt.Fprintln(os.Stderr, args...)
}

func visit(path string, f os.FileInfo, err error) error {
	if err != nil {
		if f == nil {
			return err
		}
		if f.IsDir() {
			myLog.Println(-1, "Warning: cannot process directory:",
				path)
			return filepath.SkipDir
		}

		myLog.Println(-1, "Ignoring ", path, " - ", err)
		data.ignoreCount++
		return nil
	}
	if f.IsDir() {
		return nil
	}

	if mode := f.Mode(); mode&os.ModeType != 0 {
		if mode&os.ModeSymlink != 0 {
			myLog.Println(5, "Ignoring symbolic link", path)
		} else {
			myLog.Println(0, "Ignoring special file", path)
		}
		data.ignoreCount++
		return nil
	}

	data.cmpt++
	data.totalSize += uint64(f.Size())
	fo := &fileObj{FilePath: path, FileInfo: f}
	if _, ok := data.sizeGroups[f.Size()]; !ok {
		data.sizeGroups[f.Size()] = &sizeClass{}
	}
	data.sizeGroups[f.Size()].files =
		append(data.sizeGroups[f.Size()].files, fo)
	return nil
}

func (fo *fileObj) CheckSum() error {
	file, err := os.Open(fo.FilePath)
	if err != nil {
		return err
	}
	defer file.Close()
	hash := sha1.New()
	if size, err := io.Copy(hash, file); size != fo.Size() || err != nil {
		if err == nil {
			return errors.New("failed to read the whole file: " +
				fo.FilePath)
		}
		return err
	}

	fo.Hash = hash.Sum(nil)

	return nil
}

func (fo *fileObj) MedSum() error {
	file, err := os.Open(fo.FilePath)
	if err != nil {
		return err
	}
	defer file.Close()
	hash := sha1.New()
	// XXX: duplicated code!
	// BOF
	if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
		if err == nil {
			return errors.New("failed to read bytes from file:" +
				fo.FilePath)
		}
		return err
	}
	/*
		// MOF
		file.Seek((fo.Size()-medsumBytes)/2, 0)
		if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
			if err == nil {
				return errors.New("failed to read bytes from file:" +
					fo.FilePath)
			}
			return err
		}
	*/
	// EOF
	file.Seek(0-medsumBytes, 2)
	if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
		if err == nil {
			return errors.New("failed to read bytes from file:" +
				fo.FilePath)
		}
		return err
	}

	fo.PartialHash = hash.Sum(nil)

	return nil
}

func (fo *fileObj) Sum(sType sumType) error {
	if sType == partialChecksum {
		return fo.MedSum()
	} else if sType == fullChecksum {
		return fo.CheckSum()
	}
	panic("Internal error: Invalid sType")
}

func (data *dataT) dispCount() {
	if myLog.verbosity < 4 {
		return
	}
	var c1, c2, c3, c4 int
	var s4 string
	for _, sc := range data.sizeGroups {
		c1 += len(sc.files)
		for _, fol := range sc.medsums {
			c2 += len(fol)
		}
		for _, fol := range sc.fullsums {
			c3 += len(fol)
		}
	}
	c4 = len(data.emptyFiles)
	if c4 > 0 {
		s4 = fmt.Sprintf("+%d", c4)
	}
	myLog.Printf(4, "  Current countdown: %d  [%d%s/%d/%d]\n",
		c1+c2+c3+c4, c1, s4, c2, c3)
}

func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
	for _, sizeGroup := range data.sizeGroups {
		for _, l := range sizeGroup.fullsums {
			hgroups = append(hgroups, l)
		}
	}
	return
}

func createHashFromSum(sType sumType, folist FileObjList, hashes *map[string]FileObjList) {
	for _, fo := range folist {
		var hash string
		var hbytes []byte
		if sType == partialChecksum {
			hbytes = fo.PartialHash
		} else if sType == fullChecksum {
			hbytes = fo.Hash
		} else {
			panic("Internal error: Invalid sType")
		}
		if hbytes == nil {
			// Could happen if the file was not readable,
			// We skip it (already shown as dropped)
			continue
		}
		hash = hex.EncodeToString(hbytes)

		(*hashes)[hash] = append((*hashes)[hash], fo)
	}
}

func (data *dataT) calculateMedSums() {
	var bigList FileObjList
	var droppedGroups int
	var droppedFiles int

	// Create a unique list of files to be partially checksummed
	for size, sizeGroup := range data.sizeGroups {
		if size < minSizePartialChecksum {
			// We do not use MedSums for small files
			continue
		}
		bigList = append(bigList, sizeGroup.files...)
	}

	// Sort the list for better efficiency -- that's the whole point of
	// the unique big list.
	sort.Sort(ByInode(bigList))

	// Compute partial checksums
	for _, fo := range bigList {
		if err := fo.Sum(partialChecksum); err != nil {
			myLog.Println(0, "Error:", err)
			droppedFiles++
			// The hash part will be nil and the file will
			// be dropped in createHashFromSum() below.
		}
	}

	// Reparse size-grouped hashes and use the new hash information
	// to build lists of files with the same partial checksum.
	for size, sizeGroup := range data.sizeGroups {
		if size < minSizePartialChecksum {
			// We do not use MedSums for small files
			continue
		}
		hashes := make(map[string]FileObjList)
		createHashFromSum(partialChecksum, sizeGroup.files, &hashes)

		// Let's de-dupe now...
		for h, l := range hashes {
			if len(l) < 2 {
				droppedGroups++
				droppedFiles += len(l)
				delete(hashes, h)
			}
		}

		// Done, save the result
		data.sizeGroups[size].medsums = hashes

		// We remove the items from "files" since they are in the hash
		// tree now.
		sizeGroup.files = nil

		if len(hashes) == 0 { // We're done with this size group
			delete(data.sizeGroups, size)
		}
	}

	if droppedFiles > 0 {
		myLog.Printf(3, "  Dropped %d files in %d groups\n",
			droppedFiles, droppedGroups)
	}
	return
}

func (data *dataT) calculateChecksums() {
	var bigList FileObjList
	var droppedGroups int
	var droppedFiles int

	// Create a unique list of files to be fully checksummed
	for _, sizeGroup := range data.sizeGroups {
		// #1: small files
		bigList = append(bigList, sizeGroup.files...)
		// #2: files with same partial checksum
		for _, l := range sizeGroup.medsums {
			bigList = append(bigList, l...)
		}
	}

	// Sort the list for better efficiency -- that's the whole point of
	// the unique big list.
	sort.Sort(ByInode(bigList))

	// Compute full checksums
	for _, fo := range bigList {
		if err := fo.Sum(fullChecksum); err != nil {
			myLog.Println(0, "Error:", err)
			droppedFiles++
			// The hash part will be nil and the file will
			// be dropped in createHashFromSum() below.
		}
	}

	// Reparse size-grouped hashes and use the new hash information
	// to build lists of files with the same checksum.
	for size, sizeGroup := range data.sizeGroups {
		hashes := make(map[string]FileObjList)
		// #1: small files
		createHashFromSum(fullChecksum, sizeGroup.files, &hashes)
		// #2: files with same partial checksum
		for _, l := range sizeGroup.medsums {
			createHashFromSum(fullChecksum, l, &hashes)
		}

		// Let's de-dupe now...
		for h, l := range hashes {
			if len(l) < 2 {
				droppedGroups++
				droppedFiles += len(l)
				delete(hashes, h)
			}
		}

		// Done, save the result
		data.sizeGroups[size].fullsums = hashes
		data.sizeGroups[size].medsums = nil
		sizeGroup.files = nil
	}
	if droppedFiles > 0 {
		myLog.Printf(3, "  Dropped %d files in %d groups\n",
			droppedFiles, droppedGroups)
	}
	return
}

func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
	sc, ok := data.sizeGroups[0]
	if ok == false {
		return // no empty files
	}
	if !ignoreEmpty {
		if len(sc.files) > 1 {
			data.emptyFiles = sc.files
		}
		delete(data.sizeGroups, 0)
		return
	}
	emptyCount = len(sc.files)
	delete(data.sizeGroups, 0)
	return
}

// initialCleanup() removes files with unique size as well as hard links
func (data *dataT) initialCleanup() (hardLinkCount, uniqueSizeCount int) {
	for s, sizeGroup := range data.sizeGroups {
		if len(sizeGroup.files) < 2 {
			delete(data.sizeGroups, s)
			uniqueSizeCount++
			continue
		}

		var hardlinksFound bool

		// Check for hardlinks
		// Remove unique dev/inodes
		// Instead of this loop, another way would be to use the field
		// "Unique" of the fileObj to mark them to be discarded
		// and remove them all at the end.
		// TODO: what about symlinks?
		for {
			if !OSHasInodes() {
				break
			}
			var hardLinkIndex int
			fo := sizeGroup.files[0]
			prevDev, prevIno := GetDevIno(fo)
			//fmt.Printf(" - FO: %#v\n", *fo)

			for i, fo := range sizeGroup.files[1:] {
				//fmt.Printf(" - FO %d : %#v\n", i, *fo)
				dev, ino := GetDevIno(fo)
				if dev == prevDev && ino == prevIno {
					hardLinkIndex = i + 1
					hardLinkCount++
					hardlinksFound = true
					break
				}
				prevDev = dev
				prevIno = ino
			}

			if hardLinkIndex == 0 {
				break
			}
			i := hardLinkIndex
			// Remove hardink
			copy(sizeGroup.files[i:], sizeGroup.files[i+1:])
			sizeGroup.files[len(sizeGroup.files)-1] = nil
			sizeGroup.files = sizeGroup.files[:len(sizeGroup.files)-1]
		}
		// We have found hard links in this size group,
		// maybe we can remove it
		if hardlinksFound {
			if len(sizeGroup.files) < 2 {
				delete(data.sizeGroups, s)
				uniqueSizeCount++
				continue
			}
		}
	}
	return
}

func formatSize(sizeBytes uint64, short bool) string {
	var units = map[int]string{
		0: "B",
		1: "KiB",
		2: "MiB",
		3: "GiB",
		4: "TiB",
		5: "PiB",
	}
	humanSize := sizeBytes
	var n int
	for n < len(units)-1 {
		if humanSize < 10000 {
			break
		}
		humanSize /= 1024
		n++
	}
	if n < 1 {
		return fmt.Sprintf("%d bytes", sizeBytes)
	}
	if short {
		return fmt.Sprintf("%d %s", humanSize, units[n])
	}
	return fmt.Sprintf("%d bytes (%d %s)", sizeBytes, humanSize, units[n])
}

func main() {
	var verbose bool
	var summary bool
	var skipPartial bool
	var ignoreEmpty bool

	flag.BoolVar(&verbose, "verbose", false, "Be verbose (verbosity=1)")
	flag.BoolVar(&verbose, "v", false, "See --verbose")
	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
	flag.BoolVar(&summary, "s", false, "See --summary")
	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
	flag.IntVar(&myLog.verbosity, "verbosity", 0,
		"Set verbosity level (1-5)")
	flag.IntVar(&myLog.verbosity, "vl", 0, "See verbosity")
	timings := flag.Bool("timings", false, "Set detailed log timings")
	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")

	flag.Parse()

	if myLog.verbosity > 0 {
		verbose = true
	} else if verbose == true {
		myLog.verbosity = 1
	}

	if len(flag.Args()) == 0 {
		// TODO: more helpful usage statement
		myLog.Println(-1, "Usage:", os.Args[0],
			"[options] base_directory")
		os.Exit(0)
	}

	if *timings {
		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
	}

	data.sizeGroups = make(map[int64]*sizeClass)
	myLog.Println(1, "* Reading file metadata")

	for _, root := range flag.Args() {
		if err := filepath.Walk(root, visit); err != nil {
			myLog.Printf(-1, "* Error: could not read file tree:\n")
			myLog.Printf(-1, "> %v\n", err)
			os.Exit(1)
		}
	}
	emptyCount := data.dropEmptyFiles(ignoreEmpty)
	if verbose {
		if data.ignoreCount > 0 {
			myLog.Printf(1, "  %d special files were ignored\n",
				data.ignoreCount)
		}
		myLog.Println(2, "  Initial counter:", data.cmpt, "files")
		myLog.Println(2, "  Total size:", formatSize(data.totalSize,
			false))
		if emptyCount > 0 {
			myLog.Printf(1, "  %d empty files were ignored\n",
				emptyCount)
		}
		data.dispCount()
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
	}

	// Remove unique sizes
	myLog.Println(1, "* Removing files with unique size, sorting file lists...")
	hardLinkCount, uniqueSizeCount := data.initialCleanup()
	if verbose {
		myLog.Printf(2, "  Dropped %d files with unique size\n",
			uniqueSizeCount)
		myLog.Printf(2, "  Dropped %d hard links\n", hardLinkCount)
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
		data.dispCount()
	}

	// Calculate medsums
	if !skipPartial {
		myLog.Println(1, "* Calculating partial checksums...")
		data.calculateMedSums()
		data.dispCount()
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
	}

	// Calculate checksums
	myLog.Println(1, "* Calculating checksums...")
	data.calculateChecksums()
	data.dispCount()

	// Get list of dupes
	var result []FileObjList
	if len(data.emptyFiles) > 0 {
		result = append(result, data.emptyFiles)
	}
	result = append(result, data.filesWithSameHash()...)
	myLog.Println(3, "* Number of match groups:", len(result))

	// Done!  Dump dupes
	if len(result) > 0 && !summary {
		myLog.Println(1, "* Dupes:")
	}
	var dupeSize uint64
	data.cmpt = 0
	for i, l := range result {
		size := uint64(l[0].Size())
		// We do not count the size of the 1st item
		// so we get only duplicate size.
		dupeSize += size * uint64(len(l) - 1)
		if !summary {
			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
				len(l), formatSize(size, true))
		}
		for _, f := range l {
			if !summary {
				fmt.Println(f.FilePath)
			}
			data.cmpt++
		}
	}
	summaryLevel := 1 // Default verbosity for the summary line
	if summary == false {
		// Add a trailing newline
		if len(result) > 0 {
			fmt.Println("")
		}
	} else {
		// The summary is requested so we lower the verbosity level
		summaryLevel = 0
	}

	myLog.Println(summaryLevel, "Final count:", data.cmpt,
		"duplicate files in", len(result), "sets")
	myLog.Println(summaryLevel, "Redundant data size:",
		formatSize(dupeSize, false))
}
author	Mikael Berthe <mikael@lilotux.net>
	Sat, 28 Jun 2014 18:28:09 +0200
changeset 2	55098d552ae2
parent 1	a8aa9e54bce4
child 5	887c21c26cc8
permissions	-rw-r--r--