Goduf - initial version-controlled revision
author	Mikael Berthe <mikael@lilotux.net>
date	Sat, 28 Jun 2014 14:03:08 +0200
changeset 0 a5642cd03cef
child 1 a8aa9e54bce4
.hgignore
TODO.md
goduf-byinode_fallback.go
goduf-byinode_unix.go
goduf.go
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,6 @@
+syntax: glob
+
+goduf
+goduf_*
+
+*.swp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TODO.md	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,7 @@
+# TODO:
+
+* Clean up, fix variable names
+* Add version number
+* Sort results?
+* Possibility to accept file list from stdin
+* Check Windows portability
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goduf-byinode_fallback.go	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,23 @@
+
+// +build plan9 windows
+
+package main
+
+import "os"
+
+// ByInode is a FileObjList type with a sort interface
+type ByInode FileObjList
+
+func (a ByInode) Len() int      { return len(a) }
+func (a ByInode) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByInode) Less(i, j int) bool {
+	return false // No device/inode information on this platform, so no meaningful ordering
+}
+
+func OSHasInodes() bool {
+	return false
+}
+
+func GetDevIno(fi os.FileInfo) (uint64, uint64) {
+	return 0, 0 // Not supported
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goduf-byinode_unix.go	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,37 @@
+
+// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+
+package main
+
+import "os"
+import "syscall"
+
+// ByInode is a FileObjList type with a sort interface
+type ByInode FileObjList
+
+func (a ByInode) Len() int      { return len(a) }
+func (a ByInode) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a ByInode) Less(i, j int) bool {
+	// Sort by device id first
+	iDevice := a[i].Sys().(*syscall.Stat_t).Dev
+	jDevice := a[j].Sys().(*syscall.Stat_t).Dev
+	switch {
+	case iDevice < jDevice:
+		return true
+	case iDevice > jDevice:
+		return false
+	}
+	iInode := a[i].Sys().(*syscall.Stat_t).Ino
+	jInode := a[j].Sys().(*syscall.Stat_t).Ino
+	return iInode < jInode
+}
+
+func OSHasInodes() bool {
+	return true
+}
+
+func GetDevIno(fi os.FileInfo) (uint64, uint64) {
+	dev := fi.Sys().(*syscall.Stat_t).Dev
+	ino := fi.Sys().(*syscall.Stat_t).Ino
+	return uint64(dev), uint64(ino)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goduf.go	Sat Jun 28 14:03:08 2014 +0200
@@ -0,0 +1,635 @@
+/*
+ * Copyright (C) 2014 Mikael Berthe <mikael@lilotux.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ */
+
+// This program (Goduf) is a fast duplicate file finder.
+// Use goduf --help to get the list of available options.
+
+package main
+
+import (
+	"crypto/sha1"
+	"encoding/hex"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path/filepath"
+	"sort"
+)
+
+const medsumBytes = 128
+const minSizePartialChecksum = 49152 // Should be > 3*medsumBytes
+
+type sumType int
+
+const (
+	fullChecksum sumType = iota
+	partialChecksum
+)
+
+type fileObj struct {
+	//Unique   bool
+	FilePath string
+	os.FileInfo
+	PartialHash []byte
+	Hash        []byte
+}
+
+type FileObjList []*fileObj
+
+type sizeClass struct {
+	files    FileObjList
+	medsums  map[string]FileObjList
+	fullsums map[string]FileObjList
+}
+
+type dataT struct {
+	totalSize   uint64
+	cmpt        uint
+	sizeGroups  map[int64]*sizeClass
+	emptyFiles  FileObjList
+	ignoreCount int
+}
+
+var data dataT
+
+type myLogT struct {
+	verbosity int
+}
+
+var myLog myLogT
+
+func (l *myLogT) Printf(level int, format string, args ...interface{}) {
+	if level > l.verbosity {
+		return
+	}
+	if level >= 0 {
+		log.Printf(format, args...)
+		return
+	}
+	// Error message without timestamp
+	fmt.Fprintf(os.Stderr, format, args...)
+}
+
+func (l *myLogT) Println(level int, args ...interface{}) {
+	if level > l.verbosity {
+		return
+	}
+	if level >= 0 {
+		log.Println(args...)
+		return
+	}
+	// Error message without timestamp
+	fmt.Fprintln(os.Stderr, args...)
+}
+
+func visit(path string, f os.FileInfo, err error) error {
+	if err != nil {
+		if f == nil {
+			return err
+		}
+		if f.IsDir() {
+			myLog.Println(-1, "Warning: cannot process directory:",
+				path)
+			return filepath.SkipDir
+		}
+
+		myLog.Println(-1, "Ignoring", path, "-", err)
+		data.ignoreCount++
+		return nil
+	}
+	if f.IsDir() {
+		return nil
+	}
+
+	if mode := f.Mode(); mode&os.ModeType != 0 {
+		if mode&os.ModeSymlink != 0 {
+			myLog.Println(5, "Ignoring symbolic link", path)
+		} else {
+			myLog.Println(0, "Ignoring special file", path)
+		}
+		data.ignoreCount++
+		return nil
+	}
+
+	data.cmpt++
+	data.totalSize += uint64(f.Size())
+	fo := &fileObj{FilePath: path, FileInfo: f}
+	if _, ok := data.sizeGroups[f.Size()]; !ok {
+		data.sizeGroups[f.Size()] = &sizeClass{}
+	}
+	data.sizeGroups[f.Size()].files =
+		append(data.sizeGroups[f.Size()].files, fo)
+	return nil
+}
+
+func (fo *fileObj) CheckSum() error {
+	file, err := os.Open(fo.FilePath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	hash := sha1.New()
+	if size, err := io.Copy(hash, file); size != fo.Size() || err != nil {
+		if err == nil {
+			return errors.New("failed to read the whole file: " +
+				fo.FilePath)
+		}
+		return err
+	}
+
+	fo.Hash = hash.Sum(nil)
+
+	return nil
+}
+
+func (fo *fileObj) MedSum() error {
+	file, err := os.Open(fo.FilePath)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	hash := sha1.New()
+	// XXX: duplicated code!
+	// BOF
+	if n, err := io.CopyN(hash, file, medsumBytes); n != medsumBytes || err != nil {
+		if err == nil {
+			return errors.New("failed to read bytes from file: " +
+				fo.FilePath)
+		}
+		return err
+	}
+	/*
+		// MOF
+		file.Seek((fo.Size()-medsumBytes)/2, 0)
+		if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
+			if err == nil {
+				return errors.New("failed to read bytes from file:" +
+					fo.FilePath)
+			}
+			return err
+		}
+	*/
+	// EOF
+	file.Seek(0-medsumBytes, 2) // whence 2: seek to medsumBytes before the end of the file
+	if n, err := io.CopyN(hash, file, medsumBytes); n != medsumBytes || err != nil {
+		if err == nil {
+			return errors.New("failed to read bytes from file: " +
+				fo.FilePath)
+		}
+		return err
+	}
+
+	fo.PartialHash = hash.Sum(nil)
+
+	return nil
+}
+
+func (fo *fileObj) Sum(sType sumType) error {
+	if sType == partialChecksum {
+		return fo.MedSum()
+	} else if sType == fullChecksum {
+		return fo.CheckSum()
+	}
+	panic("Internal error: Invalid sType")
+}
+
+func (data *dataT) dispCount() {
+	if myLog.verbosity < 4 {
+		return
+	}
+	var c1, c2, c3, c4 int
+	var s4 string
+	for _, sc := range data.sizeGroups {
+		c1 += len(sc.files)
+		for _, fol := range sc.medsums {
+			c2 += len(fol)
+		}
+		for _, fol := range sc.fullsums {
+			c3 += len(fol)
+		}
+	}
+	c4 = len(data.emptyFiles)
+	if c4 > 0 {
+		s4 = fmt.Sprintf("+%d", c4)
+	}
+	myLog.Printf(4, "  Current countdown: %d  [%d%s/%d/%d]\n",
+		c1+c2+c3+c4, c1, s4, c2, c3)
+}
+
+func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
+	for _, sizeGroup := range data.sizeGroups {
+		for _, l := range sizeGroup.fullsums {
+			hgroups = append(hgroups, l)
+		}
+	}
+	return
+}
+
+func createHashFromSum(sType sumType, folist FileObjList, hashes *map[string]FileObjList) {
+	for _, fo := range folist {
+		var hash string
+		var hbytes []byte
+		if sType == partialChecksum {
+			hbytes = fo.PartialHash
+		} else if sType == fullChecksum {
+			hbytes = fo.Hash
+		} else {
+			panic("Internal error: Invalid sType")
+		}
+		if hbytes == nil {
+			// Could happen if the file was not readable;
+			// we skip it (already shown as dropped)
+			continue
+		}
+		hash = hex.EncodeToString(hbytes)
+
+		(*hashes)[hash] = append((*hashes)[hash], fo)
+	}
+}
+
+func (data *dataT) calculateMedSums() {
+	var bigList FileObjList
+	var droppedGroups int
+	var droppedFiles int
+
+	// Create a unique list of files to be partially checksummed
+	for size, sizeGroup := range data.sizeGroups {
+		if size < minSizePartialChecksum {
+			// We do not use MedSums for small files
+			continue
+		}
+		bigList = append(bigList, sizeGroup.files...)
+	}
+
+	// Sort the list for better efficiency -- that's the whole point of
+	// the unique big list.
+	sort.Sort(ByInode(bigList))
+
+	// Compute partial checksums
+	for _, fo := range bigList {
+		if err := fo.Sum(partialChecksum); err != nil {
+			myLog.Println(0, "Error:", err)
+			droppedFiles++
+			// The hash part will be nil and the file will
+			// be dropped in createHashFromSum() below.
+		}
+	}
+
+	// Reparse size-grouped hashes and use the new hash information
+	// to build lists of files with the same partial checksum.
+	for size, sizeGroup := range data.sizeGroups {
+		if size < minSizePartialChecksum {
+			// We do not use MedSums for small files
+			continue
+		}
+		hashes := make(map[string]FileObjList)
+		createHashFromSum(partialChecksum, sizeGroup.files, &hashes)
+
+		// Let's de-dupe now...
+		for h, l := range hashes {
+			if len(l) < 2 {
+				droppedGroups++
+				droppedFiles += len(l)
+				delete(hashes, h)
+			}
+		}
+
+		// Done, save the result
+		data.sizeGroups[size].medsums = hashes
+
+		// We remove the items from "files" since they are in the hash
+		// tree now.
+		sizeGroup.files = nil
+
+		if len(hashes) == 0 { // We're done with this size group
+			delete(data.sizeGroups, size)
+		}
+	}
+
+	if droppedFiles > 0 {
+		myLog.Printf(3, "  Dropped %d files in %d groups\n",
+			droppedFiles, droppedGroups)
+	}
+	return
+}
+
+func (data *dataT) calculateChecksums() {
+	var bigList FileObjList
+	var droppedGroups int
+	var droppedFiles int
+
+	// Create a unique list of files to be fully checksummed
+	for _, sizeGroup := range data.sizeGroups {
+		// #1: small files
+		bigList = append(bigList, sizeGroup.files...)
+		// #2: files with same partial checksum
+		for _, l := range sizeGroup.medsums {
+			bigList = append(bigList, l...)
+		}
+	}
+
+	// Sort the list for better efficiency -- that's the whole point of
+	// the unique big list.
+	sort.Sort(ByInode(bigList))
+
+	// Compute full checksums
+	for _, fo := range bigList {
+		if err := fo.Sum(fullChecksum); err != nil {
+			myLog.Println(0, "Error:", err)
+			droppedFiles++
+			// The hash part will be nil and the file will
+			// be dropped in createHashFromSum() below.
+		}
+	}
+
+	// Reparse size-grouped hashes and use the new hash information
+	// to build lists of files with the same checksum.
+	for size, sizeGroup := range data.sizeGroups {
+		hashes := make(map[string]FileObjList)
+		// #1: small files
+		createHashFromSum(fullChecksum, sizeGroup.files, &hashes)
+		// #2: files with same partial checksum
+		for _, l := range sizeGroup.medsums {
+			createHashFromSum(fullChecksum, l, &hashes)
+		}
+
+		// Let's de-dupe now...
+		for h, l := range hashes {
+			if len(l) < 2 {
+				droppedGroups++
+				droppedFiles += len(l)
+				delete(hashes, h)
+			}
+		}
+
+		// Done, save the result
+		data.sizeGroups[size].fullsums = hashes
+		data.sizeGroups[size].medsums = nil
+		sizeGroup.files = nil
+	}
+	if droppedFiles > 0 {
+		myLog.Printf(3, "  Dropped %d files in %d groups\n",
+			droppedFiles, droppedGroups)
+	}
+	return
+}
+
+func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
+	sc, ok := data.sizeGroups[0]
+	if !ok {
+		return // no empty files
+	}
+	if !ignoreEmpty {
+		if len(sc.files) > 1 {
+			data.emptyFiles = sc.files
+		}
+		delete(data.sizeGroups, 0)
+		return
+	}
+	emptyCount = len(sc.files)
+	delete(data.sizeGroups, 0)
+	return
+}
+
+func (data *dataT) createSizeHash() (hardLinkCount, uniqueSizeCount int) {
+	for s, sizeGroup := range data.sizeGroups {
+		if len(sizeGroup.files) < 2 {
+			delete(data.sizeGroups, s)
+			uniqueSizeCount++
+			continue
+		}
+
+		var hardlinksFound bool
+
+		// Sort by device/inodes
+		sort.Sort(ByInode(sizeGroup.files))
+
+		// Check for hardlinks
+		// TODO: what about symlinks?
+		// Remove unique dev/inodes
+		// Instead of this loop, another way would be to use the field
+		// "Unique" of the fileObj to mark them to be discarded
+		// and remove them all at the end.
+		for {
+			if !OSHasInodes() {
+				break
+			}
+			var hardLinkIndex int
+			fo := sizeGroup.files[0]
+			prevDev, prevIno := GetDevIno(fo)
+			//fmt.Printf(" - FO: %#v\n", *fo)
+
+			for i, fo := range sizeGroup.files[1:] {
+				//fmt.Printf(" - FO %d : %#v\n", i, *fo)
+				dev, ino := GetDevIno(fo)
+				if dev == prevDev && ino == prevIno {
+					hardLinkIndex = i + 1
+					hardLinkCount++
+					hardlinksFound = true
+					break
+				}
+				prevDev = dev
+				prevIno = ino
+			}
+
+			if hardLinkIndex == 0 {
+				break
+			}
+			i := hardLinkIndex
+			// Remove the hard link
+			copy(sizeGroup.files[i:], sizeGroup.files[i+1:])
+			sizeGroup.files[len(sizeGroup.files)-1] = nil
+			sizeGroup.files = sizeGroup.files[:len(sizeGroup.files)-1]
+		}
+		// We have found hard links in this size group,
+		// maybe we can remove it
+		if hardlinksFound {
+			if len(sizeGroup.files) < 2 {
+				delete(data.sizeGroups, s)
+				uniqueSizeCount++
+				continue
+			}
+		}
+	}
+	return
+}
+
+func formatSize(sizeBytes uint64, short bool) string {
+	var units = map[int]string{
+		0: "B",
+		1: "KiB",
+		2: "MiB",
+		3: "GiB",
+		4: "TiB",
+		5: "PiB",
+	}
+	humanSize := sizeBytes
+	var n int
+	for n < len(units)-1 {
+		if humanSize < 10000 {
+			break
+		}
+		humanSize /= 1024
+		n++
+	}
+	if n < 1 {
+		return fmt.Sprintf("%d bytes", sizeBytes)
+	}
+	if short {
+		return fmt.Sprintf("%d %s", humanSize, units[n])
+	}
+	return fmt.Sprintf("%d bytes (%d %s)", sizeBytes, humanSize, units[n])
+}
+
+func main() {
+	var verbose bool
+	var summary bool
+	var skipPartial bool
+	var ignoreEmpty bool
+
+	flag.BoolVar(&verbose, "verbose", false, "Be verbose (verbosity=1)")
+	flag.BoolVar(&verbose, "v", false, "See --verbose")
+	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
+	flag.BoolVar(&summary, "s", false, "See --summary")
+	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
+	flag.IntVar(&myLog.verbosity, "verbosity", 0,
+		"Set verbosity level (1-5)")
+	flag.IntVar(&myLog.verbosity, "vl", 0, "See verbosity")
+	timings := flag.Bool("timings", false, "Set detailed log timings")
+	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")
+
+	flag.Parse()
+
+	if myLog.verbosity > 0 {
+		verbose = true
+	} else if verbose {
+		myLog.verbosity = 1
+	}
+
+	if len(flag.Args()) == 0 {
+		// TODO: more helpful usage statement
+		myLog.Println(-1, "Usage:", os.Args[0],
+			"[options] base_directory")
+		os.Exit(0)
+	}
+
+	if *timings {
+		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+	}
+
+	data.sizeGroups = make(map[int64]*sizeClass)
+	myLog.Println(1, "* Reading file metadata")
+
+	for _, root := range flag.Args() {
+		if err := filepath.Walk(root, visit); err != nil {
+			myLog.Printf(-1, "* Error: could not read file tree:\n")
+			myLog.Printf(-1, "> %v\n", err)
+			os.Exit(1)
+		}
+	}
+	emptyCount := data.dropEmptyFiles(ignoreEmpty)
+	if verbose {
+		if data.ignoreCount > 0 {
+			myLog.Printf(1, "  %d special files were ignored\n",
+				data.ignoreCount)
+		}
+		myLog.Println(2, "  Initial counter:", data.cmpt, "files")
+		myLog.Println(2, "  Total size:", formatSize(data.totalSize,
+			false))
+		if emptyCount > 0 {
+			myLog.Printf(1, "  %d empty files were ignored\n",
+				emptyCount)
+		}
+		data.dispCount()
+		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
+	}
+
+	// Remove unique sizes
+	myLog.Println(1, "* Removing files with unique size, sorting file lists...")
+	hardLinkCount, uniqueSizeCount := data.createSizeHash()
+	if verbose {
+		myLog.Printf(2, "  Dropped %d files with unique size\n",
+			uniqueSizeCount)
+		myLog.Printf(2, "  Dropped %d hard links\n", hardLinkCount)
+		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
+		data.dispCount()
+	}
+
+	// Calculate medsums
+	if !skipPartial {
+		myLog.Println(1, "* Calculating partial checksums...")
+		data.calculateMedSums()
+		data.dispCount()
+		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
+	}
+
+	// Calculate checksums
+	myLog.Println(1, "* Calculating checksums...")
+	data.calculateChecksums()
+	data.dispCount()
+
+	// Get list of dupes
+	var result []FileObjList
+	if len(data.emptyFiles) > 0 {
+		result = append(result, data.emptyFiles)
+	}
+	result = append(result, data.filesWithSameHash()...)
+	myLog.Println(3, "* Number of match groups:", len(result))
+
+	// Done!  Dump dupes
+	if len(result) > 0 && !summary {
+		myLog.Println(1, "* Dupes:")
+	}
+	var dupeSize uint64
+	data.cmpt = 0
+	for i, l := range result {
+		size := uint64(l[0].Size())
+		// We do not count the size of the 1st item
+		// so we get only duplicate size.
+		dupeSize += size * uint64(len(l)-1)
+		if !summary {
+			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
+				len(l), formatSize(size, true))
+		}
+		for _, f := range l {
+			if !summary {
+				fmt.Println(f.FilePath)
+			}
+			data.cmpt++
+		}
+	}
+	summaryLevel := 1 // Default verbosity for the summary line
+	if !summary {
+		// Add a trailing newline
+		if len(result) > 0 {
+			fmt.Println("")
+		}
+	} else {
+		// The summary is requested so we lower the verbosity level
+		summaryLevel = 0
+	}
+
+	myLog.Println(summaryLevel, "Final count:", data.cmpt,
+		"duplicate files in", len(result), "sets")
+	myLog.Println(summaryLevel, "Redundant data size:",
+		formatSize(dupeSize, false))
+}
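For reference, a minimal usage sketch for this revision (hypothetical paths; the flags are the ones registered in main() above, and the binary is assumed to be built with the standard Go toolchain, matching the goduf entry in .hgignore):

    go build -o goduf
    ./goduf -v --no-empty /some/photos /backup/photos

Each base directory given on the command line is walked recursively with filepath.Walk; files are grouped by size, then by partial and full SHA-1 checksums, and the resulting duplicate groups are printed unless --summary is requested.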