/*
 * Copyright (C) 2014 Mikael Berthe <mikael@lilotux.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 */

// This program (Goduf) is a fast duplicate file finder.
// Use goduf --help to get the list of available options.
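// Example invocation (hypothetical paths):
//     goduf -v /path/to/dir1 /path/to/dir2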
package main

import (
	"crypto/sha1"
	"encoding/hex"
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
)

const medsumBytes = 128
const minSizePartialChecksum = 49152 // Should be > 3*medsumBytes
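// A file can be identified by a partial checksum (hashing only its first and
// last medsumBytes bytes, used for files of at least minSizePartialChecksum
// bytes) or by a full checksum of its whole content.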
type sumType int

const (
	fullChecksum sumType = iota
	partialChecksum
)

type fileObj struct {
	//Unique bool
	FilePath string
	os.FileInfo
	PartialHash []byte
	Hash        []byte
}

type FileObjList []*fileObj

type sizeClass struct {
	files    FileObjList
	medsums  map[string]FileObjList
	fullsums map[string]FileObjList
}

type dataT struct {
	totalSize   uint64
	cmpt        uint
	sizeGroups  map[int64]*sizeClass
	emptyFiles  FileObjList
	ignoreCount int
}

var data dataT
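// myLogT is a simple logger with a verbosity threshold; messages with a
// negative level are error messages, written to stderr without a timestamp.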
type myLogT struct {
	verbosity int
}

var myLog myLogT

func (l *myLogT) Printf(level int, format string, args ...interface{}) {
	if level > l.verbosity {
		return
	}
	if level >= 0 {
		log.Printf(format, args...)
		return
	}
	// Error message without timestamp
	fmt.Fprintf(os.Stderr, format, args...)
}

func (l *myLogT) Println(level int, args ...interface{}) {
	if level > l.verbosity {
		return
	}
	if level >= 0 {
		log.Println(args...)
		return
	}
	// Error message without timestamp
	fmt.Fprintln(os.Stderr, args...)
}
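// visit is the filepath.Walk callback: it reports unreadable entries, skips
// directories and special files, and records every regular file into the
// size group matching its file size.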
func visit(path string, f os.FileInfo, err error) error {
	if err != nil {
		if f == nil {
			return err
		}
		if f.IsDir() {
			myLog.Println(-1, "Warning: cannot process directory:",
				path)
			return filepath.SkipDir
		}

		myLog.Println(-1, "Ignoring", path, "-", err)
		data.ignoreCount++
		return nil
	}
	if f.IsDir() {
		return nil
	}

	if mode := f.Mode(); mode&os.ModeType != 0 {
		if mode&os.ModeSymlink != 0 {
			myLog.Println(5, "Ignoring symbolic link", path)
		} else {
			myLog.Println(0, "Ignoring special file", path)
		}
		data.ignoreCount++
		return nil
	}

	data.cmpt++
	data.totalSize += uint64(f.Size())
	fo := &fileObj{FilePath: path, FileInfo: f}
	if _, ok := data.sizeGroups[f.Size()]; !ok {
		data.sizeGroups[f.Size()] = &sizeClass{}
	}
	data.sizeGroups[f.Size()].files =
		append(data.sizeGroups[f.Size()].files, fo)
	return nil
}
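// CheckSum computes the SHA-1 checksum of the whole file.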
func (fo *fileObj) CheckSum() error {
	file, err := os.Open(fo.FilePath)
	if err != nil {
		return err
	}
	defer file.Close()
	hash := sha1.New()
	if size, err := io.Copy(hash, file); size != fo.Size() || err != nil {
		if err == nil {
			return errors.New("failed to read the whole file: " +
				fo.FilePath)
		}
		return err
	}

	fo.Hash = hash.Sum(nil)

	return nil
}
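// MedSum computes a partial SHA-1 checksum of the file, hashing only its
// first and last medsumBytes bytes.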
func (fo *fileObj) MedSum() error {
	file, err := os.Open(fo.FilePath)
	if err != nil {
		return err
	}
	defer file.Close()
	hash := sha1.New()
	// BOF: hash the first medsumBytes bytes of the file.
	// Note: io.CopyN returns a non-nil error if it copies fewer bytes
	// than requested, so a plain error check is sufficient here.
	if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
		return err
	}
	/*
		// MOF: hash medsumBytes bytes from the middle of the file
		file.Seek((fo.Size()-medsumBytes)/2, 0)
		if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
			return err
		}
	*/
	// EOF: hash the last medsumBytes bytes of the file
	file.Seek(0-medsumBytes, 2)
	if _, err := io.CopyN(hash, file, medsumBytes); err != nil {
		return err
	}

	fo.PartialHash = hash.Sum(nil)

	return nil
}
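// Sum computes the file checksum of the requested type (partial or full).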
func (fo *fileObj) Sum(sType sumType) error {
	if sType == partialChecksum {
		return fo.MedSum()
	} else if sType == fullChecksum {
		return fo.CheckSum()
	}
	panic("Internal error: Invalid sType")
}
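// dispCount reports how many files are still being considered at the
// current stage; it only displays something at verbosity level 4 and above.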
func (data *dataT) dispCount() {
	if myLog.verbosity < 4 {
		return
	}
	var c1, c2, c3, c4 int
	var s4 string
	for _, sc := range data.sizeGroups {
		c1 += len(sc.files)
		for _, fol := range sc.medsums {
			c2 += len(fol)
		}
		for _, fol := range sc.fullsums {
			c3 += len(fol)
		}
	}
	c4 = len(data.emptyFiles)
	if c4 > 0 {
		s4 = fmt.Sprintf("+%d", c4)
	}
	myLog.Printf(4, " Current countdown: %d [%d%s/%d/%d]\n",
		c1+c2+c3+c4, c1, s4, c2, c3)
}
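// filesWithSameHash returns the lists of files sharing the same full checksum.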
func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
	for _, sizeGroup := range data.sizeGroups {
		for _, l := range sizeGroup.fullsums {
			hgroups = append(hgroups, l)
		}
	}
	return
}
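// createHashFromSum groups the files of folist by checksum (partial or full,
// depending on sType), appending them to the hashes map keyed by the
// hexadecimal checksum string.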
func createHashFromSum(sType sumType, folist FileObjList, hashes *map[string]FileObjList) {
	for _, fo := range folist {
		var hash string
		var hbytes []byte
		if sType == partialChecksum {
			hbytes = fo.PartialHash
		} else if sType == fullChecksum {
			hbytes = fo.Hash
		} else {
			panic("Internal error: Invalid sType")
		}
		if hbytes == nil {
			// Could happen if the file was not readable;
			// we skip it (it has already been reported as dropped).
			continue
		}
		hash = hex.EncodeToString(hbytes)

		(*hashes)[hash] = append((*hashes)[hash], fo)
	}
}
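// calculateMedSums computes partial checksums for the larger files and
// regroups them by partial checksum, dropping groups of less than two files.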
func (data *dataT) calculateMedSums() {
	var bigList FileObjList
	var droppedGroups int
	var droppedFiles int

	// Create a unique list of files to be partially checksummed
	for size, sizeGroup := range data.sizeGroups {
		if size < minSizePartialChecksum {
			// We do not use MedSums for small files
			continue
		}
		bigList = append(bigList, sizeGroup.files...)
	}

	// Sort the list for better efficiency -- that's the whole point of
	// the unique big list.
	sort.Sort(ByInode(bigList))

	// Compute partial checksums
	for _, fo := range bigList {
		if err := fo.Sum(partialChecksum); err != nil {
			myLog.Println(0, "Error:", err)
			droppedFiles++
			// The hash part will be nil and the file will
			// be dropped in createHashFromSum() below.
		}
	}

	// Reparse size-grouped hashes and use the new hash information
	// to build lists of files with the same partial checksum.
	for size, sizeGroup := range data.sizeGroups {
		if size < minSizePartialChecksum {
			// We do not use MedSums for small files
			continue
		}
		hashes := make(map[string]FileObjList)
		createHashFromSum(partialChecksum, sizeGroup.files, &hashes)

		// Let's de-dupe now...
		for h, l := range hashes {
			if len(l) < 2 {
				droppedGroups++
				droppedFiles += len(l)
				delete(hashes, h)
			}
		}

		// Done, save the result
		data.sizeGroups[size].medsums = hashes

		// We remove the items from "files" since they are in the hash
		// tree now.
		sizeGroup.files = nil

		if len(hashes) == 0 { // We're done with this size group
			delete(data.sizeGroups, size)
		}
	}

	if droppedFiles > 0 {
		myLog.Printf(3, " Dropped %d files in %d groups\n",
			droppedFiles, droppedGroups)
	}
	return
}
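// calculateChecksums computes full checksums for the remaining candidate
// files and regroups them by checksum, dropping groups of less than two
// files.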
func (data *dataT) calculateChecksums() {
	var bigList FileObjList
	var droppedGroups int
	var droppedFiles int

	// Create a unique list of files to be fully checksummed
	for _, sizeGroup := range data.sizeGroups {
		// #1: small files
		bigList = append(bigList, sizeGroup.files...)
		// #2: files with same partial checksum
		for _, l := range sizeGroup.medsums {
			bigList = append(bigList, l...)
		}
	}

	// Sort the list for better efficiency -- that's the whole point of
	// the unique big list.
	sort.Sort(ByInode(bigList))

	// Compute full checksums
	for _, fo := range bigList {
		if err := fo.Sum(fullChecksum); err != nil {
			myLog.Println(0, "Error:", err)
			droppedFiles++
			// The hash part will be nil and the file will
			// be dropped in createHashFromSum() below.
		}
	}

	// Reparse size-grouped hashes and use the new hash information
	// to build lists of files with the same checksum.
	for size, sizeGroup := range data.sizeGroups {
		hashes := make(map[string]FileObjList)
		// #1: small files
		createHashFromSum(fullChecksum, sizeGroup.files, &hashes)
		// #2: files with same partial checksum
		for _, l := range sizeGroup.medsums {
			createHashFromSum(fullChecksum, l, &hashes)
		}

		// Let's de-dupe now...
		for h, l := range hashes {
			if len(l) < 2 {
				droppedGroups++
				droppedFiles += len(l)
				delete(hashes, h)
			}
		}

		// Done, save the result
		data.sizeGroups[size].fullsums = hashes
		data.sizeGroups[size].medsums = nil
		sizeGroup.files = nil
	}
	if droppedFiles > 0 {
		myLog.Printf(3, " Dropped %d files in %d groups\n",
			droppedFiles, droppedGroups)
	}
	return
}
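// dropEmptyFiles removes empty files from the size groups. If ignoreEmpty is
// set, it returns the number of empty files ignored; otherwise, if there are
// at least two of them, they are kept in data.emptyFiles so that they can be
// reported as duplicates.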
func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
	sc, ok := data.sizeGroups[0]
	if !ok {
		return // no empty files
	}
	if !ignoreEmpty {
		if len(sc.files) > 1 {
			data.emptyFiles = sc.files
		}
		delete(data.sizeGroups, 0)
		return
	}
	emptyCount = len(sc.files)
	delete(data.sizeGroups, 0)
	return
}
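// createSizeHash drops files with a unique size as well as hard links (files
// sharing the same device and inode numbers) from the size groups; it
// returns the number of hard links and unique-size entries dropped.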
func (data *dataT) createSizeHash() (hardLinkCount, uniqueSizeCount int) {
	for s, sizeGroup := range data.sizeGroups {
		if len(sizeGroup.files) < 2 {
			delete(data.sizeGroups, s)
			uniqueSizeCount++
			continue
		}

		var hardlinksFound bool

		// Sort by device/inodes
		sort.Sort(ByInode(sizeGroup.files))

		// Check for hardlinks
		// TODO: what about symlinks?
		// Remove unique dev/inodes
		// Instead of this loop, another way would be to use the field
		// "Unique" of the fileObj to mark them to be discarded
		// and remove them all at the end.
		for {
			if !OSHasInodes() {
				break
			}
			var hardLinkIndex int
			fo := sizeGroup.files[0]
			prevDev, prevIno := GetDevIno(fo)
			//fmt.Printf(" - FO: %#v\n", *fo)

			for i, fo := range sizeGroup.files[1:] {
				//fmt.Printf(" - FO %d : %#v\n", i, *fo)
				dev, ino := GetDevIno(fo)
				if dev == prevDev && ino == prevIno {
					hardLinkIndex = i + 1
					hardLinkCount++
					hardlinksFound = true
					break
				}
				prevDev = dev
				prevIno = ino
			}

			if hardLinkIndex == 0 {
				break
			}
			i := hardLinkIndex
			// Remove hardlink
			copy(sizeGroup.files[i:], sizeGroup.files[i+1:])
			sizeGroup.files[len(sizeGroup.files)-1] = nil
			sizeGroup.files = sizeGroup.files[:len(sizeGroup.files)-1]
		}
		// If we have found hard links in this size group,
		// maybe we can remove it
		if hardlinksFound {
			if len(sizeGroup.files) < 2 {
				delete(data.sizeGroups, s)
				uniqueSizeCount++
				continue
			}
		}
	}
	return
}
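// formatSize returns the size in a human-readable form, using binary (IEC)
// units; if short is set, only the converted value is returned.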
func formatSize(sizeBytes uint64, short bool) string {
	var units = map[int]string{
		0: "B",
		1: "KiB",
		2: "MiB",
		3: "GiB",
		4: "TiB",
		5: "PiB",
	}
	humanSize := sizeBytes
	var n int
	for n < len(units)-1 {
		if humanSize < 10000 {
			break
		}
		humanSize /= 1024
		n++
	}
	if n < 1 {
		return fmt.Sprintf("%d bytes", sizeBytes)
	}
	if short {
		return fmt.Sprintf("%d %s", humanSize, units[n])
	}
	return fmt.Sprintf("%d bytes (%d %s)", sizeBytes, humanSize, units[n])
}
func main() {
	var verbose bool
	var summary bool
	var skipPartial bool
	var ignoreEmpty bool

	flag.BoolVar(&verbose, "verbose", false, "Be verbose (verbosity=1)")
	flag.BoolVar(&verbose, "v", false, "See --verbose")
	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
	flag.BoolVar(&summary, "s", false, "See --summary")
	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
	flag.IntVar(&myLog.verbosity, "verbosity", 0,
		"Set verbosity level (1-5)")
	flag.IntVar(&myLog.verbosity, "vl", 0, "See --verbosity")
	timings := flag.Bool("timings", false, "Set detailed log timings")
	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")

	flag.Parse()

	if myLog.verbosity > 0 {
		verbose = true
	} else if verbose {
		myLog.verbosity = 1
	}

	if len(flag.Args()) == 0 {
		// TODO: more helpful usage statement
		myLog.Println(-1, "Usage:", os.Args[0],
			"[options] base_directory...")
		os.Exit(0)
	}

	if *timings {
		log.SetFlags(log.LstdFlags | log.Lmicroseconds)
	}

	data.sizeGroups = make(map[int64]*sizeClass)
	myLog.Println(1, "* Reading file metadata")

	for _, root := range flag.Args() {
		if err := filepath.Walk(root, visit); err != nil {
			myLog.Printf(-1, "* Error: could not read file tree:\n")
			myLog.Printf(-1, "> %v\n", err)
			os.Exit(1)
		}
	}
	emptyCount := data.dropEmptyFiles(ignoreEmpty)
	if verbose {
		if data.ignoreCount > 0 {
			myLog.Printf(1, " %d special files were ignored\n",
				data.ignoreCount)
		}
		myLog.Println(2, " Initial counter:", data.cmpt, "files")
		myLog.Println(2, " Total size:", formatSize(data.totalSize,
			false))
		if emptyCount > 0 {
			myLog.Printf(1, " %d empty files were ignored\n",
				emptyCount)
		}
		data.dispCount()
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
	}

	// Remove unique sizes
	myLog.Println(1, "* Removing files with unique size, sorting file lists...")
	hardLinkCount, uniqueSizeCount := data.createSizeHash()
	if verbose {
		myLog.Printf(2, " Dropped %d files with unique size\n",
			uniqueSizeCount)
		myLog.Printf(2, " Dropped %d hard links\n", hardLinkCount)
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
		data.dispCount()
	}

	// Calculate medsums
	if !skipPartial {
		myLog.Println(1, "* Calculating partial checksums...")
		data.calculateMedSums()
		data.dispCount()
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
	}

	// Calculate checksums
	myLog.Println(1, "* Calculating checksums...")
	data.calculateChecksums()
	data.dispCount()

	// Get list of dupes
	var result []FileObjList
	if len(data.emptyFiles) > 0 {
		result = append(result, data.emptyFiles)
	}
	result = append(result, data.filesWithSameHash()...)
	myLog.Println(3, "* Number of match groups:", len(result))

	// Done! Dump dupes
	if len(result) > 0 && !summary {
		myLog.Println(1, "* Dupes:")
	}
	var dupeSize uint64
	data.cmpt = 0
	for i, l := range result {
		size := uint64(l[0].Size())
		// We do not count the size of the 1st item
		// so we get only duplicate size.
		dupeSize += size * uint64(len(l)-1)
		if !summary {
			fmt.Printf("\nGroup #%d (%d files * %v):\n", i+1,
				len(l), formatSize(size, true))
		}
		for _, f := range l {
			if !summary {
				fmt.Println(f.FilePath)
			}
			data.cmpt++
		}
	}
	summaryLevel := 1 // Default verbosity for the summary line
	if !summary {
		// Add a trailing newline
		if len(result) > 0 {
			fmt.Println("")
		}
	} else {
		// The summary was requested, so we lower the verbosity level
		summaryLevel = 0
	}

	myLog.Println(summaryLevel, "Final count:", data.cmpt,
		"duplicate files in", len(result), "sets")
	myLog.Println(summaryLevel, "Redundant data size:",
		formatSize(dupeSize, false))
}