		return fo.CheckSum()
	}
	panic("Internal error: Invalid sType")
}

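// dispCount logs, at verbosity level 4 and above, how many files are
// still being considered, formatted as "total [files(+empty)/size groups]".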
func (data *dataT) dispCount() { // FIXME rather useless
	if myLog.verbosity < 4 {
		return
	}
	var c1, c1b, c2 int
	var s1 string
	for _, sc := range data.sizeGroups {
		c1 += len(sc.files)
		c2++
	}
	c1b = len(data.emptyFiles)
	if c1b > 0 {
		s1 = fmt.Sprintf("+%d", c1b)
	}
	myLog.Printf(4, " Current countdown: %d [%d%s/%d]\n",
		c1+c1b, c1, s1, c2)
}

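// filesWithSameHash returns the lists of files sharing a full checksum,
// collected from all size groups.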
func (data *dataT) filesWithSameHash() (hgroups []FileObjList) {
	for _, sizeGroup := range data.sizeGroups {
		for _, l := range sizeGroup.fullsums {
			hgroups = append(hgroups, l)
		}
	}
	return
}

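// findDupesFullChecksums computes the full checksum of every file in
// fileList and returns the groups of files sharing the same hash.
// Unreadable files are skipped once the error has been logged.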
func findDupesFullChecksums(fileList FileObjList) []FileObjList {
	var dupeList []FileObjList
	hashes := make(map[string]FileObjList)

	// Sort the list for better efficiency
	sort.Sort(ByInode(fileList))

	// Compute full checksums
	for _, fo := range fileList {
		if err := fo.Sum(fullChecksum); err != nil {
			myLog.Println(0, "Error:", err)
			continue
		}
		hash := hex.EncodeToString(fo.Hash)
		hashes[hash] = append(hashes[hash], fo)
	}

	// Let's de-dupe now...
	for _, l := range hashes {
		if len(l) < 2 {
			continue
		}
		dupeList = append(dupeList, l)
		// TODO sort by increasing size
		myLog.Printf(5, " . found %d new duplicates\n", len(l))
	}

	return dupeList
}

|
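// findDupesPartialChecksums groups the files of fileList by partial
// checksum and lets findDupesFullChecksums confirm each candidate group.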
// TODO: refactor to avoid code duplication
func findDupesPartialChecksums(fileList FileObjList) []FileObjList {
	var dupeList []FileObjList
	hashes := make(map[string]FileObjList)

	// Sort the list for better efficiency
	sort.Sort(ByInode(fileList))

	// Compute partial checksums
	for _, fo := range fileList {
		if err := fo.Sum(partialChecksum); err != nil {
			myLog.Println(0, "Error:", err)
			continue
		}
		hash := hex.EncodeToString(fo.PartialHash)
		hashes[hash] = append(hashes[hash], fo)
	}

	// Let's de-dupe now...
	for _, l := range hashes {
		if len(l) < 2 {
			continue
		}
		dupeList = append(dupeList, findDupesFullChecksums(l)...)
		// TODO sort by increasing size
	}

	return dupeList
}

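// dropEmptyFiles removes the empty (zero-length) files from the size
// groups; the returned emptyCount is used to report how many were ignored.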
func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
	sc, ok := data.sizeGroups[0]
	if !ok {
		// Check for hardlinks
		// Remove unique dev/inodes
		// Instead of this loop, another way would be to use the field
		// "Unique" of the fileObj to mark them to be discarded
		// and remove them all at the end.
		for {
			if !OSHasInodes() {
				break
			}
			var hardLinkIndex int
			fo := sizeGroup.files[0]
			prevDev, prevIno := GetDevIno(fo)

			for i, fo := range sizeGroup.files[1:] {
				dev, ino := GetDevIno(fo)
				if dev == prevDev && ino == prevIno {
					hardLinkIndex = i + 1
					hardLinkCount++
					hardlinksFound = true
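	// Command-line flags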
	flag.BoolVar(&verbose, "v", false, "See --verbose")
	flag.BoolVar(&summary, "summary", false, "Do not display the duplicate list")
	flag.BoolVar(&summary, "s", false, "See --summary")
	flag.BoolVar(&skipPartial, "skip-partial", false, "Skip partial checksums")
	flag.IntVar(&myLog.verbosity, "verbosity", 0,
		"Set verbosity level (1-6)")
	flag.IntVar(&myLog.verbosity, "vl", 0, "See verbosity")
	timings := flag.Bool("timings", false, "Set detailed log timings")
	flag.BoolVar(&ignoreEmpty, "no-empty", false, "Ignore empty files")

	flag.Parse()
			false))
		if emptyCount > 0 {
			myLog.Printf(1, " %d empty files were ignored\n",
				emptyCount)
		}
		data.dispCount() // XXX
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
	}

	// Remove unique sizes
	myLog.Println(1, "* Removing files with unique size and hard links...")
	hardLinkCount, uniqueSizeCount := data.initialCleanup()
	if verbose {
		myLog.Printf(2, " Dropped %d files with unique size\n",
			uniqueSizeCount)
		myLog.Printf(2, " Dropped %d hard links\n", hardLinkCount)
		myLog.Println(3, "* Number of size groups:", len(data.sizeGroups))
		data.dispCount() // XXX
	}

	// Get list of dupes
	myLog.Println(1, "* Computing checksums...")
	var result []FileObjList
	if len(data.emptyFiles) > 0 {
		result = append(result, data.emptyFiles)
	}
	result = append(result, data.findDupes(skipPartial)...)

	myLog.Println(3, "* Number of match groups:", len(result))

	// Done! Dump dupes
	if len(result) > 0 && !summary {
		myLog.Println(1, "* Dupes:")