238 } |
242 } |
239 } |
243 } |
240 return |
244 return |
241 } |
245 } |
242 |
246 |
243 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList { |
247 // checksum returns the requested checksum as a string. |
244 var dupeList []FileObjList |
248 // If the checksum has not been pre-computed, it is calculated now. |
245 hashes := make(map[string]FileObjList) |
249 func (fo fileObj) checksum(sType sumType) (string, error) { |
246 |
250 var hbytes []byte |
247 // Sort the list for better efficiency |
251 if sType == partialChecksum { |
248 sort.Sort(ByInode(fileList)) |
252 hbytes = fo.PartialHash |
249 |
253 } else if sType == fullChecksum { |
250 // Compute checksums |
254 hbytes = fo.Hash |
251 for _, fo := range fileList { |
255 } else { |
|
256 panic("Internal error: Invalid sType") |
|
257 } |
|
258 if hbytes == nil { |
252 if err := fo.Sum(sType); err != nil { |
259 if err := fo.Sum(sType); err != nil { |
253 myLog.Println(0, "Error:", err) |
260 return "", err |
254 continue |
261 } |
255 } |
|
256 var hbytes []byte |
|
257 if sType == partialChecksum { |
262 if sType == partialChecksum { |
258 hbytes = fo.PartialHash |
263 hbytes = fo.PartialHash |
259 } else if sType == fullChecksum { |
264 } else if sType == fullChecksum { |
260 hbytes = fo.Hash |
265 hbytes = fo.Hash |
261 } else { |
266 } |
262 panic("Internal error: Invalid sType") |
267 } |
263 } |
268 return hex.EncodeToString(hbytes), nil |
264 if hbytes != nil { |
269 } |
265 hash := hex.EncodeToString(hbytes) |
270 |
266 hashes[hash] = append(hashes[hash], fo) |
271 func (fileList FileObjList) computeSheduledChecksums() { |
267 } |
272 // Sort the list for better efficiency |
|
273 sort.Sort(ByInode(fileList)) |
|
274 |
|
275 myLog.Printf(6, " . will compute %d checksums\n", len(fileList)) |
|
276 |
|
277 // Compute checksums |
|
278 for _, fo := range fileList { |
|
279 if err := fo.Sum(fo.needHash); err != nil { |
|
280 myLog.Println(0, "Error:", err) |
|
281 } |
|
282 } |
|
283 } |
|
284 |
|
285 func (fileList FileObjList) scheduleChecksum(sType sumType) { |
|
286 for _, fo := range fileList { |
|
287 fo.needHash = sType |
|
288 } |
|
289 } |
|
290 |
|
291 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList { |
|
292 var dupeList []FileObjList |
|
293 var scheduleFull []FileObjList |
|
294 hashes := make(map[string]FileObjList) |
|
295 |
|
296 // Sort the list for better efficiency |
|
297 sort.Sort(ByInode(fileList)) |
|
298 |
|
299 // Compute checksums |
|
300 for _, fo := range fileList { |
|
301 hash, err := fo.checksum(sType) |
|
302 if err != nil { |
|
303 myLog.Println(0, "Error:", err) |
|
304 continue |
|
305 } |
|
306 hashes[hash] = append(hashes[hash], fo) |
268 } |
307 } |
269 |
308 |
270 // Let's de-dupe now... |
309 // Let's de-dupe now... |
271 for _, l := range hashes { |
310 for _, l := range hashes { |
272 if len(l) < 2 { |
311 if len(l) < 2 { |
273 continue |
312 continue |
274 } |
313 } |
275 if sType == partialChecksum { |
314 if sType == partialChecksum { |
276 dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...) |
315 scheduleFull = append(scheduleFull, l) |
277 } else { // full checksums -> we’re done |
316 } else { // full checksums -> we're done |
278 dupeList = append(dupeList, l) |
317 dupeList = append(dupeList, l) |
279 } |
318 // TODO: sort by increasing size |
280 // TODO: sort by increasing size |
319 myLog.Printf(5, " . found %d new duplicates\n", len(l)) |
|
320 } |
|
321 } |
|
322 if sType == partialChecksum { |
|
323 var csList FileObjList |
|
324 for _, fol := range scheduleFull { |
|
325 csList = append(csList, fol...) |
|
326 } |
|
327 myLog.Printf(6, " .. findDupesChecksums: computing %d "+ |
|
328 "full checksums\n", len(csList)) // DBG |
|
329 csList.computeSheduledChecksums() |
|
330 for _, l := range scheduleFull { |
|
331 r := l.findDupesChecksums(fullChecksum) |
|
332 dupeList = append(dupeList, r...) |
|
333 } |
281 } |
334 } |
282 |
335 |
283 return dupeList |
336 return dupeList |
284 } |
337 } |
285 |
338 |
286 // findDupes() uses checksums to find file duplicates |
339 // findDupes() uses checksums to find file duplicates |
287 func (data *dataT) findDupes(skipPartial bool) []FileObjList { |
340 func (data *dataT) findDupes(skipPartial bool) []FileObjList { |
288 var dupeList []FileObjList |
341 var dupeList []FileObjList |
|
342 var schedulePartial []FileObjList |
|
343 var scheduleFull []FileObjList |
289 |
344 |
290 for size, sizeGroup := range data.sizeGroups { |
345 for size, sizeGroup := range data.sizeGroups { |
291 var r []FileObjList |
|
292 // We skip partial checksums for small files or if requested |
346 // We skip partial checksums for small files or if requested |
293 if size > minSizePartialChecksum && !skipPartial { |
347 if size > minSizePartialChecksum && !skipPartial { |
294 r = sizeGroup.files.findDupesChecksums(partialChecksum) |
348 sizeGroup.files.scheduleChecksum(partialChecksum) |
|
349 schedulePartial = append(schedulePartial, sizeGroup.files) |
295 } else { |
350 } else { |
296 r = sizeGroup.files.findDupesChecksums(fullChecksum) |
351 sizeGroup.files.scheduleChecksum(fullChecksum) |
297 } |
352 scheduleFull = append(scheduleFull, sizeGroup.files) |
|
353 } |
|
354 } |
|
355 |
|
356 var csList FileObjList |
|
357 for _, fol := range schedulePartial { |
|
358 csList = append(csList, fol...) |
|
359 } |
|
360 for _, fol := range scheduleFull { |
|
361 csList = append(csList, fol...) |
|
362 } |
|
363 myLog.Printf(6, " .. findDupes: computing %d misc checksums\n", |
|
364 len(csList)) // DBG |
|
365 csList.computeSheduledChecksums() |
|
366 |
|
367 for _, l := range schedulePartial { |
|
368 r := l.findDupesChecksums(partialChecksum) |
298 dupeList = append(dupeList, r...) |
369 dupeList = append(dupeList, r...) |
299 } |
370 } |
|
371 for _, l := range scheduleFull { |
|
372 r := l.findDupesChecksums(fullChecksum) |
|
373 dupeList = append(dupeList, r...) |
|
374 } |
|
375 // TODO: sort by increasing size |
300 return dupeList |
376 return dupeList |
301 } |
377 } |
302 |
378 |
303 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) { |
379 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) { |
304 sc, ok := data.sizeGroups[0] |
380 sc, ok := data.sizeGroups[0] |