238 } |
238 } |
239 } |
239 } |
240 return |
240 return |
241 } |
241 } |
242 |
242 |
243 func findDupesFullChecksums(fileList FileObjList) []FileObjList { |
243 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList { |
244 var dupeList []FileObjList |
244 var dupeList []FileObjList |
245 hashes := make(map[string]FileObjList) |
245 hashes := make(map[string]FileObjList) |
246 |
246 |
247 // Sort the list for better efficiency |
247 // Sort the list for better efficiency |
248 sort.Sort(ByInode(fileList)) |
248 sort.Sort(ByInode(fileList)) |
249 |
249 |
250 // Compute full checksums |
250 // Compute checksums |
251 for _, fo := range fileList { |
251 for _, fo := range fileList { |
252 if err := fo.Sum(fullChecksum); err != nil { |
252 if err := fo.Sum(sType); err != nil { |
253 myLog.Println(0, "Error:", err) |
253 myLog.Println(0, "Error:", err) |
254 continue |
254 continue |
255 } |
255 } |
256 hash := hex.EncodeToString(fo.Hash) |
256 var hbytes []byte |
257 hashes[hash] = append(hashes[hash], fo) |
257 if sType == partialChecksum { |
|
258 hbytes = fo.PartialHash |
|
259 } else if sType == fullChecksum { |
|
260 hbytes = fo.Hash |
|
261 } else { |
|
262 panic("Internal error: Invalid sType") |
|
263 } |
|
264 if hbytes != nil { |
|
265 hash := hex.EncodeToString(hbytes) |
|
266 hashes[hash] = append(hashes[hash], fo) |
|
267 } |
258 } |
268 } |
259 |
269 |
260 // Let's de-dupe now... |
270 // Let's de-dupe now... |
261 for _, l := range hashes { |
271 for _, l := range hashes { |
262 if len(l) < 2 { |
272 if len(l) < 2 { |
263 continue |
273 continue |
264 } |
274 } |
265 dupeList = append(dupeList, l) |
275 if sType == partialChecksum { |
266 // TODO sort by increasing size |
276 dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...) |
267 myLog.Printf(5, " . found %d new duplicates\n", len(l)) |
277 } else { // full checksums -> we’re done |
268 } |
278 dupeList = append(dupeList, l) |
269 |
279 } |
270 return dupeList |
280 // TODO: sort by increasing size |
271 } |
|
272 |
|
273 // TODO: refactor to avoid code duplication |
|
274 func findDupesPartialChecksums(fileList FileObjList) []FileObjList { |
|
275 var dupeList []FileObjList |
|
276 hashes := make(map[string]FileObjList) |
|
277 |
|
278 // Sort the list for better efficiency |
|
279 sort.Sort(ByInode(fileList)) |
|
280 |
|
281 // Compute partial checksums |
|
282 for _, fo := range fileList { |
|
283 if err := fo.Sum(partialChecksum); err != nil { |
|
284 myLog.Println(0, "Error:", err) |
|
285 continue |
|
286 } |
|
287 hash := hex.EncodeToString(fo.PartialHash) |
|
288 hashes[hash] = append(hashes[hash], fo) |
|
289 } |
|
290 |
|
291 // Let's de-dupe now... |
|
292 for _, l := range hashes { |
|
293 if len(l) < 2 { |
|
294 continue |
|
295 } |
|
296 dupeList = append(dupeList, findDupesFullChecksums(l)...) |
|
297 // TODO sort by increasing size |
|
298 } |
281 } |
299 |
282 |
300 return dupeList |
283 return dupeList |
301 } |
284 } |
302 |
285 |
306 |
289 |
307 for size, sizeGroup := range data.sizeGroups { |
290 for size, sizeGroup := range data.sizeGroups { |
308 var r []FileObjList |
291 var r []FileObjList |
309 // We skip partial checksums for small files or if requested |
292 // We skip partial checksums for small files or if requested |
310 if size > minSizePartialChecksum && !skipPartial { |
293 if size > minSizePartialChecksum && !skipPartial { |
311 r = findDupesPartialChecksums(sizeGroup.files) |
294 r = sizeGroup.files.findDupesChecksums(partialChecksum) |
312 } else { |
295 } else { |
313 r = findDupesFullChecksums(sizeGroup.files) |
296 r = sizeGroup.files.findDupesChecksums(fullChecksum) |
314 } |
297 } |
315 dupeList = append(dupeList, r...) |
298 dupeList = append(dupeList, r...) |
316 } |
299 } |
317 return dupeList |
300 return dupeList |
318 } |
301 } |