goduf.go
changeset 7 68375cc98f98
parent 6 6740350569d3
child 8 25ad96511395
equal deleted inserted replaced
6:6740350569d3 7:68375cc98f98
    54 }
    54 }
    55 
    55 
    56 // FileObjList is only exported so that we can have a sort interface on inodes.
    56 // FileObjList is only exported so that we can have a sort interface on inodes.
    57 type FileObjList []*fileObj
    57 type FileObjList []*fileObj
    58 
    58 
    59 type sizeClass struct {
    59 type sizeClass struct { // XXX still useful?
    60 	files    FileObjList
    60 	files    FileObjList
    61 	medsums  map[string]FileObjList
    61 	medsums  map[string]FileObjList
    62 	fullsums map[string]FileObjList
    62 	fullsums map[string]FileObjList
    63 }
    63 }
    64 
    64 
   238 		}
   238 		}
   239 	}
   239 	}
   240 	return
   240 	return
   241 }
   241 }
   242 
   242 
   243 func findDupesFullChecksums(fileList FileObjList) []FileObjList {
   243 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
   244 	var dupeList []FileObjList
   244 	var dupeList []FileObjList
   245 	hashes := make(map[string]FileObjList)
   245 	hashes := make(map[string]FileObjList)
   246 
   246 
   247 	// Sort the list for better efficiency
   247 	// Sort the list for better efficiency
   248 	sort.Sort(ByInode(fileList))
   248 	sort.Sort(ByInode(fileList))
   249 
   249 
   250 	// Compute full checksums
   250 	// Compute checksums
   251 	for _, fo := range fileList {
   251 	for _, fo := range fileList {
   252 		if err := fo.Sum(fullChecksum); err != nil {
   252 		if err := fo.Sum(sType); err != nil {
   253 			myLog.Println(0, "Error:", err)
   253 			myLog.Println(0, "Error:", err)
   254 			continue
   254 			continue
   255 		}
   255 		}
   256 		hash := hex.EncodeToString(fo.Hash)
   256 		var hbytes []byte
   257 		hashes[hash] = append(hashes[hash], fo)
   257 		if sType == partialChecksum {
       
   258 			hbytes = fo.PartialHash
       
   259 		} else if sType == fullChecksum {
       
   260 			hbytes = fo.Hash
       
   261 		} else {
       
   262 			panic("Internal error: Invalid sType")
       
   263 		}
       
   264 		if hbytes != nil {
       
   265 			hash := hex.EncodeToString(hbytes)
       
   266 			hashes[hash] = append(hashes[hash], fo)
       
   267 		}
   258 	}
   268 	}
   259 
   269 
   260 	// Let's de-dupe now...
   270 	// Let's de-dupe now...
   261 	for _, l := range hashes {
   271 	for _, l := range hashes {
   262 		if len(l) < 2 {
   272 		if len(l) < 2 {
   263 			continue
   273 			continue
   264 		}
   274 		}
   265 		dupeList = append(dupeList, l)
   275 		if sType == partialChecksum {
   266 		// TODO sort by increasing size
   276 			dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...)
   267 		myLog.Printf(5, "  . found %d new duplicates\n", len(l))
   277 		} else { // full checksums -> we’re done
   268 	}
   278 			dupeList = append(dupeList, l)
   269 
   279 		}
   270 	return dupeList
   280 		// TODO: sort by increasing size
   271 }
       
   272 
       
   273 // TODO: refactor to avoid code duplication
       
   274 func findDupesPartialChecksums(fileList FileObjList) []FileObjList {
       
   275 	var dupeList []FileObjList
       
   276 	hashes := make(map[string]FileObjList)
       
   277 
       
   278 	// Sort the list for better efficiency
       
   279 	sort.Sort(ByInode(fileList))
       
   280 
       
   281 	// Compute partial checksums
       
   282 	for _, fo := range fileList {
       
   283 		if err := fo.Sum(partialChecksum); err != nil {
       
   284 			myLog.Println(0, "Error:", err)
       
   285 			continue
       
   286 		}
       
   287 		hash := hex.EncodeToString(fo.PartialHash)
       
   288 		hashes[hash] = append(hashes[hash], fo)
       
   289 	}
       
   290 
       
   291 	// Let's de-dupe now...
       
   292 	for _, l := range hashes {
       
   293 		if len(l) < 2 {
       
   294 			continue
       
   295 		}
       
   296 		dupeList = append(dupeList, findDupesFullChecksums(l)...)
       
   297 		// TODO sort by increasing size
       
   298 	}
   281 	}
   299 
   282 
   300 	return dupeList
   283 	return dupeList
   301 }
   284 }
   302 
   285 
   306 
   289 
   307 	for size, sizeGroup := range data.sizeGroups {
   290 	for size, sizeGroup := range data.sizeGroups {
   308 		var r []FileObjList
   291 		var r []FileObjList
   309 		// We skip partial checksums for small files or if requested
   292 		// We skip partial checksums for small files or if requested
   310 		if size > minSizePartialChecksum && !skipPartial {
   293 		if size > minSizePartialChecksum && !skipPartial {
   311 			r = findDupesPartialChecksums(sizeGroup.files)
   294 			r = sizeGroup.files.findDupesChecksums(partialChecksum)
   312 		} else {
   295 		} else {
   313 			r = findDupesFullChecksums(sizeGroup.files)
   296 			r = sizeGroup.files.findDupesChecksums(fullChecksum)
   314 		}
   297 		}
   315 		dupeList = append(dupeList, r...)
   298 		dupeList = append(dupeList, r...)
   316 	}
   299 	}
   317 	return dupeList
   300 	return dupeList
   318 }
   301 }