goduf.go
changeset 8 25ad96511395
parent 7 68375cc98f98
child 9 5b58342459eb
equal deleted inserted replaced
7:68375cc98f98 8:25ad96511395
    39 const minSizePartialChecksum = 49152 // Should be > 3*medsumBytes
    39 const minSizePartialChecksum = 49152 // Should be > 3*medsumBytes
    40 
    40 
    41 type sumType int
    41 type sumType int
    42 
    42 
    43 const (
    43 const (
    44 	fullChecksum sumType = iota
    44 	noChecksum sumType = iota
       
    45 	fullChecksum
    45 	partialChecksum
    46 	partialChecksum
    46 )
    47 )
    47 
    48 
    48 type fileObj struct {
    49 type fileObj struct {
    49 	//Unique   bool
    50 	//Unique   bool
    50 	FilePath string
    51 	FilePath string
    51 	os.FileInfo
    52 	os.FileInfo
    52 	PartialHash []byte
    53 	PartialHash []byte
    53 	Hash        []byte
    54 	Hash        []byte
       
    55 	needHash    sumType
    54 }
    56 }
    55 
    57 
    56 // FileObjList is only exported so that we can have a sort interface on inodes.
    58 // FileObjList is only exported so that we can have a sort interface on inodes.
    57 type FileObjList []*fileObj
    59 type FileObjList []*fileObj
    58 
    60 
   207 func (fo *fileObj) Sum(sType sumType) error {
   209 func (fo *fileObj) Sum(sType sumType) error {
   208 	if sType == partialChecksum {
   210 	if sType == partialChecksum {
   209 		return fo.MedSum()
   211 		return fo.MedSum()
   210 	} else if sType == fullChecksum {
   212 	} else if sType == fullChecksum {
   211 		return fo.CheckSum()
   213 		return fo.CheckSum()
       
   214 	} else if sType == noChecksum {
       
   215 		return nil
   212 	}
   216 	}
   213 	panic("Internal error: Invalid sType")
   217 	panic("Internal error: Invalid sType")
   214 }
   218 }
   215 
   219 
   216 func (data *dataT) dispCount() { // FIXME rather useless
   220 func (data *dataT) dispCount() { // FIXME rather useless
   238 		}
   242 		}
   239 	}
   243 	}
   240 	return
   244 	return
   241 }
   245 }
   242 
   246 
   243 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
   247 // checksum returns the requested checksum as a string.
   244 	var dupeList []FileObjList
   248 // If the checksum has not been pre-computed, it is calculated now.
   245 	hashes := make(map[string]FileObjList)
   249 func (fo fileObj) checksum(sType sumType) (string, error) {
   246 
   250 	var hbytes []byte
   247 	// Sort the list for better efficiency
   251 	if sType == partialChecksum {
   248 	sort.Sort(ByInode(fileList))
   252 		hbytes = fo.PartialHash
   249 
   253 	} else if sType == fullChecksum {
   250 	// Compute checksums
   254 		hbytes = fo.Hash
   251 	for _, fo := range fileList {
   255 	} else {
       
   256 		panic("Internal error: Invalid sType")
       
   257 	}
       
   258 	if hbytes == nil {
   252 		if err := fo.Sum(sType); err != nil {
   259 		if err := fo.Sum(sType); err != nil {
   253 			myLog.Println(0, "Error:", err)
   260 			return "", err
   254 			continue
   261 		}
   255 		}
       
   256 		var hbytes []byte
       
   257 		if sType == partialChecksum {
   262 		if sType == partialChecksum {
   258 			hbytes = fo.PartialHash
   263 			hbytes = fo.PartialHash
   259 		} else if sType == fullChecksum {
   264 		} else if sType == fullChecksum {
   260 			hbytes = fo.Hash
   265 			hbytes = fo.Hash
   261 		} else {
   266 		}
   262 			panic("Internal error: Invalid sType")
   267 	}
   263 		}
   268 	return hex.EncodeToString(hbytes), nil
   264 		if hbytes != nil {
   269 }
   265 			hash := hex.EncodeToString(hbytes)
   270 
   266 			hashes[hash] = append(hashes[hash], fo)
   271 func (fileList FileObjList) computeSheduledChecksums() {
   267 		}
   272 	// Sort the list for better efficiency
       
   273 	sort.Sort(ByInode(fileList))
       
   274 
       
   275 	myLog.Printf(6, "  . will compute %d checksums\n", len(fileList))
       
   276 
       
   277 	// Compute checksums
       
   278 	for _, fo := range fileList {
       
   279 		if err := fo.Sum(fo.needHash); err != nil {
       
   280 			myLog.Println(0, "Error:", err)
       
   281 		}
       
   282 	}
       
   283 }
       
   284 
       
   285 func (fileList FileObjList) scheduleChecksum(sType sumType) {
       
   286 	for _, fo := range fileList {
       
   287 		fo.needHash = sType
       
   288 	}
       
   289 }
       
   290 
       
   291 func (fileList FileObjList) findDupesChecksums(sType sumType) []FileObjList {
       
   292 	var dupeList []FileObjList
       
   293 	var scheduleFull []FileObjList
       
   294 	hashes := make(map[string]FileObjList)
       
   295 
       
   296 	// Sort the list for better efficiency
       
   297 	sort.Sort(ByInode(fileList))
       
   298 
       
   299 	// Compute checksums
       
   300 	for _, fo := range fileList {
       
   301 		hash, err := fo.checksum(sType)
       
   302 		if err != nil {
       
   303 			myLog.Println(0, "Error:", err)
       
   304 			continue
       
   305 		}
       
   306 		hashes[hash] = append(hashes[hash], fo)
   268 	}
   307 	}
   269 
   308 
   270 	// Let's de-dupe now...
   309 	// Let's de-dupe now...
   271 	for _, l := range hashes {
   310 	for _, l := range hashes {
   272 		if len(l) < 2 {
   311 		if len(l) < 2 {
   273 			continue
   312 			continue
   274 		}
   313 		}
   275 		if sType == partialChecksum {
   314 		if sType == partialChecksum {
   276 			dupeList = append(dupeList, l.findDupesChecksums(fullChecksum)...)
   315 			scheduleFull = append(scheduleFull, l)
   277 		} else { // full checksums -> we’re done
   316 		} else { // full checksums -> we're done
   278 			dupeList = append(dupeList, l)
   317 			dupeList = append(dupeList, l)
   279 		}
   318 			// TODO: sort by increasing size
   280 		// TODO: sort by increasing size
   319 			myLog.Printf(5, "  . found %d new duplicates\n", len(l))
       
   320 		}
       
   321 	}
       
   322 	if sType == partialChecksum {
       
   323 		var csList FileObjList
       
   324 		for _, fol := range scheduleFull {
       
   325 			csList = append(csList, fol...)
       
   326 		}
       
   327 		myLog.Printf(6, "  .. findDupesChecksums: computing %d "+
       
   328 			"full checksums\n", len(csList)) // DBG
       
   329 		csList.computeSheduledChecksums()
       
   330 		for _, l := range scheduleFull {
       
   331 			r := l.findDupesChecksums(fullChecksum)
       
   332 			dupeList = append(dupeList, r...)
       
   333 		}
   281 	}
   334 	}
   282 
   335 
   283 	return dupeList
   336 	return dupeList
   284 }
   337 }
   285 
   338 
   286 // findDupes() uses checksums to find file duplicates
   339 // findDupes() uses checksums to find file duplicates
   287 func (data *dataT) findDupes(skipPartial bool) []FileObjList {
   340 func (data *dataT) findDupes(skipPartial bool) []FileObjList {
   288 	var dupeList []FileObjList
   341 	var dupeList []FileObjList
       
   342 	var schedulePartial []FileObjList
       
   343 	var scheduleFull []FileObjList
   289 
   344 
   290 	for size, sizeGroup := range data.sizeGroups {
   345 	for size, sizeGroup := range data.sizeGroups {
   291 		var r []FileObjList
       
   292 		// We skip partial checksums for small files or if requested
   346 		// We skip partial checksums for small files or if requested
   293 		if size > minSizePartialChecksum && !skipPartial {
   347 		if size > minSizePartialChecksum && !skipPartial {
   294 			r = sizeGroup.files.findDupesChecksums(partialChecksum)
   348 			sizeGroup.files.scheduleChecksum(partialChecksum)
       
   349 			schedulePartial = append(schedulePartial, sizeGroup.files)
   295 		} else {
   350 		} else {
   296 			r = sizeGroup.files.findDupesChecksums(fullChecksum)
   351 			sizeGroup.files.scheduleChecksum(fullChecksum)
   297 		}
   352 			scheduleFull = append(scheduleFull, sizeGroup.files)
       
   353 		}
       
   354 	}
       
   355 
       
   356 	var csList FileObjList
       
   357 	for _, fol := range schedulePartial {
       
   358 		csList = append(csList, fol...)
       
   359 	}
       
   360 	for _, fol := range scheduleFull {
       
   361 		csList = append(csList, fol...)
       
   362 	}
       
   363 	myLog.Printf(6, "  .. findDupes: computing %d misc checksums\n",
       
   364 		len(csList)) // DBG
       
   365 	csList.computeSheduledChecksums()
       
   366 
       
   367 	for _, l := range schedulePartial {
       
   368 		r := l.findDupesChecksums(partialChecksum)
   298 		dupeList = append(dupeList, r...)
   369 		dupeList = append(dupeList, r...)
   299 	}
   370 	}
       
   371 	for _, l := range scheduleFull {
       
   372 		r := l.findDupesChecksums(fullChecksum)
       
   373 		dupeList = append(dupeList, r...)
       
   374 	}
       
   375 	// TODO: sort by increasing size
   300 	return dupeList
   376 	return dupeList
   301 }
   377 }
   302 
   378 
   303 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
   379 func (data *dataT) dropEmptyFiles(ignoreEmpty bool) (emptyCount int) {
   304 	sc, ok := data.sizeGroups[0]
   380 	sc, ok := data.sizeGroups[0]