1 /** |
1 /* |
2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. |
2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. |
3 * All rights reserved. |
3 * All rights reserved. |
4 * |
4 * |
5 * This source code is licensed under the BSD-style license found in the |
5 * This source code is licensed under both the BSD-style license (found in the |
6 * LICENSE file in the root directory of this source tree. An additional grant |
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 * of patent rights can be found in the PATENTS file in the same directory. |
7 * in the COPYING file in the root directory of this source tree). |
|
8 * You may select, at your option, one of the above-listed licenses. |
8 */ |
9 */ |
9 |
10 |
10 |
11 |
11 /*-************************************** |
12 /*-************************************** |
12 * Tuning parameters |
13 * Tuning parameters |
13 ****************************************/ |
14 ****************************************/ |
|
15 #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */ |
14 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) |
16 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20) |
15 #define ZDICT_MIN_SAMPLES_SIZE 512 |
17 #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO) |
16 |
18 |
17 |
19 |
18 /*-************************************** |
20 /*-************************************** |
19 * Compiler Options |
21 * Compiler Options |
20 ****************************************/ |
22 ****************************************/ |
94 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } |
93 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } |
95 |
94 |
96 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) |
95 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize) |
97 { |
96 { |
98 if (dictSize < 8) return 0; |
97 if (dictSize < 8) return 0; |
99 if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0; |
98 if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0; |
100 return MEM_readLE32((const char*)dictBuffer + 4); |
99 return MEM_readLE32((const char*)dictBuffer + 4); |
101 } |
100 } |
102 |
101 |
103 |
102 |
104 /*-******************************************************** |
103 /*-******************************************************** |
105 * Dictionary training functions |
104 * Dictionary training functions |
106 **********************************************************/ |
105 **********************************************************/ |
107 static unsigned ZDICT_NbCommonBytes (register size_t val) |
106 static unsigned ZDICT_NbCommonBytes (size_t val) |
108 { |
107 { |
109 if (MEM_isLittleEndian()) { |
108 if (MEM_isLittleEndian()) { |
110 if (MEM_64bits()) { |
109 if (MEM_64bits()) { |
111 # if defined(_MSC_VER) && defined(_WIN64) |
110 # if defined(_MSC_VER) && defined(_WIN64) |
112 unsigned long r = 0; |
111 unsigned long r = 0; |
221 /* trivial repetition cases */ |
219 /* trivial repetition cases */ |
222 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) |
220 if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) |
223 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) |
221 ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) |
224 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { |
222 ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { |
225 /* skip and mark segment */ |
223 /* skip and mark segment */ |
226 U16 u16 = MEM_read16(b+pos+4); |
224 U16 const pattern16 = MEM_read16(b+pos+4); |
227 U32 u, e = 6; |
225 U32 u, patternEnd = 6; |
228 while (MEM_read16(b+pos+e) == u16) e+=2 ; |
226 while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ; |
229 if (b[pos+e] == b[pos+e-1]) e++; |
227 if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++; |
230 for (u=1; u<e; u++) |
228 for (u=1; u<patternEnd; u++) |
231 doneMarks[pos+u] = 1; |
229 doneMarks[pos+u] = 1; |
232 return solution; |
230 return solution; |
233 } |
231 } |
234 |
232 |
235 /* look forward */ |
233 /* look forward */ |
236 do { |
234 { size_t length; |
237 end++; |
235 do { |
238 length = ZDICT_count(b + pos, b + suffix[end]); |
236 end++; |
239 } while (length >=MINMATCHLENGTH); |
237 length = ZDICT_count(b + pos, b + suffix[end]); |
|
238 } while (length >= MINMATCHLENGTH); |
|
239 } |
240 |
240 |
241 /* look backward */ |
241 /* look backward */ |
242 do { |
242 { size_t length; |
243 length = ZDICT_count(b + pos, b + *(suffix+start-1)); |
243 do { |
244 if (length >=MINMATCHLENGTH) start--; |
244 length = ZDICT_count(b + pos, b + *(suffix+start-1)); |
245 } while(length >= MINMATCHLENGTH); |
245 if (length >=MINMATCHLENGTH) start--; |
|
246 } while(length >= MINMATCHLENGTH); |
|
247 } |
246 |
248 |
247 /* exit if not found a minimum nb of repetitions */ |
249 /* exit if not found a minimum nb of repetitions */ |
248 if (end-start < minRatio) { |
250 if (end-start < minRatio) { |
249 U32 idx; |
251 U32 idx; |
250 for(idx=start; idx<end; idx++) |
252 for(idx=start; idx<end; idx++) |
296 pos = suffix[refinedStart]; |
298 pos = suffix[refinedStart]; |
297 end = start; |
299 end = start; |
298 memset(lengthList, 0, sizeof(lengthList)); |
300 memset(lengthList, 0, sizeof(lengthList)); |
299 |
301 |
300 /* look forward */ |
302 /* look forward */ |
301 do { |
303 { size_t length; |
302 end++; |
304 do { |
303 length = ZDICT_count(b + pos, b + suffix[end]); |
305 end++; |
304 if (length >= LLIMIT) length = LLIMIT-1; |
306 length = ZDICT_count(b + pos, b + suffix[end]); |
305 lengthList[length]++; |
307 if (length >= LLIMIT) length = LLIMIT-1; |
306 } while (length >=MINMATCHLENGTH); |
308 lengthList[length]++; |
|
309 } while (length >=MINMATCHLENGTH); |
|
310 } |
307 |
311 |
308 /* look backward */ |
312 /* look backward */ |
309 length = MINMATCHLENGTH; |
313 { size_t length = MINMATCHLENGTH; |
310 while ((length >= MINMATCHLENGTH) & (start > 0)) { |
314 while ((length >= MINMATCHLENGTH) & (start > 0)) { |
311 length = ZDICT_count(b + pos, b + suffix[start - 1]); |
315 length = ZDICT_count(b + pos, b + suffix[start - 1]); |
312 if (length >= LLIMIT) length = LLIMIT - 1; |
316 if (length >= LLIMIT) length = LLIMIT - 1; |
313 lengthList[length]++; |
317 lengthList[length]++; |
314 if (length >= MINMATCHLENGTH) start--; |
318 if (length >= MINMATCHLENGTH) start--; |
|
319 } |
315 } |
320 } |
316 |
321 |
317 /* largest useful length */ |
322 /* largest useful length */ |
318 memset(cumulLength, 0, sizeof(cumulLength)); |
323 memset(cumulLength, 0, sizeof(cumulLength)); |
319 cumulLength[maxLength-1] = lengthList[maxLength-1]; |
324 cumulLength[maxLength-1] = lengthList[maxLength-1]; |
361 |
366 |
362 return solution; |
367 return solution; |
363 } |
368 } |
364 |
369 |
365 |
370 |
366 /*! ZDICT_checkMerge |
371 static int isIncluded(const void* in, const void* container, size_t length) |
|
372 { |
|
373 const char* const ip = (const char*) in; |
|
374 const char* const into = (const char*) container; |
|
375 size_t u; |
|
376 |
|
377 for (u=0; u<length; u++) { /* works because end of buffer is a noisy guard band */ |
|
378 if (ip[u] != into[u]) break; |
|
379 } |
|
380 |
|
381 return u==length; |
|
382 } |
|
383 |
|
384 /*! ZDICT_tryMerge() : |
367 check if dictItem can be merged, do it if possible |
385 check if dictItem can be merged, do it if possible |
368 @return : id of destination elt, 0 if not merged |
386 @return : id of destination elt, 0 if not merged |
369 */ |
387 */ |
370 static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip) |
388 static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer) |
371 { |
389 { |
372 const U32 tableSize = table->pos; |
390 const U32 tableSize = table->pos; |
373 const U32 eltEnd = elt.pos + elt.length; |
391 const U32 eltEnd = elt.pos + elt.length; |
|
392 const char* const buf = (const char*) buffer; |
374 |
393 |
375 /* tail overlap */ |
394 /* tail overlap */ |
376 U32 u; for (u=1; u<tableSize; u++) { |
395 U32 u; for (u=1; u<tableSize; u++) { |
377 if (u==eltNbToSkip) continue; |
396 if (u==eltNbToSkip) continue; |
378 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ |
397 if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { /* overlap, existing > new */ |
379 /* append */ |
398 /* append */ |
380 U32 addedLength = table[u].pos - elt.pos; |
399 U32 const addedLength = table[u].pos - elt.pos; |
381 table[u].length += addedLength; |
400 table[u].length += addedLength; |
382 table[u].pos = elt.pos; |
401 table[u].pos = elt.pos; |
383 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ |
402 table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ |
384 table[u].savings += elt.length / 8; /* rough approx bonus */ |
403 table[u].savings += elt.length / 8; /* rough approx bonus */ |
385 elt = table[u]; |
404 elt = table[u]; |
405 elt = table[u]; |
425 elt = table[u]; |
406 while ((u>1) && (table[u-1].savings < elt.savings)) |
426 while ((u>1) && (table[u-1].savings < elt.savings)) |
407 table[u] = table[u-1], u--; |
427 table[u] = table[u-1], u--; |
408 table[u] = elt; |
428 table[u] = elt; |
409 return u; |
429 return u; |
410 } } |
430 } |
|
431 |
|
432 if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) { |
|
433 if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) { |
|
434 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 ); |
|
435 table[u].pos = elt.pos; |
|
436 table[u].savings += (U32)(elt.savings * addedLength / elt.length); |
|
437 table[u].length = MIN(elt.length, table[u].length + 1); |
|
438 return u; |
|
439 } |
|
440 } |
|
441 } |
411 |
442 |
412 return 0; |
443 return 0; |
413 } |
444 } |
414 |
445 |
415 |
446 |
416 static void ZDICT_removeDictItem(dictItem* table, U32 id) |
447 static void ZDICT_removeDictItem(dictItem* table, U32 id) |
417 { |
448 { |
418 /* convention : first element is nb of elts */ |
449 /* convention : table[0].pos stores nb of elts */ |
419 U32 const max = table->pos; |
450 U32 const max = table[0].pos; |
420 U32 u; |
451 U32 u; |
421 if (!id) return; /* protection, should never happen */ |
452 if (!id) return; /* protection, should never happen */ |
422 for (u=id; u<max-1; u++) |
453 for (u=id; u<max-1; u++) |
423 table[u] = table[u+1]; |
454 table[u] = table[u+1]; |
424 table->pos--; |
455 table->pos--; |
425 } |
456 } |
426 |
457 |
427 |
458 |
428 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt) |
459 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer) |
429 { |
460 { |
430 /* merge if possible */ |
461 /* merge if possible */ |
431 U32 mergeId = ZDICT_checkMerge(table, elt, 0); |
462 U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer); |
432 if (mergeId) { |
463 if (mergeId) { |
433 U32 newMerge = 1; |
464 U32 newMerge = 1; |
434 while (newMerge) { |
465 while (newMerge) { |
435 newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId); |
466 newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer); |
436 if (newMerge) ZDICT_removeDictItem(table, mergeId); |
467 if (newMerge) ZDICT_removeDictItem(table, mergeId); |
437 mergeId = newMerge; |
468 mergeId = newMerge; |
438 } |
469 } |
439 return; |
470 return; |
440 } |
471 } |
519 { U32 cursor; for (cursor=0; cursor < bufferSize; ) { |
550 { U32 cursor; for (cursor=0; cursor < bufferSize; ) { |
520 dictItem solution; |
551 dictItem solution; |
521 if (doneMarks[cursor]) { cursor++; continue; } |
552 if (doneMarks[cursor]) { cursor++; continue; } |
522 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); |
553 solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel); |
523 if (solution.length==0) { cursor++; continue; } |
554 if (solution.length==0) { cursor++; continue; } |
524 ZDICT_insertDictItem(dictList, dictListSize, solution); |
555 ZDICT_insertDictItem(dictList, dictListSize, solution, buffer); |
525 cursor += solution.length; |
556 cursor += solution.length; |
526 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); |
557 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100); |
527 } } |
558 } } |
528 |
559 |
529 _cleanup: |
560 _cleanup: |
548 } |
579 } |
549 |
580 |
550 |
581 |
551 typedef struct |
582 typedef struct |
552 { |
583 { |
553 ZSTD_CCtx* ref; |
584 ZSTD_CCtx* ref; /* contains reference to dictionary */ |
554 ZSTD_CCtx* zc; |
585 ZSTD_CCtx* zc; /* working context */ |
555 void* workPlace; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */ |
586 void* workPlace; /* must be ZSTD_BLOCKSIZE_MAX allocated */ |
556 } EStats_ress_t; |
587 } EStats_ress_t; |
557 |
588 |
558 #define MAXREPOFFSET 1024 |
589 #define MAXREPOFFSET 1024 |
559 |
590 |
560 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, |
591 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, |
561 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, |
592 U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, |
562 const void* src, size_t srcSize, U32 notificationLevel) |
593 const void* src, size_t srcSize, |
563 { |
594 U32 notificationLevel) |
564 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog); |
595 { |
|
596 size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); |
565 size_t cSize; |
597 size_t cSize; |
566 |
598 |
567 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ |
599 if (srcSize > blockSizeMax) srcSize = blockSizeMax; /* protection vs large samples */ |
568 { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); |
600 { size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); |
569 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } |
601 if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } |
570 } |
602 } |
571 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize); |
603 cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); |
572 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } |
604 if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } |
573 |
605 |
574 if (cSize) { /* if == 0; block is not compressible */ |
606 if (cSize) { /* if == 0; block is not compressible */ |
575 const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); |
607 const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); |
576 |
608 |
577 /* literals stats */ |
609 /* literals stats */ |
578 { const BYTE* bytePtr; |
610 { const BYTE* bytePtr; |
579 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) |
611 for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++) |
580 countLit[*bytePtr]++; |
612 countLit[*bytePtr]++; |
672 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); |
705 size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles); |
673 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); |
706 size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles); |
674 BYTE* dstPtr = (BYTE*)dstBuffer; |
707 BYTE* dstPtr = (BYTE*)dstBuffer; |
675 |
708 |
676 /* init */ |
709 /* init */ |
|
710 DEBUGLOG(4, "ZDICT_analyzeEntropy"); |
677 esr.ref = ZSTD_createCCtx(); |
711 esr.ref = ZSTD_createCCtx(); |
678 esr.zc = ZSTD_createCCtx(); |
712 esr.zc = ZSTD_createCCtx(); |
679 esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX); |
713 esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); |
680 if (!esr.ref || !esr.zc || !esr.workPlace) { |
714 if (!esr.ref || !esr.zc || !esr.workPlace) { |
681 eSize = ERROR(memory_allocation); |
715 eSize = ERROR(memory_allocation); |
682 DISPLAYLEVEL(1, "Not enough memory \n"); |
716 DISPLAYLEVEL(1, "Not enough memory \n"); |
683 goto _cleanup; |
717 goto _cleanup; |
684 } |
718 } |
685 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; } /* too large dictionary */ |
719 if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } /* too large dictionary */ |
686 for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */ |
720 for (u=0; u<256; u++) countLit[u] = 1; /* any character must be described */ |
687 for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1; |
721 for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1; |
688 for (u=0; u<=MaxML; u++) matchLengthCount[u]=1; |
722 for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1; |
689 for (u=0; u<=MaxLL; u++) litLengthCount[u]=1; |
723 for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1; |
690 memset(repOffset, 0, sizeof(repOffset)); |
724 memset(repOffset, 0, sizeof(repOffset)); |
691 repOffset[1] = repOffset[4] = repOffset[8] = 1; |
725 repOffset[1] = repOffset[4] = repOffset[8] = 1; |
692 memset(bestRepOffset, 0, sizeof(bestRepOffset)); |
726 memset(bestRepOffset, 0, sizeof(bestRepOffset)); |
693 if (compressionLevel==0) compressionLevel=g_compressionLevel_default; |
727 if (compressionLevel<=0) compressionLevel = g_compressionLevel_default; |
694 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); |
728 params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize); |
695 { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); |
729 { size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0); |
696 if (ZSTD_isError(beginResult)) { |
730 if (ZSTD_isError(beginResult)) { |
|
731 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult)); |
697 eSize = ERROR(GENERIC); |
732 eSize = ERROR(GENERIC); |
698 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n"); |
|
699 goto _cleanup; |
733 goto _cleanup; |
700 } } |
734 } } |
701 |
735 |
702 /* collect stats on all files */ |
736 /* collect stats on all samples */ |
703 for (u=0; u<nbFiles; u++) { |
737 for (u=0; u<nbFiles; u++) { |
704 ZDICT_countEStats(esr, params, |
738 ZDICT_countEStats(esr, params, |
705 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, |
739 countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, |
706 (const char*)srcBuffer + pos, fileSizes[u], |
740 (const char*)srcBuffer + pos, fileSizes[u], |
707 notificationLevel); |
741 notificationLevel); |
708 pos += fileSizes[u]; |
742 pos += fileSizes[u]; |
709 } |
743 } |
710 |
744 |
711 /* analyze */ |
745 /* analyze, build stats, starting with literals */ |
712 errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); |
746 { size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); |
713 if (HUF_isError(errorCode)) { |
747 if (HUF_isError(maxNbBits)) { |
714 eSize = ERROR(GENERIC); |
748 eSize = ERROR(GENERIC); |
715 DISPLAYLEVEL(1, "HUF_buildCTable error \n"); |
749 DISPLAYLEVEL(1, " HUF_buildCTable error \n"); |
716 goto _cleanup; |
750 goto _cleanup; |
717 } |
751 } |
718 huffLog = (U32)errorCode; |
752 if (maxNbBits==8) { /* not compressible : will fail on HUF_writeCTable() */ |
|
753 DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n"); |
|
754 ZDICT_flatLit(countLit); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ |
|
755 maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); |
|
756 assert(maxNbBits==9); |
|
757 } |
|
758 huffLog = (U32)maxNbBits; |
|
759 } |
719 |
760 |
720 /* looking for most common first offsets */ |
761 /* looking for most common first offsets */ |
721 { U32 offset; |
762 { U32 offset; |
722 for (offset=1; offset<MAXREPOFFSET; offset++) |
763 for (offset=1; offset<MAXREPOFFSET; offset++) |
723 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); |
764 ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]); |
829 const void* customDictContent, size_t dictContentSize, |
869 const void* customDictContent, size_t dictContentSize, |
830 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
870 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
831 ZDICT_params_t params) |
871 ZDICT_params_t params) |
832 { |
872 { |
833 size_t hSize; |
873 size_t hSize; |
834 #define HBUFFSIZE 256 |
874 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */ |
835 BYTE header[HBUFFSIZE]; |
875 BYTE header[HBUFFSIZE]; |
836 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; |
876 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; |
837 U32 const notificationLevel = params.notificationLevel; |
877 U32 const notificationLevel = params.notificationLevel; |
838 |
878 |
839 /* check conditions */ |
879 /* check conditions */ |
|
880 DEBUGLOG(4, "ZDICT_finalizeDictionary"); |
840 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); |
881 if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); |
841 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); |
882 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); |
842 if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); |
883 if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); |
843 |
884 |
844 /* dictionary header */ |
885 /* dictionary header */ |
845 MEM_writeLE32(header, ZSTD_DICT_MAGIC); |
886 MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY); |
846 { U64 const randomID = XXH64(customDictContent, dictContentSize, 0); |
887 { U64 const randomID = XXH64(customDictContent, dictContentSize, 0); |
847 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; |
888 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; |
848 U32 const dictID = params.dictID ? params.dictID : compliantID; |
889 U32 const dictID = params.dictID ? params.dictID : compliantID; |
849 MEM_writeLE32(header+4, dictID); |
890 MEM_writeLE32(header+4, dictID); |
850 } |
891 } |
875 |
916 |
876 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
917 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
877 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
918 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
878 ZDICT_params_t params) |
919 ZDICT_params_t params) |
879 { |
920 { |
880 size_t hSize; |
|
881 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; |
921 int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel; |
882 U32 const notificationLevel = params.notificationLevel; |
922 U32 const notificationLevel = params.notificationLevel; |
883 |
923 size_t hSize = 8; |
884 /* dictionary header */ |
924 |
885 MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC); |
925 /* calculate entropy tables */ |
886 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); |
|
887 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; |
|
888 U32 const dictID = params.dictID ? params.dictID : compliantID; |
|
889 MEM_writeLE32((char*)dictBuffer+4, dictID); |
|
890 } |
|
891 hSize = 8; |
|
892 |
|
893 /* entropy tables */ |
|
894 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ |
926 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ |
895 DISPLAYLEVEL(2, "statistics ... \n"); |
927 DISPLAYLEVEL(2, "statistics ... \n"); |
896 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, |
928 { size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize, |
897 compressionLevel, |
929 compressionLevel, |
898 samplesBuffer, samplesSizes, nbSamples, |
930 samplesBuffer, samplesSizes, nbSamples, |
900 notificationLevel); |
932 notificationLevel); |
901 if (ZDICT_isError(eSize)) return eSize; |
933 if (ZDICT_isError(eSize)) return eSize; |
902 hSize += eSize; |
934 hSize += eSize; |
903 } |
935 } |
904 |
936 |
|
937 /* add dictionary header (after entropy tables) */ |
|
938 MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY); |
|
939 { U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0); |
|
940 U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768; |
|
941 U32 const dictID = params.dictID ? params.dictID : compliantID; |
|
942 MEM_writeLE32((char*)dictBuffer+4, dictID); |
|
943 } |
905 |
944 |
906 if (hSize + dictContentSize < dictBufferCapacity) |
945 if (hSize + dictContentSize < dictBufferCapacity) |
907 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); |
946 memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize); |
908 return MIN(dictBufferCapacity, hSize+dictContentSize); |
947 return MIN(dictBufferCapacity, hSize+dictContentSize); |
909 } |
948 } |
910 |
949 |
911 |
950 |
912 /*! ZDICT_trainFromBuffer_unsafe() : |
951 /*! ZDICT_trainFromBuffer_unsafe_legacy() : |
913 * Warning : `samplesBuffer` must be followed by noisy guard band. |
952 * Warning : `samplesBuffer` must be followed by noisy guard band. |
914 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() |
953 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() |
915 */ |
954 */ |
916 size_t ZDICT_trainFromBuffer_unsafe( |
955 size_t ZDICT_trainFromBuffer_unsafe_legacy( |
917 void* dictBuffer, size_t maxDictSize, |
956 void* dictBuffer, size_t maxDictSize, |
918 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
957 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
919 ZDICT_params_t params) |
958 ZDICT_legacy_params_t params) |
920 { |
959 { |
921 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); |
960 U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16)); |
922 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); |
961 dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList)); |
923 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; |
962 unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel; |
924 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; |
963 unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity; |
925 size_t const targetDictSize = maxDictSize; |
964 size_t const targetDictSize = maxDictSize; |
926 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); |
965 size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); |
927 size_t dictSize = 0; |
966 size_t dictSize = 0; |
928 U32 const notificationLevel = params.notificationLevel; |
967 U32 const notificationLevel = params.zParams.notificationLevel; |
929 |
968 |
930 /* checks */ |
969 /* checks */ |
931 if (!dictList) return ERROR(memory_allocation); |
970 if (!dictList) return ERROR(memory_allocation); |
932 if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); } |
971 if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } /* requested dictionary size is too small */ |
933 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; } /* not enough source to create dictionary */ |
972 if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } /* not enough source to create dictionary */ |
934 |
973 |
935 /* init */ |
974 /* init */ |
936 ZDICT_initDictItem(dictList); |
975 ZDICT_initDictItem(dictList); |
937 |
976 |
938 /* build dictionary */ |
977 /* build dictionary */ |
939 ZDICT_trainBuffer(dictList, dictListSize, |
978 ZDICT_trainBuffer_legacy(dictList, dictListSize, |
940 samplesBuffer, samplesBuffSize, |
979 samplesBuffer, samplesBuffSize, |
941 samplesSizes, nbSamples, |
980 samplesSizes, nbSamples, |
942 minRep, notificationLevel); |
981 minRep, notificationLevel); |
943 |
982 |
944 /* display best matches */ |
983 /* display best matches */ |
945 if (params.notificationLevel>= 3) { |
984 if (params.zParams.notificationLevel>= 3) { |
946 U32 const nb = MIN(25, dictList[0].pos); |
985 U32 const nb = MIN(25, dictList[0].pos); |
947 U32 const dictContentSize = ZDICT_dictSize(dictList); |
986 U32 const dictContentSize = ZDICT_dictSize(dictList); |
948 U32 u; |
987 U32 u; |
949 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize); |
988 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize); |
950 DISPLAYLEVEL(3, "list %u best segments \n", nb-1); |
989 DISPLAYLEVEL(3, "list %u best segments \n", nb-1); |
961 } } |
1000 } } |
962 |
1001 |
963 |
1002 |
964 /* create dictionary */ |
1003 /* create dictionary */ |
965 { U32 dictContentSize = ZDICT_dictSize(dictList); |
1004 { U32 dictContentSize = ZDICT_dictSize(dictList); |
966 if (dictContentSize < targetDictSize/3) { |
1005 if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } /* dictionary content too small */ |
|
1006 if (dictContentSize < targetDictSize/4) { |
967 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); |
1007 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize); |
|
1008 if (samplesBuffSize < 10 * targetDictSize) |
|
1009 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20)); |
968 if (minRep > MINRATIO) { |
1010 if (minRep > MINRATIO) { |
969 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); |
1011 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1); |
970 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); |
1012 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n"); |
971 } |
1013 } |
972 if (samplesBuffSize < 10 * targetDictSize) |
|
973 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20)); |
|
974 } |
1014 } |
975 |
1015 |
976 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { |
1016 if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) { |
977 U32 proposedSelectivity = selectivity-1; |
1017 U32 proposedSelectivity = selectivity-1; |
978 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } |
1018 while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; } |
979 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize); |
1019 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize); |
980 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); |
1020 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity); |
981 DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n"); |
1021 DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n"); |
982 } |
1022 } |
983 |
1023 |
984 /* limit dictionary size */ |
1024 /* limit dictionary size */ |
985 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ |
1025 { U32 const max = dictList->pos; /* convention : nb of useful elts within dictList */ |
986 U32 currentSize = 0; |
1026 U32 currentSize = 0; |
1002 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); |
1042 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l); |
1003 } } |
1043 } } |
1004 |
1044 |
1005 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, |
1045 dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize, |
1006 samplesBuffer, samplesSizes, nbSamples, |
1046 samplesBuffer, samplesSizes, nbSamples, |
1007 params); |
1047 params.zParams); |
1008 } |
1048 } |
1009 |
1049 |
1010 /* clean up */ |
1050 /* clean up */ |
1011 free(dictList); |
1051 free(dictList); |
1012 return dictSize; |
1052 return dictSize; |
1013 } |
1053 } |
1014 |
1054 |
1015 |
1055 |
1016 /* issue : samplesBuffer need to be followed by a noisy guard band. |
1056 /* ZDICT_trainFromBuffer_legacy() : |
1017 * work around : duplicate the buffer, and add the noise */ |
1057 * issue : samplesBuffer need to be followed by a noisy guard band. |
1018 size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity, |
1058 * work around : duplicate the buffer, and add the noise */ |
1019 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
1059 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, |
1020 ZDICT_params_t params) |
1060 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
|
1061 ZDICT_legacy_params_t params) |
1021 { |
1062 { |
1022 size_t result; |
1063 size_t result; |
1023 void* newBuff; |
1064 void* newBuff; |
1024 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); |
1065 size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples); |
1025 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ |
1066 if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0; /* not enough content => no dictionary */ |
1028 if (!newBuff) return ERROR(memory_allocation); |
1069 if (!newBuff) return ERROR(memory_allocation); |
1029 |
1070 |
1030 memcpy(newBuff, samplesBuffer, sBuffSize); |
1071 memcpy(newBuff, samplesBuffer, sBuffSize); |
1031 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ |
1072 ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */ |
1032 |
1073 |
1033 result = ZDICT_trainFromBuffer_unsafe( |
1074 result = |
1034 dictBuffer, dictBufferCapacity, |
1075 ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff, |
1035 newBuff, samplesSizes, nbSamples, |
1076 samplesSizes, nbSamples, params); |
1036 params); |
|
1037 free(newBuff); |
1077 free(newBuff); |
1038 return result; |
1078 return result; |
1039 } |
1079 } |
1040 |
1080 |
1041 |
1081 |
1042 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, |
1082 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, |
1043 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) |
1083 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) |
1044 { |
1084 { |
1045 ZDICT_params_t params; |
1085 ZDICT_cover_params_t params; |
|
1086 DEBUGLOG(3, "ZDICT_trainFromBuffer"); |
1046 memset(¶ms, 0, sizeof(params)); |
1087 memset(¶ms, 0, sizeof(params)); |
1047 return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity, |
1088 params.d = 8; |
1048 samplesBuffer, samplesSizes, nbSamples, |
1089 params.steps = 4; |
1049 params); |
1090 /* Default to level 6 since no compression level information is available */ |
|
1091 params.zParams.compressionLevel = 6; |
|
1092 #if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1) |
|
1093 params.zParams.notificationLevel = ZSTD_DEBUG; |
|
1094 #endif |
|
1095 return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, |
|
1096 samplesBuffer, samplesSizes, nbSamples, |
|
1097 ¶ms); |
1050 } |
1098 } |
1051 |
1099 |
1052 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
1100 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
1053 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) |
1101 const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) |
1054 { |
1102 { |
1055 ZDICT_params_t params; |
1103 ZDICT_params_t params; |
1056 memset(¶ms, 0, sizeof(params)); |
1104 memset(¶ms, 0, sizeof(params)); |
1057 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, |
1105 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity, |
1058 samplesBuffer, samplesSizes, nbSamples, |
1106 samplesBuffer, samplesSizes, nbSamples, |