contrib/python-zstandard/zstd/dictBuilder/zdict.c
changeset 37495 b1fb341d8a61
parent 30895 c32454d69b85
child 40121 73fef626dae3
equal deleted inserted replaced
37494:1ce7a55b09d1 37495:b1fb341d8a61
     1 /**
     1 /*
     2  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
     2  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
     3  * All rights reserved.
     3  * All rights reserved.
     4  *
     4  *
     5  * This source code is licensed under the BSD-style license found in the
     5  * This source code is licensed under both the BSD-style license (found in the
     6  * LICENSE file in the root directory of this source tree. An additional grant
     6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
     7  * of patent rights can be found in the PATENTS file in the same directory.
     7  * in the COPYING file in the root directory of this source tree).
       
     8  * You may select, at your option, one of the above-listed licenses.
     8  */
     9  */
     9 
    10 
    10 
    11 
    11 /*-**************************************
    12 /*-**************************************
    12 *  Tuning parameters
    13 *  Tuning parameters
    13 ****************************************/
    14 ****************************************/
       
    15 #define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
    14 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
    16 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
    15 #define ZDICT_MIN_SAMPLES_SIZE 512
    17 #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
    16 
    18 
    17 
    19 
    18 /*-**************************************
    20 /*-**************************************
    19 *  Compiler Options
    21 *  Compiler Options
    20 ****************************************/
    22 ****************************************/
    57 
    59 
    58 #define DICTLISTSIZE_DEFAULT 10000
    60 #define DICTLISTSIZE_DEFAULT 10000
    59 
    61 
    60 #define NOISELENGTH 32
    62 #define NOISELENGTH 32
    61 
    63 
    62 #define MINRATIO 4
    64 static const int g_compressionLevel_default = 3;
    63 static const int g_compressionLevel_default = 6;
       
    64 static const U32 g_selectivity_default = 9;
    65 static const U32 g_selectivity_default = 9;
    65 static const size_t g_provision_entropySize = 200;
       
    66 static const size_t g_min_fast_dictContent = 192;
       
    67 
    66 
    68 
    67 
    69 /*-*************************************
    68 /*-*************************************
    70 *  Console display
    69 *  Console display
    71 ***************************************/
    70 ***************************************/
    94 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
    93 const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
    95 
    94 
    96 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
    95 unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
    97 {
    96 {
    98     if (dictSize < 8) return 0;
    97     if (dictSize < 8) return 0;
    99     if (MEM_readLE32(dictBuffer) != ZSTD_DICT_MAGIC) return 0;
    98     if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return 0;
   100     return MEM_readLE32((const char*)dictBuffer + 4);
    99     return MEM_readLE32((const char*)dictBuffer + 4);
   101 }
   100 }
   102 
   101 
   103 
   102 
   104 /*-********************************************************
   103 /*-********************************************************
   105 *  Dictionary training functions
   104 *  Dictionary training functions
   106 **********************************************************/
   105 **********************************************************/
   107 static unsigned ZDICT_NbCommonBytes (register size_t val)
   106 static unsigned ZDICT_NbCommonBytes (size_t val)
   108 {
   107 {
   109     if (MEM_isLittleEndian()) {
   108     if (MEM_isLittleEndian()) {
   110         if (MEM_64bits()) {
   109         if (MEM_64bits()) {
   111 #       if defined(_MSC_VER) && defined(_WIN64)
   110 #       if defined(_MSC_VER) && defined(_WIN64)
   112             unsigned long r = 0;
   111             unsigned long r = 0;
   206 {
   205 {
   207     U32 lengthList[LLIMIT] = {0};
   206     U32 lengthList[LLIMIT] = {0};
   208     U32 cumulLength[LLIMIT] = {0};
   207     U32 cumulLength[LLIMIT] = {0};
   209     U32 savings[LLIMIT] = {0};
   208     U32 savings[LLIMIT] = {0};
   210     const BYTE* b = (const BYTE*)buffer;
   209     const BYTE* b = (const BYTE*)buffer;
   211     size_t length;
       
   212     size_t maxLength = LLIMIT;
   210     size_t maxLength = LLIMIT;
   213     size_t pos = suffix[start];
   211     size_t pos = suffix[start];
   214     U32 end = start;
   212     U32 end = start;
   215     dictItem solution;
   213     dictItem solution;
   216 
   214 
   221     /* trivial repetition cases */
   219     /* trivial repetition cases */
   222     if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
   220     if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2))
   223        ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
   221        ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3))
   224        ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
   222        ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) {
   225         /* skip and mark segment */
   223         /* skip and mark segment */
   226         U16 u16 = MEM_read16(b+pos+4);
   224         U16 const pattern16 = MEM_read16(b+pos+4);
   227         U32 u, e = 6;
   225         U32 u, patternEnd = 6;
   228         while (MEM_read16(b+pos+e) == u16) e+=2 ;
   226         while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ;
   229         if (b[pos+e] == b[pos+e-1]) e++;
   227         if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++;
   230         for (u=1; u<e; u++)
   228         for (u=1; u<patternEnd; u++)
   231             doneMarks[pos+u] = 1;
   229             doneMarks[pos+u] = 1;
   232         return solution;
   230         return solution;
   233     }
   231     }
   234 
   232 
   235     /* look forward */
   233     /* look forward */
   236     do {
   234     {   size_t length;
   237         end++;
   235         do {
   238         length = ZDICT_count(b + pos, b + suffix[end]);
   236             end++;
   239     } while (length >=MINMATCHLENGTH);
   237             length = ZDICT_count(b + pos, b + suffix[end]);
       
   238         } while (length >= MINMATCHLENGTH);
       
   239     }
   240 
   240 
   241     /* look backward */
   241     /* look backward */
   242     do {
   242     {   size_t length;
   243         length = ZDICT_count(b + pos, b + *(suffix+start-1));
   243         do {
   244         if (length >=MINMATCHLENGTH) start--;
   244             length = ZDICT_count(b + pos, b + *(suffix+start-1));
   245     } while(length >= MINMATCHLENGTH);
   245             if (length >=MINMATCHLENGTH) start--;
       
   246         } while(length >= MINMATCHLENGTH);
       
   247     }
   246 
   248 
   247     /* exit if not found a minimum nb of repetitions */
   249     /* exit if not found a minimum nb of repetitions */
   248     if (end-start < minRatio) {
   250     if (end-start < minRatio) {
   249         U32 idx;
   251         U32 idx;
   250         for(idx=start; idx<end; idx++)
   252         for(idx=start; idx<end; idx++)
   267             U32 currentID = refinedStart;
   269             U32 currentID = refinedStart;
   268             U32 id;
   270             U32 id;
   269             U32 selectedCount = 0;
   271             U32 selectedCount = 0;
   270             U32 selectedID = currentID;
   272             U32 selectedID = currentID;
   271             for (id =refinedStart; id < refinedEnd; id++) {
   273             for (id =refinedStart; id < refinedEnd; id++) {
   272                 if (b[ suffix[id] + searchLength] != currentChar) {
   274                 if (b[suffix[id] + searchLength] != currentChar) {
   273                     if (currentCount > selectedCount) {
   275                     if (currentCount > selectedCount) {
   274                         selectedCount = currentCount;
   276                         selectedCount = currentCount;
   275                         selectedID = currentID;
   277                         selectedID = currentID;
   276                     }
   278                     }
   277                     currentID = id;
   279                     currentID = id;
   296         pos = suffix[refinedStart];
   298         pos = suffix[refinedStart];
   297         end = start;
   299         end = start;
   298         memset(lengthList, 0, sizeof(lengthList));
   300         memset(lengthList, 0, sizeof(lengthList));
   299 
   301 
   300         /* look forward */
   302         /* look forward */
   301         do {
   303         {   size_t length;
   302             end++;
   304             do {
   303             length = ZDICT_count(b + pos, b + suffix[end]);
   305                 end++;
   304             if (length >= LLIMIT) length = LLIMIT-1;
   306                 length = ZDICT_count(b + pos, b + suffix[end]);
   305             lengthList[length]++;
   307                 if (length >= LLIMIT) length = LLIMIT-1;
   306         } while (length >=MINMATCHLENGTH);
   308                 lengthList[length]++;
       
   309             } while (length >=MINMATCHLENGTH);
       
   310         }
   307 
   311 
   308         /* look backward */
   312         /* look backward */
   309         length = MINMATCHLENGTH;
   313         {   size_t length = MINMATCHLENGTH;
   310         while ((length >= MINMATCHLENGTH) & (start > 0)) {
   314             while ((length >= MINMATCHLENGTH) & (start > 0)) {
   311         	length = ZDICT_count(b + pos, b + suffix[start - 1]);
   315                 length = ZDICT_count(b + pos, b + suffix[start - 1]);
   312         	if (length >= LLIMIT) length = LLIMIT - 1;
   316                 if (length >= LLIMIT) length = LLIMIT - 1;
   313         	lengthList[length]++;
   317                 lengthList[length]++;
   314         	if (length >= MINMATCHLENGTH) start--;
   318                 if (length >= MINMATCHLENGTH) start--;
       
   319             }
   315         }
   320         }
   316 
   321 
   317         /* largest useful length */
   322         /* largest useful length */
   318         memset(cumulLength, 0, sizeof(cumulLength));
   323         memset(cumulLength, 0, sizeof(cumulLength));
   319         cumulLength[maxLength-1] = lengthList[maxLength-1];
   324         cumulLength[maxLength-1] = lengthList[maxLength-1];
   344         solution.savings = savings[maxLength];
   349         solution.savings = savings[maxLength];
   345 
   350 
   346         /* mark positions done */
   351         /* mark positions done */
   347         {   U32 id;
   352         {   U32 id;
   348             for (id=start; id<end; id++) {
   353             for (id=start; id<end; id++) {
   349                 U32 p, pEnd;
   354                 U32 p, pEnd, length;
   350                 U32 const testedPos = suffix[id];
   355                 U32 const testedPos = suffix[id];
   351                 if (testedPos == pos)
   356                 if (testedPos == pos)
   352                     length = solution.length;
   357                     length = solution.length;
   353                 else {
   358                 else {
   354                     length = ZDICT_count(b+pos, b+testedPos);
   359                     length = (U32)ZDICT_count(b+pos, b+testedPos);
   355                     if (length > solution.length) length = solution.length;
   360                     if (length > solution.length) length = solution.length;
   356                 }
   361                 }
   357                 pEnd = (U32)(testedPos + length);
   362                 pEnd = (U32)(testedPos + length);
   358                 for (p=testedPos; p<pEnd; p++)
   363                 for (p=testedPos; p<pEnd; p++)
   359                     doneMarks[p] = 1;
   364                     doneMarks[p] = 1;
   361 
   366 
   362     return solution;
   367     return solution;
   363 }
   368 }
   364 
   369 
   365 
   370 
   366 /*! ZDICT_checkMerge
   371 static int isIncluded(const void* in, const void* container, size_t length)
       
   372 {
       
   373     const char* const ip = (const char*) in;
       
   374     const char* const into = (const char*) container;
       
   375     size_t u;
       
   376 
       
   377     for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
       
   378         if (ip[u] != into[u]) break;
       
   379     }
       
   380 
       
   381     return u==length;
       
   382 }
       
   383 
       
   384 /*! ZDICT_tryMerge() :
   367     check if dictItem can be merged, do it if possible
   385     check if dictItem can be merged, do it if possible
   368     @return : id of destination elt, 0 if not merged
   386     @return : id of destination elt, 0 if not merged
   369 */
   387 */
   370 static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
   388 static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
   371 {
   389 {
   372     const U32 tableSize = table->pos;
   390     const U32 tableSize = table->pos;
   373     const U32 eltEnd = elt.pos + elt.length;
   391     const U32 eltEnd = elt.pos + elt.length;
       
   392     const char* const buf = (const char*) buffer;
   374 
   393 
   375     /* tail overlap */
   394     /* tail overlap */
   376     U32 u; for (u=1; u<tableSize; u++) {
   395     U32 u; for (u=1; u<tableSize; u++) {
   377         if (u==eltNbToSkip) continue;
   396         if (u==eltNbToSkip) continue;
   378         if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
   397         if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) {  /* overlap, existing > new */
   379             /* append */
   398             /* append */
   380             U32 addedLength = table[u].pos - elt.pos;
   399             U32 const addedLength = table[u].pos - elt.pos;
   381             table[u].length += addedLength;
   400             table[u].length += addedLength;
   382             table[u].pos = elt.pos;
   401             table[u].pos = elt.pos;
   383             table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
   402             table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
   384             table[u].savings += elt.length / 8;    /* rough approx bonus */
   403             table[u].savings += elt.length / 8;    /* rough approx bonus */
   385             elt = table[u];
   404             elt = table[u];
   391     }   }
   410     }   }
   392 
   411 
   393     /* front overlap */
   412     /* front overlap */
   394     for (u=1; u<tableSize; u++) {
   413     for (u=1; u<tableSize; u++) {
   395         if (u==eltNbToSkip) continue;
   414         if (u==eltNbToSkip) continue;
       
   415 
   396         if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
   416         if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) {  /* overlap, existing < new */
   397             /* append */
   417             /* append */
   398             int addedLength = (int)eltEnd - (table[u].pos + table[u].length);
   418             int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
   399             table[u].savings += elt.length / 8;    /* rough approx bonus */
   419             table[u].savings += elt.length / 8;    /* rough approx bonus */
   400             if (addedLength > 0) {   /* otherwise, elt fully included into existing */
   420             if (addedLength > 0) {   /* otherwise, elt fully included into existing */
   401                 table[u].length += addedLength;
   421                 table[u].length += addedLength;
   402                 table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
   422                 table[u].savings += elt.savings * addedLength / elt.length;   /* rough approx */
   403             }
   423             }
   405             elt = table[u];
   425             elt = table[u];
   406             while ((u>1) && (table[u-1].savings < elt.savings))
   426             while ((u>1) && (table[u-1].savings < elt.savings))
   407                 table[u] = table[u-1], u--;
   427                 table[u] = table[u-1], u--;
   408             table[u] = elt;
   428             table[u] = elt;
   409             return u;
   429             return u;
   410     }   }
   430         }
       
   431 
       
   432         if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + 1)) {
       
   433             if (isIncluded(buf + table[u].pos, buf + elt.pos + 1, table[u].length)) {
       
   434                 size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , 1 );
       
   435                 table[u].pos = elt.pos;
       
   436                 table[u].savings += (U32)(elt.savings * addedLength / elt.length);
       
   437                 table[u].length = MIN(elt.length, table[u].length + 1);
       
   438                 return u;
       
   439             }
       
   440         }
       
   441     }
   411 
   442 
   412     return 0;
   443     return 0;
   413 }
   444 }
   414 
   445 
   415 
   446 
   416 static void ZDICT_removeDictItem(dictItem* table, U32 id)
   447 static void ZDICT_removeDictItem(dictItem* table, U32 id)
   417 {
   448 {
   418     /* convention : first element is nb of elts */
   449     /* convention : table[0].pos stores nb of elts */
   419     U32 const max = table->pos;
   450     U32 const max = table[0].pos;
   420     U32 u;
   451     U32 u;
   421     if (!id) return;   /* protection, should never happen */
   452     if (!id) return;   /* protection, should never happen */
   422     for (u=id; u<max-1; u++)
   453     for (u=id; u<max-1; u++)
   423         table[u] = table[u+1];
   454         table[u] = table[u+1];
   424     table->pos--;
   455     table->pos--;
   425 }
   456 }
   426 
   457 
   427 
   458 
   428 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
   459 static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
   429 {
   460 {
   430     /* merge if possible */
   461     /* merge if possible */
   431     U32 mergeId = ZDICT_checkMerge(table, elt, 0);
   462     U32 mergeId = ZDICT_tryMerge(table, elt, 0, buffer);
   432     if (mergeId) {
   463     if (mergeId) {
   433         U32 newMerge = 1;
   464         U32 newMerge = 1;
   434         while (newMerge) {
   465         while (newMerge) {
   435             newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
   466             newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
   436             if (newMerge) ZDICT_removeDictItem(table, mergeId);
   467             if (newMerge) ZDICT_removeDictItem(table, mergeId);
   437             mergeId = newMerge;
   468             mergeId = newMerge;
   438         }
   469         }
   439         return;
   470         return;
   440     }
   471     }
   461         dictSize += dictList[u].length;
   492         dictSize += dictList[u].length;
   462     return dictSize;
   493     return dictSize;
   463 }
   494 }
   464 
   495 
   465 
   496 
   466 static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
   497 static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
   467                             const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
   498                             const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
   468                             const size_t* fileSizes, unsigned nbFiles,
   499                             const size_t* fileSizes, unsigned nbFiles,
   469                             U32 minRatio, U32 notificationLevel)
   500                             U32 minRatio, U32 notificationLevel)
   470 {
   501 {
   471     int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
   502     int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
   478     clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
   509     clock_t const refreshRate = CLOCKS_PER_SEC * 3 / 10;
   479 
   510 
   480 #   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
   511 #   define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
   481             if (ZDICT_clockSpan(displayClock) > refreshRate)  \
   512             if (ZDICT_clockSpan(displayClock) > refreshRate)  \
   482             { displayClock = clock(); DISPLAY(__VA_ARGS__); \
   513             { displayClock = clock(); DISPLAY(__VA_ARGS__); \
   483             if (notificationLevel>=4) fflush(stdout); } }
   514             if (notificationLevel>=4) fflush(stderr); } }
   484 
   515 
   485     /* init */
   516     /* init */
   486     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
   517     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
   487     if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
   518     if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) {
   488         result = ERROR(memory_allocation);
   519         result = ERROR(memory_allocation);
   519     {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
   550     {   U32 cursor; for (cursor=0; cursor < bufferSize; ) {
   520             dictItem solution;
   551             dictItem solution;
   521             if (doneMarks[cursor]) { cursor++; continue; }
   552             if (doneMarks[cursor]) { cursor++; continue; }
   522             solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
   553             solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
   523             if (solution.length==0) { cursor++; continue; }
   554             if (solution.length==0) { cursor++; continue; }
   524             ZDICT_insertDictItem(dictList, dictListSize, solution);
   555             ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
   525             cursor += solution.length;
   556             cursor += solution.length;
   526             DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
   557             DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
   527     }   }
   558     }   }
   528 
   559 
   529 _cleanup:
   560 _cleanup:
   548 }
   579 }
   549 
   580 
   550 
   581 
   551 typedef struct
   582 typedef struct
   552 {
   583 {
   553     ZSTD_CCtx* ref;
   584     ZSTD_CCtx* ref;    /* contains reference to dictionary */
   554     ZSTD_CCtx* zc;
   585     ZSTD_CCtx* zc;     /* working context */
   555     void* workPlace;   /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
   586     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */
   556 } EStats_ress_t;
   587 } EStats_ress_t;
   557 
   588 
   558 #define MAXREPOFFSET 1024
   589 #define MAXREPOFFSET 1024
   559 
   590 
   560 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
   591 static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
   561                             U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
   592                               U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
   562                             const void* src, size_t srcSize, U32 notificationLevel)
   593                               const void* src, size_t srcSize,
   563 {
   594                               U32 notificationLevel)
   564     size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << params.cParams.windowLog);
   595 {
       
   596     size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog);
   565     size_t cSize;
   597     size_t cSize;
   566 
   598 
   567     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
   599     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */
   568     {  size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
   600     {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0);
   569             if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
   601         if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
   570     }
   602     }
   571     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize);
   603     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
   572     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
   604     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
   573 
   605 
   574     if (cSize) {  /* if == 0; block is not compressible */
   606     if (cSize) {  /* if == 0; block is not compressible */
   575         const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc);
   607         const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
   576 
   608 
   577         /* literals stats */
   609         /* literals stats */
   578         {   const BYTE* bytePtr;
   610         {   const BYTE* bytePtr;
   579             for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
   611             for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
   580                 countLit[*bytePtr]++;
   612                 countLit[*bytePtr]++;
   608                 repOffsets[offset1] += 3;
   640                 repOffsets[offset1] += 3;
   609                 repOffsets[offset2] += 1;
   641                 repOffsets[offset2] += 1;
   610     }   }   }
   642     }   }   }
   611 }
   643 }
   612 
   644 
   613 /*
       
   614 static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
       
   615 {
       
   616     unsigned u;
       
   617     size_t max=0;
       
   618     for (u=0; u<nbFiles; u++)
       
   619         if (max < fileSizes[u]) max = fileSizes[u];
       
   620     return max;
       
   621 }
       
   622 */
       
   623 
       
   624 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
   645 static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
   625 {
   646 {
   626     size_t total=0;
   647     size_t total=0;
   627     unsigned u;
   648     unsigned u;
   628     for (u=0; u<nbFiles; u++) total += fileSizes[u];
   649     for (u=0; u<nbFiles; u++) total += fileSizes[u];
   643         table[u-1] = table[u];
   664         table[u-1] = table[u];
   644         table[u] = tmp;
   665         table[u] = tmp;
   645     }
   666     }
   646 }
   667 }
   647 
   668 
       
   669 /* ZDICT_flatLit() :
       
   670  * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
       
   671  * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
       
   672  */
       
   673 static void ZDICT_flatLit(U32* countLit)
       
   674 {
       
   675     int u;
       
   676     for (u=1; u<256; u++) countLit[u] = 2;
       
   677     countLit[0]   = 4;
       
   678     countLit[253] = 1;
       
   679     countLit[254] = 1;
       
   680 }
   648 
   681 
   649 #define OFFCODE_MAX 30  /* only applicable to first block */
   682 #define OFFCODE_MAX 30  /* only applicable to first block */
   650 static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
   683 static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize,
   651                                    unsigned compressionLevel,
   684                                    unsigned compressionLevel,
   652                              const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
   685                              const void*  srcBuffer, const size_t* fileSizes, unsigned nbFiles,
   672     size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
   705     size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
   673     size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
   706     size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
   674     BYTE* dstPtr = (BYTE*)dstBuffer;
   707     BYTE* dstPtr = (BYTE*)dstBuffer;
   675 
   708 
   676     /* init */
   709     /* init */
       
   710     DEBUGLOG(4, "ZDICT_analyzeEntropy");
   677     esr.ref = ZSTD_createCCtx();
   711     esr.ref = ZSTD_createCCtx();
   678     esr.zc = ZSTD_createCCtx();
   712     esr.zc = ZSTD_createCCtx();
   679     esr.workPlace = malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX);
   713     esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
   680     if (!esr.ref || !esr.zc || !esr.workPlace) {
   714     if (!esr.ref || !esr.zc || !esr.workPlace) {
   681         eSize = ERROR(memory_allocation);
   715         eSize = ERROR(memory_allocation);
   682         DISPLAYLEVEL(1, "Not enough memory \n");
   716         DISPLAYLEVEL(1, "Not enough memory \n");
   683         goto _cleanup;
   717         goto _cleanup;
   684     }
   718     }
   685     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionary_wrong); goto _cleanup; }   /* too large dictionary */
   719     if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; }   /* too large dictionary */
   686     for (u=0; u<256; u++) countLit[u]=1;   /* any character must be described */
   720     for (u=0; u<256; u++) countLit[u] = 1;   /* any character must be described */
   687     for (u=0; u<=offcodeMax; u++) offcodeCount[u]=1;
   721     for (u=0; u<=offcodeMax; u++) offcodeCount[u] = 1;
   688     for (u=0; u<=MaxML; u++) matchLengthCount[u]=1;
   722     for (u=0; u<=MaxML; u++) matchLengthCount[u] = 1;
   689     for (u=0; u<=MaxLL; u++) litLengthCount[u]=1;
   723     for (u=0; u<=MaxLL; u++) litLengthCount[u] = 1;
   690     memset(repOffset, 0, sizeof(repOffset));
   724     memset(repOffset, 0, sizeof(repOffset));
   691     repOffset[1] = repOffset[4] = repOffset[8] = 1;
   725     repOffset[1] = repOffset[4] = repOffset[8] = 1;
   692     memset(bestRepOffset, 0, sizeof(bestRepOffset));
   726     memset(bestRepOffset, 0, sizeof(bestRepOffset));
   693     if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
   727     if (compressionLevel<=0) compressionLevel = g_compressionLevel_default;
   694     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
   728     params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
   695     {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
   729     {   size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, 0);
   696             if (ZSTD_isError(beginResult)) {
   730         if (ZSTD_isError(beginResult)) {
       
   731             DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
   697             eSize = ERROR(GENERIC);
   732             eSize = ERROR(GENERIC);
   698             DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
       
   699             goto _cleanup;
   733             goto _cleanup;
   700     }   }
   734     }   }
   701 
   735 
   702     /* collect stats on all files */
   736     /* collect stats on all samples */
   703     for (u=0; u<nbFiles; u++) {
   737     for (u=0; u<nbFiles; u++) {
   704         ZDICT_countEStats(esr, params,
   738         ZDICT_countEStats(esr, params,
   705                           countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
   739                           countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
   706                          (const char*)srcBuffer + pos, fileSizes[u],
   740                          (const char*)srcBuffer + pos, fileSizes[u],
   707                           notificationLevel);
   741                           notificationLevel);
   708         pos += fileSizes[u];
   742         pos += fileSizes[u];
   709     }
   743     }
   710 
   744 
   711     /* analyze */
   745     /* analyze, build stats, starting with literals */
   712     errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog);
   746     {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
   713     if (HUF_isError(errorCode)) {
   747         if (HUF_isError(maxNbBits)) {
   714         eSize = ERROR(GENERIC);
   748             eSize = ERROR(GENERIC);
   715         DISPLAYLEVEL(1, "HUF_buildCTable error \n");
   749             DISPLAYLEVEL(1, " HUF_buildCTable error \n");
   716         goto _cleanup;
   750             goto _cleanup;
   717     }
   751         }
   718     huffLog = (U32)errorCode;
   752         if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */
       
   753             DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
       
   754             ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
       
   755             maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog);
       
   756             assert(maxNbBits==9);
       
   757         }
       
   758         huffLog = (U32)maxNbBits;
       
   759     }
   719 
   760 
   720     /* looking for most common first offsets */
   761     /* looking for most common first offsets */
   721     {   U32 offset;
   762     {   U32 offset;
   722         for (offset=1; offset<MAXREPOFFSET; offset++)
   763         for (offset=1; offset<MAXREPOFFSET; offset++)
   723             ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
   764             ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
   810        as the impact of statistics is not properly evaluated */
   851        as the impact of statistics is not properly evaluated */
   811     MEM_writeLE32(dstPtr+0, repStartValue[0]);
   852     MEM_writeLE32(dstPtr+0, repStartValue[0]);
   812     MEM_writeLE32(dstPtr+4, repStartValue[1]);
   853     MEM_writeLE32(dstPtr+4, repStartValue[1]);
   813     MEM_writeLE32(dstPtr+8, repStartValue[2]);
   854     MEM_writeLE32(dstPtr+8, repStartValue[2]);
   814 #endif
   855 #endif
   815     //dstPtr += 12;
       
   816     eSize += 12;
   856     eSize += 12;
   817 
   857 
   818 _cleanup:
   858 _cleanup:
   819     ZSTD_freeCCtx(esr.ref);
   859     ZSTD_freeCCtx(esr.ref);
   820     ZSTD_freeCCtx(esr.zc);
   860     ZSTD_freeCCtx(esr.zc);
   829                           const void* customDictContent, size_t dictContentSize,
   869                           const void* customDictContent, size_t dictContentSize,
   830                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
   870                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
   831                           ZDICT_params_t params)
   871                           ZDICT_params_t params)
   832 {
   872 {
   833     size_t hSize;
   873     size_t hSize;
   834 #define HBUFFSIZE 256
   874 #define HBUFFSIZE 256   /* should prove large enough for all entropy headers */
   835     BYTE header[HBUFFSIZE];
   875     BYTE header[HBUFFSIZE];
   836     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
   876     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
   837     U32 const notificationLevel = params.notificationLevel;
   877     U32 const notificationLevel = params.notificationLevel;
   838 
   878 
   839     /* check conditions */
   879     /* check conditions */
       
   880     DEBUGLOG(4, "ZDICT_finalizeDictionary");
   840     if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
   881     if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
   841     if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
   882     if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
   842     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
   883     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
   843 
   884 
   844     /* dictionary header */
   885     /* dictionary header */
   845     MEM_writeLE32(header, ZSTD_DICT_MAGIC);
   886     MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
   846     {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
   887     {   U64 const randomID = XXH64(customDictContent, dictContentSize, 0);
   847         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
   888         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
   848         U32 const dictID = params.dictID ? params.dictID : compliantID;
   889         U32 const dictID = params.dictID ? params.dictID : compliantID;
   849         MEM_writeLE32(header+4, dictID);
   890         MEM_writeLE32(header+4, dictID);
   850     }
   891     }
   875 
   916 
   876 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
   917 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
   877                                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
   918                                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
   878                                                  ZDICT_params_t params)
   919                                                  ZDICT_params_t params)
   879 {
   920 {
   880     size_t hSize;
       
   881     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
   921     int const compressionLevel = (params.compressionLevel <= 0) ? g_compressionLevel_default : params.compressionLevel;
   882     U32 const notificationLevel = params.notificationLevel;
   922     U32 const notificationLevel = params.notificationLevel;
   883 
   923     size_t hSize = 8;
   884     /* dictionary header */
   924 
   885     MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
   925     /* calculate entropy tables */
   886     {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
       
   887         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
       
   888         U32 const dictID = params.dictID ? params.dictID : compliantID;
       
   889         MEM_writeLE32((char*)dictBuffer+4, dictID);
       
   890     }
       
   891     hSize = 8;
       
   892 
       
   893     /* entropy tables */
       
   894     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
   926     DISPLAYLEVEL(2, "\r%70s\r", "");   /* clean display line */
   895     DISPLAYLEVEL(2, "statistics ... \n");
   927     DISPLAYLEVEL(2, "statistics ... \n");
   896     {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
   928     {   size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
   897                                   compressionLevel,
   929                                   compressionLevel,
   898                                   samplesBuffer, samplesSizes, nbSamples,
   930                                   samplesBuffer, samplesSizes, nbSamples,
   900                                   notificationLevel);
   932                                   notificationLevel);
   901         if (ZDICT_isError(eSize)) return eSize;
   933         if (ZDICT_isError(eSize)) return eSize;
   902         hSize += eSize;
   934         hSize += eSize;
   903     }
   935     }
   904 
   936 
       
   937     /* add dictionary header (after entropy tables) */
       
   938     MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
       
   939     {   U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, 0);
       
   940         U32 const compliantID = (randomID % ((1U<<31)-32768)) + 32768;
       
   941         U32 const dictID = params.dictID ? params.dictID : compliantID;
       
   942         MEM_writeLE32((char*)dictBuffer+4, dictID);
       
   943     }
   905 
   944 
   906     if (hSize + dictContentSize < dictBufferCapacity)
   945     if (hSize + dictContentSize < dictBufferCapacity)
   907         memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
   946         memmove((char*)dictBuffer + hSize, (char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
   908     return MIN(dictBufferCapacity, hSize+dictContentSize);
   947     return MIN(dictBufferCapacity, hSize+dictContentSize);
   909 }
   948 }
   910 
   949 
   911 
   950 
   912 /*! ZDICT_trainFromBuffer_unsafe() :
   951 /*! ZDICT_trainFromBuffer_unsafe_legacy() :
   913 *   Warning : `samplesBuffer` must be followed by noisy guard band.
   952 *   Warning : `samplesBuffer` must be followed by noisy guard band.
   914 *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
   953 *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
   915 */
   954 */
   916 size_t ZDICT_trainFromBuffer_unsafe(
   955 size_t ZDICT_trainFromBuffer_unsafe_legacy(
   917                             void* dictBuffer, size_t maxDictSize,
   956                             void* dictBuffer, size_t maxDictSize,
   918                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
   957                             const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
   919                             ZDICT_params_t params)
   958                             ZDICT_legacy_params_t params)
   920 {
   959 {
   921     U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
   960     U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
   922     dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
   961     dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
   923     unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
   962     unsigned const selectivity = params.selectivityLevel == 0 ? g_selectivity_default : params.selectivityLevel;
   924     unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
   963     unsigned const minRep = (selectivity > 30) ? MINRATIO : nbSamples >> selectivity;
   925     size_t const targetDictSize = maxDictSize;
   964     size_t const targetDictSize = maxDictSize;
   926     size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
   965     size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
   927     size_t dictSize = 0;
   966     size_t dictSize = 0;
   928     U32 const notificationLevel = params.notificationLevel;
   967     U32 const notificationLevel = params.zParams.notificationLevel;
   929 
   968 
   930     /* checks */
   969     /* checks */
   931     if (!dictList) return ERROR(memory_allocation);
   970     if (!dictList) return ERROR(memory_allocation);
   932     if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
   971     if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
   933     if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; }   /* not enough source to create dictionary */
   972     if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
   934 
   973 
   935     /* init */
   974     /* init */
   936     ZDICT_initDictItem(dictList);
   975     ZDICT_initDictItem(dictList);
   937 
   976 
   938     /* build dictionary */
   977     /* build dictionary */
   939     ZDICT_trainBuffer(dictList, dictListSize,
   978     ZDICT_trainBuffer_legacy(dictList, dictListSize,
   940                     samplesBuffer, samplesBuffSize,
   979                        samplesBuffer, samplesBuffSize,
   941                     samplesSizes, nbSamples,
   980                        samplesSizes, nbSamples,
   942                     minRep, notificationLevel);
   981                        minRep, notificationLevel);
   943 
   982 
   944     /* display best matches */
   983     /* display best matches */
   945     if (params.notificationLevel>= 3) {
   984     if (params.zParams.notificationLevel>= 3) {
   946         U32 const nb = MIN(25, dictList[0].pos);
   985         U32 const nb = MIN(25, dictList[0].pos);
   947         U32 const dictContentSize = ZDICT_dictSize(dictList);
   986         U32 const dictContentSize = ZDICT_dictSize(dictList);
   948         U32 u;
   987         U32 u;
   949         DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
   988         DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos-1, dictContentSize);
   950         DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
   989         DISPLAYLEVEL(3, "list %u best segments \n", nb-1);
   961     }   }
  1000     }   }
   962 
  1001 
   963 
  1002 
   964     /* create dictionary */
  1003     /* create dictionary */
   965     {   U32 dictContentSize = ZDICT_dictSize(dictList);
  1004     {   U32 dictContentSize = ZDICT_dictSize(dictList);
   966         if (dictContentSize < targetDictSize/3) {
  1005         if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
       
  1006         if (dictContentSize < targetDictSize/4) {
   967             DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
  1007             DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
       
  1008             if (samplesBuffSize < 10 * targetDictSize)
       
  1009                 DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
   968             if (minRep > MINRATIO) {
  1010             if (minRep > MINRATIO) {
   969                 DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
  1011                 DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
   970                 DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
  1012                 DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
   971             }
  1013             }
   972             if (samplesBuffSize < 10 * targetDictSize)
       
   973                 DISPLAYLEVEL(2, "!  consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>20));
       
   974         }
  1014         }
   975 
  1015 
   976         if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
  1016         if ((dictContentSize > targetDictSize*3) && (nbSamples > 2*MINRATIO) && (selectivity>1)) {
   977             U32 proposedSelectivity = selectivity-1;
  1017             U32 proposedSelectivity = selectivity-1;
   978             while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
  1018             while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
   979             DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
  1019             DISPLAYLEVEL(2, "!  note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
   980             DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
  1020             DISPLAYLEVEL(2, "!  consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
   981             DISPLAYLEVEL(2, "!  always test dictionary efficiency on samples \n");
  1021             DISPLAYLEVEL(2, "!  always test dictionary efficiency on real samples \n");
   982         }
  1022         }
   983 
  1023 
   984         /* limit dictionary size */
  1024         /* limit dictionary size */
   985         {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
  1025         {   U32 const max = dictList->pos;   /* convention : nb of useful elts within dictList */
   986             U32 currentSize = 0;
  1026             U32 currentSize = 0;
  1002                 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
  1042                 memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
  1003         }   }
  1043         }   }
  1004 
  1044 
  1005         dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
  1045         dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
  1006                                                              samplesBuffer, samplesSizes, nbSamples,
  1046                                                              samplesBuffer, samplesSizes, nbSamples,
  1007                                                              params);
  1047                                                              params.zParams);
  1008     }
  1048     }
  1009 
  1049 
  1010     /* clean up */
  1050     /* clean up */
  1011     free(dictList);
  1051     free(dictList);
  1012     return dictSize;
  1052     return dictSize;
  1013 }
  1053 }
  1014 
  1054 
  1015 
  1055 
  1016 /* issue : samplesBuffer need to be followed by a noisy guard band.
  1056 /* ZDICT_trainFromBuffer_legacy() :
  1017 *  work around : duplicate the buffer, and add the noise */
  1057  * issue : samplesBuffer need to be followed by a noisy guard band.
  1018 size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
  1058  * work around : duplicate the buffer, and add the noise */
  1019                                       const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
  1059 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
  1020                                       ZDICT_params_t params)
  1060                               const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
       
  1061                               ZDICT_legacy_params_t params)
  1021 {
  1062 {
  1022     size_t result;
  1063     size_t result;
  1023     void* newBuff;
  1064     void* newBuff;
  1024     size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
  1065     size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
  1025     if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
  1066     if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return 0;   /* not enough content => no dictionary */
  1028     if (!newBuff) return ERROR(memory_allocation);
  1069     if (!newBuff) return ERROR(memory_allocation);
  1029 
  1070 
  1030     memcpy(newBuff, samplesBuffer, sBuffSize);
  1071     memcpy(newBuff, samplesBuffer, sBuffSize);
  1031     ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
  1072     ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
  1032 
  1073 
  1033     result = ZDICT_trainFromBuffer_unsafe(
  1074     result =
  1034                                         dictBuffer, dictBufferCapacity,
  1075         ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
  1035                                         newBuff, samplesSizes, nbSamples,
  1076                                             samplesSizes, nbSamples, params);
  1036                                         params);
       
  1037     free(newBuff);
  1077     free(newBuff);
  1038     return result;
  1078     return result;
  1039 }
  1079 }
  1040 
  1080 
  1041 
  1081 
  1042 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
  1082 size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
  1043                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
  1083                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
  1044 {
  1084 {
  1045     ZDICT_params_t params;
  1085     ZDICT_cover_params_t params;
       
  1086     DEBUGLOG(3, "ZDICT_trainFromBuffer");
  1046     memset(&params, 0, sizeof(params));
  1087     memset(&params, 0, sizeof(params));
  1047     return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
  1088     params.d = 8;
  1048                                           samplesBuffer, samplesSizes, nbSamples,
  1089     params.steps = 4;
  1049                                           params);
  1090     /* Default to level 6 since no compression level information is available */
       
  1091     params.zParams.compressionLevel = 6;
       
  1092 #if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
       
  1093     params.zParams.notificationLevel = ZSTD_DEBUG;
       
  1094 #endif
       
  1095     return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
       
  1096                                                samplesBuffer, samplesSizes, nbSamples,
       
  1097                                                &params);
  1050 }
  1098 }
  1051 
  1099 
  1052 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
  1100 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
  1053                                         const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
  1101                                   const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
  1054 {
  1102 {
  1055     ZDICT_params_t params;
  1103     ZDICT_params_t params;
  1056     memset(&params, 0, sizeof(params));
  1104     memset(&params, 0, sizeof(params));
  1057     return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
  1105     return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
  1058                                                      samplesBuffer, samplesSizes, nbSamples,
  1106                                                      samplesBuffer, samplesSizes, nbSamples,