// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"
#include "zstd_lazy.h"
#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */

#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
 || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)

#define kLazySkippingStep 8


/*-*************************************
*  Binary Tree search
***************************************/

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_updateDUBT(ZSTD_MatchState_t* ms,
                const BYTE* ip, const BYTE* iend,
                U32 mls)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hashLog = cParams->hashLog;

    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;

    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    if (idx != target)
        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
                    idx, target, ms->window.dictLimit);
    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
    (void)iend;

    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
    for ( ; idx < target ; idx++) {
        size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
        U32    const matchIndex = hashTable[h];

        U32*   const nextCandidatePtr = bt + 2*(idx&btMask);
        U32*   const sortMarkPtr = nextCandidatePtr + 1;

        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
        hashTable[h] = idx;   /* Update Hash Table */
        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
    }
    ms->nextToUpdate = target;
}

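/* Layout sketch of the "dual use" binary tree filled above (illustrative note,
 * not upstream documentation): each position idx owns two U32 slots,
 *
 *     bt[2*(idx & btMask)]     = previous position with the same hash  (chain link)
 *     bt[2*(idx & btMask) + 1] = ZSTD_DUBT_UNSORTED_MARK               (not yet a tree node)
 *
 * ZSTD_updateDUBT() only records candidates in this cheap chain form; the pair
 * is re-interpreted as the (smaller, larger) children of a tree node when
 * ZSTD_insertDUBT1() below sorts the position into the binary tree.
 */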

/* ZSTD_insertDUBT1() :
 *  sort one already inserted but unsorted position
 *  assumption : curr >= btlow == (curr - btmask)
 *  doesn't fail */
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms,
                 U32 curr, const BYTE* inputEnd,
                 U32 nbCompares, U32 btLow,
                 const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* match;
    U32* smallerPtr = bt + 2*(curr&btMask);
    U32* largerPtr  = smallerPtr + 1;
    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
    U32 dummy32;   /* to be nullified at the end */
    U32 const windowValid = ms->window.lowLimit;
    U32 const maxDistance = 1U << cParams->windowLog;
    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
                curr, dictLimit, windowLow);
    assert(curr >= btLow);
    assert(ip < iend);   /* condition for ZSTD_count */

    for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < curr);
        /* note : all candidates are now supposed sorted,
         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

        if ( (dictMode != ZSTD_extDict)
          || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
          || (curr < dictLimit) /* both in extDict */) {
            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
                                     || (matchIndex+matchLength >= dictLimit)) ?
                                        base : dictBase;
            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
                 || (curr < dictLimit) );
            match = mBase + matchIndex;
            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
        } else {
            match = dictBase + matchIndex;
            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
            if (matchIndex+matchLength >= dictLimit)
                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
        }

        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
                    curr, matchIndex, (U32)matchLength);

        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
            break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
        }

        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
            /* match is smaller than current */
            *smallerPtr = matchIndex;             /* update smaller idx */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
                        matchIndex, btLow, nextPtr[1]);
            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
        } else {
            /* match is larger than current */
            *largerPtr = matchIndex;
            commonLengthLarger = matchLength;
            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
                        matchIndex, btLow, nextPtr[0]);
            largerPtr = nextPtr;
            matchIndex = nextPtr[0];
    }   }

    *smallerPtr = *largerPtr = 0;
}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBetterDictMatch (
        const ZSTD_MatchState_t* ms,
        const BYTE* const ip, const BYTE* const iend,
        size_t* offsetPtr,
        size_t bestLength,
        U32 nbCompares,
        U32 const mls,
        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_MatchState_t * const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
    const U32 * const dictHashTable = dms->hashTable;
    U32         const hashLog = dmsCParams->hashLog;
    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32               dictMatchIndex = dictHashTable[h];

    const BYTE* const base = ms->window.base;
    const BYTE* const prefixStart = base + ms->window.dictLimit;
    U32         const curr = (U32)(ip-base);
    const BYTE* const dictBase = dms->window.base;
    const BYTE* const dictEnd = dms->window.nextSrc;
    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
    U32         const dictLowLimit = dms->window.lowLimit;
    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;

    U32*        const dictBt = dms->chainTable;
    U32         const btLog  = dmsCParams->chainLog - 1;
    U32         const btMask = (1 << btLog) - 1;
    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;

    size_t commonLengthSmaller=0, commonLengthLarger=0;

    (void)dictMode;
    assert(dictMode == ZSTD_dictMatchState);

    for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        const BYTE* match = dictBase + dictMatchIndex;
        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
        if (dictMatchIndex+matchLength >= dictHighLimit)
            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */

        if (matchLength > bestLength) {
            U32 matchIndex = dictMatchIndex + dictIndexDelta;
            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            }
            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
            }
        }

        if (match[matchLength] < ip[matchLength]) {
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
            dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
        } else {
            /* match is larger than current */
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthLarger = matchLength;
            dictMatchIndex = nextPtr[0];
        }
    }

    if (bestLength >= MINMATCH) {
        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
    }
    return bestLength;

}


static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms,
                        const BYTE* const ip, const BYTE* const iend,
                        size_t* offBasePtr,
                        U32 const mls,
                        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32*   const hashTable = ms->hashTable;
    U32    const hashLog = cParams->hashLog;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32          matchIndex  = hashTable[h];

    const BYTE* const base = ms->window.base;
    U32    const curr = (U32)(ip-base);
    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

    U32*   const bt = ms->chainTable;
    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
    U32    const unsortLimit = MAX(btLow, windowLow);

    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
    U32          nbCompares = 1U << cParams->searchLog;
    U32          nbCandidates = nbCompares;
    U32          previousCandidate = 0;

    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
    assert(ip <= iend-8);   /* required for h calculation */
    assert(dictMode != ZSTD_dedicatedDictSearch);

    /* reach end of unsorted candidates list */
    while ( (matchIndex > unsortLimit)
         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
         && (nbCandidates > 1) ) {
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
                    matchIndex);
        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
        previousCandidate = matchIndex;
        matchIndex = *nextCandidate;
        nextCandidate = bt + 2*(matchIndex&btMask);
        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
        nbCandidates --;
    }

    /* nullify last candidate if it's still unsorted
     * simplification, detrimental to compression ratio, beneficial for speed */
    if ( (matchIndex > unsortLimit)
      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
                    matchIndex);
        *nextCandidate = *unsortedMark = 0;
    }

    /* batch sort stacked candidates */
    matchIndex = previousCandidate;
    while (matchIndex) {  /* will end on matchIndex == 0 */
        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
        ZSTD_insertDUBT1(ms, matchIndex, iend,
                         nbCandidates, unsortLimit, dictMode);
        matchIndex = nextCandidateIdx;
        nbCandidates++;
    }

    /* find longest match */
    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
        const BYTE* const dictBase = ms->window.dictBase;
        const U32 dictLimit = ms->window.dictLimit;
        const BYTE* const dictEnd = dictBase + dictLimit;
        const BYTE* const prefixStart = base + dictLimit;
        U32* smallerPtr = bt + 2*(curr&btMask);
        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
        U32 matchEndIdx = curr + 8 + 1;
        U32 dummy32;   /* to be nullified at the end */
        size_t bestLength = 0;

        matchIndex  = hashTable[h];
        hashTable[h] = curr;   /* Update Hash Table */

        for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
            U32* const nextPtr = bt + 2*(matchIndex & btMask);
            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
            const BYTE* match;

            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
                match = base + matchIndex;
                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
            } else {
                match = dictBase + matchIndex;
                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
                if (matchIndex+matchLength >= dictLimit)
                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
            }

            if (matchLength > bestLength) {
                if (matchLength > matchEndIdx - matchIndex)
                    matchEndIdx = matchIndex + (U32)matchLength;
                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                    if (dictMode == ZSTD_dictMatchState) {
                        nbCompares = 0; /* in addition to avoiding checking any
                                         * further in this loop, make sure we
                                         * skip checking in the dictionary. */
                    }
                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
                }
            }

            if (match[matchLength] < ip[matchLength]) {
                /* match is smaller than current */
                *smallerPtr = matchIndex;             /* update smaller idx */
                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
            } else {
                /* match is larger than current */
                *largerPtr = matchIndex;
                commonLengthLarger = matchLength;
                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                largerPtr = nextPtr;
                matchIndex = nextPtr[0];
        }   }

        *smallerPtr = *largerPtr = 0;

        assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
        if (dictMode == ZSTD_dictMatchState && nbCompares) {
            bestLength = ZSTD_DUBT_findBetterDictMatch(
                    ms, ip, iend,
                    offBasePtr, bestLength, nbCompares,
                    mls, dictMode);
        }

        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
        if (bestLength >= MINMATCH) {
            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
        }
        return bestLength;
    }
}

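/* Candidate handling sketch (illustrative note): starting from hashTable[h],
 * the function above first walks the chain of still-unsorted candidates,
 * reversing it on the fly by storing the previously visited index into each
 * candidate's sort-mark slot:
 *
 *     hashTable[h] -> c1 -> c2 -> c3        (all marked ZSTD_DUBT_UNSORTED_MARK)
 *     after the walk: c1.mark = 0, c2.mark = c1, c3.mark = c2
 *
 * The reversed links are then replayed starting from the oldest visited
 * candidate (c3), so ZSTD_insertDUBT1() turns older positions into proper tree
 * nodes before the younger positions that chain through them are sorted.
 */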

/* ZSTD_BtFindBestMatch() : Tree updater, providing best match */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms,
                const BYTE* const ip, const BYTE* const iLimit,
                      size_t* offBasePtr,
                const U32 mls /* template */,
                const ZSTD_dictMode_e dictMode)
{
    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
    ZSTD_updateDUBT(ms, ip, iLimit, mls);
    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
}

/* *********************************
*  Dedicated dict search
***********************************/

void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip)
{
    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32* const hashTable = ms->hashTable;
    U32* const chainTable = ms->chainTable;
    U32 const chainSize = 1 << ms->cParams.chainLog;
    U32 idx = ms->nextToUpdate;
    U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32 const cacheSize = bucketSize - 1;
    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;

    /* We know the hashtable is oversized by a factor of `bucketSize`.
     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
     * single entry. We will use the rest of the space to construct a temporary
     * chaintable.
     */
    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32* const tmpHashTable = hashTable;
    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
    U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
    U32 hashIdx;

    assert(ms->cParams.chainLog <= 24);
    assert(ms->cParams.hashLog > ms->cParams.chainLog);
    assert(idx != 0);
    assert(tmpMinChain <= minChain);

    /* fill conventional hash table and conventional chain table */
    for ( ; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
        if (idx >= tmpMinChain) {
            tmpChainTable[idx - tmpMinChain] = hashTable[h];
        }
        tmpHashTable[h] = idx;
    }

    /* sort chains into ddss chain table */
    {
        U32 chainPos = 0;
        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
            U32 count;
            U32 countBeyondMinChain = 0;
            U32 i = tmpHashTable[hashIdx];
            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
                /* skip through the chain to the first position that won't be
                 * in the hash cache bucket */
                if (i < minChain) {
                    countBeyondMinChain++;
                }
                i = tmpChainTable[i - tmpMinChain];
            }
            if (count == cacheSize) {
                for (count = 0; count < chainLimit;) {
                    if (i < minChain) {
                        if (!i || ++countBeyondMinChain > cacheSize) {
                            /* only allow pulling `cacheSize` number of entries
                             * into the cache or chainTable beyond `minChain`,
                             * to replace the entries pulled out of the
                             * chainTable into the cache. This lets us reach
                             * back further without increasing the total number
                             * of entries in the chainTable, guaranteeing the
                             * DDSS chain table will fit into the space
                             * allocated for the regular one. */
                            break;
                        }
                    }
                    chainTable[chainPos++] = i;
                    count++;
                    if (i < tmpMinChain) {
                        break;
                    }
                    i = tmpChainTable[i - tmpMinChain];
                }
            } else {
                count = 0;
            }
            if (count) {
                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
            } else {
                tmpHashTable[hashIdx] = 0;
            }
        }
        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
    }

    /* move chain pointers into the last entry of each hash bucket */
    for (hashIdx = (1 << hashLog); hashIdx; ) {
        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 const chainPackedPointer = tmpHashTable[hashIdx];
        U32 i;
        for (i = 0; i < cacheSize; i++) {
            hashTable[bucketIdx + i] = 0;
        }
        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
    }

    /* fill the buckets of the hash table */
    for (idx = ms->nextToUpdate; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
                    << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 i;
        /* Shift hash cache down 1. */
        for (i = cacheSize - 1; i; i--)
            hashTable[h + i] = hashTable[h + i - 1];
        hashTable[h] = idx;
    }

    ms->nextToUpdate = target;
}

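/* Worked example of the packed bucket entry written above (illustrative note,
 * numbers chosen arbitrarily, assuming ZSTD_LAZY_DDSS_BUCKET_LOG == 2): each
 * hash bucket then holds bucketSize == 4 slots, i.e. cacheSize == 3 recent
 * positions plus one packed chain pointer in the last slot. If a bucket's
 * overflow chain was written as count == 5 entries ending at chainPos == 42,
 * the packed value is ((42 - 5) << 8) + 5 == 0x2505; a reader recovers it as
 * chainIndex = packed >> 8 (== 37) and chainLength = packed & 0xFF (== 5),
 * which is exactly how ZSTD_dedicatedDictSearch_lazy_search() below decodes it.
 */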
/* Returns the longest match length found in the dedicated dict search structure.
 * If none are longer than the argument ml, then ml will be returned.
 */
FORCE_INLINE_TEMPLATE
size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
                                            const ZSTD_MatchState_t* const dms,
                                            const BYTE* const ip, const BYTE* const iLimit,
                                            const BYTE* const prefixStart, const U32 curr,
                                            const U32 dictLimit, const size_t ddsIdx) {
    const U32 ddsLowestIndex  = dms->window.dictLimit;
    const BYTE* const ddsBase = dms->window.base;
    const BYTE* const ddsEnd  = dms->window.nextSrc;
    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
    const U32 ddsIndexDelta   = dictLimit - ddsSize;
    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
    U32 ddsAttempt;
    U32 matchIndex;

    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 const chainIndex = chainPackedPointer >> 8;

        PREFETCH_L1(&dms->chainTable[chainIndex]);
    }

    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
        size_t currentMl=0;
        const BYTE* match;
        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
        match = ddsBase + matchIndex;

        if (!matchIndex) {
            return ml;
        }

        /* guaranteed by table construction */
        (void)ddsLowestIndex;
        assert(matchIndex >= ddsLowestIndex);
        assert(match+4 <= ddsEnd);
        if (MEM_read32(match) == MEM_read32(ip)) {
            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
            if (ip+currentMl == iLimit) {
                /* best possible, avoids read overflow on next attempt */
                return ml;
            }
        }
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 chainIndex = chainPackedPointer >> 8;
        U32 const chainLength = chainPackedPointer & 0xFF;
        U32 const chainAttempts = nbAttempts - ddsAttempt;
        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
        U32 chainAttempt;

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
        }

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
            size_t currentMl=0;
            const BYTE* match;
            matchIndex = dms->chainTable[chainIndex];
            match = ddsBase + matchIndex;

            /* guaranteed by table construction */
            assert(matchIndex >= ddsLowestIndex);
            assert(match+4 <= ddsEnd);
            if (MEM_read32(match) == MEM_read32(ip)) {
                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
            }

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }
    return ml;
}


/* *********************************
*  Hash Chain
***********************************/
#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]

/* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_insertAndFindFirstIndex_internal(
                        ZSTD_MatchState_t* ms,
                        const ZSTD_compressionParameters* const cParams,
                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
{
    U32* const hashTable  = ms->hashTable;
    const U32 hashLog = cParams->hashLog;
    U32* const chainTable = ms->chainTable;
    const U32 chainMask = (1 << cParams->chainLog) - 1;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    while(idx < target) { /* catch up */
        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
        hashTable[h] = idx;
        idx++;
        /* Stop inserting every position when in the lazy skipping mode. */
        if (lazySkipping)
            break;
    }

    ms->nextToUpdate = target;
    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
}

U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) {
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
}

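/* Chain walk sketch (illustrative note): the chain table keeps, for every
 * indexed position, the previous position that hashed to the same bucket, so a
 * search is essentially
 *
 *     matchIndex = hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
 *     for ( ; matchIndex >= lowLimit && nbAttempts > 0; nbAttempts--) {
 *         ...compare ip against the candidate at matchIndex...
 *         matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
 *     }
 *
 * which is the loop structure ZSTD_HcFindBestMatch() below implements, plus the
 * extra handling for extDict, dictMatchState and dedicatedDictSearch candidates.
 */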
/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_HcFindBestMatch(
                        ZSTD_MatchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const chainTable = ms->chainTable;
    const U32 chainSize = (1 << cParams->chainLog);
    const U32 chainMask = chainSize-1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 minChain = curr > chainSize ? curr - chainSize : 0;
    U32 nbAttempts = 1U << cParams->searchLog;
    size_t ml=4-1;

    const ZSTD_MatchState_t* const dms = ms->dictMatchState;
    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;

    U32 matchIndex;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32* entry = &dms->hashTable[ddsIdx];
        PREFETCH_L1(entry);
    }

    /* HC4 match finder */
    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
        size_t currentMl=0;
        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
            const BYTE* const match = base + matchIndex;
            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                currentMl = ZSTD_count(ip, match, iLimit);
        } else {
            const BYTE* const match = dictBase + matchIndex;
            assert(match+4 <= dictEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
        }

        if (matchIndex <= minChain) break;
        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
    }

    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        const U32* const dmsChainTable = dms->chainTable;
        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
        const U32 dmsChainMask         = dmsChainSize - 1;
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;
        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;

        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
            size_t currentMl=0;
            const BYTE* const match = dmsBase + matchIndex;
            assert(match+4 <= dmsEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                assert(curr > matchIndex + dmsIndexDelta);
                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }

            if (matchIndex <= dmsMinChain) break;

            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
        }
    }

    return ml;
}

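/* Note on the `match + ml - 3` comparison above (illustrative note): a candidate
 * is only interesting if it beats the current best length `ml`, i.e. matches on
 * at least ml+1 bytes. Comparing the 4 bytes that end at offset ml (bytes
 * [ml-3, ml]) is a cheap necessary condition for that: if those 4 bytes differ,
 * the candidate cannot reach length ml+1 and the full ZSTD_count() is skipped.
 * Since ml starts at 4-1 == 3, the window [ml-3, ml] always lies within the
 * ml+1 bytes that would have to match.
 */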
/* *********************************
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
#define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */

#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)

typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 representing a mask of matches */

/* ZSTD_VecMask_next():
 * Starting from the LSB, returns the idx of the next non-zero bit.
 * Basically counting the nb of trailing zeroes.
 */
MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
    return ZSTD_countTrailingZeros64(val);
}

/* ZSTD_row_nextIndex():
 * Returns the next index to insert at within a tagTable row, and updates the "head"
 * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
 */
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
    U32 next = (*tagRow-1) & rowMask;
    next += (next == 0) ? rowMask : 0; /* skip first position */
    *tagRow = (BYTE)next;
    return next;
}

/* ZSTD_isAligned():
 * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
 */
MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
    assert((align & (align - 1)) == 0);
    return (((size_t)ptr) & (align - 1)) == 0;
}

/* ZSTD_row_prefetch():
 * Performs prefetching for the hashTable and tagTable at a given row.
 */
FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
    PREFETCH_L1(hashTable + relRow);
    if (rowLog >= 5) {
        PREFETCH_L1(hashTable + relRow + 16);
        /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
    }
    PREFETCH_L1(tagTable + relRow);
    if (rowLog == 6) {
        PREFETCH_L1(tagTable + relRow + 32);
    }
    assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
}

/* ZSTD_row_fillHashCache():
 * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
 * but not beyond iLimit.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base,
                            U32 const rowLog, U32 const mls,
                            U32 idx, const BYTE* const iLimit)
{
    U32 const* const hashTable = ms->hashTable;
    BYTE const* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

    for (; idx < lim; ++idx) {
        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
    }

    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
                                                     ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
                                                     ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
}

/* ZSTD_row_nextCachedHash():
 * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
 * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
                            BYTE const* tagTable, BYTE const* base,
                            U32 idx, U32 const hashLog,
                            U32 const rowLog, U32 const mls,
                            U64 const hashSalt)
{
    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
        return hash;
    }
}

/* ZSTD_row_update_internalImpl():
 * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms,
                                  U32 updateStartIdx, U32 const updateEndIdx,
                                  U32 const mls, U32 const rowLog,
                                  U32 const rowMask, U32 const useCache)
{
    U32* const hashTable = ms->hashTable;
    BYTE* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    const BYTE* const base = ms->window.base;

    DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
    for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = tagTable + relRow;
        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
        row[pos] = updateStartIdx;
    }
}

/* ZSTD_row_update_internal():
 * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
 * Skips sections of long matches as is necessary.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip,
                              U32 const mls, U32 const rowLog,
                              U32 const rowMask, U32 const useCache)
{
    U32 idx = ms->nextToUpdate;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    const U32 kSkipThreshold = 384;
    const U32 kMaxMatchStartPositionsToUpdate = 96;
    const U32 kMaxMatchEndPositionsToUpdate = 32;

    if (useCache) {
        /* Only skip positions when using hash cache, i.e.
         * if we are loading a dict, don't skip anything.
         * If we decide to skip, then we only update a set number
         * of positions at the beginning and end of the match.
         */
        if (UNLIKELY(target - idx > kSkipThreshold)) {
            U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
            ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
            idx = target - kMaxMatchEndPositionsToUpdate;
            ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
        }
    }
    assert(target >= idx);
    ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
    ms->nextToUpdate = target;
}

/* ZSTD_row_update():
 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
 * processing.
 */
void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) {
    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
    const U32 rowMask = (1u << rowLog) - 1;
    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
}

/* Returns the width, in bits, of the group of bits that gets set to 1 for each
 * matching entry. Since not all architectures have an easy movemask instruction,
 * this helps to iterate over groups of bits more easily and faster.
 */
FORCE_INLINE_TEMPLATE U32
ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
{
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
    (void)rowEntries;
#if defined(ZSTD_ARCH_ARM_NEON)
    /* NEON path only works for little endian */
    if (!MEM_isLittleEndian()) {
        return 1;
    }
    if (rowEntries == 16) {
        return 4;
    }
    if (rowEntries == 32) {
        return 2;
    }
    if (rowEntries == 64) {
        return 1;
    }
#endif
    return 1;
}

#if defined(ZSTD_ARCH_X86_SSE2)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
{
    const __m128i comparisonMask = _mm_set1_epi8((char)tag);
    int matches[4] = {0};
    int i;
    assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
    for (i=0; i<nbChunks; i++) {
        const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
        const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
        matches[i] = _mm_movemask_epi8(equalMask);
    }
    if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
    if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
    assert(nbChunks == 4);
    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
}
#endif

#if defined(ZSTD_ARCH_ARM_NEON)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
{
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    if (rowEntries == 16) {
        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
         * After that groups of 4 bits represent the equalMask. We lower
         * all bits except the highest in these groups by doing AND with
         * 0x88 = 0b10001000.
         */
        const uint8x16_t chunk = vld1q_u8(src);
        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
    } else if (rowEntries == 32) {
        /* Same idea as with rowEntries == 16 but doing AND with
         * 0x55 = 0b01010101.
         */
        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
        const uint8x16_t dup = vdupq_n_u8(tag);
        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
        const uint8x8_t res = vsli_n_u8(t0, t1, 4);
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
    } else { /* rowEntries == 64 */
        const uint8x16x4_t chunk = vld4q_u8(src);
        const uint8x16_t dup = vdupq_n_u8(tag);
        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);

        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
        return ZSTD_rotateRight_U64(matches, headGrouped);
    }
}
#endif

/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
 * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
 * matches the hash at the nth position in a row of the tagTable.
 * Each row is a circular buffer beginning at the value of "headGrouped". So we
 * must rotate the "matches" bitfield to match up with the actual layout of the
 * entries within the hashTable */
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
{
    const BYTE* const src = tagRow;
    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);

#if defined(ZSTD_ARCH_X86_SSE2)

    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);

#else /* SW or NEON-LE */

# if defined(ZSTD_ARCH_ARM_NEON)
  /* This NEON path only works for little endian - otherwise use SWAR below */
    if (MEM_isLittleEndian()) {
        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
    }
# endif /* ZSTD_ARCH_ARM_NEON */
    /* SWAR */
    {   const int chunkSize = sizeof(size_t);
        const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
        const size_t xFF = ~((size_t)0);
        const size_t x01 = xFF / 0xFF;
        const size_t x80 = x01 << 7;
        const size_t splatChar = tag * x01;
        ZSTD_VecMask matches = 0;
        int i = rowEntries - chunkSize;
        assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
        if (MEM_isLittleEndian()) { /* runtime check so have two loops */
            const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
            do {
                size_t chunk = MEM_readST(&src[i]);
                chunk ^= splatChar;
                chunk = (((chunk | x80) - x01) | chunk) & x80;
                matches <<= chunkSize;
                matches |= (chunk * extractMagic) >> shiftAmount;
                i -= chunkSize;
            } while (i >= 0);
        } else { /* big endian: reverse bits during extraction */
            const size_t msb = xFF ^ (xFF >> 1);
            const size_t extractMagic = (msb / 0x1FF) | msb;
            do {
                size_t chunk = MEM_readST(&src[i]);
                chunk ^= splatChar;
                chunk = (((chunk | x80) - x01) | chunk) & x80;
                matches <<= chunkSize;
                matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
                i -= chunkSize;
            } while (i >= 0);
        }
        matches = ~matches;
        if (rowEntries == 16) {
            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
        } else if (rowEntries == 32) {
            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
        } else {
            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
        }
    }
#endif
}

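/* Worked example of the SWAR path above (illustrative note, 64-bit size_t):
 * after `chunk ^= splatChar`, row bytes equal to the tag become 0x00.
 * `(((chunk | x80) - x01) | chunk) & x80` then yields 0x80 for every non-zero
 * byte and 0x00 for every zero byte (OR-ing with 0x80 before subtracting 0x01
 * keeps each byte's arithmetic from borrowing into its neighbour). The multiply
 * by extractMagic packs those per-byte flags into the top byte of the word, and
 * `>> shiftAmount` drops them down to one bit per byte. Because the flags mark
 * the non-matching bytes, the final `matches = ~matches` inverts the bitfield
 * so that set bits correspond to tag matches.
 */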
/* The high-level approach of the SIMD row based match finder is as follows:
 * - Figure out where to insert the new entry:
 *      - Generate a hash for current input position and split it into one byte of tag and `rowHashLog` bits of index.
 *           - The hash is salted by a value that changes on every context reset, so when the same table is used
 *             we will avoid collisions that would otherwise slow us down by introducing phantom matches.
 *      - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
 *        which row to insert into.
 *      - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
 *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
 *        per row).
 * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
 *   generate a bitfield that we can cycle through to check the collisions in the hash table.
 * - Pick the longest match.
 * - Insert the tag into the equivalent row and position in the tagTable.
 */
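/* Numeric sketch of the hash split described above (illustrative note): the low
 * ZSTD_ROW_HASH_TAG_BITS bits (one byte) of the salted hash become the tag
 * stored in the tagTable, and the remaining bits select the row:
 *
 *     tag    = hash & ZSTD_ROW_HASH_TAG_MASK;               // e.g. 0x42
 *     relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;  // e.g. row 0xABC -> entry offset 0xABC << rowLog
 *
 * ZSTD_RowFindBestMatch() computes exactly this pair, compares the tag against
 * the row's tagTable bytes via ZSTD_row_getMatchMask(), and only reads
 * hashTable[relRow + matchPos] for the candidates whose tag matched.
 */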
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_RowFindBestMatch(
                        ZSTD_MatchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode,
                        const U32 rowLog)
{
    U32* const hashTable = ms->hashTable;
    BYTE* const tagTable = ms->tagTable;
    U32* const hashCache = ms->hashCache;
    const U32 hashLog = ms->rowHashLog;
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 rowEntries = (1U << rowLog);
    const U32 rowMask = rowEntries - 1;
    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
    const U64 hashSalt = ms->hashSalt;
    U32 nbAttempts = 1U << cappedSearchLog;
    size_t ml=4-1;
    U32 hash;

    /* DMS/DDS variables that may be referenced later */
    const ZSTD_MatchState_t* const dms = ms->dictMatchState;

    /* Initialize the following variables to satisfy static analyzer */
    size_t ddsIdx = 0;
    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
    U32 dmsTag = 0;
    U32* dmsRow = NULL;
    BYTE* dmsTagRow = NULL;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
        {   /* Prefetch DDS hashtable entry */
            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
            PREFETCH_L1(&dms->hashTable[ddsIdx]);
        }
        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
    }

    if (dictMode == ZSTD_dictMatchState) {
        /* Prefetch DMS rows */
        U32* const dmsHashTable = dms->hashTable;
        BYTE* const dmsTagTable = dms->tagTable;
        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
        dmsRow = dmsHashTable + dmsRelRow;
        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
    }

    /* Update the hashTable and tagTable up to (but not including) ip */
    if (!ms->lazySkipping) {
        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
    } else {
        /* Stop inserting every position when in the lazy skipping mode.
         * The hash cache is also not kept up to date in this mode.
         */
        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
        ms->nextToUpdate = curr;
    }
    ms->hashSaltEntropy += hash; /* collect salt entropy */

    {   /* Get the hash for ip, compute the appropriate row */
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = (BYTE*)(tagTable + relRow);
        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
        U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
        size_t numMatches = 0;
        size_t currMatch = 0;
        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);

        /* Cycle through the matches and prefetch */
        for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
            U32 const matchIndex = row[matchPos];
            if(matchPos == 0) continue;
            assert(numMatches < rowEntries);
            if (matchIndex < lowLimit)
                break;
            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                PREFETCH_L1(base + matchIndex);
            } else {
                PREFETCH_L1(dictBase + matchIndex);
            }
            matchBuffer[numMatches++] = matchIndex;
            --nbAttempts;
        }

        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
           in ZSTD_row_update_internal() at the next search. */
        {
            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
            tagRow[pos] = (BYTE)tag;
            row[pos] = ms->nextToUpdate++;
        }

        /* Return the longest match */
        for (; currMatch < numMatches; ++currMatch) {
            U32 const matchIndex = matchBuffer[currMatch];
            size_t currentMl=0;
            assert(matchIndex < curr);
            assert(matchIndex >= lowLimit);

            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                const BYTE* const match = base + matchIndex;
                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                    currentMl = ZSTD_count(ip, match, iLimit);
            } else {
                const BYTE* const match = dictBase + matchIndex;
                assert(match+4 <= dictEnd);
                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
            }

            /* Save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }

    assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        /* TODO: Measure and potentially add prefetching to DMS */
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;

        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
            U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
            size_t numMatches = 0;
            size_t currMatch = 0;
            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);

            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                U32 const matchIndex = dmsRow[matchPos];
                if(matchPos == 0) continue;
                if (matchIndex < dmsLowestIndex)
                    break;
                PREFETCH_L1(dmsBase + matchIndex);
                matchBuffer[numMatches++] = matchIndex;
                --nbAttempts;
            }

            /* Return the longest match */
            for (; currMatch < numMatches; ++currMatch) {
                U32 const matchIndex = matchBuffer[currMatch];
                size_t currentMl=0;
                assert(matchIndex >= dmsLowestIndex);
                assert(matchIndex < curr);

                {   const BYTE* const match = dmsBase + matchIndex;
                    assert(match+4 <= dmsEnd);
                    if (MEM_read32(match) == MEM_read32(ip))
                        currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
                }

                if (currentMl > ml) {
                    ml = currentMl;
                    assert(curr > matchIndex + dmsIndexDelta);
                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                    if (ip+currentMl == iLimit) break;
                }
            }
        }
    }
    return ml;
}


1358 */ 1359 1360 #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls 1361 #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls 1362 #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog 1363 1364 #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE 1365 1366 #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ 1367 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ 1368 ZSTD_MatchState_t* ms, \ 1369 const BYTE* ip, const BYTE* const iLimit, \ 1370 size_t* offBasePtr) \ 1371 { \ 1372 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1373 return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ 1374 } \ 1375 1376 #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ 1377 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ 1378 ZSTD_MatchState_t* ms, \ 1379 const BYTE* ip, const BYTE* const iLimit, \ 1380 size_t* offsetPtr) \ 1381 { \ 1382 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1383 return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ 1384 } \ 1385 1386 #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1387 ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ 1388 ZSTD_MatchState_t* ms, \ 1389 const BYTE* ip, const BYTE* const iLimit, \ 1390 size_t* offsetPtr) \ 1391 { \ 1392 assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1393 assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ 1394 return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ 1395 } \ 1396 1397 #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ 1398 X(dictMode, mls, 4) \ 1399 X(dictMode, mls, 5) \ 1400 X(dictMode, mls, 6) 1401 1402 #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ 1403 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ 1404 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ 1405 ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) 1406 1407 #define ZSTD_FOR_EACH_MLS(X, dictMode) \ 1408 X(dictMode, 4) \ 1409 X(dictMode, 5) \ 1410 X(dictMode, 6) 1411 1412 #define ZSTD_FOR_EACH_DICT_MODE(X, ...) 
\ 1413 X(__VA_ARGS__, noDict) \ 1414 X(__VA_ARGS__, extDict) \ 1415 X(__VA_ARGS__, dictMatchState) \ 1416 X(__VA_ARGS__, dedicatedDictSearch) 1417 1418 /* Generate row search fns for each combination of (dictMode, mls, rowLog) */ 1419 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) 1420 /* Generate binary Tree search fns for each combination of (dictMode, mls) */ 1421 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) 1422 /* Generate hash chain search fns for each combination of (dictMode, mls) */ 1423 ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) 1424 1425 typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; 1426 1427 #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ 1428 case mls: \ 1429 return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1430 #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ 1431 case mls: \ 1432 return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1433 #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1434 case rowLog: \ 1435 return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); 1436 1437 #define ZSTD_SWITCH_MLS(X, dictMode) \ 1438 switch (mls) { \ 1439 ZSTD_FOR_EACH_MLS(X, dictMode) \ 1440 } 1441 1442 #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ 1443 case mls: \ 1444 switch (rowLog) { \ 1445 ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ 1446 } \ 1447 ZSTD_UNREACHABLE; \ 1448 break; 1449 1450 #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ 1451 switch (searchMethod) { \ 1452 case search_hashChain: \ 1453 ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ 1454 break; \ 1455 case search_binaryTree: \ 1456 ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ 1457 break; \ 1458 case search_rowHash: \ 1459 ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ 1460 break; \ 1461 } \ 1462 ZSTD_UNREACHABLE; 1463 1464 /* 1465 * Searches for the longest match at @p ip. 1466 * Dispatches to the correct implementation function based on the 1467 * (searchMethod, dictMode, mls, rowLog). We use switch statements 1468 * here instead of using an indirect function call through a function 1469 * pointer because after Spectre and Meltdown mitigations, indirect 1470 * function calls can be very costly, especially in the kernel. 1471 * 1472 * NOTE: dictMode and searchMethod should be templated, so those switch 1473 * statements should be optimized out. Only the mls & rowLog switches 1474 * should be left. 1475 * 1476 * @param ms The match state. 1477 * @param ip The position to search at. 1478 * @param iend The end of the input data. 1479 * @param[out] offsetPtr Stores the match offset into this pointer. 1480 * @param mls The minimum search length, in the range [4, 6]. 1481 * @param rowLog The row log (if applicable), in the range [4, 6]. 1482 * @param searchMethod The search method to use (templated). 1483 * @param dictMode The dictMode (templated). 1484 * 1485 * @returns The length of the longest match found, or < mls if no match is found. 1486 * If a match is found its offset is stored in @p offsetPtr. 
1487 */ 1488 FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( 1489 ZSTD_MatchState_t* ms, 1490 const BYTE* ip, 1491 const BYTE* iend, 1492 size_t* offsetPtr, 1493 U32 const mls, 1494 U32 const rowLog, 1495 searchMethod_e const searchMethod, 1496 ZSTD_dictMode_e const dictMode) 1497 { 1498 if (dictMode == ZSTD_noDict) { 1499 ZSTD_SWITCH_SEARCH_METHOD(noDict) 1500 } else if (dictMode == ZSTD_extDict) { 1501 ZSTD_SWITCH_SEARCH_METHOD(extDict) 1502 } else if (dictMode == ZSTD_dictMatchState) { 1503 ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) 1504 } else if (dictMode == ZSTD_dedicatedDictSearch) { 1505 ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) 1506 } 1507 ZSTD_UNREACHABLE; 1508 return 0; 1509 } 1510 1511 /* ******************************* 1512 * Common parser - lazy strategy 1513 *********************************/ 1514 1515 FORCE_INLINE_TEMPLATE 1516 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1517 size_t ZSTD_compressBlock_lazy_generic( 1518 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, 1519 U32 rep[ZSTD_REP_NUM], 1520 const void* src, size_t srcSize, 1521 const searchMethod_e searchMethod, const U32 depth, 1522 ZSTD_dictMode_e const dictMode) 1523 { 1524 const BYTE* const istart = (const BYTE*)src; 1525 const BYTE* ip = istart; 1526 const BYTE* anchor = istart; 1527 const BYTE* const iend = istart + srcSize; 1528 const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1529 const BYTE* const base = ms->window.base; 1530 const U32 prefixLowestIndex = ms->window.dictLimit; 1531 const BYTE* const prefixLowest = base + prefixLowestIndex; 1532 const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 1533 const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1534 1535 U32 offset_1 = rep[0], offset_2 = rep[1]; 1536 U32 offsetSaved1 = 0, offsetSaved2 = 0; 1537 1538 const int isDMS = dictMode == ZSTD_dictMatchState; 1539 const int isDDS = dictMode == ZSTD_dedicatedDictSearch; 1540 const int isDxS = isDMS || isDDS; 1541 const ZSTD_MatchState_t* const dms = ms->dictMatchState; 1542 const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; 1543 const BYTE* const dictBase = isDxS ? dms->window.base : NULL; 1544 const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; 1545 const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL; 1546 const U32 dictIndexDelta = isDxS ? 1547 prefixLowestIndex - (U32)(dictEnd - dictBase) : 1548 0; 1549 const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); 1550 1551 DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); 1552 ip += (dictAndPrefixLength == 0); 1553 if (dictMode == ZSTD_noDict) { 1554 U32 const curr = (U32)(ip - base); 1555 U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); 1556 U32 const maxRep = curr - windowLow; 1557 if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; 1558 if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; 1559 } 1560 if (isDxS) { 1561 /* dictMatchState repCode checks don't currently handle repCode == 0 1562 * disabling. 
*/ 1563 assert(offset_1 <= dictAndPrefixLength); 1564 assert(offset_2 <= dictAndPrefixLength); 1565 } 1566 1567 /* Reset the lazy skipping state */ 1568 ms->lazySkipping = 0; 1569 1570 if (searchMethod == search_rowHash) { 1571 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1572 } 1573 1574 /* Match Loop */ 1575 #if defined(__x86_64__) 1576 /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the 1577 * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 1578 */ 1579 __asm__(".p2align 5"); 1580 #endif 1581 while (ip < ilimit) { 1582 size_t matchLength=0; 1583 size_t offBase = REPCODE1_TO_OFFBASE; 1584 const BYTE* start=ip+1; 1585 DEBUGLOG(7, "search baseline (depth 0)"); 1586 1587 /* check repCode */ 1588 if (isDxS) { 1589 const U32 repIndex = (U32)(ip - base) + 1 - offset_1; 1590 const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) 1591 && repIndex < prefixLowestIndex) ? 1592 dictBase + (repIndex - dictIndexDelta) : 1593 base + repIndex; 1594 if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) 1595 && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 1596 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1597 matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1598 if (depth==0) goto _storeSequence; 1599 } 1600 } 1601 if ( dictMode == ZSTD_noDict 1602 && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { 1603 matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; 1604 if (depth==0) goto _storeSequence; 1605 } 1606 1607 /* first search (depth 0) */ 1608 { size_t offbaseFound = 999999999; 1609 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); 1610 if (ml2 > matchLength) 1611 matchLength = ml2, start = ip, offBase = offbaseFound; 1612 } 1613 1614 if (matchLength < 4) { 1615 size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; 1616 ip += step; 1617 /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 1618 * In this mode we stop inserting every position into our tables, and only insert 1619 * positions that we search, which is one in step positions. 1620 * The exact cutoff is flexible, I've just chosen a number that is reasonably high, 1621 * so we minimize the compression ratio loss in "normal" scenarios. This mode gets 1622 * triggered once we've gone 2KB without finding any matches. 1623 */ 1624 ms->lazySkipping = step > kLazySkippingStep; 1625 continue; 1626 } 1627 1628 /* let's try to find a better solution */ 1629 if (depth>=1) 1630 while (ip<ilimit) { 1631 DEBUGLOG(7, "search depth 1"); 1632 ip ++; 1633 if ( (dictMode == ZSTD_noDict) 1634 && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1635 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1636 int const gain2 = (int)(mlRep * 3); 1637 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 1638 if ((mlRep >= 4) && (gain2 > gain1)) 1639 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1640 } 1641 if (isDxS) { 1642 const U32 repIndex = (U32)(ip - base) - offset_1; 1643 const BYTE* repMatch = repIndex < prefixLowestIndex ? 
1644 dictBase + (repIndex - dictIndexDelta) : 1645 base + repIndex; 1646 if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) 1647 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1648 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1649 size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1650 int const gain2 = (int)(mlRep * 3); 1651 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 1652 if ((mlRep >= 4) && (gain2 > gain1)) 1653 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1654 } 1655 } 1656 { size_t ofbCandidate=999999999; 1657 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); 1658 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 1659 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); 1660 if ((ml2 >= 4) && (gain2 > gain1)) { 1661 matchLength = ml2, offBase = ofbCandidate, start = ip; 1662 continue; /* search a better one */ 1663 } } 1664 1665 /* let's find an even better one */ 1666 if ((depth==2) && (ip<ilimit)) { 1667 DEBUGLOG(7, "search depth 2"); 1668 ip ++; 1669 if ( (dictMode == ZSTD_noDict) 1670 && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1671 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1672 int const gain2 = (int)(mlRep * 4); 1673 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 1674 if ((mlRep >= 4) && (gain2 > gain1)) 1675 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1676 } 1677 if (isDxS) { 1678 const U32 repIndex = (U32)(ip - base) - offset_1; 1679 const BYTE* repMatch = repIndex < prefixLowestIndex ? 1680 dictBase + (repIndex - dictIndexDelta) : 1681 base + repIndex; 1682 if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) 1683 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1684 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1685 size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1686 int const gain2 = (int)(mlRep * 4); 1687 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 1688 if ((mlRep >= 4) && (gain2 > gain1)) 1689 matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; 1690 } 1691 } 1692 { size_t ofbCandidate=999999999; 1693 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); 1694 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 1695 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); 1696 if ((ml2 >= 4) && (gain2 > gain1)) { 1697 matchLength = ml2, offBase = ofbCandidate, start = ip; 1698 continue; 1699 } } } 1700 break; /* nothing found : store previous solution */ 1701 } 1702 1703 /* NOTE: 1704 * Pay attention that `start[-value]` can lead to strange undefined behavior 1705 * notably if `value` is unsigned, resulting in a large positive `-value`. 
1706 */ 1707 /* catch up */ 1708 if (OFFBASE_IS_OFFSET(offBase)) { 1709 if (dictMode == ZSTD_noDict) { 1710 while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) 1711 && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ 1712 { start--; matchLength++; } 1713 } 1714 if (isDxS) { 1715 U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); 1716 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; 1717 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; 1718 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 1719 } 1720 offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); 1721 } 1722 /* store sequence */ 1723 _storeSequence: 1724 { size_t const litLength = (size_t)(start - anchor); 1725 ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); 1726 anchor = ip = start + matchLength; 1727 } 1728 if (ms->lazySkipping) { 1729 /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ 1730 if (searchMethod == search_rowHash) { 1731 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1732 } 1733 ms->lazySkipping = 0; 1734 } 1735 1736 /* check immediate repcode */ 1737 if (isDxS) { 1738 while (ip <= ilimit) { 1739 U32 const current2 = (U32)(ip-base); 1740 U32 const repIndex = current2 - offset_2; 1741 const BYTE* repMatch = repIndex < prefixLowestIndex ? 1742 dictBase - dictIndexDelta + repIndex : 1743 base + repIndex; 1744 if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) 1745 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1746 const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; 1747 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; 1748 offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ 1749 ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 1750 ip += matchLength; 1751 anchor = ip; 1752 continue; 1753 } 1754 break; 1755 } 1756 } 1757 1758 if (dictMode == ZSTD_noDict) { 1759 while ( ((ip <= ilimit) & (offset_2>0)) 1760 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { 1761 /* store sequence */ 1762 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; 1763 offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ 1764 ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 1765 ip += matchLength; 1766 anchor = ip; 1767 continue; /* faster when present ... (?) */ 1768 } } } 1769 1770 /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), 1771 * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ 1772 offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; 1773 1774 /* save reps for next block */ 1775 rep[0] = offset_1 ? offset_1 : offsetSaved1; 1776 rep[1] = offset_2 ? 
offset_2 : offsetSaved2; 1777 1778 /* Return the last literals size */ 1779 return (size_t)(iend - anchor); 1780 } 1781 #endif /* build exclusions */ 1782 1783 1784 #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR 1785 size_t ZSTD_compressBlock_greedy( 1786 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1787 void const* src, size_t srcSize) 1788 { 1789 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); 1790 } 1791 1792 size_t ZSTD_compressBlock_greedy_dictMatchState( 1793 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1794 void const* src, size_t srcSize) 1795 { 1796 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); 1797 } 1798 1799 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( 1800 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1801 void const* src, size_t srcSize) 1802 { 1803 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); 1804 } 1805 1806 size_t ZSTD_compressBlock_greedy_row( 1807 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1808 void const* src, size_t srcSize) 1809 { 1810 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); 1811 } 1812 1813 size_t ZSTD_compressBlock_greedy_dictMatchState_row( 1814 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1815 void const* src, size_t srcSize) 1816 { 1817 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); 1818 } 1819 1820 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( 1821 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1822 void const* src, size_t srcSize) 1823 { 1824 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); 1825 } 1826 #endif 1827 1828 #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR 1829 size_t ZSTD_compressBlock_lazy( 1830 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1831 void const* src, size_t srcSize) 1832 { 1833 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); 1834 } 1835 1836 size_t ZSTD_compressBlock_lazy_dictMatchState( 1837 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1838 void const* src, size_t srcSize) 1839 { 1840 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); 1841 } 1842 1843 size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( 1844 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1845 void const* src, size_t srcSize) 1846 { 1847 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); 1848 } 1849 1850 size_t ZSTD_compressBlock_lazy_row( 1851 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1852 void const* src, size_t srcSize) 1853 { 1854 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); 1855 } 1856 1857 size_t ZSTD_compressBlock_lazy_dictMatchState_row( 1858 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1859 void const* src, size_t srcSize) 1860 { 1861 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); 1862 } 1863 
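/* Illustrative summary (comment only, not compiled): the exported entry points
 * in this section are thin wrappers that bind the template parameters of
 * ZSTD_compressBlock_lazy_generic() at compile time:
 *   - depth:        greedy -> 0, lazy -> 1, lazy2/btlazy2 -> 2
 *   - searchMethod: search_hashChain by default, "_row" -> search_rowHash,
 *                   btlazy2 -> search_binaryTree
 *   - dictMode:     noDict / dictMatchState / dedicatedDictSearch
 *                   (extDict has its own generic parser further below)
 * A hypothetical extra wrapper would follow the same one-line shape, e.g.:
 *
 *   size_t ZSTD_compressBlock_example_wrapper(           // hypothetical name
 *           ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
 *           void const* src, size_t srcSize)
 *   {
 *       return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize,
 *                                              search_rowHash, 1, ZSTD_noDict);
 *   }
 *
 * (That particular combination already exists above as
 *  ZSTD_compressBlock_lazy_row(); the sketch only shows the wrapper shape.)
 */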
1864 size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( 1865 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1866 void const* src, size_t srcSize) 1867 { 1868 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); 1869 } 1870 #endif 1871 1872 #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR 1873 size_t ZSTD_compressBlock_lazy2( 1874 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1875 void const* src, size_t srcSize) 1876 { 1877 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); 1878 } 1879 1880 size_t ZSTD_compressBlock_lazy2_dictMatchState( 1881 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1882 void const* src, size_t srcSize) 1883 { 1884 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); 1885 } 1886 1887 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( 1888 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1889 void const* src, size_t srcSize) 1890 { 1891 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); 1892 } 1893 1894 size_t ZSTD_compressBlock_lazy2_row( 1895 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1896 void const* src, size_t srcSize) 1897 { 1898 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); 1899 } 1900 1901 size_t ZSTD_compressBlock_lazy2_dictMatchState_row( 1902 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1903 void const* src, size_t srcSize) 1904 { 1905 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); 1906 } 1907 1908 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( 1909 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1910 void const* src, size_t srcSize) 1911 { 1912 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); 1913 } 1914 #endif 1915 1916 #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR 1917 size_t ZSTD_compressBlock_btlazy2( 1918 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1919 void const* src, size_t srcSize) 1920 { 1921 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); 1922 } 1923 1924 size_t ZSTD_compressBlock_btlazy2_dictMatchState( 1925 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1926 void const* src, size_t srcSize) 1927 { 1928 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); 1929 } 1930 #endif 1931 1932 #if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ 1933 || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ 1934 || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ 1935 || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) 1936 FORCE_INLINE_TEMPLATE 1937 ZSTD_ALLOW_POINTER_OVERFLOW_ATTR 1938 size_t ZSTD_compressBlock_lazy_extDict_generic( 1939 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, 1940 U32 rep[ZSTD_REP_NUM], 1941 const void* src, size_t srcSize, 1942 const searchMethod_e searchMethod, const U32 depth) 1943 { 1944 const BYTE* const istart = (const BYTE*)src; 1945 const BYTE* ip = istart; 1946 const BYTE* anchor = istart; 1947 const BYTE* const iend = istart + srcSize; 1948 const BYTE* const ilimit = 
searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1949 const BYTE* const base = ms->window.base; 1950 const U32 dictLimit = ms->window.dictLimit; 1951 const BYTE* const prefixStart = base + dictLimit; 1952 const BYTE* const dictBase = ms->window.dictBase; 1953 const BYTE* const dictEnd = dictBase + dictLimit; 1954 const BYTE* const dictStart = dictBase + ms->window.lowLimit; 1955 const U32 windowLog = ms->cParams.windowLog; 1956 const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 1957 const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1958 1959 U32 offset_1 = rep[0], offset_2 = rep[1]; 1960 1961 DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); 1962 1963 /* Reset the lazy skipping state */ 1964 ms->lazySkipping = 0; 1965 1966 /* init */ 1967 ip += (ip == prefixStart); 1968 if (searchMethod == search_rowHash) { 1969 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 1970 } 1971 1972 /* Match Loop */ 1973 #if defined(__x86_64__) 1974 /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the 1975 * code alignment is perturbed. To fix the instability align the loop on 32-bytes. 1976 */ 1977 __asm__(".p2align 5"); 1978 #endif 1979 while (ip < ilimit) { 1980 size_t matchLength=0; 1981 size_t offBase = REPCODE1_TO_OFFBASE; 1982 const BYTE* start=ip+1; 1983 U32 curr = (U32)(ip-base); 1984 1985 /* check repCode */ 1986 { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog); 1987 const U32 repIndex = (U32)(curr+1 - offset_1); 1988 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 1989 const BYTE* const repMatch = repBase + repIndex; 1990 if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) 1991 & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ 1992 if (MEM_read32(ip+1) == MEM_read32(repMatch)) { 1993 /* repcode detected we should take it */ 1994 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 1995 matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4; 1996 if (depth==0) goto _storeSequence; 1997 } } 1998 1999 /* first search (depth 0) */ 2000 { size_t ofbCandidate = 999999999; 2001 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); 2002 if (ml2 > matchLength) 2003 matchLength = ml2, start = ip, offBase = ofbCandidate; 2004 } 2005 2006 if (matchLength < 4) { 2007 size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); 2008 ip += step + 1; /* jump faster over incompressible sections */ 2009 /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 2010 * In this mode we stop inserting every position into our tables, and only insert 2011 * positions that we search, which is one in step positions. 2012 * The exact cutoff is flexible, I've just chosen a number that is reasonably high, 2013 * so we minimize the compression ratio loss in "normal" scenarios. This mode gets 2014 * triggered once we've gone 2KB without finding any matches. 2015 */ 2016 ms->lazySkipping = step > kLazySkippingStep; 2017 continue; 2018 } 2019 2020 /* let's try to find a better solution */ 2021 if (depth>=1) 2022 while (ip<ilimit) { 2023 ip ++; 2024 curr++; 2025 /* check repCode */ 2026 if (offBase) { 2027 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); 2028 const U32 repIndex = (U32)(curr - offset_1); 2029 const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; 2030 const BYTE* const repMatch = repBase + repIndex; 2031 if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) 2032 & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2033 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2034 /* repcode detected */ 2035 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 2036 size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2037 int const gain2 = (int)(repLength * 3); 2038 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); 2039 if ((repLength >= 4) && (gain2 > gain1)) 2040 matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; 2041 } } 2042 2043 /* search match, depth 1 */ 2044 { size_t ofbCandidate = 999999999; 2045 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); 2046 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 2047 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); 2048 if ((ml2 >= 4) && (gain2 > gain1)) { 2049 matchLength = ml2, offBase = ofbCandidate, start = ip; 2050 continue; /* search a better one */ 2051 } } 2052 2053 /* let's find an even better one */ 2054 if ((depth==2) && (ip<ilimit)) { 2055 ip ++; 2056 curr++; 2057 /* check repCode */ 2058 if (offBase) { 2059 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); 2060 const U32 repIndex = (U32)(curr - offset_1); 2061 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 2062 const BYTE* const repMatch = repBase + repIndex; 2063 if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) 2064 & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2065 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2066 /* repcode detected */ 2067 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 2068 size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2069 int const gain2 = (int)(repLength * 4); 2070 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); 2071 if ((repLength >= 4) && (gain2 > gain1)) 2072 matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; 2073 } } 2074 2075 /* search match, depth 2 */ 2076 { size_t ofbCandidate = 999999999; 2077 size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); 2078 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ 2079 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); 2080 if ((ml2 >= 4) && (gain2 > gain1)) { 2081 matchLength = ml2, offBase = ofbCandidate, start = ip; 2082 continue; 2083 } } } 2084 break; /* nothing found : store previous solution */ 2085 } 2086 2087 /* catch up */ 2088 if (OFFBASE_IS_OFFSET(offBase)) { 2089 U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); 2090 const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; 2091 const BYTE* const mStart = (matchIndex < dictLimit) ? 
dictStart : prefixStart; 2092 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 2093 offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); 2094 } 2095 2096 /* store sequence */ 2097 _storeSequence: 2098 { size_t const litLength = (size_t)(start - anchor); 2099 ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); 2100 anchor = ip = start + matchLength; 2101 } 2102 if (ms->lazySkipping) { 2103 /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ 2104 if (searchMethod == search_rowHash) { 2105 ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); 2106 } 2107 ms->lazySkipping = 0; 2108 } 2109 2110 /* check immediate repcode */ 2111 while (ip <= ilimit) { 2112 const U32 repCurrent = (U32)(ip-base); 2113 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); 2114 const U32 repIndex = repCurrent - offset_2; 2115 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 2116 const BYTE* const repMatch = repBase + repIndex; 2117 if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) 2118 & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2119 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2120 /* repcode detected we should take it */ 2121 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 2122 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2123 offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ 2124 ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); 2125 ip += matchLength; 2126 anchor = ip; 2127 continue; /* faster when present ... (?) 
*/ 2128 } 2129 break; 2130 } } 2131 2132 /* Save reps for next block */ 2133 rep[0] = offset_1; 2134 rep[1] = offset_2; 2135 2136 /* Return the last literals size */ 2137 return (size_t)(iend - anchor); 2138 } 2139 #endif /* build exclusions */ 2140 2141 #ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR 2142 size_t ZSTD_compressBlock_greedy_extDict( 2143 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2144 void const* src, size_t srcSize) 2145 { 2146 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); 2147 } 2148 2149 size_t ZSTD_compressBlock_greedy_extDict_row( 2150 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2151 void const* src, size_t srcSize) 2152 { 2153 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); 2154 } 2155 #endif 2156 2157 #ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR 2158 size_t ZSTD_compressBlock_lazy_extDict( 2159 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2160 void const* src, size_t srcSize) 2161 2162 { 2163 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); 2164 } 2165 2166 size_t ZSTD_compressBlock_lazy_extDict_row( 2167 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2168 void const* src, size_t srcSize) 2169 2170 { 2171 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); 2172 } 2173 #endif 2174 2175 #ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR 2176 size_t ZSTD_compressBlock_lazy2_extDict( 2177 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2178 void const* src, size_t srcSize) 2179 2180 { 2181 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); 2182 } 2183 2184 size_t ZSTD_compressBlock_lazy2_extDict_row( 2185 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2186 void const* src, size_t srcSize) 2187 { 2188 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); 2189 } 2190 #endif 2191 2192 #ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR 2193 size_t ZSTD_compressBlock_btlazy2_extDict( 2194 ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 2195 void const* src, size_t srcSize) 2196 2197 { 2198 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); 2199 } 2200 #endif 2201
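/* Illustrative usage sketch (comment only, not compiled): how a block
 * compressor defined in this file is typically driven. The names `ms`,
 * `seqStore` and `rep` are assumed to have been prepared by the caller;
 * in practice the function is usually selected indirectly (e.g. via
 * ZSTD_selectBlockCompressor() in zstd_compress.c) rather than called
 * by name:
 *
 *   U32 rep[ZSTD_REP_NUM];     // repcode history, carried across blocks
 *   size_t lastLiterals;
 *   // ... ms, seqStore and rep initialized by the surrounding compressor ...
 *   lastLiterals = ZSTD_compressBlock_lazy2_row(ms, seqStore, rep, src, srcSize);
 *   // All sequences found were appended to seqStore, rep[] now holds the
 *   // updated repcode history, and the return value is the size of the
 *   // trailing run of literals that still needs to be emitted.
 */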