1 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause 2 /* ****************************************************************** 3 * huff0 huffman decoder, 4 * part of Finite State Entropy library 5 * Copyright (c) Meta Platforms, Inc. and affiliates. 6 * 7 * You can contact the author at : 8 * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy 9 * 10 * This source code is licensed under both the BSD-style license (found in the 11 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 12 * in the COPYING file in the root directory of this source tree). 13 * You may select, at your option, one of the above-listed licenses. 14 ****************************************************************** */ 15 16 /* ************************************************************** 17 * Dependencies 18 ****************************************************************/ 19 #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memset */ 20 #include "../common/compiler.h" 21 #include "../common/bitstream.h" /* BIT_* */ 22 #include "../common/fse.h" /* to compress headers */ 23 #include "../common/huf.h" 24 #include "../common/error_private.h" 25 #include "../common/zstd_internal.h" 26 #include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ 27 28 /* ************************************************************** 29 * Constants 30 ****************************************************************/ 31 32 #define HUF_DECODER_FAST_TABLELOG 11 33 34 /* ************************************************************** 35 * Macros 36 ****************************************************************/ 37 38 #ifdef HUF_DISABLE_FAST_DECODE 39 # define HUF_ENABLE_FAST_DECODE 0 40 #else 41 # define HUF_ENABLE_FAST_DECODE 1 42 #endif 43 44 /* These two optional macros force the use one way or another of the two 45 * Huffman decompression implementations. You can't force in both directions 46 * at the same time. 47 */ 48 #if defined(HUF_FORCE_DECOMPRESS_X1) && \ 49 defined(HUF_FORCE_DECOMPRESS_X2) 50 #error "Cannot force the use of the X1 and X2 decoders at the same time!" 51 #endif 52 53 /* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is 54 * supported at runtime, so we can add the BMI2 target attribute. 55 * When it is disabled, we will still get BMI2 if it is enabled statically. 56 */ 57 #if DYNAMIC_BMI2 58 # define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE 59 #else 60 # define HUF_FAST_BMI2_ATTRS 61 #endif 62 63 #define HUF_EXTERN_C 64 #define HUF_ASM_DECL HUF_EXTERN_C 65 66 #if DYNAMIC_BMI2 67 # define HUF_NEED_BMI2_FUNCTION 1 68 #else 69 # define HUF_NEED_BMI2_FUNCTION 0 70 #endif 71 72 /* ************************************************************** 73 * Error Management 74 ****************************************************************/ 75 #define HUF_isError ERR_isError 76 77 78 /* ************************************************************** 79 * Byte alignment for workSpace management 80 ****************************************************************/ 81 #define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) 82 #define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) 83 84 85 /* ************************************************************** 86 * BMI2 Variant Wrappers 87 ****************************************************************/ 88 typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, 89 const void *cSrc, 90 size_t cSrcSize, 91 const HUF_DTable *DTable); 92 93 #if DYNAMIC_BMI2 94 95 #define HUF_DGEN(fn) \ 96 \ 97 static size_t fn##_default( \ 98 void* dst, size_t dstSize, \ 99 const void* cSrc, size_t cSrcSize, \ 100 const HUF_DTable* DTable) \ 101 { \ 102 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 103 } \ 104 \ 105 static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ 106 void* dst, size_t dstSize, \ 107 const void* cSrc, size_t cSrcSize, \ 108 const HUF_DTable* DTable) \ 109 { \ 110 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 111 } \ 112 \ 113 static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ 114 size_t cSrcSize, HUF_DTable const* DTable, int flags) \ 115 { \ 116 if (flags & HUF_flags_bmi2) { \ 117 return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ 118 } \ 119 return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ 120 } 121 122 #else 123 124 #define HUF_DGEN(fn) \ 125 static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ 126 size_t cSrcSize, HUF_DTable const* DTable, int flags) \ 127 { \ 128 (void)flags; \ 129 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 130 } 131 132 #endif 133 134 135 /*-***************************/ 136 /* generic DTableDesc */ 137 /*-***************************/ 138 typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; 139 140 static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) 141 { 142 DTableDesc dtd; 143 ZSTD_memcpy(&dtd, table, sizeof(dtd)); 144 return dtd; 145 } 146 147 static size_t HUF_initFastDStream(BYTE const* ip) { 148 BYTE const lastByte = ip[7]; 149 size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; 150 size_t const value = MEM_readLEST(ip) | 1; 151 assert(bitsConsumed <= 8); 152 assert(sizeof(size_t) == 8); 153 return value << bitsConsumed; 154 } 155 156 157 /* 158 * The input/output arguments to the Huffman fast decoding loop: 159 * 160 * ip [in/out] - The input pointers, must be updated to reflect what is consumed. 161 * op [in/out] - The output pointers, must be updated to reflect what is written. 162 * bits [in/out] - The bitstream containers, must be updated to reflect the current state. 163 * dt [in] - The decoding table. 164 * ilowest [in] - The beginning of the valid range of the input. Decoders may read 165 * down to this pointer. It may be below iend[0]. 166 * oend [in] - The end of the output stream. op[3] must not cross oend. 167 * iend [in] - The end of each input stream. ip[i] may cross iend[i], 168 * as long as it is above ilowest, but that indicates corruption. 169 */ 170 typedef struct { 171 BYTE const* ip[4]; 172 BYTE* op[4]; 173 U64 bits[4]; 174 void const* dt; 175 BYTE const* ilowest; 176 BYTE* oend; 177 BYTE const* iend[4]; 178 } HUF_DecompressFastArgs; 179 180 typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); 181 182 /* 183 * Initializes args for the fast decoding loop. 184 * @returns 1 on success 185 * 0 if the fallback implementation should be used. 186 * Or an error code on failure. 187 */ 188 static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) 189 { 190 void const* dt = DTable + 1; 191 U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; 192 193 const BYTE* const istart = (const BYTE*)src; 194 195 BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); 196 197 /* The fast decoding loop assumes 64-bit little-endian. 198 * This condition is false on x32. 199 */ 200 if (!MEM_isLittleEndian() || MEM_32bits()) 201 return 0; 202 203 /* Avoid nullptr addition */ 204 if (dstSize == 0) 205 return 0; 206 assert(dst != NULL); 207 208 /* strict minimum : jump table + 1 byte per stream */ 209 if (srcSize < 10) 210 return ERROR(corruption_detected); 211 212 /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. 213 * If table log is not correct at this point, fallback to the old decoder. 214 * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. 215 */ 216 if (dtLog != HUF_DECODER_FAST_TABLELOG) 217 return 0; 218 219 /* Read the jump table. */ 220 { 221 size_t const length1 = MEM_readLE16(istart); 222 size_t const length2 = MEM_readLE16(istart+2); 223 size_t const length3 = MEM_readLE16(istart+4); 224 size_t const length4 = srcSize - (length1 + length2 + length3 + 6); 225 args->iend[0] = istart + 6; /* jumpTable */ 226 args->iend[1] = args->iend[0] + length1; 227 args->iend[2] = args->iend[1] + length2; 228 args->iend[3] = args->iend[2] + length3; 229 230 /* HUF_initFastDStream() requires this, and this small of an input 231 * won't benefit from the ASM loop anyways. 232 */ 233 if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) 234 return 0; 235 if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ 236 } 237 /* ip[] contains the position that is currently loaded into bits[]. */ 238 args->ip[0] = args->iend[1] - sizeof(U64); 239 args->ip[1] = args->iend[2] - sizeof(U64); 240 args->ip[2] = args->iend[3] - sizeof(U64); 241 args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); 242 243 /* op[] contains the output pointers. */ 244 args->op[0] = (BYTE*)dst; 245 args->op[1] = args->op[0] + (dstSize+3)/4; 246 args->op[2] = args->op[1] + (dstSize+3)/4; 247 args->op[3] = args->op[2] + (dstSize+3)/4; 248 249 /* No point to call the ASM loop for tiny outputs. */ 250 if (args->op[3] >= oend) 251 return 0; 252 253 /* bits[] is the bit container. 254 * It is read from the MSB down to the LSB. 255 * It is shifted left as it is read, and zeros are 256 * shifted in. After the lowest valid bit a 1 is 257 * set, so that CountTrailingZeros(bits[]) can be used 258 * to count how many bits we've consumed. 259 */ 260 args->bits[0] = HUF_initFastDStream(args->ip[0]); 261 args->bits[1] = HUF_initFastDStream(args->ip[1]); 262 args->bits[2] = HUF_initFastDStream(args->ip[2]); 263 args->bits[3] = HUF_initFastDStream(args->ip[3]); 264 265 /* The decoders must be sure to never read beyond ilowest. 266 * This is lower than iend[0], but allowing decoders to read 267 * down to ilowest can allow an extra iteration or two in the 268 * fast loop. 269 */ 270 args->ilowest = istart; 271 272 args->oend = oend; 273 args->dt = dt; 274 275 return 1; 276 } 277 278 static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) 279 { 280 /* Validate that we haven't overwritten. */ 281 if (args->op[stream] > segmentEnd) 282 return ERROR(corruption_detected); 283 /* Validate that we haven't read beyond iend[]. 284 * Note that ip[] may be < iend[] because the MSB is 285 * the next bit to read, and we may have consumed 100% 286 * of the stream, so down to iend[i] - 8 is valid. 287 */ 288 if (args->ip[stream] < args->iend[stream] - 8) 289 return ERROR(corruption_detected); 290 291 /* Construct the BIT_DStream_t. */ 292 assert(sizeof(size_t) == 8); 293 bit->bitContainer = MEM_readLEST(args->ip[stream]); 294 bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); 295 bit->start = (const char*)args->ilowest; 296 bit->limitPtr = bit->start + sizeof(size_t); 297 bit->ptr = (const char*)args->ip[stream]; 298 299 return 0; 300 } 301 302 /* Calls X(N) for each stream 0, 1, 2, 3. */ 303 #define HUF_4X_FOR_EACH_STREAM(X) \ 304 do { \ 305 X(0); \ 306 X(1); \ 307 X(2); \ 308 X(3); \ 309 } while (0) 310 311 /* Calls X(N, var) for each stream 0, 1, 2, 3. */ 312 #define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ 313 do { \ 314 X(0, (var)); \ 315 X(1, (var)); \ 316 X(2, (var)); \ 317 X(3, (var)); \ 318 } while (0) 319 320 321 #ifndef HUF_FORCE_DECOMPRESS_X2 322 323 /*-***************************/ 324 /* single-symbol decoding */ 325 /*-***************************/ 326 typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ 327 328 /* 329 * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at 330 * a time. 331 */ 332 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { 333 U64 D4; 334 if (MEM_isLittleEndian()) { 335 D4 = (U64)((symbol << 8) + nbBits); 336 } else { 337 D4 = (U64)(symbol + (nbBits << 8)); 338 } 339 assert(D4 < (1U << 16)); 340 D4 *= 0x0001000100010001ULL; 341 return D4; 342 } 343 344 /* 345 * Increase the tableLog to targetTableLog and rescales the stats. 346 * If tableLog > targetTableLog this is a no-op. 347 * @returns New tableLog 348 */ 349 static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) 350 { 351 if (tableLog > targetTableLog) 352 return tableLog; 353 if (tableLog < targetTableLog) { 354 U32 const scale = targetTableLog - tableLog; 355 U32 s; 356 /* Increase the weight for all non-zero probability symbols by scale. */ 357 for (s = 0; s < nbSymbols; ++s) { 358 huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); 359 } 360 /* Update rankVal to reflect the new weights. 361 * All weights except 0 get moved to weight + scale. 362 * Weights [1, scale] are empty. 363 */ 364 for (s = targetTableLog; s > scale; --s) { 365 rankVal[s] = rankVal[s - scale]; 366 } 367 for (s = scale; s > 0; --s) { 368 rankVal[s] = 0; 369 } 370 } 371 return targetTableLog; 372 } 373 374 typedef struct { 375 U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; 376 U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; 377 U32 statsWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; 378 BYTE symbols[HUF_SYMBOLVALUE_MAX + 1]; 379 BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; 380 } HUF_ReadDTableX1_Workspace; 381 382 size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) 383 { 384 U32 tableLog = 0; 385 U32 nbSymbols = 0; 386 size_t iSize; 387 void* const dtPtr = DTable + 1; 388 HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; 389 HUF_ReadDTableX1_Workspace* wksp = (HUF_ReadDTableX1_Workspace*)workSpace; 390 391 DEBUG_STATIC_ASSERT(HUF_DECOMPRESS_WORKSPACE_SIZE >= sizeof(*wksp)); 392 if (sizeof(*wksp) > wkspSize) return ERROR(tableLog_tooLarge); 393 394 DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); 395 /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ 396 397 iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); 398 if (HUF_isError(iSize)) return iSize; 399 400 401 /* Table header */ 402 { DTableDesc dtd = HUF_getDTableDesc(DTable); 403 U32 const maxTableLog = dtd.maxTableLog + 1; 404 U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); 405 tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); 406 if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ 407 dtd.tableType = 0; 408 dtd.tableLog = (BYTE)tableLog; 409 ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); 410 } 411 412 /* Compute symbols and rankStart given rankVal: 413 * 414 * rankVal already contains the number of values of each weight. 415 * 416 * symbols contains the symbols ordered by weight. First are the rankVal[0] 417 * weight 0 symbols, followed by the rankVal[1] weight 1 symbols, and so on. 418 * symbols[0] is filled (but unused) to avoid a branch. 419 * 420 * rankStart contains the offset where each rank belongs in the DTable. 421 * rankStart[0] is not filled because there are no entries in the table for 422 * weight 0. 423 */ 424 { int n; 425 U32 nextRankStart = 0; 426 int const unroll = 4; 427 int const nLimit = (int)nbSymbols - unroll + 1; 428 for (n=0; n<(int)tableLog+1; n++) { 429 U32 const curr = nextRankStart; 430 nextRankStart += wksp->rankVal[n]; 431 wksp->rankStart[n] = curr; 432 } 433 for (n=0; n < nLimit; n += unroll) { 434 int u; 435 for (u=0; u < unroll; ++u) { 436 size_t const w = wksp->huffWeight[n+u]; 437 wksp->symbols[wksp->rankStart[w]++] = (BYTE)(n+u); 438 } 439 } 440 for (; n < (int)nbSymbols; ++n) { 441 size_t const w = wksp->huffWeight[n]; 442 wksp->symbols[wksp->rankStart[w]++] = (BYTE)n; 443 } 444 } 445 446 /* fill DTable 447 * We fill all entries of each weight in order. 448 * That way length is a constant for each iteration of the outer loop. 449 * We can switch based on the length to a different inner loop which is 450 * optimized for that particular case. 451 */ 452 { U32 w; 453 int symbol = wksp->rankVal[0]; 454 int rankStart = 0; 455 for (w=1; w<tableLog+1; ++w) { 456 int const symbolCount = wksp->rankVal[w]; 457 int const length = (1 << w) >> 1; 458 int uStart = rankStart; 459 BYTE const nbBits = (BYTE)(tableLog + 1 - w); 460 int s; 461 int u; 462 switch (length) { 463 case 1: 464 for (s=0; s<symbolCount; ++s) { 465 HUF_DEltX1 D; 466 D.byte = wksp->symbols[symbol + s]; 467 D.nbBits = nbBits; 468 dt[uStart] = D; 469 uStart += 1; 470 } 471 break; 472 case 2: 473 for (s=0; s<symbolCount; ++s) { 474 HUF_DEltX1 D; 475 D.byte = wksp->symbols[symbol + s]; 476 D.nbBits = nbBits; 477 dt[uStart+0] = D; 478 dt[uStart+1] = D; 479 uStart += 2; 480 } 481 break; 482 case 4: 483 for (s=0; s<symbolCount; ++s) { 484 U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); 485 MEM_write64(dt + uStart, D4); 486 uStart += 4; 487 } 488 break; 489 case 8: 490 for (s=0; s<symbolCount; ++s) { 491 U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); 492 MEM_write64(dt + uStart, D4); 493 MEM_write64(dt + uStart + 4, D4); 494 uStart += 8; 495 } 496 break; 497 default: 498 for (s=0; s<symbolCount; ++s) { 499 U64 const D4 = HUF_DEltX1_set4(wksp->symbols[symbol + s], nbBits); 500 for (u=0; u < length; u += 16) { 501 MEM_write64(dt + uStart + u + 0, D4); 502 MEM_write64(dt + uStart + u + 4, D4); 503 MEM_write64(dt + uStart + u + 8, D4); 504 MEM_write64(dt + uStart + u + 12, D4); 505 } 506 assert(u == length); 507 uStart += length; 508 } 509 break; 510 } 511 symbol += symbolCount; 512 rankStart += symbolCount * length; 513 } 514 } 515 return iSize; 516 } 517 518 FORCE_INLINE_TEMPLATE BYTE 519 HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) 520 { 521 size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ 522 BYTE const c = dt[val].byte; 523 BIT_skipBits(Dstream, dt[val].nbBits); 524 return c; 525 } 526 527 #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ 528 do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) 529 530 #define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ 531 do { \ 532 if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ 533 HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ 534 } while (0) 535 536 #define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ 537 do { \ 538 if (MEM_64bits()) \ 539 HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ 540 } while (0) 541 542 HINT_INLINE size_t 543 HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) 544 { 545 BYTE* const pStart = p; 546 547 /* up to 4 symbols at a time */ 548 if ((pEnd - p) > 3) { 549 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { 550 HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 551 HUF_DECODE_SYMBOLX1_1(p, bitDPtr); 552 HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 553 HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 554 } 555 } else { 556 BIT_reloadDStream(bitDPtr); 557 } 558 559 /* [0-3] symbols remaining */ 560 if (MEM_32bits()) 561 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) 562 HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 563 564 /* no more data to retrieve from bitstream, no need to reload */ 565 while (p < pEnd) 566 HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 567 568 return (size_t)(pEnd-pStart); 569 } 570 571 FORCE_INLINE_TEMPLATE size_t 572 HUF_decompress1X1_usingDTable_internal_body( 573 void* dst, size_t dstSize, 574 const void* cSrc, size_t cSrcSize, 575 const HUF_DTable* DTable) 576 { 577 BYTE* op = (BYTE*)dst; 578 BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); 579 const void* dtPtr = DTable + 1; 580 const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; 581 BIT_DStream_t bitD; 582 DTableDesc const dtd = HUF_getDTableDesc(DTable); 583 U32 const dtLog = dtd.tableLog; 584 585 CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); 586 587 HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog); 588 589 if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); 590 591 return dstSize; 592 } 593 594 /* HUF_decompress4X1_usingDTable_internal_body(): 595 * Conditions : 596 * @dstSize >= 6 597 */ 598 FORCE_INLINE_TEMPLATE size_t 599 HUF_decompress4X1_usingDTable_internal_body( 600 void* dst, size_t dstSize, 601 const void* cSrc, size_t cSrcSize, 602 const HUF_DTable* DTable) 603 { 604 /* Check */ 605 if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ 606 if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ 607 608 { const BYTE* const istart = (const BYTE*) cSrc; 609 BYTE* const ostart = (BYTE*) dst; 610 BYTE* const oend = ostart + dstSize; 611 BYTE* const olimit = oend - 3; 612 const void* const dtPtr = DTable + 1; 613 const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; 614 615 /* Init */ 616 BIT_DStream_t bitD1; 617 BIT_DStream_t bitD2; 618 BIT_DStream_t bitD3; 619 BIT_DStream_t bitD4; 620 size_t const length1 = MEM_readLE16(istart); 621 size_t const length2 = MEM_readLE16(istart+2); 622 size_t const length3 = MEM_readLE16(istart+4); 623 size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); 624 const BYTE* const istart1 = istart + 6; /* jumpTable */ 625 const BYTE* const istart2 = istart1 + length1; 626 const BYTE* const istart3 = istart2 + length2; 627 const BYTE* const istart4 = istart3 + length3; 628 const size_t segmentSize = (dstSize+3) / 4; 629 BYTE* const opStart2 = ostart + segmentSize; 630 BYTE* const opStart3 = opStart2 + segmentSize; 631 BYTE* const opStart4 = opStart3 + segmentSize; 632 BYTE* op1 = ostart; 633 BYTE* op2 = opStart2; 634 BYTE* op3 = opStart3; 635 BYTE* op4 = opStart4; 636 DTableDesc const dtd = HUF_getDTableDesc(DTable); 637 U32 const dtLog = dtd.tableLog; 638 U32 endSignal = 1; 639 640 if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ 641 if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ 642 assert(dstSize >= 6); /* validated above */ 643 CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); 644 CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); 645 CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); 646 CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); 647 648 /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ 649 if ((size_t)(oend - op4) >= sizeof(size_t)) { 650 for ( ; (endSignal) & (op4 < olimit) ; ) { 651 HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 652 HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 653 HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 654 HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 655 HUF_DECODE_SYMBOLX1_1(op1, &bitD1); 656 HUF_DECODE_SYMBOLX1_1(op2, &bitD2); 657 HUF_DECODE_SYMBOLX1_1(op3, &bitD3); 658 HUF_DECODE_SYMBOLX1_1(op4, &bitD4); 659 HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 660 HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 661 HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 662 HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 663 HUF_DECODE_SYMBOLX1_0(op1, &bitD1); 664 HUF_DECODE_SYMBOLX1_0(op2, &bitD2); 665 HUF_DECODE_SYMBOLX1_0(op3, &bitD3); 666 HUF_DECODE_SYMBOLX1_0(op4, &bitD4); 667 endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 668 endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 669 endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 670 endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 671 } 672 } 673 674 /* check corruption */ 675 /* note : should not be necessary : op# advance in lock step, and we control op4. 676 * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ 677 if (op1 > opStart2) return ERROR(corruption_detected); 678 if (op2 > opStart3) return ERROR(corruption_detected); 679 if (op3 > opStart4) return ERROR(corruption_detected); 680 /* note : op4 supposed already verified within main loop */ 681 682 /* finish bitStreams one by one */ 683 HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); 684 HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); 685 HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); 686 HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); 687 688 /* check */ 689 { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); 690 if (!endCheck) return ERROR(corruption_detected); } 691 692 /* decoded size */ 693 return dstSize; 694 } 695 } 696 697 #if HUF_NEED_BMI2_FUNCTION 698 static BMI2_TARGET_ATTRIBUTE 699 size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, 700 size_t cSrcSize, HUF_DTable const* DTable) { 701 return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 702 } 703 #endif 704 705 static 706 size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, 707 size_t cSrcSize, HUF_DTable const* DTable) { 708 return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 709 } 710 711 #if ZSTD_ENABLE_ASM_X86_64_BMI2 712 713 HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; 714 715 #endif 716 717 static HUF_FAST_BMI2_ATTRS 718 void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) 719 { 720 U64 bits[4]; 721 BYTE const* ip[4]; 722 BYTE* op[4]; 723 U16 const* const dtable = (U16 const*)args->dt; 724 BYTE* const oend = args->oend; 725 BYTE const* const ilowest = args->ilowest; 726 727 /* Copy the arguments to local variables */ 728 ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); 729 ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); 730 ZSTD_memcpy(&op, &args->op, sizeof(op)); 731 732 assert(MEM_isLittleEndian()); 733 assert(!MEM_32bits()); 734 735 for (;;) { 736 BYTE* olimit; 737 int stream; 738 739 /* Assert loop preconditions */ 740 #ifndef NDEBUG 741 for (stream = 0; stream < 4; ++stream) { 742 assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); 743 assert(ip[stream] >= ilowest); 744 } 745 #endif 746 /* Compute olimit */ 747 { 748 /* Each iteration produces 5 output symbols per stream */ 749 size_t const oiters = (size_t)(oend - op[3]) / 5; 750 /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes 751 * per stream. 752 */ 753 size_t const iiters = (size_t)(ip[0] - ilowest) / 7; 754 /* We can safely run iters iterations before running bounds checks */ 755 size_t const iters = MIN(oiters, iiters); 756 size_t const symbols = iters * 5; 757 758 /* We can simply check that op[3] < olimit, instead of checking all 759 * of our bounds, since we can't hit the other bounds until we've run 760 * iters iterations, which only happens when op[3] == olimit. 761 */ 762 olimit = op[3] + symbols; 763 764 /* Exit fast decoding loop once we reach the end. */ 765 if (op[3] == olimit) 766 break; 767 768 /* Exit the decoding loop if any input pointer has crossed the 769 * previous one. This indicates corruption, and a precondition 770 * to our loop is that ip[i] >= ip[0]. 771 */ 772 for (stream = 1; stream < 4; ++stream) { 773 if (ip[stream] < ip[stream - 1]) 774 goto _out; 775 } 776 } 777 778 #ifndef NDEBUG 779 for (stream = 1; stream < 4; ++stream) { 780 assert(ip[stream] >= ip[stream - 1]); 781 } 782 #endif 783 784 #define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ 785 do { \ 786 int const index = (int)(bits[(_stream)] >> 53); \ 787 int const entry = (int)dtable[index]; \ 788 bits[(_stream)] <<= (entry & 0x3F); \ 789 op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ 790 } while (0) 791 792 #define HUF_4X1_RELOAD_STREAM(_stream) \ 793 do { \ 794 int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ 795 int const nbBits = ctz & 7; \ 796 int const nbBytes = ctz >> 3; \ 797 op[(_stream)] += 5; \ 798 ip[(_stream)] -= nbBytes; \ 799 bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ 800 bits[(_stream)] <<= nbBits; \ 801 } while (0) 802 803 /* Manually unroll the loop because compilers don't consistently 804 * unroll the inner loops, which destroys performance. 805 */ 806 do { 807 /* Decode 5 symbols in each of the 4 streams */ 808 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); 809 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); 810 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); 811 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); 812 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); 813 814 /* Reload each of the 4 the bitstreams */ 815 HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); 816 } while (op[3] < olimit); 817 818 #undef HUF_4X1_DECODE_SYMBOL 819 #undef HUF_4X1_RELOAD_STREAM 820 } 821 822 _out: 823 824 /* Save the final values of each of the state variables back to args. */ 825 ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); 826 ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); 827 ZSTD_memcpy(&args->op, &op, sizeof(op)); 828 } 829 830 /* 831 * @returns @p dstSize on success (>= 6) 832 * 0 if the fallback implementation should be used 833 * An error if an error occurred 834 */ 835 static HUF_FAST_BMI2_ATTRS 836 size_t 837 HUF_decompress4X1_usingDTable_internal_fast( 838 void* dst, size_t dstSize, 839 const void* cSrc, size_t cSrcSize, 840 const HUF_DTable* DTable, 841 HUF_DecompressFastLoopFn loopFn) 842 { 843 void const* dt = DTable + 1; 844 BYTE const* const ilowest = (BYTE const*)cSrc; 845 BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); 846 HUF_DecompressFastArgs args; 847 { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); 848 FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); 849 if (ret == 0) 850 return 0; 851 } 852 853 assert(args.ip[0] >= args.ilowest); 854 loopFn(&args); 855 856 /* Our loop guarantees that ip[] >= ilowest and that we haven't 857 * overwritten any op[]. 858 */ 859 assert(args.ip[0] >= ilowest); 860 assert(args.ip[0] >= ilowest); 861 assert(args.ip[1] >= ilowest); 862 assert(args.ip[2] >= ilowest); 863 assert(args.ip[3] >= ilowest); 864 assert(args.op[3] <= oend); 865 866 assert(ilowest == args.ilowest); 867 assert(ilowest + 6 == args.iend[0]); 868 (void)ilowest; 869 870 /* finish bit streams one by one. */ 871 { size_t const segmentSize = (dstSize+3) / 4; 872 BYTE* segmentEnd = (BYTE*)dst; 873 int i; 874 for (i = 0; i < 4; ++i) { 875 BIT_DStream_t bit; 876 if (segmentSize <= (size_t)(oend - segmentEnd)) 877 segmentEnd += segmentSize; 878 else 879 segmentEnd = oend; 880 FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); 881 /* Decompress and validate that we've produced exactly the expected length. */ 882 args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); 883 if (args.op[i] != segmentEnd) return ERROR(corruption_detected); 884 } 885 } 886 887 /* decoded size */ 888 assert(dstSize != 0); 889 return dstSize; 890 } 891 892 HUF_DGEN(HUF_decompress1X1_usingDTable_internal) 893 894 static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, 895 size_t cSrcSize, HUF_DTable const* DTable, int flags) 896 { 897 HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; 898 HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; 899 900 #if DYNAMIC_BMI2 901 if (flags & HUF_flags_bmi2) { 902 fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; 903 # if ZSTD_ENABLE_ASM_X86_64_BMI2 904 if (!(flags & HUF_flags_disableAsm)) { 905 loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; 906 } 907 # endif 908 } else { 909 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 910 } 911 #endif 912 913 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) 914 if (!(flags & HUF_flags_disableAsm)) { 915 loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; 916 } 917 #endif 918 919 if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { 920 size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); 921 if (ret != 0) 922 return ret; 923 } 924 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 925 } 926 927 static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, 928 const void* cSrc, size_t cSrcSize, 929 void* workSpace, size_t wkspSize, int flags) 930 { 931 const BYTE* ip = (const BYTE*) cSrc; 932 933 size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); 934 if (HUF_isError(hSize)) return hSize; 935 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 936 ip += hSize; cSrcSize -= hSize; 937 938 return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); 939 } 940 941 #endif /* HUF_FORCE_DECOMPRESS_X2 */ 942 943 944 #ifndef HUF_FORCE_DECOMPRESS_X1 945 946 /* *************************/ 947 /* double-symbols decoding */ 948 /* *************************/ 949 950 typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ 951 typedef struct { BYTE symbol; } sortedSymbol_t; 952 typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; 953 typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; 954 955 /* 956 * Constructs a HUF_DEltX2 in a U32. 957 */ 958 static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) 959 { 960 U32 seq; 961 DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); 962 DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); 963 DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); 964 DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); 965 if (MEM_isLittleEndian()) { 966 seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); 967 return seq + (nbBits << 16) + ((U32)level << 24); 968 } else { 969 seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); 970 return (seq << 16) + (nbBits << 8) + (U32)level; 971 } 972 } 973 974 /* 975 * Constructs a HUF_DEltX2. 976 */ 977 static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) 978 { 979 HUF_DEltX2 DElt; 980 U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); 981 DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); 982 ZSTD_memcpy(&DElt, &val, sizeof(val)); 983 return DElt; 984 } 985 986 /* 987 * Constructs 2 HUF_DEltX2s and packs them into a U64. 988 */ 989 static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) 990 { 991 U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); 992 return (U64)DElt + ((U64)DElt << 32); 993 } 994 995 /* 996 * Fills the DTable rank with all the symbols from [begin, end) that are each 997 * nbBits long. 998 * 999 * @param DTableRank The start of the rank in the DTable. 1000 * @param begin The first symbol to fill (inclusive). 1001 * @param end The last symbol to fill (exclusive). 1002 * @param nbBits Each symbol is nbBits long. 1003 * @param tableLog The table log. 1004 * @param baseSeq If level == 1 { 0 } else { the first level symbol } 1005 * @param level The level in the table. Must be 1 or 2. 1006 */ 1007 static void HUF_fillDTableX2ForWeight( 1008 HUF_DEltX2* DTableRank, 1009 sortedSymbol_t const* begin, sortedSymbol_t const* end, 1010 U32 nbBits, U32 tableLog, 1011 U16 baseSeq, int const level) 1012 { 1013 U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); 1014 const sortedSymbol_t* ptr; 1015 assert(level >= 1 && level <= 2); 1016 switch (length) { 1017 case 1: 1018 for (ptr = begin; ptr != end; ++ptr) { 1019 HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); 1020 *DTableRank++ = DElt; 1021 } 1022 break; 1023 case 2: 1024 for (ptr = begin; ptr != end; ++ptr) { 1025 HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); 1026 DTableRank[0] = DElt; 1027 DTableRank[1] = DElt; 1028 DTableRank += 2; 1029 } 1030 break; 1031 case 4: 1032 for (ptr = begin; ptr != end; ++ptr) { 1033 U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 1034 ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 1035 ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 1036 DTableRank += 4; 1037 } 1038 break; 1039 case 8: 1040 for (ptr = begin; ptr != end; ++ptr) { 1041 U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 1042 ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 1043 ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 1044 ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); 1045 ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); 1046 DTableRank += 8; 1047 } 1048 break; 1049 default: 1050 for (ptr = begin; ptr != end; ++ptr) { 1051 U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 1052 HUF_DEltX2* const DTableRankEnd = DTableRank + length; 1053 for (; DTableRank != DTableRankEnd; DTableRank += 8) { 1054 ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 1055 ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 1056 ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); 1057 ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); 1058 } 1059 } 1060 break; 1061 } 1062 } 1063 1064 /* HUF_fillDTableX2Level2() : 1065 * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ 1066 static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, 1067 const U32* rankVal, const int minWeight, const int maxWeight1, 1068 const sortedSymbol_t* sortedSymbols, U32 const* rankStart, 1069 U32 nbBitsBaseline, U16 baseSeq) 1070 { 1071 /* Fill skipped values (all positions up to rankVal[minWeight]). 1072 * These are positions only get a single symbol because the combined weight 1073 * is too large. 1074 */ 1075 if (minWeight>1) { 1076 U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); 1077 U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); 1078 int const skipSize = rankVal[minWeight]; 1079 assert(length > 1); 1080 assert((U32)skipSize < length); 1081 switch (length) { 1082 case 2: 1083 assert(skipSize == 1); 1084 ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); 1085 break; 1086 case 4: 1087 assert(skipSize <= 4); 1088 ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); 1089 ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); 1090 break; 1091 default: 1092 { 1093 int i; 1094 for (i = 0; i < skipSize; i += 8) { 1095 ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); 1096 ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); 1097 ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); 1098 ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); 1099 } 1100 } 1101 } 1102 } 1103 1104 /* Fill each of the second level symbols by weight. */ 1105 { 1106 int w; 1107 for (w = minWeight; w < maxWeight1; ++w) { 1108 int const begin = rankStart[w]; 1109 int const end = rankStart[w+1]; 1110 U32 const nbBits = nbBitsBaseline - w; 1111 U32 const totalBits = nbBits + consumedBits; 1112 HUF_fillDTableX2ForWeight( 1113 DTable + rankVal[w], 1114 sortedSymbols + begin, sortedSymbols + end, 1115 totalBits, targetLog, 1116 baseSeq, /* level */ 2); 1117 } 1118 } 1119 } 1120 1121 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, 1122 const sortedSymbol_t* sortedList, 1123 const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, 1124 const U32 nbBitsBaseline) 1125 { 1126 U32* const rankVal = rankValOrigin[0]; 1127 const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ 1128 const U32 minBits = nbBitsBaseline - maxWeight; 1129 int w; 1130 int const wEnd = (int)maxWeight + 1; 1131 1132 /* Fill DTable in order of weight. */ 1133 for (w = 1; w < wEnd; ++w) { 1134 int const begin = (int)rankStart[w]; 1135 int const end = (int)rankStart[w+1]; 1136 U32 const nbBits = nbBitsBaseline - w; 1137 1138 if (targetLog-nbBits >= minBits) { 1139 /* Enough room for a second symbol. */ 1140 int start = rankVal[w]; 1141 U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); 1142 int minWeight = nbBits + scaleLog; 1143 int s; 1144 if (minWeight < 1) minWeight = 1; 1145 /* Fill the DTable for every symbol of weight w. 1146 * These symbols get at least 1 second symbol. 1147 */ 1148 for (s = begin; s != end; ++s) { 1149 HUF_fillDTableX2Level2( 1150 DTable + start, targetLog, nbBits, 1151 rankValOrigin[nbBits], minWeight, wEnd, 1152 sortedList, rankStart, 1153 nbBitsBaseline, sortedList[s].symbol); 1154 start += length; 1155 } 1156 } else { 1157 /* Only a single symbol. */ 1158 HUF_fillDTableX2ForWeight( 1159 DTable + rankVal[w], 1160 sortedList + begin, sortedList + end, 1161 nbBits, targetLog, 1162 /* baseSeq */ 0, /* level */ 1); 1163 } 1164 } 1165 } 1166 1167 typedef struct { 1168 rankValCol_t rankVal[HUF_TABLELOG_MAX]; 1169 U32 rankStats[HUF_TABLELOG_MAX + 1]; 1170 U32 rankStart0[HUF_TABLELOG_MAX + 3]; 1171 sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; 1172 BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; 1173 U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; 1174 } HUF_ReadDTableX2_Workspace; 1175 1176 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, 1177 const void* src, size_t srcSize, 1178 void* workSpace, size_t wkspSize, int flags) 1179 { 1180 U32 tableLog, maxW, nbSymbols; 1181 DTableDesc dtd = HUF_getDTableDesc(DTable); 1182 U32 maxTableLog = dtd.maxTableLog; 1183 size_t iSize; 1184 void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ 1185 HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; 1186 U32 *rankStart; 1187 1188 HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; 1189 1190 if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); 1191 1192 rankStart = wksp->rankStart0 + 1; 1193 ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); 1194 ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); 1195 1196 DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ 1197 if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); 1198 /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ 1199 1200 iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); 1201 if (HUF_isError(iSize)) return iSize; 1202 1203 /* check result */ 1204 if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ 1205 if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; 1206 1207 /* find maxWeight */ 1208 for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ 1209 1210 /* Get start index of each weight */ 1211 { U32 w, nextRankStart = 0; 1212 for (w=1; w<maxW+1; w++) { 1213 U32 curr = nextRankStart; 1214 nextRankStart += wksp->rankStats[w]; 1215 rankStart[w] = curr; 1216 } 1217 rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ 1218 rankStart[maxW+1] = nextRankStart; 1219 } 1220 1221 /* sort symbols by weight */ 1222 { U32 s; 1223 for (s=0; s<nbSymbols; s++) { 1224 U32 const w = wksp->weightList[s]; 1225 U32 const r = rankStart[w]++; 1226 wksp->sortedSymbol[r].symbol = (BYTE)s; 1227 } 1228 rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ 1229 } 1230 1231 /* Build rankVal */ 1232 { U32* const rankVal0 = wksp->rankVal[0]; 1233 { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ 1234 U32 nextRankVal = 0; 1235 U32 w; 1236 for (w=1; w<maxW+1; w++) { 1237 U32 curr = nextRankVal; 1238 nextRankVal += wksp->rankStats[w] << (w+rescale); 1239 rankVal0[w] = curr; 1240 } } 1241 { U32 const minBits = tableLog+1 - maxW; 1242 U32 consumed; 1243 for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { 1244 U32* const rankValPtr = wksp->rankVal[consumed]; 1245 U32 w; 1246 for (w = 1; w < maxW+1; w++) { 1247 rankValPtr[w] = rankVal0[w] >> consumed; 1248 } } } } 1249 1250 HUF_fillDTableX2(dt, maxTableLog, 1251 wksp->sortedSymbol, 1252 wksp->rankStart0, wksp->rankVal, maxW, 1253 tableLog+1); 1254 1255 dtd.tableLog = (BYTE)maxTableLog; 1256 dtd.tableType = 1; 1257 ZSTD_memcpy(DTable, &dtd, sizeof(dtd)); 1258 return iSize; 1259 } 1260 1261 1262 FORCE_INLINE_TEMPLATE U32 1263 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) 1264 { 1265 size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ 1266 ZSTD_memcpy(op, &dt[val].sequence, 2); 1267 BIT_skipBits(DStream, dt[val].nbBits); 1268 return dt[val].length; 1269 } 1270 1271 FORCE_INLINE_TEMPLATE U32 1272 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) 1273 { 1274 size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ 1275 ZSTD_memcpy(op, &dt[val].sequence, 1); 1276 if (dt[val].length==1) { 1277 BIT_skipBits(DStream, dt[val].nbBits); 1278 } else { 1279 if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { 1280 BIT_skipBits(DStream, dt[val].nbBits); 1281 if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) 1282 /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ 1283 DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); 1284 } 1285 } 1286 return 1; 1287 } 1288 1289 #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ 1290 do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) 1291 1292 #define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ 1293 do { \ 1294 if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ 1295 ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ 1296 } while (0) 1297 1298 #define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ 1299 do { \ 1300 if (MEM_64bits()) \ 1301 ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ 1302 } while (0) 1303 1304 HINT_INLINE size_t 1305 HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, 1306 const HUF_DEltX2* const dt, const U32 dtLog) 1307 { 1308 BYTE* const pStart = p; 1309 1310 /* up to 8 symbols at a time */ 1311 if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { 1312 if (dtLog <= 11 && MEM_64bits()) { 1313 /* up to 10 symbols at a time */ 1314 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { 1315 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1316 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1317 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1318 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1319 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1320 } 1321 } else { 1322 /* up to 8 symbols at a time */ 1323 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { 1324 HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 1325 HUF_DECODE_SYMBOLX2_1(p, bitDPtr); 1326 HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 1327 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1328 } 1329 } 1330 } else { 1331 BIT_reloadDStream(bitDPtr); 1332 } 1333 1334 /* closer to end : up to 2 symbols at a time */ 1335 if ((size_t)(pEnd - p) >= 2) { 1336 while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) 1337 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1338 1339 while (p <= pEnd-2) 1340 HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ 1341 } 1342 1343 if (p < pEnd) 1344 p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); 1345 1346 return p-pStart; 1347 } 1348 1349 FORCE_INLINE_TEMPLATE size_t 1350 HUF_decompress1X2_usingDTable_internal_body( 1351 void* dst, size_t dstSize, 1352 const void* cSrc, size_t cSrcSize, 1353 const HUF_DTable* DTable) 1354 { 1355 BIT_DStream_t bitD; 1356 1357 /* Init */ 1358 CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); 1359 1360 /* decode */ 1361 { BYTE* const ostart = (BYTE*) dst; 1362 BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); 1363 const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ 1364 const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; 1365 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1366 HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog); 1367 } 1368 1369 /* check */ 1370 if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); 1371 1372 /* decoded size */ 1373 return dstSize; 1374 } 1375 1376 /* HUF_decompress4X2_usingDTable_internal_body(): 1377 * Conditions: 1378 * @dstSize >= 6 1379 */ 1380 FORCE_INLINE_TEMPLATE size_t 1381 HUF_decompress4X2_usingDTable_internal_body( 1382 void* dst, size_t dstSize, 1383 const void* cSrc, size_t cSrcSize, 1384 const HUF_DTable* DTable) 1385 { 1386 if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ 1387 if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ 1388 1389 { const BYTE* const istart = (const BYTE*) cSrc; 1390 BYTE* const ostart = (BYTE*) dst; 1391 BYTE* const oend = ostart + dstSize; 1392 BYTE* const olimit = oend - (sizeof(size_t)-1); 1393 const void* const dtPtr = DTable+1; 1394 const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; 1395 1396 /* Init */ 1397 BIT_DStream_t bitD1; 1398 BIT_DStream_t bitD2; 1399 BIT_DStream_t bitD3; 1400 BIT_DStream_t bitD4; 1401 size_t const length1 = MEM_readLE16(istart); 1402 size_t const length2 = MEM_readLE16(istart+2); 1403 size_t const length3 = MEM_readLE16(istart+4); 1404 size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); 1405 const BYTE* const istart1 = istart + 6; /* jumpTable */ 1406 const BYTE* const istart2 = istart1 + length1; 1407 const BYTE* const istart3 = istart2 + length2; 1408 const BYTE* const istart4 = istart3 + length3; 1409 size_t const segmentSize = (dstSize+3) / 4; 1410 BYTE* const opStart2 = ostart + segmentSize; 1411 BYTE* const opStart3 = opStart2 + segmentSize; 1412 BYTE* const opStart4 = opStart3 + segmentSize; 1413 BYTE* op1 = ostart; 1414 BYTE* op2 = opStart2; 1415 BYTE* op3 = opStart3; 1416 BYTE* op4 = opStart4; 1417 U32 endSignal = 1; 1418 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1419 U32 const dtLog = dtd.tableLog; 1420 1421 if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ 1422 if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ 1423 assert(dstSize >= 6 /* validated above */); 1424 CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); 1425 CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); 1426 CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); 1427 CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); 1428 1429 /* 16-32 symbols per loop (4-8 symbols per stream) */ 1430 if ((size_t)(oend - op4) >= sizeof(size_t)) { 1431 for ( ; (endSignal) & (op4 < olimit); ) { 1432 #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) 1433 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1434 HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 1435 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1436 HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 1437 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1438 HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 1439 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1440 HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 1441 endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 1442 endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 1443 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1444 HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 1445 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1446 HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 1447 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1448 HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 1449 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1450 HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 1451 endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 1452 endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 1453 #else 1454 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1455 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1456 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1457 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1458 HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 1459 HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 1460 HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 1461 HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 1462 HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1463 HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1464 HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1465 HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1466 HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 1467 HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 1468 HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 1469 HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 1470 endSignal = (U32)LIKELY((U32) 1471 (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) 1472 & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) 1473 & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) 1474 & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); 1475 #endif 1476 } 1477 } 1478 1479 /* check corruption */ 1480 if (op1 > opStart2) return ERROR(corruption_detected); 1481 if (op2 > opStart3) return ERROR(corruption_detected); 1482 if (op3 > opStart4) return ERROR(corruption_detected); 1483 /* note : op4 already verified within main loop */ 1484 1485 /* finish bitStreams one by one */ 1486 HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); 1487 HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); 1488 HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); 1489 HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); 1490 1491 /* check */ 1492 { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); 1493 if (!endCheck) return ERROR(corruption_detected); } 1494 1495 /* decoded size */ 1496 return dstSize; 1497 } 1498 } 1499 1500 #if HUF_NEED_BMI2_FUNCTION 1501 static BMI2_TARGET_ATTRIBUTE 1502 size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, 1503 size_t cSrcSize, HUF_DTable const* DTable) { 1504 return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 1505 } 1506 #endif 1507 1508 static 1509 size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, 1510 size_t cSrcSize, HUF_DTable const* DTable) { 1511 return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 1512 } 1513 1514 #if ZSTD_ENABLE_ASM_X86_64_BMI2 1515 1516 HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; 1517 1518 #endif 1519 1520 static HUF_FAST_BMI2_ATTRS 1521 void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) 1522 { 1523 U64 bits[4]; 1524 BYTE const* ip[4]; 1525 BYTE* op[4]; 1526 BYTE* oend[4]; 1527 HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; 1528 BYTE const* const ilowest = args->ilowest; 1529 1530 /* Copy the arguments to local registers. */ 1531 ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); 1532 ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); 1533 ZSTD_memcpy(&op, &args->op, sizeof(op)); 1534 1535 oend[0] = op[1]; 1536 oend[1] = op[2]; 1537 oend[2] = op[3]; 1538 oend[3] = args->oend; 1539 1540 assert(MEM_isLittleEndian()); 1541 assert(!MEM_32bits()); 1542 1543 for (;;) { 1544 BYTE* olimit; 1545 int stream; 1546 1547 /* Assert loop preconditions */ 1548 #ifndef NDEBUG 1549 for (stream = 0; stream < 4; ++stream) { 1550 assert(op[stream] <= oend[stream]); 1551 assert(ip[stream] >= ilowest); 1552 } 1553 #endif 1554 /* Compute olimit */ 1555 { 1556 /* Each loop does 5 table lookups for each of the 4 streams. 1557 * Each table lookup consumes up to 11 bits of input, and produces 1558 * up to 2 bytes of output. 1559 */ 1560 /* We can consume up to 7 bytes of input per iteration per stream. 1561 * We also know that each input pointer is >= ip[0]. So we can run 1562 * iters loops before running out of input. 1563 */ 1564 size_t iters = (size_t)(ip[0] - ilowest) / 7; 1565 /* Each iteration can produce up to 10 bytes of output per stream. 1566 * Each output stream my advance at different rates. So take the 1567 * minimum number of safe iterations among all the output streams. 1568 */ 1569 for (stream = 0; stream < 4; ++stream) { 1570 size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; 1571 iters = MIN(iters, oiters); 1572 } 1573 1574 /* Each iteration produces at least 5 output symbols. So until 1575 * op[3] crosses olimit, we know we haven't executed iters 1576 * iterations yet. This saves us maintaining an iters counter, 1577 * at the expense of computing the remaining # of iterations 1578 * more frequently. 1579 */ 1580 olimit = op[3] + (iters * 5); 1581 1582 /* Exit the fast decoding loop once we reach the end. */ 1583 if (op[3] == olimit) 1584 break; 1585 1586 /* Exit the decoding loop if any input pointer has crossed the 1587 * previous one. This indicates corruption, and a precondition 1588 * to our loop is that ip[i] >= ip[0]. 1589 */ 1590 for (stream = 1; stream < 4; ++stream) { 1591 if (ip[stream] < ip[stream - 1]) 1592 goto _out; 1593 } 1594 } 1595 1596 #ifndef NDEBUG 1597 for (stream = 1; stream < 4; ++stream) { 1598 assert(ip[stream] >= ip[stream - 1]); 1599 } 1600 #endif 1601 1602 #define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ 1603 do { \ 1604 if ((_decode3) || (_stream) != 3) { \ 1605 int const index = (int)(bits[(_stream)] >> 53); \ 1606 HUF_DEltX2 const entry = dtable[index]; \ 1607 MEM_write16(op[(_stream)], entry.sequence); \ 1608 bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ 1609 op[(_stream)] += (entry.length); \ 1610 } \ 1611 } while (0) 1612 1613 #define HUF_4X2_RELOAD_STREAM(_stream) \ 1614 do { \ 1615 HUF_4X2_DECODE_SYMBOL(3, 1); \ 1616 { \ 1617 int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ 1618 int const nbBits = ctz & 7; \ 1619 int const nbBytes = ctz >> 3; \ 1620 ip[(_stream)] -= nbBytes; \ 1621 bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ 1622 bits[(_stream)] <<= nbBits; \ 1623 } \ 1624 } while (0) 1625 1626 /* Manually unroll the loop because compilers don't consistently 1627 * unroll the inner loops, which destroys performance. 1628 */ 1629 do { 1630 /* Decode 5 symbols from each of the first 3 streams. 1631 * The final stream will be decoded during the reload phase 1632 * to reduce register pressure. 1633 */ 1634 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1635 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1636 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1637 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1638 HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); 1639 1640 /* Decode one symbol from the final stream */ 1641 HUF_4X2_DECODE_SYMBOL(3, 1); 1642 1643 /* Decode 4 symbols from the final stream & reload bitstreams. 1644 * The final stream is reloaded last, meaning that all 5 symbols 1645 * are decoded from the final stream before it is reloaded. 1646 */ 1647 HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); 1648 } while (op[3] < olimit); 1649 } 1650 1651 #undef HUF_4X2_DECODE_SYMBOL 1652 #undef HUF_4X2_RELOAD_STREAM 1653 1654 _out: 1655 1656 /* Save the final values of each of the state variables back to args. */ 1657 ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); 1658 ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); 1659 ZSTD_memcpy(&args->op, &op, sizeof(op)); 1660 } 1661 1662 1663 static HUF_FAST_BMI2_ATTRS size_t 1664 HUF_decompress4X2_usingDTable_internal_fast( 1665 void* dst, size_t dstSize, 1666 const void* cSrc, size_t cSrcSize, 1667 const HUF_DTable* DTable, 1668 HUF_DecompressFastLoopFn loopFn) { 1669 void const* dt = DTable + 1; 1670 const BYTE* const ilowest = (const BYTE*)cSrc; 1671 BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); 1672 HUF_DecompressFastArgs args; 1673 { 1674 size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); 1675 FORWARD_IF_ERROR(ret, "Failed to init asm args"); 1676 if (ret == 0) 1677 return 0; 1678 } 1679 1680 assert(args.ip[0] >= args.ilowest); 1681 loopFn(&args); 1682 1683 /* note : op4 already verified within main loop */ 1684 assert(args.ip[0] >= ilowest); 1685 assert(args.ip[1] >= ilowest); 1686 assert(args.ip[2] >= ilowest); 1687 assert(args.ip[3] >= ilowest); 1688 assert(args.op[3] <= oend); 1689 1690 assert(ilowest == args.ilowest); 1691 assert(ilowest + 6 == args.iend[0]); 1692 (void)ilowest; 1693 1694 /* finish bitStreams one by one */ 1695 { 1696 size_t const segmentSize = (dstSize+3) / 4; 1697 BYTE* segmentEnd = (BYTE*)dst; 1698 int i; 1699 for (i = 0; i < 4; ++i) { 1700 BIT_DStream_t bit; 1701 if (segmentSize <= (size_t)(oend - segmentEnd)) 1702 segmentEnd += segmentSize; 1703 else 1704 segmentEnd = oend; 1705 FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); 1706 args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); 1707 if (args.op[i] != segmentEnd) 1708 return ERROR(corruption_detected); 1709 } 1710 } 1711 1712 /* decoded size */ 1713 return dstSize; 1714 } 1715 1716 static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, 1717 size_t cSrcSize, HUF_DTable const* DTable, int flags) 1718 { 1719 HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; 1720 HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; 1721 1722 #if DYNAMIC_BMI2 1723 if (flags & HUF_flags_bmi2) { 1724 fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; 1725 # if ZSTD_ENABLE_ASM_X86_64_BMI2 1726 if (!(flags & HUF_flags_disableAsm)) { 1727 loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; 1728 } 1729 # endif 1730 } else { 1731 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 1732 } 1733 #endif 1734 1735 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) 1736 if (!(flags & HUF_flags_disableAsm)) { 1737 loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; 1738 } 1739 #endif 1740 1741 if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { 1742 size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); 1743 if (ret != 0) 1744 return ret; 1745 } 1746 return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); 1747 } 1748 1749 HUF_DGEN(HUF_decompress1X2_usingDTable_internal) 1750 1751 size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, 1752 const void* cSrc, size_t cSrcSize, 1753 void* workSpace, size_t wkspSize, int flags) 1754 { 1755 const BYTE* ip = (const BYTE*) cSrc; 1756 1757 size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, 1758 workSpace, wkspSize, flags); 1759 if (HUF_isError(hSize)) return hSize; 1760 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 1761 ip += hSize; cSrcSize -= hSize; 1762 1763 return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); 1764 } 1765 1766 static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, 1767 const void* cSrc, size_t cSrcSize, 1768 void* workSpace, size_t wkspSize, int flags) 1769 { 1770 const BYTE* ip = (const BYTE*) cSrc; 1771 1772 size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, 1773 workSpace, wkspSize, flags); 1774 if (HUF_isError(hSize)) return hSize; 1775 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 1776 ip += hSize; cSrcSize -= hSize; 1777 1778 return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); 1779 } 1780 1781 #endif /* HUF_FORCE_DECOMPRESS_X1 */ 1782 1783 1784 /* ***********************************/ 1785 /* Universal decompression selectors */ 1786 /* ***********************************/ 1787 1788 1789 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) 1790 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; 1791 static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = 1792 { 1793 /* single, double, quad */ 1794 {{0,0}, {1,1}}, /* Q==0 : impossible */ 1795 {{0,0}, {1,1}}, /* Q==1 : impossible */ 1796 {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ 1797 {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ 1798 {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ 1799 {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ 1800 {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ 1801 {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ 1802 {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ 1803 {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ 1804 {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ 1805 {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ 1806 {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ 1807 {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ 1808 {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ 1809 {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ 1810 }; 1811 #endif 1812 1813 /* HUF_selectDecoder() : 1814 * Tells which decoder is likely to decode faster, 1815 * based on a set of pre-computed metrics. 1816 * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . 1817 * Assumption : 0 < dstSize <= 128 KB */ 1818 U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) 1819 { 1820 assert(dstSize > 0); 1821 assert(dstSize <= 128*1024); 1822 #if defined(HUF_FORCE_DECOMPRESS_X1) 1823 (void)dstSize; 1824 (void)cSrcSize; 1825 return 0; 1826 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1827 (void)dstSize; 1828 (void)cSrcSize; 1829 return 1; 1830 #else 1831 /* decoder timing evaluation */ 1832 { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ 1833 U32 const D256 = (U32)(dstSize >> 8); 1834 U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); 1835 U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); 1836 DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ 1837 return DTime1 < DTime0; 1838 } 1839 #endif 1840 } 1841 1842 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, 1843 const void* cSrc, size_t cSrcSize, 1844 void* workSpace, size_t wkspSize, int flags) 1845 { 1846 /* validation checks */ 1847 if (dstSize == 0) return ERROR(dstSize_tooSmall); 1848 if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ 1849 if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ 1850 if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ 1851 1852 { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); 1853 #if defined(HUF_FORCE_DECOMPRESS_X1) 1854 (void)algoNb; 1855 assert(algoNb == 0); 1856 return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, 1857 cSrcSize, workSpace, wkspSize, flags); 1858 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1859 (void)algoNb; 1860 assert(algoNb == 1); 1861 return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, 1862 cSrcSize, workSpace, wkspSize, flags); 1863 #else 1864 return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, 1865 cSrcSize, workSpace, wkspSize, flags): 1866 HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, 1867 cSrcSize, workSpace, wkspSize, flags); 1868 #endif 1869 } 1870 } 1871 1872 1873 size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) 1874 { 1875 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1876 #if defined(HUF_FORCE_DECOMPRESS_X1) 1877 (void)dtd; 1878 assert(dtd.tableType == 0); 1879 return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1880 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1881 (void)dtd; 1882 assert(dtd.tableType == 1); 1883 return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1884 #else 1885 return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : 1886 HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1887 #endif 1888 } 1889 1890 #ifndef HUF_FORCE_DECOMPRESS_X2 1891 size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) 1892 { 1893 const BYTE* ip = (const BYTE*) cSrc; 1894 1895 size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); 1896 if (HUF_isError(hSize)) return hSize; 1897 if (hSize >= cSrcSize) return ERROR(srcSize_wrong); 1898 ip += hSize; cSrcSize -= hSize; 1899 1900 return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); 1901 } 1902 #endif 1903 1904 size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) 1905 { 1906 DTableDesc const dtd = HUF_getDTableDesc(DTable); 1907 #if defined(HUF_FORCE_DECOMPRESS_X1) 1908 (void)dtd; 1909 assert(dtd.tableType == 0); 1910 return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1911 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1912 (void)dtd; 1913 assert(dtd.tableType == 1); 1914 return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1915 #else 1916 return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : 1917 HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); 1918 #endif 1919 } 1920 1921 size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) 1922 { 1923 /* validation checks */ 1924 if (dstSize == 0) return ERROR(dstSize_tooSmall); 1925 if (cSrcSize == 0) return ERROR(corruption_detected); 1926 1927 { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); 1928 #if defined(HUF_FORCE_DECOMPRESS_X1) 1929 (void)algoNb; 1930 assert(algoNb == 0); 1931 return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); 1932 #elif defined(HUF_FORCE_DECOMPRESS_X2) 1933 (void)algoNb; 1934 assert(algoNb == 1); 1935 return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); 1936 #else 1937 return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : 1938 HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); 1939 #endif 1940 } 1941 } 1942