1 // SPDX-License-Identifier: BSD-3-Clause
2 /*
3 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 2016-2018, Klara Inc.
34 * Copyright (c) 2016-2018, Allan Jude
35 * Copyright (c) 2018-2020, Sebastian Gottschall
36 * Copyright (c) 2019-2020, Michael Niewöhner
37 * Copyright (c) 2020, The FreeBSD Foundation [1]
38 *
39 * [1] Portions of this software were developed by Allan Jude
40 * under sponsorship from the FreeBSD Foundation.
41 */
42
43 #include <sys/param.h>
44 #include <sys/sysmacros.h>
45 #include <sys/zfs_context.h>
46 #include <sys/zio_compress.h>
47 #include <sys/spa.h>
48 #include <sys/zstd/zstd.h>
49
50 #define ZSTD_STATIC_LINKING_ONLY
51 #include "lib/zstd.h"
52 #include "lib/zstd_errors.h"
53
54 #ifndef IN_LIBSA
55 static uint_t zstd_earlyabort_pass = 1;
56 static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
57 static unsigned int zstd_abort_size = (128 * 1024);
58 #endif
59
60 #ifdef IN_BASE
61 int zfs_zstd_decompress_buf(void *, void *, size_t, size_t, int);
62 #endif
63
64 static kstat_t *zstd_ksp = NULL;
65
66 typedef struct zstd_stats {
67 kstat_named_t zstd_stat_alloc_fail;
68 kstat_named_t zstd_stat_alloc_fallback;
69 kstat_named_t zstd_stat_com_alloc_fail;
70 kstat_named_t zstd_stat_dec_alloc_fail;
71 kstat_named_t zstd_stat_com_inval;
72 kstat_named_t zstd_stat_dec_inval;
73 kstat_named_t zstd_stat_dec_header_inval;
74 kstat_named_t zstd_stat_com_fail;
75 kstat_named_t zstd_stat_dec_fail;
76 /*
77 * LZ4 first-pass early abort verdict
78 */
79 kstat_named_t zstd_stat_lz4pass_allowed;
80 kstat_named_t zstd_stat_lz4pass_rejected;
81 /*
82 * zstd-1 second-pass early abort verdict
83 */
84 kstat_named_t zstd_stat_zstdpass_allowed;
85 kstat_named_t zstd_stat_zstdpass_rejected;
86 /*
87 * We excluded this from early abort for some reason
88 */
89 kstat_named_t zstd_stat_passignored;
90 kstat_named_t zstd_stat_passignored_size;
91 kstat_named_t zstd_stat_buffers;
92 kstat_named_t zstd_stat_size;
93 } zstd_stats_t;
94
95 static zstd_stats_t zstd_stats = {
96 { "alloc_fail", KSTAT_DATA_UINT64 },
97 { "alloc_fallback", KSTAT_DATA_UINT64 },
98 { "compress_alloc_fail", KSTAT_DATA_UINT64 },
99 { "decompress_alloc_fail", KSTAT_DATA_UINT64 },
100 { "compress_level_invalid", KSTAT_DATA_UINT64 },
101 { "decompress_level_invalid", KSTAT_DATA_UINT64 },
102 { "decompress_header_invalid", KSTAT_DATA_UINT64 },
103 { "compress_failed", KSTAT_DATA_UINT64 },
104 { "decompress_failed", KSTAT_DATA_UINT64 },
105 { "lz4pass_allowed", KSTAT_DATA_UINT64 },
106 { "lz4pass_rejected", KSTAT_DATA_UINT64 },
107 { "zstdpass_allowed", KSTAT_DATA_UINT64 },
108 { "zstdpass_rejected", KSTAT_DATA_UINT64 },
109 { "passignored", KSTAT_DATA_UINT64 },
110 { "passignored_size", KSTAT_DATA_UINT64 },
111 { "buffers", KSTAT_DATA_UINT64 },
112 { "size", KSTAT_DATA_UINT64 },
113 };
114
#ifdef _KERNEL
/*
 * kstat update callback for the zstd kstat.
 *
 * A write to our own kstat resets the event counters. The pool gauges
 * (zstd_stat_buffers, zstd_stat_size) are not reset here. Any other
 * request is a no-op.
 *
 * Always returns 0.
 */
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	/* Nothing to do unless this is a write aimed at the zstd kstat. */
	if (rw != KSTAT_WRITE || ksp != zstd_ksp) {
		return (0);
	}

	ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
	ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
	ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
	ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
	ZSTDSTAT_ZERO(zstd_stat_com_inval);
	ZSTDSTAT_ZERO(zstd_stat_dec_inval);
	ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
	ZSTDSTAT_ZERO(zstd_stat_com_fail);
	ZSTDSTAT_ZERO(zstd_stat_dec_fail);
	ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
	ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
	ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
	ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
	ZSTDSTAT_ZERO(zstd_stat_passignored);
	ZSTDSTAT_ZERO(zstd_stat_passignored_size);

	return (0);
}
#endif
142
/*
 * Origin of an allocation, recorded in the kmem_type field of the
 * struct zstd_kmem header so that zstd_free() can release it correctly.
 */
enum zstd_kmem_type {
	/* Never stored on a valid allocation */
	ZSTD_KMEM_UNKNOWN = 0,
	/* Stand-alone buffer; zstd_free() returns it with vmem_free() */
	ZSTD_KMEM_DEFAULT,
	/* Buffer owned by a pool slot; freeing only drops the slot barrier */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	/* Number of enumerators; used as an upper bound in assertions */
	ZSTD_KMEM_COUNT,
};
154
155 /* Structure for pooled memory objects */
156 struct zstd_pool {
157 void *mem;
158 size_t size;
159 kmutex_t barrier;
160 hrtime_t timeout;
161 };
162
163 /* Global structure for handling memory allocations */
164 struct zstd_kmem {
165 enum zstd_kmem_type kmem_type;
166 size_t kmem_size;
167 struct zstd_pool *pool;
168 };
169
170 /* Fallback memory structure used for decompression only if memory runs out */
171 struct zstd_fallback_mem {
172 size_t mem_size;
173 void *mem;
174 kmutex_t barrier;
175 };
176
177 struct zstd_levelmap {
178 int16_t zstd_level;
179 enum zio_zstd_levels level;
180 };
181
/*
 * ZSTD memory handlers
 *
 * Compression and decompression use separate allocation callbacks: the
 * decompression one additionally falls back to a reserved buffer when
 * memory runs out. Both share a single free callback.
 */
#ifndef IN_LIBSA
static void *zstd_alloc(void *opaque, size_t size);
#endif
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);
195
196 #ifndef IN_LIBSA
197 /* Compression memory handler */
198 static const ZSTD_customMem zstd_malloc = {
199 zstd_alloc,
200 zstd_free,
201 NULL,
202 };
203 #endif
204
205 /* Decompression memory handler */
206 static const ZSTD_customMem zstd_dctx_malloc = {
207 zstd_dctx_alloc,
208 zstd_free,
209 NULL,
210 };
211
212 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
213 static struct zstd_levelmap zstd_levels[] = {
214 {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
215 {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
216 {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
217 {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
218 {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
219 {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
220 {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
221 {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
222 {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
223 {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
224 {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
225 {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
226 {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
227 {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
228 {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
229 {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
230 {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
231 {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
232 {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
233 {-1, ZIO_ZSTD_LEVEL_FAST_1},
234 {-2, ZIO_ZSTD_LEVEL_FAST_2},
235 {-3, ZIO_ZSTD_LEVEL_FAST_3},
236 {-4, ZIO_ZSTD_LEVEL_FAST_4},
237 {-5, ZIO_ZSTD_LEVEL_FAST_5},
238 {-6, ZIO_ZSTD_LEVEL_FAST_6},
239 {-7, ZIO_ZSTD_LEVEL_FAST_7},
240 {-8, ZIO_ZSTD_LEVEL_FAST_8},
241 {-9, ZIO_ZSTD_LEVEL_FAST_9},
242 {-10, ZIO_ZSTD_LEVEL_FAST_10},
243 {-20, ZIO_ZSTD_LEVEL_FAST_20},
244 {-30, ZIO_ZSTD_LEVEL_FAST_30},
245 {-40, ZIO_ZSTD_LEVEL_FAST_40},
246 {-50, ZIO_ZSTD_LEVEL_FAST_50},
247 {-60, ZIO_ZSTD_LEVEL_FAST_60},
248 {-70, ZIO_ZSTD_LEVEL_FAST_70},
249 {-80, ZIO_ZSTD_LEVEL_FAST_80},
250 {-90, ZIO_ZSTD_LEVEL_FAST_90},
251 {-100, ZIO_ZSTD_LEVEL_FAST_100},
252 {-500, ZIO_ZSTD_LEVEL_FAST_500},
253 {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
254 };
255
/*
 * Number of slots in each memory pool. The value below is only a
 * placeholder: zstd_init() replaces it with boot_ncpus * 4 before the pools
 * are created.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
/*
 * Seconds an unused pool buffer is kept before it may be reaped. The
 * expansion is parenthesized so that it behaves as a single value inside
 * any surrounding expression.
 */
#define	ZSTD_POOL_TIMEOUT	(60 * 2)
264
265 static struct zstd_fallback_mem zstd_dctx_fallback;
266 static struct zstd_pool *zstd_mempool_cctx;
267 static struct zstd_pool *zstd_mempool_dctx;
268
/*
 * The bundled zstd library references the __asan_*_memory_region() hooks
 * whenever ADDRESS_SANITIZER is defined. Per the original note, user-space
 * ASAN provides them while KASAN does not, so rather than patching the
 * external code we define ADDRESS_SANITIZER ourselves and supply no-op hooks
 * for kernel builds.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif

/* Kernel space: empty stand-ins for the hooks. */
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);

void
__asan_unpoison_memory_region(void const volatile *addr, size_t size)
{
}

void
__asan_poison_memory_region(void const volatile *addr, size_t size)
{
}
#endif

/* User space: poison/unpoison for real; everywhere else expand to nothing. */
#if defined(ADDRESS_SANITIZER) && !defined(_KERNEL)
#define	ZSTD_ASAN_POISON(p, n)   __asan_poison_memory_region((p), (n))
#define	ZSTD_ASAN_UNPOISON(p, n) __asan_unpoison_memory_region((p), (n))
#else
#define	ZSTD_ASAN_POISON(p, n)   do { } while (0)
#define	ZSTD_ASAN_UNPOISON(p, n) do { } while (0)
#endif
294
295 static void
zstd_mempool_reap(struct zstd_pool * zstd_mempool)296 zstd_mempool_reap(struct zstd_pool *zstd_mempool)
297 {
298 struct zstd_pool *pool;
299
300 if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
301 return;
302 }
303
304 /* free obsolete slots */
305 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
306 pool = &zstd_mempool[i];
307 if (pool->mem && mutex_tryenter(&pool->barrier)) {
308 /* Free memory if unused object older than 2 minutes */
309 if (pool->mem && gethrestime_sec() > pool->timeout) {
310 vmem_free(pool->mem, pool->size);
311 ZSTDSTAT_SUB(zstd_stat_buffers, 1);
312 ZSTDSTAT_SUB(zstd_stat_size, pool->size);
313 pool->mem = NULL;
314 pool->size = 0;
315 pool->timeout = 0;
316 }
317 mutex_exit(&pool->barrier);
318 }
319 }
320 }
321
322 /*
323 * Try to get a cached allocated buffer from memory pool or allocate a new one
324 * if necessary. If a object is older than 2 minutes and does not fit the
325 * requested size, it will be released and a new cached entry will be allocated.
326 * If other pooled objects are detected without being used for 2 minutes, they
327 * will be released, too.
328 *
329 * The concept is that high frequency memory allocations of bigger objects are
330 * expensive. So if a lot of work is going on, allocations will be kept for a
331 * while and can be reused in that time frame.
332 *
333 * The scheduled release will be updated every time a object is reused.
334 */
335
336 static void *
zstd_mempool_alloc(struct zstd_pool * zstd_mempool,size_t size)337 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
338 {
339 struct zstd_pool *pool;
340 struct zstd_kmem *mem = NULL;
341
342 if (!zstd_mempool) {
343 return (NULL);
344 }
345
346 /* Seek for preallocated memory slot and free obsolete slots */
347 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
348 pool = &zstd_mempool[i];
349 /*
350 * This lock is simply a marker for a pool object being in use.
351 * If it's already hold, it will be skipped.
352 *
353 * We need to create it before checking it to avoid race
354 * conditions caused by running in a threaded context.
355 *
356 * The lock is later released by zstd_mempool_free.
357 */
358 if (mutex_tryenter(&pool->barrier)) {
359 /*
360 * Check if objects fits the size, if so we take it and
361 * update the timestamp.
362 */
363 if (pool->mem && size <= pool->size) {
364 pool->timeout = gethrestime_sec() +
365 ZSTD_POOL_TIMEOUT;
366 mem = pool->mem;
367 return (mem);
368 }
369 mutex_exit(&pool->barrier);
370 }
371 }
372
373 /*
374 * If no preallocated slot was found, try to fill in a new one.
375 *
376 * We run a similar algorithm twice here to avoid pool fragmentation.
377 * The first one may generate holes in the list if objects get released.
378 * We always make sure that these holes get filled instead of adding new
379 * allocations constantly at the end.
380 */
381 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
382 pool = &zstd_mempool[i];
383 if (mutex_tryenter(&pool->barrier)) {
384 /* Object is free, try to allocate new one */
385 if (!pool->mem) {
386 mem = vmem_alloc(size, KM_SLEEP);
387 if (mem) {
388 ZSTDSTAT_ADD(zstd_stat_buffers, 1);
389 ZSTDSTAT_ADD(zstd_stat_size, size);
390 pool->mem = mem;
391 pool->size = size;
392 /* Keep track for later release */
393 mem->pool = pool;
394 mem->kmem_type = ZSTD_KMEM_POOL;
395 mem->kmem_size = size;
396 }
397 }
398
399 if (size <= pool->size) {
400 /* Update timestamp */
401 pool->timeout = gethrestime_sec() +
402 ZSTD_POOL_TIMEOUT;
403
404 return (pool->mem);
405 }
406
407 mutex_exit(&pool->barrier);
408 }
409 }
410
411 /*
412 * If the pool is full or the allocation failed, try lazy allocation
413 * instead.
414 */
415 if (!mem) {
416 mem = vmem_alloc(size, KM_NOSLEEP);
417 if (mem) {
418 mem->pool = NULL;
419 mem->kmem_type = ZSTD_KMEM_DEFAULT;
420 mem->kmem_size = size;
421 }
422 }
423
424 return (mem);
425 }
426
427 /* Mark object as released by releasing the barrier mutex */
428 static void
zstd_mempool_free(struct zstd_kmem * z)429 zstd_mempool_free(struct zstd_kmem *z)
430 {
431 /* Poison only the user-visible region (exclude header). */
432 ZSTD_ASAN_POISON((char *)z + sizeof (struct zstd_kmem),
433 z->kmem_size - sizeof (struct zstd_kmem));
434
435 mutex_exit(&z->pool->barrier);
436 }
437
438 /* Convert ZFS internal enum to ZSTD level */
439 static int
zstd_enum_to_level(enum zio_zstd_levels level,int16_t * zstd_level)440 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
441 {
442 if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
443 *zstd_level = zstd_levels[level - 1].zstd_level;
444 return (0);
445 }
446 if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
447 level <= ZIO_ZSTD_LEVEL_FAST_1000) {
448 *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
449 + ZIO_ZSTD_LEVEL_19].zstd_level;
450 return (0);
451 }
452
453 /* Invalid/unknown zfs compression enum - this should never happen. */
454 return (1);
455 }
456
457 #ifndef IN_LIBSA
458 /* Compress block using zstd */
459 static size_t
zfs_zstd_compress_impl(void * s_start,void * d_start,size_t s_len,size_t d_len,int level)460 zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len,
461 int level)
462 {
463 size_t c_len;
464 int16_t zstd_level;
465 zfs_zstdhdr_t *hdr;
466 ZSTD_CCtx *cctx;
467
468 hdr = (zfs_zstdhdr_t *)d_start;
469
470 /* Skip compression if the specified level is invalid */
471 if (zstd_enum_to_level(level, &zstd_level)) {
472 ZSTDSTAT_BUMP(zstd_stat_com_inval);
473 return (s_len);
474 }
475
476 ASSERT3U(d_len, >=, sizeof (*hdr));
477 ASSERT3U(d_len, <=, s_len);
478 ASSERT3U(zstd_level, !=, 0);
479
480 cctx = ZSTD_createCCtx_advanced(zstd_malloc);
481
482 /*
483 * Out of kernel memory, gently fall through - this will disable
484 * compression in zio_compress_data
485 */
486 if (!cctx) {
487 ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
488 return (s_len);
489 }
490
491 /* Set the compression level */
492 ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
493
494 /* Use the "magicless" zstd header which saves us 4 header bytes */
495 ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
496
497 /*
498 * Disable redundant checksum calculation and content size storage since
499 * this is already done by ZFS itself.
500 */
501 ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
502 ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
503
504 c_len = ZSTD_compress2(cctx,
505 hdr->data,
506 d_len - sizeof (*hdr),
507 s_start, s_len);
508
509 ZSTD_freeCCtx(cctx);
510
511 /* Error in the compression routine, disable compression. */
512 if (ZSTD_isError(c_len)) {
513 /*
514 * If we are aborting the compression because the saves are
515 * too small, that is not a failure. Everything else is a
516 * failure, so increment the compression failure counter.
517 */
518 int err = ZSTD_getErrorCode(c_len);
519 if (err != ZSTD_error_dstSize_tooSmall) {
520 ZSTDSTAT_BUMP(zstd_stat_com_fail);
521 dprintf("Error: %s", ZSTD_getErrorString(err));
522 }
523 return (s_len);
524 }
525
526 /*
527 * Encode the compressed buffer size at the start. We'll need this in
528 * decompression to counter the effects of padding which might be added
529 * to the compressed buffer and which, if unhandled, would confuse the
530 * hell out of our decompression function.
531 */
532 hdr->c_len = BE_32(c_len);
533
534 /*
535 * Check version for overflow.
536 * The limit of 24 bits must not be exceeded. This allows a maximum
537 * version 1677.72.15 which we don't expect to be ever reached.
538 */
539 ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
540
541 /*
542 * Encode the compression level as well. We may need to know the
543 * original compression level if compressed_arc is disabled, to match
544 * the compression settings to write this block to the L2ARC.
545 *
546 * Encode the actual level, so if the enum changes in the future, we
547 * will be compatible.
548 *
549 * The upper 24 bits store the ZSTD version to be able to provide
550 * future compatibility, since new versions might enhance the
551 * compression algorithm in a way, where the compressed data will
552 * change.
553 *
554 * As soon as such incompatibility occurs, handling code needs to be
555 * added, differentiating between the versions.
556 */
557 zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
558 zfs_set_hdrlevel(hdr, level);
559 hdr->raw_version_level = BE_32(hdr->raw_version_level);
560
561 return (c_len + sizeof (*hdr));
562 }
563
564 static size_t
zfs_zstd_compress_buf(void * s_start,void * d_start,size_t s_len,size_t d_len,int level)565 zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len,
566 int level)
567 {
568 int16_t zstd_level;
569 if (zstd_enum_to_level(level, &zstd_level)) {
570 ZSTDSTAT_BUMP(zstd_stat_com_inval);
571 return (s_len);
572 }
573 /*
574 * A zstd early abort heuristic.
575 *
576 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
577 * 128k), don't try any of this, just go.
578 * (because experimentally that was a reasonable cutoff for a perf win
579 * with tiny ratio change)
580 * - First, we try LZ4 compression, and if it doesn't early abort, we
581 * jump directly to whatever compression level we intended to try.
582 * - Second, we try zstd-1 - if that errors out (usually, but not
583 * exclusively, if it would overflow), we give up early.
584 *
585 * If it works, instead we go on and compress anyway.
586 *
587 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
588 * compressible data, it was losing up to 8.5% of the compressed
589 * savings versus no early abort, and all the zstd-fast levels are
590 * worse indications on their own than LZ4, and don't improve the LZ4
591 * pass noticably if stacked like this.
592 */
593 size_t actual_abort_size = zstd_abort_size;
594 if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
595 s_len >= actual_abort_size) {
596 abd_t sabd, dabd;
597 abd_get_from_buf_struct(&sabd, s_start, s_len);
598 abd_get_from_buf_struct(&dabd, d_start, d_len);
599 int pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0);
600 abd_free(&dabd);
601 abd_free(&sabd);
602 if (pass_len < d_len) {
603 ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
604 goto keep_trying;
605 }
606 ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
607
608 pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len,
609 d_len, ZIO_ZSTD_LEVEL_1);
610 if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
611 ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
612 return (s_len);
613 }
614 ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
615 } else {
616 ZSTDSTAT_BUMP(zstd_stat_passignored);
617 if (s_len < actual_abort_size) {
618 ZSTDSTAT_BUMP(zstd_stat_passignored_size);
619 }
620 }
621 keep_trying:
622 return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level));
623
624 }
625 #endif
626
627 /* Decompress block using zstd and return its stored level */
628 static int
zfs_zstd_decompress_level_buf(void * s_start,void * d_start,size_t s_len,size_t d_len,uint8_t * level)629 zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len,
630 size_t d_len, uint8_t *level)
631 {
632 ZSTD_DCtx *dctx;
633 size_t result;
634 int16_t zstd_level;
635 uint32_t c_len;
636 const zfs_zstdhdr_t *hdr;
637 zfs_zstdhdr_t hdr_copy;
638
639 hdr = (const zfs_zstdhdr_t *)s_start;
640 c_len = BE_32(hdr->c_len);
641
642 /*
643 * Make a copy instead of directly converting the header, since we must
644 * not modify the original data that may be used again later.
645 */
646 hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
647 uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
648
649 /*
650 * NOTE: We ignore the ZSTD version for now. As soon as any
651 * incompatibility occurs, it has to be handled accordingly.
652 * The version can be accessed via `hdr_copy.version`.
653 */
654
655 /*
656 * Convert and check the level
657 * An invalid level is a strong indicator for data corruption! In such
658 * case return an error so the upper layers can try to fix it.
659 */
660 if (zstd_enum_to_level(curlevel, &zstd_level)) {
661 ZSTDSTAT_BUMP(zstd_stat_dec_inval);
662 return (1);
663 }
664
665 ASSERT3U(d_len, >=, s_len);
666 ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
667
668 /* Invalid compressed buffer size encoded at start */
669 if (c_len + sizeof (*hdr) > s_len) {
670 ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
671 return (1);
672 }
673
674 dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
675 if (!dctx) {
676 ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
677 return (1);
678 }
679
680 /* Set header type to "magicless" */
681 ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
682
683 /* Decompress the data and release the context */
684 result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
685 ZSTD_freeDCtx(dctx);
686
687 /*
688 * Returns 0 on success (decompression function returned non-negative)
689 * and non-zero on failure (decompression function returned negative.
690 */
691 if (ZSTD_isError(result)) {
692 ZSTDSTAT_BUMP(zstd_stat_dec_fail);
693 return (1);
694 }
695
696 if (level) {
697 *level = curlevel;
698 }
699
700 return (0);
701 }
702
703 /* Decompress datablock using zstd */
704 #ifdef IN_BASE
705 int
zfs_zstd_decompress_buf(void * s_start,void * d_start,size_t s_len,size_t d_len,int level __maybe_unused)706 zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
707 size_t d_len, int level __maybe_unused)
708 {
709
710 return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
711 NULL));
712 }
713 #else
714 static int
zfs_zstd_decompress_buf(void * s_start,void * d_start,size_t s_len,size_t d_len,int level __maybe_unused)715 zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
716 size_t d_len, int level __maybe_unused)
717 {
718
719 return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len,
720 NULL));
721 }
722 #endif
723
724 #ifndef IN_LIBSA
725 ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress)
ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)726 ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress)
727 ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level)
728
729 /* Allocator for zstd compression context using mempool_allocator */
730 static void *
731 zstd_alloc(void *opaque __maybe_unused, size_t size)
732 {
733 size_t nbytes = sizeof (struct zstd_kmem) + size;
734 struct zstd_kmem *z = NULL;
735
736 z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
737
738 if (!z) {
739 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
740 return (NULL);
741 }
742
743 void *p = (char *)z + sizeof (struct zstd_kmem);
744 ZSTD_ASAN_UNPOISON(p, size);
745 return (p);
746 }
747
748 #endif
749 /*
750 * Allocator for zstd decompression context using mempool_allocator with
751 * fallback to reserved memory if allocation fails
752 */
753 static void *
zstd_dctx_alloc(void * opaque __maybe_unused,size_t size)754 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
755 {
756 size_t nbytes = sizeof (struct zstd_kmem) + size;
757 struct zstd_kmem *z = NULL;
758 enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
759
760 z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
761 if (z) {
762 type = ZSTD_KMEM_POOL;
763 } else {
764 /* Try harder, decompression shall not fail */
765 z = vmem_alloc(nbytes, KM_SLEEP);
766 if (z) {
767 z->pool = NULL;
768 }
769 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
770 }
771
772 /* Fallback if everything fails */
773 if (!z) {
774 /*
775 * Barrier since we only can handle it in a single thread. All
776 * other following threads need to wait here until decompression
777 * is completed. zstd_free will release this barrier later.
778 */
779 mutex_enter(&zstd_dctx_fallback.barrier);
780
781 z = zstd_dctx_fallback.mem;
782 type = ZSTD_KMEM_DCTX;
783 ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
784 }
785
786 /* Allocation should always be successful */
787 if (!z) {
788 return (NULL);
789 }
790
791 z->kmem_type = type;
792 z->kmem_size = nbytes;
793
794 void *p = (char *)z + sizeof (struct zstd_kmem);
795 ZSTD_ASAN_UNPOISON(p, size);
796 return (p);
797 }
798
799 /* Free allocated memory by its specific type */
800 static void
zstd_free(void * opaque __maybe_unused,void * ptr)801 zstd_free(void *opaque __maybe_unused, void *ptr)
802 {
803 struct zstd_kmem *z =
804 (struct zstd_kmem *)((char *)ptr - sizeof (struct zstd_kmem));
805 enum zstd_kmem_type type;
806
807 ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
808 ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
809
810 type = z->kmem_type;
811 switch (type) {
812 case ZSTD_KMEM_DEFAULT:
813 vmem_free(z, z->kmem_size);
814 break;
815 case ZSTD_KMEM_POOL:
816 zstd_mempool_free(z);
817 break;
818 case ZSTD_KMEM_DCTX:
819 /* Poison fallback user region on release. */
820 ZSTD_ASAN_POISON(ptr, z->kmem_size - sizeof (struct zstd_kmem));
821 mutex_exit(&zstd_dctx_fallback.barrier);
822 break;
823 default:
824 break;
825 }
826 }
827
828 /* Allocate fallback memory to ensure safe decompression */
829 static void __init
create_fallback_mem(struct zstd_fallback_mem * mem,size_t size)830 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
831 {
832 mem->mem_size = size;
833 mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
834 mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
835 }
836
837 /* Initialize memory pool barrier mutexes */
838 static void __init
zstd_mempool_init(void)839 zstd_mempool_init(void)
840 {
841 zstd_mempool_cctx =
842 vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
843 zstd_mempool_dctx =
844 vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
845
846 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
847 mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
848 MUTEX_DEFAULT, NULL);
849 mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
850 MUTEX_DEFAULT, NULL);
851 }
852 }
853
854 /* Initialize zstd-related memory handling */
855 static int __init
zstd_meminit(void)856 zstd_meminit(void)
857 {
858 zstd_mempool_init();
859
860 /*
861 * Estimate the size of the fallback decompression context.
862 * The expected size on x64 with current ZSTD should be about 160 KB.
863 */
864 create_fallback_mem(&zstd_dctx_fallback,
865 P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
866 PAGESIZE));
867
868 return (0);
869 }
870
871 /* Release object from pool and free memory */
872 static void
release_pool(struct zstd_pool * pool)873 release_pool(struct zstd_pool *pool)
874 {
875 mutex_destroy(&pool->barrier);
876 vmem_free(pool->mem, pool->size);
877 pool->mem = NULL;
878 pool->size = 0;
879 }
880
881 /* Release memory pool objects */
882 static void
zstd_mempool_deinit(void)883 zstd_mempool_deinit(void)
884 {
885 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
886 release_pool(&zstd_mempool_cctx[i]);
887 release_pool(&zstd_mempool_dctx[i]);
888 }
889
890 vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
891 vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
892 zstd_mempool_dctx = NULL;
893 zstd_mempool_cctx = NULL;
894 }
895
896 /* release unused memory from pool */
897
898 void
zfs_zstd_cache_reap_now(void)899 zfs_zstd_cache_reap_now(void)
900 {
901
902 /*
903 * Short-circuit if there are no buffers to begin with.
904 */
905 if (ZSTDSTAT(zstd_stat_buffers) == 0)
906 return;
907
908 /*
909 * calling alloc with zero size seeks
910 * and releases old unused objects
911 */
912 zstd_mempool_reap(zstd_mempool_cctx);
913 zstd_mempool_reap(zstd_mempool_dctx);
914 }
915
916 extern int __init
zstd_init(void)917 zstd_init(void)
918 {
919 /* Set pool size by using maximum sane thread count * 4 */
920 pool_count = (boot_ncpus * 4);
921 zstd_meminit();
922
923 /* Initialize kstat */
924 zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
925 KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
926 KSTAT_FLAG_VIRTUAL);
927 if (zstd_ksp != NULL) {
928 zstd_ksp->ks_data = &zstd_stats;
929 kstat_install(zstd_ksp);
930 #ifdef _KERNEL
931 zstd_ksp->ks_update = kstat_zstd_update;
932 #endif
933 }
934
935 return (0);
936 }
937
938 extern void
zstd_fini(void)939 zstd_fini(void)
940 {
941 /* Deinitialize kstat */
942 if (zstd_ksp != NULL) {
943 kstat_delete(zstd_ksp);
944 zstd_ksp = NULL;
945 }
946
947 /* Release fallback memory */
948 vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
949 mutex_destroy(&zstd_dctx_fallback.barrier);
950
951 /* Deinit memory pool */
952 zstd_mempool_deinit();
953 }
954
#if defined(_KERNEL)
#ifdef __FreeBSD__
/* On FreeBSD, hook zstd_init()/zstd_fini() up as module init/exit. */
module_init(zstd_init);
module_exit(zstd_fini);
#endif

/* Export the early-abort tunables as module parameters. */
ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif
966