1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 The FreeBSD Foundation
5 *
6 * This software was developed by Mark Johnston under sponsorship from
7 * the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are
11 * met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/param.h>
32 #include <assert.h>
33 #include <fcntl.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <unistd.h>
37
38 #include <util.h>
39
40 #include "zfs.h"
41
42 #pragma GCC diagnostic push
43 #pragma GCC diagnostic ignored "-Wunused-function"
44 #include "zfs/fletcher.c"
45 #include "zfs/sha256.c"
46 #pragma GCC diagnostic pop
47
/*
 * Populate a block pointer describing a block of "size" bytes at vdev offset
 * "off" (relative to the end of the second leading vdev label).
 *
 * The block is stored uncompressed (so psize == lsize) with a single copy
 * addressed by one DVA on vdev 0.  "cksum" must already contain the checksum
 * of the block's contents, computed with algorithm "cksumt".
 */
static void
blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
{
	dva_t *dva;

	/* ZFS block sizes must be powers of two. */
	assert(powerof2(size));

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	/* No compression, so the physical size equals the logical size. */
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, cksumt);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	/* All blocks are born in the same, fixed transaction group. */
	BP_SET_BIRTH(bp, TXG, TXG);
	BP_SET_LEVEL(bp, level);
	BP_SET_FILL(bp, fill);
	BP_SET_TYPE(bp, dntype);

	/* A single DVA: one copy of the data, always on vdev 0. */
	dva = BP_IDENTITY(bp);
	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, off);
	DVA_SET_ASIZE(dva, size);
	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
}
73
74 /*
75 * Write a block of data to the vdev. The offset is always relative to the end
76 * of the second leading vdev label.
77 *
78 * Consumers should generally use the helpers below, which provide block
79 * pointers and update dnode accounting, rather than calling this function
80 * directly.
81 */
82 static void
vdev_pwrite(const zfs_opt_t * zfs,const void * buf,size_t len,off_t off)83 vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
84 {
85 ssize_t n;
86
87 assert(off >= 0 && off < zfs->asize);
88 assert(powerof2(len));
89 assert((off_t)len > 0 && off + (off_t)len > off &&
90 off + (off_t)len < zfs->asize);
91 if (zfs->spacemap != NULL) {
92 /*
93 * Verify that the blocks being written were in fact allocated.
94 *
95 * The space map isn't available once the on-disk space map is
96 * finalized, so this check doesn't quite catch everything.
97 */
98 assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
99 (off + len - 1) >> zfs->ashift, 1));
100 }
101
102 off += VDEV_LABEL_START_SIZE;
103 for (size_t sofar = 0; sofar < len; sofar += n) {
104 n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
105 off + sofar);
106 if (n < 0)
107 err(1, "pwrite");
108 assert(n > 0);
109 }
110 }
111
/*
 * Checksum "data", fill in "bp" to describe it, and write it to the vdev at
 * offset "loc".
 */
void
vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp)
{
	zio_cksum_t cksum;

	/* Fletcher-4 is the only checksum algorithm implemented here. */
	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);

	fletcher_4_native(data, sz, NULL, &cksum);
	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
	vdev_pwrite(zfs, data, sz, loc);
}
125
/*
 * Write a data or indirect block belonging to "dnode", filling in "bp", and
 * update the dnode's space accounting.
 */
void
vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
{
	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
	    data, sz, loc, bp);

	/* dn_used counts bytes only when DNODE_FLAG_USED_BYTES is set. */
	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
	dnode->dn_used += sz;
}
136
/*
 * Write a single level-0 data block for "dnode", populating its first block
 * pointer (level 0, fill count of 1).
 */
void
vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
    off_t sz, off_t loc)
{
	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
	    &dnode->dn_blkptr[0]);
}
144
145 static void
vdev_label_set_checksum(void * buf,off_t off,off_t size)146 vdev_label_set_checksum(void *buf, off_t off, off_t size)
147 {
148 zio_cksum_t cksum;
149 zio_eck_t *eck;
150
151 assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
152
153 eck = (zio_eck_t *)((char *)buf + size) - 1;
154 eck->zec_magic = ZEC_MAGIC;
155 ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
156 zio_checksum_SHA256(buf, size, NULL, &cksum);
157 eck->zec_cksum = cksum;
158 }
159
/*
 * Set embedded checksums and write the label at the specified index.
 *
 * Labels 0 and 1 are stored at the beginning of the vdev, labels 2 and 3 at
 * the end; each embedded structure's verifier checksum is seeded with its
 * absolute offset, so the label must be checksummed per-location.
 */
void
vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
{
	vdev_label_t *label;
	ssize_t n;
	off_t blksz, loff;

	assert(ind >= 0 && ind < VDEV_LABELS);

	/*
	 * Make a copy since we have to modify the label to set checksums.
	 */
	label = ecalloc(1, sizeof(*label));
	memcpy(label, labelp, sizeof(*label));

	/* Front labels (0, 1) vs. back labels (2, 3). */
	if (ind < 2)
		loff = ind * sizeof(*label);
	else
		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);

	/*
	 * Set the verifier checksum for the boot block.  We don't use it, but
	 * the FreeBSD loader reads it and will complain if the checksum isn't
	 * valid.
	 */
	vdev_label_set_checksum(&label->vl_be,
	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));

	/*
	 * Set the verifier checksum for the label.
	 */
	vdev_label_set_checksum(&label->vl_vdev_phys,
	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
	    sizeof(label->vl_vdev_phys));

	/*
	 * Set the verifier checksum for the uberblocks.  There is one uberblock
	 * per sector; for example, with an ashift of 12 we end up with
	 * 128KB/4KB=32 copies of the uberblock in the ring.
	 */
	blksz = ASHIFT_UBERBLOCK_SIZE(zfs->ashift);
	assert(sizeof(label->vl_uberblock) % blksz == 0);
	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
	    roff += blksz) {
		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
		    blksz);
	}

	/* Labels are written directly, not through vdev_pwrite(). */
	n = pwrite(zfs->fd, label, sizeof(*label), loff);
	if (n < 0)
		err(1, "writing vdev label");
	assert(n == sizeof(*label));

	free(label);
}
219
/*
 * Find a chunk of contiguous free space of length *lenp, according to the
 * following rules:
 * 1. If the length is less than or equal to 128KB, the returned run's length
 *    will be the smallest power of 2 equal to or larger than the length.
 * 2. If the length is larger than 128KB, the returned run's length will be
 *    the smallest multiple of 128KB that is larger than the length.
 * 3. The returned run's length will be size-aligned up to 128KB.
 *
 * On success, the run is marked allocated in the space map, *lenp is updated
 * to the (possibly rounded-up) length, and the vdev byte offset of the run is
 * returned.
 *
 * XXX-MJ the third rule isn't actually required, so this can just be a dumb
 * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
 * so let's keep it for now and hope we don't get too much fragmentation.
 * Alternately we could try to allocate all blocks of a certain size from the
 * same metaslab.
 */
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	if (len < MAXBLOCKSIZE) {
		/* Round a non-power-of-2 length up to the next power of 2. */
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		align = len / minblksz;
	} else {
		/* Large runs are 128KB multiples, aligned to 128KB. */
		len = roundup2(len, MAXBLOCKSIZE);
		align = MAXBLOCKSIZE / minblksz;
	}

	/*
	 * Scan for a clear run of nbits blocks; restart the search at the
	 * next aligned position until the run found is itself aligned.
	 */
	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		if ((loc & (align - 1)) == 0)
			break;
	}
	/* Guard against overflow of the bit index. */
	assert(loc + nbits > loc);
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}
272
273 static void
vdev_spacemap_init(zfs_opt_t * zfs)274 vdev_spacemap_init(zfs_opt_t *zfs)
275 {
276 uint64_t nbits;
277
278 assert(powerof2(zfs->mssize));
279
280 nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
281 if (nbits > INT_MAX) {
282 /*
283 * With the smallest block size of 512B, the limit on the image
284 * size is 2TB. That should be enough for anyone.
285 */
286 errx(1, "image size is too large");
287 }
288 zfs->spacemapbits = (int)nbits;
289 zfs->spacemap = bit_alloc(zfs->spacemapbits);
290 if (zfs->spacemap == NULL)
291 err(1, "bitstring allocation failed");
292 }
293
/*
 * Convert the in-memory space map into on-disk space maps, one per metaslab,
 * and write them together with the object array that indexes them.  This
 * finalizes space accounting: no further vdev space may be allocated after
 * this function returns, and zfs->spacemap is consumed and freed.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;

	/* Per-metaslab space map state: dnode, object ID, block location. */
	struct {
		dnode_phys_t *dnode;
		uint64_t dnid;
		off_t loc;
	} *sma;

	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);

	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;

	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;

	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	/*
	 * Detach the space map from zfs so that subsequent allocation attempts
	 * fail the zfs->spacemap == NULL assertions, while keeping a local
	 * reference for reading the final allocation state below.
	 */
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;

	/*
	 * Now that the set of allocated space is finalized, populate each space
	 * map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;

		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient bonus
		 * space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;

		smblk = ecalloc(1, smblksz);

		/* Walk this metaslab's bit range, [startb, endb). */
		alloc = length = 0;
		shift = zfs->msshift - zfs->ashift;
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;

			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
			if (srunb == -1 || srunb >= endb)
				break;

			/* Find the end of the run (first clear bit). */
			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;

			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to indicate
			 * this.  Run offsets are relative to the beginning of
			 * the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;

			/*
			 * Each record is two words; since smblksz is a
			 * multiple of the word size, length < smblksz/8
			 * implies length + 1 is also in bounds.
			 */
			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);

			alloc += runlen << zfs->ashift;
			length += 2;
		}

		/* The space map header lives in the dnode's bonus buffer. */
		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;

		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);

		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}

	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
	free(objarrblk);

	assert(zfs->spacemap == NULL);
	free(spacemap);
	free(sma);
}
412
413 void
vdev_init(zfs_opt_t * zfs,const char * image)414 vdev_init(zfs_opt_t *zfs, const char *image)
415 {
416 assert(zfs->ashift >= MINBLOCKSHIFT);
417
418 zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
419 if (zfs->fd == -1)
420 err(1, "Can't open `%s' for writing", image);
421 if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
422 err(1, "Failed to extend image file `%s'", image);
423
424 vdev_spacemap_init(zfs);
425 }
426
427 void
vdev_fini(zfs_opt_t * zfs)428 vdev_fini(zfs_opt_t *zfs)
429 {
430 assert(zfs->spacemap == NULL);
431
432 if (zfs->fd != -1) {
433 if (close(zfs->fd) != 0)
434 err(1, "close");
435 zfs->fd = -1;
436 }
437 }
438