// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code. The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs. Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is under
 * construction by the bulk loading code. Staging btree cursors point to fake
 * root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type. When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */
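
/*
 * As a rough sketch of the calling convention described above, a rebuild of
 * a hypothetical AG btree type ("foo") might proceed as shown below. The
 * xfs_foobt_* names are placeholders for the per-type helpers that each
 * btree implementation provides, and error handling is omitted:
 *
 *	struct xbtree_afakeroot	afake = { 0 };
 *	struct xfs_btree_bload	bbl = { ... };
 *	struct xfs_btree_cur	*cur;
 *
 *	cur = xfs_foobt_stage_cursor(mp, &afake, ...);
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	(allocate bbl.nr_blocks blocks for ->claim_block here)
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp);
 *	xfs_btree_del_cursor(cur, error);
 */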

/*
 * Don't allow staging cursors to be duplicated because they're supposed to be
 * kept private to a single thread.
 */
STATIC struct xfs_btree_cur *
xfs_btree_fakeroot_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	ASSERT(0);
	return NULL;
}

/*
 * Don't allow block allocation for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 *
 * Bulk loading uses a separate callback to obtain new blocks from a
 * preallocated list, which prevents ENOSPC failures during loading.
 */
STATIC int
xfs_btree_fakeroot_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start_bno,
	union xfs_btree_ptr		*new_bno,
	int				*stat)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/*
 * Don't allow block freeing for a staging cursor, because staging cursors
 * do not support regular btree modifications.
 */
STATIC int
xfs_btree_fakeroot_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	ASSERT(0);
	return -EFSCORRUPTED;
}

/* Initialize a pointer to the root block from the fakeroot. */
STATIC void
xfs_btree_fakeroot_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xbtree_afakeroot	*afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	afake = cur->bc_ag.afake;
	ptr->s = cpu_to_be32(afake->af_root);
}

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to the
 * staging cursor. Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
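
/*
 * A minimal sketch of what a per-type staging constructor might look like,
 * using hypothetical xfs_foobt_* names (the real per-type helpers live with
 * each btree implementation and may take different arguments):
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_common(mp, NULL);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 * The matching _commit_staged_btree() helper would copy afake->af_root,
 * af_levels, and af_blocks into the AG header, log that buffer, and then
 * call xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_foobt_ops).
 */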

/* Update the btree root information for a per-AG fake root. */
STATIC void
xfs_btree_afakeroot_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	afake->af_root = be32_to_cpu(ptr->s);
	afake->af_levels += inc;
}

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 * The btree cursor's bc_ops will be overridden as needed to make the staging
 * functionality work.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_afakeroot		*afake)
{
	struct xfs_btree_ops		*nops;

	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
	ASSERT(cur->bc_tp == NULL);

	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
	nops->free_block = xfs_btree_fakeroot_free_block;
	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
	nops->set_root = xfs_btree_afakeroot_set_root;
	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_ops = nops;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	kmem_free((void *)cur->bc_ops);
	cur->bc_ag.agbp = agbp;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor. This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */
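
/*
 * For illustration only, initializing the fake root for an inode-rooted
 * btree staging cursor might look roughly like the sketch below; the
 * placeholders in angle brackets and the xfs_foobt_* names are schematic,
 * not real helpers:
 *
 *	struct xbtree_ifakeroot	ifake = {
 *		.if_fork	= <freshly allocated struct xfs_ifork>,
 *		.if_fork_size	= <bytes available to this fork in the inode>,
 *		.if_format	= XFS_DINODE_FMT_BTREE,
 *	};
 *	struct xfs_btree_cur	*cur;
 *
 *	cur = xfs_foobt_stage_cursor(mp, ip, &ifake);
 *
 * where xfs_foobt_stage_cursor() calls xfs_btree_stage_ifakeroot(), and the
 * corresponding _commit_staged_btree() helper ends by calling
 * xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_foobt_ops).
 */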

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root. The btree cursor's bc_ops will be overridden as needed to make the
 * staging functionality work. If new_ops is not NULL, these new ops will be
 * passed out to the caller for further overriding.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake,
	struct xfs_btree_ops		**new_ops)
{
	struct xfs_btree_ops		*nops;

	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
	ASSERT(cur->bc_tp == NULL);

	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
	nops->free_block = xfs_btree_fakeroot_free_block;
	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ops = nops;
	cur->bc_flags |= XFS_BTREE_STAGING;

	if (new_ops)
		*new_ops = nops;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork,
	const struct xfs_btree_ops	*ops)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	kmem_free((void *)cur->bc_ops);
	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_ops = ops;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block). When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure and
 * then construct a staged btree cursor. Refer to the block comments about
 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 * more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree. See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block(). Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor. The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the following
 * algorithm: For leaf levels, the number of items for the level is nr_records
 * in the bload structure. For node levels, the number of items for the level
 * is the number of blocks in the next lower level of the tree. For each
 * level, the desired number of items per block is defined as:
 *
 * desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 * blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs. The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 * npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division. If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are computed
 * and returned via btree_height and nr_blocks.
 */
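
/*
 * Worked example of the geometry computation above, using made-up numbers:
 * suppose a leaf level must hold nr_records = 1000 items with maxrecs = 100,
 * minrecs = 50, and a leaf slack factor of 10. Then:
 *
 * desired = max(50, 100 - 10) = 90
 * blocks  = floor(1000 / 90) = 11
 * npb     = 1000 / 11 = 90, remainder 10
 *
 * so ten blocks are loaded with 91 records and one block with 90, none of
 * which exceeds maxrecs. The next level up must index those 11 blocks; with
 * the same geometry they all fit in a single root block, giving a btree of
 * height 2 and 12 blocks total.
 */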

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	struct xfs_buf			**bpp)
{
	struct xfs_buf			*bp = *bpp;
	int				error;

	if (!bp)
		return 0;

	/*
	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
	 * xfs_buf_read will not pointlessly reread the contents from the disk.
	 */
	bp->b_flags |= XBF_DONE;

	xfs_buf_delwri_queue_here(bp, buffers_list);
	xfs_buf_relse(bp);
	*bpp = NULL;
	bbl->nr_dirty++;

	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
		return 0;

	error = xfs_buf_delwri_submit(buffers_list);
	if (error)
		return error;

	bbl->nr_dirty = 0;
	return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the values
 * of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
		ifp->if_broot = kmem_zalloc(new_size, 0);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
				XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
				nr_this_block, cur->bc_ino.ip->i_ino,
				cur->bc_flags);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
	if (ret)
		return ret;

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_records_fn	get_records,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j = 1;
	int				ret;

	/* Fill the leaf block with records. */
	while (j <= recs_this_block) {
		ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
		if (ret < 0)
			return ret;
		j += ret;
	}

	return 0;
}

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree. A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr. On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		/*
		 * Read the lower-level block in case the buffer for it has
		 * been reclaimed. LRU refs will be set on the block, which is
		 * desirable if the new btree commits.
		 */
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want to
 * install at this level in the btree. Caller is responsible for having set
 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want to
 * install at this level in the btree, which must be somewhere between minrecs
 * and max_npb. The caller is free to install fewer records per block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level. For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	/*
	 * Compute the absolute maximum number of records that we can store in
	 * the ondisk block or inode root.
	 */
	if (cur->bc_ops->get_dmaxrecs)
		maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
	else
		maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block. Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks. Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs. Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
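
/*
 * For instance, with made-up values maxrecs = 16 and minrecs = 8, a negative
 * slack value is replaced with 16 - ((16 + 8) >> 1) = 4, so blocks at this
 * level are loaded with about 12 of 16 records, i.e. roughly 75% full.
 */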

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl. Caller must ensure that the btree cursor is a staging
 * cursor. This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks. Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1). We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if (level != 0 && nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1. Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root. Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
	bbl->nr_dirty = 0;

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases. blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;

	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
	if (ret)
		goto out;

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;

		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
		if (ret)
			goto out;

		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk. If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail. This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}