1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_zone_alloc.h"
20 #include "xfs_zone_priv.h"
21 #include "xfs_zones.h"
22 #include "xfs_trace.h"
23 
24 /*
25  * Implement Garbage Collection (GC) of partially used zones.
26  *
27  * To support the purely sequential writes in each zone, zoned XFS needs to be
28  * able to move data remaining in a zone out of it to reset the zone to prepare
29  * for writing to it again.
30  *
31  * This is done by the GC thread implemented in this file.  To support that,
32  * a number of zones (XFS_GC_ZONES) is reserved from the user-visible
33  * capacity to write the garbage-collected data into.
34  *
35  * Whenever the available space is below the chosen threshold, the GC thread
36  * looks for potential non-empty but not fully used zones that are worth
37  * reclaiming.  Once found the rmap for the victim zone is queried, and after
38  * a bit of sorting to reduce fragmentation, the still live extents are read
39  * into memory and written to the GC target zone, and the bmap btree of the
40  * files is updated to point to the new location.  To avoid taking the IOLOCK
41  * and MMAPLOCK for the entire GC process and thus affecting the latency of
42  * user reads and writes to the files, the GC writes are speculative and the
43  * I/O completion checks that no other writes happened for the affected regions
44  * before remapping.
45  *
46  * Once a zone does not contain any valid data, be that through GC or user
47  * block removal, it is queued for a zone reset.  The reset operation
48  * carefully ensures that the RT device cache is flushed and all transactions
49  * referencing the rmap have been committed to disk.
50  */
51 
52 /*
53  * Size of each GC scratch pad.  This is also the upper bound for each
54  * GC I/O, which helps to keep latency down.
55  */
56 #define XFS_GC_CHUNK_SIZE	SZ_1M
57 
58 /*
59  * Scratchpad data to read GCed data into.
60  *
61  * The offset member tracks where the next allocation starts, and freed tracks
62  * the amount of space that is not used anymore.
63  */
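/*
 * Two scratch pads are used so that reads for new GC chunks can continue into
 * a fresh pad once the current one has filled up, while the data already read
 * into the full pad is still being written out and remapped.  A pad is reused
 * once everything read into it has been written back out.
 */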
64 #define XFS_ZONE_GC_NR_SCRATCH	2
65 struct xfs_zone_scratch {
66 	struct folio			*folio;
67 	unsigned int			offset;
68 	unsigned int			freed;
69 };
70 
71 /*
72  * Chunk that is read and written for each GC operation.
73  *
74  * Note that for writes to actual zoned devices, the chunk can be split when
75  * reaching the hardware limit.
76  */
77 struct xfs_gc_bio {
78 	struct xfs_zone_gc_data		*data;
79 
80 	/*
81 	 * Entry into the reading/writing/resetting list.  Only accessed from
82 	 * the GC thread, so no locking needed.
83 	 */
84 	struct list_head		entry;
85 
86 	/*
87 	 * State of this gc_bio.  Done means the current I/O completed.
88 	 * Set from the bio end I/O handler, read from the GC thread.
89 	 */
90 	enum {
91 		XFS_GC_BIO_NEW,
92 		XFS_GC_BIO_DONE,
93 	} state;
94 
95 	/*
96 	 * Pointer to the inode and byte range in the inode that this
97 	 * GC chunk is operating on.
98 	 */
99 	struct xfs_inode		*ip;
100 	loff_t				offset;
101 	unsigned int			len;
102 
103 	/*
104 	 * Existing startblock (in the zone to be freed) and newly assigned
105 	 * daddr in the zone GCed into.
106 	 */
107 	xfs_fsblock_t			old_startblock;
108 	xfs_daddr_t			new_daddr;
109 	struct xfs_zone_scratch		*scratch;
110 
111 	/* Are we writing to a sequential write required zone? */
112 	bool				is_seq;
113 
114 	/* Open Zone being written to */
115 	struct xfs_open_zone		*oz;
116 
117 	/* Bio used for reads and writes, including the bvec used by it */
118 	struct bio_vec			bv;
119 	struct bio			bio;	/* must be last */
120 };
121 
122 #define XFS_ZONE_GC_RECS		1024
123 
124 /* iterator, needs to be reinitialized for each victim zone */
125 struct xfs_zone_gc_iter {
126 	struct xfs_rtgroup		*victim_rtg;
127 	unsigned int			rec_count;
128 	unsigned int			rec_idx;
129 	xfs_agblock_t			next_startblock;
130 	struct xfs_rmap_irec		*recs;
131 };
132 
133 /*
134  * Per-mount GC state.
135  */
136 struct xfs_zone_gc_data {
137 	struct xfs_mount		*mp;
138 
139 	/* bioset used to allocate the gc_bios */
140 	struct bio_set			bio_set;
141 
142 	/*
143 	 * Scratch pads used for GC I/O, and the index of the one currently in use.
144 	 */
145 	struct xfs_zone_scratch		scratch[XFS_ZONE_GC_NR_SCRATCH];
146 	unsigned int			scratch_idx;
147 
148 	/*
149 	 * List of bios currently being read, written and reset.
150 	 * These lists are only accessed by the GC thread itself, and must only
151 	 * be processed in order.
152 	 */
153 	struct list_head		reading;
154 	struct list_head		writing;
155 	struct list_head		resetting;
156 
157 	/*
158 	 * Iterator for the victim zone.
159 	 */
160 	struct xfs_zone_gc_iter		iter;
161 };
162 
163 /*
164  * We aim to keep enough zones free in stock to fully use the open zone limit
165  * for data placement purposes. Additionally, the m_zonegc_low_space tunable
166  * can be set to make sure a fraction of the unused blocks are available for
167  * writing.
168  */
169 bool
170 xfs_zoned_need_gc(
171 	struct xfs_mount	*mp)
172 {
173 	s64			available, free, threshold;
174 	s32			remainder;
175 
176 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
177 		return false;
178 
179 	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
180 
181 	if (available <
182 	    mp->m_groups[XG_TYPE_RTG].blocks *
183 	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
184 		return true;
185 
186 	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
187 
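	/*
	 * Compute threshold = free * m_zonegc_low_space / 100 without risking
	 * overflow of the intermediate product: scale the quotient and the
	 * remainder of free modulo 100 separately.  The remainder term only
	 * contributes when the tunable is set to 100 percent.
	 */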
188 	threshold = div_s64_rem(free, 100, &remainder);
189 	threshold = threshold * mp->m_zonegc_low_space +
190 		    remainder * div_s64(mp->m_zonegc_low_space, 100);
191 
192 	if (available < threshold)
193 		return true;
194 
195 	return false;
196 }
197 
198 static struct xfs_zone_gc_data *
199 xfs_zone_gc_data_alloc(
200 	struct xfs_mount	*mp)
201 {
202 	struct xfs_zone_gc_data	*data;
203 	int			i;
204 
205 	data = kzalloc(sizeof(*data), GFP_KERNEL);
206 	if (!data)
207 		return NULL;
208 	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
209 			GFP_KERNEL);
210 	if (!data->iter.recs)
211 		goto out_free_data;
212 
213 	/*
214 	 * We actually only need a single bio_vec.  It would be nice to have
215 	 * a flag that only allocates the inline bvecs and not the separate
216 	 * bvec pool.
217 	 */
218 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
219 			BIOSET_NEED_BVECS))
220 		goto out_free_recs;
221 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
222 		data->scratch[i].folio =
223 			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
224 		if (!data->scratch[i].folio)
225 			goto out_free_scratch;
226 	}
227 	INIT_LIST_HEAD(&data->reading);
228 	INIT_LIST_HEAD(&data->writing);
229 	INIT_LIST_HEAD(&data->resetting);
230 	data->mp = mp;
231 	return data;
232 
233 out_free_scratch:
234 	while (--i >= 0)
235 		folio_put(data->scratch[i].folio);
236 	bioset_exit(&data->bio_set);
237 out_free_recs:
238 	kfree(data->iter.recs);
239 out_free_data:
240 	kfree(data);
241 	return NULL;
242 }
243 
244 static void
245 xfs_zone_gc_data_free(
246 	struct xfs_zone_gc_data	*data)
247 {
248 	int			i;
249 
250 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
251 		folio_put(data->scratch[i].folio);
252 	bioset_exit(&data->bio_set);
253 	kfree(data->iter.recs);
254 	kfree(data);
255 }
256 
257 static void
258 xfs_zone_gc_iter_init(
259 	struct xfs_zone_gc_iter	*iter,
260 	struct xfs_rtgroup	*victim_rtg)
261 
262 {
263 	iter->next_startblock = 0;
264 	iter->rec_count = 0;
265 	iter->rec_idx = 0;
266 	iter->victim_rtg = victim_rtg;
267 }
268 
269 /*
270  * Query the rmap of the victim zone to gather the records to evacuate.
271  */
272 static int
273 xfs_zone_gc_query_cb(
274 	struct xfs_btree_cur	*cur,
275 	const struct xfs_rmap_irec *irec,
276 	void			*private)
277 {
278 	struct xfs_zone_gc_iter	*iter = private;
279 
280 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
281 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
282 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
283 
284 	iter->recs[iter->rec_count] = *irec;
285 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
286 		iter->next_startblock =
287 			irec->rm_startblock + irec->rm_blockcount;
288 		return 1;
289 	}
290 	return 0;
291 }
292 
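/*
 * Three-way comparison returning -1, 0 or 1, which avoids the overflow that a
 * plain subtraction of two large 64-bit keys could cause.
 */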
293 #define cmp_int(l, r)		((l > r) - (l < r))
294 
295 static int
296 xfs_zone_gc_rmap_rec_cmp(
297 	const void			*a,
298 	const void			*b)
299 {
300 	const struct xfs_rmap_irec	*reca = a;
301 	const struct xfs_rmap_irec	*recb = b;
302 	int				diff;
303 
304 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
305 	if (diff)
306 		return diff;
307 	return cmp_int(reca->rm_offset, recb->rm_offset);
308 }
309 
310 static int
311 xfs_zone_gc_query(
312 	struct xfs_mount	*mp,
313 	struct xfs_zone_gc_iter	*iter)
314 {
315 	struct xfs_rtgroup	*rtg = iter->victim_rtg;
316 	struct xfs_rmap_irec	ri_low = { };
317 	struct xfs_rmap_irec	ri_high;
318 	struct xfs_btree_cur	*cur;
319 	struct xfs_trans	*tp;
320 	int			error;
321 
322 	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
323 	if (iter->next_startblock == rtg_blocks(rtg))
324 		goto done;
325 
326 	ASSERT(iter->next_startblock < rtg_blocks(rtg));
327 	ri_low.rm_startblock = iter->next_startblock;
328 	memset(&ri_high, 0xFF, sizeof(ri_high));
329 
330 	iter->rec_idx = 0;
331 	iter->rec_count = 0;
332 
333 	error = xfs_trans_alloc_empty(mp, &tp);
334 	if (error)
335 		return error;
336 
337 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
338 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
339 	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
340 			xfs_zone_gc_query_cb, iter);
341 	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
342 	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
343 	xfs_trans_cancel(tp);
344 
345 	if (error < 0)
346 		return error;
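	/*
	 * At this point a return value of zero means the whole zone was
	 * walked, while a positive return means xfs_zone_gc_query_cb() stopped
	 * the query because the record buffer filled up and stashed the resume
	 * point in next_startblock.
	 */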
347 
348 	/*
349 	 * Sort the rmap records by inode number and increasing offset to
350 	 * defragment the mappings.
351 	 *
352 	 * This could be further enhanced by an even bigger look-ahead window,
353 	 * but that's better left until we have better detection of changes to
354 	 * inode mapping to avoid the potential of GCing already dead data.
355 	 */
356 	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
357 			xfs_zone_gc_rmap_rec_cmp, NULL);
358 
359 	if (error == 0) {
360 		/*
361 		 * We finished iterating through the zone.
362 		 */
363 		iter->next_startblock = rtg_blocks(rtg);
364 		if (iter->rec_count == 0)
365 			goto done;
366 	}
367 
368 	return 0;
369 done:
370 	xfs_rtgroup_rele(iter->victim_rtg);
371 	iter->victim_rtg = NULL;
372 	return 0;
373 }
374 
375 static bool
376 xfs_zone_gc_iter_next(
377 	struct xfs_mount	*mp,
378 	struct xfs_zone_gc_iter	*iter,
379 	struct xfs_rmap_irec	*chunk_rec,
380 	struct xfs_inode	**ipp)
381 {
382 	struct xfs_rmap_irec	*irec;
383 	int			error;
384 
385 	if (!iter->victim_rtg)
386 		return false;
387 
388 retry:
389 	if (iter->rec_idx == iter->rec_count) {
390 		error = xfs_zone_gc_query(mp, iter);
391 		if (error)
392 			goto fail;
393 		if (!iter->victim_rtg)
394 			return false;
395 	}
396 
397 	irec = &iter->recs[iter->rec_idx];
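	/*
	 * The owner in the rmap record is an on-disk inode number, so look it
	 * up with XFS_IGET_UNTRUSTED to have it verified, and with
	 * XFS_IGET_DONTCACHE so that GC doesn't flood the inode cache with
	 * inodes that are only touched once.
	 */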
398 	error = xfs_iget(mp, NULL, irec->rm_owner,
399 			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
400 	if (error) {
401 		/*
402 		 * If the inode was already deleted, skip over it.
403 		 */
404 		if (error == -ENOENT) {
405 			iter->rec_idx++;
406 			goto retry;
407 		}
408 		goto fail;
409 	}
410 
411 	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
412 		iter->rec_idx++;
413 		xfs_irele(*ipp);
414 		goto retry;
415 	}
416 
417 	*chunk_rec = *irec;
418 	return true;
419 
420 fail:
421 	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
422 	return false;
423 }
424 
425 static void
426 xfs_zone_gc_iter_advance(
427 	struct xfs_zone_gc_iter	*iter,
428 	xfs_extlen_t		count_fsb)
429 {
430 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
431 
432 	irec->rm_offset += count_fsb;
433 	irec->rm_startblock += count_fsb;
434 	irec->rm_blockcount -= count_fsb;
435 	if (!irec->rm_blockcount)
436 		iter->rec_idx++;
437 }
438 
439 static struct xfs_rtgroup *
440 xfs_zone_gc_pick_victim_from(
441 	struct xfs_mount	*mp,
442 	uint32_t		bucket)
443 {
444 	struct xfs_zone_info	*zi = mp->m_zone_info;
445 	uint32_t		victim_used = U32_MAX;
446 	struct xfs_rtgroup	*victim_rtg = NULL;
447 	uint32_t		bit;
448 
449 	if (!zi->zi_used_bucket_entries[bucket])
450 		return NULL;
451 
452 	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
453 			mp->m_sb.sb_rgcount) {
454 		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
455 
456 		if (!rtg)
457 			continue;
458 
459 		/* skip zones that are just waiting for a reset */
460 		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
461 		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
462 			xfs_rtgroup_rele(rtg);
463 			continue;
464 		}
465 
466 		if (victim_rtg)
467 			xfs_rtgroup_rele(victim_rtg);
468 		victim_rtg = rtg;
469 		victim_used = rtg_rmap(rtg)->i_used_blocks;
470 
471 		/*
472 		 * Any zone that is less than 1 percent used is fair game for
473 		 * instant reclaim.  All of these zones are in bucket 0 (the
474 		 * lowest-used bucket), so avoid the expensive division for the
475 		 * zones in the other buckets.
476 		 */
477 		if (bucket == 0 &&
478 		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
479 			break;
480 	}
481 
482 	return victim_rtg;
483 }
484 
485 /*
486  * Iterate through all zones marked as reclaimable and find a candidate to
487  * reclaim.
488  */
489 static bool
490 xfs_zone_gc_select_victim(
491 	struct xfs_zone_gc_data	*data)
492 {
493 	struct xfs_zone_gc_iter	*iter = &data->iter;
494 	struct xfs_mount	*mp = data->mp;
495 	struct xfs_zone_info	*zi = mp->m_zone_info;
496 	struct xfs_rtgroup	*victim_rtg = NULL;
497 	unsigned int		bucket;
498 
499 	if (xfs_is_shutdown(mp))
500 		return false;
501 
502 	if (iter->victim_rtg)
503 		return true;
504 
505 	/*
506 	 * Don't start new work if we are asked to stop or park.
507 	 */
508 	if (kthread_should_stop() || kthread_should_park())
509 		return false;
510 
511 	if (!xfs_zoned_need_gc(mp))
512 		return false;
513 
514 	spin_lock(&zi->zi_used_buckets_lock);
515 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
516 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
517 		if (victim_rtg)
518 			break;
519 	}
520 	spin_unlock(&zi->zi_used_buckets_lock);
521 
522 	if (!victim_rtg)
523 		return false;
524 
525 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
526 	xfs_zone_gc_iter_init(iter, victim_rtg);
527 	return true;
528 }
529 
530 static struct xfs_open_zone *
531 xfs_zone_gc_steal_open(
532 	struct xfs_zone_info	*zi)
533 {
534 	struct xfs_open_zone	*oz, *found = NULL;
535 
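	/*
	 * Pick the open zone with the lowest write pointer, i.e. the one with
	 * the most space still left to write into, which makes it the largest
	 * possible GC target.
	 */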
536 	spin_lock(&zi->zi_open_zones_lock);
537 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
538 		if (!found ||
539 		    oz->oz_write_pointer < found->oz_write_pointer)
540 			found = oz;
541 	}
542 
543 	if (found) {
544 		found->oz_is_gc = true;
545 		list_del_init(&found->oz_entry);
546 		zi->zi_nr_open_zones--;
547 	}
548 
549 	spin_unlock(&zi->zi_open_zones_lock);
550 	return found;
551 }
552 
553 static struct xfs_open_zone *
554 xfs_zone_gc_select_target(
555 	struct xfs_mount	*mp)
556 {
557 	struct xfs_zone_info	*zi = mp->m_zone_info;
558 	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;
559 
560 	/*
561 	 * We need to wait for pending writes to finish.
562 	 */
563 	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
564 		return NULL;
565 
566 	ASSERT(zi->zi_nr_open_zones <=
567 		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
568 	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
569 	if (oz)
570 		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
571 	spin_lock(&zi->zi_open_zones_lock);
572 	zi->zi_open_gc_zone = oz;
573 	spin_unlock(&zi->zi_open_zones_lock);
574 	return oz;
575 }
576 
577 /*
578  * Ensure we have a valid open zone to write the GC data to.
579  *
580  * If the current target zone has space left, keep writing to it; otherwise
581  * first wait for all pending writes to it and then pick a new one.
582  */
583 static struct xfs_open_zone *
584 xfs_zone_gc_ensure_target(
585 	struct xfs_mount	*mp)
586 {
587 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
588 
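	/*
	 * A target whose write pointer has reached the end of the zone is
	 * fully allocated and cannot accept more GC writes.
	 * xfs_zone_gc_select_target() additionally waits for all outstanding
	 * writes to it to complete before opening a replacement.
	 */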
589 	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
590 		return xfs_zone_gc_select_target(mp);
591 	return oz;
592 }
593 
594 static unsigned int
595 xfs_zone_gc_scratch_available(
596 	struct xfs_zone_gc_data	*data)
597 {
598 	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
599 }
600 
601 static bool
602 xfs_zone_gc_space_available(
603 	struct xfs_zone_gc_data	*data)
604 {
605 	struct xfs_open_zone	*oz;
606 
607 	oz = xfs_zone_gc_ensure_target(data->mp);
608 	if (!oz)
609 		return false;
610 	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
611 		xfs_zone_gc_scratch_available(data);
612 }
613 
614 static void
615 xfs_zone_gc_end_io(
616 	struct bio		*bio)
617 {
618 	struct xfs_gc_bio	*chunk =
619 		container_of(bio, struct xfs_gc_bio, bio);
620 	struct xfs_zone_gc_data	*data = chunk->data;
621 
622 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
623 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
624 }
625 
626 static struct xfs_open_zone *
627 xfs_zone_gc_alloc_blocks(
628 	struct xfs_zone_gc_data	*data,
629 	xfs_extlen_t		*count_fsb,
630 	xfs_daddr_t		*daddr,
631 	bool			*is_seq)
632 {
633 	struct xfs_mount	*mp = data->mp;
634 	struct xfs_open_zone	*oz;
635 
636 	oz = xfs_zone_gc_ensure_target(mp);
637 	if (!oz)
638 		return NULL;
639 
640 	*count_fsb = min(*count_fsb,
641 		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
642 
643 	/*
644 	 * Directly allocate GC blocks from the reserved pool.
645 	 *
646 	 * If we'd take them from the normal pool we could be stealing blocks
647 	 * from a regular writer, which would then have to wait for GC and
648 	 * deadlock.
649 	 */
650 	spin_lock(&mp->m_sb_lock);
651 	*count_fsb = min(*count_fsb,
652 			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
653 	*count_fsb = min3(*count_fsb,
654 			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
655 			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
656 	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
657 	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
658 	spin_unlock(&mp->m_sb_lock);
659 
660 	if (!*count_fsb)
661 		return NULL;
662 
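	/*
	 * For conventional zones the blocks are written at the software write
	 * pointer tracked in the open zone, so the final disk address is known
	 * here.  For sequential write required zones the I/O is issued as a
	 * zone append against the start of the zone, and the actual location
	 * is only reported by the device at I/O completion time.
	 */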
663 	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
664 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
665 	if (!*is_seq)
666 		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
667 	oz->oz_write_pointer += *count_fsb;
668 	atomic_inc(&oz->oz_ref);
669 	return oz;
670 }
671 
672 static bool
673 xfs_zone_gc_start_chunk(
674 	struct xfs_zone_gc_data	*data)
675 {
676 	struct xfs_zone_gc_iter	*iter = &data->iter;
677 	struct xfs_mount	*mp = data->mp;
678 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
679 	struct xfs_open_zone	*oz;
680 	struct xfs_rmap_irec	irec;
681 	struct xfs_gc_bio	*chunk;
682 	struct xfs_inode	*ip;
683 	struct bio		*bio;
684 	xfs_daddr_t		daddr;
685 	bool			is_seq;
686 
687 	if (xfs_is_shutdown(mp))
688 		return false;
689 
690 	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
691 		return false;
692 	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
693 			&is_seq);
694 	if (!oz) {
695 		xfs_irele(ip);
696 		return false;
697 	}
698 
699 	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
700 
701 	chunk = container_of(bio, struct xfs_gc_bio, bio);
702 	chunk->ip = ip;
703 	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
704 	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
705 	chunk->old_startblock =
706 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
707 	chunk->new_daddr = daddr;
708 	chunk->is_seq = is_seq;
709 	chunk->scratch = &data->scratch[data->scratch_idx];
710 	chunk->data = data;
711 	chunk->oz = oz;
712 
713 	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
714 	bio->bi_end_io = xfs_zone_gc_end_io;
715 	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
716 			chunk->scratch->offset);
717 	chunk->scratch->offset += chunk->len;
718 	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
719 		data->scratch_idx =
720 			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
721 	}
722 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
723 	list_add_tail(&chunk->entry, &data->reading);
724 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
725 
726 	submit_bio(bio);
727 	return true;
728 }
729 
730 static void
731 xfs_zone_gc_free_chunk(
732 	struct xfs_gc_bio	*chunk)
733 {
734 	list_del(&chunk->entry);
735 	xfs_open_zone_put(chunk->oz);
736 	xfs_irele(chunk->ip);
737 	bio_put(&chunk->bio);
738 }
739 
740 static void
741 xfs_zone_gc_submit_write(
742 	struct xfs_zone_gc_data	*data,
743 	struct xfs_gc_bio	*chunk)
744 {
745 	if (chunk->is_seq) {
746 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
747 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
748 	}
749 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
750 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
751 	submit_bio(&chunk->bio);
752 }
753 
754 static struct xfs_gc_bio *
755 xfs_zone_gc_split_write(
756 	struct xfs_zone_gc_data	*data,
757 	struct xfs_gc_bio	*chunk)
758 {
759 	struct queue_limits	*lim =
760 		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
761 	struct xfs_gc_bio	*split_chunk;
762 	int			split_sectors;
763 	unsigned int		split_len;
764 	struct bio		*split;
765 	unsigned int		nsegs;
766 
767 	if (!chunk->is_seq)
768 		return NULL;
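	/*
	 * Only zone append bios need splitting here: a zone append must fit
	 * into a single command, while regular writes to conventional zones
	 * can be split further down the stack.
	 */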
769 
770 	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
771 			lim->max_zone_append_sectors << SECTOR_SHIFT);
772 	if (!split_sectors)
773 		return NULL;
774 
775 	/* ensure the split chunk is still block size aligned */
776 	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
777 			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
778 	split_len = split_sectors << SECTOR_SHIFT;
779 
780 	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
781 	split_chunk = container_of(split, struct xfs_gc_bio, bio);
782 	split_chunk->data = data;
783 	ihold(VFS_I(chunk->ip));
784 	split_chunk->ip = chunk->ip;
785 	split_chunk->is_seq = chunk->is_seq;
786 	split_chunk->scratch = chunk->scratch;
787 	split_chunk->offset = chunk->offset;
788 	split_chunk->len = split_len;
789 	split_chunk->old_startblock = chunk->old_startblock;
790 	split_chunk->new_daddr = chunk->new_daddr;
791 	split_chunk->oz = chunk->oz;
792 	atomic_inc(&chunk->oz->oz_ref);
793 
794 	chunk->offset += split_len;
795 	chunk->len -= split_len;
796 	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
797 
798 	/* add right before the original chunk */
799 	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
800 	list_add_tail(&split_chunk->entry, &chunk->entry);
801 	return split_chunk;
802 }
803 
804 static void
805 xfs_zone_gc_write_chunk(
806 	struct xfs_gc_bio	*chunk)
807 {
808 	struct xfs_zone_gc_data	*data = chunk->data;
809 	struct xfs_mount	*mp = chunk->ip->i_mount;
810 	phys_addr_t		bvec_paddr =
811 		bvec_phys(bio_first_bvec_all(&chunk->bio));
812 	struct xfs_gc_bio	*split_chunk;
813 
814 	if (chunk->bio.bi_status)
815 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
816 	if (xfs_is_shutdown(mp)) {
817 		xfs_zone_gc_free_chunk(chunk);
818 		return;
819 	}
820 
821 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
822 	list_move_tail(&chunk->entry, &data->writing);
823 
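	/*
	 * Reuse the bio that carried the read for the write: reset it and
	 * point it back at the region of the scratch folio that the read
	 * completed into, located via the physical address of its bvec.
	 */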
824 	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
825 	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
826 			offset_in_folio(chunk->scratch->folio, bvec_paddr));
827 
828 	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
829 		xfs_zone_gc_submit_write(data, split_chunk);
830 	xfs_zone_gc_submit_write(data, chunk);
831 }
832 
833 static void
834 xfs_zone_gc_finish_chunk(
835 	struct xfs_gc_bio	*chunk)
836 {
837 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
838 	struct xfs_inode	*ip = chunk->ip;
839 	struct xfs_mount	*mp = ip->i_mount;
840 	int			error;
841 
842 	if (chunk->bio.bi_status)
843 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
844 	if (xfs_is_shutdown(mp)) {
845 		xfs_zone_gc_free_chunk(chunk);
846 		return;
847 	}
848 
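	/*
	 * Account the now unused part of the scratch pad.  Once everything
	 * read into the pad has been written back out, it can be refilled
	 * from the start.
	 */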
849 	chunk->scratch->freed += chunk->len;
850 	if (chunk->scratch->freed == chunk->scratch->offset) {
851 		chunk->scratch->offset = 0;
852 		chunk->scratch->freed = 0;
853 	}
854 
855 	/*
856 	 * Cycle through the iolock and wait for direct I/O and layouts to
857 	 * ensure no one is reading from the old mapping before it goes away.
858 	 *
859 	 * Note that xfs_zoned_end_io() below checks that no other writer raced
860 	 * with us to update the mapping by checking that the old startblock
861 	 * didn't change.
862 	 */
863 	xfs_ilock(ip, iolock);
864 	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
865 	if (!error)
866 		inode_dio_wait(VFS_I(ip));
867 	xfs_iunlock(ip, iolock);
868 	if (error)
869 		goto free;
870 
871 	if (chunk->is_seq)
872 		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
873 	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
874 			chunk->new_daddr, chunk->oz, chunk->old_startblock);
875 free:
876 	if (error)
877 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
878 	xfs_zone_gc_free_chunk(chunk);
879 }
880 
881 static void
882 xfs_zone_gc_finish_reset(
883 	struct xfs_gc_bio	*chunk)
884 {
885 	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
886 	struct xfs_mount	*mp = rtg_mount(rtg);
887 	struct xfs_zone_info	*zi = mp->m_zone_info;
888 
889 	if (chunk->bio.bi_status) {
890 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
891 		goto out;
892 	}
893 
894 	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
895 	atomic_inc(&zi->zi_nr_free_zones);
896 
897 	xfs_zoned_add_available(mp, rtg_blocks(rtg));
898 
899 	wake_up_all(&zi->zi_zone_wait);
900 out:
901 	list_del(&chunk->entry);
902 	bio_put(&chunk->bio);
903 }
904 
905 static bool
906 xfs_zone_gc_prepare_reset(
907 	struct bio		*bio,
908 	struct xfs_rtgroup	*rtg)
909 {
910 	trace_xfs_zone_reset(rtg);
911 
912 	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
913 	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
914 	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
915 		if (!bdev_max_discard_sectors(bio->bi_bdev))
916 			return false;
917 		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
918 		bio->bi_iter.bi_size =
919 			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
920 	}
921 
922 	return true;
923 }
924 
925 int
926 xfs_zone_gc_reset_sync(
927 	struct xfs_rtgroup	*rtg)
928 {
929 	int			error = 0;
930 	struct bio		bio;
931 
932 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
933 			REQ_OP_ZONE_RESET);
934 	if (xfs_zone_gc_prepare_reset(&bio, rtg))
935 		error = submit_bio_wait(&bio);
936 	bio_uninit(&bio);
937 
938 	return error;
939 }
940 
941 static void
942 xfs_zone_gc_reset_zones(
943 	struct xfs_zone_gc_data	*data,
944 	struct xfs_group	*reset_list)
945 {
946 	struct xfs_group	*next = reset_list;
947 
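	/*
	 * Flush the RT device cache before resetting any zone so that the
	 * relocated copies of the data are on stable storage before the old
	 * copies are destroyed by the reset.
	 */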
948 	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
949 		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
950 		return;
951 	}
952 
953 	do {
954 		struct xfs_rtgroup	*rtg = to_rtg(next);
955 		struct xfs_gc_bio	*chunk;
956 		struct bio		*bio;
957 
958 		xfs_log_force_inode(rtg_rmap(rtg));
959 
960 		next = rtg_group(rtg)->xg_next_reset;
961 		rtg_group(rtg)->xg_next_reset = NULL;
962 
963 		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
964 				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
965 		bio->bi_private = rtg;
966 		bio->bi_end_io = xfs_zone_gc_end_io;
967 
968 		chunk = container_of(bio, struct xfs_gc_bio, bio);
969 		chunk->data = data;
970 		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
971 		list_add_tail(&chunk->entry, &data->resetting);
972 
973 		/*
974 		 * Also use the bio to drive the state machine when neither
975 		 * zone reset nor discard is supported to keep things simple.
976 		 */
977 		if (xfs_zone_gc_prepare_reset(bio, rtg))
978 			submit_bio(bio);
979 		else
980 			bio_endio(bio);
981 	} while (next);
982 }
983 
984 /*
985  * Handle the work to read and write data for GC and to reset the zones,
986  * including handling all completions.
987  *
988  * Note that the order of the chunks is preserved so that we don't undo the
989  * optimal order established by xfs_zone_gc_query().
990  */
991 static bool
992 xfs_zone_gc_handle_work(
993 	struct xfs_zone_gc_data	*data)
994 {
995 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
996 	struct xfs_gc_bio	*chunk, *next;
997 	struct xfs_group	*reset_list;
998 	struct blk_plug		plug;
999 
1000 	spin_lock(&zi->zi_reset_list_lock);
1001 	reset_list = zi->zi_reset_list;
1002 	zi->zi_reset_list = NULL;
1003 	spin_unlock(&zi->zi_reset_list_lock);
1004 
1005 	if (!xfs_zone_gc_select_victim(data) ||
1006 	    !xfs_zone_gc_space_available(data)) {
1007 		if (list_empty(&data->reading) &&
1008 		    list_empty(&data->writing) &&
1009 		    list_empty(&data->resetting) &&
1010 		    !reset_list)
1011 			return false;
1012 	}
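	/*
	 * Even if no new victim or target space is available, keep going as
	 * long as there is in-flight work so that outstanding reads, writes
	 * and resets are driven to completion.
	 */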
1013 
1014 	__set_current_state(TASK_RUNNING);
1015 	try_to_freeze();
1016 
1017 	if (reset_list)
1018 		xfs_zone_gc_reset_zones(data, reset_list);
1019 
1020 	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1021 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1022 			break;
1023 		xfs_zone_gc_finish_reset(chunk);
1024 	}
1025 
1026 	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1027 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1028 			break;
1029 		xfs_zone_gc_finish_chunk(chunk);
1030 	}
1031 
1032 	blk_start_plug(&plug);
1033 	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1034 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1035 			break;
1036 		xfs_zone_gc_write_chunk(chunk);
1037 	}
1038 	blk_finish_plug(&plug);
1039 
1040 	blk_start_plug(&plug);
1041 	while (xfs_zone_gc_start_chunk(data))
1042 		;
1043 	blk_finish_plug(&plug);
1044 	return true;
1045 }
1046 
1047 /*
1048  * Note that the current GC algorithm would break reflinks and thus duplicate
1049  * data that was shared by multiple owners before.  Because of that, reflinks
1050  * are currently not supported on zoned file systems: they can't be created,
1051  * and file systems with both the reflink and zoned features can't be mounted.
1052  */
1053 static int
1054 xfs_zoned_gcd(
1055 	void			*private)
1056 {
1057 	struct xfs_zone_gc_data	*data = private;
1058 	struct xfs_mount	*mp = data->mp;
1059 	struct xfs_zone_info	*zi = mp->m_zone_info;
1060 	unsigned int		nofs_flag;
1061 
1062 	nofs_flag = memalloc_nofs_save();
1063 	set_freezable();
1064 
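	/*
	 * Mark the task interruptible before looking for work each iteration
	 * so that a wake_up_process() from the bio completion handler cannot
	 * be missed between the checks below and the call to schedule().
	 */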
1065 	for (;;) {
1066 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1067 		xfs_set_zonegc_running(mp);
1068 		if (xfs_zone_gc_handle_work(data))
1069 			continue;
1070 
1071 		if (list_empty(&data->reading) &&
1072 		    list_empty(&data->writing) &&
1073 		    list_empty(&data->resetting) &&
1074 		    !zi->zi_reset_list) {
1075 			xfs_clear_zonegc_running(mp);
1076 			xfs_zoned_resv_wake_all(mp);
1077 
1078 			if (kthread_should_stop()) {
1079 				__set_current_state(TASK_RUNNING);
1080 				break;
1081 			}
1082 
1083 			if (kthread_should_park()) {
1084 				__set_current_state(TASK_RUNNING);
1085 				kthread_parkme();
1086 				continue;
1087 			}
1088 		}
1089 
1090 		schedule();
1091 	}
1092 	xfs_clear_zonegc_running(mp);
1093 
1094 	if (data->iter.victim_rtg)
1095 		xfs_rtgroup_rele(data->iter.victim_rtg);
1096 
1097 	memalloc_nofs_restore(nofs_flag);
1098 	xfs_zone_gc_data_free(data);
1099 	return 0;
1100 }
1101 
1102 void
1103 xfs_zone_gc_start(
1104 	struct xfs_mount	*mp)
1105 {
1106 	if (xfs_has_zoned(mp))
1107 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1108 }
1109 
1110 void
1111 xfs_zone_gc_stop(
1112 	struct xfs_mount	*mp)
1113 {
1114 	if (xfs_has_zoned(mp))
1115 		kthread_park(mp->m_zone_info->zi_gc_thread);
1116 }
1117 
1118 int
1119 xfs_zone_gc_mount(
1120 	struct xfs_mount	*mp)
1121 {
1122 	struct xfs_zone_info	*zi = mp->m_zone_info;
1123 	struct xfs_zone_gc_data	*data;
1124 	struct xfs_open_zone	*oz;
1125 	int			error;
1126 
1127 	/*
1128 	 * If there are no free zones available for GC, pick the open zone with
1129 	 * the least used space to GC into.  This should only happen after an
1130 	 * unclean shutdown near ENOSPC while GC was ongoing.
1131 	 *
1132 	 * We also need to do this for the first gc zone allocation if we
1133 	 * unmounted while at the open limit.
1134 	 */
1135 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1136 	    zi->zi_nr_open_zones == mp->m_max_open_zones)
1137 		oz = xfs_zone_gc_steal_open(zi);
1138 	else
1139 		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1140 	if (!oz) {
1141 		xfs_warn(mp, "unable to allocate a zone for gc");
1142 		error = -EIO;
1143 		goto out;
1144 	}
1145 
1146 	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1147 	zi->zi_open_gc_zone = oz;
1148 
1149 	data = xfs_zone_gc_data_alloc(mp);
1150 	if (!data) {
1151 		error = -ENOMEM;
1152 		goto out_put_gc_zone;
1153 	}
1154 
1155 	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1156 			"xfs-zone-gc/%s", mp->m_super->s_id);
1157 	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
1158 		xfs_warn(mp, "unable to create zone gc thread");
1159 		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
1160 		goto out_free_gc_data;
1161 	}
1162 
1163 	/* xfs_zone_gc_start will unpark for rw mounts */
1164 	kthread_park(mp->m_zone_info->zi_gc_thread);
1165 	return 0;
1166 
1167 out_free_gc_data:
1168 	kfree(data);
1169 out_put_gc_zone:
1170 	xfs_open_zone_put(zi->zi_open_gc_zone);
1171 out:
1172 	return error;
1173 }
1174 
1175 void
1176 xfs_zone_gc_unmount(
1177 	struct xfs_mount	*mp)
1178 {
1179 	struct xfs_zone_info	*zi = mp->m_zone_info;
1180 
1181 	kthread_stop(zi->zi_gc_thread);
1182 	if (zi->zi_open_gc_zone)
1183 		xfs_open_zone_put(zi->zi_open_gc_zone);
1184 }
1185