// SPDX-License-Identifier: GPL-2.0
/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "errcode.h"
#include "error.h"
#include "lru.h"
#include "move.h"
#include "movinggc.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
#include <linux/wait.h>

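/*
 * Tracking for buckets currently being evacuated: a FIFO list of in-flight
 * buckets, an rhashtable for fast membership tests, and a darray of buckets
 * selected for the next evacuation pass.
 */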
struct buckets_in_flight {
	struct rhashtable	table;
	struct move_bucket	*first;
	struct move_bucket	*last;
	size_t			nr;
	size_t			sectors;

	DARRAY(struct move_bucket *) to_evacuate;
};

static const struct rhashtable_params bch_move_bucket_params = {
	.head_offset		= offsetof(struct move_bucket, hash),
	.key_offset		= offsetof(struct move_bucket, k),
	.key_len		= sizeof(struct move_bucket_key),
	.automatic_shrinking	= true,
};

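/* Append a bucket to the in-flight FIFO and update the list's counters: */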
static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
{
	if (!list->first)
		list->first = b;
	else
		list->last->next = b;

	list->last = b;
	list->nr++;
	list->sectors += b->sectors;
}

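/*
 * Check whether a bucket found in the fragmentation LRU is still worth
 * evacuating: returns 1 if the bucket isn't open, its device is online and
 * read-write, its backpointers aren't flagged as mismatched, and its
 * fragmentation LRU index is nonzero and <= @time; 0 if not, or a negative
 * error code. On success the bucket's current gen and dirty sectors are
 * filled in.
 */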
static int bch2_bucket_is_movable(struct btree_trans *trans,
				  struct move_bucket *b, u64 time)
{
	struct bch_fs *c = trans->c;

	if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
		return 0;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
				       b->k.bucket, BTREE_ITER_cached);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
	if (!ca)
		goto out;

	if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
		goto out;

	if (ca->mi.state != BCH_MEMBER_STATE_rw ||
	    !bch2_dev_is_online(ca))
		goto out;

	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
	b->k.gen	= a->gen;
	b->sectors	= bch2_bucket_sectors_dirty(*a);
	u64 lru_idx	= alloc_lru_idx_fragmentation(*a, ca);

	ret = lru_idx && lru_idx <= time;
out:
	bch2_dev_put(ca);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

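/* Remove a completed bucket from the hash table and free it: */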
static void move_bucket_free(struct buckets_in_flight *list,
			     struct move_bucket *b)
{
	int ret = rhashtable_remove_fast(&list->table, &b->hash,
					 bch_move_bucket_params);
	BUG_ON(ret);
	kfree(b);
}

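/*
 * Reap completed buckets from the head of the in-flight list; if @flush is
 * set, wait for each bucket's pending moves to finish instead of stopping at
 * the first bucket that's still busy.
 */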
static void move_buckets_wait(struct moving_context *ctxt,
			      struct buckets_in_flight *list,
			      bool flush)
{
	struct move_bucket *i;

	while ((i = list->first)) {
		if (flush)
			move_ctxt_wait_event(ctxt, !atomic_read(&i->count));

		if (atomic_read(&i->count))
			break;

		list->first = i->next;
		if (!list->first)
			list->last = NULL;

		list->nr--;
		list->sectors -= i->sectors;

		move_bucket_free(list, i);
	}

	bch2_trans_unlock_long(ctxt->trans);
}

static bool bucket_in_flight(struct buckets_in_flight *list,
			     struct move_bucket_key k)
{
	return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
}

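/*
 * Walk the fragmentation LRU and collect up to nr_to_get movable buckets into
 * @buckets_in_flight->to_evacuate, adding each one to the hash table so it
 * isn't picked again while its moves are still in flight.
 */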
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
			struct buckets_in_flight *buckets_in_flight)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
	size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
	int ret;

	move_buckets_wait(ctxt, buckets_in_flight, false);

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (bch2_err_matches(ret, EROFS))
		return ret;

	if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
		return ret;

	ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
				  lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
				  lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
				  0, k, ({
		struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
		int ret2 = 0;

		saw++;

		ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
		if (ret2 < 0)
			goto err;

		if (!ret2)
			not_movable++;
		else if (bucket_in_flight(buckets_in_flight, b.k))
			in_flight++;
		else {
			struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
			ret2 = b_i ? 0 : -ENOMEM;
			if (ret2)
				goto err;

			*b_i = b;

			ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
			if (ret2) {
				kfree(b_i);
				goto err;
			}

			ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash,
							     bch_move_bucket_params);
			BUG_ON(ret2);

			sectors += b.sectors;
		}

		ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
err:
		ret2;
	}));

	pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
		 buckets_in_flight->nr, buckets_in_flight->sectors,
		 saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);

	return ret < 0 ? ret : 0;
}

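/*
 * Run one copygc pass: pick a batch of fragmented buckets, mark them in
 * flight, and evacuate them, with allocations done at the copygc watermark.
 */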
noinline
static int bch2_copygc(struct moving_context *ctxt,
		       struct buckets_in_flight *buckets_in_flight,
		       bool *did_work)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct data_update_opts data_opts = {
		.btree_insert_flags = BCH_WATERMARK_copygc,
	};
	u64 sectors_seen	= atomic64_read(&ctxt->stats->sectors_seen);
	u64 sectors_moved	= atomic64_read(&ctxt->stats->sectors_moved);
	int ret = 0;

	ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
	if (ret)
		goto err;

	darray_for_each(buckets_in_flight->to_evacuate, i) {
		if (kthread_should_stop() || freezing(current))
			break;

		struct move_bucket *b = *i;
		*i = NULL;

		move_bucket_in_flight_add(buckets_in_flight, b);

		ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
		if (ret)
			goto err;

		*did_work = true;
	}
err:
	/* no entries in LRU btree found, or got to end: */
	if (bch2_err_matches(ret, ENOENT))
		ret = 0;

	if (ret < 0 && !bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "from bch2_move_data()");

	sectors_seen	= atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
	sectors_moved	= atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
	trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);

	darray_for_each(buckets_in_flight->to_evacuate, i)
		if (*i)
			move_bucket_free(buckets_in_flight, *i);
	darray_exit(&buckets_in_flight->to_evacuate);
	return ret;
}

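/*
 * Per-device copygc threshold: fragmented data is allowed to grow to half the
 * space still available at the stripe watermark; returns the remaining
 * headroom in sectors (0 if we're already over the threshold).
 */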
static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
{
	struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
	struct bch_dev_usage usage;

	for (unsigned i = 0; i < BCH_DATA_NR; i++)
		usage.buckets[i] = usage_full.d[i].buckets;

	s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
				   ca->mi.bucket_size) >> 1);
	s64 fragmented = 0;

	for (unsigned i = 0; i < BCH_DATA_NR; i++)
		if (data_type_movable(i))
			fragmented += usage_full.d[i].fragmented;

	return max(0LL, fragmented_allowed - fragmented);
}

/*
 * Copygc runs when the amount of fragmented data is above some arbitrary
 * threshold:
 *
 * The threshold at the limit - when the device is full - is the amount of space
 * we reserved in bch2_recalc_capacity; we can't have more than that amount of
 * disk space stranded due to fragmentation and store everything we have
 * promised to store.
 *
 * But we don't want to be running copygc unnecessarily when the device still
 * has plenty of free space - rather, we want copygc to smoothly run every so
 * often and continually reduce the amount of fragmented space as the device
 * fills up. So, we increase the threshold by half the current free space.
 */
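/*
 * Example (illustrative numbers): with 1024 buckets still available at the
 * stripe watermark and a 512 sector bucket size, fragmented_allowed comes to
 * (1024 * 512) / 2 = 262144 sectors; if 100000 sectors are currently
 * fragmented, copygc waits until roughly another 162144 sectors have been
 * written.
 */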
u64 bch2_copygc_wait_amount(struct bch_fs *c)
{
	u64 wait = U64_MAX;

	rcu_read_lock();
	for_each_rw_member_rcu(c, ca)
		wait = min(wait, bch2_copygc_dev_wait_amount(ca));
	rcu_read_unlock();

	return wait;
}

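/*
 * Print the current copygc wait state to @out: whether copygc is running, the
 * global and per-device wait amounts, how long we've been waiting, and a
 * backtrace of the copygc thread if one exists.
 */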
void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);
	prt_printf(out, "running:\t%u\n",		c->copygc_running);
	prt_printf(out, "copygc_wait:\t%llu\n",		c->copygc_wait);
	prt_printf(out, "copygc_wait_at:\t%llu\n",	c->copygc_wait_at);

	prt_printf(out, "Currently waiting for:\t");
	prt_human_readable_u64(out, max(0LL, c->copygc_wait -
					atomic64_read(&c->io_clock[WRITE].now)) << 9);
	prt_newline(out);

	prt_printf(out, "Currently waiting since:\t");
	prt_human_readable_u64(out, max(0LL,
					atomic64_read(&c->io_clock[WRITE].now) -
					c->copygc_wait_at) << 9);
	prt_newline(out);

	bch2_printbuf_make_room(out, 4096);

	rcu_read_lock();
	out->atomic++;

	prt_printf(out, "Currently calculated wait:\n");
	for_each_rw_member_rcu(c, ca) {
		prt_printf(out, "  %s:\t", ca->name);
		prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
		prt_newline(out);
	}

	struct task_struct *t = rcu_dereference(c->copygc_thread);
	if (t)
		get_task_struct(t);
	--out->atomic;
	rcu_read_unlock();

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}
}

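/*
 * Main copygc thread: once recovery has progressed far enough for data moves,
 * repeatedly compute the wait amount; if it's above the IO clock's max slop,
 * sleep on the write clock until enough has been written, otherwise run a
 * copygc pass. When there's nothing to do, back off based on the smallest
 * member device's capacity.
 */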
static int bch2_copygc_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct moving_context ctxt;
	struct bch_move_stats move_stats;
	struct io_clock *clock = &c->io_clock[WRITE];
	struct buckets_in_flight buckets = {};
	u64 last, wait;

	int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
	bch_err_msg(c, ret, "allocating copygc buckets in flight");
	if (ret)
		return ret;

	set_freezable();

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 */
	kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
			       kthread_should_stop());

	bch2_move_stats_init(&move_stats, "copygc");
	bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
			      writepoint_ptr(&c->copygc_write_point),
			      false);

	while (!ret && !kthread_should_stop()) {
		bool did_work = false;

		bch2_trans_unlock_long(ctxt.trans);
		cond_resched();

		if (!c->opts.copygc_enabled) {
			move_buckets_wait(&ctxt, &buckets, true);
			kthread_wait_freezable(c->opts.copygc_enabled ||
					       kthread_should_stop());
		}

		if (unlikely(freezing(current))) {
			move_buckets_wait(&ctxt, &buckets, true);
			__refrigerator(false);
			continue;
		}

		last = atomic64_read(&clock->now);
		wait = bch2_copygc_wait_amount(c);

		if (wait > clock->max_slop) {
			c->copygc_wait_at = last;
			c->copygc_wait = last + wait;
			move_buckets_wait(&ctxt, &buckets, true);
			trace_and_count(c, copygc_wait, c, wait, last + wait);
			bch2_kthread_io_clock_wait(clock, last + wait,
					MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		c->copygc_wait = 0;

		c->copygc_running = true;
		ret = bch2_copygc(&ctxt, &buckets, &did_work);
		c->copygc_running = false;

		wake_up(&c->copygc_running_wq);

		if (!wait && !did_work) {
			u64 min_member_capacity = bch2_min_rw_member_capacity(c);

			if (min_member_capacity == U64_MAX)
				min_member_capacity = 128 * 2048;

			move_buckets_wait(&ctxt, &buckets, true);
			bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
					MAX_SCHEDULE_TIMEOUT);
		}
	}

	move_buckets_wait(&ctxt, &buckets, true);
	rhashtable_destroy(&buckets.table);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_move_stats_exit(&move_stats, c);

	return 0;
}

void bch2_copygc_stop(struct bch_fs *c)
{
	if (c->copygc_thread) {
		kthread_stop(c->copygc_thread);
		put_task_struct(c->copygc_thread);
	}
	c->copygc_thread = NULL;
}

int bch2_copygc_start(struct bch_fs *c)
{
	struct task_struct *t;
	int ret;

	if (c->copygc_thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	if (bch2_fs_init_fault("copygc_start"))
		return -ENOMEM;

	t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
	ret = PTR_ERR_OR_ZERO(t);
	bch_err_msg(c, ret, "creating copygc thread");
	if (ret)
		return ret;

	get_task_struct(t);

	c->copygc_thread = t;
	wake_up_process(c->copygc_thread);

	return 0;
}

void bch2_fs_copygc_init(struct bch_fs *c)
{
	init_waitqueue_head(&c->copygc_running_wq);
	c->copygc_running = false;
}