// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>

/* bch_extent_rebalance: */
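/*
 * Extents that want background work done on them carry a
 * bch_extent_rebalance entry recording the io path options (compression,
 * background target, ...) they should eventually be rewritten with; the
 * helpers below inspect and maintain those entries.
 */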

static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
	const union bch_extent_entry *entry;

	bkey_extent_entry_for_each(ptrs, entry)
		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
			return &entry->rebalance;

	return NULL;
}

static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
	return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}

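/*
 * Returns a mask of the pointers that need rewriting because they aren't
 * compressed with the requested background compression type; unwritten or
 * incompressible extents never need compression work.
 */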
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
					   struct bch_io_opts *opts,
					   struct bkey_s_c k,
					   struct bkey_ptrs_c ptrs)
{
	if (!opts->background_compression)
		return 0;

	unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
		    p.ptr.unwritten)
			return 0;

		if (!p.ptr.cached && p.crc.compression_type != compression_type)
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}

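/*
 * Returns a mask of the non-cached pointers that lie outside the configured
 * background target and therefore need to be moved.
 */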
static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
				       struct bch_io_opts *opts,
				       struct bkey_ptrs_c ptrs)
{
	if (!opts->background_target ||
	    !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
		return 0;

	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr) {
		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}
	rcu_read_unlock();

	return rewrite_ptrs;
}

static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
					      struct bch_io_opts *opts,
					      struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
		bch2_bkey_ptrs_need_move(c, opts, ptrs);
}

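/*
 * How many sectors of this extent still need compressing and/or moving,
 * judged against the rebalance options stored in the extent itself; 0 if
 * the extent carries no rebalance entry or is poisoned.
 */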
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
	if (!opts)
		return 0;

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

	if (opts->background_compression) {
		unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				sectors = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				sectors += p.crc.compressed_size;
		}
	}
incompressible:
	if (opts->background_target) {
		rcu_read_lock();
		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (!p.ptr.cached &&
			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
				sectors += p.crc.compressed_size;
		rcu_read_unlock();
	}

	return sectors;
}

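/*
 * Does @k need its bch_extent_rebalance entry added, updated or removed to
 * match @opts? Indirect extents (reflink_v) always keep an entry, as io
 * options can't be recovered from an owning inode.
 */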
static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
					     struct bkey_s_c k)
{
	if (!bkey_extent_is_direct_data(k.k))
		return 0;

	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
		struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
		return old == NULL || memcmp(old, &new, sizeof(new));
	} else {
		return old != NULL;
	}
}

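/*
 * Add, update or drop the bch_extent_rebalance entry on @_k so it matches
 * @opts. When adding, the entry is appended to the value, so the caller
 * must have allocated the key with room for one extra u64.
 */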
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
				  struct bkey_i *_k)
{
	if (!bkey_extent_is_direct_data(&_k->k))
		return 0;

	struct bkey_s k = bkey_i_to_s(_k);
	struct bch_extent_rebalance *old =
		(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
		if (!old) {
			old = bkey_val_end(k);
			k.k->u64s += sizeof(*old) / sizeof(u64);
		}

		*old = io_opts_to_rebalance_opts(c, opts);
	} else {
		if (old)
			extent_entry_drop(k, (union bch_extent_entry *) old);
	}

	return 0;
}

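/*
 * Apply the io options stored in an indirect extent's rebalance entry to
 * @io_opts, then, if the entry no longer matches, rewrite the key with an
 * updated entry and commit; returns transaction_restart_nested so the
 * caller restarts with the new key.
 */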
int bch2_get_update_rebalance_opts(struct btree_trans *trans,
				   struct bch_io_opts *io_opts,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	BUG_ON(iter->flags & BTREE_ITER_is_extents);
	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);

	const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
		? bch2_bkey_rebalance_opts(k) : NULL;
	if (r) {
#define x(_name)							\
		if (r->_name##_from_inode) {				\
			io_opts->_name = r->_name;			\
			io_opts->_name##_from_inode = true;		\
		}
		BCH_REBALANCE_OPTS()
#undef x
	}

	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
		return 0;

	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	bkey_reassemble(n, k);

	/* On successful transaction commit, @k was invalidated: */

	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0) ?:
		-BCH_ERR_transaction_restart_nested;
}

#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)

static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
	BCH_REBALANCE_STATES()
	NULL
#undef x
};

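/*
 * A request to rescan an inode (or, with inum 0, the whole filesystem) is
 * recorded as a cookie at offset U64_MAX - 1 in the rebalance_work btree;
 * the cookie value is bumped on every request so we can tell whether a new
 * scan was requested while one was already running.
 */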
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_cookie *cookie;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
	ret = PTR_ERR_OR_ZERO(cookie);
	if (ret)
		goto err;

	bkey_cookie_init(&cookie->k_i);
	cookie->k.p = iter.pos;
	cookie->v.cookie = cpu_to_le64(v + 1);

	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL,
				       BCH_TRANS_COMMIT_no_enospc,
			    bch2_set_rebalance_needs_scan_trans(trans, inum));
	bch2_rebalance_wakeup(c);
	return ret;
}

int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
	return bch2_set_rebalance_needs_scan(c, 0);
}

static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	if (v == cookie)
		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
					    struct btree_iter *work_iter)
{
	return !kthread_should_stop()
		? bch2_btree_iter_peek(trans, work_iter)
		: bkey_s_c_null;
}

static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
		return 0;

	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	extent_entry_drop(bkey_i_to_s(n),
			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

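/*
 * Look up the extent (or, for inum 0 entries, the indirect extent) that a
 * rebalance_work entry points at and work out what needs doing; if nothing
 * needs rewriting any more, clear the stale rebalance entry and return a
 * null key.
 */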
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
			struct bpos work_pos,
			struct btree_iter *extent_iter,
			struct bch_io_opts *io_opts,
			struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;

	bch2_trans_iter_exit(trans, extent_iter);
	bch2_trans_iter_init(trans, extent_iter,
			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
			     work_pos,
			     BTREE_ITER_all_snapshots);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
	if (bkey_err(k))
		return k;

	int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
	if (ret)
		return bkey_s_c_err(ret);

	memset(data_opts, 0, sizeof(*data_opts));
	data_opts->rewrite_ptrs		= bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
	data_opts->target		= io_opts->background_target;
	data_opts->write_flags		|= BCH_WRITE_only_specified_devs;

	if (!data_opts->rewrite_ptrs) {
		/*
		 * device we would want to write to offline? devices in target
		 * changed?
		 *
		 * We'll now need a full scan before this extent is picked up
		 * again:
		 */
		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
		if (ret)
			return bkey_s_c_err(ret);
		return bkey_s_c_null;
	}

	if (trace_rebalance_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

		unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
		if (p) {
			prt_str(&buf, "compression=");
			bch2_compression_opt_to_text(&buf, io_opts->background_compression);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
		if (p) {
			prt_str(&buf, "move=");
			bch2_target_to_text(&buf, c, io_opts->background_target);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		trace_rebalance_extent(c, buf.buf);
		printbuf_exit(&buf);
	}

	return k;
}

noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
			       struct bpos work_pos,
			       struct btree_iter *extent_iter)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	struct data_update_opts data_opts;
	struct bch_io_opts io_opts;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret;

	ctxt->stats = &r->work_stats;
	r->state = BCH_REBALANCE_working;

	bch2_bkey_buf_init(&sk);

	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
				extent_iter, &io_opts, &data_opts));
	if (ret || !k.k)
		goto out;

	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

	/*
	 * The iterator gets unlocked by __bch2_read_extent - need to
	 * save a copy of @k elsewhere:
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
	if (ret) {
		if (bch2_err_matches(ret, ENOMEM)) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			ret = -BCH_ERR_transaction_restart_nested;
		}

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto out;

		/* skip it and continue, XXX signal failure */
		ret = 0;
	}
out:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

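/*
 * Process a scan cookie: walk every extent of the inode (or the whole
 * extents btree, for inum 0) so that bch2_move_get_io_opts() can refresh
 * stale rebalance entries, then delete the cookie unless another scan was
 * requested in the meantime.
 */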
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;

	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
	ctxt->stats = &r->scan_stats;

	if (!inum) {
		r->scan_start	= BBPOS_MIN;
		r->scan_end	= BBPOS_MAX;
	} else {
		r->scan_start	= BBPOS(BTREE_ID_extents, POS(inum, 0));
		r->scan_end	= BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
	}

	r->state = BCH_REBALANCE_scanning;

	struct per_snapshot_io_opts snapshot_io_opts;
	per_snapshot_io_opts_init(&snapshot_io_opts, c);

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				      r->scan_start.pos, r->scan_end.pos,
				      BTREE_ITER_all_snapshots|
				      BTREE_ITER_not_extents|
				      BTREE_ITER_prefetch, k, ({
		ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
					&snapshot_io_opts, iter.pos, &iter, k);
		PTR_ERR_OR_ZERO(io_opts);
	})) ?:
	commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

	per_snapshot_io_opts_exit(&snapshot_io_opts);
	bch2_move_stats_exit(&r->scan_stats, trans->c);

	/*
	 * Ensure that the rebalance_work entries we created are seen by the
	 * next iteration of do_rebalance(), so we don't end up stuck in
	 * rebalance_wait():
	 */
	atomic64_inc(&r->scan_stats.sectors_seen);
	bch2_btree_write_buffer_flush_sync(trans);

	return ret;
}

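/*
 * No work left: sleep on the write io clock so we wake up after roughly
 * 1/64th of the smallest rw member device's capacity has been written,
 * rather than after a fixed wall-clock interval.
 */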
static void rebalance_wait(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	u64 now = atomic64_read(&clock->now);
	u64 min_member_capacity = bch2_min_rw_member_capacity(c);

	if (min_member_capacity == U64_MAX)
		min_member_capacity = 128 * 2048;

	r->wait_iotime_end		= now + (min_member_capacity >> 6);

	if (r->state != BCH_REBALANCE_waiting) {
		r->wait_iotime_start	= now;
		r->wait_wallclock_start	= ktime_get_real_ns();
		r->state		= BCH_REBALANCE_waiting;
	}

	bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}

static bool bch2_rebalance_enabled(struct bch_fs *c)
{
	return c->opts.rebalance_enabled &&
		!(c->opts.rebalance_on_ac_only &&
		  c->rebalance.on_battery);
}

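/*
 * Main rebalance loop: walk the rebalance_work btree, handling scan cookies
 * and per-extent entries, until there's no work left or we're asked to
 * stop; when idle, wait on the io clock via rebalance_wait().
 */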
static int do_rebalance(struct moving_context *ctxt)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct btree_iter rebalance_work_iter, extent_iter = {};
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_begin(trans);

	bch2_move_stats_init(&r->work_stats, "rebalance_work");
	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

	bch2_trans_iter_init(trans, &rebalance_work_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_all_snapshots);

	while (!bch2_move_ratelimit(ctxt)) {
		if (!bch2_rebalance_enabled(c)) {
			bch2_moving_ctxt_flush_all(ctxt);
			kthread_wait_freezable(bch2_rebalance_enabled(c) ||
					       kthread_should_stop());
		}

		if (kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret || !k.k)
			break;

		ret = k.k->type == KEY_TYPE_cookie
			? do_rebalance_scan(ctxt, k.k->p.inode,
					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_btree_iter_advance(trans, &rebalance_work_iter);
	}

	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_work_iter);
	bch2_move_stats_exit(&r->scan_stats, c);

	if (!ret &&
	    !kthread_should_stop() &&
	    !atomic64_read(&r->work_stats.sectors_seen) &&
	    !atomic64_read(&r->scan_stats.sectors_seen)) {
		bch2_moving_ctxt_flush_all(ctxt);
		bch2_trans_unlock_long(trans);
		rebalance_wait(c);
	}

	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

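/* Rebalance thread: repeatedly runs do_rebalance() until kthread_stop(). */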
static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct moving_context ctxt;

	set_freezable();

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 */
	kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
			       kthread_should_stop());

	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
			      writepoint_ptr(&c->rebalance_write_point),
			      true);

	while (!kthread_should_stop() && !do_rebalance(&ctxt))
		;

	bch2_moving_ctxt_exit(&ctxt);

	return 0;
}

void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);

	struct bch_fs_rebalance *r = &c->rebalance;

	/* print pending work */
	struct disk_accounting_pos acc;
	disk_accounting_key_init(acc, rebalance_work);
	u64 v;
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);

	prt_printf(out, "pending work:\t");
	prt_human_readable_u64(out, v << 9);
	prt_printf(out, "\n\n");

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	switch (r->state) {
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_printf(out, "io wait duration:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
		prt_newline(out);

		prt_printf(out, "io wait remaining:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
		prt_newline(out);

		prt_printf(out, "duration waited:\t");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;
	}
	case BCH_REBALANCE_working:
		bch2_move_stats_to_text(out, &r->work_stats);
		break;
	case BCH_REBALANCE_scanning:
		bch2_move_stats_to_text(out, &r->scan_stats);
		break;
	}
	prt_newline(out);

	rcu_read_lock();
	struct task_struct *t = rcu_dereference(c->rebalance.thread);
	if (t)
		get_task_struct(t);
	rcu_read_unlock();

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}

	printbuf_indent_sub(out, 2);
}

void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with bch2_rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;
	int ret;

	if (c->rebalance.thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating rebalance thread");
	if (ret)
		return ret;

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}

#ifdef CONFIG_POWER_SUPPLY
#include <linux/power_supply.h>

static int bch2_rebalance_power_notifier(struct notifier_block *nb,
					 unsigned long event, void *data)
{
	struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);

	c->rebalance.on_battery = !power_supply_is_system_supplied();
	bch2_rebalance_wakeup(c);
	return NOTIFY_OK;
}
#endif

void bch2_fs_rebalance_exit(struct bch_fs *c)
{
#ifdef CONFIG_POWER_SUPPLY
	power_supply_unreg_notifier(&c->rebalance.power_notifier);
#endif
}

int bch2_fs_rebalance_init(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;

	bch2_pd_controller_init(&r->pd);

#ifdef CONFIG_POWER_SUPPLY
	r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
	int ret = power_supply_reg_notifier(&r->power_notifier);
	if (ret)
		return ret;

	r->on_battery = !power_supply_is_system_supplied();
#endif
	return 0;
}

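/*
 * fsck for the rebalance_work btree: walk the reflink and extents btrees in
 * lockstep with rebalance_work, checking that an entry is set exactly for
 * those extents bch2_bkey_sectors_need_rebalance() says still need work.
 */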
static int check_rebalance_work_one(struct btree_trans *trans,
				    struct btree_iter *extent_iter,
				    struct btree_iter *rebalance_iter,
				    struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c extent_k, rebalance_k;
	struct printbuf buf = PRINTBUF;

	int ret = bkey_err(extent_k	= bch2_btree_iter_peek(trans, extent_iter)) ?:
		  bkey_err(rebalance_k	= bch2_btree_iter_peek(trans, rebalance_iter));
	if (ret)
		return ret;

	if (!extent_k.k &&
	    extent_iter->btree_id == BTREE_ID_reflink &&
	    (!rebalance_k.k ||
	     rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
		bch2_trans_iter_exit(trans, extent_iter);
		bch2_trans_iter_init(trans, extent_iter,
				     BTREE_ID_extents, POS_MIN,
				     BTREE_ITER_prefetch|
				     BTREE_ITER_all_snapshots);
		return -BCH_ERR_transaction_restart_nested;
	}

	if (!extent_k.k && !rebalance_k.k)
		return 1;

	int cmp = bpos_cmp(extent_k.k	 ? extent_k.k->p    : SPOS_MAX,
			   rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);

	struct bkey deleted;
	bkey_init(&deleted);

	if (cmp < 0) {
		deleted.p = extent_k.k->p;
		rebalance_k.k = &deleted;
	} else if (cmp > 0) {
		deleted.p = rebalance_k.k->p;
		extent_k.k = &deleted;
	}

	bool should_have_rebalance =
		bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
	bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;

	if (should_have_rebalance != have_rebalance) {
		ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
		if (ret)
			return ret;

		bch2_bkey_val_to_text(&buf, c, extent_k);
	}

	if (fsck_err_on(!should_have_rebalance && have_rebalance,
			trans, rebalance_work_incorrectly_set,
			"rebalance work incorrectly set\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, false);
		if (ret)
			goto err;
	}

	if (fsck_err_on(should_have_rebalance && !have_rebalance,
			trans, rebalance_work_incorrectly_unset,
			"rebalance work incorrectly unset\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, true);
		if (ret)
			goto err;
	}

	if (cmp <= 0)
		bch2_btree_iter_advance(trans, extent_iter);
	if (cmp >= 0)
		bch2_btree_iter_advance(trans, rebalance_iter);
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_rebalance_work(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter rebalance_iter, extent_iter;
	int ret = 0;

	bch2_trans_iter_init(trans, &extent_iter,
			     BTREE_ID_reflink, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &rebalance_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_prefetch);

	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	while (!ret) {
		bch2_trans_begin(trans);

		ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_iter);
	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}