// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

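/*
 * Decide whether reads targeting @target should currently be considered
 * congested: each member device's congestion score decays with the time
 * since it was last bumped, and we return true with probability
 * proportional to the summed congestion across the target's devices
 * (relative to CONGESTED_MAX per device).
 */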
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

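/*
 * A promote writes a cached copy of data we just read to the promote
 * target, via the data update machinery; in-flight promotes are tracked
 * in c->promote_table, keyed by position, so the same extent isn't
 * promoted twice concurrently. The same mechanism is also reused after
 * read errors to rewrite replicas that failed (see have_io_error()).
 */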
struct promote_op {
	struct rcu_head		rcu;
	u64			start_time;

	struct rhash_head	hash;
	struct bpos		pos;

	struct work_struct	work;
	struct data_update	write;
	struct bio_vec		bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

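/*
 * Returns 0 if this read should be promoted, or a -BCH_ERR_nopromote_*
 * code saying why not (mainly consumed by the io_read_nopromote
 * tracepoint). In the error recovery case (have_io_error()) the usual
 * target/congestion checks are skipped; we only make sure a promote for
 * this extent isn't already in flight.
 */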
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				  struct bpos pos,
				  struct bch_io_opts opts,
				  unsigned flags,
				  struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

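	/*
	 * Normal promote: write one extra cached replica to the promote
	 * target. Error recovery: rewrite the replicas that saw IO failures
	 * (unless the data update we're reading for is already rewriting
	 * them), writing to the foreground target instead.
	 */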
	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					unsigned flags,
					struct bch_read_bio *orig,
					bool *bounce,
					bool *read_full,
					struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

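/*
 * Read completion contexts, in increasing order of "heaviness";
 * bch2_rbio_punt() runs work inline if we're already in a context at
 * least as heavy as the one required, otherwise it punts to the given
 * workqueue.
 */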
enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref[READ]);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

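/*
 * Retry path for data update (move) reads: re-look up the extent being
 * moved and give up if it's been overwritten since the original read,
 * otherwise reissue __bch2_read_extent() against the same key.
 */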
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					struct bch_read_bio *rbio,
					struct bvec_iter bvec_iter,
					struct bch_io_failures *failed,
					unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

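/*
 * Work item for retrying a failed read: devices that produced the
 * original failure are recorded in @failed so the next device pick can
 * avoid them, then the read is reissued with BCH_READ_in_retry set and
 * promotion disabled. A message is logged if the retry succeeded where
 * the original read did not.
 */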
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	int orig_error = rbio->ret;

	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
		   orig_error != -BCH_ERR_data_read_ptr_stale_race &&
		   !failed.nr) {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

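/*
 * Checksum narrowing: if we just read and verified an extent whose
 * checksum covers more data than is currently live (e.g. the extent was
 * partially overwritten or trimmed), compute a checksum for just the
 * live range and update the key, so future reads don't have to read and
 * checksum the extra data.
 */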
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bio *src		= &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re-encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

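/*
 * Endio for the bio we actually submitted to the device: account the
 * completion, punt IO errors and stale pointer races to the retry path,
 * then hand the rest of completion (checksum verification, decrypt,
 * decompress, promote) off to a context appropriate for how much work
 * is left.
 */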
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

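/*
 * Read a single extent (or part of one): handles inline data and holes,
 * picks a replica, decides whether to bounce and/or read the full
 * (possibly compressed) extent, sets up promotion if applicable, and
 * submits the IO. With BCH_READ_in_retry the read is done synchronously
 * and the final status is returned.
 */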
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n  ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n  ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref[READ]);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref[READ]);
			rbio->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
		       pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n  ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

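/*
 * Top level read path for reads not issued by a data update: walk the
 * extents btree over the requested range, resolving indirect (reflink)
 * extents, and issue __bch2_read_extent() for each fragment until the
 * request is complete or a non-retryable error occurs.
 */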
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				       bch2_inum_offset_err_msg_trans(trans, &buf, inum,
								      bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}