1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #include <linux/sched.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
15 #include <linux/mm.h>
16 #include "messages.h"
17 #include "ctree.h"
18 #include "disk-io.h"
19 #include "volumes.h"
20 #include "raid56.h"
21 #include "async-thread.h"
22 #include "file-item.h"
23 #include "btrfs_inode.h"
24 
25 /* set when additional merges to this rbio are not allowed */
26 #define RBIO_RMW_LOCKED_BIT	1
27 
28 /*
29  * set when this rbio is sitting in the hash, but it is just a cache
30  * of past RMW
31  */
32 #define RBIO_CACHE_BIT		2
33 
34 /*
35  * set when it is safe to trust the stripe_pages for caching
36  */
37 #define RBIO_CACHE_READY_BIT	3
38 
39 #define RBIO_CACHE_SIZE 1024
40 
41 #define BTRFS_STRIPE_HASH_TABLE_BITS				11
42 
43 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
44 {
45 	if (unlikely(!bioc)) {
46 		btrfs_crit(fs_info, "bioc=NULL");
47 		return;
48 	}
49 	btrfs_crit(fs_info,
50 "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
51 		bioc->logical, bioc->full_stripe_logical, bioc->size,
52 		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
53 		bioc->replace_stripe_src, bioc->num_stripes);
54 	for (int i = 0; i < bioc->num_stripes; i++) {
55 		btrfs_crit(fs_info, "    nr=%d devid=%llu physical=%llu",
56 			   i, bioc->stripes[i].dev->devid,
57 			   bioc->stripes[i].physical);
58 	}
59 }
60 
61 static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
62 			    const struct btrfs_raid_bio *rbio)
63 {
64 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
65 		return;
66 
67 	dump_bioc(fs_info, rbio->bioc);
68 	btrfs_crit(fs_info,
69 "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
70 		rbio->flags, rbio->nr_sectors, rbio->nr_data,
71 		rbio->real_stripes, rbio->stripe_nsectors,
72 		rbio->scrubp, rbio->dbitmap);
73 }
74 
75 #define ASSERT_RBIO(expr, rbio)						\
76 ({									\
77 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
78 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
79 					(rbio)->bioc->fs_info : NULL;	\
80 									\
81 		btrfs_dump_rbio(__fs_info, (rbio));			\
82 	}								\
83 	ASSERT((expr));							\
84 })
85 
86 #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
87 ({									\
88 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
89 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
90 					(rbio)->bioc->fs_info : NULL;	\
91 									\
92 		btrfs_dump_rbio(__fs_info, (rbio));			\
93 		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
94 	}								\
95 	ASSERT((expr));							\
96 })
97 
98 #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
99 ({									\
100 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
101 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
102 					(rbio)->bioc->fs_info : NULL;	\
103 									\
104 		btrfs_dump_rbio(__fs_info, (rbio));			\
105 		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
106 	}								\
107 	ASSERT((expr));							\
108 })
109 
110 #define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
111 ({									\
112 	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
113 		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
114 					(rbio)->bioc->fs_info : NULL;	\
115 									\
116 		btrfs_dump_rbio(__fs_info, (rbio));			\
117 		btrfs_crit(__fs_info, "logical=%llu", (logical));		\
118 	}								\
119 	ASSERT((expr));							\
120 })
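
/*
 * Usage note: when CONFIG_BTRFS_ASSERT is disabled these macros compile down
 * to a plain ASSERT() (a no-op).  With assertions enabled, a failing check
 * such as ASSERT_RBIO(rbio->nr_data > 0, rbio) first dumps the whole bioc and
 * rbio state via btrfs_dump_rbio(), so the crash log is self-describing.
 */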
121 
122 /* Used by the raid56 code to lock stripes for read/modify/write */
123 struct btrfs_stripe_hash {
124 	struct list_head hash_list;
125 	spinlock_t lock;
126 };
127 
/* The stripe hash table itself: the hash buckets plus an LRU of cached rbios */
129 struct btrfs_stripe_hash_table {
130 	struct list_head stripe_cache;
131 	spinlock_t cache_lock;
132 	int cache_size;
133 	struct btrfs_stripe_hash table[];
134 };
135 
/*
 * A structure to represent a sector inside a page; the length is fixed to
 * sectorsize.
 */
140 struct sector_ptr {
	/*
	 * Blocks from the bio list can still be highmem, so we use a physical
	 * address to represent a page and the offset inside it.
	 */
145 	phys_addr_t paddr;
146 	bool has_paddr;
147 	bool uptodate;
148 };
149 
150 static void rmw_rbio_work(struct work_struct *work);
151 static void rmw_rbio_work_locked(struct work_struct *work);
152 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
153 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
154 
155 static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
156 static void scrub_rbio_work_locked(struct work_struct *work);
157 
158 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
159 {
160 	bitmap_free(rbio->error_bitmap);
161 	kfree(rbio->stripe_pages);
162 	kfree(rbio->bio_sectors);
163 	kfree(rbio->stripe_sectors);
164 	kfree(rbio->finish_pointers);
165 }
166 
167 static void free_raid_bio(struct btrfs_raid_bio *rbio)
168 {
169 	int i;
170 
171 	if (!refcount_dec_and_test(&rbio->refs))
172 		return;
173 
174 	WARN_ON(!list_empty(&rbio->stripe_cache));
175 	WARN_ON(!list_empty(&rbio->hash_list));
176 	WARN_ON(!bio_list_empty(&rbio->bio_list));
177 
178 	for (i = 0; i < rbio->nr_pages; i++) {
179 		if (rbio->stripe_pages[i]) {
180 			__free_page(rbio->stripe_pages[i]);
181 			rbio->stripe_pages[i] = NULL;
182 		}
183 	}
184 
185 	btrfs_put_bioc(rbio->bioc);
186 	free_raid_bio_pointers(rbio);
187 	kfree(rbio);
188 }
189 
190 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
191 {
192 	INIT_WORK(&rbio->work, work_func);
193 	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
194 }
195 
196 /*
197  * the stripe hash table is used for locking, and to collect
198  * bios in hopes of making a full stripe
199  */
200 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
201 {
202 	struct btrfs_stripe_hash_table *table;
203 	struct btrfs_stripe_hash_table *x;
204 	struct btrfs_stripe_hash *cur;
205 	struct btrfs_stripe_hash *h;
206 	unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
207 
208 	if (info->stripe_hash_table)
209 		return 0;
210 
211 	/*
212 	 * The table is large, starting with order 4 and can go as high as
213 	 * order 7 in case lock debugging is turned on.
214 	 *
	 * Try harder to allocate and fall back to vmalloc to lower the chance
216 	 * of a failing mount.
217 	 */
218 	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
219 	if (!table)
220 		return -ENOMEM;
221 
222 	spin_lock_init(&table->cache_lock);
223 	INIT_LIST_HEAD(&table->stripe_cache);
224 
225 	h = table->table;
226 
227 	for (unsigned int i = 0; i < num_entries; i++) {
228 		cur = h + i;
229 		INIT_LIST_HEAD(&cur->hash_list);
230 		spin_lock_init(&cur->lock);
231 	}
232 
233 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
234 	kvfree(x);
235 	return 0;
236 }
237 
238 static void memcpy_sectors(const struct sector_ptr *dst,
239 			   const struct sector_ptr *src, u32 blocksize)
240 {
241 	memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr),
242 		    phys_to_page(src->paddr), offset_in_page(src->paddr),
243 		    blocksize);
244 }
245 
/*
 * Caching an rbio means copying anything from the bio_sectors array into the
 * stripe_pages array.  We use the sector uptodate bit in the stripe cache
 * array to indicate if it has valid data.
 *
 * Once the caching is done, we set the cache ready bit.
 */
255 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
256 {
257 	int i;
258 	int ret;
259 
260 	ret = alloc_rbio_pages(rbio);
261 	if (ret)
262 		return;
263 
264 	for (i = 0; i < rbio->nr_sectors; i++) {
265 		/* Some range not covered by bio (partial write), skip it */
266 		if (!rbio->bio_sectors[i].has_paddr) {
267 			/*
268 			 * Even if the sector is not covered by bio, if it is
269 			 * a data sector it should still be uptodate as it is
270 			 * read from disk.
271 			 */
272 			if (i < rbio->nr_data * rbio->stripe_nsectors)
273 				ASSERT(rbio->stripe_sectors[i].uptodate);
274 			continue;
275 		}
276 
277 		memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i],
278 				rbio->bioc->fs_info->sectorsize);
279 		rbio->stripe_sectors[i].uptodate = 1;
280 	}
281 	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
282 }
283 
/*
 * We hash on the first logical address of the full stripe.
 */
287 static int rbio_bucket(struct btrfs_raid_bio *rbio)
288 {
289 	u64 num = rbio->bioc->full_stripe_logical;
290 
291 	/*
292 	 * we shift down quite a bit.  We're using byte
293 	 * addressing, and most of the lower bits are zeros.
294 	 * This tends to upset hash_64, and it consistently
295 	 * returns just one or two different values.
296 	 *
297 	 * shifting off the lower bits fixes things.
298 	 */
299 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
300 }
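
/*
 * For example, two partial writes that land anywhere in the same
 * 64K * nr_data full stripe share the same bioc->full_stripe_logical and
 * therefore hash to the same bucket, which is what lets lock_stripe_add()
 * find and merge them.
 */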
301 
302 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
303 				       unsigned int page_nr)
304 {
305 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
306 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
307 	int i;
308 
309 	ASSERT(page_nr < rbio->nr_pages);
310 
311 	for (i = sectors_per_page * page_nr;
312 	     i < sectors_per_page * page_nr + sectors_per_page;
313 	     i++) {
314 		if (!rbio->stripe_sectors[i].uptodate)
315 			return false;
316 	}
317 	return true;
318 }
319 
/*
 * Update the stripe_sectors[] array to use the correct page and offset.
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
325 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
326 {
327 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
328 	u32 offset;
329 	int i;
330 
331 	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
332 		int page_index = offset >> PAGE_SHIFT;
333 
334 		ASSERT(page_index < rbio->nr_pages);
335 		if (!rbio->stripe_pages[page_index])
336 			continue;
337 
338 		rbio->stripe_sectors[i].has_paddr = true;
339 		rbio->stripe_sectors[i].paddr =
340 			page_to_phys(rbio->stripe_pages[page_index]) +
341 			offset_in_page(offset);
342 	}
343 }
344 
345 static void steal_rbio_page(struct btrfs_raid_bio *src,
346 			    struct btrfs_raid_bio *dest, int page_nr)
347 {
348 	const u32 sectorsize = src->bioc->fs_info->sectorsize;
349 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
350 	int i;
351 
352 	if (dest->stripe_pages[page_nr])
353 		__free_page(dest->stripe_pages[page_nr]);
354 	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
355 	src->stripe_pages[page_nr] = NULL;
356 
357 	/* Also update the sector->uptodate bits. */
358 	for (i = sectors_per_page * page_nr;
359 	     i < sectors_per_page * page_nr + sectors_per_page; i++)
360 		dest->stripe_sectors[i].uptodate = true;
361 }
362 
363 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
364 {
365 	const int sector_nr = (page_nr << PAGE_SHIFT) >>
366 			      rbio->bioc->fs_info->sectorsize_bits;
367 
368 	/*
369 	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
370 	 * we won't have a page which is half data half parity.
371 	 *
372 	 * Thus if the first sector of the page belongs to data stripes, then
373 	 * the full page belongs to data stripes.
374 	 */
375 	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
376 }
377 
378 /*
379  * Stealing an rbio means taking all the uptodate pages from the stripe array
380  * in the source rbio and putting them into the destination rbio.
381  *
382  * This will also update the involved stripe_sectors[] which are referring to
383  * the old pages.
384  */
385 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
386 {
387 	int i;
388 
389 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
390 		return;
391 
392 	for (i = 0; i < dest->nr_pages; i++) {
393 		struct page *p = src->stripe_pages[i];
394 
395 		/*
396 		 * We don't need to steal P/Q pages as they will always be
397 		 * regenerated for RMW or full write anyway.
398 		 */
399 		if (!is_data_stripe_page(src, i))
400 			continue;
401 
402 		/*
403 		 * If @src already has RBIO_CACHE_READY_BIT, it should have
404 		 * all data stripe pages present and uptodate.
405 		 */
406 		ASSERT(p);
407 		ASSERT(full_page_sectors_uptodate(src, i));
408 		steal_rbio_page(src, dest, i);
409 	}
410 	index_stripe_sectors(dest);
411 	index_stripe_sectors(src);
412 }
413 
/*
 * Merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * Must be called with dest->bio_list_lock held.
 */
421 static void merge_rbio(struct btrfs_raid_bio *dest,
422 		       struct btrfs_raid_bio *victim)
423 {
424 	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
425 	dest->bio_list_bytes += victim->bio_list_bytes;
426 	/* Also inherit the bitmaps from @victim. */
427 	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
428 		  dest->stripe_nsectors);
429 }
430 
431 /*
432  * used to prune items that are in the cache.  The caller
433  * must hold the hash table lock.
434  */
435 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
436 {
437 	int bucket = rbio_bucket(rbio);
438 	struct btrfs_stripe_hash_table *table;
439 	struct btrfs_stripe_hash *h;
440 	int freeit = 0;
441 
442 	/*
443 	 * check the bit again under the hash table lock.
444 	 */
445 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
446 		return;
447 
448 	table = rbio->bioc->fs_info->stripe_hash_table;
449 	h = table->table + bucket;
450 
451 	/* hold the lock for the bucket because we may be
452 	 * removing it from the hash table
453 	 */
454 	spin_lock(&h->lock);
455 
456 	/*
457 	 * hold the lock for the bio list because we need
458 	 * to make sure the bio list is empty
459 	 */
460 	spin_lock(&rbio->bio_list_lock);
461 
462 	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
463 		list_del_init(&rbio->stripe_cache);
464 		table->cache_size -= 1;
465 		freeit = 1;
466 
467 		/* if the bio list isn't empty, this rbio is
468 		 * still involved in an IO.  We take it out
469 		 * of the cache list, and drop the ref that
470 		 * was held for the list.
471 		 *
472 		 * If the bio_list was empty, we also remove
473 		 * the rbio from the hash_table, and drop
474 		 * the corresponding ref
475 		 */
476 		if (bio_list_empty(&rbio->bio_list)) {
477 			if (!list_empty(&rbio->hash_list)) {
478 				list_del_init(&rbio->hash_list);
479 				refcount_dec(&rbio->refs);
480 				BUG_ON(!list_empty(&rbio->plug_list));
481 			}
482 		}
483 	}
484 
485 	spin_unlock(&rbio->bio_list_lock);
486 	spin_unlock(&h->lock);
487 
488 	if (freeit)
489 		free_raid_bio(rbio);
490 }
491 
492 /*
493  * prune a given rbio from the cache
494  */
495 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
496 {
497 	struct btrfs_stripe_hash_table *table;
498 
499 	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
500 		return;
501 
502 	table = rbio->bioc->fs_info->stripe_hash_table;
503 
504 	spin_lock(&table->cache_lock);
505 	__remove_rbio_from_cache(rbio);
506 	spin_unlock(&table->cache_lock);
507 }
508 
509 /*
510  * remove everything in the cache
511  */
512 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
513 {
514 	struct btrfs_stripe_hash_table *table;
515 	struct btrfs_raid_bio *rbio;
516 
517 	table = info->stripe_hash_table;
518 
519 	spin_lock(&table->cache_lock);
520 	while (!list_empty(&table->stripe_cache)) {
521 		rbio = list_first_entry(&table->stripe_cache,
522 					struct btrfs_raid_bio, stripe_cache);
523 		__remove_rbio_from_cache(rbio);
524 	}
525 	spin_unlock(&table->cache_lock);
526 }
527 
528 /*
529  * remove all cached entries and free the hash table
530  * used by unmount
531  */
532 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
533 {
534 	if (!info->stripe_hash_table)
535 		return;
536 	btrfs_clear_rbio_cache(info);
537 	kvfree(info->stripe_hash_table);
538 	info->stripe_hash_table = NULL;
539 }
540 
541 /*
542  * insert an rbio into the stripe cache.  It
543  * must have already been prepared by calling
544  * cache_rbio_pages
545  *
546  * If this rbio was already cached, it gets
547  * moved to the front of the lru.
548  *
549  * If the size of the rbio cache is too big, we
550  * prune an item.
551  */
552 static void cache_rbio(struct btrfs_raid_bio *rbio)
553 {
554 	struct btrfs_stripe_hash_table *table;
555 
556 	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
557 		return;
558 
559 	table = rbio->bioc->fs_info->stripe_hash_table;
560 
561 	spin_lock(&table->cache_lock);
562 	spin_lock(&rbio->bio_list_lock);
563 
564 	/* bump our ref if we were not in the list before */
565 	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
566 		refcount_inc(&rbio->refs);
567 
	if (!list_empty(&rbio->stripe_cache)) {
569 		list_move(&rbio->stripe_cache, &table->stripe_cache);
570 	} else {
571 		list_add(&rbio->stripe_cache, &table->stripe_cache);
572 		table->cache_size += 1;
573 	}
574 
575 	spin_unlock(&rbio->bio_list_lock);
576 
577 	if (table->cache_size > RBIO_CACHE_SIZE) {
578 		struct btrfs_raid_bio *found;
579 
580 		found = list_last_entry(&table->stripe_cache,
581 					struct btrfs_raid_bio,
582 					stripe_cache);
583 
584 		if (found != rbio)
585 			__remove_rbio_from_cache(found);
586 	}
587 
588 	spin_unlock(&table->cache_lock);
589 }
590 
591 /*
592  * helper function to run the xor_blocks api.  It is only
593  * able to do MAX_XOR_BLOCKS at a time, so we need to
594  * loop through.
595  */
596 static void run_xor(void **pages, int src_cnt, ssize_t len)
597 {
598 	int src_off = 0;
599 	int xor_src_cnt = 0;
600 	void *dest = pages[src_cnt];
601 
	while (src_cnt > 0) {
603 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
604 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
605 
606 		src_cnt -= xor_src_cnt;
607 		src_off += xor_src_cnt;
608 	}
609 }
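
/*
 * Note for callers: the destination buffer is expected at pages[src_cnt],
 * right after the sources, and xor_blocks() XORs the sources into whatever is
 * already there, so callers pre-seed it (see the memcpy() in
 * generate_pq_vertical() for the RAID5 case).
 */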
610 
611 /*
612  * Returns true if the bio list inside this rbio covers an entire stripe (no
613  * rmw required).
614  */
615 static int rbio_is_full(struct btrfs_raid_bio *rbio)
616 {
617 	unsigned long size = rbio->bio_list_bytes;
618 	int ret = 1;
619 
620 	spin_lock(&rbio->bio_list_lock);
621 	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
622 		ret = 0;
623 	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
624 	spin_unlock(&rbio->bio_list_lock);
625 
626 	return ret;
627 }
628 
629 /*
630  * returns 1 if it is safe to merge two rbios together.
631  * The merging is safe if the two rbios correspond to
632  * the same stripe and if they are both going in the same
633  * direction (read vs write), and if neither one is
634  * locked for final IO
635  *
636  * The caller is responsible for locking such that
637  * rmw_locked is safe to test
638  */
639 static int rbio_can_merge(struct btrfs_raid_bio *last,
640 			  struct btrfs_raid_bio *cur)
641 {
642 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
643 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
644 		return 0;
645 
646 	/*
647 	 * we can't merge with cached rbios, since the
648 	 * idea is that when we merge the destination
649 	 * rbio is going to run our IO for us.  We can
650 	 * steal from cached rbios though, other functions
651 	 * handle that.
652 	 */
653 	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
654 	    test_bit(RBIO_CACHE_BIT, &cur->flags))
655 		return 0;
656 
657 	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
658 		return 0;
659 
660 	/* we can't merge with different operations */
661 	if (last->operation != cur->operation)
662 		return 0;
	/*
	 * A parity scrub has to read the full stripe from the drives,
	 * check and repair the parity, and write the new results back.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
671 	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
672 		return 0;
673 
674 	if (last->operation == BTRFS_RBIO_READ_REBUILD)
675 		return 0;
676 
677 	return 1;
678 }
679 
680 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
681 					     unsigned int stripe_nr,
682 					     unsigned int sector_nr)
683 {
684 	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
685 	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
686 
687 	return stripe_nr * rbio->stripe_nsectors + sector_nr;
688 }
689 
690 /* Return a sector from rbio->stripe_sectors, not from the bio list */
691 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
692 					     unsigned int stripe_nr,
693 					     unsigned int sector_nr)
694 {
695 	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
696 							      sector_nr)];
697 }
698 
699 /* Grab a sector inside P stripe */
700 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
701 					      unsigned int sector_nr)
702 {
703 	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
704 }
705 
706 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
707 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
708 					      unsigned int sector_nr)
709 {
710 	if (rbio->nr_data + 1 == rbio->real_stripes)
711 		return NULL;
712 	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
713 }
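
/*
 * Layout reminder: stripe_sectors[] is stripe-major, i.e.
 * index = stripe_nr * stripe_nsectors + sector_nr.  With a 4K sector size the
 * data stripes occupy indices [0, nr_data * 16), immediately followed by the
 * P stripe and, for RAID6, the Q stripe.
 */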
714 
715 /*
716  * The first stripe in the table for a logical address
717  * has the lock.  rbios are added in one of three ways:
718  *
719  * 1) Nobody has the stripe locked yet.  The rbio is given
720  * the lock and 0 is returned.  The caller must start the IO
721  * themselves.
722  *
723  * 2) Someone has the stripe locked, but we're able to merge
724  * with the lock owner.  The rbio is freed and the IO will
725  * start automatically along with the existing rbio.  1 is returned.
726  *
727  * 3) Someone has the stripe locked, but we're not able to merge.
728  * The rbio is added to the lock owner's plug list, or merged into
729  * an rbio already on the plug list.  When the lock owner unlocks,
730  * the next rbio on the list is run and the IO is started automatically.
731  * 1 is returned
732  *
733  * If we return 0, the caller still owns the rbio and must continue with
734  * IO submission.  If we return 1, the caller must assume the rbio has
735  * already been freed.
736  */
737 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
738 {
739 	struct btrfs_stripe_hash *h;
740 	struct btrfs_raid_bio *cur;
741 	struct btrfs_raid_bio *pending;
742 	struct btrfs_raid_bio *freeit = NULL;
743 	struct btrfs_raid_bio *cache_drop = NULL;
744 	int ret = 0;
745 
746 	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
747 
748 	spin_lock(&h->lock);
749 	list_for_each_entry(cur, &h->hash_list, hash_list) {
750 		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
751 			continue;
752 
753 		spin_lock(&cur->bio_list_lock);
754 
755 		/* Can we steal this cached rbio's pages? */
756 		if (bio_list_empty(&cur->bio_list) &&
757 		    list_empty(&cur->plug_list) &&
758 		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
759 		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
760 			list_del_init(&cur->hash_list);
761 			refcount_dec(&cur->refs);
762 
763 			steal_rbio(cur, rbio);
764 			cache_drop = cur;
765 			spin_unlock(&cur->bio_list_lock);
766 
767 			goto lockit;
768 		}
769 
770 		/* Can we merge into the lock owner? */
771 		if (rbio_can_merge(cur, rbio)) {
772 			merge_rbio(cur, rbio);
773 			spin_unlock(&cur->bio_list_lock);
774 			freeit = rbio;
775 			ret = 1;
776 			goto out;
777 		}
778 
780 		/*
781 		 * We couldn't merge with the running rbio, see if we can merge
782 		 * with the pending ones.  We don't have to check for rmw_locked
783 		 * because there is no way they are inside finish_rmw right now
784 		 */
785 		list_for_each_entry(pending, &cur->plug_list, plug_list) {
786 			if (rbio_can_merge(pending, rbio)) {
787 				merge_rbio(pending, rbio);
788 				spin_unlock(&cur->bio_list_lock);
789 				freeit = rbio;
790 				ret = 1;
791 				goto out;
792 			}
793 		}
794 
		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
799 		list_add_tail(&rbio->plug_list, &cur->plug_list);
800 		spin_unlock(&cur->bio_list_lock);
801 		ret = 1;
802 		goto out;
803 	}
804 lockit:
805 	refcount_inc(&rbio->refs);
806 	list_add(&rbio->hash_list, &h->hash_list);
807 out:
808 	spin_unlock(&h->lock);
809 	if (cache_drop)
810 		remove_rbio_from_cache(cache_drop);
811 	if (freeit)
812 		free_raid_bio(freeit);
813 	return ret;
814 }
815 
816 static void recover_rbio_work_locked(struct work_struct *work);
817 
/*
 * Called as rmw or parity rebuild completes.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started.
 */
822 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
823 {
824 	int bucket;
825 	struct btrfs_stripe_hash *h;
826 	int keep_cache = 0;
827 
828 	bucket = rbio_bucket(rbio);
829 	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
830 
831 	if (list_empty(&rbio->plug_list))
832 		cache_rbio(rbio);
833 
834 	spin_lock(&h->lock);
835 	spin_lock(&rbio->bio_list_lock);
836 
837 	if (!list_empty(&rbio->hash_list)) {
838 		/*
839 		 * if we're still cached and there is no other IO
840 		 * to perform, just leave this rbio here for others
841 		 * to steal from later
842 		 */
843 		if (list_empty(&rbio->plug_list) &&
844 		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
845 			keep_cache = 1;
846 			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
847 			BUG_ON(!bio_list_empty(&rbio->bio_list));
848 			goto done;
849 		}
850 
851 		list_del_init(&rbio->hash_list);
852 		refcount_dec(&rbio->refs);
853 
854 		/*
855 		 * we use the plug list to hold all the rbios
856 		 * waiting for the chance to lock this stripe.
857 		 * hand the lock over to one of them.
858 		 */
859 		if (!list_empty(&rbio->plug_list)) {
860 			struct btrfs_raid_bio *next;
861 			struct list_head *head = rbio->plug_list.next;
862 
863 			next = list_entry(head, struct btrfs_raid_bio,
864 					  plug_list);
865 
866 			list_del_init(&rbio->plug_list);
867 
868 			list_add(&next->hash_list, &h->hash_list);
869 			refcount_inc(&next->refs);
870 			spin_unlock(&rbio->bio_list_lock);
871 			spin_unlock(&h->lock);
872 
873 			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
874 				start_async_work(next, recover_rbio_work_locked);
875 			} else if (next->operation == BTRFS_RBIO_WRITE) {
876 				steal_rbio(rbio, next);
877 				start_async_work(next, rmw_rbio_work_locked);
878 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
879 				steal_rbio(rbio, next);
880 				start_async_work(next, scrub_rbio_work_locked);
881 			}
882 
883 			goto done_nolock;
884 		}
885 	}
886 done:
887 	spin_unlock(&rbio->bio_list_lock);
888 	spin_unlock(&h->lock);
889 
890 done_nolock:
891 	if (!keep_cache)
892 		remove_rbio_from_cache(rbio);
893 }
894 
895 static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
896 {
897 	struct bio *next;
898 
899 	while (cur) {
900 		next = cur->bi_next;
901 		cur->bi_next = NULL;
902 		cur->bi_status = status;
903 		bio_endio(cur);
904 		cur = next;
905 	}
906 }
907 
908 /*
909  * this frees the rbio and runs through all the bios in the
910  * bio_list and calls end_io on them
911  */
912 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
913 {
914 	struct bio *cur = bio_list_get(&rbio->bio_list);
915 	struct bio *extra;
916 
917 	kfree(rbio->csum_buf);
918 	bitmap_free(rbio->csum_bitmap);
919 	rbio->csum_buf = NULL;
920 	rbio->csum_bitmap = NULL;
921 
	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
927 	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
928 
929 	/*
930 	 * At this moment, rbio->bio_list is empty, however since rbio does not
931 	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
932 	 * hash list, rbio may be merged with others so that rbio->bio_list
933 	 * becomes non-empty.
934 	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
935 	 * more and we can call bio_endio() on all queued bios.
936 	 */
937 	unlock_stripe(rbio);
938 	extra = bio_list_get(&rbio->bio_list);
939 	free_raid_bio(rbio);
940 
941 	rbio_endio_bio_list(cur, status);
942 	if (extra)
943 		rbio_endio_bio_list(extra, status);
944 }
945 
946 /*
947  * Get a sector pointer specified by its @stripe_nr and @sector_nr.
948  *
949  * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
951  * @sector_nr:		Sector number inside the stripe,
952  *			valid range [0, stripe_nsectors)
953  * @bio_list_only:      Whether to use sectors inside the bio list only.
954  *
955  * The read/modify/write code wants to reuse the original bio page as much
956  * as possible, and only use stripe_sectors as fallback.
957  */
958 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
959 					 int stripe_nr, int sector_nr,
960 					 bool bio_list_only)
961 {
962 	struct sector_ptr *sector;
963 	int index;
964 
965 	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
966 			   rbio, stripe_nr);
967 	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
968 			   rbio, sector_nr);
969 
970 	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
971 	ASSERT(index >= 0 && index < rbio->nr_sectors);
972 
973 	spin_lock(&rbio->bio_list_lock);
974 	sector = &rbio->bio_sectors[index];
975 	if (sector->has_paddr || bio_list_only) {
976 		/* Don't return sector without a valid page pointer */
977 		if (!sector->has_paddr)
978 			sector = NULL;
979 		spin_unlock(&rbio->bio_list_lock);
980 		return sector;
981 	}
982 	spin_unlock(&rbio->bio_list_lock);
983 
984 	return &rbio->stripe_sectors[index];
985 }
986 
/*
 * Allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
991 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
992 					 struct btrfs_io_context *bioc)
993 {
994 	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
995 	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
996 	const unsigned int num_pages = stripe_npages * real_stripes;
997 	const unsigned int stripe_nsectors =
998 		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
999 	const unsigned int num_sectors = stripe_nsectors * real_stripes;
1000 	struct btrfs_raid_bio *rbio;
1001 
1002 	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1003 	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
1004 	/*
1005 	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
1006 	 * (at most 16) should be no larger than BITS_PER_LONG.
1007 	 */
1008 	ASSERT(stripe_nsectors <= BITS_PER_LONG);
1009 
1010 	/*
1011 	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
1012 	 * (limited by u8).
1013 	 */
1014 	ASSERT(real_stripes >= 2);
1015 	ASSERT(real_stripes <= U8_MAX);
1016 
1017 	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
1018 	if (!rbio)
1019 		return ERR_PTR(-ENOMEM);
1020 	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
1021 				     GFP_NOFS);
1022 	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1023 				    GFP_NOFS);
1024 	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
1025 				       GFP_NOFS);
1026 	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
1027 	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
1028 
1029 	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
1030 	    !rbio->finish_pointers || !rbio->error_bitmap) {
1031 		free_raid_bio_pointers(rbio);
1032 		kfree(rbio);
1033 		return ERR_PTR(-ENOMEM);
1034 	}
1035 
1036 	bio_list_init(&rbio->bio_list);
1037 	init_waitqueue_head(&rbio->io_wait);
1038 	INIT_LIST_HEAD(&rbio->plug_list);
1039 	spin_lock_init(&rbio->bio_list_lock);
1040 	INIT_LIST_HEAD(&rbio->stripe_cache);
1041 	INIT_LIST_HEAD(&rbio->hash_list);
1042 	btrfs_get_bioc(bioc);
1043 	rbio->bioc = bioc;
1044 	rbio->nr_pages = num_pages;
1045 	rbio->nr_sectors = num_sectors;
1046 	rbio->real_stripes = real_stripes;
1047 	rbio->stripe_npages = stripe_npages;
1048 	rbio->stripe_nsectors = stripe_nsectors;
1049 	refcount_set(&rbio->refs, 1);
1050 	atomic_set(&rbio->stripes_pending, 0);
1051 
1052 	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
1053 	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
1054 	ASSERT(rbio->nr_data > 0);
1055 
1056 	return rbio;
1057 }
1058 
1059 /* allocate pages for all the stripes in the bio, including parity */
1060 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1061 {
1062 	int ret;
1063 
1064 	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
1065 	if (ret < 0)
1066 		return ret;
1067 	/* Mapping all sectors */
1068 	index_stripe_sectors(rbio);
1069 	return 0;
1070 }
1071 
1072 /* only allocate pages for p/q stripes */
1073 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1074 {
1075 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1076 	int ret;
1077 
1078 	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1079 				     rbio->stripe_pages + data_pages, false);
1080 	if (ret < 0)
1081 		return ret;
1082 
1083 	index_stripe_sectors(rbio);
1084 	return 0;
1085 }
1086 
/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the stripe numbers of the first
 * and second errors.
 */
1093 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1094 				     int *faila, int *failb)
1095 {
1096 	int stripe_nr;
1097 	int found_errors = 0;
1098 
1099 	if (faila || failb) {
1100 		/*
1101 		 * Both @faila and @failb should be valid pointers if any of
1102 		 * them is specified.
1103 		 */
1104 		ASSERT(faila && failb);
1105 		*faila = -1;
1106 		*failb = -1;
1107 	}
1108 
1109 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1110 		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1111 
1112 		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1113 			found_errors++;
1114 			if (faila) {
1115 				/* Update faila and failb. */
1116 				if (*faila < 0)
1117 					*faila = stripe_nr;
1118 				else if (*failb < 0)
1119 					*failb = stripe_nr;
1120 			}
1121 		}
1122 	}
1123 	return found_errors;
1124 }
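
/*
 * The result is compared against rbio->bioc->max_errors by the callers: one
 * missing/corrupted stripe per vertical stripe is tolerable for RAID5, two
 * for RAID6.  Anything above that makes the vertical stripe unrecoverable.
 */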
1125 
1126 /*
1127  * Add a single sector @sector into our list of bios for IO.
1128  *
1129  * Return 0 if everything went well.
1130  * Return <0 for error.
1131  */
1132 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1133 			      struct bio_list *bio_list,
1134 			      struct sector_ptr *sector,
1135 			      unsigned int stripe_nr,
1136 			      unsigned int sector_nr,
1137 			      enum req_op op)
1138 {
1139 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1140 	struct bio *last = bio_list->tail;
1141 	int ret;
1142 	struct bio *bio;
1143 	struct btrfs_io_stripe *stripe;
1144 	u64 disk_start;
1145 
	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
1151 	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
1152 			   rbio, stripe_nr);
1153 	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
1154 			   rbio, sector_nr);
1155 	ASSERT(sector->has_paddr);
1156 
1157 	stripe = &rbio->bioc->stripes[stripe_nr];
1158 	disk_start = stripe->physical + sector_nr * sectorsize;
1159 
1160 	/* if the device is missing, just fail this stripe */
1161 	if (!stripe->dev->bdev) {
1162 		int found_errors;
1163 
1164 		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1165 			rbio->error_bitmap);
1166 
1167 		/* Check if we have reached tolerance early. */
1168 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1169 							 NULL, NULL);
1170 		if (found_errors > rbio->bioc->max_errors)
1171 			return -EIO;
1172 		return 0;
1173 	}
1174 
1175 	/* see if we can add this page onto our existing bio */
1176 	if (last) {
1177 		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1178 		last_end += last->bi_iter.bi_size;
1179 
1180 		/*
1181 		 * we can't merge these if they are from different
1182 		 * devices or if they are not contiguous
1183 		 */
1184 		if (last_end == disk_start && !last->bi_status &&
1185 		    last->bi_bdev == stripe->dev->bdev) {
1186 			ret = bio_add_page(last, phys_to_page(sector->paddr),
1187 					   sectorsize, offset_in_page(sector->paddr));
1188 			if (ret == sectorsize)
1189 				return 0;
1190 		}
1191 	}
1192 
1193 	/* put a new bio on the list */
1194 	bio = bio_alloc(stripe->dev->bdev,
1195 			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1196 			op, GFP_NOFS);
1197 	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1198 	bio->bi_private = rbio;
1199 
1200 	__bio_add_page(bio, phys_to_page(sector->paddr), sectorsize,
1201 		       offset_in_page(sector->paddr));
1202 	bio_list_add(bio_list, bio);
1203 	return 0;
1204 }
1205 
1206 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1207 {
1208 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1209 	const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
1210 	struct bvec_iter iter = bio->bi_iter;
1211 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1212 		     rbio->bioc->full_stripe_logical;
1213 
1214 	while (iter.bi_size) {
1215 		unsigned int index = (offset >> sectorsize_bits);
1216 		struct sector_ptr *sector = &rbio->bio_sectors[index];
1217 		struct bio_vec bv = bio_iter_iovec(bio, iter);
1218 
1219 		sector->has_paddr = true;
1220 		sector->paddr = bvec_phys(&bv);
1221 		bio_advance_iter_single(bio, &iter, sectorsize);
1222 		offset += sectorsize;
1223 	}
1224 }
1225 
/*
 * Helper function to walk our bio list and populate the bio_sectors array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we set up the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from sector_in_rbio().
 */
1234 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1235 {
1236 	struct bio *bio;
1237 
1238 	spin_lock(&rbio->bio_list_lock);
1239 	bio_list_for_each(bio, &rbio->bio_list)
1240 		index_one_bio(rbio, bio);
1241 
1242 	spin_unlock(&rbio->bio_list_lock);
1243 }
1244 
1245 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1246 			       struct raid56_bio_trace_info *trace_info)
1247 {
1248 	const struct btrfs_io_context *bioc = rbio->bioc;
1249 	int i;
1250 
1251 	ASSERT(bioc);
1252 
1253 	/* We rely on bio->bi_bdev to find the stripe number. */
1254 	if (!bio->bi_bdev)
1255 		goto not_found;
1256 
1257 	for (i = 0; i < bioc->num_stripes; i++) {
1258 		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1259 			continue;
1260 		trace_info->stripe_nr = i;
1261 		trace_info->devid = bioc->stripes[i].dev->devid;
1262 		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1263 				     bioc->stripes[i].physical;
1264 		return;
1265 	}
1266 
1267 not_found:
1268 	trace_info->devid = -1;
1269 	trace_info->offset = -1;
1270 	trace_info->stripe_nr = -1;
1271 }
1272 
1273 static inline void bio_list_put(struct bio_list *bio_list)
1274 {
1275 	struct bio *bio;
1276 
1277 	while ((bio = bio_list_pop(bio_list)))
1278 		bio_put(bio);
1279 }
1280 
1281 static void assert_rbio(struct btrfs_raid_bio *rbio)
1282 {
1283 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
1284 		return;
1285 
1286 	/*
1287 	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1288 	 * we won't go beyond 256 disks anyway.
1289 	 */
1290 	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
1291 	ASSERT_RBIO(rbio->nr_data > 0, rbio);
1292 
	/*
	 * This is another check to make sure the number of data stripes is
	 * smaller than the total number of stripes.
	 */
1297 	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
1298 }
1299 
1300 static inline void *kmap_local_sector(const struct sector_ptr *sector)
1301 {
1302 	/* The sector pointer must have a page mapped to it. */
1303 	ASSERT(sector->has_paddr);
1304 
1305 	return kmap_local_page(phys_to_page(sector->paddr)) +
1306 	       offset_in_page(sector->paddr);
1307 }
1308 
1309 /* Generate PQ for one vertical stripe. */
1310 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1311 {
1312 	void **pointers = rbio->finish_pointers;
1313 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1314 	struct sector_ptr *sector;
1315 	int stripe;
1316 	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1317 
1318 	/* First collect one sector from each data stripe */
1319 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1320 		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1321 		pointers[stripe] = kmap_local_sector(sector);
1322 	}
1323 
1324 	/* Then add the parity stripe */
1325 	sector = rbio_pstripe_sector(rbio, sectornr);
1326 	sector->uptodate = 1;
1327 	pointers[stripe++] = kmap_local_sector(sector);
1328 
1329 	if (has_qstripe) {
1330 		/*
1331 		 * RAID6, add the qstripe and call the library function
1332 		 * to fill in our p/q
1333 		 */
1334 		sector = rbio_qstripe_sector(rbio, sectornr);
1335 		sector->uptodate = 1;
1336 		pointers[stripe++] = kmap_local_sector(sector);
1337 
1338 		assert_rbio(rbio);
1339 		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1340 					pointers);
1341 	} else {
1342 		/* raid5 */
1343 		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1344 		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1345 	}
1346 	for (stripe = stripe - 1; stripe >= 0; stripe--)
1347 		kunmap_local(pointers[stripe]);
1348 }
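
/*
 * In other words: for RAID5 the parity sector is simply the XOR of all data
 * sectors in the vertical stripe, while for RAID6 raid6_call.gen_syndrome()
 * computes both P (the XOR) and Q (the Reed-Solomon syndrome) in one pass
 * over the pointers array.
 */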
1349 
1350 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1351 				   struct bio_list *bio_list)
1352 {
1353 	/* The total sector number inside the full stripe. */
1354 	int total_sector_nr;
1355 	int sectornr;
1356 	int stripe;
1357 	int ret;
1358 
1359 	ASSERT(bio_list_size(bio_list) == 0);
1360 
1361 	/* We should have at least one data sector. */
1362 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1363 
	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
1368 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1369 
1370 	/*
1371 	 * Start assembly.  Make bios for everything from the higher layers (the
1372 	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
1373 	 */
1374 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1375 	     total_sector_nr++) {
1376 		struct sector_ptr *sector;
1377 
1378 		stripe = total_sector_nr / rbio->stripe_nsectors;
1379 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1380 
1381 		/* This vertical stripe has no data, skip it. */
1382 		if (!test_bit(sectornr, &rbio->dbitmap))
1383 			continue;
1384 
1385 		if (stripe < rbio->nr_data) {
1386 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1387 			if (!sector)
1388 				continue;
1389 		} else {
1390 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1391 		}
1392 
1393 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1394 					 sectornr, REQ_OP_WRITE);
1395 		if (ret)
1396 			goto error;
1397 	}
1398 
1399 	if (likely(!rbio->bioc->replace_nr_stripes))
1400 		return 0;
1401 
	/*
	 * Make a copy for the replace target device.
	 *
	 * The source stripe number (in replace_stripe_src) should be valid.
	 */
1407 	ASSERT(rbio->bioc->replace_stripe_src >= 0);
1408 
1409 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1410 	     total_sector_nr++) {
1411 		struct sector_ptr *sector;
1412 
1413 		stripe = total_sector_nr / rbio->stripe_nsectors;
1414 		sectornr = total_sector_nr % rbio->stripe_nsectors;
1415 
		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src indicates the stripe number we
		 * need to copy from.
		 */
1421 		if (stripe != rbio->bioc->replace_stripe_src) {
1422 			/*
1423 			 * We can skip the whole stripe completely, note
1424 			 * total_sector_nr will be increased by one anyway.
1425 			 */
1426 			ASSERT(sectornr == 0);
1427 			total_sector_nr += rbio->stripe_nsectors - 1;
1428 			continue;
1429 		}
1430 
1431 		/* This vertical stripe has no data, skip it. */
1432 		if (!test_bit(sectornr, &rbio->dbitmap))
1433 			continue;
1434 
1435 		if (stripe < rbio->nr_data) {
1436 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1437 			if (!sector)
1438 				continue;
1439 		} else {
1440 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
1441 		}
1442 
1443 		ret = rbio_add_io_sector(rbio, bio_list, sector,
1444 					 rbio->real_stripes,
1445 					 sectornr, REQ_OP_WRITE);
1446 		if (ret)
1447 			goto error;
1448 	}
1449 
1450 	return 0;
1451 error:
1452 	bio_list_put(bio_list);
1453 	return -EIO;
1454 }
1455 
1456 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1457 {
1458 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1459 	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1460 		     rbio->bioc->full_stripe_logical;
1461 	int total_nr_sector = offset >> fs_info->sectorsize_bits;
1462 
1463 	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1464 
1465 	bitmap_set(rbio->error_bitmap, total_nr_sector,
1466 		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1467 
	/*
	 * Special handling for raid56_alloc_missing_rbio() used by
	 * scrub/replace.  Unlike the call path in raid56_parity_recover(),
	 * they pass an empty bio here.  Thus we have to find out the missing
	 * device and mark the stripe as errored instead.
	 */
1474 	if (bio->bi_iter.bi_size == 0) {
1475 		bool found_missing = false;
1476 		int stripe_nr;
1477 
1478 		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1479 			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1480 				found_missing = true;
1481 				bitmap_set(rbio->error_bitmap,
1482 					   stripe_nr * rbio->stripe_nsectors,
1483 					   rbio->stripe_nsectors);
1484 			}
1485 		}
1486 		ASSERT(found_missing);
1487 	}
1488 }
1489 
/*
 * For the subpage case, we can no longer set the page uptodate directly for
 * stripe_pages[], thus we need to locate the sector.
 */
1494 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1495 					     phys_addr_t paddr)
1496 {
1497 	int i;
1498 
1499 	for (i = 0; i < rbio->nr_sectors; i++) {
1500 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
1501 
1502 		if (sector->has_paddr && sector->paddr == paddr)
1503 			return sector;
1504 	}
1505 	return NULL;
1506 }
1507 
1508 /*
1509  * this sets each page in the bio uptodate.  It should only be used on private
1510  * rbio pages, nothing that comes in from the higher layers
1511  */
1512 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1513 {
1514 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1515 	struct bio_vec *bvec;
1516 	struct bvec_iter_all iter_all;
1517 
1518 	ASSERT(!bio_flagged(bio, BIO_CLONED));
1519 
1520 	bio_for_each_segment_all(bvec, bio, iter_all) {
1521 		struct sector_ptr *sector;
1522 		phys_addr_t paddr = bvec_phys(bvec);
1523 
1524 		for (u32 off = 0; off < bvec->bv_len; off += sectorsize) {
1525 			sector = find_stripe_sector(rbio, paddr + off);
1526 			ASSERT(sector);
1527 			if (sector)
1528 				sector->uptodate = 1;
1529 		}
1530 	}
1531 }
1532 
1533 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1534 {
1535 	phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
1536 	int i;
1537 
1538 	for (i = 0; i < rbio->nr_sectors; i++) {
1539 		if (rbio->stripe_sectors[i].paddr == bvec_paddr)
1540 			break;
1541 		if (rbio->bio_sectors[i].has_paddr &&
1542 		    rbio->bio_sectors[i].paddr == bvec_paddr)
1543 			break;
1544 	}
1545 	ASSERT(i < rbio->nr_sectors);
1546 	return i;
1547 }
1548 
1549 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1550 {
1551 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1552 	u32 bio_size = 0;
1553 	struct bio_vec *bvec;
1554 	int i;
1555 
1556 	bio_for_each_bvec_all(bvec, bio, i)
1557 		bio_size += bvec->bv_len;
1558 
1559 	/*
1560 	 * Since we can have multiple bios touching the error_bitmap, we cannot
1561 	 * call bitmap_set() without protection.
1562 	 *
1563 	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1564 	 */
1565 	for (i = total_sector_nr; i < total_sector_nr +
1566 	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1567 		set_bit(i, rbio->error_bitmap);
1568 }
1569 
1570 /* Verify the data sectors at read time. */
1571 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1572 				    struct bio *bio)
1573 {
1574 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1575 	int total_sector_nr = get_bio_sector_nr(rbio, bio);
1576 	struct bio_vec *bvec;
1577 	struct bvec_iter_all iter_all;
1578 
1579 	/* No data csum for the whole stripe, no need to verify. */
1580 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1581 		return;
1582 
1583 	/* P/Q stripes, they have no data csum to verify against. */
1584 	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1585 		return;
1586 
1587 	bio_for_each_segment_all(bvec, bio, iter_all) {
1588 		void *kaddr;
1589 
1590 		kaddr = bvec_kmap_local(bvec);
1591 		for (u32 off = 0; off < bvec->bv_len;
1592 		     off += fs_info->sectorsize, total_sector_nr++) {
1593 			u8 csum_buf[BTRFS_CSUM_SIZE];
1594 			u8 *expected_csum = rbio->csum_buf +
1595 					    total_sector_nr * fs_info->csum_size;
1596 			int ret;
1597 
1598 			/* No csum for this sector, skip to the next sector. */
1599 			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1600 				continue;
1601 
1602 			ret = btrfs_check_sector_csum(fs_info, kaddr + off,
1603 						      csum_buf, expected_csum);
1604 			if (ret < 0)
1605 				set_bit(total_sector_nr, rbio->error_bitmap);
1606 		}
1607 		kunmap_local(kaddr);
1608 	}
1609 }
1610 
1611 static void raid_wait_read_end_io(struct bio *bio)
1612 {
1613 	struct btrfs_raid_bio *rbio = bio->bi_private;
1614 
1615 	if (bio->bi_status) {
1616 		rbio_update_error_bitmap(rbio, bio);
1617 	} else {
1618 		set_bio_pages_uptodate(rbio, bio);
1619 		verify_bio_data_sectors(rbio, bio);
1620 	}
1621 
1622 	bio_put(bio);
1623 	if (atomic_dec_and_test(&rbio->stripes_pending))
1624 		wake_up(&rbio->io_wait);
1625 }
1626 
1627 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1628 			     struct bio_list *bio_list)
1629 {
1630 	struct bio *bio;
1631 
1632 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1633 	while ((bio = bio_list_pop(bio_list))) {
1634 		bio->bi_end_io = raid_wait_read_end_io;
1635 
1636 		if (trace_raid56_read_enabled()) {
1637 			struct raid56_bio_trace_info trace_info = { 0 };
1638 
1639 			bio_get_trace_info(rbio, bio, &trace_info);
1640 			trace_raid56_read(rbio, bio, &trace_info);
1641 		}
1642 		submit_bio(bio);
1643 	}
1644 
1645 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1646 }
1647 
1648 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1649 {
1650 	const int data_pages = rbio->nr_data * rbio->stripe_npages;
1651 	int ret;
1652 
1653 	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
1654 	if (ret < 0)
1655 		return ret;
1656 
1657 	index_stripe_sectors(rbio);
1658 	return 0;
1659 }
1660 
/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios.
 */
1668 struct btrfs_plug_cb {
1669 	struct blk_plug_cb cb;
1670 	struct btrfs_fs_info *info;
1671 	struct list_head rbio_list;
1672 };
1673 
1674 /*
1675  * rbios on the plug list are sorted for easier merging.
1676  */
1677 static int plug_cmp(void *priv, const struct list_head *a,
1678 		    const struct list_head *b)
1679 {
1680 	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1681 						       plug_list);
1682 	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1683 						       plug_list);
1684 	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1685 	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1686 
1687 	if (a_sector < b_sector)
1688 		return -1;
1689 	if (a_sector > b_sector)
1690 		return 1;
1691 	return 0;
1692 }
1693 
1694 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1695 {
1696 	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1697 	struct btrfs_raid_bio *cur;
1698 	struct btrfs_raid_bio *last = NULL;
1699 
1700 	list_sort(NULL, &plug->rbio_list, plug_cmp);
1701 
1702 	while (!list_empty(&plug->rbio_list)) {
1703 		cur = list_first_entry(&plug->rbio_list,
1704 				       struct btrfs_raid_bio, plug_list);
1705 		list_del_init(&cur->plug_list);
1706 
1707 		if (rbio_is_full(cur)) {
1708 			/* We have a full stripe, queue it down. */
1709 			start_async_work(cur, rmw_rbio_work);
1710 			continue;
1711 		}
1712 		if (last) {
1713 			if (rbio_can_merge(last, cur)) {
1714 				merge_rbio(last, cur);
1715 				free_raid_bio(cur);
1716 				continue;
1717 			}
1718 			start_async_work(last, rmw_rbio_work);
1719 		}
1720 		last = cur;
1721 	}
1722 	if (last)
1723 		start_async_work(last, rmw_rbio_work);
1724 	kfree(plug);
1725 }
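
/*
 * raid_unplug() runs when the block layer flushes the plug (blk_finish_plug()
 * or an implicit flush on schedule, with @from_schedule set); the callback is
 * registered via blk_check_plugged(raid_unplug, ...) in raid56_parity_write()
 * below.
 */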
1726 
1727 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1728 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1729 {
1730 	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1731 	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1732 	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1733 	const u32 orig_len = orig_bio->bi_iter.bi_size;
1734 	const u32 sectorsize = fs_info->sectorsize;
1735 	u64 cur_logical;
1736 
1737 	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
1738 			    orig_logical + orig_len <= full_stripe_start +
1739 			    rbio->nr_data * BTRFS_STRIPE_LEN,
1740 			    rbio, orig_logical);
1741 
1742 	bio_list_add(&rbio->bio_list, orig_bio);
1743 	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1744 
1745 	/* Update the dbitmap. */
1746 	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1747 	     cur_logical += sectorsize) {
1748 		int bit = ((u32)(cur_logical - full_stripe_start) >>
1749 			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1750 
1751 		set_bit(bit, &rbio->dbitmap);
1752 	}
1753 }
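
/*
 * Example of the dbitmap math above: with 4K sectors and 64K stripes, a 16K
 * write starting exactly at the second data stripe (full_stripe_start + 64K)
 * covers sectors 16-19 of the full stripe, which map back to bits 0-3 after
 * the modulo by stripe_nsectors, i.e. the same vertical stripes as a 16K
 * write at the very start of the first data stripe.
 */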
1754 
1755 /*
1756  * our main entry point for writes from the rest of the FS.
1757  */
1758 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1759 {
1760 	struct btrfs_fs_info *fs_info = bioc->fs_info;
1761 	struct btrfs_raid_bio *rbio;
1762 	struct btrfs_plug_cb *plug = NULL;
1763 	struct blk_plug_cb *cb;
1764 
1765 	rbio = alloc_rbio(fs_info, bioc);
1766 	if (IS_ERR(rbio)) {
1767 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1768 		bio_endio(bio);
1769 		return;
1770 	}
1771 	rbio->operation = BTRFS_RBIO_WRITE;
1772 	rbio_add_bio(rbio, bio);
1773 
1774 	/*
1775 	 * Don't plug on full rbios, just get them out the door
1776 	 * as quickly as we can
1777 	 */
1778 	if (!rbio_is_full(rbio)) {
1779 		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1780 		if (cb) {
1781 			plug = container_of(cb, struct btrfs_plug_cb, cb);
1782 			if (!plug->info) {
1783 				plug->info = fs_info;
1784 				INIT_LIST_HEAD(&plug->rbio_list);
1785 			}
1786 			list_add_tail(&rbio->plug_list, &plug->rbio_list);
1787 			return;
1788 		}
1789 	}
1790 
1791 	/*
1792 	 * Either we don't have any existing plug, or we're doing a full stripe,
1793 	 * queue the rmw work now.
1794 	 */
1795 	start_async_work(rbio, rmw_rbio_work);
1796 }
1797 
1798 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1799 			     int stripe_nr, int sector_nr)
1800 {
1801 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1802 	struct sector_ptr *sector;
1803 	u8 csum_buf[BTRFS_CSUM_SIZE];
1804 	u8 *csum_expected;
1805 	void *kaddr;
1806 	int ret;
1807 
1808 	if (!rbio->csum_bitmap || !rbio->csum_buf)
1809 		return 0;
1810 
1811 	/* No way to verify P/Q as they are not covered by data csum. */
1812 	if (stripe_nr >= rbio->nr_data)
1813 		return 0;
1814 	/*
1815 	 * If we're rebuilding a read, we have to use pages from the
1816 	 * bio list if possible.
1817 	 */
1818 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1819 		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1820 	} else {
1821 		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1822 	}
1823 
1824 	csum_expected = rbio->csum_buf +
1825 			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
1826 			fs_info->csum_size;
1827 	kaddr = kmap_local_sector(sector);
1828 	ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected);
1829 	kunmap_local(kaddr);
1830 	return ret;
1831 }
1832 
1833 /*
1834  * Recover a vertical stripe specified by @sector_nr.
 * @pointers and @unmap_array are pre-allocated by the caller, so we don't
 * need to allocate/free them again and again.
1837  */
1838 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1839 			    void **pointers, void **unmap_array)
1840 {
1841 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1842 	struct sector_ptr *sector;
1843 	const u32 sectorsize = fs_info->sectorsize;
1844 	int found_errors;
1845 	int faila;
1846 	int failb;
1847 	int stripe_nr;
1848 	int ret = 0;
1849 
1850 	/*
	 * For a parity scrub, dbitmap marks which sector positions of the
	 * stripe contain data; skip the vertical stripes that have nothing
	 * to scrub.
1853 	 */
1854 	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1855 	    !test_bit(sector_nr, &rbio->dbitmap))
1856 		return 0;
1857 
1858 	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1859 						 &failb);
1860 	/*
	 * No errors in the vertical stripe, skip it.  This can happen for
	 * recovery where only part of a stripe failed the csum check.
1863 	 */
1864 	if (!found_errors)
1865 		return 0;
1866 
1867 	if (found_errors > rbio->bioc->max_errors)
1868 		return -EIO;
1869 
1870 	/*
	 * Set up our array of pointers with sectors from each stripe.
1872 	 *
1873 	 * NOTE: store a duplicate array of pointers to preserve the
1874 	 * pointer order.
1875 	 */
1876 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1877 		/*
1878 		 * If we're rebuilding a read, we have to use pages from the
1879 		 * bio list if possible.
1880 		 */
1881 		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1882 			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1883 		} else {
1884 			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1885 		}
1886 		pointers[stripe_nr] = kmap_local_sector(sector);
1887 		unmap_array[stripe_nr] = pointers[stripe_nr];
1888 	}
1889 
1890 	/* All raid6 handling here */
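	/*
	 * Stripe numbering reminder for the checks below: data stripes are
	 * 0 .. nr_data - 1, the P stripe is at nr_data (== real_stripes - 2
	 * for RAID6) and the Q stripe is at real_stripes - 1.
	 */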
1891 	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1892 		/* Single failure, rebuild from parity raid5 style */
1893 		if (failb < 0) {
1894 			if (faila == rbio->nr_data)
1895 				/*
1896 				 * Just the P stripe has failed, without
1897 				 * a bad data or Q stripe.
1898 				 * We have nothing to do, just skip the
1899 				 * recovery for this stripe.
1900 				 */
1901 				goto cleanup;
1902 			/*
1903 			 * a single failure in raid6 is rebuilt
1904 			 * in the pstripe code below
1905 			 */
1906 			goto pstripe;
1907 		}
1908 
1909 		/*
		 * If the Q stripe has failed, do a P stripe reconstruction from
		 * the xors.
		 * If both the Q stripe and the P stripe have failed, we're
		 * here due to a crc mismatch and we can't give them the
		 * data they want.
1915 		 */
1916 		if (failb == rbio->real_stripes - 1) {
1917 			if (faila == rbio->real_stripes - 2)
1918 				/*
1919 				 * Only P and Q are corrupted.
1920 				 * We only care about data stripes recovery,
1921 				 * can skip this vertical stripe.
1922 				 */
1923 				goto cleanup;
1924 			/*
1925 			 * Otherwise we have one bad data stripe and
1926 			 * a good P stripe.  raid5!
1927 			 */
1928 			goto pstripe;
1929 		}
1930 
1931 		if (failb == rbio->real_stripes - 2) {
1932 			raid6_datap_recov(rbio->real_stripes, sectorsize,
1933 					  faila, pointers);
1934 		} else {
1935 			raid6_2data_recov(rbio->real_stripes, sectorsize,
1936 					  faila, failb, pointers);
1937 		}
1938 	} else {
1939 		void *p;
1940 
1941 		/* Rebuild from P stripe here (raid5 or raid6). */
1942 		ASSERT(failb == -1);
1943 pstripe:
1944 		/* Copy parity block into failed block to start with */
1945 		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1946 
1947 		/* Rearrange the pointer array */
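		/*
		 * For example, with 4 data stripes and faila == 1 the data
		 * pointers go from [D0, D1, D2, D3] to [D0, D2, D3, D1]:
		 * D1's buffer (currently holding the parity copy) moves to
		 * the end, so run_xor() below XORs all the good data into
		 * it, reconstructing D1.
		 */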
1948 		p = pointers[faila];
1949 		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1950 		     stripe_nr++)
1951 			pointers[stripe_nr] = pointers[stripe_nr + 1];
1952 		pointers[rbio->nr_data - 1] = p;
1953 
1954 		/* Xor in the rest */
1955 		run_xor(pointers, rbio->nr_data - 1, sectorsize);
1956 
1957 	}
1958 
1959 	/*
	 * Whether this is an RMW or a recovery, all failed sectors in this
	 * vertical stripe should now be repaired, thus they are uptodate.
	 * This matters especially if we decide to cache the rbio: a cached
	 * rbio needs at least all of its data sectors uptodate.
	 *
	 * If possible, also check whether the repaired sector matches its
	 * data checksum.
1968 	 */
1969 	if (faila >= 0) {
1970 		ret = verify_one_sector(rbio, faila, sector_nr);
1971 		if (ret < 0)
1972 			goto cleanup;
1973 
1974 		sector = rbio_stripe_sector(rbio, faila, sector_nr);
1975 		sector->uptodate = 1;
1976 	}
1977 	if (failb >= 0) {
1978 		ret = verify_one_sector(rbio, failb, sector_nr);
1979 		if (ret < 0)
1980 			goto cleanup;
1981 
1982 		sector = rbio_stripe_sector(rbio, failb, sector_nr);
1983 		sector->uptodate = 1;
1984 	}
1985 
1986 cleanup:
1987 	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1988 		kunmap_local(unmap_array[stripe_nr]);
1989 	return ret;
1990 }
1991 
1992 static int recover_sectors(struct btrfs_raid_bio *rbio)
1993 {
1994 	void **pointers = NULL;
1995 	void **unmap_array = NULL;
1996 	int sectornr;
1997 	int ret = 0;
1998 
1999 	/*
2000 	 * @pointers array stores the pointer for each sector.
2001 	 *
	 * @unmap_array stores a copy of the pointers that does not get
	 * reordered during reconstruction, so that kunmap_local() works.
2004 	 */
2005 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2006 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2007 	if (!pointers || !unmap_array) {
2008 		ret = -ENOMEM;
2009 		goto out;
2010 	}
2011 
2012 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2013 		spin_lock(&rbio->bio_list_lock);
2014 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2015 		spin_unlock(&rbio->bio_list_lock);
2016 	}
2017 
2018 	index_rbio_pages(rbio);
2019 
2020 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2021 		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
2022 		if (ret < 0)
2023 			break;
2024 	}
2025 
2026 out:
2027 	kfree(pointers);
2028 	kfree(unmap_array);
2029 	return ret;
2030 }
2031 
2032 static void recover_rbio(struct btrfs_raid_bio *rbio)
2033 {
2034 	struct bio_list bio_list = BIO_EMPTY_LIST;
2035 	int total_sector_nr;
2036 	int ret = 0;
2037 
2038 	/*
	 * Whether we're recovering from a read failure or doing a degraded
	 * write, the caller should have set the error bitmap correctly.
2041 	 */
2042 	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
2043 
2044 	/* For recovery, we need to read all sectors including P/Q. */
2045 	ret = alloc_rbio_pages(rbio);
2046 	if (ret < 0)
2047 		goto out;
2048 
2049 	index_rbio_pages(rbio);
2050 
2051 	/*
	 * Read everything that hasn't failed.  This time we do not trust any
	 * cached sector: the cache may hold stale data that the higher layer
	 * is not going to read, so in the recovery path we always re-read
	 * everything from disk.
2058 	 */
2059 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2060 	     total_sector_nr++) {
2061 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2062 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2063 		struct sector_ptr *sector;
2064 
2065 		/*
		 * Skip any range that has an error: it can be a range marked
		 * in the error bitmap (e.g. for a csum mismatch), or it can
		 * belong to a missing device.
2069 		 */
2070 		if (!rbio->bioc->stripes[stripe].dev->bdev ||
2071 		    test_bit(total_sector_nr, rbio->error_bitmap)) {
2072 			/*
			 * Also set the error bit for a missing device, which
			 * may not have its error bit set yet.
2075 			 */
2076 			set_bit(total_sector_nr, rbio->error_bitmap);
2077 			continue;
2078 		}
2079 
2080 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2081 		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2082 					 sectornr, REQ_OP_READ);
2083 		if (ret < 0) {
2084 			bio_list_put(&bio_list);
2085 			goto out;
2086 		}
2087 	}
2088 
2089 	submit_read_wait_bio_list(rbio, &bio_list);
2090 	ret = recover_sectors(rbio);
2091 out:
2092 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2093 }
2094 
2095 static void recover_rbio_work(struct work_struct *work)
2096 {
2097 	struct btrfs_raid_bio *rbio;
2098 
2099 	rbio = container_of(work, struct btrfs_raid_bio, work);
2100 	if (!lock_stripe_add(rbio))
2101 		recover_rbio(rbio);
2102 }
2103 
2104 static void recover_rbio_work_locked(struct work_struct *work)
2105 {
2106 	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2107 }
2108 
2109 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2110 {
2111 	bool found = false;
2112 	int sector_nr;
2113 
2114 	/*
	 * This is for extra RAID6 recovery tries, thus the mirror number
	 * should be larger than 2.
	 * Mirror 1 means reading from data stripes.  Mirror 2 means
	 * rebuilding using RAID5 methods.
2119 	 */
2120 	ASSERT(mirror_num > 2);
2121 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2122 		int found_errors;
2123 		int faila;
2124 		int failb;
2125 
2126 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2127 							 &faila, &failb);
2128 		/* This vertical stripe doesn't have errors. */
2129 		if (!found_errors)
2130 			continue;
2131 
2132 		/*
		 * If we found errors, there should be only one error, marked
		 * by the previous set_rbio_range_error().
2135 		 */
2136 		ASSERT(found_errors == 1);
2137 		found = true;
2138 
2139 		/* Now select another stripe to mark as error. */
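		/*
		 * Worked example (values chosen for illustration): with
		 * real_stripes == 4 (2 data + P + Q), mirror_num == 3 picks
		 * failb = 4 - 2 = 2, i.e. the P stripe, and mirror_num == 4
		 * picks failb = 1 (a data stripe), moved down by one if it
		 * lands on or below faila.
		 */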
2140 		failb = rbio->real_stripes - (mirror_num - 1);
2141 		if (failb <= faila)
2142 			failb--;
2143 
2144 		/* Set the extra bit in error bitmap. */
2145 		if (failb >= 0)
2146 			set_bit(failb * rbio->stripe_nsectors + sector_nr,
2147 				rbio->error_bitmap);
2148 	}
2149 
	/* We should have found at least one vertical stripe with an error. */
2151 	ASSERT(found);
2152 }
2153 
2154 /*
2155  * the main entry point for reads from the higher layers.  This
2156  * is really only called when the normal read path had a failure,
2157  * so we assume the bio they send down corresponds to a failed part
2158  * of the drive.
2159  */
2160 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2161 			   int mirror_num)
2162 {
2163 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2164 	struct btrfs_raid_bio *rbio;
2165 
2166 	rbio = alloc_rbio(fs_info, bioc);
2167 	if (IS_ERR(rbio)) {
2168 		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2169 		bio_endio(bio);
2170 		return;
2171 	}
2172 
2173 	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2174 	rbio_add_bio(rbio, bio);
2175 
2176 	set_rbio_range_error(rbio, bio);
2177 
2178 	/*
2179 	 * Loop retry:
	 * for 'mirror_num == 2', reconstruct from all other stripes.
2181 	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2182 	 */
2183 	if (mirror_num > 2)
2184 		set_rbio_raid6_extra_error(rbio, mirror_num);
2185 
2186 	start_async_work(rbio, recover_rbio_work);
2187 }
2188 
2189 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2190 {
2191 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2192 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2193 						       rbio->bioc->full_stripe_logical);
2194 	const u64 start = rbio->bioc->full_stripe_logical;
2195 	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2196 			fs_info->sectorsize_bits;
2197 	int ret;
2198 
2199 	/* The rbio should not have its csum buffer initialized. */
2200 	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2201 
2202 	/*
2203 	 * Skip the csum search if:
2204 	 *
2205 	 * - The rbio doesn't belong to data block groups
2206 	 *   Then we are doing IO for tree blocks, no need to search csums.
2207 	 *
2208 	 * - The rbio belongs to mixed block groups
2209 	 *   This is to avoid deadlock, as we're already holding the full
2210 	 *   stripe lock, if we trigger a metadata read, and it needs to do
2211 	 *   raid56 recovery, we will deadlock.
2212 	 */
2213 	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2214 	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2215 		return;
2216 
2217 	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2218 				 fs_info->csum_size, GFP_NOFS);
2219 	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2220 					  GFP_NOFS);
2221 	if (!rbio->csum_buf || !rbio->csum_bitmap) {
2222 		ret = -ENOMEM;
2223 		goto error;
2224 	}
2225 
2226 	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2227 					rbio->csum_buf, rbio->csum_bitmap);
2228 	if (ret < 0)
2229 		goto error;
2230 	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2231 		goto no_csum;
2232 	return;
2233 
2234 error:
2235 	/*
	 * We failed to allocate memory or grab the csum, but it's not fatal;
	 * we can still continue.  However it's better to warn users that RMW
	 * is no longer safe for this particular sub-stripe write.
2239 	 */
2240 	btrfs_warn_rl(fs_info,
2241 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2242 			rbio->bioc->full_stripe_logical, ret);
2243 no_csum:
2244 	kfree(rbio->csum_buf);
2245 	bitmap_free(rbio->csum_bitmap);
2246 	rbio->csum_buf = NULL;
2247 	rbio->csum_bitmap = NULL;
2248 }
2249 
2250 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2251 {
2252 	struct bio_list bio_list = BIO_EMPTY_LIST;
2253 	int total_sector_nr;
2254 	int ret = 0;
2255 
2256 	/*
2257 	 * Fill the data csums we need for data verification.  We need to fill
2258 	 * the csum_bitmap/csum_buf first, as our endio function will try to
2259 	 * verify the data sectors.
2260 	 */
2261 	fill_data_csums(rbio);
2262 
2263 	/*
2264 	 * Build a list of bios to read all sectors (including data and P/Q).
2265 	 *
	 * We read everything so that the later csum verification and recovery
	 * have every sector available to work with.
2267 	 */
2268 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2269 	     total_sector_nr++) {
2270 		struct sector_ptr *sector;
2271 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2272 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2273 
2274 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2275 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2276 			       stripe, sectornr, REQ_OP_READ);
2277 		if (ret) {
2278 			bio_list_put(&bio_list);
2279 			return ret;
2280 		}
2281 	}
2282 
2283 	/*
	 * We may or may not have corrupted sectors (from a missing device or
	 * a csum mismatch); just let recover_sectors() handle them all.
2286 	 */
2287 	submit_read_wait_bio_list(rbio, &bio_list);
2288 	return recover_sectors(rbio);
2289 }
2290 
2291 static void raid_wait_write_end_io(struct bio *bio)
2292 {
2293 	struct btrfs_raid_bio *rbio = bio->bi_private;
2294 
2295 	if (bio->bi_status)
2296 		rbio_update_error_bitmap(rbio, bio);
2297 	bio_put(bio);
2298 	if (atomic_dec_and_test(&rbio->stripes_pending))
2299 		wake_up(&rbio->io_wait);
2300 }
2301 
2302 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2303 			      struct bio_list *bio_list)
2304 {
2305 	struct bio *bio;
2306 
2307 	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
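	/*
	 * stripes_pending counts the write bios submitted below; each one is
	 * dropped by raid_wait_write_end_io(), which wakes io_wait once the
	 * count reaches zero so the caller's wait_event() can continue.
	 */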
2308 	while ((bio = bio_list_pop(bio_list))) {
2309 		bio->bi_end_io = raid_wait_write_end_io;
2310 
2311 		if (trace_raid56_write_enabled()) {
2312 			struct raid56_bio_trace_info trace_info = { 0 };
2313 
2314 			bio_get_trace_info(rbio, bio, &trace_info);
2315 			trace_raid56_write(rbio, bio, &trace_info);
2316 		}
2317 		submit_bio(bio);
2318 	}
2319 }
2320 
2321 /*
 * Determine whether we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip reads for a cached rbio.
2324  */
2325 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2326 {
2327 	int i;
2328 
2329 	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2330 		struct sector_ptr *sector = &rbio->stripe_sectors[i];
2331 
2332 		/*
		 * This sector either has no backing page or is not uptodate,
		 * thus this rbio cannot be a cached one: a cached rbio must
		 * have all of its data sectors present and uptodate.
2336 		 */
2337 		if (!sector->has_paddr || !sector->uptodate)
2338 			return true;
2339 	}
2340 	return false;
2341 }
2342 
2343 static void rmw_rbio(struct btrfs_raid_bio *rbio)
2344 {
2345 	struct bio_list bio_list;
2346 	int sectornr;
2347 	int ret = 0;
2348 
2349 	/*
2350 	 * Allocate the pages for parity first, as P/Q pages will always be
2351 	 * needed for both full-stripe and sub-stripe writes.
2352 	 */
2353 	ret = alloc_rbio_parity_pages(rbio);
2354 	if (ret < 0)
2355 		goto out;
2356 
2357 	/*
	 * For a full stripe write, or when we have every data sector already
	 * cached, we can go to the write path immediately.
2360 	 */
2361 	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2362 		/*
		 * Now we're doing a sub-stripe write, so we also need all the
		 * data stripes to do the full RMW.
2365 		 */
2366 		ret = alloc_rbio_data_pages(rbio);
2367 		if (ret < 0)
2368 			goto out;
2369 
2370 		index_rbio_pages(rbio);
2371 
2372 		ret = rmw_read_wait_recover(rbio);
2373 		if (ret < 0)
2374 			goto out;
2375 	}
2376 
2377 	/*
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more; anyone else who wants to change this stripe
	 * needs to do their own RMW.
2381 	 */
2382 	spin_lock(&rbio->bio_list_lock);
2383 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2384 	spin_unlock(&rbio->bio_list_lock);
2385 
2386 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2387 
2388 	index_rbio_pages(rbio);
2389 
2390 	/*
2391 	 * We don't cache full rbios because we're assuming
2392 	 * the higher layers are unlikely to use this area of
2393 	 * the disk again soon.  If they do use it again,
2394 	 * hopefully they will send another full bio.
2395 	 */
2396 	if (!rbio_is_full(rbio))
2397 		cache_rbio_pages(rbio);
2398 	else
2399 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2400 
2401 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2402 		generate_pq_vertical(rbio, sectornr);
2403 
2404 	bio_list_init(&bio_list);
2405 	ret = rmw_assemble_write_bios(rbio, &bio_list);
2406 	if (ret < 0)
2407 		goto out;
2408 
2409 	/* We should have at least one bio assembled. */
2410 	ASSERT(bio_list_size(&bio_list));
2411 	submit_write_bios(rbio, &bio_list);
2412 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2413 
	/* We may have more errors than our tolerance during the writes. */
2415 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2416 		int found_errors;
2417 
2418 		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2419 		if (found_errors > rbio->bioc->max_errors) {
2420 			ret = -EIO;
2421 			break;
2422 		}
2423 	}
2424 out:
2425 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2426 }
2427 
2428 static void rmw_rbio_work(struct work_struct *work)
2429 {
2430 	struct btrfs_raid_bio *rbio;
2431 
2432 	rbio = container_of(work, struct btrfs_raid_bio, work);
2433 	if (lock_stripe_add(rbio) == 0)
2434 		rmw_rbio(rbio);
2435 }
2436 
2437 static void rmw_rbio_work_locked(struct work_struct *work)
2438 {
2439 	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2440 }
2441 
2442 /*
2443  * The following code is used to scrub/replace the parity stripe
2444  *
2445  * Caller must have already increased bio_counter for getting @bioc.
2446  *
 * Note: We need to make sure all the pages added into the scrub/replace
 * raid bio are correct and are not changed during the scrub/replace.  That
 * is, those pages hold only metadata or file data protected by checksums.
2450  */
2451 
2452 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2453 				struct btrfs_io_context *bioc,
2454 				struct btrfs_device *scrub_dev,
2455 				unsigned long *dbitmap, int stripe_nsectors)
2456 {
2457 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2458 	struct btrfs_raid_bio *rbio;
2459 	int i;
2460 
2461 	rbio = alloc_rbio(fs_info, bioc);
2462 	if (IS_ERR(rbio))
2463 		return NULL;
2464 	bio_list_add(&rbio->bio_list, bio);
2465 	/*
	 * This is a special bio which is used to hold the completion handler
	 * and keep the scrub rbio similar to the other types.
2468 	 */
2469 	ASSERT(!bio->bi_iter.bi_size);
2470 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2471 
2472 	/*
2473 	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2474 	 * to the end position, so this search can start from the first parity
2475 	 * stripe.
2476 	 */
2477 	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2478 		if (bioc->stripes[i].dev == scrub_dev) {
2479 			rbio->scrubp = i;
2480 			break;
2481 		}
2482 	}
2483 	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
2484 
2485 	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2486 	return rbio;
2487 }
2488 
2489 /*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we don't need to allocate pages for all the stripes.
2492  */
2493 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2494 {
2495 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2496 	int total_sector_nr;
2497 
2498 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2499 	     total_sector_nr++) {
2500 		struct page *page;
2501 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2502 		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
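		/*
		 * For example (assuming sectorsize == PAGE_SIZE == 4K, i.e.
		 * one sector per page), sector 20 of the rbio is backed by
		 * stripe_pages[20].
		 */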
2503 
2504 		if (!test_bit(sectornr, &rbio->dbitmap))
2505 			continue;
2506 		if (rbio->stripe_pages[index])
2507 			continue;
2508 		page = alloc_page(GFP_NOFS);
2509 		if (!page)
2510 			return -ENOMEM;
2511 		rbio->stripe_pages[index] = page;
2512 	}
2513 	index_stripe_sectors(rbio);
2514 	return 0;
2515 }
2516 
2517 static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2518 {
2519 	struct btrfs_io_context *bioc = rbio->bioc;
2520 	const u32 sectorsize = bioc->fs_info->sectorsize;
2521 	void **pointers = rbio->finish_pointers;
2522 	unsigned long *pbitmap = &rbio->finish_pbitmap;
2523 	int nr_data = rbio->nr_data;
2524 	int stripe;
2525 	int sectornr;
2526 	bool has_qstripe;
2527 	struct page *page;
2528 	struct sector_ptr p_sector = { 0 };
2529 	struct sector_ptr q_sector = { 0 };
2530 	struct bio_list bio_list;
2531 	int is_replace = 0;
2532 	int ret;
2533 
2534 	bio_list_init(&bio_list);
2535 
2536 	if (rbio->real_stripes - rbio->nr_data == 1)
2537 		has_qstripe = false;
2538 	else if (rbio->real_stripes - rbio->nr_data == 2)
2539 		has_qstripe = true;
2540 	else
2541 		BUG();
2542 
2543 	/*
	 * If replace is running and our P/Q stripe is being replaced, then we
	 * need to duplicate the final write to the replace target.
2546 	 */
2547 	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2548 		is_replace = 1;
2549 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2550 	}
2551 
2552 	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
2556 	 */
2557 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2558 
2559 	page = alloc_page(GFP_NOFS);
2560 	if (!page)
2561 		return -ENOMEM;
2562 	p_sector.has_paddr = true;
2563 	p_sector.paddr = page_to_phys(page);
2564 	p_sector.uptodate = 1;
2565 	page = NULL;
2566 
2567 	if (has_qstripe) {
2568 		/* RAID6, allocate and map temp space for the Q stripe */
2569 		page = alloc_page(GFP_NOFS);
2570 		if (!page) {
2571 			__free_page(phys_to_page(p_sector.paddr));
2572 			p_sector.has_paddr = false;
2573 			return -ENOMEM;
2574 		}
2575 		q_sector.has_paddr = true;
2576 		q_sector.paddr = page_to_phys(page);
2577 		q_sector.uptodate = 1;
2578 		page = NULL;
2579 		pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector);
2580 	}
2581 
2582 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2583 
2584 	/* Map the parity stripe just once */
2585 	pointers[nr_data] = kmap_local_sector(&p_sector);
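	/*
	 * At this point the pointer layout matches what gen_syndrome()
	 * expects: pointers[0 .. nr_data - 1] will hold the data sectors,
	 * pointers[nr_data] the temporary P buffer and, for RAID6,
	 * pointers[real_stripes - 1] the temporary Q buffer.
	 */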
2586 
2587 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2588 		struct sector_ptr *sector;
2589 		void *parity;
2590 
		/* First collect one sector from each data stripe. */
2592 		for (stripe = 0; stripe < nr_data; stripe++) {
2593 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2594 			pointers[stripe] = kmap_local_sector(sector);
2595 		}
2596 
2597 		if (has_qstripe) {
2598 			assert_rbio(rbio);
2599 			/* RAID6, call the library function to fill in our P/Q */
2600 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2601 						pointers);
2602 		} else {
2603 			/* raid5 */
2604 			memcpy(pointers[nr_data], pointers[0], sectorsize);
2605 			run_xor(pointers + 1, nr_data - 1, sectorsize);
2606 		}
2607 
2608 		/* Check scrubbing parity and repair it */
2609 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2610 		parity = kmap_local_sector(sector);
2611 		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2612 			memcpy(parity, pointers[rbio->scrubp], sectorsize);
2613 		else
2614 			/* Parity is right, needn't writeback */
2615 			bitmap_clear(&rbio->dbitmap, sectornr, 1);
2616 		kunmap_local(parity);
2617 
2618 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2619 			kunmap_local(pointers[stripe]);
2620 	}
2621 
2622 	kunmap_local(pointers[nr_data]);
2623 	__free_page(phys_to_page(p_sector.paddr));
2624 	p_sector.has_paddr = false;
2625 	if (q_sector.has_paddr) {
2626 		__free_page(phys_to_page(q_sector.paddr));
2627 		q_sector.has_paddr = false;
2628 	}
2629 
2630 	/*
	 * Time to start writing.  Make bios for the parity sectors that are
	 * still marked in dbitmap (the ones we had to correct) on the stripe
	 * being scrubbed, and ignore everything else.
2634 	 */
2635 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2636 		struct sector_ptr *sector;
2637 
2638 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2639 		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2640 					 sectornr, REQ_OP_WRITE);
2641 		if (ret)
2642 			goto cleanup;
2643 	}
2644 
2645 	if (!is_replace)
2646 		goto submit_write;
2647 
2648 	/*
2649 	 * Replace is running and our parity stripe needs to be duplicated to
2650 	 * the target device.  Check we have a valid source stripe number.
2651 	 */
2652 	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
2653 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2654 		struct sector_ptr *sector;
2655 
2656 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2657 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2658 					 rbio->real_stripes,
2659 					 sectornr, REQ_OP_WRITE);
2660 		if (ret)
2661 			goto cleanup;
2662 	}
2663 
2664 submit_write:
2665 	submit_write_bios(rbio, &bio_list);
2666 	return 0;
2667 
2668 cleanup:
2669 	bio_list_put(&bio_list);
2670 	return ret;
2671 }
2672 
2673 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2674 {
2675 	if (stripe >= 0 && stripe < rbio->nr_data)
2676 		return 1;
2677 	return 0;
2678 }
2679 
2680 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2681 {
2682 	void **pointers = NULL;
2683 	void **unmap_array = NULL;
2684 	int sector_nr;
2685 	int ret = 0;
2686 
2687 	/*
2688 	 * @pointers array stores the pointer for each sector.
2689 	 *
	 * @unmap_array stores a copy of the pointers that does not get
	 * reordered during reconstruction, so that kunmap_local() works.
2692 	 */
2693 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2694 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2695 	if (!pointers || !unmap_array) {
2696 		ret = -ENOMEM;
2697 		goto out;
2698 	}
2699 
2700 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2701 		int dfail = 0, failp = -1;
2702 		int faila;
2703 		int failb;
2704 		int found_errors;
2705 
2706 		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2707 							 &faila, &failb);
2708 		if (found_errors > rbio->bioc->max_errors) {
2709 			ret = -EIO;
2710 			goto out;
2711 		}
2712 		if (found_errors == 0)
2713 			continue;
2714 
2715 		/* We should have at least one error here. */
2716 		ASSERT(faila >= 0 || failb >= 0);
2717 
2718 		if (is_data_stripe(rbio, faila))
2719 			dfail++;
2720 		else if (is_parity_stripe(faila))
2721 			failp = faila;
2722 
2723 		if (is_data_stripe(rbio, failb))
2724 			dfail++;
2725 		else if (is_parity_stripe(failb))
2726 			failp = failb;
2727 		/*
		 * Because we cannot use the parity being scrubbed to repair
		 * the data, our repair capability is reduced.  (In the RAID5
		 * case, we cannot repair anything.)
2731 		 */
2732 		if (dfail > rbio->bioc->max_errors - 1) {
2733 			ret = -EIO;
2734 			goto out;
2735 		}
2736 		/*
		 * If all the data is good and only the parity is bad, just
		 * repair the parity; there is no need to recover data stripes.
2739 		 */
2740 		if (dfail == 0)
2741 			continue;
2742 
2743 		/*
		 * Reaching here means we have one corrupted data stripe and
		 * one corrupted parity on RAID6.  If the corrupted parity is
		 * the one being scrubbed, we can luckily use the other parity
		 * to repair the data; otherwise we cannot repair the data
		 * stripe.
2748 		 */
2749 		if (failp != rbio->scrubp) {
2750 			ret = -EIO;
2751 			goto out;
2752 		}
2753 
2754 		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2755 		if (ret < 0)
2756 			goto out;
2757 	}
2758 out:
2759 	kfree(pointers);
2760 	kfree(unmap_array);
2761 	return ret;
2762 }
2763 
2764 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2765 {
2766 	struct bio_list bio_list = BIO_EMPTY_LIST;
2767 	int total_sector_nr;
2768 	int ret = 0;
2769 
2770 	/* Build a list of bios to read all the missing parts. */
2771 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2772 	     total_sector_nr++) {
2773 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2774 		int stripe = total_sector_nr / rbio->stripe_nsectors;
2775 		struct sector_ptr *sector;
2776 
2777 		/* No data in the vertical stripe, no need to read. */
2778 		if (!test_bit(sectornr, &rbio->dbitmap))
2779 			continue;
2780 
2781 		/*
2782 		 * We want to find all the sectors missing from the rbio and
2783 		 * read them from the disk. If sector_in_rbio() finds a sector
2784 		 * in the bio list we don't need to read it off the stripe.
2785 		 */
2786 		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2787 		if (sector)
2788 			continue;
2789 
2790 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2791 		/*
2792 		 * The bio cache may have handed us an uptodate sector.  If so,
2793 		 * use it.
2794 		 */
2795 		if (sector->uptodate)
2796 			continue;
2797 
2798 		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2799 					 sectornr, REQ_OP_READ);
2800 		if (ret) {
2801 			bio_list_put(&bio_list);
2802 			return ret;
2803 		}
2804 	}
2805 
2806 	submit_read_wait_bio_list(rbio, &bio_list);
2807 	return 0;
2808 }
2809 
2810 static void scrub_rbio(struct btrfs_raid_bio *rbio)
2811 {
2812 	int sector_nr;
2813 	int ret;
2814 
2815 	ret = alloc_rbio_essential_pages(rbio);
2816 	if (ret)
2817 		goto out;
2818 
2819 	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2820 
2821 	ret = scrub_assemble_read_bios(rbio);
2822 	if (ret < 0)
2823 		goto out;
2824 
2825 	/* We may have some failures, recover the failed sectors first. */
2826 	ret = recover_scrub_rbio(rbio);
2827 	if (ret < 0)
2828 		goto out;
2829 
2830 	/*
	 * We have every sector properly prepared.  Now we can finish the
	 * scrub and write back the good content.
2833 	 */
2834 	ret = finish_parity_scrub(rbio);
2835 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2836 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2837 		int found_errors;
2838 
2839 		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2840 		if (found_errors > rbio->bioc->max_errors) {
2841 			ret = -EIO;
2842 			break;
2843 		}
2844 	}
2845 out:
2846 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2847 }
2848 
2849 static void scrub_rbio_work_locked(struct work_struct *work)
2850 {
2851 	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2852 }
2853 
2854 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2855 {
2856 	if (!lock_stripe_add(rbio))
2857 		start_async_work(rbio, scrub_rbio_work_locked);
2858 }
2859 
2860 /*
2861  * This is for scrub call sites where we already have correct data contents.
2862  * This allows us to avoid reading data stripes again.
2863  *
 * Unfortunately we have to copy the pages here instead of reusing them,
 * because the rbio has its own page management for its cache.
2866  */
2867 void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
2868 				    struct page **data_pages, u64 data_logical)
2869 {
2870 	const u64 offset_in_full_stripe = data_logical -
2871 					  rbio->bioc->full_stripe_logical;
2872 	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
2873 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2874 	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
2875 	int ret;
2876 
2877 	/*
	 * If we hit ENOMEM temporarily here, but the later allocation at
	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do the
	 * extra read, which is not a big deal.
	 *
	 * If we hit ENOMEM again at raid56_parity_submit_scrub_rbio() time,
	 * the bio will get a proper error number set.
2884 	 */
2885 	ret = alloc_rbio_data_pages(rbio);
2886 	if (ret < 0)
2887 		return;
2888 
	/* @data_logical must be at a stripe boundary and inside the full stripe. */
2890 	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2891 	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
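	/*
	 * A worked example (assuming 4K pages and a 4K sector size): an
	 * offset of 128K into the full stripe gives page_index == 32 and
	 * sectors_per_page == 1, so data page N of this stripe is copied
	 * into stripe_pages[32 + N] and its single sector is marked
	 * uptodate.
	 */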
2892 
2893 	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
2894 		struct page *dst = rbio->stripe_pages[page_nr + page_index];
2895 		struct page *src = data_pages[page_nr];
2896 
2897 		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		for (int sector_nr = sectors_per_page * (page_index + page_nr);
		     sector_nr < sectors_per_page * (page_index + page_nr + 1);
2900 		     sector_nr++)
2901 			rbio->stripe_sectors[sector_nr].uptodate = true;
2902 	}
2903 }
2904