1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "messages.h" 17 #include "ctree.h" 18 #include "disk-io.h" 19 #include "volumes.h" 20 #include "raid56.h" 21 #include "async-thread.h" 22 #include "file-item.h" 23 #include "btrfs_inode.h" 24 25 /* set when additional merges to this rbio are not allowed */ 26 #define RBIO_RMW_LOCKED_BIT 1 27 28 /* 29 * set when this rbio is sitting in the hash, but it is just a cache 30 * of past RMW 31 */ 32 #define RBIO_CACHE_BIT 2 33 34 /* 35 * set when it is safe to trust the stripe_pages for caching 36 */ 37 #define RBIO_CACHE_READY_BIT 3 38 39 #define RBIO_CACHE_SIZE 1024 40 41 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 42 43 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc) 44 { 45 if (unlikely(!bioc)) { 46 btrfs_crit(fs_info, "bioc=NULL"); 47 return; 48 } 49 btrfs_crit(fs_info, 50 "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u", 51 bioc->logical, bioc->full_stripe_logical, bioc->size, 52 bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes, 53 bioc->replace_stripe_src, bioc->num_stripes); 54 for (int i = 0; i < bioc->num_stripes; i++) { 55 btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu", 56 i, bioc->stripes[i].dev->devid, 57 bioc->stripes[i].physical); 58 } 59 } 60 61 static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, 62 const struct btrfs_raid_bio *rbio) 63 { 64 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) 65 return; 66 67 dump_bioc(fs_info, rbio->bioc); 68 btrfs_crit(fs_info, 69 "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", 70 rbio->flags, rbio->nr_sectors, rbio->nr_data, 71 rbio->real_stripes, rbio->stripe_nsectors, 72 rbio->scrubp, rbio->dbitmap); 73 } 74 75 #define ASSERT_RBIO(expr, rbio) \ 76 ({ \ 77 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ 78 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ 79 (rbio)->bioc->fs_info : NULL; \ 80 \ 81 btrfs_dump_rbio(__fs_info, (rbio)); \ 82 } \ 83 ASSERT((expr)); \ 84 }) 85 86 #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \ 87 ({ \ 88 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ 89 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ 90 (rbio)->bioc->fs_info : NULL; \ 91 \ 92 btrfs_dump_rbio(__fs_info, (rbio)); \ 93 btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \ 94 } \ 95 ASSERT((expr)); \ 96 }) 97 98 #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \ 99 ({ \ 100 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ 101 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ 102 (rbio)->bioc->fs_info : NULL; \ 103 \ 104 btrfs_dump_rbio(__fs_info, (rbio)); \ 105 btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \ 106 } \ 107 ASSERT((expr)); \ 108 }) 109 110 #define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \ 111 ({ \ 112 if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ 113 const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? 
\ 114 (rbio)->bioc->fs_info : NULL; \ 115 \ 116 btrfs_dump_rbio(__fs_info, (rbio)); \ 117 btrfs_crit(__fs_info, "logical=%llu", (logical)); \ 118 } \ 119 ASSERT((expr)); \ 120 }) 121 122 /* Used by the raid56 code to lock stripes for read/modify/write */ 123 struct btrfs_stripe_hash { 124 struct list_head hash_list; 125 spinlock_t lock; 126 }; 127 128 /* Used by the raid56 code to lock stripes for read/modify/write */ 129 struct btrfs_stripe_hash_table { 130 struct list_head stripe_cache; 131 spinlock_t cache_lock; 132 int cache_size; 133 struct btrfs_stripe_hash table[]; 134 }; 135 136 /* 137 * A structure to present a sector inside a page, the length is fixed to 138 * sectorsize; 139 */ 140 struct sector_ptr { 141 /* 142 * Blocks from the bio list can still be highmem. 143 * So here we use physical address to present a page and the offset inside it. 144 */ 145 phys_addr_t paddr; 146 bool has_paddr; 147 bool uptodate; 148 }; 149 150 static void rmw_rbio_work(struct work_struct *work); 151 static void rmw_rbio_work_locked(struct work_struct *work); 152 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 153 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 154 155 static int finish_parity_scrub(struct btrfs_raid_bio *rbio); 156 static void scrub_rbio_work_locked(struct work_struct *work); 157 158 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) 159 { 160 bitmap_free(rbio->error_bitmap); 161 kfree(rbio->stripe_pages); 162 kfree(rbio->bio_sectors); 163 kfree(rbio->stripe_sectors); 164 kfree(rbio->finish_pointers); 165 } 166 167 static void free_raid_bio(struct btrfs_raid_bio *rbio) 168 { 169 int i; 170 171 if (!refcount_dec_and_test(&rbio->refs)) 172 return; 173 174 WARN_ON(!list_empty(&rbio->stripe_cache)); 175 WARN_ON(!list_empty(&rbio->hash_list)); 176 WARN_ON(!bio_list_empty(&rbio->bio_list)); 177 178 for (i = 0; i < rbio->nr_pages; i++) { 179 if (rbio->stripe_pages[i]) { 180 __free_page(rbio->stripe_pages[i]); 181 rbio->stripe_pages[i] = NULL; 182 } 183 } 184 185 btrfs_put_bioc(rbio->bioc); 186 free_raid_bio_pointers(rbio); 187 kfree(rbio); 188 } 189 190 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) 191 { 192 INIT_WORK(&rbio->work, work_func); 193 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 194 } 195 196 /* 197 * the stripe hash table is used for locking, and to collect 198 * bios in hopes of making a full stripe 199 */ 200 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 201 { 202 struct btrfs_stripe_hash_table *table; 203 struct btrfs_stripe_hash_table *x; 204 struct btrfs_stripe_hash *cur; 205 struct btrfs_stripe_hash *h; 206 unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; 207 208 if (info->stripe_hash_table) 209 return 0; 210 211 /* 212 * The table is large, starting with order 4 and can go as high as 213 * order 7 in case lock debugging is turned on. 214 * 215 * Try harder to allocate and fallback to vmalloc to lower the chance 216 * of a failing mount. 
217 */ 218 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 219 if (!table) 220 return -ENOMEM; 221 222 spin_lock_init(&table->cache_lock); 223 INIT_LIST_HEAD(&table->stripe_cache); 224 225 h = table->table; 226 227 for (unsigned int i = 0; i < num_entries; i++) { 228 cur = h + i; 229 INIT_LIST_HEAD(&cur->hash_list); 230 spin_lock_init(&cur->lock); 231 } 232 233 x = cmpxchg(&info->stripe_hash_table, NULL, table); 234 kvfree(x); 235 return 0; 236 } 237 238 static void memcpy_sectors(const struct sector_ptr *dst, 239 const struct sector_ptr *src, u32 blocksize) 240 { 241 memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), 242 phys_to_page(src->paddr), offset_in_page(src->paddr), 243 blocksize); 244 } 245 246 /* 247 * caching an rbio means to copy anything from the 248 * bio_sectors array into the stripe_pages array. We 249 * use the page uptodate bit in the stripe cache array 250 * to indicate if it has valid data 251 * 252 * once the caching is done, we set the cache ready 253 * bit. 254 */ 255 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 256 { 257 int i; 258 int ret; 259 260 ret = alloc_rbio_pages(rbio); 261 if (ret) 262 return; 263 264 for (i = 0; i < rbio->nr_sectors; i++) { 265 /* Some range not covered by bio (partial write), skip it */ 266 if (!rbio->bio_sectors[i].has_paddr) { 267 /* 268 * Even if the sector is not covered by bio, if it is 269 * a data sector it should still be uptodate as it is 270 * read from disk. 271 */ 272 if (i < rbio->nr_data * rbio->stripe_nsectors) 273 ASSERT(rbio->stripe_sectors[i].uptodate); 274 continue; 275 } 276 277 memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], 278 rbio->bioc->fs_info->sectorsize); 279 rbio->stripe_sectors[i].uptodate = 1; 280 } 281 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 282 } 283 284 /* 285 * we hash on the first logical address of the stripe 286 */ 287 static int rbio_bucket(struct btrfs_raid_bio *rbio) 288 { 289 u64 num = rbio->bioc->full_stripe_logical; 290 291 /* 292 * we shift down quite a bit. We're using byte 293 * addressing, and most of the lower bits are zeros. 294 * This tends to upset hash_64, and it consistently 295 * returns just one or two different values. 296 * 297 * shifting off the lower bits fixes things. 298 */ 299 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 300 } 301 302 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, 303 unsigned int page_nr) 304 { 305 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 306 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 307 int i; 308 309 ASSERT(page_nr < rbio->nr_pages); 310 311 for (i = sectors_per_page * page_nr; 312 i < sectors_per_page * page_nr + sectors_per_page; 313 i++) { 314 if (!rbio->stripe_sectors[i].uptodate) 315 return false; 316 } 317 return true; 318 } 319 320 /* 321 * Update the stripe_sectors[] array to use correct page and pgoff 322 * 323 * Should be called every time any page pointer in stripes_pages[] got modified. 
324 */ 325 static void index_stripe_sectors(struct btrfs_raid_bio *rbio) 326 { 327 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 328 u32 offset; 329 int i; 330 331 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { 332 int page_index = offset >> PAGE_SHIFT; 333 334 ASSERT(page_index < rbio->nr_pages); 335 if (!rbio->stripe_pages[page_index]) 336 continue; 337 338 rbio->stripe_sectors[i].has_paddr = true; 339 rbio->stripe_sectors[i].paddr = 340 page_to_phys(rbio->stripe_pages[page_index]) + 341 offset_in_page(offset); 342 } 343 } 344 345 static void steal_rbio_page(struct btrfs_raid_bio *src, 346 struct btrfs_raid_bio *dest, int page_nr) 347 { 348 const u32 sectorsize = src->bioc->fs_info->sectorsize; 349 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 350 int i; 351 352 if (dest->stripe_pages[page_nr]) 353 __free_page(dest->stripe_pages[page_nr]); 354 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; 355 src->stripe_pages[page_nr] = NULL; 356 357 /* Also update the sector->uptodate bits. */ 358 for (i = sectors_per_page * page_nr; 359 i < sectors_per_page * page_nr + sectors_per_page; i++) 360 dest->stripe_sectors[i].uptodate = true; 361 } 362 363 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) 364 { 365 const int sector_nr = (page_nr << PAGE_SHIFT) >> 366 rbio->bioc->fs_info->sectorsize_bits; 367 368 /* 369 * We have ensured PAGE_SIZE is aligned with sectorsize, thus 370 * we won't have a page which is half data half parity. 371 * 372 * Thus if the first sector of the page belongs to data stripes, then 373 * the full page belongs to data stripes. 374 */ 375 return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); 376 } 377 378 /* 379 * Stealing an rbio means taking all the uptodate pages from the stripe array 380 * in the source rbio and putting them into the destination rbio. 381 * 382 * This will also update the involved stripe_sectors[] which are referring to 383 * the old pages. 384 */ 385 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 386 { 387 int i; 388 389 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 390 return; 391 392 for (i = 0; i < dest->nr_pages; i++) { 393 struct page *p = src->stripe_pages[i]; 394 395 /* 396 * We don't need to steal P/Q pages as they will always be 397 * regenerated for RMW or full write anyway. 398 */ 399 if (!is_data_stripe_page(src, i)) 400 continue; 401 402 /* 403 * If @src already has RBIO_CACHE_READY_BIT, it should have 404 * all data stripe pages present and uptodate. 405 */ 406 ASSERT(p); 407 ASSERT(full_page_sectors_uptodate(src, i)); 408 steal_rbio_page(src, dest, i); 409 } 410 index_stripe_sectors(dest); 411 index_stripe_sectors(src); 412 } 413 414 /* 415 * merging means we take the bio_list from the victim and 416 * splice it into the destination. The victim should 417 * be discarded afterwards. 418 * 419 * must be called with dest->rbio_list_lock held 420 */ 421 static void merge_rbio(struct btrfs_raid_bio *dest, 422 struct btrfs_raid_bio *victim) 423 { 424 bio_list_merge_init(&dest->bio_list, &victim->bio_list); 425 dest->bio_list_bytes += victim->bio_list_bytes; 426 /* Also inherit the bitmaps from @victim. */ 427 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, 428 dest->stripe_nsectors); 429 } 430 431 /* 432 * used to prune items that are in the cache. The caller 433 * must hold the hash table lock. 
434 */ 435 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 436 { 437 int bucket = rbio_bucket(rbio); 438 struct btrfs_stripe_hash_table *table; 439 struct btrfs_stripe_hash *h; 440 int freeit = 0; 441 442 /* 443 * check the bit again under the hash table lock. 444 */ 445 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 446 return; 447 448 table = rbio->bioc->fs_info->stripe_hash_table; 449 h = table->table + bucket; 450 451 /* hold the lock for the bucket because we may be 452 * removing it from the hash table 453 */ 454 spin_lock(&h->lock); 455 456 /* 457 * hold the lock for the bio list because we need 458 * to make sure the bio list is empty 459 */ 460 spin_lock(&rbio->bio_list_lock); 461 462 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 463 list_del_init(&rbio->stripe_cache); 464 table->cache_size -= 1; 465 freeit = 1; 466 467 /* if the bio list isn't empty, this rbio is 468 * still involved in an IO. We take it out 469 * of the cache list, and drop the ref that 470 * was held for the list. 471 * 472 * If the bio_list was empty, we also remove 473 * the rbio from the hash_table, and drop 474 * the corresponding ref 475 */ 476 if (bio_list_empty(&rbio->bio_list)) { 477 if (!list_empty(&rbio->hash_list)) { 478 list_del_init(&rbio->hash_list); 479 refcount_dec(&rbio->refs); 480 BUG_ON(!list_empty(&rbio->plug_list)); 481 } 482 } 483 } 484 485 spin_unlock(&rbio->bio_list_lock); 486 spin_unlock(&h->lock); 487 488 if (freeit) 489 free_raid_bio(rbio); 490 } 491 492 /* 493 * prune a given rbio from the cache 494 */ 495 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 496 { 497 struct btrfs_stripe_hash_table *table; 498 499 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 500 return; 501 502 table = rbio->bioc->fs_info->stripe_hash_table; 503 504 spin_lock(&table->cache_lock); 505 __remove_rbio_from_cache(rbio); 506 spin_unlock(&table->cache_lock); 507 } 508 509 /* 510 * remove everything in the cache 511 */ 512 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 513 { 514 struct btrfs_stripe_hash_table *table; 515 struct btrfs_raid_bio *rbio; 516 517 table = info->stripe_hash_table; 518 519 spin_lock(&table->cache_lock); 520 while (!list_empty(&table->stripe_cache)) { 521 rbio = list_first_entry(&table->stripe_cache, 522 struct btrfs_raid_bio, stripe_cache); 523 __remove_rbio_from_cache(rbio); 524 } 525 spin_unlock(&table->cache_lock); 526 } 527 528 /* 529 * remove all cached entries and free the hash table 530 * used by unmount 531 */ 532 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 533 { 534 if (!info->stripe_hash_table) 535 return; 536 btrfs_clear_rbio_cache(info); 537 kvfree(info->stripe_hash_table); 538 info->stripe_hash_table = NULL; 539 } 540 541 /* 542 * insert an rbio into the stripe cache. It 543 * must have already been prepared by calling 544 * cache_rbio_pages 545 * 546 * If this rbio was already cached, it gets 547 * moved to the front of the lru. 548 * 549 * If the size of the rbio cache is too big, we 550 * prune an item. 
551 */ 552 static void cache_rbio(struct btrfs_raid_bio *rbio) 553 { 554 struct btrfs_stripe_hash_table *table; 555 556 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 557 return; 558 559 table = rbio->bioc->fs_info->stripe_hash_table; 560 561 spin_lock(&table->cache_lock); 562 spin_lock(&rbio->bio_list_lock); 563 564 /* bump our ref if we were not in the list before */ 565 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 566 refcount_inc(&rbio->refs); 567 568 if (!list_empty(&rbio->stripe_cache)) { 569 list_move(&rbio->stripe_cache, &table->stripe_cache); 570 } else { 571 list_add(&rbio->stripe_cache, &table->stripe_cache); 572 table->cache_size += 1; 573 } 574 575 spin_unlock(&rbio->bio_list_lock); 576 577 if (table->cache_size > RBIO_CACHE_SIZE) { 578 struct btrfs_raid_bio *found; 579 580 found = list_last_entry(&table->stripe_cache, 581 struct btrfs_raid_bio, 582 stripe_cache); 583 584 if (found != rbio) 585 __remove_rbio_from_cache(found); 586 } 587 588 spin_unlock(&table->cache_lock); 589 } 590 591 /* 592 * helper function to run the xor_blocks API. It is only 593 * able to do MAX_XOR_BLOCKS at a time, so we need to 594 * loop through. 595 */ 596 static void run_xor(void **pages, int src_cnt, ssize_t len) 597 { 598 int src_off = 0; 599 int xor_src_cnt = 0; 600 void *dest = pages[src_cnt]; 601 602 while (src_cnt > 0) { 603 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 604 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 605 606 src_cnt -= xor_src_cnt; 607 src_off += xor_src_cnt; 608 } 609 } 610 611 /* 612 * Returns true if the bio list inside this rbio covers an entire stripe (no 613 * rmw required). 614 */ 615 static int rbio_is_full(struct btrfs_raid_bio *rbio) 616 { 617 unsigned long size = rbio->bio_list_bytes; 618 int ret = 1; 619 620 spin_lock(&rbio->bio_list_lock); 621 if (size != rbio->nr_data * BTRFS_STRIPE_LEN) 622 ret = 0; 623 BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); 624 spin_unlock(&rbio->bio_list_lock); 625 626 return ret; 627 } 628 629 /* 630 * returns 1 if it is safe to merge two rbios together. 631 * The merging is safe if the two rbios correspond to 632 * the same stripe and if they are both going in the same 633 * direction (read vs write), and if neither one is 634 * locked for final IO 635 * 636 * The caller is responsible for locking such that 637 * rmw_locked is safe to test 638 */ 639 static int rbio_can_merge(struct btrfs_raid_bio *last, 640 struct btrfs_raid_bio *cur) 641 { 642 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 643 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 644 return 0; 645 646 /* 647 * we can't merge with cached rbios, since the 648 * idea is that when we merge the destination 649 * rbio is going to run our IO for us. We can 650 * steal from cached rbios though, other functions 651 * handle that. 652 */ 653 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 654 test_bit(RBIO_CACHE_BIT, &cur->flags)) 655 return 0; 656 657 if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) 658 return 0; 659 660 /* we can't merge with different operations */ 661 if (last->operation != cur->operation) 662 return 0; 663 /* 664 * We need to read the full stripe from the drive, 665 * check and repair the parity, and write the new results. 666 * 667 * We're not allowed to add any new bios to the 668 * bio list here, anyone else that wants to 669 * change this stripe needs to do their own rmw.
670 */ 671 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 672 return 0; 673 674 if (last->operation == BTRFS_RBIO_READ_REBUILD) 675 return 0; 676 677 return 1; 678 } 679 680 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, 681 unsigned int stripe_nr, 682 unsigned int sector_nr) 683 { 684 ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); 685 ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); 686 687 return stripe_nr * rbio->stripe_nsectors + sector_nr; 688 } 689 690 /* Return a sector from rbio->stripe_sectors, not from the bio list */ 691 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, 692 unsigned int stripe_nr, 693 unsigned int sector_nr) 694 { 695 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, 696 sector_nr)]; 697 } 698 699 /* Grab a sector inside P stripe */ 700 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, 701 unsigned int sector_nr) 702 { 703 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); 704 } 705 706 /* Grab a sector inside Q stripe, return NULL if not RAID6 */ 707 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, 708 unsigned int sector_nr) 709 { 710 if (rbio->nr_data + 1 == rbio->real_stripes) 711 return NULL; 712 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); 713 } 714 715 /* 716 * The first stripe in the table for a logical address 717 * has the lock. rbios are added in one of three ways: 718 * 719 * 1) Nobody has the stripe locked yet. The rbio is given 720 * the lock and 0 is returned. The caller must start the IO 721 * themselves. 722 * 723 * 2) Someone has the stripe locked, but we're able to merge 724 * with the lock owner. The rbio is freed and the IO will 725 * start automatically along with the existing rbio. 1 is returned. 726 * 727 * 3) Someone has the stripe locked, but we're not able to merge. 728 * The rbio is added to the lock owner's plug list, or merged into 729 * an rbio already on the plug list. When the lock owner unlocks, 730 * the next rbio on the list is run and the IO is started automatically. 731 * 1 is returned 732 * 733 * If we return 0, the caller still owns the rbio and must continue with 734 * IO submission. If we return 1, the caller must assume the rbio has 735 * already been freed. 736 */ 737 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 738 { 739 struct btrfs_stripe_hash *h; 740 struct btrfs_raid_bio *cur; 741 struct btrfs_raid_bio *pending; 742 struct btrfs_raid_bio *freeit = NULL; 743 struct btrfs_raid_bio *cache_drop = NULL; 744 int ret = 0; 745 746 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 747 748 spin_lock(&h->lock); 749 list_for_each_entry(cur, &h->hash_list, hash_list) { 750 if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) 751 continue; 752 753 spin_lock(&cur->bio_list_lock); 754 755 /* Can we steal this cached rbio's pages? */ 756 if (bio_list_empty(&cur->bio_list) && 757 list_empty(&cur->plug_list) && 758 test_bit(RBIO_CACHE_BIT, &cur->flags) && 759 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 760 list_del_init(&cur->hash_list); 761 refcount_dec(&cur->refs); 762 763 steal_rbio(cur, rbio); 764 cache_drop = cur; 765 spin_unlock(&cur->bio_list_lock); 766 767 goto lockit; 768 } 769 770 /* Can we merge into the lock owner? 
*/ 771 if (rbio_can_merge(cur, rbio)) { 772 merge_rbio(cur, rbio); 773 spin_unlock(&cur->bio_list_lock); 774 freeit = rbio; 775 ret = 1; 776 goto out; 777 } 778 779 780 /* 781 * We couldn't merge with the running rbio, see if we can merge 782 * with the pending ones. We don't have to check for rmw_locked 783 * because there is no way they are inside finish_rmw right now 784 */ 785 list_for_each_entry(pending, &cur->plug_list, plug_list) { 786 if (rbio_can_merge(pending, rbio)) { 787 merge_rbio(pending, rbio); 788 spin_unlock(&cur->bio_list_lock); 789 freeit = rbio; 790 ret = 1; 791 goto out; 792 } 793 } 794 795 /* 796 * No merging, put us on the tail of the plug list, our rbio 797 * will be started with the currently running rbio unlocks 798 */ 799 list_add_tail(&rbio->plug_list, &cur->plug_list); 800 spin_unlock(&cur->bio_list_lock); 801 ret = 1; 802 goto out; 803 } 804 lockit: 805 refcount_inc(&rbio->refs); 806 list_add(&rbio->hash_list, &h->hash_list); 807 out: 808 spin_unlock(&h->lock); 809 if (cache_drop) 810 remove_rbio_from_cache(cache_drop); 811 if (freeit) 812 free_raid_bio(freeit); 813 return ret; 814 } 815 816 static void recover_rbio_work_locked(struct work_struct *work); 817 818 /* 819 * called as rmw or parity rebuild is completed. If the plug list has more 820 * rbios waiting for this stripe, the next one on the list will be started 821 */ 822 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 823 { 824 int bucket; 825 struct btrfs_stripe_hash *h; 826 int keep_cache = 0; 827 828 bucket = rbio_bucket(rbio); 829 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; 830 831 if (list_empty(&rbio->plug_list)) 832 cache_rbio(rbio); 833 834 spin_lock(&h->lock); 835 spin_lock(&rbio->bio_list_lock); 836 837 if (!list_empty(&rbio->hash_list)) { 838 /* 839 * if we're still cached and there is no other IO 840 * to perform, just leave this rbio here for others 841 * to steal from later 842 */ 843 if (list_empty(&rbio->plug_list) && 844 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 845 keep_cache = 1; 846 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 847 BUG_ON(!bio_list_empty(&rbio->bio_list)); 848 goto done; 849 } 850 851 list_del_init(&rbio->hash_list); 852 refcount_dec(&rbio->refs); 853 854 /* 855 * we use the plug list to hold all the rbios 856 * waiting for the chance to lock this stripe. 857 * hand the lock over to one of them. 
858 */ 859 if (!list_empty(&rbio->plug_list)) { 860 struct btrfs_raid_bio *next; 861 struct list_head *head = rbio->plug_list.next; 862 863 next = list_entry(head, struct btrfs_raid_bio, 864 plug_list); 865 866 list_del_init(&rbio->plug_list); 867 868 list_add(&next->hash_list, &h->hash_list); 869 refcount_inc(&next->refs); 870 spin_unlock(&rbio->bio_list_lock); 871 spin_unlock(&h->lock); 872 873 if (next->operation == BTRFS_RBIO_READ_REBUILD) { 874 start_async_work(next, recover_rbio_work_locked); 875 } else if (next->operation == BTRFS_RBIO_WRITE) { 876 steal_rbio(rbio, next); 877 start_async_work(next, rmw_rbio_work_locked); 878 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 879 steal_rbio(rbio, next); 880 start_async_work(next, scrub_rbio_work_locked); 881 } 882 883 goto done_nolock; 884 } 885 } 886 done: 887 spin_unlock(&rbio->bio_list_lock); 888 spin_unlock(&h->lock); 889 890 done_nolock: 891 if (!keep_cache) 892 remove_rbio_from_cache(rbio); 893 } 894 895 static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) 896 { 897 struct bio *next; 898 899 while (cur) { 900 next = cur->bi_next; 901 cur->bi_next = NULL; 902 cur->bi_status = status; 903 bio_endio(cur); 904 cur = next; 905 } 906 } 907 908 /* 909 * this frees the rbio and runs through all the bios in the 910 * bio_list and calls end_io on them 911 */ 912 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) 913 { 914 struct bio *cur = bio_list_get(&rbio->bio_list); 915 struct bio *extra; 916 917 kfree(rbio->csum_buf); 918 bitmap_free(rbio->csum_bitmap); 919 rbio->csum_buf = NULL; 920 rbio->csum_bitmap = NULL; 921 922 /* 923 * Clear the data bitmap, as the rbio may be cached for later usage. 924 * do this before before unlock_stripe() so there will be no new bio 925 * for this bio. 926 */ 927 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); 928 929 /* 930 * At this moment, rbio->bio_list is empty, however since rbio does not 931 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 932 * hash list, rbio may be merged with others so that rbio->bio_list 933 * becomes non-empty. 934 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 935 * more and we can call bio_endio() on all queued bios. 936 */ 937 unlock_stripe(rbio); 938 extra = bio_list_get(&rbio->bio_list); 939 free_raid_bio(rbio); 940 941 rbio_endio_bio_list(cur, status); 942 if (extra) 943 rbio_endio_bio_list(extra, status); 944 } 945 946 /* 947 * Get a sector pointer specified by its @stripe_nr and @sector_nr. 948 * 949 * @rbio: The raid bio 950 * @stripe_nr: Stripe number, valid range [0, real_stripe) 951 * @sector_nr: Sector number inside the stripe, 952 * valid range [0, stripe_nsectors) 953 * @bio_list_only: Whether to use sectors inside the bio list only. 954 * 955 * The read/modify/write code wants to reuse the original bio page as much 956 * as possible, and only use stripe_sectors as fallback. 
957 */ 958 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, 959 int stripe_nr, int sector_nr, 960 bool bio_list_only) 961 { 962 struct sector_ptr *sector; 963 int index; 964 965 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, 966 rbio, stripe_nr); 967 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, 968 rbio, sector_nr); 969 970 index = stripe_nr * rbio->stripe_nsectors + sector_nr; 971 ASSERT(index >= 0 && index < rbio->nr_sectors); 972 973 spin_lock(&rbio->bio_list_lock); 974 sector = &rbio->bio_sectors[index]; 975 if (sector->has_paddr || bio_list_only) { 976 /* Don't return sector without a valid page pointer */ 977 if (!sector->has_paddr) 978 sector = NULL; 979 spin_unlock(&rbio->bio_list_lock); 980 return sector; 981 } 982 spin_unlock(&rbio->bio_list_lock); 983 984 return &rbio->stripe_sectors[index]; 985 } 986 987 /* 988 * allocation and initial setup for the btrfs_raid_bio. Note 989 * that this does not allocate any pages for rbio->pages. 990 */ 991 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 992 struct btrfs_io_context *bioc) 993 { 994 const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; 995 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; 996 const unsigned int num_pages = stripe_npages * real_stripes; 997 const unsigned int stripe_nsectors = 998 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; 999 const unsigned int num_sectors = stripe_nsectors * real_stripes; 1000 struct btrfs_raid_bio *rbio; 1001 1002 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ 1003 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); 1004 /* 1005 * Our current stripe len should be fixed to 64k thus stripe_nsectors 1006 * (at most 16) should be no larger than BITS_PER_LONG. 1007 */ 1008 ASSERT(stripe_nsectors <= BITS_PER_LONG); 1009 1010 /* 1011 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 1012 * (limited by u8).
1013 */ 1014 ASSERT(real_stripes >= 2); 1015 ASSERT(real_stripes <= U8_MAX); 1016 1017 rbio = kzalloc(sizeof(*rbio), GFP_NOFS); 1018 if (!rbio) 1019 return ERR_PTR(-ENOMEM); 1020 rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), 1021 GFP_NOFS); 1022 rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), 1023 GFP_NOFS); 1024 rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), 1025 GFP_NOFS); 1026 rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); 1027 rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); 1028 1029 if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || 1030 !rbio->finish_pointers || !rbio->error_bitmap) { 1031 free_raid_bio_pointers(rbio); 1032 kfree(rbio); 1033 return ERR_PTR(-ENOMEM); 1034 } 1035 1036 bio_list_init(&rbio->bio_list); 1037 init_waitqueue_head(&rbio->io_wait); 1038 INIT_LIST_HEAD(&rbio->plug_list); 1039 spin_lock_init(&rbio->bio_list_lock); 1040 INIT_LIST_HEAD(&rbio->stripe_cache); 1041 INIT_LIST_HEAD(&rbio->hash_list); 1042 btrfs_get_bioc(bioc); 1043 rbio->bioc = bioc; 1044 rbio->nr_pages = num_pages; 1045 rbio->nr_sectors = num_sectors; 1046 rbio->real_stripes = real_stripes; 1047 rbio->stripe_npages = stripe_npages; 1048 rbio->stripe_nsectors = stripe_nsectors; 1049 refcount_set(&rbio->refs, 1); 1050 atomic_set(&rbio->stripes_pending, 0); 1051 1052 ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); 1053 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); 1054 ASSERT(rbio->nr_data > 0); 1055 1056 return rbio; 1057 } 1058 1059 /* allocate pages for all the stripes in the bio, including parity */ 1060 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1061 { 1062 int ret; 1063 1064 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false); 1065 if (ret < 0) 1066 return ret; 1067 /* Mapping all sectors */ 1068 index_stripe_sectors(rbio); 1069 return 0; 1070 } 1071 1072 /* only allocate pages for p/q stripes */ 1073 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1074 { 1075 const int data_pages = rbio->nr_data * rbio->stripe_npages; 1076 int ret; 1077 1078 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, 1079 rbio->stripe_pages + data_pages, false); 1080 if (ret < 0) 1081 return ret; 1082 1083 index_stripe_sectors(rbio); 1084 return 0; 1085 } 1086 1087 /* 1088 * Return the total number of errors found in the vertical stripe of @sector_nr. 1089 * 1090 * @faila and @failb will also be updated to the first and second stripe 1091 * number of the errors. 1092 */ 1093 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, 1094 int *faila, int *failb) 1095 { 1096 int stripe_nr; 1097 int found_errors = 0; 1098 1099 if (faila || failb) { 1100 /* 1101 * Both @faila and @failb should be valid pointers if any of 1102 * them is specified. 1103 */ 1104 ASSERT(faila && failb); 1105 *faila = -1; 1106 *failb = -1; 1107 } 1108 1109 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1110 int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; 1111 1112 if (test_bit(total_sector_nr, rbio->error_bitmap)) { 1113 found_errors++; 1114 if (faila) { 1115 /* Update faila and failb. */ 1116 if (*faila < 0) 1117 *faila = stripe_nr; 1118 else if (*failb < 0) 1119 *failb = stripe_nr; 1120 } 1121 } 1122 } 1123 return found_errors; 1124 } 1125 1126 /* 1127 * Add a single sector @sector into our list of bios for IO. 1128 * 1129 * Return 0 if everything went well. 
1130 * Return <0 for error. 1131 */ 1132 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, 1133 struct bio_list *bio_list, 1134 struct sector_ptr *sector, 1135 unsigned int stripe_nr, 1136 unsigned int sector_nr, 1137 enum req_op op) 1138 { 1139 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1140 struct bio *last = bio_list->tail; 1141 int ret; 1142 struct bio *bio; 1143 struct btrfs_io_stripe *stripe; 1144 u64 disk_start; 1145 1146 /* 1147 * Note: here stripe_nr has taken device replace into consideration, 1148 * thus it can be larger than rbio->real_stripe. 1149 * So here we check against bioc->num_stripes, not rbio->real_stripes. 1150 */ 1151 ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes, 1152 rbio, stripe_nr); 1153 ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, 1154 rbio, sector_nr); 1155 ASSERT(sector->has_paddr); 1156 1157 stripe = &rbio->bioc->stripes[stripe_nr]; 1158 disk_start = stripe->physical + sector_nr * sectorsize; 1159 1160 /* if the device is missing, just fail this stripe */ 1161 if (!stripe->dev->bdev) { 1162 int found_errors; 1163 1164 set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, 1165 rbio->error_bitmap); 1166 1167 /* Check if we have reached tolerance early. */ 1168 found_errors = get_rbio_veritical_errors(rbio, sector_nr, 1169 NULL, NULL); 1170 if (found_errors > rbio->bioc->max_errors) 1171 return -EIO; 1172 return 0; 1173 } 1174 1175 /* see if we can add this page onto our existing bio */ 1176 if (last) { 1177 u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; 1178 last_end += last->bi_iter.bi_size; 1179 1180 /* 1181 * we can't merge these if they are from different 1182 * devices or if they are not contiguous 1183 */ 1184 if (last_end == disk_start && !last->bi_status && 1185 last->bi_bdev == stripe->dev->bdev) { 1186 ret = bio_add_page(last, phys_to_page(sector->paddr), 1187 sectorsize, offset_in_page(sector->paddr)); 1188 if (ret == sectorsize) 1189 return 0; 1190 } 1191 } 1192 1193 /* put a new bio on the list */ 1194 bio = bio_alloc(stripe->dev->bdev, 1195 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), 1196 op, GFP_NOFS); 1197 bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; 1198 bio->bi_private = rbio; 1199 1200 __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, 1201 offset_in_page(sector->paddr)); 1202 bio_list_add(bio_list, bio); 1203 return 0; 1204 } 1205 1206 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 1207 { 1208 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1209 const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; 1210 struct bvec_iter iter = bio->bi_iter; 1211 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1212 rbio->bioc->full_stripe_logical; 1213 1214 while (iter.bi_size) { 1215 unsigned int index = (offset >> sectorsize_bits); 1216 struct sector_ptr *sector = &rbio->bio_sectors[index]; 1217 struct bio_vec bv = bio_iter_iovec(bio, iter); 1218 1219 sector->has_paddr = true; 1220 sector->paddr = bvec_phys(&bv); 1221 bio_advance_iter_single(bio, &iter, sectorsize); 1222 offset += sectorsize; 1223 } 1224 } 1225 1226 /* 1227 * helper function to walk our bio list and populate the bio_pages array with 1228 * the result. This seems expensive, but it is faster than constantly 1229 * searching through the bio list as we setup the IO in finish_rmw or stripe 1230 * reconstruction. 
1231 * 1232 * This must be called before you trust the answers from page_in_rbio 1233 */ 1234 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1235 { 1236 struct bio *bio; 1237 1238 spin_lock(&rbio->bio_list_lock); 1239 bio_list_for_each(bio, &rbio->bio_list) 1240 index_one_bio(rbio, bio); 1241 1242 spin_unlock(&rbio->bio_list_lock); 1243 } 1244 1245 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, 1246 struct raid56_bio_trace_info *trace_info) 1247 { 1248 const struct btrfs_io_context *bioc = rbio->bioc; 1249 int i; 1250 1251 ASSERT(bioc); 1252 1253 /* We rely on bio->bi_bdev to find the stripe number. */ 1254 if (!bio->bi_bdev) 1255 goto not_found; 1256 1257 for (i = 0; i < bioc->num_stripes; i++) { 1258 if (bio->bi_bdev != bioc->stripes[i].dev->bdev) 1259 continue; 1260 trace_info->stripe_nr = i; 1261 trace_info->devid = bioc->stripes[i].dev->devid; 1262 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1263 bioc->stripes[i].physical; 1264 return; 1265 } 1266 1267 not_found: 1268 trace_info->devid = -1; 1269 trace_info->offset = -1; 1270 trace_info->stripe_nr = -1; 1271 } 1272 1273 static inline void bio_list_put(struct bio_list *bio_list) 1274 { 1275 struct bio *bio; 1276 1277 while ((bio = bio_list_pop(bio_list))) 1278 bio_put(bio); 1279 } 1280 1281 static void assert_rbio(struct btrfs_raid_bio *rbio) 1282 { 1283 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) 1284 return; 1285 1286 /* 1287 * At least two stripes (2 disks RAID5), and since real_stripes is U8, 1288 * we won't go beyond 256 disks anyway. 1289 */ 1290 ASSERT_RBIO(rbio->real_stripes >= 2, rbio); 1291 ASSERT_RBIO(rbio->nr_data > 0, rbio); 1292 1293 /* 1294 * This is another check to make sure nr data stripes is smaller 1295 * than total stripes. 1296 */ 1297 ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); 1298 } 1299 1300 static inline void *kmap_local_sector(const struct sector_ptr *sector) 1301 { 1302 /* The sector pointer must have a page mapped to it. */ 1303 ASSERT(sector->has_paddr); 1304 1305 return kmap_local_page(phys_to_page(sector->paddr)) + 1306 offset_in_page(sector->paddr); 1307 } 1308 1309 /* Generate PQ for one vertical stripe. 
*/ 1310 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) 1311 { 1312 void **pointers = rbio->finish_pointers; 1313 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1314 struct sector_ptr *sector; 1315 int stripe; 1316 const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; 1317 1318 /* First collect one sector from each data stripe */ 1319 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1320 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1321 pointers[stripe] = kmap_local_sector(sector); 1322 } 1323 1324 /* Then add the parity stripe */ 1325 sector = rbio_pstripe_sector(rbio, sectornr); 1326 sector->uptodate = 1; 1327 pointers[stripe++] = kmap_local_sector(sector); 1328 1329 if (has_qstripe) { 1330 /* 1331 * RAID6, add the qstripe and call the library function 1332 * to fill in our p/q 1333 */ 1334 sector = rbio_qstripe_sector(rbio, sectornr); 1335 sector->uptodate = 1; 1336 pointers[stripe++] = kmap_local_sector(sector); 1337 1338 assert_rbio(rbio); 1339 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 1340 pointers); 1341 } else { 1342 /* raid5 */ 1343 memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); 1344 run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); 1345 } 1346 for (stripe = stripe - 1; stripe >= 0; stripe--) 1347 kunmap_local(pointers[stripe]); 1348 } 1349 1350 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, 1351 struct bio_list *bio_list) 1352 { 1353 /* The total sector number inside the full stripe. */ 1354 int total_sector_nr; 1355 int sectornr; 1356 int stripe; 1357 int ret; 1358 1359 ASSERT(bio_list_size(bio_list) == 0); 1360 1361 /* We should have at least one data sector. */ 1362 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); 1363 1364 /* 1365 * Reset errors, as we may have errors inherited from from degraded 1366 * write. 1367 */ 1368 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 1369 1370 /* 1371 * Start assembly. Make bios for everything from the higher layers (the 1372 * bio_list in our rbio) and our P/Q. Ignore everything else. 1373 */ 1374 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1375 total_sector_nr++) { 1376 struct sector_ptr *sector; 1377 1378 stripe = total_sector_nr / rbio->stripe_nsectors; 1379 sectornr = total_sector_nr % rbio->stripe_nsectors; 1380 1381 /* This vertical stripe has no data, skip it. */ 1382 if (!test_bit(sectornr, &rbio->dbitmap)) 1383 continue; 1384 1385 if (stripe < rbio->nr_data) { 1386 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1387 if (!sector) 1388 continue; 1389 } else { 1390 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1391 } 1392 1393 ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, 1394 sectornr, REQ_OP_WRITE); 1395 if (ret) 1396 goto error; 1397 } 1398 1399 if (likely(!rbio->bioc->replace_nr_stripes)) 1400 return 0; 1401 1402 /* 1403 * Make a copy for the replace target device. 1404 * 1405 * Thus the source stripe number (in replace_stripe_src) should be valid. 1406 */ 1407 ASSERT(rbio->bioc->replace_stripe_src >= 0); 1408 1409 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1410 total_sector_nr++) { 1411 struct sector_ptr *sector; 1412 1413 stripe = total_sector_nr / rbio->stripe_nsectors; 1414 sectornr = total_sector_nr % rbio->stripe_nsectors; 1415 1416 /* 1417 * For RAID56, there is only one device that can be replaced, 1418 * and replace_stripe_src[0] indicates the stripe number we 1419 * need to copy from. 
1420 */ 1421 if (stripe != rbio->bioc->replace_stripe_src) { 1422 /* 1423 * We can skip the whole stripe completely, note 1424 * total_sector_nr will be increased by one anyway. 1425 */ 1426 ASSERT(sectornr == 0); 1427 total_sector_nr += rbio->stripe_nsectors - 1; 1428 continue; 1429 } 1430 1431 /* This vertical stripe has no data, skip it. */ 1432 if (!test_bit(sectornr, &rbio->dbitmap)) 1433 continue; 1434 1435 if (stripe < rbio->nr_data) { 1436 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1437 if (!sector) 1438 continue; 1439 } else { 1440 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1441 } 1442 1443 ret = rbio_add_io_sector(rbio, bio_list, sector, 1444 rbio->real_stripes, 1445 sectornr, REQ_OP_WRITE); 1446 if (ret) 1447 goto error; 1448 } 1449 1450 return 0; 1451 error: 1452 bio_list_put(bio_list); 1453 return -EIO; 1454 } 1455 1456 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) 1457 { 1458 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1459 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1460 rbio->bioc->full_stripe_logical; 1461 int total_nr_sector = offset >> fs_info->sectorsize_bits; 1462 1463 ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); 1464 1465 bitmap_set(rbio->error_bitmap, total_nr_sector, 1466 bio->bi_iter.bi_size >> fs_info->sectorsize_bits); 1467 1468 /* 1469 * Special handling for raid56_alloc_missing_rbio() used by 1470 * scrub/replace. Unlike call path in raid56_parity_recover(), they 1471 * pass an empty bio here. Thus we have to find out the missing device 1472 * and mark the stripe error instead. 1473 */ 1474 if (bio->bi_iter.bi_size == 0) { 1475 bool found_missing = false; 1476 int stripe_nr; 1477 1478 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1479 if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { 1480 found_missing = true; 1481 bitmap_set(rbio->error_bitmap, 1482 stripe_nr * rbio->stripe_nsectors, 1483 rbio->stripe_nsectors); 1484 } 1485 } 1486 ASSERT(found_missing); 1487 } 1488 } 1489 1490 /* 1491 * For subpage case, we can no longer set page Up-to-date directly for 1492 * stripe_pages[], thus we need to locate the sector. 1493 */ 1494 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 1495 phys_addr_t paddr) 1496 { 1497 int i; 1498 1499 for (i = 0; i < rbio->nr_sectors; i++) { 1500 struct sector_ptr *sector = &rbio->stripe_sectors[i]; 1501 1502 if (sector->has_paddr && sector->paddr == paddr) 1503 return sector; 1504 } 1505 return NULL; 1506 } 1507 1508 /* 1509 * this sets each page in the bio uptodate. 
It should only be used on private 1510 * rbio pages, nothing that comes in from the higher layers 1511 */ 1512 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 1513 { 1514 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1515 struct bio_vec *bvec; 1516 struct bvec_iter_all iter_all; 1517 1518 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1519 1520 bio_for_each_segment_all(bvec, bio, iter_all) { 1521 struct sector_ptr *sector; 1522 phys_addr_t paddr = bvec_phys(bvec); 1523 1524 for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { 1525 sector = find_stripe_sector(rbio, paddr + off); 1526 ASSERT(sector); 1527 if (sector) 1528 sector->uptodate = 1; 1529 } 1530 } 1531 } 1532 1533 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) 1534 { 1535 phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); 1536 int i; 1537 1538 for (i = 0; i < rbio->nr_sectors; i++) { 1539 if (rbio->stripe_sectors[i].paddr == bvec_paddr) 1540 break; 1541 if (rbio->bio_sectors[i].has_paddr && 1542 rbio->bio_sectors[i].paddr == bvec_paddr) 1543 break; 1544 } 1545 ASSERT(i < rbio->nr_sectors); 1546 return i; 1547 } 1548 1549 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) 1550 { 1551 int total_sector_nr = get_bio_sector_nr(rbio, bio); 1552 u32 bio_size = 0; 1553 struct bio_vec *bvec; 1554 int i; 1555 1556 bio_for_each_bvec_all(bvec, bio, i) 1557 bio_size += bvec->bv_len; 1558 1559 /* 1560 * Since we can have multiple bios touching the error_bitmap, we cannot 1561 * call bitmap_set() without protection. 1562 * 1563 * Instead use set_bit() for each bit, as set_bit() itself is atomic. 1564 */ 1565 for (i = total_sector_nr; i < total_sector_nr + 1566 (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) 1567 set_bit(i, rbio->error_bitmap); 1568 } 1569 1570 /* Verify the data sectors at read time. */ 1571 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, 1572 struct bio *bio) 1573 { 1574 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1575 int total_sector_nr = get_bio_sector_nr(rbio, bio); 1576 struct bio_vec *bvec; 1577 struct bvec_iter_all iter_all; 1578 1579 /* No data csum for the whole stripe, no need to verify. */ 1580 if (!rbio->csum_bitmap || !rbio->csum_buf) 1581 return; 1582 1583 /* P/Q stripes, they have no data csum to verify against. */ 1584 if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) 1585 return; 1586 1587 bio_for_each_segment_all(bvec, bio, iter_all) { 1588 void *kaddr; 1589 1590 kaddr = bvec_kmap_local(bvec); 1591 for (u32 off = 0; off < bvec->bv_len; 1592 off += fs_info->sectorsize, total_sector_nr++) { 1593 u8 csum_buf[BTRFS_CSUM_SIZE]; 1594 u8 *expected_csum = rbio->csum_buf + 1595 total_sector_nr * fs_info->csum_size; 1596 int ret; 1597 1598 /* No csum for this sector, skip to the next sector. 
*/ 1599 if (!test_bit(total_sector_nr, rbio->csum_bitmap)) 1600 continue; 1601 1602 ret = btrfs_check_sector_csum(fs_info, kaddr + off, 1603 csum_buf, expected_csum); 1604 if (ret < 0) 1605 set_bit(total_sector_nr, rbio->error_bitmap); 1606 } 1607 kunmap_local(kaddr); 1608 } 1609 } 1610 1611 static void raid_wait_read_end_io(struct bio *bio) 1612 { 1613 struct btrfs_raid_bio *rbio = bio->bi_private; 1614 1615 if (bio->bi_status) { 1616 rbio_update_error_bitmap(rbio, bio); 1617 } else { 1618 set_bio_pages_uptodate(rbio, bio); 1619 verify_bio_data_sectors(rbio, bio); 1620 } 1621 1622 bio_put(bio); 1623 if (atomic_dec_and_test(&rbio->stripes_pending)) 1624 wake_up(&rbio->io_wait); 1625 } 1626 1627 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, 1628 struct bio_list *bio_list) 1629 { 1630 struct bio *bio; 1631 1632 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 1633 while ((bio = bio_list_pop(bio_list))) { 1634 bio->bi_end_io = raid_wait_read_end_io; 1635 1636 if (trace_raid56_read_enabled()) { 1637 struct raid56_bio_trace_info trace_info = { 0 }; 1638 1639 bio_get_trace_info(rbio, bio, &trace_info); 1640 trace_raid56_read(rbio, bio, &trace_info); 1641 } 1642 submit_bio(bio); 1643 } 1644 1645 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 1646 } 1647 1648 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) 1649 { 1650 const int data_pages = rbio->nr_data * rbio->stripe_npages; 1651 int ret; 1652 1653 ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); 1654 if (ret < 0) 1655 return ret; 1656 1657 index_stripe_sectors(rbio); 1658 return 0; 1659 } 1660 1661 /* 1662 * We use plugging call backs to collect full stripes. 1663 * Any time we get a partial stripe write while plugged 1664 * we collect it into a list. When the unplug comes down, 1665 * we sort the list by logical block number and merge 1666 * everything we can into the same rbios 1667 */ 1668 struct btrfs_plug_cb { 1669 struct blk_plug_cb cb; 1670 struct btrfs_fs_info *info; 1671 struct list_head rbio_list; 1672 }; 1673 1674 /* 1675 * rbios on the plug list are sorted for easier merging. 1676 */ 1677 static int plug_cmp(void *priv, const struct list_head *a, 1678 const struct list_head *b) 1679 { 1680 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1681 plug_list); 1682 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1683 plug_list); 1684 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1685 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1686 1687 if (a_sector < b_sector) 1688 return -1; 1689 if (a_sector > b_sector) 1690 return 1; 1691 return 0; 1692 } 1693 1694 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1695 { 1696 struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); 1697 struct btrfs_raid_bio *cur; 1698 struct btrfs_raid_bio *last = NULL; 1699 1700 list_sort(NULL, &plug->rbio_list, plug_cmp); 1701 1702 while (!list_empty(&plug->rbio_list)) { 1703 cur = list_first_entry(&plug->rbio_list, 1704 struct btrfs_raid_bio, plug_list); 1705 list_del_init(&cur->plug_list); 1706 1707 if (rbio_is_full(cur)) { 1708 /* We have a full stripe, queue it down. 
*/ 1709 start_async_work(cur, rmw_rbio_work); 1710 continue; 1711 } 1712 if (last) { 1713 if (rbio_can_merge(last, cur)) { 1714 merge_rbio(last, cur); 1715 free_raid_bio(cur); 1716 continue; 1717 } 1718 start_async_work(last, rmw_rbio_work); 1719 } 1720 last = cur; 1721 } 1722 if (last) 1723 start_async_work(last, rmw_rbio_work); 1724 kfree(plug); 1725 } 1726 1727 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ 1728 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) 1729 { 1730 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1731 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; 1732 const u64 full_stripe_start = rbio->bioc->full_stripe_logical; 1733 const u32 orig_len = orig_bio->bi_iter.bi_size; 1734 const u32 sectorsize = fs_info->sectorsize; 1735 u64 cur_logical; 1736 1737 ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start && 1738 orig_logical + orig_len <= full_stripe_start + 1739 rbio->nr_data * BTRFS_STRIPE_LEN, 1740 rbio, orig_logical); 1741 1742 bio_list_add(&rbio->bio_list, orig_bio); 1743 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; 1744 1745 /* Update the dbitmap. */ 1746 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; 1747 cur_logical += sectorsize) { 1748 int bit = ((u32)(cur_logical - full_stripe_start) >> 1749 fs_info->sectorsize_bits) % rbio->stripe_nsectors; 1750 1751 set_bit(bit, &rbio->dbitmap); 1752 } 1753 } 1754 1755 /* 1756 * our main entry point for writes from the rest of the FS. 1757 */ 1758 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 1759 { 1760 struct btrfs_fs_info *fs_info = bioc->fs_info; 1761 struct btrfs_raid_bio *rbio; 1762 struct btrfs_plug_cb *plug = NULL; 1763 struct blk_plug_cb *cb; 1764 1765 rbio = alloc_rbio(fs_info, bioc); 1766 if (IS_ERR(rbio)) { 1767 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 1768 bio_endio(bio); 1769 return; 1770 } 1771 rbio->operation = BTRFS_RBIO_WRITE; 1772 rbio_add_bio(rbio, bio); 1773 1774 /* 1775 * Don't plug on full rbios, just get them out the door 1776 * as quickly as we can 1777 */ 1778 if (!rbio_is_full(rbio)) { 1779 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); 1780 if (cb) { 1781 plug = container_of(cb, struct btrfs_plug_cb, cb); 1782 if (!plug->info) { 1783 plug->info = fs_info; 1784 INIT_LIST_HEAD(&plug->rbio_list); 1785 } 1786 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1787 return; 1788 } 1789 } 1790 1791 /* 1792 * Either we don't have any existing plug, or we're doing a full stripe, 1793 * queue the rmw work now. 1794 */ 1795 start_async_work(rbio, rmw_rbio_work); 1796 } 1797 1798 static int verify_one_sector(struct btrfs_raid_bio *rbio, 1799 int stripe_nr, int sector_nr) 1800 { 1801 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1802 struct sector_ptr *sector; 1803 u8 csum_buf[BTRFS_CSUM_SIZE]; 1804 u8 *csum_expected; 1805 void *kaddr; 1806 int ret; 1807 1808 if (!rbio->csum_bitmap || !rbio->csum_buf) 1809 return 0; 1810 1811 /* No way to verify P/Q as they are not covered by data csum. */ 1812 if (stripe_nr >= rbio->nr_data) 1813 return 0; 1814 /* 1815 * If we're rebuilding a read, we have to use pages from the 1816 * bio list if possible. 
1817 */ 1818 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1819 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); 1820 } else { 1821 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); 1822 } 1823 1824 csum_expected = rbio->csum_buf + 1825 (stripe_nr * rbio->stripe_nsectors + sector_nr) * 1826 fs_info->csum_size; 1827 kaddr = kmap_local_sector(sector); 1828 ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); 1829 kunmap_local(kaddr); 1830 return ret; 1831 } 1832 1833 /* 1834 * Recover a vertical stripe specified by @sector_nr. 1835 * @*pointers are the pre-allocated pointers by the caller, so we don't 1836 * need to allocate/free the pointers again and again. 1837 */ 1838 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, 1839 void **pointers, void **unmap_array) 1840 { 1841 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1842 struct sector_ptr *sector; 1843 const u32 sectorsize = fs_info->sectorsize; 1844 int found_errors; 1845 int faila; 1846 int failb; 1847 int stripe_nr; 1848 int ret = 0; 1849 1850 /* 1851 * Now we just use bitmap to mark the horizontal stripes in 1852 * which we have data when doing parity scrub. 1853 */ 1854 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1855 !test_bit(sector_nr, &rbio->dbitmap)) 1856 return 0; 1857 1858 found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, 1859 &failb); 1860 /* 1861 * No errors in the vertical stripe, skip it. Can happen for recovery 1862 * which only part of a stripe failed csum check. 1863 */ 1864 if (!found_errors) 1865 return 0; 1866 1867 if (found_errors > rbio->bioc->max_errors) 1868 return -EIO; 1869 1870 /* 1871 * Setup our array of pointers with sectors from each stripe 1872 * 1873 * NOTE: store a duplicate array of pointers to preserve the 1874 * pointer order. 1875 */ 1876 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1877 /* 1878 * If we're rebuilding a read, we have to use pages from the 1879 * bio list if possible. 1880 */ 1881 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1882 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); 1883 } else { 1884 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); 1885 } 1886 pointers[stripe_nr] = kmap_local_sector(sector); 1887 unmap_array[stripe_nr] = pointers[stripe_nr]; 1888 } 1889 1890 /* All raid6 handling here */ 1891 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1892 /* Single failure, rebuild from parity raid5 style */ 1893 if (failb < 0) { 1894 if (faila == rbio->nr_data) 1895 /* 1896 * Just the P stripe has failed, without 1897 * a bad data or Q stripe. 1898 * We have nothing to do, just skip the 1899 * recovery for this stripe. 1900 */ 1901 goto cleanup; 1902 /* 1903 * a single failure in raid6 is rebuilt 1904 * in the pstripe code below 1905 */ 1906 goto pstripe; 1907 } 1908 1909 /* 1910 * If the q stripe is failed, do a pstripe reconstruction from 1911 * the xors. 1912 * If both the q stripe and the P stripe are failed, we're 1913 * here due to a crc mismatch and we can't give them the 1914 * data they want. 1915 */ 1916 if (failb == rbio->real_stripes - 1) { 1917 if (faila == rbio->real_stripes - 2) 1918 /* 1919 * Only P and Q are corrupted. 1920 * We only care about data stripes recovery, 1921 * can skip this vertical stripe. 1922 */ 1923 goto cleanup; 1924 /* 1925 * Otherwise we have one bad data stripe and 1926 * a good P stripe. raid5! 
			 */
			goto pstripe;
		}

		if (failb == rbio->real_stripes - 2) {
			raid6_datap_recov(rbio->real_stripes, sectorsize,
					  faila, pointers);
		} else {
			raid6_2data_recov(rbio->real_stripes, sectorsize,
					  faila, failb, pointers);
		}
	} else {
		void *p;

		/* Rebuild from P stripe here (raid5 or raid6). */
		ASSERT(failb == -1);
pstripe:
		/* Copy parity block into failed block to start with */
		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

		/* Rearrange the pointer array */
		p = pointers[faila];
		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
		     stripe_nr++)
			pointers[stripe_nr] = pointers[stripe_nr + 1];
		pointers[rbio->nr_data - 1] = p;

		/* Xor in the rest */
		run_xor(pointers, rbio->nr_data - 1, sectorsize);

	}

	/*
	 * Whether this is an RMW or a recovery, we should have all failed
	 * sectors in the vertical stripe repaired, thus they are now
	 * uptodate.
	 * This matters especially if we decide to cache the rbio: at least
	 * all data sectors need to be uptodate.
	 *
	 * If possible, also check if the repaired sector matches its data
	 * checksum.
	 */
	if (faila >= 0) {
		ret = verify_one_sector(rbio, faila, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, faila, sector_nr);
		sector->uptodate = 1;
	}
	if (failb >= 0) {
		ret = verify_one_sector(rbio, failb, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, failb, sector_nr);
		sector->uptodate = 1;
	}

cleanup:
	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
		kunmap_local(unmap_array[stripe_nr]);
	return ret;
}

static int recover_sectors(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sectornr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores a copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
		if (ret < 0)
			break;
	}

out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static void recover_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Whether we're recovering from a read failure or doing a degraded
	 * write, the caller should have set the error bitmap correctly.
	 */
	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));

	/* For recovery, we need to read all sectors including P/Q. */
	ret = alloc_rbio_pages(rbio);
	if (ret < 0)
		goto out;

	index_rbio_pages(rbio);

	/*
	 * Read everything that hasn't failed.
	 * However, this time we will not trust any cached sector, as we may
	 * have read out stale data that the higher layer is not going to
	 * consume.
	 *
	 * So here we always re-read everything in the recovery path.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/*
		 * Skip ranges with errors. This can be a range which is
		 * marked as an error (e.g. a csum mismatch), or one on a
		 * missing device.
		 */
		if (!rbio->bioc->stripes[stripe].dev->bdev ||
		    test_bit(total_sector_nr, rbio->error_bitmap)) {
			/*
			 * Also set the error bit for a missing device, which
			 * may not yet have its error bit set.
			 */
			set_bit(total_sector_nr, rbio->error_bitmap);
			continue;
		}

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0) {
			bio_list_put(&bio_list);
			goto out;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	ret = recover_sectors(rbio);
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void recover_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (!lock_stripe_add(rbio))
		recover_rbio(rbio);
}

static void recover_rbio_work_locked(struct work_struct *work)
{
	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
}

static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
	bool found = false;
	int sector_nr;

	/*
	 * This is for RAID6 extra recovery tries, thus the mirror number
	 * should be larger than 2.
	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
	 * RAID5 methods.
	 */
	ASSERT(mirror_num > 2);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;
		int faila;
		int failb;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		/* This vertical stripe doesn't have errors. */
		if (!found_errors)
			continue;

		/*
		 * If we found errors, there should be only one error marked
		 * by the previous set_rbio_range_error().
		 */
		ASSERT(found_errors == 1);
		found = true;

		/* Now select another stripe to mark as error. */
		failb = rbio->real_stripes - (mirror_num - 1);
		if (failb <= faila)
			failb--;

		/* Set the extra bit in the error bitmap. */
		if (failb >= 0)
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
	}

	/* We should have found at least one vertical stripe with an error. */
	ASSERT(found);
}

/*
 * The main entry point for reads from the higher layers. This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
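 *
 * The mirror number selects the recovery strategy: mirror 1 is the plain
 * read from the data stripes, mirror 2 rebuilds from the remaining stripes,
 * and mirror numbers above 2 make RAID6 fail one extra stripe per retry
 * (see set_rbio_raid6_extra_error() above).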
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	rbio_add_bio(rbio, bio);

	set_rbio_range_error(rbio, bio);

	/*
	 * Loop retry:
	 * for 'mirror_num == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2)
		set_rbio_raid6_extra_error(rbio, mirror_num);

	start_async_work(rbio, recover_rbio_work);
}

static void fill_data_csums(struct btrfs_raid_bio *rbio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
					       rbio->bioc->full_stripe_logical);
	const u64 start = rbio->bioc->full_stripe_logical;
	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
			fs_info->sectorsize_bits;
	int ret;

	/* The rbio should not have its csum buffer initialized. */
	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);

	/*
	 * Skip the csum search if:
	 *
	 * - The rbio doesn't belong to data block groups
	 *   Then we are doing IO for tree blocks, no need to search csums.
	 *
	 * - The rbio belongs to mixed block groups
	 *   This is to avoid deadlock, as we're already holding the full
	 *   stripe lock; if we trigger a metadata read that needs to do
	 *   raid56 recovery, we will deadlock.
	 */
	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
		return;

	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
				 fs_info->csum_size, GFP_NOFS);
	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
					  GFP_NOFS);
	if (!rbio->csum_buf || !rbio->csum_bitmap) {
		ret = -ENOMEM;
		goto error;
	}

	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
					rbio->csum_buf, rbio->csum_bitmap);
	if (ret < 0)
		goto error;
	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
		goto no_csum;
	return;

error:
	/*
	 * We failed to allocate memory or grab the csum, but it's not fatal,
	 * we can still continue. But better to warn users that RMW is no
	 * longer safe for this particular sub-stripe write.
	 */
	btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
		      rbio->bioc->full_stripe_logical, ret);
no_csum:
	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;
}

static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Fill the data csums we need for data verification. We need to fill
	 * the csum_bitmap/csum_buf first, as our endio function will try to
	 * verify the data sectors.
	 */
	fill_data_csums(rbio);

	/*
	 * Build a list of bios to read all sectors (including data and P/Q).
	 *
	 * This is needed to support the later csum verification and recovery.
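	 *
	 * (If a data sector later fails its csum verification, recovering it
	 * needs the other data sectors and P/Q of the same vertical stripe,
	 * so it is simpler to read the full stripe up front.)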
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	/*
	 * We may or may not have any corrupted sectors (including missing
	 * devices and csum mismatches), just let recover_sectors() handle
	 * them all.
	 */
	submit_read_wait_bio_list(rbio, &bio_list);
	return recover_sectors(rbio);
}

static void raid_wait_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		rbio_update_error_bitmap(rbio, bio);
	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_write_bios(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_write_end_io;

		if (trace_raid56_write_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
}

/*
 * Determine if we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip a cached rbio.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	int i;

	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		/*
		 * We have a sector which has no page or is not uptodate,
		 * thus this rbio cannot be a cached one, as a cached one
		 * must have all its data sectors present and uptodate.
		 */
		if (!sector->has_paddr || !sector->uptodate)
			return true;
	}
	return false;
}

static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list;
	int sectornr;
	int ret = 0;

	/*
	 * Allocate the pages for parity first, as P/Q pages will always be
	 * needed for both full-stripe and sub-stripe writes.
	 */
	ret = alloc_rbio_parity_pages(rbio);
	if (ret < 0)
		goto out;

	/*
	 * Either this is a full stripe write, or we have every data sector
	 * already cached, so we can go to the write path immediately.
	 */
	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
		/*
		 * Now we're doing a sub-stripe write, and we also need all
		 * data stripes to do the full RMW.
		 */
		ret = alloc_rbio_data_pages(rbio);
		if (ret < 0)
			goto out;

		index_rbio_pages(rbio);

		ret = rmw_read_wait_recover(rbio);
		if (ret < 0)
			goto out;
	}

	/*
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more; anyone else that wants to change this stripe
	 * needs to do their own rmw.
	 */
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	index_rbio_pages(rbio);

	/*
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon. If they do use it again,
	 * hopefully they will send another full bio.
	 */
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		generate_pq_vertical(rbio, sectornr);

	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
	if (ret < 0)
		goto out;

	/* We should have at least one bio assembled. */
	ASSERT(bio_list_size(&bio_list));
	submit_write_bios(rbio, &bio_list);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

	/* We may have more errors than our tolerance during the read. */
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (lock_stripe_add(rbio) == 0)
		rmw_rbio(rbio);
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: we need to make sure all the pages added to the scrub/replace raid
 * bio are correct and are not changed during the scrub/replace. That is,
 * those pages hold only metadata or file data covered by a checksum.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping the bioc with BTRFS_MAP_WRITE, parities have been
	 * sorted to the end position, so this search can start from the
	 * first parity stripe.
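	 *
	 * (Illustrative example: a RAID6 full stripe with 4 data stripes has
	 * nr_data == 4 and real_stripes == 6, so stripes[4] is P and
	 * stripes[5] is Q; only those two indexes need to be compared
	 * against scrub_dev below.)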
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}

/*
 * We only scrub the parity for horizontal stripes where we have correct
 * data, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}

static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct page *page;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * If replace is running and our P/Q stripe is being replaced, then we
	 * need to duplicate the final write to the replace target.
	 */
	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * The higher layers (scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	page = alloc_page(GFP_NOFS);
	if (!page)
		return -ENOMEM;
	p_sector.has_paddr = true;
	p_sector.paddr = page_to_phys(page);
	p_sector.uptodate = 1;
	page = NULL;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		page = alloc_page(GFP_NOFS);
		if (!page) {
			__free_page(phys_to_page(p_sector.paddr));
			p_sector.has_paddr = false;
			return -ENOMEM;
		}
		q_sector.has_paddr = true;
		q_sector.paddr = page_to_phys(page);
		q_sector.uptodate = 1;
		page = NULL;
		pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector);
	}

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_sector(&p_sector);

	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* First collect one sector from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_sector(sector);
		}

		if (has_qstripe) {
			assert_rbio(rbio);
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check the scrubbing parity and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_sector(sector);
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(phys_to_page(p_sector.paddr));
	p_sector.has_paddr = false;
	if (q_sector.has_paddr) {
		__free_page(phys_to_page(q_sector.paddr));
		q_sector.has_paddr = false;
	}

	/*
	 * Time to start writing. Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our P/Q. Ignore
	 * everything else.
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/*
	 * Replace is running and our parity stripe needs to be duplicated to
	 * the target device. Check that we have a valid source stripe number.
	 */
	ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	submit_write_bios(rbio, &bio_list);
	return 0;

cleanup:
	bio_list_put(&bio_list);
	return ret;
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sector_nr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores a copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			goto out;
		}
		if (found_errors == 0)
			continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;
		/*
		 * Because we cannot use the parity being scrubbed to repair
		 * data, our repair capability is reduced (in the case of
		 * RAID5, we cannot repair anything).
		 */
		if (dfail > rbio->bioc->max_errors - 1) {
			ret = -EIO;
			goto out;
		}
		/*
		 * If all the data is good, then only the parity is bad: just
		 * repair the parity, no need to recover data stripes.
		 */
		if (dfail == 0)
			continue;

		/*
		 * Reaching here means we have one corrupted data stripe and
		 * one corrupted parity on RAID6. If the corrupted parity is
		 * the one being scrubbed, we can luckily use the other parity
		 * to repair the data; otherwise we cannot repair the data
		 * stripe.
		 */
		if (failp != rbio->scrubp) {
			ret = -EIO;
			goto out;
		}

		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
			goto out;
	}
out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Build a list of bios to read all the missing parts.
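	 *
	 * A sector is skipped if its vertical stripe has no dbitmap bit set,
	 * if the bio list already provides it, or if the stripe cache has it
	 * uptodate; everything else is read from disk.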
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk. If sector_in_rbio() finds a sector
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector. If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	return 0;
}

static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int sector_nr;
	int ret;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto out;

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	ret = scrub_assemble_read_bios(rbio);
	if (ret < 0)
		goto out;

	/* We may have some failures, recover the failed sectors first. */
	ret = recover_scrub_rbio(rbio);
	if (ret < 0)
		goto out;

	/*
	 * We have every sector properly prepared. We can finish the scrub
	 * and write back the good content.
	 */
	ret = finish_parity_scrub(rbio);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void scrub_rbio_work_locked(struct work_struct *work)
{
	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_rbio_work_locked);
}

/*
 * This is for scrub call sites where we already have the correct data
 * contents. This allows us to avoid reading the data stripes again.
 *
 * Unfortunately we have to copy the pages here rather than reuse them,
 * because the rbio has its own page management for its cache.
 */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical)
{
	const u64 offset_in_full_stripe = data_logical -
					  rbio->bioc->full_stripe_logical;
	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int ret;

	/*
	 * If we hit ENOMEM temporarily, but later at
	 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
	 * the extra read, not a big deal.
2881 * 2882 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, 2883 * the bio would got proper error number set. 2884 */ 2885 ret = alloc_rbio_data_pages(rbio); 2886 if (ret < 0) 2887 return; 2888 2889 /* data_logical must be at stripe boundary and inside the full stripe. */ 2890 ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); 2891 ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); 2892 2893 for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { 2894 struct page *dst = rbio->stripe_pages[page_nr + page_index]; 2895 struct page *src = data_pages[page_nr]; 2896 2897 memcpy_page(dst, 0, src, 0, PAGE_SIZE); 2898 for (int sector_nr = sectors_per_page * page_index; 2899 sector_nr < sectors_per_page * (page_index + 1); 2900 sector_nr++) 2901 rbio->stripe_sectors[sector_nr].uptodate = true; 2902 } 2903 } 2904