1 // SPDX-License-Identifier: GPL-2.0-or-later
2
3 #include <linux/blkdev.h>
4 #include <linux/module.h>
5 #include <linux/errno.h>
6 #include <linux/slab.h>
7 #include <linux/init.h>
8 #include <linux/timer.h>
9 #include <linux/sched.h>
10 #include <linux/list.h>
11 #include <linux/file.h>
12 #include <linux/seq_file.h>
13 #include <trace/events/block.h>
14
15 #include "md.h"
16 #include "md-bitmap.h"
17
18 /*
19 * #### Background
20 *
21 * Redundant data is used to enhance data fault tolerance, and the storage
22 * methods for redundant data vary depending on the RAID levels. And it's
23 * important to maintain the consistency of redundant data.
24 *
25 * Bitmap is used to record which data blocks have been synchronized and which
26 * ones need to be resynchronized or recovered. Each bit in the bitmap
27 * represents a segment of data in the array. When a bit is set, it indicates
28 * that the multiple redundant copies of that data segment may not be
29 * consistent. Data synchronization can be performed based on the bitmap after
30 * power failure or readding a disk. If there is no bitmap, a full disk
31 * synchronization is required.
32 *
33 * #### Key Features
34 *
35 * - IO fastpath is lockless, if user issues lots of write IO to the same
36 * bitmap bit in a short time, only the first write has additional overhead
37 * to update bitmap bit, no additional overhead for the following writes;
38 * - support only resync or recover written data, means in the case creating
39 * new array or replacing with a new disk, there is no need to do a full disk
40 * resync/recovery;
41 *
42 * #### Key Concept
43 *
44 * ##### State Machine
45 *
 * Each bit is one byte, contain 8 different states, see llbitmap_state. And
 * there are total 10 different actions, see llbitmap_action, can change state:
48 *
49 * llbitmap state machine: transitions between states
50 *
51 * | | Startwrite | Startsync | Endsync | Abortsync|
52 * | --------- | ---------- | --------- | ------- | ------- |
53 * | Unwritten | Dirty | x | x | x |
54 * | Clean | Dirty | x | x | x |
55 * | Dirty | x | x | x | x |
56 * | NeedSync | x | Syncing | x | x |
57 * | Syncing | x | Syncing | Dirty | NeedSync |
58 *
59 * | | Reload | Daemon | Discard | Stale |
60 * | --------- | -------- | ------ | --------- | --------- |
61 * | Unwritten | x | x | x | x |
62 * | Clean | x | x | Unwritten | NeedSync |
63 * | Dirty | NeedSync | Clean | Unwritten | NeedSync |
64 * | NeedSync | x | x | Unwritten | x |
65 * | Syncing | NeedSync | x | Unwritten | NeedSync |
66 *
67 * Typical scenarios:
68 *
69 * 1) Create new array
70 * All bits will be set to Unwritten by default, if --assume-clean is set,
71 * all bits will be set to Clean instead.
72 *
73 * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and
74 * rely on xor data
75 *
76 * 2.1) write new data to raid1/raid10:
77 * Unwritten --StartWrite--> Dirty
78 *
79 * 2.2) write new data to raid456:
80 * Unwritten --StartWrite--> NeedSync
81 *
82 * Because the initial recover for raid456 is skipped, the xor data is not built
83 * yet, the bit must be set to NeedSync first and after lazy initial recover is
84 * finished, the bit will finally set to Dirty(see 5.1 and 5.4);
85 *
86 * 2.3) cover write
87 * Clean --StartWrite--> Dirty
88 *
89 * 3) daemon, if the array is not degraded:
90 * Dirty --Daemon--> Clean
91 *
92 * 4) discard
93 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
94 *
95 * 5) resync and recover
96 *
97 * 5.1) common process
98 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
99 *
100 * 5.2) resync after power failure
101 * Dirty --Reload--> NeedSync
102 *
103 * 5.3) recover while replacing with a new disk
104 * By default, the old bitmap framework will recover all data, and llbitmap
105 * implements this by a new helper, see llbitmap_skip_sync_blocks:
106 *
107 * skip recover for bits other than dirty or clean;
108 *
109 * 5.4) lazy initial recover for raid5:
110 * By default, the old bitmap framework will only allow new recover when there
111 * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added
112 * to perform raid456 lazy recover for set bits(from 2.2).
113 *
114 * 6. special handling for degraded array:
115 *
116 * - Dirty bits will never be cleared, daemon will just do nothing, so that if
117 * a disk is readded, Clean bits can be skipped with recovery;
118 * - Dirty bits will convert to Syncing from start write, to do data recovery
119 * for new added disks;
120 * - New write will convert bits to NeedSync directly;
121 *
122 * ##### Bitmap IO
123 *
124 * ##### Chunksize
125 *
 * The default bitmap size is 128k, including 1k bitmap super block, and
127 * the default size of segment of data in the array each bit(chunksize) is 64k,
128 * and chunksize will adjust to twice the old size each time if the total number
129 * bits is not less than 127k.(see llbitmap_init)
130 *
131 * ##### READ
132 *
133 * While creating bitmap, all pages will be allocated and read for llbitmap,
134 * there won't be read afterwards
135 *
136 * ##### WRITE
137 *
138 * WRITE IO is divided into logical_block_size of the array, the dirty state
139 * of each block is tracked independently, for example:
140 *
141 * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit;
142 *
143 * | page0 | page1 | ... | page 31 |
144 * | |
145 * | \-----------------------\
146 * | |
147 * | block0 | block1 | ... | block 8|
148 * | |
149 * | \-----------------\
150 * | |
151 * | bit0 | bit1 | ... | bit511 |
152 *
153 * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding
154 * subpage will be marked dirty, such block must write first before the IO is
155 * issued. This behaviour will affect IO performance, to reduce the impact, if
156 * multiple bits are changed in the same block in a short time, all bits in this
157 * block will be changed to Dirty/NeedSync, so that there won't be any overhead
158 * until daemon clears dirty bits.
159 *
160 * ##### Dirty Bits synchronization
161 *
162 * IO fast path will set bits to dirty, and those dirty bits will be cleared
163 * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between
164 * IO path and daemon;
165 *
166 * IO path:
167 * 1) try to grab a reference, if succeed, set expire time after 5s and return;
168 * 2) if failed to grab a reference, wait for daemon to finish clearing dirty
169 * bits;
170 *
171 * Daemon (Daemon will be woken up every daemon_sleep seconds):
172 * For each page:
173 * 1) check if page expired, if not skip this page; for expired page:
174 * 2) suspend the page and wait for inflight write IO to be done;
175 * 3) change dirty page to clean;
176 * 4) resume the page;
177 */
178
179 #define BITMAP_DATA_OFFSET 1024
180
181 /* 64k is the max IO size of sync IO for raid1/raid10 */
182 #define MIN_CHUNK_SIZE (64 * 2)
183
184 /* By default, daemon will be woken up every 30s */
185 #define DEFAULT_DAEMON_SLEEP 30
186
187 /*
188 * Dirtied bits that have not been accessed for more than 5s will be cleared
189 * by daemon.
190 */
191 #define DEFAULT_BARRIER_IDLE 5
192
193 enum llbitmap_state {
194 /* No valid data, init state after assemble the array */
195 BitUnwritten = 0,
196 /* data is consistent */
197 BitClean,
198 /* data will be consistent after IO is done, set directly for writes */
199 BitDirty,
200 /*
201 * data need to be resynchronized:
202 * 1) set directly for writes if array is degraded, prevent full disk
203 * synchronization after readding a disk;
204 * 2) reassemble the array after power failure, and dirty bits are
205 * found after reloading the bitmap;
206 * 3) set for first write for raid5, to build initial xor data lazily
207 */
208 BitNeedSync,
209 /* data is synchronizing */
210 BitSyncing,
211 /*
212 * Proactive sync requested for unwritten region (raid456 only).
213 * Triggered via sysfs when user wants to pre-build XOR parity
214 * for regions that have never been written.
215 */
216 BitNeedSyncUnwritten,
217 /* Proactive sync in progress for unwritten region */
218 BitSyncingUnwritten,
219 /*
220 * XOR parity has been pre-built for a region that has never had
221 * user data written. When user writes to this region, it transitions
222 * to BitDirty.
223 */
224 BitCleanUnwritten,
225 BitStateCount,
226 BitNone = 0xff,
227 };
228
229 enum llbitmap_action {
230 /* User write new data, this is the only action from IO fast path */
231 BitmapActionStartwrite = 0,
232 /* Start recovery */
233 BitmapActionStartsync,
234 /* Finish recovery */
235 BitmapActionEndsync,
236 /* Failed recovery */
237 BitmapActionAbortsync,
238 /* Reassemble the array */
239 BitmapActionReload,
240 /* Daemon thread is trying to clear dirty bits */
241 BitmapActionDaemon,
242 /* Data is deleted */
243 BitmapActionDiscard,
244 /*
245 * Bitmap is stale, mark all bits in addition to BitUnwritten to
246 * BitNeedSync.
247 */
248 BitmapActionStale,
249 /*
250 * Proactive sync trigger for raid456 - builds XOR parity for
251 * Unwritten regions without requiring user data write first.
252 */
253 BitmapActionProactiveSync,
254 BitmapActionClearUnwritten,
255 BitmapActionCount,
256 /* Init state is BitUnwritten */
257 BitmapActionInit,
258 };
259
260 enum llbitmap_page_state {
261 LLPageFlush = 0,
262 LLPageDirty,
263 };
264
265 struct llbitmap_page_ctl {
266 char *state;
267 struct page *page;
268 unsigned long expire;
269 unsigned long flags;
270 wait_queue_head_t wait;
271 struct percpu_ref active;
272 /* Per block size dirty state, maximum 64k page / 1 sector = 128 */
273 unsigned long dirty[];
274 };
275
276 struct llbitmap {
277 struct mddev *mddev;
278 struct llbitmap_page_ctl **pctl;
279
280 unsigned int nr_pages;
281 unsigned int io_size;
282 unsigned int blocks_per_page;
283
284 /* shift of one chunk */
285 unsigned long chunkshift;
286 /* size of one chunk in sector */
287 unsigned long chunksize;
288 /* total number of chunks */
289 unsigned long chunks;
290 unsigned long last_end_sync;
291 /*
292 * time in seconds that dirty bits will be cleared if the page is not
293 * accessed.
294 */
295 unsigned long barrier_idle;
296 /* fires on first BitDirty state */
297 struct timer_list pending_timer;
298 struct work_struct daemon_work;
299
300 unsigned long flags;
301 __u64 events_cleared;
302
303 /* for slow disks */
304 atomic_t behind_writes;
305 wait_queue_head_t behind_wait;
306 };
307
308 struct llbitmap_unplug_work {
309 struct work_struct work;
310 struct llbitmap *llbitmap;
311 struct completion *done;
312 };
313
314 static struct workqueue_struct *md_llbitmap_io_wq;
315 static struct workqueue_struct *md_llbitmap_unplug_wq;
316
317 static char state_machine[BitStateCount][BitmapActionCount] = {
318 [BitUnwritten] = {
319 [BitmapActionStartwrite] = BitDirty,
320 [BitmapActionStartsync] = BitNone,
321 [BitmapActionEndsync] = BitNone,
322 [BitmapActionAbortsync] = BitNone,
323 [BitmapActionReload] = BitNone,
324 [BitmapActionDaemon] = BitNone,
325 [BitmapActionDiscard] = BitNone,
326 [BitmapActionStale] = BitNone,
327 [BitmapActionProactiveSync] = BitNeedSyncUnwritten,
328 [BitmapActionClearUnwritten] = BitNone,
329 },
330 [BitClean] = {
331 [BitmapActionStartwrite] = BitDirty,
332 [BitmapActionStartsync] = BitNone,
333 [BitmapActionEndsync] = BitNone,
334 [BitmapActionAbortsync] = BitNone,
335 [BitmapActionReload] = BitNone,
336 [BitmapActionDaemon] = BitNone,
337 [BitmapActionDiscard] = BitUnwritten,
338 [BitmapActionStale] = BitNeedSync,
339 [BitmapActionProactiveSync] = BitNone,
340 [BitmapActionClearUnwritten] = BitNone,
341 },
342 [BitDirty] = {
343 [BitmapActionStartwrite] = BitNone,
344 [BitmapActionStartsync] = BitNone,
345 [BitmapActionEndsync] = BitNone,
346 [BitmapActionAbortsync] = BitNone,
347 [BitmapActionReload] = BitNeedSync,
348 [BitmapActionDaemon] = BitClean,
349 [BitmapActionDiscard] = BitUnwritten,
350 [BitmapActionStale] = BitNeedSync,
351 [BitmapActionProactiveSync] = BitNone,
352 [BitmapActionClearUnwritten] = BitNone,
353 },
354 [BitNeedSync] = {
355 [BitmapActionStartwrite] = BitNone,
356 [BitmapActionStartsync] = BitSyncing,
357 [BitmapActionEndsync] = BitNone,
358 [BitmapActionAbortsync] = BitNone,
359 [BitmapActionReload] = BitNone,
360 [BitmapActionDaemon] = BitNone,
361 [BitmapActionDiscard] = BitUnwritten,
362 [BitmapActionStale] = BitNone,
363 [BitmapActionProactiveSync] = BitNone,
364 [BitmapActionClearUnwritten] = BitNone,
365 },
366 [BitSyncing] = {
367 [BitmapActionStartwrite] = BitNone,
368 [BitmapActionStartsync] = BitSyncing,
369 [BitmapActionEndsync] = BitDirty,
370 [BitmapActionAbortsync] = BitNeedSync,
371 [BitmapActionReload] = BitNeedSync,
372 [BitmapActionDaemon] = BitNone,
373 [BitmapActionDiscard] = BitUnwritten,
374 [BitmapActionStale] = BitNeedSync,
375 [BitmapActionProactiveSync] = BitNone,
376 [BitmapActionClearUnwritten] = BitNone,
377 },
378 [BitNeedSyncUnwritten] = {
379 [BitmapActionStartwrite] = BitNeedSync,
380 [BitmapActionStartsync] = BitSyncingUnwritten,
381 [BitmapActionEndsync] = BitNone,
382 [BitmapActionAbortsync] = BitUnwritten,
383 [BitmapActionReload] = BitUnwritten,
384 [BitmapActionDaemon] = BitNone,
385 [BitmapActionDiscard] = BitUnwritten,
386 [BitmapActionStale] = BitUnwritten,
387 [BitmapActionProactiveSync] = BitNone,
388 [BitmapActionClearUnwritten] = BitUnwritten,
389 },
390 [BitSyncingUnwritten] = {
391 [BitmapActionStartwrite] = BitSyncing,
392 [BitmapActionStartsync] = BitSyncingUnwritten,
393 [BitmapActionEndsync] = BitCleanUnwritten,
394 [BitmapActionAbortsync] = BitUnwritten,
395 [BitmapActionReload] = BitUnwritten,
396 [BitmapActionDaemon] = BitNone,
397 [BitmapActionDiscard] = BitUnwritten,
398 [BitmapActionStale] = BitUnwritten,
399 [BitmapActionProactiveSync] = BitNone,
400 [BitmapActionClearUnwritten] = BitUnwritten,
401 },
402 [BitCleanUnwritten] = {
403 [BitmapActionStartwrite] = BitDirty,
404 [BitmapActionStartsync] = BitNone,
405 [BitmapActionEndsync] = BitNone,
406 [BitmapActionAbortsync] = BitNone,
407 [BitmapActionReload] = BitNone,
408 [BitmapActionDaemon] = BitNone,
409 [BitmapActionDiscard] = BitUnwritten,
410 [BitmapActionStale] = BitUnwritten,
411 [BitmapActionProactiveSync] = BitNone,
412 [BitmapActionClearUnwritten] = BitUnwritten,
413 },
414 };
415
416 static void __llbitmap_flush(struct mddev *mddev);
417
llbitmap_read(struct llbitmap * llbitmap,loff_t pos)418 static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
419 {
420 unsigned int idx;
421 unsigned int offset;
422
423 pos += BITMAP_DATA_OFFSET;
424 idx = pos >> PAGE_SHIFT;
425 offset = offset_in_page(pos);
426
427 return llbitmap->pctl[idx]->state[offset];
428 }
429
430 /* set all the bits in the subpage as dirty */
llbitmap_infect_dirty_bits(struct llbitmap * llbitmap,struct llbitmap_page_ctl * pctl,unsigned int block)431 static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
432 struct llbitmap_page_ctl *pctl,
433 unsigned int block)
434 {
435 bool level_456 = raid_is_456(llbitmap->mddev);
436 unsigned int io_size = llbitmap->io_size;
437 int pos;
438
439 for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
440 switch (pctl->state[pos]) {
441 case BitUnwritten:
442 pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
443 break;
444 case BitClean:
445 case BitCleanUnwritten:
446 pctl->state[pos] = BitDirty;
447 break;
448 }
449 }
450 }
451
llbitmap_set_page_dirty(struct llbitmap * llbitmap,int idx,int offset,bool infect)452 static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
453 int offset, bool infect)
454 {
455 struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
456 unsigned int io_size = llbitmap->io_size;
457 int block = offset / io_size;
458 int pos;
459
460 if (!test_bit(LLPageDirty, &pctl->flags))
461 set_bit(LLPageDirty, &pctl->flags);
462
463 /*
464 * For degraded array, dirty bits will never be cleared, and we must
465 * resync all the dirty bits, hence skip infect new dirty bits to
466 * prevent resync unnecessary data.
467 */
468 if (llbitmap->mddev->degraded || !infect) {
469 set_bit(block, pctl->dirty);
470 return;
471 }
472
473 /*
474 * The subpage usually contains a total of 512 bits. If any single bit
475 * within the subpage is marked as dirty, the entire sector will be
476 * written. To avoid impacting write performance, when multiple bits
477 * within the same sector are modified within llbitmap->barrier_idle,
478 * all bits in the sector will be collectively marked as dirty at once.
479 */
480 if (test_and_set_bit(block, pctl->dirty)) {
481 llbitmap_infect_dirty_bits(llbitmap, pctl, block);
482 return;
483 }
484
485 for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
486 if (pos == offset)
487 continue;
488 if (pctl->state[pos] == BitDirty ||
489 pctl->state[pos] == BitNeedSync) {
490 llbitmap_infect_dirty_bits(llbitmap, pctl, block);
491 return;
492 }
493 }
494 }
495
llbitmap_write(struct llbitmap * llbitmap,enum llbitmap_state state,loff_t pos)496 static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
497 loff_t pos)
498 {
499 unsigned int idx;
500 unsigned int bit;
501
502 pos += BITMAP_DATA_OFFSET;
503 idx = pos >> PAGE_SHIFT;
504 bit = offset_in_page(pos);
505
506 llbitmap->pctl[idx]->state[bit] = state;
507 if (state == BitDirty || state == BitNeedSync)
508 llbitmap_set_page_dirty(llbitmap, idx, bit, true);
509 else if (state == BitNeedSyncUnwritten)
510 llbitmap_set_page_dirty(llbitmap, idx, bit, false);
511 }
512
llbitmap_read_page(struct llbitmap * llbitmap,int idx)513 static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
514 {
515 struct mddev *mddev = llbitmap->mddev;
516 struct page *page = NULL;
517 struct md_rdev *rdev;
518
519 if (llbitmap->pctl && llbitmap->pctl[idx])
520 page = llbitmap->pctl[idx]->page;
521 if (page)
522 return page;
523
524 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
525 if (!page)
526 return ERR_PTR(-ENOMEM);
527
528 rdev_for_each(rdev, mddev) {
529 sector_t sector;
530
531 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) ||
532 !test_bit(In_sync, &rdev->flags))
533 continue;
534
535 sector = mddev->bitmap_info.offset +
536 (idx << PAGE_SECTORS_SHIFT);
537
538 if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
539 true))
540 return page;
541
542 md_error(mddev, rdev);
543 }
544
545 __free_page(page);
546 return ERR_PTR(-EIO);
547 }
548
llbitmap_write_page(struct llbitmap * llbitmap,int idx)549 static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
550 {
551 struct page *page = llbitmap->pctl[idx]->page;
552 struct mddev *mddev = llbitmap->mddev;
553 struct md_rdev *rdev;
554 int block;
555
556 for (block = 0; block < llbitmap->blocks_per_page; block++) {
557 struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
558
559 if (!test_and_clear_bit(block, pctl->dirty))
560 continue;
561
562 rdev_for_each(rdev, mddev) {
563 sector_t sector;
564 sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;
565
566 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
567 continue;
568
569 sector = mddev->bitmap_info.offset + rdev->sb_start +
570 (idx << PAGE_SECTORS_SHIFT) +
571 block * bit_sector;
572 md_write_metadata(mddev, rdev, sector,
573 llbitmap->io_size, page,
574 block * llbitmap->io_size);
575 }
576 }
577 }
578
active_release(struct percpu_ref * ref)579 static void active_release(struct percpu_ref *ref)
580 {
581 struct llbitmap_page_ctl *pctl =
582 container_of(ref, struct llbitmap_page_ctl, active);
583
584 wake_up(&pctl->wait);
585 }
586
llbitmap_free_pages(struct llbitmap * llbitmap)587 static void llbitmap_free_pages(struct llbitmap *llbitmap)
588 {
589 int i;
590
591 if (!llbitmap->pctl)
592 return;
593
594 for (i = 0; i < llbitmap->nr_pages; i++) {
595 struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
596
597 if (!pctl || !pctl->page)
598 break;
599
600 __free_page(pctl->page);
601 percpu_ref_exit(&pctl->active);
602 }
603
604 kfree(llbitmap->pctl[0]);
605 kfree(llbitmap->pctl);
606 llbitmap->pctl = NULL;
607 }
608
llbitmap_cache_pages(struct llbitmap * llbitmap)609 static int llbitmap_cache_pages(struct llbitmap *llbitmap)
610 {
611 struct llbitmap_page_ctl *pctl;
612 unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
613 BITMAP_DATA_OFFSET, PAGE_SIZE);
614 unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
615 llbitmap->blocks_per_page));
616 int i;
617
618 llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
619 GFP_KERNEL | __GFP_ZERO);
620 if (!llbitmap->pctl)
621 return -ENOMEM;
622
623 size = round_up(size, cache_line_size());
624 pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
625 if (!pctl) {
626 kfree(llbitmap->pctl);
627 return -ENOMEM;
628 }
629
630 llbitmap->nr_pages = nr_pages;
631
632 for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
633 struct page *page = llbitmap_read_page(llbitmap, i);
634
635 llbitmap->pctl[i] = pctl;
636
637 if (IS_ERR(page)) {
638 llbitmap_free_pages(llbitmap);
639 return PTR_ERR(page);
640 }
641
642 if (percpu_ref_init(&pctl->active, active_release,
643 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
644 __free_page(page);
645 llbitmap_free_pages(llbitmap);
646 return -ENOMEM;
647 }
648
649 pctl->page = page;
650 pctl->state = page_address(page);
651 init_waitqueue_head(&pctl->wait);
652 }
653
654 return 0;
655 }
656
657 /*
658 * Check if all underlying disks support write_zeroes with unmap.
659 */
llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap * llbitmap)660 static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
661 {
662 struct mddev *mddev = llbitmap->mddev;
663 struct md_rdev *rdev;
664
665 rdev_for_each(rdev, mddev) {
666 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
667 continue;
668
669 if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0)
670 return false;
671 }
672
673 return true;
674 }
675
676 /*
677 * Issue write_zeroes to all underlying disks to zero their data regions.
678 * This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
679 * Returns true if all disks were successfully zeroed.
680 */
llbitmap_zero_all_disks(struct llbitmap * llbitmap)681 static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
682 {
683 struct mddev *mddev = llbitmap->mddev;
684 struct md_rdev *rdev;
685 sector_t dev_sectors = mddev->dev_sectors;
686 int ret;
687
688 rdev_for_each(rdev, mddev) {
689 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
690 continue;
691
692 ret = blkdev_issue_zeroout(rdev->bdev,
693 rdev->data_offset,
694 dev_sectors,
695 GFP_KERNEL, 0);
696 if (ret) {
697 pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
698 rdev->bdev, ret);
699 return false;
700 }
701 }
702
703 return true;
704 }
705
llbitmap_init_state(struct llbitmap * llbitmap)706 static void llbitmap_init_state(struct llbitmap *llbitmap)
707 {
708 struct mddev *mddev = llbitmap->mddev;
709 enum llbitmap_state state = BitUnwritten;
710 unsigned long i;
711
712 if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
713 state = BitClean;
714 } else if (raid_is_456(mddev) &&
715 llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
716 /*
717 * All disks support write_zeroes with unmap. Zero all disks
718 * to ensure parity consistency, then set BitCleanUnwritten
719 * to skip initial sync.
720 */
721 if (llbitmap_zero_all_disks(llbitmap))
722 state = BitCleanUnwritten;
723 }
724
725 for (i = 0; i < llbitmap->chunks; i++)
726 llbitmap_write(llbitmap, state, i);
727 }
728
729 /* The return value is only used from resync, where @start == @end. */
llbitmap_state_machine(struct llbitmap * llbitmap,unsigned long start,unsigned long end,enum llbitmap_action action)730 static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
731 unsigned long start,
732 unsigned long end,
733 enum llbitmap_action action)
734 {
735 struct mddev *mddev = llbitmap->mddev;
736 enum llbitmap_state state = BitNone;
737 bool level_456 = raid_is_456(llbitmap->mddev);
738 bool need_resync = false;
739 bool need_recovery = false;
740
741 if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
742 return BitNone;
743
744 if (action == BitmapActionInit) {
745 llbitmap_init_state(llbitmap);
746 return BitNone;
747 }
748
749 while (start <= end) {
750 enum llbitmap_state c = llbitmap_read(llbitmap, start);
751
752 if (c < 0 || c >= BitStateCount) {
753 pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
754 __func__, start, c, action);
755 state = BitNeedSync;
756 goto write_bitmap;
757 }
758
759 if (c == BitNeedSync || c == BitNeedSyncUnwritten)
760 need_resync = !mddev->degraded;
761
762 state = state_machine[c][action];
763 write_bitmap:
764 if (unlikely(mddev->degraded)) {
765 /* For degraded array, mark new data as need sync. */
766 if (state == BitDirty &&
767 action == BitmapActionStartwrite)
768 state = BitNeedSync;
769 /*
770 * For degraded array, resync dirty data as well, noted
771 * if array is still degraded after resync is done, all
772 * new data will still be dirty until array is clean.
773 */
774 else if (c == BitDirty &&
775 action == BitmapActionStartsync)
776 state = BitSyncing;
777 } else if (c == BitUnwritten && state == BitDirty &&
778 action == BitmapActionStartwrite && level_456) {
779 /* Delay raid456 initial recovery to first write. */
780 state = BitNeedSync;
781 }
782
783 if (state == BitNone) {
784 start++;
785 continue;
786 }
787
788 llbitmap_write(llbitmap, state, start);
789 if (state == BitNeedSync || state == BitNeedSyncUnwritten)
790 need_resync = !mddev->degraded;
791 else if (state == BitDirty &&
792 !timer_pending(&llbitmap->pending_timer))
793 mod_timer(&llbitmap->pending_timer,
794 jiffies + mddev->bitmap_info.daemon_sleep * HZ);
795
796 start++;
797 }
798
799 if (need_resync && level_456)
800 need_recovery = true;
801
802 if (need_recovery) {
803 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
804 set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
805 md_wakeup_thread(mddev->thread);
806 } else if (need_resync) {
807 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
808 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
809 md_wakeup_thread(mddev->thread);
810 }
811
812 return state;
813 }
814
llbitmap_raise_barrier(struct llbitmap * llbitmap,int page_idx)815 static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
816 {
817 struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
818
819 retry:
820 if (likely(percpu_ref_tryget_live(&pctl->active))) {
821 WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
822 return;
823 }
824
825 wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
826 goto retry;
827 }
828
llbitmap_release_barrier(struct llbitmap * llbitmap,int page_idx)829 static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
830 {
831 struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
832
833 percpu_ref_put(&pctl->active);
834 }
835
llbitmap_suspend_timeout(struct llbitmap * llbitmap,int page_idx)836 static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
837 {
838 struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
839
840 percpu_ref_kill(&pctl->active);
841
842 if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
843 llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
844 percpu_ref_resurrect(&pctl->active);
845 return -ETIMEDOUT;
846 }
847
848 return 0;
849 }
850
llbitmap_resume(struct llbitmap * llbitmap,int page_idx)851 static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
852 {
853 struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
854
855 pctl->expire = LONG_MAX;
856 percpu_ref_resurrect(&pctl->active);
857 wake_up(&pctl->wait);
858 }
859
llbitmap_check_support(struct mddev * mddev)860 static int llbitmap_check_support(struct mddev *mddev)
861 {
862 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
863 pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
864 mdname(mddev));
865 return -EBUSY;
866 }
867
868 if (mddev->bitmap_info.space == 0) {
869 if (mddev->bitmap_info.default_space == 0) {
870 pr_notice("md/llbitmap: %s: no space for bitmap\n",
871 mdname(mddev));
872 return -ENOSPC;
873 }
874 }
875
876 if (!mddev->persistent) {
877 pr_notice("md/llbitmap: %s: array must be persistent\n",
878 mdname(mddev));
879 return -EOPNOTSUPP;
880 }
881
882 if (mddev->bitmap_info.file) {
883 pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
884 mdname(mddev));
885 return -EOPNOTSUPP;
886 }
887
888 if (mddev->bitmap_info.external) {
889 pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
890 mdname(mddev));
891 return -EOPNOTSUPP;
892 }
893
894 if (mddev_is_dm(mddev)) {
895 pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
896 mdname(mddev));
897 return -EOPNOTSUPP;
898 }
899
900 return 0;
901 }
902
llbitmap_init(struct llbitmap * llbitmap)903 static int llbitmap_init(struct llbitmap *llbitmap)
904 {
905 struct mddev *mddev = llbitmap->mddev;
906 sector_t blocks = mddev->resync_max_sectors;
907 unsigned long chunksize = MIN_CHUNK_SIZE;
908 unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
909 unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
910 int ret;
911
912 while (chunks > space) {
913 chunksize = chunksize << 1;
914 chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
915 }
916
917 llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
918 llbitmap->chunkshift = ffz(~chunksize);
919 llbitmap->chunksize = chunksize;
920 llbitmap->chunks = chunks;
921 mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;
922
923 ret = llbitmap_cache_pages(llbitmap);
924 if (ret)
925 return ret;
926
927 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
928 BitmapActionInit);
929 /* flush initial llbitmap to disk */
930 __llbitmap_flush(mddev);
931
932 return 0;
933 }
934
llbitmap_read_sb(struct llbitmap * llbitmap)935 static int llbitmap_read_sb(struct llbitmap *llbitmap)
936 {
937 struct mddev *mddev = llbitmap->mddev;
938 unsigned long daemon_sleep;
939 unsigned long chunksize;
940 unsigned long events;
941 struct page *sb_page;
942 bitmap_super_t *sb;
943 int ret = -EINVAL;
944
945 if (!mddev->bitmap_info.offset) {
946 pr_err("md/llbitmap: %s: no super block found", mdname(mddev));
947 return -EINVAL;
948 }
949
950 sb_page = llbitmap_read_page(llbitmap, 0);
951 if (IS_ERR(sb_page)) {
952 pr_err("md/llbitmap: %s: read super block failed",
953 mdname(mddev));
954 return -EIO;
955 }
956
957 sb = kmap_local_page(sb_page);
958 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
959 pr_err("md/llbitmap: %s: invalid super block magic number",
960 mdname(mddev));
961 goto out_put_page;
962 }
963
964 if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
965 pr_err("md/llbitmap: %s: invalid super block version",
966 mdname(mddev));
967 goto out_put_page;
968 }
969
970 if (memcmp(sb->uuid, mddev->uuid, 16)) {
971 pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
972 mdname(mddev));
973 goto out_put_page;
974 }
975
976 if (mddev->bitmap_info.space == 0) {
977 int room = le32_to_cpu(sb->sectors_reserved);
978
979 if (room)
980 mddev->bitmap_info.space = room;
981 else
982 mddev->bitmap_info.space = mddev->bitmap_info.default_space;
983 }
984 llbitmap->flags = le32_to_cpu(sb->state);
985 if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
986 ret = llbitmap_init(llbitmap);
987 goto out_put_page;
988 }
989
990 chunksize = le32_to_cpu(sb->chunksize);
991 if (!is_power_of_2(chunksize)) {
992 pr_err("md/llbitmap: %s: chunksize not a power of 2",
993 mdname(mddev));
994 goto out_put_page;
995 }
996
997 if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
998 mddev->bitmap_info.space << SECTOR_SHIFT)) {
999 pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu",
1000 mdname(mddev), chunksize, mddev->resync_max_sectors,
1001 mddev->bitmap_info.space);
1002 goto out_put_page;
1003 }
1004
1005 daemon_sleep = le32_to_cpu(sb->daemon_sleep);
1006 if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
1007 pr_err("md/llbitmap: %s: daemon sleep %lu period out of range",
1008 mdname(mddev), daemon_sleep);
1009 goto out_put_page;
1010 }
1011
1012 events = le64_to_cpu(sb->events);
1013 if (events < mddev->events) {
1014 pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery",
1015 mdname(mddev), events, mddev->events);
1016 set_bit(BITMAP_STALE, &llbitmap->flags);
1017 }
1018
1019 sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
1020 mddev->bitmap_info.chunksize = chunksize;
1021 mddev->bitmap_info.daemon_sleep = daemon_sleep;
1022
1023 llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
1024 llbitmap->chunksize = chunksize;
1025 llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
1026 llbitmap->chunkshift = ffz(~chunksize);
1027 ret = llbitmap_cache_pages(llbitmap);
1028
1029 out_put_page:
1030 __free_page(sb_page);
1031 kunmap_local(sb);
1032 return ret;
1033 }
1034
/*
 * Timer callback: kick daemon_work to let it clear dirty bits for expired
 * pages.
 *
 * Runs in timer (softirq) context, so the real work is deferred to
 * md_llbitmap_io_wq. If the previous daemon_work has not finished yet, don't
 * queue it again; record BITMAP_DAEMON_BUSY instead so the running daemon
 * retries once it is done (see md_llbitmap_daemon_fn()).
 */
static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
{
	struct llbitmap *llbitmap =
		container_of(pending_timer, struct llbitmap, pending_timer);

	if (work_busy(&llbitmap->daemon_work)) {
		pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
			mdname(llbitmap->mddev),
			llbitmap->mddev->bitmap_info.daemon_sleep);
		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
		return;
	}

	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
}
1050
/*
 * Daemon work: walk all bitmap pages and run the BitmapActionDaemon state
 * transition (clearing dirty bits) for every page that is either flagged
 * LLPageFlush or whose idle period has expired.
 */
static void md_llbitmap_daemon_fn(struct work_struct *work)
{
	struct llbitmap *llbitmap =
		container_of(work, struct llbitmap, daemon_work);
	unsigned long start;
	unsigned long end;
	bool restart;
	int idx;

	/* Don't clear dirty bits while the array is degraded. */
	if (llbitmap->mddev->degraded)
		return;
retry:
	/*
	 * Page 0 also holds the superblock, so it covers fewer chunks
	 * (PAGE_SIZE - BITMAP_DATA_OFFSET) than the following pages.
	 */
	start = 0;
	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
	restart = false;

	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (idx > 0) {
			/* Subsequent pages cover a full PAGE_SIZE of chunks. */
			start = end + 1;
			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
		}

		/* Not expired and no forced flush: revisit this page later. */
		if (!test_bit(LLPageFlush, &pctl->flags) &&
		    time_before(jiffies, pctl->expire)) {
			restart = true;
			continue;
		}

		/* Suspend IO to this page before changing its bits. */
		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
			pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
				mdname(llbitmap->mddev), __func__, idx);
			continue;
		}

		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
		llbitmap_resume(llbitmap, idx);
	}

	/*
	 * If the daemon took a long time to finish, retry to prevent missing
	 * clearing dirty bits.
	 */
	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
		goto retry;

	/* If some page is dirty but not expired, setup timer again */
	if (restart)
		mod_timer(&llbitmap->pending_timer,
			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
}
1103
/*
 * Allocate and initialize the lockless bitmap for @mddev and read the
 * on-disk superblock. On success mddev->bitmap points to the new bitmap;
 * on failure everything is freed and mddev->bitmap stays NULL.
 */
static int llbitmap_create(struct mddev *mddev)
{
	struct llbitmap *llbitmap;
	int ret;

	ret = llbitmap_check_support(mddev);
	if (ret)
		return ret;

	llbitmap = kzalloc_obj(*llbitmap);
	if (!llbitmap)
		return -ENOMEM;

	llbitmap->mddev = mddev;
	/* Bitmap IO is done in units of the device's logical block size. */
	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;

	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
	atomic_set(&llbitmap->behind_writes, 0);
	init_waitqueue_head(&llbitmap->behind_wait);

	/*
	 * NOTE(review): mddev->bitmap is published before llbitmap_read_sb()
	 * runs — presumably helpers reached from read_sb rely on it; the
	 * mutex keeps other bitmap_info users out meanwhile.
	 */
	mutex_lock(&mddev->bitmap_info.mutex);
	mddev->bitmap = llbitmap;
	ret = llbitmap_read_sb(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
	if (ret) {
		kfree(llbitmap);
		mddev->bitmap = NULL;
	}

	return ret;
}
1137
/*
 * Recompute the bitmap geometry for a new array size of @blocks sectors.
 * A @chunksize of 0 keeps the current chunksize; it is then doubled as many
 * times as needed for one byte per chunk to fit in the reserved space.
 */
static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long nr_chunks;

	if (!chunksize)
		chunksize = llbitmap->chunksize;

	/* Grow the chunksize until the bitmap fits the reserved space. */
	nr_chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	while (nr_chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
		chunksize <<= 1;
		nr_chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = nr_chunks;

	return 0;
}
1159
llbitmap_load(struct mddev * mddev)1160 static int llbitmap_load(struct mddev *mddev)
1161 {
1162 enum llbitmap_action action = BitmapActionReload;
1163 struct llbitmap *llbitmap = mddev->bitmap;
1164
1165 if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
1166 action = BitmapActionStale;
1167
1168 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
1169 return 0;
1170 }
1171
/*
 * Tear down the bitmap for @mddev: quiesce deferred work first, then clear
 * mddev->bitmap and free everything. Done under bitmap_info.mutex so
 * concurrent sysfs accessors see either a valid bitmap or NULL.
 */
static void llbitmap_destroy(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	mutex_lock(&mddev->bitmap_info.mutex);

	/* Stop deferred work before freeing the structures it uses. */
	timer_delete_sync(&llbitmap->pending_timer);
	flush_workqueue(md_llbitmap_io_wq);
	flush_workqueue(md_llbitmap_unplug_wq);

	mddev->bitmap = NULL;
	llbitmap_free_pages(llbitmap);
	kfree(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
}
1190
/*
 * Mark the chunks covered by [offset, offset + sectors) as being written:
 * raise the barrier on every bitmap page involved, then run the Startwrite
 * action on the affected bit range. Paired with llbitmap_end_write(), which
 * drops the barriers again.
 */
static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long first_bit = offset >> llbitmap->chunkshift;
	unsigned long last_bit = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page = (first_bit + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int last_page = (last_bit + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	for (; page <= last_page; page++)
		llbitmap_raise_barrier(llbitmap, page);

	llbitmap_state_machine(llbitmap, first_bit, last_bit,
			       BitmapActionStartwrite);
}
1207
/*
 * Counterpart of llbitmap_start_write(): release the page barriers that were
 * raised for the chunks covered by [offset, offset + sectors).
 */
static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
			       unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long first_bit = offset >> llbitmap->chunkshift;
	unsigned long last_bit = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page = (first_bit + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int last_page = (last_bit + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	for (; page <= last_page; page++)
		llbitmap_release_barrier(llbitmap, page);
}
1222
/*
 * Like llbitmap_start_write(), but for discard: raise the page barriers and
 * run the Discard action on the affected bit range.
 */
static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
				   unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	/*
	 * Round the first chunk up: a chunk only partially covered at the
	 * head of the discard still holds valid data and must not be marked.
	 */
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}

	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
}
1239
/*
 * Counterpart of llbitmap_start_discard(): release the page barriers that
 * were raised for the discarded chunk range.
 */
static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long first_bit = DIV_ROUND_UP_SECTOR_T(offset,
							llbitmap->chunksize);
	unsigned long last_bit = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page = (first_bit + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int last_page = (last_bit + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	for (; page <= last_page; page++)
		llbitmap_release_barrier(llbitmap, page);
}
1254
/*
 * Workqueue context: write out every dirty bitmap page, wait for the IO to
 * complete, then signal the waiter in llbitmap_unplug().
 */
static void llbitmap_unplug_fn(struct work_struct *work)
{
	struct llbitmap_unplug_work *unplug_work =
		container_of(work, struct llbitmap_unplug_work, work);
	struct llbitmap *llbitmap = unplug_work->llbitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);

	for (i = 0; i < llbitmap->nr_pages; i++) {
		/*
		 * The plain test_bit() first avoids the atomic RMW of
		 * test_and_clear_bit() for pages that are not dirty at all.
		 */
		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			continue;

		llbitmap_write_page(llbitmap, i);
	}

	blk_finish_plug(&plug);
	/* Wait for all issued bitmap writes to land before completing. */
	md_super_wait(llbitmap->mddev);
	complete(unplug_work->done);
}
1277
llbitmap_dirty(struct llbitmap * llbitmap)1278 static bool llbitmap_dirty(struct llbitmap *llbitmap)
1279 {
1280 int i;
1281
1282 for (i = 0; i < llbitmap->nr_pages; i++)
1283 if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
1284 return true;
1285
1286 return false;
1287 }
1288
/*
 * Flush dirty bitmap pages to disk before the data IO they cover is issued.
 * The writes are handed off to md_llbitmap_unplug_wq and waited for, since
 * issuing them directly from submit_bio() context would deadlock (below).
 */
static void llbitmap_unplug(struct mddev *mddev, bool sync)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct llbitmap *llbitmap = mddev->bitmap;
	struct llbitmap_unplug_work unplug_work = {
		.llbitmap = llbitmap,
		.done = &done,
	};

	if (!llbitmap_dirty(llbitmap))
		return;

	/*
	 * Issue new bitmap IO under submit_bio() context will deadlock:
	 * - the bio will wait for bitmap bio to be done, before it can be
	 * issued;
	 * - bitmap bio will be added to current->bio_list and wait for this
	 * bio to be issued;
	 */
	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}
1313
/*
 * Force to write all bitmap pages to disk, called when stopping the array, or
 * every daemon_sleep seconds when sync_thread is running.
 */
static void __llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	struct blk_plug plug;
	int i;

	/* Plug so the per-page writes can be batched before being issued. */
	blk_start_plug(&plug);
	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* mark all blocks as dirty */
		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
		llbitmap_write_page(llbitmap, i);
	}
	blk_finish_plug(&plug);
	/* Wait for all the writes to land before returning. */
	md_super_wait(llbitmap->mddev);
}
1336
/*
 * Synchronous flush: run the daemon once over every page (to clear dirty
 * bits via the state machine), then force-write all pages to disk.
 */
static void llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	/* Make the daemon handle every page regardless of its expiry time. */
	for (i = 0; i < llbitmap->nr_pages; i++)
		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);

	/* Run daemon_work now instead of waiting for the pending timer. */
	timer_delete_sync(&llbitmap->pending_timer);
	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
	flush_work(&llbitmap->daemon_work);

	__llbitmap_flush(mddev);
}
1351
1352 /* This is used for raid5 lazy initial recovery */
llbitmap_blocks_synced(struct mddev * mddev,sector_t offset)1353 static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
1354 {
1355 struct llbitmap *llbitmap = mddev->bitmap;
1356 unsigned long p = offset >> llbitmap->chunkshift;
1357 enum llbitmap_state c = llbitmap_read(llbitmap, p);
1358
1359 return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
1360 }
1361
/*
 * Return the number of sectors starting at @offset that sync_thread may
 * skip, or 0 if the chunk containing @offset must be synced.
 */
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	/* Sectors remaining in the chunk that contains @offset. */
	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	/* always skip unwritten blocks */
	if (c == BitUnwritten)
		return blocks;

	/* Skip CleanUnwritten - no user data, will be reset after recovery */
	if (c == BitCleanUnwritten)
		return blocks;

	/* For degraded array, don't skip */
	if (mddev->degraded)
		return 0;

	/* For resync also skip clean/dirty blocks */
	if ((c == BitClean || c == BitDirty) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		return blocks;

	return 0;
}
1389
/*
 * Called by sync_thread before syncing the chunk at @offset: sets *blocks to
 * the remaining sectors of that chunk and returns true if it is now in a
 * syncing state.
 */
static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
				sector_t *blocks, bool degraded)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state state;

	/*
	 * Before recovery starts, convert CleanUnwritten to Unwritten.
	 * This ensures the new disk won't have stale parity data.
	 */
	if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
		llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
				       BitmapActionClearUnwritten);


	/*
	 * Handle one bit at a time, this is much simpler. And it doesn't matter
	 * if md_do_sync() loop more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
	return state == BitSyncing || state == BitSyncingUnwritten;
}
1415
/* Something is wrong, sync_thread stop at @offset */
static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	/*
	 * Abort the sync from this chunk through the end of the array; those
	 * chunks remain in a need-sync state for a later retry.
	 */
	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
			       BitmapActionAbortsync);
}
1427
1428 /* A full sync_thread is finished */
llbitmap_close_sync(struct mddev * mddev)1429 static void llbitmap_close_sync(struct mddev *mddev)
1430 {
1431 struct llbitmap *llbitmap = mddev->bitmap;
1432 int i;
1433
1434 for (i = 0; i < llbitmap->nr_pages; i++) {
1435 struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
1436
1437 /* let daemon_fn clear dirty bits immediately */
1438 WRITE_ONCE(pctl->expire, jiffies);
1439 }
1440
1441 llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
1442 BitmapActionEndsync);
1443 }
1444
/*
 * sync_thread have reached @sector, update metadata every daemon_sleep seconds,
 * just in case sync_thread have to restart after power failure.
 */
static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
				   bool force)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	/* sector == 0 marks the start of a new pass: just arm the timestamp */
	if (sector == 0) {
		llbitmap->last_end_sync = jiffies;
		return;
	}

	/* Rate-limit metadata updates to once per daemon_sleep seconds. */
	if (time_before(jiffies, llbitmap->last_end_sync +
			HZ * mddev->bitmap_info.daemon_sleep))
		return;

	/* Wait for in-flight resync IO so everything below @sector is done. */
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	mddev->curr_resync_completed = sector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
			       BitmapActionEndsync);
	__llbitmap_flush(mddev);

	llbitmap->last_end_sync = jiffies;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
1474
llbitmap_enabled(void * data,bool flush)1475 static bool llbitmap_enabled(void *data, bool flush)
1476 {
1477 struct llbitmap *llbitmap = data;
1478
1479 return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
1480 }
1481
llbitmap_dirty_bits(struct mddev * mddev,unsigned long s,unsigned long e)1482 static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
1483 unsigned long e)
1484 {
1485 llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
1486 }
1487
/*
 * Write the superblock region (the first BITMAP_DATA_OFFSET bytes of page 0)
 * to disk and wait for the IO to complete.
 */
static void llbitmap_write_sb(struct llbitmap *llbitmap)
{
	/* Number of io_size blocks the superblock region spans. */
	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);

	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
	llbitmap_write_page(llbitmap, 0);
	md_super_wait(llbitmap->mddev);
}
1496
/*
 * Refresh the on-disk superblock fields from in-memory state and write the
 * superblock out. No-op if a previous bitmap write already failed.
 */
static void llbitmap_update_sb(void *data)
{
	struct llbitmap *llbitmap = data;
	struct mddev *mddev = llbitmap->mddev;
	struct page *sb_page;
	bitmap_super_t *sb;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return;

	/*
	 * NOTE(review): presumably this returns the cached page 0 that
	 * llbitmap_write_sb() below writes via pctl[0], so no page is leaked
	 * here — confirm against llbitmap_read_page().
	 */
	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("%s: %s: read super block failed", __func__,
		       mdname(mddev));
		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
		return;
	}

	/* events_cleared can never be ahead of the array's event count. */
	if (mddev->events < llbitmap->events_cleared)
		llbitmap->events_cleared = mddev->events;

	sb = kmap_local_page(sb_page);
	sb->events = cpu_to_le64(mddev->events);
	sb->state = cpu_to_le32(llbitmap->flags);
	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);

	kunmap_local(sb);
	llbitmap_write_sb(llbitmap);
}
1530
llbitmap_get_stats(void * data,struct md_bitmap_stats * stats)1531 static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
1532 {
1533 struct llbitmap *llbitmap = data;
1534
1535 memset(stats, 0, sizeof(*stats));
1536
1537 stats->missing_pages = 0;
1538 stats->pages = llbitmap->nr_pages;
1539 stats->file_pages = llbitmap->nr_pages;
1540
1541 stats->behind_writes = atomic_read(&llbitmap->behind_writes);
1542 stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
1543 stats->events_cleared = llbitmap->events_cleared;
1544
1545 return 0;
1546 }
1547
1548 /* just flag all pages as needing to be written */
llbitmap_write_all(struct mddev * mddev)1549 static void llbitmap_write_all(struct mddev *mddev)
1550 {
1551 int i;
1552 struct llbitmap *llbitmap = mddev->bitmap;
1553
1554 for (i = 0; i < llbitmap->nr_pages; i++) {
1555 struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
1556
1557 set_bit(LLPageDirty, &pctl->flags);
1558 bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
1559 }
1560 }
1561
/* Account one more in-flight write-behind IO. */
static void llbitmap_start_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	atomic_inc(&llbitmap->behind_writes);
}
1568
/* Drop one write-behind IO; wake waiters when the last one completes. */
static void llbitmap_end_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (atomic_dec_and_test(&llbitmap->behind_writes))
		wake_up(&llbitmap->behind_wait);
}
1576
/* Block until all in-flight write-behind IO has completed. */
static void llbitmap_wait_behind_writes(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	wait_event(llbitmap->behind_wait,
		   atomic_read(&llbitmap->behind_writes) == 0);

}
1588
/*
 * sysfs "bits" attribute: count how many chunks are in each llbitmap state
 * and print a per-state summary.
 */
static ssize_t bits_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	int bits[BitStateCount] = {0};
	loff_t start = 0;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "bitmap io error\n");
	}

	while (start < llbitmap->chunks) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		/*
		 * NOTE(review): if enum llbitmap_state has an unsigned
		 * underlying type, "c < 0" can never be true — confirm
		 * llbitmap_read()'s error convention.
		 */
		if (c < 0 || c >= BitStateCount)
			pr_err("%s: invalid bit %llu state %d\n",
			       __func__, start, c);
		else
			bits[c]++;
		start++;
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page,
		       "unwritten %d\nclean %d\ndirty %d\n"
		       "need sync %d\nsyncing %d\n"
		       "need sync unwritten %d\nsyncing unwritten %d\n"
		       "clean unwritten %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing],
		       bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
		       bits[BitCleanUnwritten]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
1631
metadata_show(struct mddev * mddev,char * page)1632 static ssize_t metadata_show(struct mddev *mddev, char *page)
1633 {
1634 struct llbitmap *llbitmap;
1635 ssize_t ret;
1636
1637 mutex_lock(&mddev->bitmap_info.mutex);
1638 llbitmap = mddev->bitmap;
1639 if (!llbitmap) {
1640 mutex_unlock(&mddev->bitmap_info.mutex);
1641 return sprintf(page, "no bitmap\n");
1642 }
1643
1644 ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
1645 llbitmap->chunksize, llbitmap->chunkshift,
1646 llbitmap->chunks, mddev->bitmap_info.offset,
1647 llbitmap->mddev->bitmap_info.daemon_sleep);
1648 mutex_unlock(&mddev->bitmap_info.mutex);
1649
1650 return ret;
1651 }
1652
1653 static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);
1654
/* sysfs "daemon_sleep" attribute: current daemon period in seconds. */
static ssize_t
daemon_sleep_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
}
1660
1661 static ssize_t
daemon_sleep_store(struct mddev * mddev,const char * buf,size_t len)1662 daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
1663 {
1664 unsigned long timeout;
1665 int rv = kstrtoul(buf, 10, &timeout);
1666
1667 if (rv)
1668 return rv;
1669
1670 mddev->bitmap_info.daemon_sleep = timeout;
1671 return len;
1672 }
1673
1674 static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);
1675
/* sysfs "barrier_idle" attribute: current barrier idle timeout in seconds. */
static ssize_t
barrier_idle_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
}
1683
1684 static ssize_t
barrier_idle_store(struct mddev * mddev,const char * buf,size_t len)1685 barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
1686 {
1687 struct llbitmap *llbitmap = mddev->bitmap;
1688 unsigned long timeout;
1689 int rv = kstrtoul(buf, 10, &timeout);
1690
1691 if (rv)
1692 return rv;
1693
1694 llbitmap->barrier_idle = timeout;
1695 return len;
1696 }
1697
1698 static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
1699
/*
 * sysfs "proactive_sync" attribute (write-only): kick off a proactive sync
 * of all Unwritten regions. Restricted to RAID 4/5/6.
 */
static ssize_t
proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap;

	/* Only for RAID-456 */
	if (!raid_is_456(mddev))
		return -EINVAL;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return -ENODEV;
	}

	/* Trigger proactive sync on all Unwritten regions */
	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionProactiveSync);

	mutex_unlock(&mddev->bitmap_info.mutex);
	return len;
}

static struct md_sysfs_entry llbitmap_proactive_sync =
	__ATTR(proactive_sync, 0200, NULL, proactive_sync_store);
1726
/* sysfs attributes exposed in the "llbitmap" group of arrays using it. */
static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	&llbitmap_proactive_sync.attr,
	NULL
};

static struct attribute_group md_llbitmap_group = {
	.name = "llbitmap",
	.attrs = md_llbitmap_attrs,
};
1740
/* Callback table registered with md core for the "llbitmap" bitmap type. */
static struct bitmap_operations llbitmap_ops = {
	.head = {
		.type	= MD_BITMAP,
		.id	= ID_LLBITMAP,
		.name	= "llbitmap",
	},

	.enabled		= llbitmap_enabled,
	.create			= llbitmap_create,
	.resize			= llbitmap_resize,
	.load			= llbitmap_load,
	.destroy		= llbitmap_destroy,

	.start_write		= llbitmap_start_write,
	.end_write		= llbitmap_end_write,
	.start_discard		= llbitmap_start_discard,
	.end_discard		= llbitmap_end_discard,
	.unplug			= llbitmap_unplug,
	.flush			= llbitmap_flush,

	.start_behind_write	= llbitmap_start_behind_write,
	.end_behind_write	= llbitmap_end_behind_write,
	.wait_behind_writes	= llbitmap_wait_behind_writes,

	.blocks_synced		= llbitmap_blocks_synced,
	.skip_sync_blocks	= llbitmap_skip_sync_blocks,
	.start_sync		= llbitmap_start_sync,
	.end_sync		= llbitmap_end_sync,
	.close_sync		= llbitmap_close_sync,
	.cond_end_sync		= llbitmap_cond_end_sync,

	.update_sb		= llbitmap_update_sb,
	.get_stats		= llbitmap_get_stats,
	.dirty_bits		= llbitmap_dirty_bits,
	.write_all		= llbitmap_write_all,

	.group			= &md_llbitmap_group,
};
1779
md_llbitmap_init(void)1780 int md_llbitmap_init(void)
1781 {
1782 md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
1783 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
1784 if (!md_llbitmap_io_wq)
1785 return -ENOMEM;
1786
1787 md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
1788 WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
1789 if (!md_llbitmap_unplug_wq) {
1790 destroy_workqueue(md_llbitmap_io_wq);
1791 md_llbitmap_io_wq = NULL;
1792 return -ENOMEM;
1793 }
1794
1795 return register_md_submodule(&llbitmap_ops.head);
1796 }
1797
md_llbitmap_exit(void)1798 void md_llbitmap_exit(void)
1799 {
1800 destroy_workqueue(md_llbitmap_io_wq);
1801 md_llbitmap_io_wq = NULL;
1802 destroy_workqueue(md_llbitmap_unplug_wq);
1803 md_llbitmap_unplug_wq = NULL;
1804 unregister_md_submodule(&llbitmap_ops.head);
1805 }
1806