1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * linux/fs/buffer.c
4 *
5 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
6 */
7
8 /*
9 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 *
11 * Removed a lot of unnecessary code and simplified things now that
12 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 *
14 * Speed up hash, lru, and free list operations. Use gfp() for allocating
15 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 *
19 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20 */
21
22 #include <linux/kernel.h>
23 #include <linux/sched/signal.h>
24 #include <linux/syscalls.h>
25 #include <linux/fs.h>
26 #include <linux/iomap.h>
27 #include <linux/mm.h>
28 #include <linux/percpu.h>
29 #include <linux/slab.h>
30 #include <linux/capability.h>
31 #include <linux/blkdev.h>
32 #include <linux/blk-crypto.h>
33 #include <linux/file.h>
34 #include <linux/quotaops.h>
35 #include <linux/highmem.h>
36 #include <linux/export.h>
37 #include <linux/backing-dev.h>
38 #include <linux/writeback.h>
39 #include <linux/hash.h>
40 #include <linux/suspend.h>
41 #include <linux/buffer_head.h>
42 #include <linux/task_io_accounting_ops.h>
43 #include <linux/bio.h>
44 #include <linux/cpu.h>
45 #include <linux/bitops.h>
46 #include <linux/mpage.h>
47 #include <linux/bit_spinlock.h>
48 #include <linux/pagevec.h>
49 #include <linux/sched/mm.h>
50 #include <trace/events/block.h>
51 #include <linux/fscrypt.h>
52 #include <linux/fsverity.h>
53 #include <linux/sched/isolation.h>
54
55 #include "internal.h"
56
57 static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
58 enum rw_hint hint, struct writeback_control *wbc);
59
60 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
61
/*
 * Note an access to a buffer: emit the tracepoint and mark the backing
 * folio accessed so reclaim keeps it on the active LRU.
 */
inline void touch_buffer(struct buffer_head *bh)
{
	trace_block_touch_buffer(bh);
	folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);
68
/*
 * Slow path of lock_buffer(): sleep uninterruptibly (accounted as IO
 * wait) until BH_Lock can be acquired.
 */
void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);
74
/*
 * Release BH_Lock and wake any waiters sleeping on it.
 */
void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	/* Order the bit clear before the waitqueue check in wake_up_bit(). */
	smp_mb__after_atomic();
	wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);
82
83 /*
84 * Returns if the folio has dirty or writeback buffers. If all the buffers
85 * are unlocked and clean then the folio_test_dirty information is stale. If
86 * any of the buffers are locked, it is assumed they are locked for IO.
87 */
buffer_check_dirty_writeback(struct folio * folio,bool * dirty,bool * writeback)88 void buffer_check_dirty_writeback(struct folio *folio,
89 bool *dirty, bool *writeback)
90 {
91 struct buffer_head *head, *bh;
92 *dirty = false;
93 *writeback = false;
94
95 BUG_ON(!folio_test_locked(folio));
96
97 head = folio_buffers(folio);
98 if (!head)
99 return;
100
101 if (folio_test_writeback(folio))
102 *writeback = true;
103
104 bh = head;
105 do {
106 if (buffer_locked(bh))
107 *writeback = true;
108
109 if (buffer_dirty(bh))
110 *dirty = true;
111
112 bh = bh->b_this_page;
113 } while (bh != head);
114 }
115
116 /*
117 * Block until a buffer comes unlocked. This doesn't stop it
118 * from becoming locked again - you have to lock it yourself
119 * if you want to preserve its state.
120 */
void __wait_on_buffer(struct buffer_head *bh)
{
	/* Sleep (accounted as IO wait) until BH_Lock is clear. */
	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
126
buffer_io_error(struct buffer_head * bh,char * msg)127 static void buffer_io_error(struct buffer_head *bh, char *msg)
128 {
129 if (!test_bit(BH_Quiet, &bh->b_state))
130 printk_ratelimited(KERN_ERR
131 "Buffer I/O error on dev %pg, logical block %llu%s\n",
132 bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
133 }
134
135 /*
136 * End-of-IO handler helper function which does not touch the bh after
137 * unlocking it.
138 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
139 * a race there is benign: unlock_buffer() only use the bh's address for
140 * hashing after unlocking the buffer, so it doesn't actually touch the bh
141 * itself.
142 */
__end_buffer_read_notouch(struct buffer_head * bh,int uptodate)143 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
144 {
145 if (uptodate) {
146 set_buffer_uptodate(bh);
147 } else {
148 /* This happens, due to failed read-ahead attempts. */
149 clear_buffer_uptodate(bh);
150 }
151 unlock_buffer(bh);
152 }
153
154 /*
155 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
156 * unlock the buffer.
157 */
end_buffer_read_sync(struct buffer_head * bh,int uptodate)158 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
159 {
160 put_bh(bh);
161 __end_buffer_read_notouch(bh, uptodate);
162 }
163 EXPORT_SYMBOL(end_buffer_read_sync);
164
/* Synchronous write completion: record the result, unlock, drop our ref. */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
178
/*
 * Look up the buffer_head for @block in @bdev's page cache.  On success
 * the bh is returned with an elevated refcount; NULL if the folio or a
 * matching buffer is absent.  @atomic callers cannot block and therefore
 * serialize against try_to_free_buffers() with i_private_lock rather
 * than the folio lock.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
	struct address_space *bd_mapping = bdev->bd_mapping;
	const int blkbits = bd_mapping->host->i_blkbits;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct folio *folio;
	int all_mapped = 1;
	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

	/* Convert the block number into a page cache index. */
	index = ((loff_t)block << blkbits) / PAGE_SIZE;
	folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
	if (IS_ERR(folio))
		goto out;

	/*
	 * Folio lock protects the buffers. Callers that cannot block
	 * will fallback to serializing vs try_to_free_buffers() via
	 * the i_private_lock.
	 */
	if (atomic)
		spin_lock(&bd_mapping->i_private_lock);
	else
		folio_lock(folio);

	head = folio_buffers(folio);
	if (!head)
		goto out_unlock;
	/*
	 * Upon a noref migration, the folio lock serializes here;
	 * otherwise bail.
	 */
	if (test_bit_acquire(BH_Migrate, &head->b_state)) {
		WARN_ON(!atomic);
		goto out_unlock;
	}

	/* Scan the ring of buffers for one mapped to @block. */
	bh = head;
	do {
		if (!buffer_mapped(bh))
			all_mapped = 0;
		else if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped. This is due to various races between
	 * file io on the block device and getblk. It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
	if (all_mapped && __ratelimit(&last_warned)) {
		printk("__find_get_block_slow() failed. block=%llu, "
		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
		       "device %pg blocksize: %d\n",
		       (unsigned long long)block,
		       (unsigned long long)bh->b_blocknr,
		       bh->b_state, bh->b_size, bdev,
		       1 << blkbits);
	}
out_unlock:
	if (atomic)
		spin_unlock(&bd_mapping->i_private_lock);
	else
		folio_unlock(folio);
	folio_put(folio);
out:
	return ret;
}
255
/*
 * Async read completion for one buffer of a folio.  The last buffer to
 * complete ends the read on the whole folio; the scan for still-busy
 * buffers is serialized by the first bh's b_uptodate_lock.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct folio *folio;
	int folio_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	folio = bh->b_folio;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		buffer_io_error(bh, ", async page read");
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = folio_buffers(folio);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			folio_uptodate = 0;
		if (buffer_async_read(tmp)) {
			/* Another buffer is still under async read. */
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

	/* All buffers done: set the folio uptodate state and unlock it. */
	folio_end_read(folio, folio_uptodate);
	return;

still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
}
301
/* Deferred fscrypt/fsverity post-read processing state for one bh. */
struct postprocess_bh_ctx {
	struct work_struct work;	/* queued on the decrypt or verity wq */
	struct buffer_head *bh;		/* the buffer being post-processed */
	struct fsverity_info *vi;	/* non-NULL when verity applies */
};
307
verify_bh(struct work_struct * work)308 static void verify_bh(struct work_struct *work)
309 {
310 struct postprocess_bh_ctx *ctx =
311 container_of(work, struct postprocess_bh_ctx, work);
312 struct buffer_head *bh = ctx->bh;
313 bool valid;
314
315 valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
316 bh_offset(bh));
317 end_buffer_async_read(bh, valid);
318 kfree(ctx);
319 }
320
decrypt_bh(struct work_struct * work)321 static void decrypt_bh(struct work_struct *work)
322 {
323 struct postprocess_bh_ctx *ctx =
324 container_of(work, struct postprocess_bh_ctx, work);
325 struct buffer_head *bh = ctx->bh;
326 int err;
327
328 err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
329 bh_offset(bh));
330 if (err == 0 && ctx->vi) {
331 /*
332 * We use different work queues for decryption and for verity
333 * because verity may require reading metadata pages that need
334 * decryption, and we shouldn't recurse to the same workqueue.
335 */
336 INIT_WORK(&ctx->work, verify_bh);
337 fsverity_enqueue_verify_work(&ctx->work);
338 return;
339 }
340 end_buffer_async_read(bh, err == 0);
341 kfree(ctx);
342 }
343
344 /*
345 * I/O completion handler for block_read_full_folio() - pages
346 * which come unlocked at the end of I/O.
347 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_folio->mapping->host;
	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
	struct fsverity_info *vi = NULL;

	/* needed by ext4: skip verity for folios entirely past i_size */
	if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
		vi = fsverity_get_info(inode);

	/* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
	if (uptodate && (decrypt || vi)) {
		/* IO completion context: atomic allocation only. */
		struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);

		if (ctx) {
			ctx->bh = bh;
			ctx->vi = vi;
			if (decrypt) {
				INIT_WORK(&ctx->work, decrypt_bh);
				fscrypt_enqueue_decrypt_work(&ctx->work);
			} else {
				INIT_WORK(&ctx->work, verify_bh);
				fsverity_enqueue_verify_work(&ctx->work);
			}
			return;
		}
		/* Allocation failed: report the read as failed. */
		uptodate = 0;
	}
	end_buffer_async_read(bh, uptodate);
}
378
379 /*
380 * Completion handler for block_write_full_folio() - folios which are unlocked
381 * during I/O, and which have the writeback flag cleared upon I/O completion.
382 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct folio *folio;

	BUG_ON(!buffer_async_write(bh));

	folio = bh->b_folio;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost async page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}

	/* The first bh's b_uptodate_lock serializes the still-busy scan. */
	first = folio_buffers(folio);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			/* Some other buffer is still under writeout. */
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	/* Last async-write buffer completed: folio writeback is done. */
	folio_end_writeback(folio);
	return;

still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
}
421
422 /*
423 * If a page's buffers are under async readin (end_buffer_async_read
424 * completion) then there is a possibility that another thread of
425 * control could lock one of the buffers after it has completed
426 * but while some of the other buffers have not completed. This
427 * locked buffer would confuse end_buffer_async_read() into not unlocking
428 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
429 * that this buffer is not under async I/O.
430 *
431 * The page comes unlocked when it has no locked buffer_async buffers
432 * left.
433 *
434 * PageLocked prevents anyone starting new async I/O reads any of
435 * the buffers.
436 *
437 * PageWriteback is used to prevent simultaneous writeout of the same
438 * page.
439 *
440 * PageLocked prevents anyone from starting writeback of a page which is
441 * under read I/O (PageWriteback is only ever set against a locked page).
442 */
/* Arm @bh for async read completion via end_buffer_async_read_io(). */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read_io;
	set_buffer_async_read(bh);
}
448
/* Arm @bh for async writeback with a caller-supplied completion handler. */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}
455
/* Arm @bh for async writeback with the default completion handler. */
void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);
461
462
463 /*
464 * fs/buffer.c contains helper functions for buffer-backed address space's
465 * fsync functions. A common requirement for buffer-based filesystems is
466 * that certain data from the backing blockdev needs to be written out for
467 * a successful fsync(). For example, ext2 indirect blocks need to be
468 * written back and waited upon before fsync() returns.
469 *
470 * The functions mmb_mark_buffer_dirty(), mmb_sync(), mmb_has_buffers()
471 * and mmb_invalidate() are provided for the management of a list of dependent
472 * buffers in mapping_metadata_bhs struct.
473 *
474 * The locking is a little subtle: The list of buffer heads is protected by
475 * the lock in mapping_metadata_bhs so functions coming from bdev mapping
476 * (such as try_to_free_buffers()) need to safely get to mapping_metadata_bhs
477 * using RCU, grab the lock, verify we didn't race with somebody detaching the
478 * bh / moving it to different inode and only then proceeding.
479 */
480
mmb_init(struct mapping_metadata_bhs * mmb,struct address_space * mapping)481 void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
482 {
483 spin_lock_init(&mmb->lock);
484 INIT_LIST_HEAD(&mmb->list);
485 mmb->mapping = mapping;
486 }
487 EXPORT_SYMBOL(mmb_init);
488
/*
 * Detach @bh from @mmb's dependent-buffer list.  Caller must hold
 * mmb->lock.  b_mmb is cleared only after the list removal, since it is
 * read locklessly by remove_assoc_queue()/mmb_mark_buffer_dirty().
 */
static void __remove_assoc_queue(struct mapping_metadata_bhs *mmb,
				 struct buffer_head *bh)
{
	lockdep_assert_held(&mmb->lock);
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_mmb);
	bh->b_mmb = NULL;
}
497
/*
 * Detach @bh from whatever mapping_metadata_bhs list it is currently on,
 * if any.  Safe to call without any list lock held.
 */
static void remove_assoc_queue(struct buffer_head *bh)
{
	struct mapping_metadata_bhs *mmb;

	/*
	 * The locking dance is ugly here. We need to acquire the lock
	 * protecting the metadata bh list while possibly racing with bh
	 * being removed from the list or moved to a different one. We
	 * use RCU to pin mapping_metadata_bhs in memory to
	 * opportunistically acquire the lock and then recheck the bh
	 * didn't move under us.
	 */
	while (bh->b_mmb) {
		rcu_read_lock();
		mmb = READ_ONCE(bh->b_mmb);
		if (mmb) {
			spin_lock(&mmb->lock);
			/* Recheck under the lock: bh may have moved lists. */
			if (bh->b_mmb == mmb)
				__remove_assoc_queue(mmb, bh);
			spin_unlock(&mmb->lock);
		}
		rcu_read_unlock();
	}
}
522
/* Are any dependent metadata buffers queued on @mmb?  Lockless check. */
bool mmb_has_buffers(struct mapping_metadata_bhs *mmb)
{
	return !list_empty(&mmb->list);
}
EXPORT_SYMBOL_GPL(mmb_has_buffers);
528
529 /**
530 * mmb_sync - write out & wait upon all buffers in a list
531 * @mmb: the list of buffers to write
532 *
533 * Starts I/O against the buffers in the given list and waits upon
534 * that I/O. Basically, this is a convenience function for fsync(). @mmb is
535 * for a file or directory which needs those buffers to be written for a
536 * successful fsync().
537 *
538 * We have conflicting pressures: we want to make sure that all
539 * initially dirty buffers get waited on, but that any subsequently
540 * dirtied buffers don't. After all, we don't want fsync to last
541 * forever if somebody is actively writing to the file.
542 *
543 * Do this in two main stages: first we copy dirty buffers to a
544 * temporary inode list, queueing the writes as we go. Then we clean
545 * up, waiting for those writes to complete. mark_buffer_dirty_inode()
546 * doesn't touch b_assoc_buffers list if b_mmb is not NULL so we are sure the
547 * buffer stays on our list until IO completes (at which point it can be
548 * reaped).
549 */
int mmb_sync(struct mapping_metadata_bhs *mmb)
{
	struct buffer_head *bh;
	int err = 0;
	struct blk_plug plug;
	LIST_HEAD(tmp);

	if (!mmb_has_buffers(mmb))
		return 0;

	/* Batch the writes issued below under one plug. */
	blk_start_plug(&plug);

	/* Stage 1: move dirty/locked buffers to @tmp, kicking off writes. */
	spin_lock(&mmb->lock);
	while (!list_empty(&mmb->list)) {
		bh = BH_ENTRY(mmb->list.next);
		WARN_ON_ONCE(bh->b_mmb != mmb);
		__remove_assoc_queue(mmb, bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_mmb = mmb;
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(&mmb->lock);
				/*
				 * Ensure any pending I/O completes so that
				 * write_dirty_buffer() actually writes the
				 * current contents - it is a noop if I/O is
				 * still in flight on potentially older
				 * contents.
				 */
				write_dirty_buffer(bh, REQ_SYNC);

				/*
				 * Kick off IO for the previous mapping. Note
				 * that we will not run the very last mapping,
				 * wait_on_buffer() will do that for us
				 * through sync_buffer().
				 */
				brelse(bh);
				spin_lock(&mmb->lock);
			}
		}
	}

	spin_unlock(&mmb->lock);
	blk_finish_plug(&plug);
	spin_lock(&mmb->lock);

	/* Stage 2: wait on the IO queued above, newest first. */
	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		get_bh(bh);
		__remove_assoc_queue(mmb, bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			/* Redirtied meanwhile: keep it on the main list. */
			list_add(&bh->b_assoc_buffers, &mmb->list);
			bh->b_mmb = mmb;
		}
		spin_unlock(&mmb->lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(&mmb->lock);
	}
	spin_unlock(&mmb->lock);
	return err;
}
EXPORT_SYMBOL(mmb_sync);
623
624 /**
625 * mmb_fsync_noflush - fsync implementation for simple filesystems with
626 * metadata buffers list
627 *
628 * @file: file to synchronize
629 * @mmb: list of metadata bhs to flush
630 * @start: start offset in bytes
631 * @end: end offset in bytes (inclusive)
632 * @datasync: only synchronize essential metadata if true
633 *
634 * This is an implementation of the fsync method for simple filesystems which
635 * track all non-inode metadata in the buffers list hanging off the @mmb
636 * structure.
637 */
int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
		      loff_t start, loff_t end, bool datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;
	int ret = 0;

	/* First write back and wait on the data pages in the range. */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	/* Then flush any dependent metadata buffers. */
	if (mmb)
		ret = mmb_sync(mmb);
	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
		goto out;
	/* datasync only needs the inode written for essential metadata. */
	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (ret == 0)
		ret = err;

out:
	/* check and advance again to catch errors after syncing out buffers */
	err = file_check_and_advance_wb_err(file);
	if (ret == 0)
		ret = err;
	return ret;
}
EXPORT_SYMBOL(mmb_fsync_noflush);
668
669 /**
670 * mmb_fsync - fsync implementation for simple filesystems with metadata
671 * buffers list
672 *
673 * @file: file to synchronize
674 * @mmb: list of metadata bhs to flush
675 * @start: start offset in bytes
676 * @end: end offset in bytes (inclusive)
677 * @datasync: only synchronize essential metadata if true
678 *
679 * This is an implementation of the fsync method for simple filesystems which
680 * track all non-inode metadata in the buffers list hanging off the @mmb
681 * structure. This also makes sure that a device cache flush operation is
682 * called at the end.
683 */
int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb,
	      loff_t start, loff_t end, bool datasync)
{
	struct super_block *sb = file->f_mapping->host->i_sb;
	int ret;

	ret = mmb_fsync_noflush(file, mmb, start, end, datasync);
	if (ret)
		return ret;

	/* Data and metadata are queued to disk; flush the device cache. */
	return blkdev_issue_flush(sb->s_bdev);
}
EXPORT_SYMBOL(mmb_fsync);
696
697 /*
698 * Called when we've recently written block `bblock', and it is known that
699 * `bblock' was for a buffer_boundary() buffer. This means that the block at
700 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
701 * dirty, schedule it for IO. So that indirects merge nicely with their data.
702 */
void write_boundary_block(struct block_device *bdev,
			  sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh;

	bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
	if (!bh)
		return;

	if (buffer_dirty(bh))
		write_dirty_buffer(bh, 0);
	put_bh(bh);
}
715
/*
 * Dirty @bh and, if it is not already tracked, queue it on @mmb's
 * dependent-buffer list so mmb_sync() will write it for fsync.
 */
void mmb_mark_buffer_dirty(struct buffer_head *bh,
			   struct mapping_metadata_bhs *mmb)
{
	mark_buffer_dirty(bh);
	/*
	 * Lockless b_mmb check; mmb_sync() re-tests the dirty bit after
	 * an smp_mb() to close the race (see the comments there).
	 */
	if (!bh->b_mmb) {
		spin_lock(&mmb->lock);
		list_move_tail(&bh->b_assoc_buffers, &mmb->list);
		bh->b_mmb = mmb;
		spin_unlock(&mmb->lock);
	}
}
EXPORT_SYMBOL(mmb_mark_buffer_dirty);
728
729 /**
730 * block_dirty_folio - Mark a folio as dirty.
731 * @mapping: The address space containing this folio.
732 * @folio: The folio to mark dirty.
733 *
734 * Filesystems which use buffer_heads can use this function as their
735 * ->dirty_folio implementation. Some filesystems need to do a little
736 * work before calling this function. Filesystems which do not use
737 * buffer_heads should call filemap_dirty_folio() instead.
738 *
739 * If the folio has buffers, the uptodate buffers are set dirty, to
740 * preserve dirty-state coherency between the folio and the buffers.
741 * Buffers added to a dirty folio are created dirty.
742 *
743 * The buffers are dirtied before the folio is dirtied. There's a small
744 * race window in which writeback may see the folio cleanness but not the
745 * buffer dirtiness. That's fine. If this code were to set the folio
746 * dirty before the buffers, writeback could clear the folio dirty flag,
747 * see a bunch of clean buffers and we'd end up with dirty buffers/clean
748 * folio on the dirty folio list.
749 *
750 * We use i_private_lock to lock against try_to_free_buffers() while
751 * using the folio's buffer list. This also prevents clean buffers
752 * being added to the folio after it was set dirty.
753 *
754 * Context: May only be called from process context. Does not sleep.
755 * Caller must ensure that @folio cannot be truncated during this call,
756 * typically by holding the folio lock or having a page in the folio
757 * mapped and holding the page table lock.
758 *
759 * Return: True if the folio was dirtied; false if it was already dirtied.
760 */
block_dirty_folio(struct address_space * mapping,struct folio * folio)761 bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
762 {
763 struct buffer_head *head;
764 bool newly_dirty;
765
766 spin_lock(&mapping->i_private_lock);
767 head = folio_buffers(folio);
768 if (head) {
769 struct buffer_head *bh = head;
770
771 do {
772 set_buffer_dirty(bh);
773 bh = bh->b_this_page;
774 } while (bh != head);
775 }
776 /*
777 * Lock out page's memcg migration to keep PageDirty
778 * synchronized with per-memcg dirty page counters.
779 */
780 newly_dirty = !folio_test_set_dirty(folio);
781 spin_unlock(&mapping->i_private_lock);
782
783 if (newly_dirty)
784 __folio_mark_dirty(folio, mapping, 1);
785
786 if (newly_dirty)
787 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
788
789 return newly_dirty;
790 }
791 EXPORT_SYMBOL(block_dirty_folio);
792
793 /*
794 * Invalidate any and all dirty buffers on a given buffers list. We are
795 * probably unmounting the fs, but that doesn't mean we have already
796 * done a sync(). Just drop the buffers from the inode list.
797 */
mmb_invalidate(struct mapping_metadata_bhs * mmb)798 void mmb_invalidate(struct mapping_metadata_bhs *mmb)
799 {
800 if (mmb_has_buffers(mmb)) {
801 spin_lock(&mmb->lock);
802 while (!list_empty(&mmb->list))
803 __remove_assoc_queue(mmb, BH_ENTRY(mmb->list.next));
804 spin_unlock(&mmb->lock);
805 }
806 }
807 EXPORT_SYMBOL(mmb_invalidate);
808
809 /*
810 * Create the appropriate buffers when given a folio for data area and
811 * the size of each buffer.. Use the bh->b_this_page linked list to
812 * follow the buffers created. Return NULL if unable to create more
813 * buffers.
814 *
815 * The retry flag is used to differentiate async IO (paging, swapping)
816 * which may not fail from ordinary buffer allocations.
817 */
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
					gfp_t gfp)
{
	struct mem_cgroup *memcg, *old_memcg;
	struct buffer_head *head = NULL;
	struct buffer_head *bh;
	long offset;

	/* The folio lock pins the memcg */
	memcg = folio_memcg(folio);
	old_memcg = set_active_memcg(memcg);

	/* Build the chain back-to-front so @head ends up at offset 0. */
	for (offset = folio_size(folio) - size; offset >= 0; offset -= size) {
		bh = alloc_buffer_head(gfp);
		if (!bh)
			goto no_grow;

		bh->b_this_page = head;
		bh->b_blocknr = -1;
		bh->b_size = size;
		head = bh;

		/* Link the buffer to its folio */
		folio_set_bh(bh, folio, offset);
	}
out:
	set_active_memcg(old_memcg);
	return head;

no_grow:
	/* Allocation failed part-way: free everything built so far. */
	while (head) {
		bh = head;
		head = head->b_this_page;
		free_buffer_head(bh);
	}
	goto out;
}
EXPORT_SYMBOL_GPL(folio_alloc_buffers);
863
alloc_page_buffers(struct page * page,unsigned long size)864 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
865 {
866 gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
867
868 return folio_alloc_buffers(page_folio(page), size, gfp);
869 }
870 EXPORT_SYMBOL_GPL(alloc_page_buffers);
871
link_dev_buffers(struct folio * folio,struct buffer_head * head)872 static inline void link_dev_buffers(struct folio *folio,
873 struct buffer_head *head)
874 {
875 struct buffer_head *bh, *tail;
876
877 bh = head;
878 do {
879 tail = bh;
880 bh = bh->b_this_page;
881 } while (bh);
882 tail->b_this_page = head;
883 folio_attach_private(folio, head);
884 }
885
blkdev_max_block(struct block_device * bdev,unsigned int size)886 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
887 {
888 sector_t retval = ~((sector_t)0);
889 loff_t sz = bdev_nr_bytes(bdev);
890
891 if (sz) {
892 unsigned int sizebits = blksize_bits(size);
893 retval = (sz >> sizebits);
894 }
895 return retval;
896 }
897
898 /*
899 * Initialise the state of a blockdev folio's buffers.
900 */
static sector_t folio_init_buffers(struct folio *folio,
		struct block_device *bdev, unsigned size)
{
	struct buffer_head *head = folio_buffers(folio);
	struct buffer_head *bh = head;
	bool uptodate = folio_test_uptodate(folio);
	/* First block number covered by this folio. */
	sector_t block = div_u64(folio_pos(folio), size);
	sector_t end_block = blkdev_max_block(bdev, size);

	do {
		if (!buffer_mapped(bh)) {
			bh->b_end_io = NULL;
			bh->b_private = NULL;
			bh->b_bdev = bdev;
			bh->b_blocknr = block;
			if (uptodate)
				set_buffer_uptodate(bh);
			/* Only blocks within the device can be mapped. */
			if (block < end_block)
				set_buffer_mapped(bh);
		}
		block++;
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * Caller needs to validate requested block against end of device.
	 */
	return end_block;
}
930
931 /*
932 * Create the page-cache folio that contains the requested block.
933 *
934 * This is used purely for blockdev mappings.
935 *
936 * Returns false if we have a failure which cannot be cured by retrying
937 * without sleeping. Returns true if we succeeded, or the caller should retry.
938 */
static bool grow_dev_folio(struct block_device *bdev, sector_t block,
		pgoff_t index, unsigned size, gfp_t gfp)
{
	struct address_space *mapping = bdev->bd_mapping;
	struct folio *folio;
	struct buffer_head *bh;
	sector_t end_block = 0;

	folio = __filemap_get_folio(mapping, index,
			FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
	if (IS_ERR(folio))
		return false;

	bh = folio_buffers(folio);
	if (bh) {
		if (bh->b_size == size) {
			/* Buffers of the right size are already attached. */
			end_block = folio_init_buffers(folio, bdev, size);
			goto unlock;
		}

		/*
		 * Retrying may succeed; for example the folio may finish
		 * writeback, or buffers may be cleaned. This should not
		 * happen very often; maybe we have old buffers attached to
		 * this blockdev's page cache and we're trying to change
		 * the block size?
		 */
		if (!try_to_free_buffers(folio)) {
			end_block = ~0ULL;	/* make caller retry */
			goto unlock;
		}
	}

	bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
	if (!bh)
		goto unlock;

	/*
	 * Link the folio to the buffers and initialise them. Take the
	 * lock to be atomic wrt __find_get_block(), which does not
	 * run under the folio lock.
	 */
	spin_lock(&mapping->i_private_lock);
	link_dev_buffers(folio, bh);
	end_block = folio_init_buffers(folio, bdev, size);
	spin_unlock(&mapping->i_private_lock);
unlock:
	folio_unlock(folio);
	folio_put(folio);
	/* True iff @block is inside the initialised range (or retry forced). */
	return block < end_block;
}
990
991 /*
992 * Create buffers for the specified block device block's folio. If
993 * that folio was dirty, the buffers are set dirty also. Returns false
994 * if we've hit a permanent error.
995 */
grow_buffers(struct block_device * bdev,sector_t block,unsigned size,gfp_t gfp)996 static bool grow_buffers(struct block_device *bdev, sector_t block,
997 unsigned size, gfp_t gfp)
998 {
999 loff_t pos;
1000
1001 /*
1002 * Check for a block which lies outside our maximum possible
1003 * pagecache index.
1004 */
1005 if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
1006 printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
1007 __func__, (unsigned long long)block,
1008 bdev);
1009 return false;
1010 }
1011
1012 /* Create a folio with the proper size buffers */
1013 return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
1014 }
1015
1016 static struct buffer_head *
__getblk_slow(struct block_device * bdev,sector_t block,unsigned size,gfp_t gfp)1017 __getblk_slow(struct block_device *bdev, sector_t block,
1018 unsigned size, gfp_t gfp)
1019 {
1020 bool blocking = gfpflags_allow_blocking(gfp);
1021
1022 if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
1023 printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
1024 size, bdev_logical_block_size(bdev));
1025 return NULL;
1026 }
1027
1028 for (;;) {
1029 struct buffer_head *bh;
1030
1031 if (!grow_buffers(bdev, block, size, gfp))
1032 return NULL;
1033
1034 if (blocking)
1035 bh = __find_get_block_nonatomic(bdev, block, size);
1036 else
1037 bh = __find_get_block(bdev, block, size);
1038 if (bh)
1039 return bh;
1040 }
1041 }
1042
1043 /*
1044 * The relationship between dirty buffers and dirty pages:
1045 *
1046 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1047 * the page is tagged dirty in the page cache.
1048 *
1049 * At all times, the dirtiness of the buffers represents the dirtiness of
1050 * subsections of the page. If the page has buffers, the page dirty bit is
1051 * merely a hint about the true dirty state.
1052 *
1053 * When a page is set dirty in its entirety, all its buffers are marked dirty
1054 * (if the page has buffers).
1055 *
1056 * When a buffer is marked dirty, its page is dirtied, but the page's other
1057 * buffers are not.
1058 *
1059 * Also. When blockdev buffers are explicitly read with bread(), they
1060 * individually become uptodate. But their backing page remains not
1061 * uptodate - even if all of its buffers are uptodate. A subsequent
1062 * block_read_full_folio() against that folio will discover all the uptodate
1063 * buffers, will set the folio uptodate and will perform no I/O.
1064 */
1065
1066 /**
1067 * mark_buffer_dirty - mark a buffer_head as needing writeout
1068 * @bh: the buffer_head to mark dirty
1069 *
1070 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
1071 * its backing page dirty, then tag the page as dirty in the page cache
1072 * and then attach the address_space's inode to its superblock's dirty
1073 * inode list.
1074 *
1075 * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock,
1076 * i_pages lock and mapping->host->i_lock.
1077 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	WARN_ON_ONCE(!buffer_uptodate(bh));

	trace_block_dirty_buffer(bh);

	/*
	 * Very *carefully* optimize the it-is-already-dirty case.
	 *
	 * Don't let the final "is it dirty" escape to before we
	 * perhaps modified the buffer.
	 */
	if (buffer_dirty(bh)) {
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	if (!test_set_buffer_dirty(bh)) {
		struct folio *folio = bh->b_folio;
		struct address_space *mapping = NULL;

		/*
		 * Only the task that transitions the folio clean->dirty
		 * pays for folio-level dirtying and inode dirtying below.
		 */
		if (!folio_test_set_dirty(folio)) {
			mapping = folio->mapping;
			if (mapping)
				__folio_mark_dirty(folio, mapping, 0);
		}
		if (mapping)
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty);
1110
mark_buffer_write_io_error(struct buffer_head * bh)1111 void mark_buffer_write_io_error(struct buffer_head *bh)
1112 {
1113 set_buffer_write_io_error(bh);
1114 /* FIXME: do we need to set this in both places? */
1115 if (bh->b_folio && bh->b_folio->mapping)
1116 mapping_set_error(bh->b_folio->mapping, -EIO);
1117 if (bh->b_mmb)
1118 mapping_set_error(bh->b_mmb->mapping, -EIO);
1119 }
1120 EXPORT_SYMBOL(mark_buffer_write_io_error);
1121
1122 /**
1123 * __brelse - Release a buffer.
1124 * @bh: The buffer to release.
1125 *
1126 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
1127 */
__brelse(struct buffer_head * bh)1128 void __brelse(struct buffer_head *bh)
1129 {
1130 if (atomic_read(&bh->b_count)) {
1131 put_bh(bh);
1132 return;
1133 }
1134 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1135 }
1136 EXPORT_SYMBOL(__brelse);
1137
1138 /**
1139 * __bforget - Discard any dirty data in a buffer.
1140 * @bh: The buffer to forget.
1141 *
1142 * This variant of bforget() can be called if @bh is guaranteed to not
1143 * be NULL.
1144 */
void __bforget(struct buffer_head *bh)
{
	/* Throw away dirty data so it is never written back. */
	clear_buffer_dirty(bh);
	/* Detach from the associated-buffers queue (helper defined
	 * earlier in this file) before dropping our reference. */
	remove_assoc_queue(bh);
	__brelse(bh);
}
EXPORT_SYMBOL(__bforget);
1152
__bread_slow(struct buffer_head * bh)1153 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1154 {
1155 lock_buffer(bh);
1156 if (buffer_uptodate(bh)) {
1157 unlock_buffer(bh);
1158 return bh;
1159 } else {
1160 get_bh(bh);
1161 bh->b_end_io = end_buffer_read_sync;
1162 submit_bh(REQ_OP_READ, bh);
1163 wait_on_buffer(bh);
1164 if (buffer_uptodate(bh))
1165 return bh;
1166 }
1167 brelse(bh);
1168 return NULL;
1169 }
1170
1171 /*
1172 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1173 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1174 * refcount elevated by one when they're in an LRU. A buffer can only appear
1175 * once in a particular CPU's LRU. A single buffer can be present in multiple
1176 * CPU's LRUs at the same time.
1177 *
1178 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1179 * sb_find_get_block().
1180 *
1181 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1182 * a local interrupt disable for that.
1183 */
1184
1185 #define BH_LRU_SIZE 16
1186
1187 struct bh_lru {
1188 struct buffer_head *bhs[BH_LRU_SIZE];
1189 };
1190
1191 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1192
1193 #ifdef CONFIG_SMP
1194 #define bh_lru_lock() local_irq_disable()
1195 #define bh_lru_unlock() local_irq_enable()
1196 #else
1197 #define bh_lru_lock() preempt_disable()
1198 #define bh_lru_unlock() preempt_enable()
1199 #endif
1200
/*
 * Sanity check for the bh LRU paths: they must not be entered with
 * interrupts disabled (bh_lru_lock() itself disables them on SMP).
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
1207
1208 /*
1209 * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1210 * inserted at the front, and the buffer_head at the back if any is evicted.
1211 * Or, if already in the LRU it is moved to the front.
1212 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = bh;
	struct bh_lru *b;
	int i;

	check_irqs_on();
	bh_lru_lock();

	/*
	 * the refcount of buffer_head in bh_lru prevents dropping the
	 * attached page(i.e., try_to_free_buffers) so it could cause
	 * failing page migration.
	 * Skip putting upcoming bh into bh_lru until migration is done.
	 */
	if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
		bh_lru_unlock();
		return;
	}

	b = this_cpu_ptr(&bh_lrus);
	/*
	 * Slide @bh in at slot 0: each swap() pushes one entry toward the
	 * tail.  If @bh was already present we meet our own pointer again
	 * and stop; otherwise 'evictee' ends up holding whatever fell off
	 * the back of the array.
	 */
	for (i = 0; i < BH_LRU_SIZE; i++) {
		swap(evictee, b->bhs[i]);
		if (evictee == bh) {
			bh_lru_unlock();
			return;
		}
	}

	/* New entry: take the LRU's reference, drop the evicted one. */
	get_bh(bh);
	bh_lru_unlock();
	brelse(evictee);
}
1246
1247 /*
1248 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1249 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *ret = NULL;
	unsigned int i;

	check_irqs_on();
	bh_lru_lock();
	/* Isolated CPUs don't keep a bh LRU; fall through to slow path. */
	if (cpu_is_isolated(smp_processor_id())) {
		bh_lru_unlock();
		return NULL;
	}
	for (i = 0; i < BH_LRU_SIZE; i++) {
		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

		/* Match on (bdev, block, size) triple. */
		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
		    bh->b_size == size) {
			if (i) {
				/*
				 * Shift entries [0, i) down one slot and
				 * put the hit at the head (MRU position).
				 */
				while (i) {
					__this_cpu_write(bh_lrus.bhs[i],
						__this_cpu_read(bh_lrus.bhs[i - 1]));
					i--;
				}
				__this_cpu_write(bh_lrus.bhs[0], bh);
			}
			/* Reference for the caller, on top of the LRU's. */
			get_bh(bh);
			ret = bh;
			break;
		}
	}
	bh_lru_unlock();
	return ret;
}
1283
1284 /*
1285 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1286 * it in the LRU and mark it as accessed. If it is not present then return
1287 * NULL. Atomic context callers may also return NULL if the buffer is being
1288 * migrated; similarly the page is not marked accessed either.
1289 */
1290 static struct buffer_head *
find_get_block_common(struct block_device * bdev,sector_t block,unsigned size,bool atomic)1291 find_get_block_common(struct block_device *bdev, sector_t block,
1292 unsigned size, bool atomic)
1293 {
1294 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1295
1296 if (bh == NULL) {
1297 /* __find_get_block_slow will mark the page accessed */
1298 bh = __find_get_block_slow(bdev, block, atomic);
1299 if (bh)
1300 bh_lru_install(bh);
1301 } else
1302 touch_buffer(bh);
1303
1304 return bh;
1305 }
1306
/*
 * Atomic-context buffer lookup: never sleeps, and may return NULL for
 * a buffer that is merely being migrated (see find_get_block_common).
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
	return find_get_block_common(bdev, block, size, true);
}
EXPORT_SYMBOL(__find_get_block);
1313
1314 /* same as __find_get_block() but allows sleeping contexts */
/* same as __find_get_block() but allows sleeping contexts */
struct buffer_head *
__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
		unsigned size)
{
	return find_get_block_common(bdev, block, size, false);
}
EXPORT_SYMBOL(__find_get_block_nonatomic);
1322
1323 /**
1324 * bdev_getblk - Get a buffer_head in a block device's buffer cache.
1325 * @bdev: The block device.
1326 * @block: The block number.
1327 * @size: The size of buffer_heads for this @bdev.
1328 * @gfp: The memory allocation flags to use.
1329 *
1330 * The returned buffer head has its reference count incremented, but is
1331 * not locked. The caller should call brelse() when it has finished
1332 * with the buffer. The buffer may not be uptodate. If needed, the
1333 * caller can bring it uptodate either by reading it or overwriting it.
1334 *
1335 * Return: The buffer head, or NULL if memory could not be allocated.
1336 */
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
		unsigned size, gfp_t gfp)
{
	struct buffer_head *bh;

	/* Fast path: the buffer may already be in the pagecache/LRU. */
	bh = gfpflags_allow_blocking(gfp) ?
		__find_get_block_nonatomic(bdev, block, size) :
		__find_get_block(bdev, block, size);

	might_alloc(gfp);
	if (!bh)
		bh = __getblk_slow(bdev, block, size, gfp);

	return bh;
}
EXPORT_SYMBOL(bdev_getblk);
1354
1355 /*
1356 * Do async read-ahead on a buffer..
1357 */
/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh;

	/* Best-effort: NOWAIT allocation, silently give up on failure. */
	bh = bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE);
	if (unlikely(!bh))
		return;

	bh_readahead(bh, REQ_RAHEAD);
	brelse(bh);
}
EXPORT_SYMBOL(__breadahead);
1369
1370 /**
1371 * __bread_gfp() - Read a block.
1372 * @bdev: The block device to read from.
1373 * @block: Block number in units of block size.
1374 * @size: The block size of this device in bytes.
1375 * @gfp: Not page allocation flags; see below.
1376 *
1377 * You are not expected to call this function. You should use one of
1378 * sb_bread(), sb_bread_unmovable() or __bread().
1379 *
1380 * Read a specified block, and return the buffer head that refers to it.
1381 * If @gfp is 0, the memory will be allocated using the block device's
1382 * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be
1383 * allocated from a movable area. Do not pass in a complete set of
1384 * GFP flags.
1385 *
1386 * The returned buffer head has its refcount increased. The caller should
1387 * call brelse() when it has finished with the buffer.
1388 *
1389 * Context: May sleep waiting for I/O.
1390 * Return: NULL if the block was unreadable.
1391 */
struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
		   unsigned size, gfp_t gfp)
{
	struct buffer_head *bh;

	/* Constrain caller flags by the bdev mapping's own GFP mask. */
	gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);

	/*
	 * Prefer looping in the allocator rather than here, at least that
	 * code knows what it's doing.
	 */
	gfp |= __GFP_NOFAIL;

	bh = bdev_getblk(bdev, block, size, gfp);
	if (likely(bh) && !buffer_uptodate(bh))
		return __bread_slow(bh);

	return bh;
}
EXPORT_SYMBOL(__bread_gfp);
1412
__invalidate_bh_lrus(struct bh_lru * b)1413 static void __invalidate_bh_lrus(struct bh_lru *b)
1414 {
1415 int i;
1416
1417 for (i = 0; i < BH_LRU_SIZE; i++) {
1418 brelse(b->bhs[i]);
1419 b->bhs[i] = NULL;
1420 }
1421 }
1422 /*
1423 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1424 * This doesn't race because it runs in each cpu either in irq
1425 * or with preempt disabled.
1426 */
/*
 * Per-cpu callback for invalidate_bh_lrus(): empty this CPU's LRU.
 * get_cpu_var()/put_cpu_var() bracket the access with preemption
 * disabled, which is the exclusion the LRU relies on here.
 */
static void invalidate_bh_lru(void *arg)
{
	struct bh_lru *b = &get_cpu_var(bh_lrus);

	__invalidate_bh_lrus(b);
	put_cpu_var(bh_lrus);
}
1434
has_bh_in_lru(int cpu,void * dummy)1435 bool has_bh_in_lru(int cpu, void *dummy)
1436 {
1437 struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1438 int i;
1439
1440 for (i = 0; i < BH_LRU_SIZE; i++) {
1441 if (b->bhs[i])
1442 return true;
1443 }
1444
1445 return false;
1446 }
1447
/* Empty the bh LRU on every CPU that has at least one entry. */
void invalidate_bh_lrus(void)
{
	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1453
1454 /*
1455 * It's called from workqueue context so we need a bh_lru_lock to close
1456 * the race with preemption/irq.
1457 */
invalidate_bh_lrus_cpu(void)1458 void invalidate_bh_lrus_cpu(void)
1459 {
1460 struct bh_lru *b;
1461
1462 bh_lru_lock();
1463 b = this_cpu_ptr(&bh_lrus);
1464 __invalidate_bh_lrus(b);
1465 bh_lru_unlock();
1466 }
1467
folio_set_bh(struct buffer_head * bh,struct folio * folio,unsigned long offset)1468 void folio_set_bh(struct buffer_head *bh, struct folio *folio,
1469 unsigned long offset)
1470 {
1471 bh->b_folio = folio;
1472 BUG_ON(offset >= folio_size(folio));
1473 if (folio_test_highmem(folio))
1474 /*
1475 * This catches illegal uses and preserves the offset:
1476 */
1477 bh->b_data = (char *)(0 + offset);
1478 else
1479 bh->b_data = folio_address(folio) + offset;
1480 }
1481 EXPORT_SYMBOL(folio_set_bh);
1482
1483 /*
1484 * Called when truncating a buffer on a page completely.
1485 */
1486
1487 /* Bits that are cleared during an invalidate */
1488 #define BUFFER_FLAGS_DISCARD \
1489 (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1490 1 << BH_Delay | 1 << BH_Unwritten)
1491
static void discard_buffer(struct buffer_head * bh)
{
	unsigned long b_state;

	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	/*
	 * Atomically clear the BUFFER_FLAGS_DISCARD bits while preserving
	 * any other state bits that may change concurrently; the cmpxchg
	 * loop retries until no intervening update slips in.
	 */
	b_state = READ_ONCE(bh->b_state);
	do {
	} while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
				      b_state & ~BUFFER_FLAGS_DISCARD));
	unlock_buffer(bh);
}
1505
1506 /**
1507 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1508 * @folio: The folio which is affected.
1509 * @offset: start of the range to invalidate
1510 * @length: length of the range to invalidate
1511 *
1512 * block_invalidate_folio() is called when all or part of the folio has been
1513 * invalidated by a truncate operation.
1514 *
1515 * block_invalidate_folio() does not have to release all buffers, but it must
1516 * ensure that no dirty buffer is left outside @offset and that no I/O
1517 * is underway against any of the blocks which are outside the truncation
1518 * point. Because the caller is about to free (and possibly reuse) those
1519 * blocks on-disk.
1520 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
	struct buffer_head *head, *bh, *next;
	size_t curr_off = 0;
	size_t stop = length + offset;

	BUG_ON(!folio_test_locked(folio));

	/*
	 * Check for overflow
	 */
	BUG_ON(stop > folio_size(folio) || stop < length);

	head = folio_buffers(folio);
	if (!head)
		return;

	/* Walk the circular buffer list, discarding every buffer that
	 * lies entirely within [offset, stop). */
	bh = head;
	do {
		size_t next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * Are we still fully in range ?
		 */
		if (next_off > stop)
			goto out;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off)
			discard_buffer(bh);
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire folio is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (length == folio_size(folio))
		filemap_release_folio(folio, 0);
out:
	folio_clear_mappedtodisk(folio);
}
EXPORT_SYMBOL(block_invalidate_folio);
1569
1570 /*
1571 * We attach and possibly dirty the buffers atomically wrt
1572 * block_dirty_folio() via i_private_lock. try_to_free_buffers
1573 * is already excluded via the folio lock.
1574 */
struct buffer_head *create_empty_buffers(struct folio *folio,
		unsigned long blocksize, unsigned long b_state)
{
	struct buffer_head *bh, *head, *tail;
	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;

	/* __GFP_NOFAIL: allocation cannot fail, no NULL check needed. */
	head = folio_alloc_buffers(folio, blocksize, gfp);
	bh = head;
	do {
		bh->b_state |= b_state;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	/* Close the singly-linked list into the usual circular ring. */
	tail->b_this_page = head;

	/* i_private_lock keeps us atomic wrt block_dirty_folio() (see the
	 * comment above this function). */
	spin_lock(&folio->mapping->i_private_lock);
	if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
		/* Propagate folio-level state down to each buffer. */
		bh = head;
		do {
			if (folio_test_dirty(folio))
				set_buffer_dirty(bh);
			if (folio_test_uptodate(folio))
				set_buffer_uptodate(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	folio_attach_private(folio, head);
	spin_unlock(&folio->mapping->i_private_lock);

	return head;
}
EXPORT_SYMBOL(create_empty_buffers);
1607
1608 /**
1609 * clean_bdev_aliases: clean a range of buffers in block device
1610 * @bdev: Block device to clean buffers in
1611 * @block: Start of a range of blocks to clean
1612 * @len: Number of blocks to clean
1613 *
1614 * We are taking a range of blocks for data and we don't want writeback of any
1615 * buffer-cache aliases starting from return from this function and until the
1616 * moment when something will explicitly mark the buffer dirty (hopefully that
1617 * will not happen until we will free that block ;-) We don't even need to mark
1618 * it not-uptodate - nobody can expect anything from a newly allocated buffer
1619 * anyway. We used to use unmap_buffer() for such invalidation, but that was
1620 * wrong. We definitely don't want to mark the alias unmapped, for example - it
1621 * would confuse anyone who might pick it with bread() afterwards...
1622 *
1623 * Also.. Note that bforget() doesn't lock the buffer. So there can be
1624 * writeout I/O going on against recently-freed buffers. We don't wait on that
1625 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
1626 * need to. That happens here.
1627 */
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
	struct address_space *bd_mapping = bdev->bd_mapping;
	const int blkbits = bd_mapping->host->i_blkbits;
	struct folio_batch fbatch;
	/* First pagecache index that can contain @block. */
	pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
	pgoff_t end;
	int i, count;
	struct buffer_head *bh;
	struct buffer_head *head;

	/* Last pagecache index that can contain a block in the range. */
	end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
	folio_batch_init(&fbatch);
	while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
		count = folio_batch_count(&fbatch);
		for (i = 0; i < count; i++) {
			struct folio *folio = fbatch.folios[i];

			/* Cheap unlocked pre-check before taking the lock. */
			if (!folio_buffers(folio))
				continue;
			/*
			 * We use folio lock instead of bd_mapping->i_private_lock
			 * to pin buffers here since we can afford to sleep and
			 * it scales better than a global spinlock lock.
			 */
			folio_lock(folio);
			/* Recheck when the folio is locked which pins bhs */
			head = folio_buffers(folio);
			if (!head)
				goto unlock_page;
			bh = head;
			do {
				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
					goto next;
				if (bh->b_blocknr >= block + len)
					break;
				/* In range: drop dirtiness and wait out any
				 * I/O still running against the alias. */
				clear_buffer_dirty(bh);
				wait_on_buffer(bh);
				clear_buffer_req(bh);
next:
				bh = bh->b_this_page;
			} while (bh != head);
unlock_page:
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
		/* End of range already reached? */
		if (index > end || !index)
			break;
	}
}
EXPORT_SYMBOL(clean_bdev_aliases);
1681
folio_create_buffers(struct folio * folio,struct inode * inode,unsigned int b_state)1682 static struct buffer_head *folio_create_buffers(struct folio *folio,
1683 struct inode *inode,
1684 unsigned int b_state)
1685 {
1686 struct buffer_head *bh;
1687
1688 BUG_ON(!folio_test_locked(folio));
1689
1690 bh = folio_buffers(folio);
1691 if (!bh)
1692 bh = create_empty_buffers(folio,
1693 1 << READ_ONCE(inode->i_blkbits), b_state);
1694 return bh;
1695 }
1696
1697 /*
1698 * NOTE! All mapped/uptodate combinations are valid:
1699 *
1700 * Mapped Uptodate Meaning
1701 *
1702 * No No "unknown" - must do get_block()
1703 * No Yes "hole" - zero-filled
1704 * Yes No "allocated" - allocated on disk, not read in
1705 * Yes Yes "valid" - allocated and up-to-date in memory.
1706 *
1707 * "Dirty" is valid only with the last case (mapped+uptodate).
1708 */
1709
1710 /*
1711 * While block_write_full_folio is writing back the dirty buffers under
1712 * the page lock, whoever dirtied the buffers may decide to clean them
1713 * again at any time. We handle that by only looking at the buffer
1714 * state inside lock_buffer().
1715 *
1716 * If block_write_full_folio() is called for regular writeback
1717 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1718 * locked buffer. This only can happen if someone has written the buffer
1719 * directly, with submit_bh(). At the address_space level PageWriteback
1720 * prevents this contention from occurring.
1721 *
1722 * If block_write_full_folio() is called with wbc->sync_mode ==
1723 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1724 * causes the writes to be flagged as synchronous writes.
1725 */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
			get_block_t *get_block, struct writeback_control *wbc)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	size_t blocksize;
	int nr_underway = 0;
	blk_opf_t write_flags = wbc_to_write_flags(wbc);

	head = folio_create_buffers(folio, inode,
				    (1 << BH_Dirty) | (1 << BH_Uptodate));

	/*
	 * Be very careful.  We have no exclusion from block_dirty_folio
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the folio stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by block_dirty_folio;
	 * handle that here by just cleaning them.
	 */

	bh = head;
	blocksize = bh->b_size;

	block = div_u64(folio_pos(folio), blocksize);
	last_block = div_u64(i_size_read(inode) - 1, blocksize);

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this folio can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_folio()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			/* '1' => ask get_block to allocate a block. */
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				clean_bdev_bh_alias(bh);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/* Second pass: lock each mapped buffer and flag it for async
	 * write-out, or skip/redirty as the sync mode requires. */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the folio.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			folio_redirty_for_writepage(wbc, folio);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh,
				end_buffer_async_write);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The folio and its buffers are protected by the writeback flag,
	 * so we can drop the bh refcounts early.
	 */
	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);

	/* Third pass: submit the I/O for every buffer flagged above. */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
				      inode->i_write_hint, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The folio was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * write_dirty_buffer/submit_bh.  A rare case.
		 */
		folio_end_writeback(folio);

		/*
		 * The folio and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The folio is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh,
				end_buffer_async_write);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty folio.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	BUG_ON(folio_test_writeback(folio));
	mapping_set_error(folio->mapping, err);
	folio_start_writeback(folio);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
				      inode->i_write_hint, wbc);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);
	goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);
1888
1889 /*
1890 * If a folio has any new buffers, zero them out here, and mark them uptodate
1891 * and dirty so they'll be written out (in order to prevent uninitialised
1892 * block data from leaking). And clear the new bit.
1893 */
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
{
	size_t block_start, block_end;
	struct buffer_head *head, *bh;

	BUG_ON(!folio_test_locked(folio));
	head = folio_buffers(folio);
	if (!head)
		return;

	bh = head;
	block_start = 0;
	do {
		block_end = block_start + bh->b_size;

		if (buffer_new(bh)) {
			/* Only buffers overlapping [from, to) are handled. */
			if (block_end > from && block_start < to) {
				if (!folio_test_uptodate(folio)) {
					size_t start, xend;

					/* Zero the overlap of this buffer
					 * with the [from, to) range. */
					start = max(from, block_start);
					xend = min(to, block_end);

					folio_zero_segment(folio, start, xend);
					set_buffer_uptodate(bh);
				}

				clear_buffer_new(bh);
				mark_buffer_dirty(bh);
			}
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
}
EXPORT_SYMBOL(folio_zero_new_buffers);
1931
/*
 * Translate the iomap mapping that covers @block into buffer_head state
 * on @bh.  Returns 0 on success, -EIO when the mapping does not cover
 * the block or the iomap type is unrecognised.
 */
static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
		const struct iomap *iomap)
{
	/* Byte offset in the file of the block being mapped. */
	loff_t offset = (loff_t)block << inode->i_blkbits;

	bh->b_bdev = iomap->bdev;

	/*
	 * Block points to offset in file we need to map, iomap contains
	 * the offset at which the map starts. If the map ends before the
	 * current block, then do not map the buffer and let the caller
	 * handle it.
	 */
	if (offset >= iomap->offset + iomap->length)
		return -EIO;

	switch (iomap->type) {
	case IOMAP_HOLE:
		/*
		 * If the buffer is not up to date or beyond the current EOF,
		 * we need to mark it as new to ensure sub-block zeroing is
		 * executed if necessary.
		 */
		if (!buffer_uptodate(bh) ||
		    (offset >= i_size_read(inode)))
			set_buffer_new(bh);
		return 0;
	case IOMAP_DELALLOC:
		/* Same "new" rule as a hole, but the block is reserved:
		 * mark it uptodate, mapped and delayed-allocated. */
		if (!buffer_uptodate(bh) ||
		    (offset >= i_size_read(inode)))
			set_buffer_new(bh);
		set_buffer_uptodate(bh);
		set_buffer_mapped(bh);
		set_buffer_delay(bh);
		return 0;
	case IOMAP_UNWRITTEN:
		/*
		 * For unwritten regions, we always need to ensure that regions
		 * in the block we are not writing to are zeroed. Mark the
		 * buffer as new to ensure this.
		 */
		set_buffer_new(bh);
		set_buffer_unwritten(bh);
		fallthrough;
	case IOMAP_MAPPED:
		if ((iomap->flags & IOMAP_F_NEW) ||
		    offset >= i_size_read(inode)) {
			/*
			 * This can happen if truncating the block device races
			 * with the check in the caller as i_size updates on
			 * block devices aren't synchronized by i_rwsem for
			 * block devices.
			 */
			if (S_ISBLK(inode->i_mode))
				return -EIO;
			set_buffer_new(bh);
		}
		/* Convert the byte delta into a disk block number. */
		bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
				inode->i_blkbits;
		set_buffer_mapped(bh);
		return 0;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}
1999
/*
 * Prepare the buffers of @folio for a write of @len bytes at @pos.
 *
 * Each buffer touching the byte range is mapped (allocating blocks when
 * needed) via @get_block or, when that is NULL, via @iomap.  Parts of
 * freshly allocated blocks outside the write are zeroed, and buffers that
 * will only be partially overwritten are read in first so the untouched
 * bytes are preserved.
 *
 * Returns 0 on success or a negative errno.  On failure, buffers created
 * for the write are zeroed so no uninitialised block content is exposed.
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
		get_block_t *get_block, const struct iomap *iomap)
{
	size_t from = offset_in_folio(folio, pos);
	size_t to = from + len;
	struct inode *inode = folio->mapping->host;
	size_t block_start, block_end;
	sector_t block;
	int err = 0;
	size_t blocksize;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(to > folio_size(folio));
	BUG_ON(from > to);

	head = folio_create_buffers(folio, inode, 0);
	blocksize = head->b_size;
	block = div_u64(folio_pos(folio), blocksize);

	/* Walk the circular buffer list once; terminate when back at head. */
	for (bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			/* Buffer entirely outside the write range. */
			if (folio_test_uptodate(folio)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			if (get_block)
				err = get_block(inode, block, bh, 1);
			else
				err = iomap_to_bh(inode, block, bh, iomap);
			if (err)
				break;

			if (buffer_new(bh)) {
				/* Invalidate stale bdev-pagecache aliases of the new block. */
				clean_bdev_bh_alias(bh);
				if (folio_test_uptodate(folio)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/* Zero the parts of the new block outside the write. */
				if (block_end > to || block_start < from)
					folio_zero_segments(folio,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (folio_test_uptodate(folio)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		/* Partially overwritten and not uptodate: read it in first. */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			bh_read_nowait(bh, 0);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		folio_zero_new_buffers(folio, from, to);
	return err;
}
2080
/*
 * Wrapper around __block_write_begin_int() for filesystems that map
 * blocks with a get_block callback rather than via iomap.
 */
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	return __block_write_begin_int(folio, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);
2087
/*
 * Commit a (possibly partial) write to [from, to): mark the written
 * buffers uptodate and dirty, clear any leftover BH_New bits, and mark
 * the whole folio uptodate when every buffer outside the range already
 * was.
 */
void block_commit_write(struct folio *folio, size_t from, size_t to)
{
	struct buffer_head *head = folio_buffers(folio);
	struct buffer_head *bh;
	size_t start, end;
	unsigned int bsize;
	bool all_uptodate = true;

	if (!head)
		return;
	bsize = head->b_size;

	bh = head;
	start = 0;
	do {
		end = start + bsize;
		if (end <= from || start >= to) {
			/* Buffer entirely outside the written range. */
			if (!buffer_uptodate(bh))
				all_uptodate = false;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);

		start = end;
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus read_folio() for
	 * the next read(). Here we 'discover' whether the folio went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (all_uptodate)
		folio_mark_uptodate(folio);
}
EXPORT_SYMBOL(block_commit_write);
2127
2128 /*
2129 * block_write_begin takes care of the basic task of block allocation and
2130 * bringing partial write blocks uptodate first.
2131 *
2132 * The filesystem needs to handle block truncation upon failure.
2133 */
block_write_begin(struct address_space * mapping,loff_t pos,unsigned len,struct folio ** foliop,get_block_t * get_block)2134 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2135 struct folio **foliop, get_block_t *get_block)
2136 {
2137 pgoff_t index = pos >> PAGE_SHIFT;
2138 struct folio *folio;
2139 int status;
2140
2141 folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2142 mapping_gfp_mask(mapping));
2143 if (IS_ERR(folio))
2144 return PTR_ERR(folio);
2145
2146 status = __block_write_begin_int(folio, pos, len, get_block, NULL);
2147 if (unlikely(status)) {
2148 folio_unlock(folio);
2149 folio_put(folio);
2150 folio = NULL;
2151 }
2152
2153 *foliop = folio;
2154 return status;
2155 }
2156 EXPORT_SYMBOL(block_write_begin);
2157
/*
 * Finish the pagecache side of a write: handle a short copy by zeroing
 * any buffers created for the write that were not copied into, then
 * commit the written range via block_commit_write().
 *
 * Returns the number of bytes committed; may be 0 on a short write into
 * a non-uptodate folio, forcing the caller to retry the copy.
 */
int block_write_end(loff_t pos, unsigned len, unsigned copied,
		struct folio *folio)
{
	size_t start = pos - folio_pos(folio);

	if (unlikely(copied < len)) {
		/*
		 * The buffers that were written will now be uptodate, so
		 * we don't have to worry about a read_folio reading them
		 * and overwriting a partial write. However if we have
		 * encountered a short write and only partially written
		 * into a buffer, it will not be marked uptodate, so a
		 * read_folio might come in and destroy our partial write.
		 *
		 * Do the simplest thing, and just treat any short write to a
		 * non uptodate folio as a zero-length write, and force the
		 * caller to redo the whole thing.
		 */
		if (!folio_test_uptodate(folio))
			copied = 0;

		folio_zero_new_buffers(folio, start+copied, start+len);
	}
	flush_dcache_folio(folio);

	/* This could be a short (even 0-length) commit */
	block_commit_write(folio, start, start + copied);

	return copied;
}
EXPORT_SYMBOL(block_write_end);
2189
/*
 * Generic ->write_end for block-based filesystems: commit the copied
 * data, update i_size while the folio is still locked, drop the folio,
 * and mark the inode dirty afterwards if i_size changed.
 *
 * Returns the number of bytes committed.
 */
int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct folio *folio, void *fsdata)
{
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	bool i_size_changed = false;

	copied = block_write_end(pos, len, copied, folio);

	/*
	 * No need to use i_size_read() here, the i_size cannot change under us
	 * because we hold i_rwsem.
	 *
	 * But it's important to update i_size while still holding folio lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		i_size_changed = true;
	}

	folio_unlock(folio);
	folio_put(folio);

	/* The write started past the old EOF; let the pagecache know. */
	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed)
		mark_inode_dirty(inode);
	return copied;
}
EXPORT_SYMBOL(generic_write_end);
2228
2229 /*
2230 * block_is_partially_uptodate checks whether buffers within a folio are
2231 * uptodate or not.
2232 *
2233 * Returns true if all buffers which correspond to the specified part
2234 * of the folio are uptodate.
2235 */
block_is_partially_uptodate(struct folio * folio,size_t from,size_t count)2236 bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2237 {
2238 unsigned block_start, block_end, blocksize;
2239 unsigned to;
2240 struct buffer_head *bh, *head;
2241 bool ret = true;
2242
2243 head = folio_buffers(folio);
2244 if (!head)
2245 return false;
2246 blocksize = head->b_size;
2247 to = min(folio_size(folio) - from, count);
2248 to = from + to;
2249 if (from < blocksize && to > folio_size(folio) - blocksize)
2250 return false;
2251
2252 bh = head;
2253 block_start = 0;
2254 do {
2255 block_end = block_start + blocksize;
2256 if (block_end > from && block_start < to) {
2257 if (!buffer_uptodate(bh)) {
2258 ret = false;
2259 break;
2260 }
2261 if (block_end >= to)
2262 break;
2263 }
2264 block_start = block_end;
2265 bh = bh->b_this_page;
2266 } while (bh != head);
2267
2268 return ret;
2269 }
2270 EXPORT_SYMBOL(block_is_partially_uptodate);
2271
/*
 * Generic "read_folio" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the folio asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * folio once IO has completed.
 */
int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
	struct inode *inode = folio->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *prev = NULL;
	size_t blocksize;
	int fully_mapped = 1;
	bool page_error = false;
	loff_t limit = i_size_read(inode);

	/* This is needed for ext4. */
	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
		limit = inode->i_sb->s_maxbytes;

	head = folio_create_buffers(folio, inode, 0);
	blocksize = head->b_size;

	/* First block of the folio and the first block past @limit. */
	iblock = div_u64(folio_pos(folio), blocksize);
	lblock = div_u64(limit + blocksize - 1, blocksize);
	bh = head;

	do {
		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			if (iblock < lblock) {
				WARN_ON(bh->b_size != blocksize);
				err = get_block(inode, iblock, bh, 0);
				if (err)
					page_error = true;
			}
			if (!buffer_mapped(bh)) {
				/* Hole (or mapping failure): reads as zeroes. */
				folio_zero_range(folio, bh_offset(bh),
						blocksize);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * get_block() might have updated the buffer
			 * synchronously
			 */
			if (buffer_uptodate(bh))
				continue;
		}

		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			unlock_buffer(bh);
			continue;
		}

		/*
		 * Submission of the previous buffer is deferred until the
		 * current one has been marked async_read - NOTE(review):
		 * presumably so the folio's read cannot be completed by I/O
		 * before every buffer needing I/O has been marked; confirm
		 * against end_buffer_async_read().
		 */
		mark_buffer_async_read(bh);
		if (prev)
			submit_bh(REQ_OP_READ, prev);
		prev = bh;
	} while (iblock++, (bh = bh->b_this_page) != head);

	if (fully_mapped)
		folio_set_mappedtodisk(folio);

	/*
	 * All buffers are uptodate or get_block() returned an error
	 * when trying to map them - we must finish the read because
	 * end_buffer_async_read() will never be called on any buffer
	 * in this folio.
	 */
	if (prev)
		submit_bh(REQ_OP_READ, prev);
	else
		folio_end_read(folio, !page_error);

	return 0;
}
EXPORT_SYMBOL(block_read_full_folio);
2358
2359 /* utility function for filesystems that need to do work on expanding
2360 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2361 * deal with the hole.
2362 */
generic_cont_expand_simple(struct inode * inode,loff_t size)2363 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2364 {
2365 struct address_space *mapping = inode->i_mapping;
2366 const struct address_space_operations *aops = mapping->a_ops;
2367 struct folio *folio;
2368 void *fsdata = NULL;
2369 int err;
2370
2371 err = inode_newsize_ok(inode, size);
2372 if (err)
2373 goto out;
2374
2375 err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
2376 if (err)
2377 goto out;
2378
2379 err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
2380 BUG_ON(err > 0);
2381
2382 out:
2383 return err;
2384 }
2385 EXPORT_SYMBOL(generic_cont_expand_simple);
2386
/*
 * Zero @len bytes at @pos through the filesystem's pagecache write path.
 * Returns 0 on success or a negative errno from ->write_begin/->write_end.
 */
static int cont_zero_range(const struct kiocb *iocb,
			   struct address_space *mapping,
			   loff_t pos, unsigned len)
{
	const struct address_space_operations *aops = mapping->a_ops;
	struct folio *folio;
	void *fsdata = NULL;
	int err;

	err = aops->write_begin(iocb, mapping, pos, len, &folio, &fsdata);
	if (err)
		return err;
	folio_zero_range(folio, offset_in_folio(folio, pos), len);
	err = aops->write_end(iocb, mapping, pos, len, len, folio, fsdata);
	if (err < 0)
		return err;
	/* ->write_end returns the number of bytes committed on success. */
	BUG_ON(err != len);
	return 0;
}

/*
 * Zero-fill the pagecache between the current end of data (*bytes) and
 * @pos, so a write at @pos does not leave a hole of stale data on
 * filesystems that cannot represent holes.  *bytes is rounded up to a
 * block boundary where needed.  Returns 0 or a negative errno (-EINTR
 * if a fatal signal arrives mid-way).
 */
static int cont_expand_zero(const struct kiocb *iocb,
			    struct address_space *mapping,
			    loff_t pos, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned int blocksize = i_blocksize(inode);
	pgoff_t index, curidx;
	loff_t curpos;
	unsigned zerofrom, offset, len;
	int err = 0;

	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;

	/* Zero whole pages strictly before the one containing @pos. */
	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
		zerofrom = curpos & ~PAGE_MASK;
		if (zerofrom & (blocksize-1)) {
			/* Round the end of data up to a block boundary. */
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = PAGE_SIZE - zerofrom;

		err = cont_zero_range(iocb, mapping, curpos, len);
		if (err)
			goto out;

		balance_dirty_pages_ratelimited(mapping);

		if (fatal_signal_pending(current)) {
			err = -EINTR;
			goto out;
		}
	}

	/* page covers the boundary, find the boundary offset */
	if (index == curidx) {
		zerofrom = curpos & ~PAGE_MASK;
		/* if we will expand the thing last block will be filled */
		if (offset <= zerofrom)
			goto out;
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = offset - zerofrom;

		err = cont_zero_range(iocb, mapping, curpos, len);
	}
out:
	return err;
}
2460
/*
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
			loff_t pos, unsigned len, struct folio **foliop,
			void **fsdata, get_block_t *get_block, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned int blocksize = i_blocksize(inode);
	unsigned int zerofrom;
	int err;

	/* Zero-fill everything between the current end of data and @pos. */
	err = cont_expand_zero(iocb, mapping, pos, bytes);
	if (err)
		return err;

	/*
	 * If the write extends the file and the current end of data is not
	 * block aligned, round *bytes up to the next block boundary.
	 */
	zerofrom = *bytes & ~PAGE_MASK;
	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
		*bytes |= (blocksize-1);
		(*bytes)++;
	}

	return block_write_begin(mapping, pos, len, foliop, get_block);
}
EXPORT_SYMBOL(cont_write_begin);
2487
/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_rwsem here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF. Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_pagefault() - sb_end_pagefault() functions.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
			 get_block_t get_block)
{
	struct folio *folio = page_folio(vmf->page);
	struct inode *inode = file_inode(vma->vm_file);
	unsigned long end;
	loff_t size;
	int ret;

	folio_lock(folio);
	size = i_size_read(inode);
	if ((folio->mapping != inode->i_mapping) ||
	    (folio_pos(folio) >= size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	end = folio_size(folio);
	/* folio is wholly or partially inside EOF */
	if (folio_pos(folio) + end > size)
		end = size - folio_pos(folio);

	/* Map/allocate the blocks, then mark the in-EOF range dirty. */
	ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
	if (unlikely(ret))
		goto out_unlock;

	block_commit_write(folio, 0, end);

	folio_mark_dirty(folio);
	folio_wait_stable(folio);
	/* Returns with the folio still locked on success. */
	return 0;
out_unlock:
	folio_unlock(folio);
	return ret;
}
EXPORT_SYMBOL(block_page_mkwrite);
2543
/*
 * Zero the tail of the block containing byte offset @from (used when
 * truncating a file to a size that is not block aligned, so a later
 * extension does not expose stale data).  Holes are left untouched; a
 * mapped block whose contents are not cached is read in first.
 *
 * Returns 0 on success or a negative errno.
 */
int block_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned blocksize;
	sector_t iblock;
	size_t offset, length, pos;
	struct inode *inode = mapping->host;
	struct folio *folio;
	struct buffer_head *bh;
	int err = 0;

	blocksize = i_blocksize(inode);
	length = from & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	/* Bytes to zero: from @from to the end of its block. */
	length = blocksize - length;
	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;

	folio = filemap_grab_folio(mapping, index);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	bh = folio_buffers(folio);
	if (!bh)
		bh = create_empty_buffers(folio, blocksize, 0);

	/* Find the buffer that contains "offset" */
	offset = offset_in_folio(folio, from);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	if (!buffer_mapped(bh)) {
		WARN_ON(bh->b_size != blocksize);
		err = get_block(inode, iblock, bh, 0);
		if (err)
			goto unlock;
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (folio_test_uptodate(folio))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
		err = bh_read(bh, 0);
		/* Uhhuh. Read error. Complain and punt. */
		if (err < 0)
			goto unlock;
	}

	folio_zero_range(folio, offset, length);
	mark_buffer_dirty(bh);

unlock:
	folio_unlock(folio);
	folio_put(folio);

	return err;
}
EXPORT_SYMBOL(block_truncate_page);
2614
/*
 * The generic write folio function for buffer-backed address_spaces.
 * Handles the three cases relative to i_size: fully inside (plain
 * writeback), fully outside (skip - truncate in progress), and
 * straddling (zero the tail beyond EOF before writing back).
 */
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
			void *get_block)
{
	struct inode * const inode = folio->mapping->host;
	loff_t i_size = i_size_read(inode);

	/* Is the folio fully inside i_size? */
	if (folio_next_pos(folio) <= i_size)
		return __block_write_full_folio(inode, folio, get_block, wbc);

	/* Is the folio fully outside i_size? (truncate in progress) */
	if (folio_pos(folio) >= i_size) {
		folio_unlock(folio);
		return 0; /* don't care */
	}

	/*
	 * The folio straddles i_size. It must be zeroed out on each and every
	 * writeback invocation because it may be mmapped. "A file is mapped
	 * in multiples of the page size. For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	folio_zero_segment(folio, offset_in_folio(folio, i_size),
			folio_size(folio));
	return __block_write_full_folio(inode, folio, get_block, wbc);
}
2645
/*
 * Generic ->bmap: resolve logical @block to a physical block number via
 * @get_block without allocating.  Unmapped blocks come back as 0 since
 * the on-stack buffer_head starts zeroed.
 */
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
{
	struct buffer_head tmp = {
		.b_size = i_blocksize(mapping->host),
	};

	get_block(mapping->host, block, &tmp, 0);
	return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);
2658
/*
 * Bio completion handler used by submit_bh_wbc(): hand the result to the
 * buffer's b_end_io callback, then release the bio.
 */
static void end_bio_bh_io_sync(struct bio *bio)
{
	struct buffer_head *bh = bio->bi_private;

	/* Carry the bio's "quiet" flag over onto the buffer. */
	if (unlikely(bio_flagged(bio, BIO_QUIET)))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, !bio->bi_status);
	bio_put(bio);
}
2669
/*
 * Attach an fscrypt crypt context to @bio for the inode owning @bh's
 * folio, keyed by the buffer's byte offset in the file.
 */
static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh,
				  gfp_t gfp_mask)
{
	const struct address_space *mapping = folio_mapping(bh->b_folio);

	/*
	 * The ext4 journal (jbd2) can submit a buffer_head it directly created
	 * for a non-pagecache page. fscrypt doesn't care about these.
	 */
	if (!mapping)
		return;
	fscrypt_set_bio_crypt_ctx(bio, mapping->host,
				  folio_pos(bh->b_folio) + bh_offset(bh), gfp_mask);
}
2684
/*
 * Build and submit a single-segment bio for one locked, mapped buffer.
 * Completion runs end_bio_bh_io_sync(), which invokes bh->b_end_io.
 */
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
			  enum rw_hint write_hint,
			  struct writeback_control *wbc)
{
	const enum req_op op = opf & REQ_OP_MASK;
	struct bio *bio;

	/* Caller must hand us a locked, mapped bh with a completion handler. */
	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
		clear_buffer_write_io_error(bh);

	/* Propagate bh priority bits into the request flags. */
	if (buffer_meta(bh))
		opf |= REQ_META;
	if (buffer_prio(bh))
		opf |= REQ_PRIO;

	bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);

	if (IS_ENABLED(CONFIG_FS_ENCRYPTION))
		buffer_set_crypto_ctx(bio, bh, GFP_NOIO);

	/* b_blocknr is in b_size units; bi_sector is in 512-byte sectors. */
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_write_hint = write_hint;

	bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;

	/* Take care of bh's that straddle the end of the device */
	guard_bio_eod(bio);

	if (wbc) {
		wbc_init_bio(wbc, bio);
		wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
	}

	blk_crypto_submit_bio(bio);
}
2732
/* Submit a buffer for I/O with no write hint and no writeback control. */
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
	submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);
2738
/*
 * Start asynchronous writeout of @bh if it is dirty; a clean buffer is
 * left untouched.  The buffer lock is released by I/O completion
 * (end_buffer_write_sync) when a write was started.
 */
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		/* Nothing to write. */
		unlock_buffer(bh);
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(REQ_OP_WRITE | op_flags, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);
2751
2752 /*
2753 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2754 * and then start new I/O and then wait upon it. The caller must have a ref on
2755 * the buffer_head.
2756 */
__sync_dirty_buffer(struct buffer_head * bh,blk_opf_t op_flags)2757 int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2758 {
2759 WARN_ON(atomic_read(&bh->b_count) < 1);
2760 lock_buffer(bh);
2761 if (test_clear_buffer_dirty(bh)) {
2762 /*
2763 * The bh should be mapped, but it might not be if the
2764 * device was hot-removed. Not much we can do but fail the I/O.
2765 */
2766 if (!buffer_mapped(bh)) {
2767 unlock_buffer(bh);
2768 return -EIO;
2769 }
2770
2771 get_bh(bh);
2772 bh->b_end_io = end_buffer_write_sync;
2773 submit_bh(REQ_OP_WRITE | op_flags, bh);
2774 wait_on_buffer(bh);
2775 if (!buffer_uptodate(bh))
2776 return -EIO;
2777 } else {
2778 unlock_buffer(bh);
2779 }
2780 return 0;
2781 }
2782 EXPORT_SYMBOL(__sync_dirty_buffer);
2783
/* Synchronous data-integrity writeout of one buffer with REQ_SYNC. */
int sync_dirty_buffer(struct buffer_head *bh)
{
	return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);
2789
buffer_busy(struct buffer_head * bh)2790 static inline int buffer_busy(struct buffer_head *bh)
2791 {
2792 return atomic_read(&bh->b_count) |
2793 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2794 }
2795
2796 static bool
drop_buffers(struct folio * folio,struct buffer_head ** buffers_to_free)2797 drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
2798 {
2799 struct buffer_head *head = folio_buffers(folio);
2800 struct buffer_head *bh;
2801
2802 bh = head;
2803 do {
2804 if (buffer_busy(bh))
2805 goto failed;
2806 bh = bh->b_this_page;
2807 } while (bh != head);
2808
2809 do {
2810 struct buffer_head *next = bh->b_this_page;
2811
2812 remove_assoc_queue(bh);
2813 bh = next;
2814 } while (bh != head);
2815 *buffers_to_free = head;
2816 folio_detach_private(folio);
2817 return true;
2818 failed:
2819 return false;
2820 }
2821
2822 /**
2823 * try_to_free_buffers - Release buffers attached to this folio.
2824 * @folio: The folio.
2825 *
2826 * If any buffers are in use (dirty, under writeback, elevated refcount),
2827 * no buffers will be freed.
2828 *
2829 * If the folio is dirty but all the buffers are clean then we need to
2830 * be sure to mark the folio clean as well. This is because the folio
2831 * may be against a block device, and a later reattachment of buffers
2832 * to a dirty folio will set *all* buffers dirty. Which would corrupt
2833 * filesystem data on the same device.
2834 *
2835 * The same applies to regular filesystem folios: if all the buffers are
2836 * clean then we set the folio clean and proceed. To do that, we require
2837 * total exclusion from block_dirty_folio(). That is obtained with
2838 * i_private_lock.
2839 *
2840 * Exclusion against try_to_free_buffers may be obtained by either
2841 * locking the folio or by holding its mapping's i_private_lock.
2842 *
2843 * Context: Process context. @folio must be locked. Will not sleep.
2844 * Return: true if all buffers attached to this folio were freed.
2845 */
try_to_free_buffers(struct folio * folio)2846 bool try_to_free_buffers(struct folio *folio)
2847 {
2848 struct address_space * const mapping = folio->mapping;
2849 struct buffer_head *buffers_to_free = NULL;
2850 bool ret = 0;
2851
2852 BUG_ON(!folio_test_locked(folio));
2853 if (folio_test_writeback(folio))
2854 return false;
2855
2856 /* Misconfigured folio check */
2857 if (WARN_ON_ONCE(!folio_buffers(folio)))
2858 return true;
2859
2860 if (mapping == NULL) { /* can this still happen? */
2861 ret = drop_buffers(folio, &buffers_to_free);
2862 goto out;
2863 }
2864
2865 spin_lock(&mapping->i_private_lock);
2866 ret = drop_buffers(folio, &buffers_to_free);
2867
2868 /*
2869 * If the filesystem writes its buffers by hand (eg ext3)
2870 * then we can have clean buffers against a dirty folio. We
2871 * clean the folio here; otherwise the VM will never notice
2872 * that the filesystem did any IO at all.
2873 *
2874 * Also, during truncate, discard_buffer will have marked all
2875 * the folio's buffers clean. We discover that here and clean
2876 * the folio also.
2877 *
2878 * i_private_lock must be held over this entire operation in order
2879 * to synchronise against block_dirty_folio and prevent the
2880 * dirty bit from being lost.
2881 */
2882 if (ret)
2883 folio_cancel_dirty(folio);
2884 spin_unlock(&mapping->i_private_lock);
2885 out:
2886 if (buffers_to_free) {
2887 struct buffer_head *bh = buffers_to_free;
2888
2889 do {
2890 struct buffer_head *next = bh->b_this_page;
2891 free_buffer_head(bh);
2892 bh = next;
2893 } while (bh != buffers_to_free);
2894 }
2895 return ret;
2896 }
2897 EXPORT_SYMBOL(try_to_free_buffers);
2898
/*
 * Buffer-head allocation
 */
static struct kmem_cache *bh_cachep __ro_after_init;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads __ro_after_init;

/* Set by recalc_bh_state() when the summed bh count exceeds max_buffer_heads. */
int buffer_heads_over_limit;

/* Per-CPU accounting of live buffer_heads, summed by recalc_bh_state(). */
struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2918
recalc_bh_state(void)2919 static void recalc_bh_state(void)
2920 {
2921 int i;
2922 int tot = 0;
2923
2924 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
2925 return;
2926 __this_cpu_write(bh_accounting.ratelimit, 0);
2927 for_each_online_cpu(i)
2928 tot += per_cpu(bh_accounting, i).nr;
2929 buffer_heads_over_limit = (tot > max_buffer_heads);
2930 }
2931
/*
 * Allocate a zeroed buffer_head and account it against the per-CPU
 * counters.  Returns NULL on allocation failure.
 */
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *bh = kmem_cache_zalloc(bh_cachep, gfp_flags);

	if (!bh)
		return NULL;

	INIT_LIST_HEAD(&bh->b_assoc_buffers);
	spin_lock_init(&bh->b_uptodate_lock);
	/* Per-CPU counter update must not migrate CPUs mid-way. */
	preempt_disable();
	__this_cpu_inc(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();
	return bh;
}
EXPORT_SYMBOL(alloc_buffer_head);
2946
/*
 * Release a buffer_head from alloc_buffer_head().  The bh must no longer
 * be on any inode's associated-buffer list.
 */
void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	/* Mirror the per-CPU accounting done in alloc_buffer_head(). */
	preempt_disable();
	__this_cpu_dec(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);
2957
buffer_exit_cpu_dead(unsigned int cpu)2958 static int buffer_exit_cpu_dead(unsigned int cpu)
2959 {
2960 int i;
2961 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2962
2963 for (i = 0; i < BH_LRU_SIZE; i++) {
2964 brelse(b->bhs[i]);
2965 b->bhs[i] = NULL;
2966 }
2967 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
2968 per_cpu(bh_accounting, cpu).nr = 0;
2969 return 0;
2970 }
2971
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	if (!buffer_uptodate(bh)) {
		/* Caller must do the read and unlock. */
		return 0;
	}
	/* Raced with a completing read; no need to keep the lock. */
	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
2990
2991 /**
2992 * __bh_read - Submit read for a locked buffer
2993 * @bh: struct buffer_head
2994 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
2995 * @wait: wait until reading finish
2996 *
2997 * Returns zero on success or don't wait, and -EIO on error.
2998 */
__bh_read(struct buffer_head * bh,blk_opf_t op_flags,bool wait)2999 int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3000 {
3001 int ret = 0;
3002
3003 BUG_ON(!buffer_locked(bh));
3004
3005 get_bh(bh);
3006 bh->b_end_io = end_buffer_read_sync;
3007 submit_bh(REQ_OP_READ | op_flags, bh);
3008 if (wait) {
3009 wait_on_buffer(bh);
3010 if (!buffer_uptodate(bh))
3011 ret = -EIO;
3012 }
3013 return ret;
3014 }
3015 EXPORT_SYMBOL(__bh_read);
3016
3017 /**
3018 * __bh_read_batch - Submit read for a batch of unlocked buffers
3019 * @nr: entry number of the buffer batch
3020 * @bhs: a batch of struct buffer_head
3021 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3022 * @force_lock: force to get a lock on the buffer if set, otherwise drops any
3023 * buffer that cannot lock.
3024 *
3025 * Returns zero on success or don't wait, and -EIO on error.
3026 */
__bh_read_batch(int nr,struct buffer_head * bhs[],blk_opf_t op_flags,bool force_lock)3027 void __bh_read_batch(int nr, struct buffer_head *bhs[],
3028 blk_opf_t op_flags, bool force_lock)
3029 {
3030 int i;
3031
3032 for (i = 0; i < nr; i++) {
3033 struct buffer_head *bh = bhs[i];
3034
3035 if (buffer_uptodate(bh))
3036 continue;
3037
3038 if (force_lock)
3039 lock_buffer(bh);
3040 else
3041 if (!trylock_buffer(bh))
3042 continue;
3043
3044 if (buffer_uptodate(bh)) {
3045 unlock_buffer(bh);
3046 continue;
3047 }
3048
3049 bh->b_end_io = end_buffer_read_sync;
3050 get_bh(bh);
3051 submit_bh(REQ_OP_READ | op_flags, bh);
3052 }
3053 }
3054 EXPORT_SYMBOL(__bh_read_batch);
3055
/*
 * One-time boot-time setup: create the buffer_head slab cache, size the
 * global bh limit, and register the CPU-hotplug teardown callback.
 */
void __init buffer_init(void)
{
	unsigned long nrpages;
	int ret;

	bh_cachep = KMEM_CACHE(buffer_head,
				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	/* Reap a dead CPU's bh LRU and accounting in buffer_exit_cpu_dead(). */
	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
					NULL, buffer_exit_cpu_dead);
	WARN_ON(ret < 0);
}
3072