// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/checkpoint.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 */
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/f2fs_fs.h>
#include <linux/pagevec.h>
#include <linux/swap.h>
#include <linux/kthread.h>
#include <linux/delayacct.h>
#include <linux/ioprio.h>
#include <linux/math64.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "iostat.h"
#include <trace/events/f2fs.h>

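/*
 * Lock elapsed-time tracing: snapshot the wall clock and the task's
 * on-CPU, runnable and block-IO delay counters when the lock is taken,
 * then diff them on release.  This lets a long hold time be broken down
 * into CPU time, scheduler wait, IO sleep and "other" (whatever the
 * compiled-in accounting below does not cover).
 */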
static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
{
	ts->total_time = ktime_get();
#ifdef CONFIG_64BIT
	ts->running_time = current->se.sum_exec_runtime;
#endif
#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
	ts->runnable_time = current->sched_info.run_delay;
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
	if (current->delays)
		ts->io_sleep_time = current->delays->blkio_delay;
#endif
}

static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
					struct f2fs_lock_context *lc)
{
	lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
	if (!lc->lock_trace)
		return;

	get_lock_elapsed_time(&lc->ts);
}

static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
				struct f2fs_lock_context *lc, bool is_write)
{
	struct f2fs_time_stat tts;
	unsigned long long total_time;
	unsigned long long running_time = 0;
	unsigned long long runnable_time = 0;
	unsigned long long io_sleep_time = 0;
	unsigned long long other_time = 0;
	unsigned npm = NSEC_PER_MSEC;

	if (!lc->lock_trace)
		return;

	if (time_to_inject(sem->sbi, FAULT_LOCK_TIMEOUT))
		f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true);

	get_lock_elapsed_time(&tts);

	total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
	if (total_time <= sem->sbi->max_lock_elapsed_time)
		return;

#ifdef CONFIG_64BIT
	running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
#endif
#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
	runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
	io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
#endif
	if (total_time > running_time + io_sleep_time + runnable_time)
		other_time = total_time - running_time -
					io_sleep_time - runnable_time;

	trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
			get_current_ioprio(), total_time, running_time,
			runnable_time, io_sleep_time, other_time);
}

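/*
 * Priority uplift: each lock name owns one bit in
 * sbi->adjust_lock_priority.  When that bit is set, selected waiters
 * are temporarily given a better nice value (derived from
 * sbi->lock_duration_priority) so checkpoint-critical paths do not
 * stall behind low-priority tasks.
 */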
static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
{
	if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1)))
		return false;

	switch (sem->name) {
	/*
	 * The writer is the checkpoint path, which already runs at high
	 * priority, so only uplift the priority of readers.
	 */
	case LOCK_NAME_CP_RWSEM:
	case LOCK_NAME_NODE_CHANGE:
	case LOCK_NAME_NODE_WRITE:
		return !is_write;
	case LOCK_NAME_GC_LOCK:
	case LOCK_NAME_CP_GLOBAL:
	case LOCK_NAME_IO_RWSEM:
		return true;
	default:
		f2fs_bug_on(sem->sbi, 1);
	}
	return false;
}

static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
				bool is_write)
{
	lc->need_restore = false;
	if (!sem->sbi->adjust_lock_priority)
		return;
	if (rt_task(current))
		return;
	if (!need_uplift_priority(sem, is_write))
		return;
	lc->orig_nice = task_nice(current);
	lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
	if (lc->orig_nice <= lc->new_nice)
		return;
	set_user_nice(current, lc->new_nice);
	lc->need_restore = true;

	trace_f2fs_priority_uplift(sem->sbi, sem->name, is_write, current,
		NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice));
}

static void restore_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
				bool is_write)
{
	if (!lc->need_restore)
		return;
	/* someone has updated the priority */
	if (task_nice(current) != lc->new_nice)
		return;
	set_user_nice(current, lc->orig_nice);

	trace_f2fs_priority_restore(sem->sbi, sem->name, is_write, current,
		NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice));
}

void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	uplift_priority(sem, lc, false);
	f2fs_down_read(sem);
	trace_lock_elapsed_time_start(sem, lc);
}

int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	uplift_priority(sem, lc, false);
	if (!f2fs_down_read_trylock(sem)) {
		restore_priority(sem, lc, false);
		return 0;
	}
	trace_lock_elapsed_time_start(sem, lc);
	return 1;
}

void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	f2fs_up_read(sem);
	restore_priority(sem, lc, false);
	trace_lock_elapsed_time_end(sem, lc, false);
}

void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	uplift_priority(sem, lc, true);
	f2fs_down_write(sem);
	trace_lock_elapsed_time_start(sem, lc);
}

int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	uplift_priority(sem, lc, true);
	if (!f2fs_down_write_trylock(sem)) {
		restore_priority(sem, lc, true);
		return 0;
	}
	trace_lock_elapsed_time_start(sem, lc);
	return 1;
}

void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	f2fs_up_write(sem);
	restore_priority(sem, lc, true);
	trace_lock_elapsed_time_end(sem, lc, true);
}

void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
{
	f2fs_down_read_trace(&sbi->cp_rwsem, lc);
}

int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
{
	if (time_to_inject(sbi, FAULT_LOCK_OP))
		return 0;

	return f2fs_down_read_trylock_trace(&sbi->cp_rwsem, lc);
}

void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
{
	f2fs_up_read_trace(&sbi->cp_rwsem, lc);
}

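/*
 * Typical caller pattern for the traced wrappers (sketch):
 *
 *	struct f2fs_lock_context lc;
 *
 *	f2fs_lock_op(sbi, &lc);
 *	... operation that must not race with checkpoint ...
 *	f2fs_unlock_op(sbi, &lc);
 *
 * The same context must be passed to the matching unlock so the saved
 * nice value can be restored and the start timestamps can be diffed.
 */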
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
	f2fs_down_write(&sbi->cp_rwsem);
}

static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
	f2fs_up_write(&sbi->cp_rwsem);
}

#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))

static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;

void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
						unsigned char reason)
{
	f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL);
	if (!end_io)
		f2fs_flush_merged_writes(sbi);
	f2fs_handle_critical_error(sbi, reason);
}

/*
 * We guarantee no failure on the returned folio.
 */
struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index)
{
	struct address_space *mapping = META_MAPPING(sbi);
	struct folio *folio;
repeat:
	folio = f2fs_grab_cache_folio(mapping, index, false);
	if (IS_ERR(folio)) {
		cond_resched();
		goto repeat;
	}
	f2fs_folio_wait_writeback(folio, META, true, true);
	if (!folio_test_uptodate(folio))
		folio_mark_uptodate(folio);
	return folio;
}

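/*
 * Read one block through the meta inode's page cache.  @is_meta is
 * false only for blocks fetched during power-off recovery (POR); those
 * reads drop REQ_META and are flagged with fio.is_por instead.
 */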
static struct folio *__get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index,
							bool is_meta)
{
	struct address_space *mapping = META_MAPPING(sbi);
	struct folio *folio;
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.type = META,
		.op = REQ_OP_READ,
		.op_flags = REQ_META | REQ_PRIO,
		.old_blkaddr = index,
		.new_blkaddr = index,
		.encrypted_page = NULL,
		.is_por = !is_meta ? 1 : 0,
	};
	int err;

	if (unlikely(!is_meta))
		fio.op_flags &= ~REQ_META;
repeat:
	folio = f2fs_grab_cache_folio(mapping, index, false);
	if (IS_ERR(folio)) {
		cond_resched();
		goto repeat;
	}
	if (folio_test_uptodate(folio))
		goto out;

	fio.folio = folio;

	err = f2fs_submit_page_bio(&fio);
	if (err) {
		f2fs_folio_put(folio, true);
		return ERR_PTR(err);
	}

	f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE);

	folio_lock(folio);
	if (unlikely(!is_meta_folio(folio))) {
		f2fs_folio_put(folio, true);
		goto repeat;
	}

	if (unlikely(!folio_test_uptodate(folio))) {
		f2fs_handle_page_eio(sbi, folio, META);
		f2fs_folio_put(folio, true);
		return ERR_PTR(-EIO);
	}
out:
	return folio;
}

struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index)
{
	return __get_meta_folio(sbi, index, true);
}

struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index)
{
	struct folio *folio;
	int count = 0;

retry:
	folio = __get_meta_folio(sbi, index, true);
	if (IS_ERR(folio)) {
		if (PTR_ERR(folio) == -EIO &&
				++count <= DEFAULT_RETRY_IO_COUNT)
			goto retry;
		f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE);
	}
	return folio;
}

/* for POR only */
struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index)
{
	return __get_meta_folio(sbi, index, false);
}

static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
								int type)
{
	struct seg_entry *se;
	unsigned int segno, offset;
	bool exist;

	if (type == DATA_GENERIC)
		return true;

	segno = GET_SEGNO(sbi, blkaddr);
	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
	se = get_seg_entry(sbi, segno);

	exist = f2fs_test_bit(offset, se->cur_valid_map);

	/* skip the data checks if we already have an error in the checkpoint. */
	if (unlikely(f2fs_cp_error(sbi)))
		return exist;

	if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) ||
		(!exist && type == DATA_GENERIC_ENHANCE))
		goto out_err;
	if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE)
		goto out_handle;
	return exist;

out_err:
	f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
		 blkaddr, exist);
	set_sbi_flag(sbi, SBI_NEED_FSCK);
	dump_stack();
out_handle:
	f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
	return exist;
}

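/*
 * These checks encode the on-disk layout: the CP area lies between
 * __start_cp_addr() and the SIT base, SSA between ssa_blkaddr and
 * MAIN_BLKADDR(), and data block addresses must fall inside
 * [MAIN_BLKADDR(), MAX_BLKADDR()).  The DATA_GENERIC_ENHANCE* types
 * additionally cross-check the SIT bitmap via __is_bitmap_valid().
 */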
static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
					block_t blkaddr, int type)
{
	switch (type) {
	case META_NAT:
		break;
	case META_SIT:
		if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
			goto check_only;
		break;
	case META_SSA:
		if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
			blkaddr < SM_I(sbi)->ssa_blkaddr))
			goto check_only;
		break;
	case META_CP:
		if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
			blkaddr < __start_cp_addr(sbi)))
			goto check_only;
		break;
	case META_POR:
		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
			blkaddr < MAIN_BLKADDR(sbi)))
			goto check_only;
		break;
	case DATA_GENERIC:
	case DATA_GENERIC_ENHANCE:
	case DATA_GENERIC_ENHANCE_READ:
	case DATA_GENERIC_ENHANCE_UPDATE:
		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
				blkaddr < MAIN_BLKADDR(sbi))) {

			/* Skip emitting an error message. */
			if (unlikely(f2fs_cp_error(sbi)))
				return false;

			f2fs_warn(sbi, "access invalid blkaddr:%u",
				  blkaddr);
			set_sbi_flag(sbi, SBI_NEED_FSCK);
			dump_stack();
			goto err;
		} else {
			return __is_bitmap_valid(sbi, blkaddr, type);
		}
		break;
	case META_GENERIC:
		if (unlikely(blkaddr < SEG0_BLKADDR(sbi) ||
			blkaddr >= MAIN_BLKADDR(sbi)))
			goto err;
		break;
	default:
		BUG();
	}

	return true;
err:
	f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
check_only:
	return false;
}

bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
					block_t blkaddr, int type)
{
	if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
		return false;
	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
}

bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
					block_t blkaddr, int type)
{
	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
}

/*
 * Readahead CP/NAT/SIT/SSA/POR pages
 */
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
							int type, bool sync)
{
	block_t blkno = start;
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.type = META,
		.op = REQ_OP_READ,
		.op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
		.encrypted_page = NULL,
		.in_list = 0,
		.is_por = (type == META_POR) ? 1 : 0,
	};
	struct blk_plug plug;
	int err;

	if (unlikely(type == META_POR))
		fio.op_flags &= ~REQ_META;

	blk_start_plug(&plug);
	for (; nrpages-- > 0; blkno++) {
		struct folio *folio;

		if (!f2fs_is_valid_blkaddr(sbi, blkno, type))
			goto out;

		switch (type) {
		case META_NAT:
			if (unlikely(blkno >=
					NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
				blkno = 0;
			/* get nat block addr */
			fio.new_blkaddr = current_nat_addr(sbi,
					blkno * NAT_ENTRY_PER_BLOCK);
			break;
		case META_SIT:
			if (unlikely(blkno >= TOTAL_SEGS(sbi)))
				goto out;
			/* get sit block addr */
			fio.new_blkaddr = current_sit_addr(sbi,
					blkno * SIT_ENTRY_PER_BLOCK);
			break;
		case META_SSA:
		case META_CP:
		case META_POR:
			fio.new_blkaddr = blkno;
			break;
		default:
			BUG();
		}

		folio = f2fs_grab_cache_folio(META_MAPPING(sbi),
						fio.new_blkaddr, false);
		if (IS_ERR(folio))
			continue;
		if (folio_test_uptodate(folio)) {
			f2fs_folio_put(folio, true);
			continue;
		}

		fio.folio = folio;
		err = f2fs_submit_page_bio(&fio);
		f2fs_folio_put(folio, err ? true : false);

		if (!err)
			f2fs_update_iostat(sbi, NULL, FS_META_READ_IO,
							F2FS_BLKSIZE);
	}
out:
	blk_finish_plug(&plug);
	return blkno - start;
}

void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
							unsigned int ra_blocks)
{
	struct folio *folio;
	bool readahead = false;

	if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
		return;

	folio = filemap_get_folio(META_MAPPING(sbi), index);
	if (IS_ERR(folio) || !folio_test_uptodate(folio))
		readahead = true;
	f2fs_folio_put(folio, false);

	if (readahead)
		f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}

static bool __f2fs_write_meta_folio(struct folio *folio,
				struct writeback_control *wbc,
				enum iostat_type io_type)
{
	struct f2fs_sb_info *sbi = F2FS_F_SB(folio);

	trace_f2fs_writepage(folio, META);

	if (unlikely(f2fs_cp_error(sbi))) {
		if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
			folio_clear_uptodate(folio);
			dec_page_count(sbi, F2FS_DIRTY_META);
			folio_unlock(folio);
			return true;
		}
		goto redirty_out;
	}
	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto redirty_out;

	f2fs_do_write_meta_page(sbi, folio, io_type);
	dec_page_count(sbi, F2FS_DIRTY_META);

	folio_unlock(folio);

	if (unlikely(f2fs_cp_error(sbi)))
		f2fs_submit_merged_write(sbi, META);

	return true;

redirty_out:
	folio_redirty_for_writepage(wbc, folio);
	return false;
}

static int f2fs_write_meta_pages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
	struct f2fs_lock_context lc;
	long diff, written;

	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
		goto skip_write;

	/* collect a number of dirty meta pages and write together */
	if (wbc->sync_mode != WB_SYNC_ALL &&
			get_pages(sbi, F2FS_DIRTY_META) <
					nr_pages_to_skip(sbi, META))
		goto skip_write;

	/* if the trylock fails, checkpoint will flush the dirty pages instead */
	if (!f2fs_down_write_trylock_trace(&sbi->cp_global_sem, &lc))
		goto skip_write;

	trace_f2fs_writepages(mapping->host, wbc, META);
	diff = nr_pages_to_write(sbi, META, wbc);
	written = f2fs_sync_meta_pages(sbi, wbc->nr_to_write, FS_META_IO);
	f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
	wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
	return 0;

skip_write:
	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
	trace_f2fs_writepages(mapping->host, wbc, META);
	return 0;
}

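/*
 * Write back dirty meta folios in index order.  When a write budget is
 * given (nr_to_write != LONG_MAX), the scan stops at the first gap in
 * the dirty range, so a throttled flush submits only one physically
 * contiguous run of meta blocks.
 */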
long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write,
				enum iostat_type io_type)
{
	struct address_space *mapping = META_MAPPING(sbi);
	pgoff_t index = 0, prev = ULONG_MAX;
	struct folio_batch fbatch;
	long nwritten = 0;
	int nr_folios;
	struct writeback_control wbc = {};
	struct blk_plug plug;

	folio_batch_init(&fbatch);

	blk_start_plug(&plug);

	while ((nr_folios = filemap_get_folios_tag(mapping, &index,
					(pgoff_t)-1,
					PAGECACHE_TAG_DIRTY, &fbatch))) {
		int i;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			if (nr_to_write != LONG_MAX && i != 0 &&
					folio->index != prev +
					folio_nr_pages(fbatch.folios[i-1])) {
				folio_batch_release(&fbatch);
				goto stop;
			}

			folio_lock(folio);

			if (unlikely(!is_meta_folio(folio))) {
continue_unlock:
				folio_unlock(folio);
				continue;
			}
			if (!folio_test_dirty(folio)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			f2fs_folio_wait_writeback(folio, META, true, true);

			if (!folio_clear_dirty_for_io(folio))
				goto continue_unlock;

			if (!__f2fs_write_meta_folio(folio, &wbc,
						io_type)) {
				folio_unlock(folio);
				break;
			}
			nwritten += folio_nr_pages(folio);
			prev = folio->index;
			if (unlikely(nwritten >= nr_to_write))
				break;
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
stop:
	if (nwritten)
		f2fs_submit_merged_write(sbi, META);

	blk_finish_plug(&plug);

	return nwritten;
}

static bool f2fs_dirty_meta_folio(struct address_space *mapping,
		struct folio *folio)
{
	trace_f2fs_set_page_dirty(folio, META);

	if (!folio_test_uptodate(folio))
		folio_mark_uptodate(folio);
	if (filemap_dirty_folio(mapping, folio)) {
		inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
		folio_set_f2fs_reference(folio);
		return true;
	}
	return false;
}

const struct address_space_operations f2fs_meta_aops = {
	.writepages = f2fs_write_meta_pages,
	.dirty_folio = f2fs_dirty_meta_folio,
	.invalidate_folio = f2fs_invalidate_folio,
	.release_folio = f2fs_release_folio,
	.migrate_folio = filemap_migrate_folio,
};

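/*
 * Insert @ino into the per-type radix tree and list.  The slab entry is
 * allocated before taking ino_lock, and radix_tree_preload() guarantees
 * the insertion cannot fail under the spinlock; if a racing inserter
 * wins, the preallocated entry is freed afterwards.
 */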
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
						unsigned int devidx, int type)
{
	struct inode_management *im = &sbi->im[type];
	struct ino_entry *e = NULL, *new = NULL;
	int ret;

	if (type == FLUSH_INO) {
		rcu_read_lock();
		e = radix_tree_lookup(&im->ino_root, ino);
		rcu_read_unlock();
	}

retry:
	if (!e)
		new = f2fs_kmem_cache_alloc(ino_entry_slab,
						GFP_NOFS, true, NULL);

	ret = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
	f2fs_bug_on(sbi, ret);

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	if (!e) {
		if (!new) {
			spin_unlock(&im->ino_lock);
			radix_tree_preload_end();
			goto retry;
		}
		e = new;
		if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
			f2fs_bug_on(sbi, 1);

		memset(e, 0, sizeof(struct ino_entry));
		e->ino = ino;

		list_add_tail(&e->list, &im->ino_list);
		if (type != ORPHAN_INO)
			im->ino_num++;
	}

	if (type == FLUSH_INO)
		f2fs_set_bit(devidx, (char *)&e->dirty_device);

	spin_unlock(&im->ino_lock);
	radix_tree_preload_end();

	if (new && e != new)
		kmem_cache_free(ino_entry_slab, new);
}

static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	struct inode_management *im = &sbi->im[type];
	struct ino_entry *e;

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	if (e) {
		list_del(&e->list);
		radix_tree_delete(&im->ino_root, ino);
		im->ino_num--;
		spin_unlock(&im->ino_lock);
		kmem_cache_free(ino_entry_slab, e);
		return;
	}
	spin_unlock(&im->ino_lock);
}

void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* add new dirty ino entry into list */
	__add_ino_entry(sbi, ino, 0, type);
}

void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
	/* remove dirty ino entry from list */
	__remove_ino_entry(sbi, ino, type);
}

/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */
bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
{
	struct inode_management *im = &sbi->im[mode];
	struct ino_entry *e;

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	spin_unlock(&im->ino_lock);
	return e ? true : false;
}

void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all)
{
	struct ino_entry *e, *tmp;
	int i;

	for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) {
		struct inode_management *im = &sbi->im[i];

		spin_lock(&im->ino_lock);
		list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
			list_del(&e->list);
			radix_tree_delete(&im->ino_root, e->ino);
			kmem_cache_free(ino_entry_slab, e);
			im->ino_num--;
		}
		spin_unlock(&im->ino_lock);
	}
}

void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
					unsigned int devidx, int type)
{
	__add_ino_entry(sbi, ino, devidx, type);
}

bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
					unsigned int devidx, int type)
{
	struct inode_management *im = &sbi->im[type];
	struct ino_entry *e;
	bool is_dirty = false;

	spin_lock(&im->ino_lock);
	e = radix_tree_lookup(&im->ino_root, ino);
	if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device))
		is_dirty = true;
	spin_unlock(&im->ino_lock);
	return is_dirty;
}

int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
	struct inode_management *im = &sbi->im[ORPHAN_INO];
	int err = 0;

	spin_lock(&im->ino_lock);

	if (time_to_inject(sbi, FAULT_ORPHAN)) {
		spin_unlock(&im->ino_lock);
		return -ENOSPC;
	}

	if (unlikely(im->ino_num >= sbi->max_orphans))
		err = -ENOSPC;
	else
		im->ino_num++;
	spin_unlock(&im->ino_lock);

	return err;
}

void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi)
{
	struct inode_management *im = &sbi->im[ORPHAN_INO];

	spin_lock(&im->ino_lock);
	f2fs_bug_on(sbi, im->ino_num == 0);
	im->ino_num--;
	spin_unlock(&im->ino_lock);
}

void f2fs_add_orphan_inode(struct inode *inode)
{
	/* add new orphan ino entry into list */
	__add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO);
	f2fs_update_inode_page(inode);
}

void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	/* remove orphan entry from orphan list */
	__remove_ino_entry(sbi, ino, ORPHAN_INO);
}

static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
	struct inode *inode;
	struct node_info ni;
	int err;

	inode = f2fs_iget_retry(sbi->sb, ino);
	if (IS_ERR(inode)) {
		/*
		 * It should be a bug if we cannot find the entry
		 * for the orphan inode.
		 */
		f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT);
		return PTR_ERR(inode);
	}

	err = f2fs_dquot_initialize(inode);
	if (err) {
		iput(inode);
		goto err_out;
	}

	clear_nlink(inode);

	/* truncate all the data during iput */
	iput(inode);

	err = f2fs_get_node_info(sbi, ino, &ni, false);
	if (err)
		goto err_out;

	/* ENOMEM was fully retried in f2fs_evict_inode. */
	if (ni.blk_addr != NULL_ADDR) {
		err = -EIO;
		goto err_out;
	}
	return 0;

err_out:
	set_sbi_flag(sbi, SBI_NEED_FSCK);
	f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.",
		  __func__, ino);
	return err;
}

int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
	block_t start_blk, orphan_blocks, i, j;
	int err = 0;

	if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
		return 0;

	if (f2fs_hw_is_readonly(sbi)) {
		f2fs_info(sbi, "write access unavailable, skipping orphan cleanup");
		return 0;
	}

	if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
		f2fs_info(sbi, "orphan cleanup on readonly fs");

	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);

	f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);

	for (i = 0; i < orphan_blocks; i++) {
		struct folio *folio;
		struct f2fs_orphan_block *orphan_blk;

		folio = f2fs_get_meta_folio(sbi, start_blk + i);
		if (IS_ERR(folio)) {
			err = PTR_ERR(folio);
			goto out;
		}

		orphan_blk = folio_address(folio);
		for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
			nid_t ino = le32_to_cpu(orphan_blk->ino[j]);

			err = recover_orphan_inode(sbi, ino);
			if (err) {
				f2fs_folio_put(folio, true);
				goto out;
			}
		}
		f2fs_folio_put(folio, true);
	}
	/* clear Orphan Flag */
	clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
out:
	set_sbi_flag(sbi, SBI_IS_RECOVERED);

	return err;
}

static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
{
	struct list_head *head;
	struct f2fs_orphan_block *orphan_blk = NULL;
	unsigned int nentries = 0;
	unsigned short index = 1;
	unsigned short orphan_blocks;
	struct folio *folio = NULL;
	struct ino_entry *orphan = NULL;
	struct inode_management *im = &sbi->im[ORPHAN_INO];

	orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);

	/*
	 * we don't need to do spin_lock(&im->ino_lock) here, since all the
	 * orphan inode operations are covered under f2fs_lock_op().
	 * And, spin_lock should be avoided due to page operations below.
	 */
	head = &im->ino_list;

	/* loop for each orphan inode entry and write them in journal block */
	list_for_each_entry(orphan, head, list) {
		if (!folio) {
			folio = f2fs_grab_meta_folio(sbi, start_blk++);
			orphan_blk = folio_address(folio);
			memset(orphan_blk, 0, sizeof(*orphan_blk));
		}

		orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);

		if (nentries == F2FS_ORPHANS_PER_BLOCK) {
			/*
			 * Once an orphan block is full of 1020 entries,
			 * flush the current orphan block and bring
			 * another one into memory.
			 */
			orphan_blk->blk_addr = cpu_to_le16(index);
			orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
			orphan_blk->entry_count = cpu_to_le32(nentries);
			folio_mark_dirty(folio);
			f2fs_folio_put(folio, true);
			index++;
			nentries = 0;
			folio = NULL;
		}
	}

	if (folio) {
		orphan_blk->blk_addr = cpu_to_le16(index);
		orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
		orphan_blk->entry_count = cpu_to_le32(nentries);
		folio_mark_dirty(folio);
		f2fs_folio_put(folio, true);
	}
}

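/*
 * The checkpoint checksum covers the bytes before checksum_offset.  If
 * the checksum field is not at the end of the block (checksum_offset <
 * CP_CHKSUM_OFFSET), the remainder of the block after the field is
 * chained into the same checksum, so the whole block stays covered.
 */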
static __u32 f2fs_checkpoint_chksum(struct f2fs_checkpoint *ckpt)
{
	unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset);
	__u32 chksum;

	chksum = f2fs_crc32(ckpt, chksum_ofs);
	if (chksum_ofs < CP_CHKSUM_OFFSET) {
		chksum_ofs += sizeof(chksum);
		chksum = f2fs_chksum(chksum, (__u8 *)ckpt + chksum_ofs,
						F2FS_BLKSIZE - chksum_ofs);
	}
	return chksum;
}

static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
		struct f2fs_checkpoint **cp_block, struct folio **cp_folio,
		unsigned long long *version)
{
	size_t crc_offset = 0;
	__u32 crc;

	*cp_folio = f2fs_get_meta_folio(sbi, cp_addr);
	if (IS_ERR(*cp_folio))
		return PTR_ERR(*cp_folio);

	*cp_block = folio_address(*cp_folio);

	crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
	if (crc_offset < CP_MIN_CHKSUM_OFFSET ||
			crc_offset > CP_CHKSUM_OFFSET) {
		f2fs_folio_put(*cp_folio, true);
		f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset);
		return -EINVAL;
	}

	crc = f2fs_checkpoint_chksum(*cp_block);
	if (crc != cur_cp_crc(*cp_block)) {
		f2fs_folio_put(*cp_folio, true);
		f2fs_warn(sbi, "invalid crc value");
		return -EINVAL;
	}

	*version = cur_cp_version(*cp_block);
	return 0;
}

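/*
 * A checkpoint pack is considered valid only when its first and last
 * blocks carry the same checkpoint version: the version is stored at
 * both ends, so a matching pair proves the whole pack reached the disk.
 */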
static struct folio *validate_checkpoint(struct f2fs_sb_info *sbi,
				block_t cp_addr, unsigned long long *version)
{
	struct folio *cp_folio_1 = NULL, *cp_folio_2 = NULL;
	struct f2fs_checkpoint *cp_block = NULL;
	unsigned long long cur_version = 0, pre_version = 0;
	unsigned int cp_blocks;
	int err;

	err = get_checkpoint_version(sbi, cp_addr, &cp_block,
					&cp_folio_1, version);
	if (err)
		return NULL;

	cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);

	if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) {
		f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
			  le32_to_cpu(cp_block->cp_pack_total_block_count));
		goto invalid_cp;
	}
	pre_version = *version;

	cp_addr += cp_blocks - 1;
	err = get_checkpoint_version(sbi, cp_addr, &cp_block,
					&cp_folio_2, version);
	if (err)
		goto invalid_cp;
	cur_version = *version;

	if (cur_version == pre_version) {
		*version = cur_version;
		f2fs_folio_put(cp_folio_2, true);
		return cp_folio_1;
	}
	f2fs_folio_put(cp_folio_2, true);
invalid_cp:
	f2fs_folio_put(cp_folio_1, true);
	return NULL;
}

int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
{
	struct f2fs_checkpoint *cp_block;
	struct f2fs_super_block *fsb = sbi->raw_super;
	struct folio *cp1, *cp2, *cur_folio;
	unsigned long blk_size = sbi->blocksize;
	unsigned long long cp1_version = 0, cp2_version = 0;
	unsigned long long cp_start_blk_no;
	unsigned int cp_blks = 1 + __cp_payload(sbi);
	block_t cp_blk_no;
	int i;
	int err;

	sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks),
				  GFP_KERNEL);
	if (!sbi->ckpt)
		return -ENOMEM;
	/*
	 * Finding the valid cp block involves reading both
	 * sets (cp pack 1 and cp pack 2).
	 */
	cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
	cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);

	/* The second checkpoint pack should start at the next segment */
	cp_start_blk_no += ((unsigned long long)1) <<
				le32_to_cpu(fsb->log_blocks_per_seg);
	cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);

	if (cp1 && cp2) {
		if (ver_after(cp2_version, cp1_version))
			cur_folio = cp2;
		else
			cur_folio = cp1;
	} else if (cp1) {
		cur_folio = cp1;
	} else if (cp2) {
		cur_folio = cp2;
	} else {
		err = -EFSCORRUPTED;
		goto fail_no_cp;
	}

	cp_block = folio_address(cur_folio);
	memcpy(sbi->ckpt, cp_block, blk_size);

	if (cur_folio == cp1)
		sbi->cur_cp_pack = 1;
	else
		sbi->cur_cp_pack = 2;

	/* Sanity checking of checkpoint */
	if (f2fs_sanity_check_ckpt(sbi)) {
		err = -EFSCORRUPTED;
		goto free_fail_no_cp;
	}

	if (cp_blks <= 1)
		goto done;

	cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
	if (cur_folio == cp2)
		cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg));

	for (i = 1; i < cp_blks; i++) {
		void *sit_bitmap_ptr;
		unsigned char *ckpt = (unsigned char *)sbi->ckpt;

		cur_folio = f2fs_get_meta_folio(sbi, cp_blk_no + i);
		if (IS_ERR(cur_folio)) {
			err = PTR_ERR(cur_folio);
			goto free_fail_no_cp;
		}
		sit_bitmap_ptr = folio_address(cur_folio);
		memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
		f2fs_folio_put(cur_folio, true);
	}
done:
	f2fs_folio_put(cp1, true);
	f2fs_folio_put(cp2, true);
	return 0;

free_fail_no_cp:
	f2fs_folio_put(cp1, true);
	f2fs_folio_put(cp2, true);
fail_no_cp:
	kvfree(sbi->ckpt);
	return err;
}

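/*
 * Dirty inode tracking: directories are always linked into
 * sbi->inode_list[DIR_INODE] so dirty dentry pages can be flushed
 * before a checkpoint; regular files and symlinks are tracked in the
 * FILE_INODE list only when the "data_flush" mount option is set.
 */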
static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;

	if (is_inode_flag_set(inode, flag))
		return;

	set_inode_flag(inode, flag);
	list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
	stat_inc_dirty_inode(sbi, type);
}

static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
{
	int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;

	if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag))
		return;

	list_del_init(&F2FS_I(inode)->dirty_list);
	clear_inode_flag(inode, flag);
	stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}

void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;

	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
			!S_ISLNK(inode->i_mode))
		return;

	spin_lock(&sbi->inode_lock[type]);
	if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH))
		__add_dirty_inode(inode, type);
	inode_inc_dirty_pages(inode);
	spin_unlock(&sbi->inode_lock[type]);

	folio_set_f2fs_reference(folio);
}

void f2fs_remove_dirty_inode(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;

	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
			!S_ISLNK(inode->i_mode))
		return;

	if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH))
		return;

	spin_lock(&sbi->inode_lock[type]);
	__remove_dirty_inode(inode, type);
	spin_unlock(&sbi->inode_lock[type]);
}

int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
						bool from_cp)
{
	struct list_head *head;
	struct inode *inode;
	struct f2fs_inode_info *fi;
	bool is_dir = (type == DIR_INODE);
	unsigned long ino = 0;

	trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
				get_pages(sbi, is_dir ?
				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
	if (unlikely(f2fs_cp_error(sbi))) {
		trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
				get_pages(sbi, is_dir ?
				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
		return -EIO;
	}

	spin_lock(&sbi->inode_lock[type]);

	head = &sbi->inode_list[type];
	if (list_empty(head)) {
		spin_unlock(&sbi->inode_lock[type]);
		trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
				get_pages(sbi, is_dir ?
				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
		return 0;
	}
	fi = list_first_entry(head, struct f2fs_inode_info, dirty_list);
	inode = igrab(&fi->vfs_inode);
	spin_unlock(&sbi->inode_lock[type]);
	if (inode) {
		unsigned long cur_ino = inode->i_ino;

		if (from_cp)
			F2FS_I(inode)->cp_task = current;
		F2FS_I(inode)->wb_task = current;

		filemap_fdatawrite(inode->i_mapping);

		F2FS_I(inode)->wb_task = NULL;
		if (from_cp)
			F2FS_I(inode)->cp_task = NULL;

		iput(inode);
		/* We need to give the cpu to other writers. */
		if (ino == cur_ino)
			cond_resched();
		else
			ino = cur_ino;
	} else {
		/*
		 * We should submit the bio, since several dentry pages
		 * under writeback may exist in the inode being freed.
		 */
		f2fs_submit_merged_write(sbi, DATA);
		cond_resched();
	}
	goto retry;
}

static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
{
	struct list_head *head = &sbi->inode_list[DIRTY_META];
	struct inode *inode;
	struct f2fs_inode_info *fi;
	s64 total = get_pages(sbi, F2FS_DIRTY_IMETA);

	while (total--) {
		if (unlikely(f2fs_cp_error(sbi)))
			return -EIO;

		spin_lock(&sbi->inode_lock[DIRTY_META]);
		if (list_empty(head)) {
			spin_unlock(&sbi->inode_lock[DIRTY_META]);
			return 0;
		}
		fi = list_first_entry(head, struct f2fs_inode_info,
							gdirty_list);
		inode = igrab(&fi->vfs_inode);
		spin_unlock(&sbi->inode_lock[DIRTY_META]);
		if (inode) {
			sync_inode_metadata(inode, 0);

			/* it's on eviction */
			if (is_inode_flag_set(inode, FI_DIRTY_INODE))
				f2fs_update_inode_page(inode);
			iput(inode);
		}
	}
	return 0;
}

static void __prepare_cp_block(struct f2fs_sb_info *sbi)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	nid_t last_nid = nm_i->next_scan_nid;

	next_free_nid(sbi, &last_nid);
	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
	ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
	ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
	ckpt->next_free_nid = cpu_to_le32(last_nid);

	/* update user_block_counts */
	sbi->last_valid_block_count = sbi->total_valid_block_count;
	percpu_counter_set(&sbi->alloc_valid_block_count, 0);
	percpu_counter_set(&sbi->rf_node_block_count, 0);
}

static bool __need_flush_quota(struct f2fs_sb_info *sbi)
{
	bool ret = false;

	if (!is_journalled_quota(sbi))
		return false;

	if (!f2fs_down_write_trylock(&sbi->quota_sem))
		return true;
	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
		ret = false;
	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
		ret = false;
	} else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) {
		clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
		ret = true;
	} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
		ret = true;
	}
	f2fs_up_write(&sbi->quota_sem);
	return ret;
}

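/*
 * block_operations() freezes the filesystem in stages: flush journalled
 * quotas, then dirty dentry pages, then dirty inode metadata, then
 * dirty node pages, re-taking cp_rwsem and retrying from the top
 * whenever a flush had to drop the lock.  Only once all of them are
 * clean does it return, with cp_rwsem and node_write still held until
 * unblock_operations() releases them.
 */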
/*
 * Freeze all the FS-operations for checkpoint.
 */
static int block_operations(struct f2fs_sb_info *sbi)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
	};
	int err = 0, cnt = 0;

	/*
	 * Let's flush inline_data in dirty node pages.
	 */
	f2fs_flush_inline_data(sbi);

retry_flush_quotas:
	f2fs_lock_all(sbi);
	if (__need_flush_quota(sbi)) {
		bool need_lock = sbi->umount_lock_holder != current;

		if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
			set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
			set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
			goto retry_flush_dents;
		}
		f2fs_unlock_all(sbi);

		/* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */
		if (!need_lock) {
			f2fs_do_quota_sync(sbi->sb, -1);
		} else if (down_read_trylock(&sbi->sb->s_umount)) {
			f2fs_do_quota_sync(sbi->sb, -1);
			up_read(&sbi->sb->s_umount);
		}
		cond_resched();
		goto retry_flush_quotas;
	}

retry_flush_dents:
	/* write all the dirty dentry pages */
	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
		f2fs_unlock_all(sbi);
		err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true);
		if (err)
			return err;
		cond_resched();
		goto retry_flush_quotas;
	}

	/*
	 * POR: we should ensure that there are no dirty node pages
	 * until finishing nat/sit flush. inode->i_blocks can be updated.
	 */
	f2fs_down_write(&sbi->node_change);

	if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
		f2fs_up_write(&sbi->node_change);
		f2fs_unlock_all(sbi);
		err = f2fs_sync_inode_meta(sbi);
		if (err)
			return err;
		cond_resched();
		goto retry_flush_quotas;
	}

retry_flush_nodes:
	f2fs_down_write(&sbi->node_write);

	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
		f2fs_up_write(&sbi->node_write);
		atomic_inc(&sbi->wb_sync_req[NODE]);
		err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
		atomic_dec(&sbi->wb_sync_req[NODE]);
		if (err) {
			f2fs_up_write(&sbi->node_change);
			f2fs_unlock_all(sbi);
			return err;
		}
		cond_resched();
		goto retry_flush_nodes;
	}

	/*
	 * sbi->node_change is used only for the AIO write_begin path, which
	 * produces dirty node blocks and some checkpoint values by block
	 * allocation.
	 */
	__prepare_cp_block(sbi);
	f2fs_up_write(&sbi->node_change);
	return err;
}

static void unblock_operations(struct f2fs_sb_info *sbi)
{
	f2fs_up_write(&sbi->node_write);
	f2fs_unlock_all(sbi);
}

void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
{
	DEFINE_WAIT(wait);

	for (;;) {
		if (!get_pages(sbi, type))
			break;

		if (unlikely(f2fs_cp_error(sbi) &&
			!is_sbi_flag_set(sbi, SBI_IS_CLOSE)))
			break;

		if (type == F2FS_DIRTY_META)
			f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
		else if (type == F2FS_WB_CP_DATA)
			f2fs_submit_merged_write(sbi, DATA);

		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
		io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
	}
	finish_wait(&sbi->cp_wait, &wait);
}

static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	unsigned long flags;

	spin_lock_irqsave(&sbi->cp_lock, flags);

	if ((cpc->reason & CP_UMOUNT) &&
			le32_to_cpu(ckpt->cp_pack_total_block_count) >
			sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
		disable_nat_bits(sbi, false);

	if (cpc->reason & CP_TRIMMED)
		__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG);

	if (cpc->reason & CP_UMOUNT)
		__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);

	if (cpc->reason & CP_FASTBOOT)
		__set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);

	if (orphan_num)
		__set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);

	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
		__set_ckpt_flags(ckpt, CP_FSCK_FLAG);

	if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
		__set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);

	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
		__set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);

	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
		__set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);

	if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);

	if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
		__set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);

	/* set this flag to activate crc|cp_ver for recovery */
	__set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
	__clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG);

	spin_unlock_irqrestore(&sbi->cp_lock, flags);
}

static void commit_checkpoint(struct f2fs_sb_info *sbi,
	void *src, block_t blk_addr)
{
	struct writeback_control wbc = {};

	/*
	 * filemap_get_folios_tag and folio_lock again will take
	 * some extra time. Therefore, f2fs_update_meta_pages and
	 * f2fs_sync_meta_pages are combined in this function.
	 */
	struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);

	memcpy(folio_address(folio), src, PAGE_SIZE);

	folio_mark_dirty(folio);
	if (unlikely(!folio_clear_dirty_for_io(folio)))
		f2fs_bug_on(sbi, 1);

	/* writeout cp pack 2 page */
	if (unlikely(!__f2fs_write_meta_folio(folio, &wbc, FS_CP_META_IO))) {
		if (f2fs_cp_error(sbi)) {
			f2fs_folio_put(folio, true);
			return;
		}
		f2fs_bug_on(sbi, true);
	}

	f2fs_folio_put(folio, false);

	/* submit checkpoint (with barrier if NOBARRIER is not set) */
	f2fs_submit_merged_write(sbi, META_FLUSH);
}

static inline u64 get_sectors_written(struct block_device *bdev)
{
	return (u64)part_stat_read(bdev, sectors[STAT_WRITE]);
}

u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
{
	if (f2fs_is_multi_device(sbi)) {
		u64 sectors = 0;
		int i;

		for (i = 0; i < sbi->s_ndevs; i++)
			sectors += get_sectors_written(FDEV(i).bdev);

		return sectors;
	}

	return get_sectors_written(sbi->sb->s_bdev);
}

static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type)
{
	cpc->stats.times[type] = ktime_get();
}

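/*
 * Track the slowest checkpoint seen so far in sbi->cp_stats and, when a
 * checkpoint exceeds CP_LONG_LATENCY_THRESHOLD, log how long each step
 * recorded by stat_cp_time() took.
 */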
static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	unsigned long long sb_diff, cur_diff;
	enum cp_time ct;

	sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END],
				      sbi->cp_stats.times[CP_TIME_START]);
	cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END],
				       cpc->stats.times[CP_TIME_START]);

	if (cur_diff > sb_diff) {
		sbi->cp_stats = cpc->stats;
		if (cur_diff < CP_LONG_LATENCY_THRESHOLD)
			return;

		f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff);
		for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++)
			f2fs_warn(sbi, "Step#%d: %llu ms", ct,
				(u64)ktime_ms_delta(cpc->stats.times[ct + 1],
						    cpc->stats.times[ct]));
	}
}

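/*
 * do_checkpoint() stages cp pack 1 in the meta page cache in order:
 * the checkpoint header block, __cp_payload() extra blocks (e.g. large
 * SIT/NAT bitmaps), orphan blocks, data summaries and, when node
 * summaries must be kept, node summaries.  All of it is written out and
 * the device cache flushed before commit_checkpoint() writes the final
 * "cp pack 2" block (with a barrier unless NOBARRIER is set), which is
 * what makes the new checkpoint pack valid.
 */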
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags;
	block_t start_blk;
	unsigned int data_sum_blocks, orphan_blocks;
	__u32 crc32 = 0;
	int i;
	int cp_payload_blks = __cp_payload(sbi);
	struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
	u64 kbytes_written;
	int err;

	/* Flush all the NAT/SIT pages */
	f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);

	stat_cp_time(cpc, CP_TIME_SYNC_META);

	/* start to update checkpoint, cp ver is already updated previously */
	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE);

		ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno);
		ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
		ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type;
	}
	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA);

		ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno);
		ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
		ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type;
	}

	/* 2 cp + n data seg summary + orphan inode blocks */
	data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
	spin_lock_irqsave(&sbi->cp_lock, flags);
	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
		__set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	spin_unlock_irqrestore(&sbi->cp_lock, flags);

	orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
			orphan_blocks);

	if (__remain_node_summaries(cpc->reason))
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks + NR_CURSEG_NODE_TYPE);
	else
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks);

	/* update ckpt flag for checkpoint */
	update_ckpt_flags(sbi, cpc);

	/* update SIT/NAT bitmap */
	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));

	crc32 = f2fs_checkpoint_chksum(ckpt);
	*((__le32 *)((unsigned char *)ckpt +
				le32_to_cpu(ckpt->checksum_offset)))
				= cpu_to_le32(crc32);

	start_blk = __start_cp_next_addr(sbi);

	/* write nat bits */
	if (enabled_nat_bits(sbi, cpc)) {
		__u64 cp_ver = cur_cp_version(ckpt);
		block_t blk;

		cp_ver |= ((__u64)crc32 << 32);
		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);

		blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
		for (i = 0; i < nm_i->nat_bits_blocks; i++)
			f2fs_update_meta_page(sbi, nm_i->nat_bits +
					F2FS_BLK_TO_BYTES(i), blk + i);
	}

	/* write out checkpoint buffer at block 0 */
	f2fs_update_meta_page(sbi, ckpt, start_blk++);

	for (i = 1; i < 1 + cp_payload_blks; i++)
		f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
							start_blk++);

	if (orphan_num) {
		write_orphan_inodes(sbi, start_blk);
		start_blk += orphan_blocks;
	}

	f2fs_write_data_summaries(sbi, start_blk);
	start_blk += data_sum_blocks;

	/* Record write statistics in the hot node summary */
	kbytes_written = sbi->kbytes_written;
	kbytes_written += (f2fs_get_sectors_written(sbi) -
				sbi->sectors_written_start) >> 1;
	seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);

	if (__remain_node_summaries(cpc->reason)) {
		f2fs_write_node_summaries(sbi, start_blk);
		start_blk += NR_CURSEG_NODE_TYPE;
	}

	/* Here, we have one bio having CP pack except cp pack 2 page */
	f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
	stat_cp_time(cpc, CP_TIME_SYNC_CP_META);

	/* Wait for all dirty meta pages to be submitted for IO */
	f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
	stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META);

	/* wait for previous submitted meta pages writeback */
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
	stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA);

	/* flush all device cache */
	err = f2fs_flush_device_cache(sbi);
	if (err)
		return err;
	stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE);

	/* barrier and flush checkpoint cp pack 2 page if it can */
	commit_checkpoint(sbi, ckpt, start_blk);
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
	stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP);

	/*
	 * Invalidate the intermediate page cache borrowed from the meta
	 * inode, which is used for migrating blocks of encrypted, verity
	 * or compressed inodes.
	 */
1819 if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
1820 f2fs_sb_has_compression(sbi))
1821 f2fs_bug_on(sbi,
1822 invalidate_inode_pages2_range(META_MAPPING(sbi),
1823 MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1));
1824
1825 f2fs_release_ino_entry(sbi, false);
1826
1827 f2fs_reset_fsync_node_info(sbi);
1828
1829 clear_sbi_flag(sbi, SBI_IS_DIRTY);
1830 clear_sbi_flag(sbi, SBI_NEED_CP);
1831 clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
1832
1833 spin_lock(&sbi->stat_lock);
1834 sbi->unusable_block_count = 0;
1835 spin_unlock(&sbi->stat_lock);
1836
1837 __set_cp_next_pack(sbi);
1838
1839 /*
1840 * redirty superblock if metadata like node page or inode cache is
1841 * updated during writing checkpoint.
1842 */
1843 if (get_pages(sbi, F2FS_DIRTY_NODES) ||
1844 get_pages(sbi, F2FS_DIRTY_IMETA))
1845 set_sbi_flag(sbi, SBI_IS_DIRTY);
1846
1847 f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));
1848
1849 return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
1850 }
1851
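/*
 * f2fs_write_checkpoint - write a consistency point to disk
 *
 * A rough sketch of the flow, as implemented below: take cp_global_sem
 * (unless resizing), block filesystem operations, flush merged writes,
 * bump the checkpoint version, flush dirty NAT/SIT entries, then let
 * do_checkpoint() write the CP pack itself.  Per-phase latencies are
 * sampled via stat_cp_time() for the checkpoint time statistics.
 */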
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct f2fs_lock_context lc;
	unsigned long long ckpt_ver;
	int err = 0;

	stat_cp_time(cpc, CP_TIME_START);

	if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi))
		return -EROFS;

	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
		if (cpc->reason != CP_PAUSE)
			return 0;
		f2fs_warn(sbi, "Start checkpoint disabled!");
	}
	if (cpc->reason != CP_RESIZE)
		f2fs_down_write_trace(&sbi->cp_global_sem, &lc);

	stat_cp_time(cpc, CP_TIME_LOCK);

	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
		((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
		((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
		goto out;
	if (unlikely(f2fs_cp_error(sbi))) {
		err = -EIO;
		goto out;
	}

	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_START_BLOCK_OPS);

	err = block_operations(sbi);
	if (err)
		goto out;

	stat_cp_time(cpc, CP_TIME_OP_LOCK);

	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_BLOCK_OPS);

	f2fs_flush_merged_writes(sbi);

	/* this is the case of multiple fstrims without any changes */
	if (cpc->reason & CP_DISCARD) {
		if (!f2fs_exist_trim_candidates(sbi, cpc)) {
			unblock_operations(sbi);
			goto out;
		}

		if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
				SIT_I(sbi)->dirty_sentries == 0 &&
				prefree_segments(sbi) == 0) {
			f2fs_flush_sit_entries(sbi, cpc);
			f2fs_clear_prefree_segments(sbi, cpc);
			unblock_operations(sbi);
			goto out;
		}
	}
	stat_cp_time(cpc, CP_TIME_MERGE_WRITE);

	/*
	 * update checkpoint pack index: increase the version number so that
	 * SIT entries and seg summaries are written to the correct place
	 */
	ckpt_ver = cur_cp_version(ckpt);
	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

	/* write cached NAT/SIT entries to NAT/SIT area */
	err = f2fs_flush_nat_entries(sbi, cpc);
	if (err) {
		f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
		f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
		goto stop;
	}
	stat_cp_time(cpc, CP_TIME_FLUSH_NAT);

	f2fs_flush_sit_entries(sbi, cpc);

	stat_cp_time(cpc, CP_TIME_FLUSH_SIT);

	/* save inmem log status */
	f2fs_save_inmem_curseg(sbi);

	err = do_checkpoint(sbi, cpc);
	if (err) {
		f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
		f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
		f2fs_release_discard_addrs(sbi);
	} else {
		f2fs_clear_prefree_segments(sbi, cpc);
	}

	f2fs_restore_inmem_curseg(sbi);
	f2fs_reinit_atgc_curseg(sbi);
	stat_inc_cp_count(sbi);
stop:
	unblock_operations(sbi);
	stat_cp_time(cpc, CP_TIME_END);
	check_cp_time(sbi, cpc);

	if (cpc->reason & CP_RECOVERY)
		f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);

	/* update CP_TIME to trigger checkpoint periodically */
	f2fs_update_time(sbi, CP_TIME);
	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT);
out:
	if (cpc->reason != CP_RESIZE)
		f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
	return err;
}

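/*
 * Reserve room for orphan inodes: a CP pack segment must also hold the
 * checkpoint header pages (F2FS_CP_PACKS), the persistent curseg summaries
 * and the checkpoint payload, so only the remaining blocks of the segment
 * can carry orphan entries, at F2FS_ORPHANS_PER_BLOCK entries per block.
 */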
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
{
	int i;

	for (i = 0; i < MAX_INO_ENTRY; i++) {
		struct inode_management *im = &sbi->im[i];

		INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
		spin_lock_init(&im->ino_lock);
		INIT_LIST_HEAD(&im->ino_list);
		im->ino_num = 0;
	}

	sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS -
			NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
				F2FS_ORPHANS_PER_BLOCK;
}

int __init f2fs_create_checkpoint_caches(void)
{
	ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
			sizeof(struct ino_entry));
	if (!ino_entry_slab)
		return -ENOMEM;
	f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
			sizeof(struct inode_entry));
	if (!f2fs_inode_entry_slab) {
		kmem_cache_destroy(ino_entry_slab);
		return -ENOMEM;
	}
	return 0;
}

void f2fs_destroy_checkpoint_caches(void)
{
	kmem_cache_destroy(ino_entry_slab);
	kmem_cache_destroy(f2fs_inode_entry_slab);
}

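/*
 * Issue one checkpoint synchronously under gc_lock, so it cannot race with
 * a concurrent GC moving blocks while the checkpoint is being written.
 */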
static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
{
	struct cp_control cpc = { .reason = CP_SYNC, };
	struct f2fs_lock_context lc;
	int err;

	f2fs_down_write_trace(&sbi->gc_lock, &lc);
	err = f2fs_write_checkpoint(sbi, &cpc);
	f2fs_up_write_trace(&sbi->gc_lock, &lc);

	return err;
}

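/*
 * Drain the whole issue_list and serve every queued request with a single
 * checkpoint: one f2fs_write_checkpoint() call persists the state for all
 * waiters, each of which is then completed with the same return value.
 * Average and peak service times are tracked for the stat interface.
 */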
static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	struct ckpt_req *req, *next;
	struct llist_node *dispatch_list;
	u64 sum_diff = 0, diff, count = 0;
	int ret;

	dispatch_list = llist_del_all(&cprc->issue_list);
	if (!dispatch_list)
		return;
	dispatch_list = llist_reverse_order(dispatch_list);

	ret = __write_checkpoint_sync(sbi);
	atomic_inc(&cprc->issued_ckpt);

	llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
		diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
		req->ret = ret;
		req->delta_time = diff;
		complete(&req->wait);

		sum_diff += diff;
		count++;
	}
	atomic_sub(count, &cprc->queued_ckpt);
	atomic_add(count, &cprc->total_ckpt);

	spin_lock(&cprc->stat_lock);
	cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
	if (cprc->peak_time < cprc->cur_time)
		cprc->peak_time = cprc->cur_time;
	spin_unlock(&cprc->stat_lock);
}

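/*
 * Background thread backing MERGE_CHECKPOINT mode: it sleeps until a
 * request is queued on issue_list, then batches all pending requests into
 * one checkpoint via __checkpoint_and_complete_reqs().
 */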
static int issue_checkpoint_thread(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	wait_queue_head_t *q = &cprc->ckpt_wait_queue;
repeat:
	if (kthread_should_stop())
		return 0;

	if (!llist_empty(&cprc->issue_list))
		__checkpoint_and_complete_reqs(sbi);

	wait_event_interruptible(*q,
		kthread_should_stop() || !llist_empty(&cprc->issue_list));
	goto repeat;
}

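/*
 * Flush whatever still sits on issue_list from the caller's context.  If
 * the list is already empty, the issue thread has picked our request up,
 * so just wait for its completion (when @wait_req is given).
 */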
static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
		struct ckpt_req *wait_req)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	if (!llist_empty(&cprc->issue_list)) {
		__checkpoint_and_complete_reqs(sbi);
	} else {
		/* already dispatched by issue_checkpoint_thread */
		if (wait_req)
			wait_for_completion(&wait_req->wait);
	}
}

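/* Stamp queue_time at enqueue so the service latency can be measured. */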
static void init_ckpt_req(struct ckpt_req *req)
{
	memset(req, 0, sizeof(struct ckpt_req));

	init_completion(&req->wait);
	req->queue_time = ktime_get();
}

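/*
 * Entry point for callers that want a checkpoint but do not care whether
 * it runs inline or merged: without MERGE_CHECKPOINT (or for non-CP_SYNC
 * reasons, or from the umount lock holder) the checkpoint runs
 * synchronously in the caller's context; otherwise the request is queued
 * for the issue thread and the caller sleeps until it completes.
 *
 * A typical (hypothetical) caller would look like:
 *
 *	if (f2fs_issue_checkpoint(sbi))
 *		f2fs_err(sbi, "checkpoint failed");
 */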
int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	struct ckpt_req req;
	struct cp_control cpc;

	cpc.reason = __get_cp_reason(sbi);
	if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC ||
			sbi->umount_lock_holder == current) {
		struct f2fs_lock_context lc;
		int ret;

		f2fs_down_write_trace(&sbi->gc_lock, &lc);
		ret = f2fs_write_checkpoint(sbi, &cpc);
		f2fs_up_write_trace(&sbi->gc_lock, &lc);

		return ret;
	}

	if (!cprc->f2fs_issue_ckpt)
		return __write_checkpoint_sync(sbi);

	init_ckpt_req(&req);

	llist_add(&req.llnode, &cprc->issue_list);
	atomic_inc(&cprc->queued_ckpt);

	/*
	 * update issue_list before we wake up the issue_checkpoint thread;
	 * this smp_mb() pairs with another barrier in ___wait_event().
	 * See the comments of waitqueue_active() for more details.
	 */
	smp_mb();

	if (waitqueue_active(&cprc->ckpt_wait_queue))
		wake_up(&cprc->ckpt_wait_queue);

	if (cprc->f2fs_issue_ckpt)
		wait_for_completion(&req.wait);
	else
		flush_remained_ckpt_reqs(sbi, &req);

	if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) {
		/* report this request's latency, not the historical peak */
		f2fs_warn_ratelimited(sbi,
			"blocked on checkpoint for %u ms",
			(unsigned int)req.delta_time);
		dump_stack();
	}

	return req.ret;
}

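/*
 * Spawn the per-device "f2fs_ckpt-major:minor" kthread and apply the
 * configured I/O priority and nice value, so checkpoint work is not
 * starved by regular I/O.
 */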
int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
{
	dev_t dev = sbi->sb->s_bdev->bd_dev;
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	if (cprc->f2fs_issue_ckpt)
		return 0;

	cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
			"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(cprc->f2fs_issue_ckpt)) {
		int err = PTR_ERR(cprc->f2fs_issue_ckpt);

		cprc->f2fs_issue_ckpt = NULL;
		return err;
	}

	set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
	set_user_nice(cprc->f2fs_issue_ckpt,
			PRIO_TO_NICE(sbi->critical_task_priority));

	return 0;
}

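/*
 * Clear f2fs_issue_ckpt before stopping the kthread: new callers of
 * f2fs_issue_checkpoint() then fall back to the synchronous path, and any
 * requests still queued are flushed afterwards.
 */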
void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;
	struct task_struct *ckpt_task;

	if (!cprc->f2fs_issue_ckpt)
		return;

	ckpt_task = cprc->f2fs_issue_ckpt;
	cprc->f2fs_issue_ckpt = NULL;
	kthread_stop(ckpt_task);

	f2fs_flush_ckpt_thread(sbi);
}

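/* Serve leftover requests in this context, then wait out in-flight ones. */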
void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	flush_remained_ckpt_reqs(sbi, NULL);

	/* Let's wait for the previously dispatched checkpoint. */
	while (atomic_read(&cprc->queued_ckpt))
		io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
}

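/* Reset all merge-checkpoint bookkeeping before the issue thread starts. */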
void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
{
	struct ckpt_req_control *cprc = &sbi->cprc_info;

	atomic_set(&cprc->issued_ckpt, 0);
	atomic_set(&cprc->total_ckpt, 0);
	atomic_set(&cprc->queued_ckpt, 0);
	cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
	init_waitqueue_head(&cprc->ckpt_wait_queue);
	init_llist_head(&cprc->issue_list);
	spin_lock_init(&cprc->stat_lock);
}