// SPDX-License-Identifier: GPL-2.0
/*
 * fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>
#include <kunit/static_stub.h>

/*
 * According to previous discussions at the Ext4 Developer Workshop, we
 * will introduce a new structure called the io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reservation space warnings), and to provide extent-level locking.
 * The delay extent tree is the first step towards this goal. It was
 * originally built by Yongqiang Yang. At that time it was called the
 * delay extent tree, whose only goal was to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it was still
 * called the delay extent tree at the first commit. But to better convey
 * what it does, it has since been renamed to the extent status tree.
 *
 * Step1:
 * Currently the first step has been done. All delayed extents are
 * tracked in the tree. The tree maintains a delayed extent when a
 * delayed allocation is issued, and removes it when the extent is
 * written out or invalidated. Therefore the implementations of fiemap
 * and bigalloc are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 *
 * Step2:
 * In this step all extent status is tracked by the extent status tree.
 * Thus, we can first try to look up a block mapping in this tree before
 * searching the extent tree. Hence, the single extent cache can be
 * removed because the extent status tree can do a better job. Extents
 * in the status tree are loaded on demand, so the extent status tree
 * may not contain all of the extents in a file. Meanwhile we define a
 * shrinker to reclaim memory from the extent status tree because a
 * fragmented extent tree will make the status tree consume too much
 * memory. Written/unwritten/hole extents in the tree will be reclaimed
 * by this shrinker when we are under high memory pressure. Delayed
 * extents will not be reclaimed because fiemap, bigalloc, and
 * seek_data/hole need them.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * The extent status tree tracks all extent status.
 *
 * 1. Why do we need to implement an extent status tree?
 *
 * Without an extent status tree, ext4 identifies a delayed extent by looking
 * up the page cache, which has several deficiencies - complicated, buggy,
 * and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
 * block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they work without an extent status tree.
 * -- FIEMAP
 *	FIEMAP looks up the page cache to distinguish delayed allocations
 *	from holes.
 *
 * -- SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 * -- bigalloc
 *	bigalloc looks up the page cache to figure out whether a block is
 *	already under delayed allocation, in order to determine whether
 *	a quota reservation is needed for the cluster.
 *
 * -- writeout
 *	Writeout scans the whole page cache to see if a buffer is mapped;
 *	even if there are not very many delayed buffers, this is time
 *	consuming.
 *
 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of
 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
 * or not by searching the extent status tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 * -- extent
 *	An extent is a range of blocks which are contiguous logically and
 *	physically. Unlike an extent in the extent tree, this extent is an
 *	in-memory struct; there is no corresponding on-disk data. There
 *	is no limit on the length of an extent, so an extent can contain
 *	as many blocks as are contiguous logically and physically.
 *
 * -- extent status tree
 *	Every inode has an extent status tree and all allocated blocks
 *	are added to the tree with their status. The extents in the
 *	tree are ordered by logical block number.
 *
 * -- operations on an extent status tree
 *	There are three important operations on an extent status tree:
 *	finding the next extent, adding an extent (a range of blocks), and
 *	removing an extent.
 *
 * -- race on an extent status tree
 *	The extent status tree is protected by inode->i_es_lock.
 *
 * -- memory consumption
 *	A fragmented extent tree will make the extent status tree consume
 *	too much memory. Hence, we will reclaim written/unwritten/hole
 *	extents from the tree under heavy memory pressure.
 *
 * ==========================================================================
 * 3. Assurance of Ext4 extent status tree consistency
 *
 * When mapping blocks, Ext4 queries the extent status tree first and should
 * always trust that the extent status tree is consistent and up to date.
 * Therefore, it is important to adhere to the following rules when creating,
 * modifying and removing extents.
 *
 * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
 *    the extent information should always be processed through the extent
 *    status tree instead of being organized manually through the on-disk
 *    extent tree.
 *
 * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
 *    exclusively and update the extent status tree atomically. If the extents
 *    to be modified are large enough to exceed the range that a single
 *    i_data_sem can process (as ext4_datasem_ensure_credits() may drop
 *    i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
 *    does):
 *
 *    a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
 *       exclusion against page faults, as well as reads and writes that may
 *       concurrently modify the extent status tree.
 *    b) Evict all page cache in the affected range and recommend rebuilding
 *       or dropping the extent status tree after modifying the on-disk
 *       extent tree. This ensures exclusion against concurrent writebacks
 *       that do not hold those locks but only hold a folio lock.
 *
 * 3. Based on the rules above, when querying block mappings, Ext4 should at
 *    least hold the i_rwsem or invalidate_lock or folio lock(s) for the
 *    specified querying range.
 *
 * ==========================================================================
 * 4. Performance analysis
 *
 * -- overhead
 *	1. There is a cached extent for write accesses, so if writes are
 *	   not very random, adding-space operations take O(1) time.
 *
 * -- gain
 *	2. Code is much simpler, more readable, more maintainable and
 *	   more efficient.
 *
 *
 * ==========================================================================
 * 5. TODO list
 *
 *	-- Refactor delayed space reservation
 *
 *	-- Extent-level locking
 */
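
/*
 * A minimal usage sketch (hypothetical caller; inode, lblk, len and pblk
 * are assumed to be provided by the caller), valid only under the locking
 * rules in section 3 above:
 *
 *	struct extent_status es;
 *
 *	if (!ext4_es_lookup_extent(inode, lblk, NULL, &es, NULL)) {
 *		// miss: consult the on-disk extent tree, then cache the
 *		// result, e.g. as a written mapping at pblk
 *		ext4_es_insert_extent(inode, lblk, len, pblk,
 *				      EXTENT_STATUS_WRITTEN, false);
 *	}
 */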

static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;

static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
			      struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end, unsigned int status,
			      int *reserved, struct extent_status *res,
			      struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei);
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
			    ext4_lblk_t len,
			    struct pending_reservation **prealloc);

int __init ext4_init_es(void)
{
	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}

static inline void ext4_es_inc_seq(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
}

static inline int __es_check_extent_status(struct extent_status *es,
					   unsigned int status,
					   struct extent_status *res)
{
	if (ext4_es_type(es) & status)
		return 0;

	if (res) {
		res->es_lblk = es->es_lblk;
		res->es_len = es->es_len;
		res->es_pblk = es->es_pblk;
	}
	return -EINVAL;
}

/*
 * Search through the tree for an extent with a given offset. If
 * it can't be found, try to find the next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_extent_range - find extent with specified status within block
 *                             range or next extent following block range in
 *                             extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - extent found, if any
 *
 * Find the first extent within the block range specified by @lblk and @end
 * in the extents status tree that satisfies @matching_fn.  If a match
 * is found, it's returned in @es.  If not, and a matching extent is found
 * beyond the block range, it's returned in @es.  If no match is found, an
 * extent is returned in @es whose es_lblk, es_len, and es_pblk components
 * are 0.
 */
static void __es_find_extent_range(struct inode *inode,
				   int (*matching_fn)(struct extent_status *es),
				   ext4_lblk_t lblk, ext4_lblk_t end,
				   struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	WARN_ON(es == NULL);
	WARN_ON(end < lblk);

	tree = &EXT4_I(inode)->i_es_tree;

	/* see if the extent has been cached */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	es1 = READ_ONCE(tree->cache_es);
	if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
		es_debug("%u cached by [%u/%u) %llu %x\n",
			 lblk, es1->es_lblk, es1->es_len,
			 ext4_es_pblock(es1), ext4_es_status(es1));
		goto out;
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	if (es1 && !matching_fn(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (matching_fn(es1))
				break;
		}
	}

	if (es1 && matching_fn(es1)) {
		WRITE_ONCE(tree->cache_es, es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}
}

/*
 * Locking for __es_find_extent_range() for external use
 */
void ext4_es_find_extent_range(struct inode *inode,
			       int (*matching_fn)(struct extent_status *es),
			       ext4_lblk_t lblk, ext4_lblk_t end,
			       struct extent_status *es)
{
	es->es_lblk = es->es_len = es->es_pblk = 0;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	trace_ext4_es_find_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	__es_find_extent_range(inode, matching_fn, lblk, end, es);
	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_extent_range_exit(inode, es);
}
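
/*
 * A matching function just tests an extent's status bits.  For example,
 * a (hypothetical) matcher selecting only delayed extents could be
 * passed as @matching_fn to ext4_es_find_extent_range():
 *
 *	static int es_match_delayed(struct extent_status *es)
 *	{
 *		return ext4_es_is_delayed(es);
 *	}
 */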

/*
 * __es_scan_range - search block range for block with specified status
 *                   in extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns true if at least one block in the specified block range satisfies
 * the criterion specified by @matching_fn, and false if not.  If at least
 * one extent has the specified status, then there is at least one block
 * in the range with that status.  Should only be called by code that has
 * taken i_es_lock.
 */
static bool __es_scan_range(struct inode *inode,
			    int (*matching_fn)(struct extent_status *es),
			    ext4_lblk_t start, ext4_lblk_t end)
{
	struct extent_status es;

	__es_find_extent_range(inode, matching_fn, start, end, &es);
	if (es.es_len == 0)
		return false; /* no matching extent in the tree */
	else if (es.es_lblk <= start &&
		 start < es.es_lblk + es.es_len)
		return true;
	else if (start <= es.es_lblk && es.es_lblk <= end)
		return true;
	else
		return false;
}

/*
 * Locking for __es_scan_range() for external use
 */
bool ext4_es_scan_range(struct inode *inode,
			int (*matching_fn)(struct extent_status *es),
			ext4_lblk_t lblk, ext4_lblk_t end)
{
	bool ret;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return false;

	read_lock(&EXT4_I(inode)->i_es_lock);
	ret = __es_scan_range(inode, matching_fn, lblk, end);
	read_unlock(&EXT4_I(inode)->i_es_lock);

	return ret;
}

/*
 * __es_scan_clu - search cluster for block with specified status in
 *                 extents status tree
 *
 * @inode - file containing the cluster
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block in cluster to be searched
 *
 * Returns true if at least one extent in the cluster containing @lblk
 * satisfies the criterion specified by @matching_fn, and false if not.  If at
 * least one extent has the specified status, then there is at least one block
 * in the cluster with that status.  Should only be called by code that has
 * taken i_es_lock.
 */
static bool __es_scan_clu(struct inode *inode,
			  int (*matching_fn)(struct extent_status *es),
			  ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t lblk_start, lblk_end;

	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;

	return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
}

/*
 * Locking for __es_scan_clu() for external use
 */
bool ext4_es_scan_clu(struct inode *inode,
		      int (*matching_fn)(struct extent_status *es),
		      ext4_lblk_t lblk)
{
	bool ret;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return false;

	read_lock(&EXT4_I(inode)->i_es_lock);
	ret = __es_scan_clu(inode, matching_fn, lblk);
	read_unlock(&EXT4_I(inode)->i_es_lock);

	return ret;
}

static void ext4_es_list_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	if (!list_empty(&ei->i_es_list))
		return;

	spin_lock(&sbi->s_es_lock);
	if (list_empty(&ei->i_es_list)) {
		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
		sbi->s_es_nr_inode++;
	}
	spin_unlock(&sbi->s_es_lock);
}

static void ext4_es_list_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lock);
	if (!list_empty(&ei->i_es_list)) {
		list_del_init(&ei->i_es_list);
		sbi->s_es_nr_inode--;
		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
	}
	spin_unlock(&sbi->s_es_lock);
}

static inline struct pending_reservation *__alloc_pending(bool nofail)
{
	if (!nofail)
		return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);

	return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
}

static inline void __free_pending(struct pending_reservation *pr)
{
	kmem_cache_free(ext4_pending_cachep, pr);
}

/*
 * Returns true if we cannot fail to allocate memory for this extent_status
 * entry and cannot reclaim it until its status changes.
 */
static inline bool ext4_es_must_keep(struct extent_status *es)
{
	/* fiemap, bigalloc, and seek_data/hole need to use it. */
	if (ext4_es_is_delayed(es))
		return true;

	return false;
}

static inline struct extent_status *__es_alloc_extent(bool nofail)
{
	if (!nofail)
		return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);

	return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
}

static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
		ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
	es->es_lblk = lblk;
	es->es_len = len;
	es->es_pblk = pblk;

	/* We never try to reclaim a must kept extent, so we don't count it. */
	if (!ext4_es_must_keep(es)) {
		if (!EXT4_I(inode)->i_es_shk_nr++)
			ext4_es_list_add(inode);
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
}

static inline void __es_free_extent(struct extent_status *es)
{
	kmem_cache_free(ext4_es_cachep, es);
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the shrink counter when we can reclaim the extent. */
	if (!ext4_es_must_keep(es)) {
		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
		if (!--EXT4_I(inode)->i_es_shk_nr)
			ext4_es_list_del(inode);
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_shk_cnt);
	}

	__es_free_extent(es);
}

/*
 * Check whether or not two extents can be merged
 * Condition:
 *  - logical block number is contiguous
 *  - physical block number is contiguous
 *  - status is equal
 */
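/*
 * For example (illustrative values only): [0/10) at pblk 100 and [10/5)
 * at pblk 110, both written, merge into [0/15) at pblk 100; the same
 * pair with differing status bits, or with the second extent at pblk
 * 111, would not merge.
 */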
static int ext4_es_can_be_merged(struct extent_status *es1,
				 struct extent_status *es2)
{
	if (ext4_es_type(es1) != ext4_es_type(es2))
		return 0;

	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
		pr_warn("ES assertion failed when merging extents. "
			"The sum of lengths of es1 (%d) and es2 (%d) "
			"is bigger than allowed file size (%d)\n",
			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
		WARN_ON(1);
		return 0;
	}

	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
		return 0;

	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
		return 1;

	if (ext4_es_is_hole(es1))
		return 1;

	/* we need to check delayed extent */
	if (ext4_es_is_delayed(es1))
		return 1;

	return 0;
}

static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es1, es)) {
		es1->es_len += es->es_len;
		if (ext4_es_is_referenced(es))
			ext4_es_set_referenced(es1);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es, es1)) {
		es->es_len += es1->es_len;
		if (ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es);
		rb_erase(node, &tree->root);
		ext4_es_free_extent(inode, es1);
	}

	return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */

static void ext4_es_insert_extent_ext_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t ee_block;
	ext4_fsblk_t ee_start;
	unsigned short ee_len;
	int depth, ee_status, es_status;

	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {
		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es do not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk, etc. because es
		 * might be a part of a whole extent, or vice versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk.  So we need to make sure
		 * that we aren't trying to add a written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}
out:
	ext4_free_ext_path(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks to look up a block mapping
	 * because the 'Indirect' structure is defined in indirect.c, so we
	 * can't access the direct/indirect tree from outside it, and it
	 * would be too ugly to define this function in indirect.c.
	 */
	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extents because
			 * indirect-based files don't have them.
			 */
			BUG();
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about races because the caller holds
	 * i_data_sem.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif

static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
			      struct extent_status *prealloc)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because the extents do not overlap.
				 */
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);
				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
			return -EINVAL;
		}
	}

	if (prealloc)
		es = prealloc;
	else
		es = __es_alloc_extent(false);
	if (!es)
		return -ENOMEM;
	ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
			    newes->es_pblk);

	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}
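
/*
 * Note the allocation retry pattern used by ext4_es_insert_extent()
 * below: the first pass attempts the update with GFP_ATOMIC allocations
 * made under i_es_lock; if any step fails, the lock is dropped, the
 * objects that were missing are preallocated with __GFP_NOFAIL, and the
 * whole update is retried so that it eventually succeeds without
 * sleeping under the lock.  Roughly:
 *
 *	retry:
 *		preallocate whatever failed last time (may sleep);
 *		write_lock(&EXT4_I(inode)->i_es_lock);
 *		remove old extents; insert new extent; revise pending;
 *		write_unlock(&EXT4_I(inode)->i_es_lock);
 *		if (any step failed)
 *			goto retry;
 */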
899
/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.  This interface is used for modifying extents.  To cache
 * on-disk extents, use ext4_es_cache_extent() instead.
 */
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			   ext4_lblk_t len, ext4_fsblk_t pblk,
			   unsigned int status, bool delalloc_reserve_used)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err1 = 0, err2 = 0, err3 = 0;
	int resv_used = 0, pending = 0;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct extent_status *es1 = NULL;
	struct extent_status *es2 = NULL;
	struct pending_reservation *pr = NULL;
	bool revise_pending = false;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
		 lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);

	if (!len)
		return;

	BUG_ON(end < lblk);
	WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);

	ext4_es_insert_extent_check(inode, &newes);

	revise_pending = sbi->s_cluster_ratio > 1 &&
			 test_opt(inode->i_sb, DELALLOC) &&
			 (status & (EXTENT_STATUS_WRITTEN |
				    EXTENT_STATUS_UNWRITTEN));
retry:
	if (err1 && !es1)
		es1 = __es_alloc_extent(true);
	if ((err1 || err2) && !es2)
		es2 = __es_alloc_extent(true);
	if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
		pr = __alloc_pending(true);
	write_lock(&EXT4_I(inode)->i_es_lock);

	err1 = __es_remove_extent(inode, lblk, end, 0, &resv_used, NULL, es1);
	if (err1 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es1) {
		if (!es1->es_len)
			__es_free_extent(es1);
		es1 = NULL;
	}

	err2 = __es_insert_extent(inode, &newes, es2);
	if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
		err2 = 0;
	if (err2 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es2) {
		if (!es2->es_len)
			__es_free_extent(es2);
		es2 = NULL;
	}

	if (revise_pending) {
		err3 = __revise_pending(inode, lblk, len, &pr);
		if (err3 < 0)
			goto error;
		if (pr) {
			__free_pending(pr);
			pr = NULL;
		}
		pending = err3;
	}
	ext4_es_inc_seq(inode);
error:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	/*
	 * Reduce the reserved cluster count to reflect successful deferred
	 * allocation of delayed allocated clusters or direct allocation of
	 * clusters discovered to be delayed allocated.  Once allocated, a
	 * cluster is not included in the reserved count.
	 *
	 * When direct allocating (from fallocate, filemap, DIO, or clusters
	 * allocated when delalloc has been disabled by ext4_nonda_switch()),
	 * an extent either 1) contains delayed blocks but starts with
	 * non-delayed allocated blocks (e.g. a hole), or 2) contains
	 * non-delayed allocated blocks which belong to delayed allocated
	 * clusters when the bigalloc feature is enabled.  In both cases,
	 * quota has already been claimed by ext4_mb_new_blocks(), so release
	 * the quota reservations made for any previously delayed allocated
	 * clusters instead of claiming them again.
	 */
	resv_used += pending;
	if (resv_used)
		ext4_da_update_reserve_space(inode, resv_used,
					     delalloc_reserve_used);

	if (err1 || err2 || err3 < 0)
		goto retry;

	trace_ext4_es_insert_extent(inode, &newes);
	ext4_es_print_tree(inode);
	return;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status tree
 * only if there is no existing information about the specified range or
 * if the existing extents have the same status.
 *
 * Note that this interface is only used for caching on-disk extent
 * information and cannot be used to convert existing extents in the extent
 * status tree.  To convert existing extents, use ext4_es_insert_extent()
 * instead.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status chkes, newes;
	ext4_lblk_t end = lblk + len - 1;
	bool conflict = false;
	int err;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);
	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (es && es->es_lblk <= end) {
		/* Found an extent that covers the entire range. */
		if (es->es_lblk <= lblk && es->es_lblk + es->es_len > end) {
			if (__es_check_extent_status(es, status, &chkes))
				conflict = true;
			goto unlock;
		}
		/* Check and remove all extents in range. */
		err = __es_remove_extent(inode, lblk, end, status, NULL,
					 &chkes, NULL);
		if (err) {
			if (err == -EINVAL)
				conflict = true;
			goto unlock;
		}
	}
	__es_insert_extent(inode, &newes, NULL);
	trace_ext4_es_cache_extent(inode, &newes);
	ext4_es_print_tree(inode);
unlock:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	if (!conflict)
		return;
	/*
	 * An on-disk hole overlaid by a delayed extent in the extent status
	 * tree is allowed.
	 */
	if (status == EXTENT_STATUS_HOLE &&
	    ext4_es_type(&chkes) == EXTENT_STATUS_DELAYED)
		return;

	ext4_warning_inode(inode,
		"ES cache extent failed: add [%d,%d,%llu,0x%x] conflict with existing [%d,%d,%llu,0x%x]\n",
		lblk, len, pblk, status, chkes.es_lblk, chkes.es_len,
		ext4_es_pblock(&chkes), ext4_es_status(&chkes));
}
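
/*
 * A minimal sketch of the intended split (hypothetical caller): after
 * reading a leaf extent from the on-disk extent tree, cache it with
 * ext4_es_cache_extent(); when a mapping actually changes, go through
 * ext4_es_insert_extent() instead.  Here ee_block, ee_len, ee_start and
 * unwritten are assumed to describe the on-disk extent being cached:
 *
 *	ext4_es_cache_extent(inode, ee_block, ee_len, ee_start,
 *			     unwritten ? EXTENT_STATUS_UNWRITTEN :
 *					 EXTENT_STATUS_WRITTEN);
 */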

/*
 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * Return: 1 if found, 0 if not.
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t *next_lblk, struct extent_status *es,
			  u64 *pseq)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* first see if the extent is cached */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	es1 = READ_ONCE(tree->cache_es);
	if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
		es_debug("%u cached by [%u/%u)\n",
			 lblk, es1->es_lblk, es1->es_len);
		found = 1;
		goto out;
	}

	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		if (!ext4_es_is_referenced(es1))
			ext4_es_set_referenced(es1);
		percpu_counter_inc(&stats->es_stats_cache_hits);
		if (next_lblk) {
			node = rb_next(&es1->rb_node);
			if (node) {
				es1 = rb_entry(node, struct extent_status,
					       rb_node);
				*next_lblk = es1->es_lblk;
			} else
				*next_lblk = 0;
		}
		if (pseq)
			*pseq = EXT4_I(inode)->i_es_seq;
	} else {
		percpu_counter_inc(&stats->es_stats_cache_misses);
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}

struct rsvd_count {
	int ndelayed;
	bool first_do_lblk_found;
	ext4_lblk_t first_do_lblk;
	ext4_lblk_t last_do_lblk;
	struct extent_status *left_es;
	bool partial;
	ext4_lblk_t lclu;
};
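
/*
 * Roughly: @ndelayed accumulates the running count of clusters containing
 * delayed blocks; @first_do_lblk/@last_do_lblk bound the delayed blocks
 * seen so far; @left_es remembers the extent to the left of the removed
 * range, if any; @partial and @lclu track a partial cluster left over at
 * the end of the last extent counted.
 */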

/*
 * init_rsvd - initialize reserved count data before removing block range
 *	       in file from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @es - pointer to first extent in range
 * @rc - pointer to reserved count data
 *
 * Assumes es is not NULL
 */
static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
		      struct extent_status *es, struct rsvd_count *rc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct rb_node *node;

	rc->ndelayed = 0;

	/*
	 * for bigalloc, note the first delayed block in the range has not
	 * been found, record the extent containing the block to the left of
	 * the region to be removed, if any, and note that there's no partial
	 * cluster to track
	 */
	if (sbi->s_cluster_ratio > 1) {
		rc->first_do_lblk_found = false;
		if (lblk > es->es_lblk) {
			rc->left_es = es;
		} else {
			node = rb_prev(&es->rb_node);
			rc->left_es = node ? rb_entry(node,
						      struct extent_status,
						      rb_node) : NULL;
		}
		rc->partial = false;
	}
}

/*
 * count_rsvd - count the clusters containing delayed blocks in a range
 *	        within an extent and add to the running tally in rsvd_count
 *
 * @inode - file containing extent
 * @lblk - first block in range
 * @len - length of range in blocks
 * @es - pointer to extent containing clusters to be counted
 * @rc - pointer to reserved count data
 *
 * Tracks partial clusters found at the beginning and end of extents so
 * they aren't overcounted when they span adjacent extents
 */
static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
		       struct extent_status *es, struct rsvd_count *rc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t i, end, nclu;

	if (!ext4_es_is_delayed(es))
		return;

	WARN_ON(len <= 0);

	if (sbi->s_cluster_ratio == 1) {
		rc->ndelayed += (int) len;
		return;
	}

	/* bigalloc */

	i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
	end = lblk + (ext4_lblk_t) len - 1;
	end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;

	/* record the first block of the first delayed extent seen */
	if (!rc->first_do_lblk_found) {
		rc->first_do_lblk = i;
		rc->first_do_lblk_found = true;
	}

	/* update the last lblk in the region seen so far */
	rc->last_do_lblk = end;

	/*
	 * if we're tracking a partial cluster and the current extent
	 * doesn't start with it, count it and stop tracking
	 */
	if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
		rc->ndelayed++;
		rc->partial = false;
	}

	/*
	 * if the range doesn't start on a cluster boundary but reaches
	 * one, count that first partial cluster
	 */
	if (EXT4_LBLK_COFF(sbi, i) != 0) {
		if (end >= EXT4_LBLK_CFILL(sbi, i)) {
			rc->ndelayed++;
			rc->partial = false;
			i = EXT4_LBLK_CFILL(sbi, i) + 1;
		}
	}

	/*
	 * if the current position now starts on a cluster boundary, count
	 * the number of whole delayed clusters in the extent
	 */
	if ((i + sbi->s_cluster_ratio - 1) <= end) {
		nclu = (end - i + 1) >> sbi->s_cluster_bits;
		rc->ndelayed += nclu;
		i += nclu << sbi->s_cluster_bits;
	}

	/*
	 * start tracking a partial cluster if there's a partial at the end
	 * of the current extent and we're not already tracking one
	 */
	if (!rc->partial && i <= end) {
		rc->partial = true;
		rc->lclu = EXT4_B2C(sbi, i);
	}
}
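
/*
 * Worked example (illustrative): with s_cluster_ratio == 4, counting a
 * delayed extent covering blocks 5..14 touches clusters 1..3.  Blocks
 * 5..7 reach cluster 1's boundary, so that partial cluster is counted;
 * blocks 8..11 form one whole cluster, also counted; blocks 12..14
 * leave cluster 3 tracked as a partial, to be reconciled against the
 * next extent seen or finally counted in get_rsvd().
 */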

/*
 * __pr_tree_search - search for a pending cluster reservation
 *
 * @root - root of pending reservation tree
 * @lclu - logical cluster to search for
 *
 * Returns the pending reservation for the cluster identified by @lclu
 * if found.  If not, returns a reservation for the next cluster if any,
 * and if not, returns NULL.
 */
static struct pending_reservation *__pr_tree_search(struct rb_root *root,
						    ext4_lblk_t lclu)
{
	struct rb_node *node = root->rb_node;
	struct pending_reservation *pr = NULL;

	while (node) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		if (lclu < pr->lclu)
			node = node->rb_left;
		else if (lclu > pr->lclu)
			node = node->rb_right;
		else
			return pr;
	}
	if (pr && lclu < pr->lclu)
		return pr;
	if (pr && lclu > pr->lclu) {
		node = rb_next(&pr->rb_node);
		return node ? rb_entry(node, struct pending_reservation,
				       rb_node) : NULL;
	}
	return NULL;
}

/*
 * get_rsvd - calculates and returns the number of cluster reservations to be
 *	      released when removing a block range from the extent status tree
 *	      and releases any pending reservations within the range
 *
 * @inode - file containing block range
 * @end - last block in range
 * @right_es - pointer to extent containing next block beyond end or NULL
 * @rc - pointer to reserved count data
 *
 * The number of reservations to be released is equal to the number of
 * clusters containing delayed blocks within the range, minus the number of
 * clusters still containing delayed blocks at the ends of the range, and
 * minus the number of pending reservations within the range.
 */
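/*
 * That is, roughly:
 *
 *	released = delayed_clusters_in_range
 *		   - still_delayed_clusters_at_either_end
 *		   - pending_reservations_released_in_range
 */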
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
			     struct extent_status *right_es,
			     struct rsvd_count *rc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct pending_reservation *pr;
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct rb_node *node;
	ext4_lblk_t first_lclu, last_lclu;
	bool left_delayed, right_delayed, count_pending;
	struct extent_status *es;

	if (sbi->s_cluster_ratio > 1) {
		/* count any remaining partial cluster */
		if (rc->partial)
			rc->ndelayed++;

		if (rc->ndelayed == 0)
			return 0;

		first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
		last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);

		/*
		 * decrease the delayed count by the number of clusters at the
		 * ends of the range that still contain delayed blocks -
		 * these clusters still need to be reserved
		 */
		left_delayed = right_delayed = false;

		es = rc->left_es;
		while (es && ext4_es_end(es) >=
		       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
			if (ext4_es_is_delayed(es)) {
				rc->ndelayed--;
				left_delayed = true;
				break;
			}
			node = rb_prev(&es->rb_node);
			if (!node)
				break;
			es = rb_entry(node, struct extent_status, rb_node);
		}
		if (right_es && (!left_delayed || first_lclu != last_lclu)) {
			if (end < ext4_es_end(right_es)) {
				es = right_es;
			} else {
				node = rb_next(&right_es->rb_node);
				es = node ? rb_entry(node,
						     struct extent_status,
						     rb_node) : NULL;
			}
			while (es && es->es_lblk <=
			       EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
				if (ext4_es_is_delayed(es)) {
					rc->ndelayed--;
					right_delayed = true;
					break;
				}
				node = rb_next(&es->rb_node);
				if (!node)
					break;
				es = rb_entry(node, struct extent_status,
					      rb_node);
			}
		}

		/*
		 * Determine the block range that should be searched for
		 * pending reservations, if any.  Clusters on the ends of the
		 * original removed range containing delayed blocks are
		 * excluded.  They've already been accounted for and it's not
		 * possible to determine if an associated pending reservation
		 * should be released with the information available in the
		 * extents status tree.
		 */
		if (first_lclu == last_lclu) {
			if (left_delayed | right_delayed)
				count_pending = false;
			else
				count_pending = true;
		} else {
			if (left_delayed)
				first_lclu++;
			if (right_delayed)
				last_lclu--;
			if (first_lclu <= last_lclu)
				count_pending = true;
			else
				count_pending = false;
		}

		/*
		 * a pending reservation found between first_lclu and last_lclu
		 * represents an allocated cluster that contained at least one
		 * delayed block, so the delayed total must be reduced by one
		 * for each pending reservation found and released
		 */
		if (count_pending) {
			pr = __pr_tree_search(&tree->root, first_lclu);
			while (pr && pr->lclu <= last_lclu) {
				rc->ndelayed--;
				node = rb_next(&pr->rb_node);
				rb_erase(&pr->rb_node, &tree->root);
				__free_pending(pr);
				if (!node)
					break;
				pr = rb_entry(node,
					      struct pending_reservation,
					      rb_node);
			}
		}
	}
	return rc->ndelayed;
}

/*
 * __es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @end - last block in range
 * @status - the extent status to be checked
 * @reserved - number of cluster reservations released
 * @res - returns the conflicting extent if the status does not match
 * @prealloc - pre-allocated es to avoid memory allocation failures
 *
 * If @reserved is not NULL and delayed allocation is enabled, counts
 * block/cluster reservations freed by removing the range, and if bigalloc
 * is enabled cancels pending reservations as needed.  If @status is not
 * zero, the extent status type is checked while removing extents; if it
 * does not match, -EINVAL is returned and the offending extent is passed
 * out through @res.  Returns 0 on success, error code on failure.
 */
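/*
 * Relative to the first overlapping extent, the removed range leaves
 * len1 surviving blocks on the left and len2 on the right:
 *
 *	es:      |<---------- orig_es ---------->|
 *	range:           |<--- removed --->|
 *	         |-len1->|                 |<-len2-|
 *
 * len1 > 0 && len2 > 0 splits es in two; len1 == 0 trims its head;
 * len2 == 0 trims its tail and the scan continues rightwards, erasing
 * extents that are covered entirely.
 */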
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end, unsigned int status,
			      int *reserved, struct extent_status *res,
			      struct extent_status *prealloc)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;
	bool count_reserved = true;
	struct rsvd_count rc;

	if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
		count_reserved = false;
	if (status == 0)
		status = ES_TYPE_MASK;

	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		return 0;
	if (es->es_lblk > end)
		return 0;

	err = __es_check_extent_status(es, status, res);
	if (err)
		return err;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;
	if (count_reserved)
		init_rsvd(inode, lblk, es, &rc);

	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			block = 0x7FDEADBEEFULL;
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes, prealloc);
			if (err) {
				if (!ext4_es_must_keep(&newes))
					return 0;

				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				return err;
			}
		} else {
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		if (count_reserved)
			count_rsvd(inode, orig_es.es_lblk + len1,
				   orig_es.es_len - len1 - len2, &orig_es, &rc);
		goto out;
	}

	if (len1 > 0) {
		if (count_reserved)
			count_rsvd(inode, lblk, orig_es.es_len - len1,
				   &orig_es, &rc);
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && ext4_es_end(es) <= end) {
		err = __es_check_extent_status(es, status, res);
		if (err)
			return err;
		if (count_reserved)
			count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		err = __es_check_extent_status(es, status, res);
		if (err)
			return err;

		len1 = ext4_es_end(es) - end;
		if (count_reserved)
			count_rsvd(inode, es->es_lblk, orig_len - len1,
				   es, &rc);
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	if (count_reserved)
		*reserved = get_rsvd(inode, end, es, &rc);
	return 0;
}

/*
 * ext4_es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @len - number of blocks to remove
 *
 * Reduces block/cluster reservation count and for bigalloc cancels pending
 * reservations as needed.
 */
void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			   ext4_lblk_t len)
{
	ext4_lblk_t end;
	int err = 0;
	int reserved = 0;
	struct extent_status *es = NULL;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return;

	end = lblk + len - 1;
	BUG_ON(end < lblk);

retry:
	if (err && !es)
		es = __es_alloc_extent(true);
	/*
	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
	 * so that we are sure __es_shrink() is done with the inode before it
	 * is reclaimed.
	 */
	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end, 0, &reserved, NULL, es);
	if (err)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es) {
		if (!es->es_len)
			__es_free_extent(es);
		es = NULL;
	}
	ext4_es_inc_seq(inode);
error:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	if (err)
		goto retry;

	trace_ext4_es_remove_extent(inode, lblk, len);
	ext4_es_print_tree(inode);
	ext4_da_release_space(inode, reserved);
}

static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
		       struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	ktime_t start_time;
	u64 scan_time;
	int nr_to_walk;
	int nr_shrunk = 0;
	int retried = 0, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();

retry:
	spin_lock(&sbi->s_es_lock);
	nr_to_walk = sbi->s_es_nr_inode;
	while (nr_to_walk-- > 0) {
		if (list_empty(&sbi->s_es_list)) {
			spin_unlock(&sbi->s_es_lock);
			goto out;
		}
		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
				      i_es_list);
		/* Move the inode to the tail */
		list_move_tail(&ei->i_es_list, &sbi->s_es_list);

		/*
		 * Normally we try hard to avoid shrinking precached inodes,
		 * but we will as a last resort.
		 */
		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED)) {
			nr_skipped++;
			continue;
		}

		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
			nr_skipped++;
			continue;
		}
		/*
		 * Now we hold i_es_lock which protects us from inode reclaim
		 * freeing inode under us
		 */
		spin_unlock(&sbi->s_es_lock);

		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
		write_unlock(&ei->i_es_lock);

		if (nr_to_scan <= 0)
			goto out;
		spin_lock(&sbi->s_es_lock);
	}
	spin_unlock(&sbi->s_es_lock);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, try again to scan precached inodes.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		goto retry;
	}

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1742 if (likely(es_stats->es_stats_scan_time))
1743 es_stats->es_stats_scan_time = (scan_time +
1744 es_stats->es_stats_scan_time*3) / 4;
1745 else
1746 es_stats->es_stats_scan_time = scan_time;
1747 if (scan_time > es_stats->es_stats_max_scan_time)
1748 es_stats->es_stats_max_scan_time = scan_time;
1749 if (likely(es_stats->es_stats_shrunk))
1750 es_stats->es_stats_shrunk = (nr_shrunk +
1751 es_stats->es_stats_shrunk*3) / 4;
1752 else
1753 es_stats->es_stats_shrunk = nr_shrunk;
1754
1755 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1756 nr_skipped, retried);
1757 return nr_shrunk;
1758 }
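
/*
 * Note on the statistics kept above: es_stats_scan_time and es_stats_shrunk
 * are exponential moving averages, new_avg = (sample + 3 * old_avg) / 4, so
 * each scan contributes a quarter of its weight and older samples decay
 * geometrically. For example, with an old average scan time of 1000 ns and
 * a new sample of 2000 ns, the stored value becomes (2000 + 3000) / 4 =
 * 1250 ns.
 */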

static unsigned long ext4_es_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long nr;
	struct ext4_sb_info *sbi;

	sbi = shrink->private_data;
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}

static unsigned long ext4_es_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct ext4_sb_info *sbi = shrink->private_data;
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}
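
/*
 * The two callbacks above implement the generic shrinker contract:
 * count_objects() reports how many objects are freeable (here, the
 * reclaimable extents tracked in es_stats_shk_cnt) and scan_objects()
 * attempts to free up to sc->nr_to_scan of them, returning the number
 * actually reclaimed. The VM calls count first under memory pressure and
 * sizes its scan request from the result.
 */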

int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
	struct ext4_inode_info *ei, *max = NULL;
	unsigned int inode_cnt = 0;

	if (v != SEQ_START_TOKEN)
		return 0;

	/* here we just find an inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lock);
	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lock);

	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
	seq_printf(seq, "  %lld/%lld cache hits/misses\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
		   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
	if (inode_cnt)
		seq_printf(seq, "  %d inodes on list\n", inode_cnt);

	seq_printf(seq, "average:\n  %llu us scan time\n",
		   div_u64(es_stats->es_stats_scan_time, 1000));
	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
	if (inode_cnt)
		seq_printf(seq,
			"maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
			"  %llu us max scan time\n",
			max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
			div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
}

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	/* Make sure we have enough bits for physical block number */
	BUILD_BUG_ON(ES_SHIFT < 48);
	INIT_LIST_HEAD(&sbi->s_es_list);
	sbi->s_es_nr_inode = 0;
	spin_lock_init(&sbi->s_es_lock);
	sbi->s_es_stats.es_stats_shrunk = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
				  GFP_KERNEL);
	if (err)
		return err;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
				  GFP_KERNEL);
	if (err)
		goto err1;
	sbi->s_es_stats.es_stats_scan_time = 0;
	sbi->s_es_stats.es_stats_max_scan_time = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
	if (err)
		goto err2;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
	if (err)
		goto err3;

	sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
	if (!sbi->s_es_shrinker) {
		err = -ENOMEM;
		goto err4;
	}

	sbi->s_es_shrinker->scan_objects = ext4_es_scan;
	sbi->s_es_shrinker->count_objects = ext4_es_count;
	sbi->s_es_shrinker->private_data = sbi;

	shrinker_register(sbi->s_es_shrinker);

	return 0;
err4:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err3:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
	return err;
}

void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
	shrinker_free(sbi->s_es_shrinker);
}

/*
 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
 * most *nr_to_scan extents, update *nr_to_scan accordingly.
 *
 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
 * Increment *nr_shrunk by the number of reclaimed extents. Also update
 * ei->i_es_shrink_lblk to where we should continue scanning.
 */
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
				 int *nr_to_scan, int *nr_shrunk)
{
	struct inode *inode = &ei->vfs_inode;
	struct ext4_es_tree *tree = &ei->i_es_tree;
	struct extent_status *es;
	struct rb_node *node;

	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
	if (!es)
		goto out_wrap;

	while (*nr_to_scan > 0) {
		if (es->es_lblk > end) {
			ei->i_es_shrink_lblk = end + 1;
			return 0;
		}

		(*nr_to_scan)--;
		node = rb_next(&es->rb_node);

		if (ext4_es_must_keep(es))
			goto next;
		if (ext4_es_is_referenced(es)) {
			ext4_es_clear_referenced(es);
			goto next;
		}

		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		(*nr_shrunk)++;
next:
		if (!node)
			goto out_wrap;
		es = rb_entry(node, struct extent_status, rb_node);
	}
	ei->i_es_shrink_lblk = es->es_lblk;
	return 1;
out_wrap:
	ei->i_es_shrink_lblk = 0;
	return 0;
}

static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
	struct inode *inode = &ei->vfs_inode;
	int nr_shrunk = 0;
	ext4_lblk_t start = ei->i_es_shrink_lblk;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_shk_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
	    __ratelimit(&_rs))
		ext4_warning(inode->i_sb, "forced shrink of precached extents");

	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
	    start != 0)
		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);

	ei->i_es_tree.cache_es = NULL;
	return nr_shrunk;
}
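
/*
 * The pair of calls above implements a circular scan: the first pass walks
 * from the saved cursor (i_es_shrink_lblk) to the end of the logical address
 * space, and if it runs out of extents before exhausting *nr_to_scan, a
 * second pass wraps around and covers [0, start - 1]. As a hypothetical
 * example, with reclaimable extents at blocks 100, 200 and 300 and a cursor
 * of 250, the first pass visits 300 and the wrap pass visits 100 and 200,
 * so every extent is considered exactly once per full scan.
 */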

/*
 * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
 * discretionary entries from the extent status cache. (Some entries
 * must be present for proper operation.)
 */
void ext4_clear_inode_es(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct extent_status *es;
	struct ext4_es_tree *tree;
	struct rb_node *node;

	write_lock(&ei->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	tree->cache_es = NULL;
	node = rb_first(&tree->root);
	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		node = rb_next(node);
		if (!ext4_es_must_keep(es)) {
			rb_erase(&es->rb_node, &tree->root);
			ext4_es_free_extent(inode, es);
		}
	}
	ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
	write_unlock(&ei->i_es_lock);
}
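
/*
 * A minimal userspace sketch of how this path is reached (assuming the
 * EXT4_IOC_CLEAR_ES_CACHE ioctl number is available to the caller; error
 * handling elided):
 *
 *	int fd = open("/mnt/file", O_RDONLY);
 *	ioctl(fd, EXT4_IOC_CLEAR_ES_CACHE);
 *	close(fd);
 *
 * This drops the discretionary cached extents for the file, which is mainly
 * useful for exercising on-demand loading of the extent status tree.
 */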

#ifdef ES_DEBUG__
static void ext4_print_pending_tree(struct inode *inode)
{
	struct ext4_pending_tree *tree;
	struct rb_node *node;
	struct pending_reservation *pr;

	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_pending_tree;
	node = rb_first(&tree->root);
	while (node) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		printk(KERN_DEBUG " %u", pr->lclu);
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_print_pending_tree(inode)
#endif

int __init ext4_init_pending(void)
{
	ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
	if (ext4_pending_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_pending(void)
{
	kmem_cache_destroy(ext4_pending_cachep);
}

void ext4_init_pending_tree(struct ext4_pending_tree *tree)
{
	tree->root = RB_ROOT;
}

/*
 * __get_pending - retrieve a pointer to a pending reservation
 *
 * @inode - file containing the pending cluster reservation
 * @lclu - logical cluster of interest
 *
 * Returns a pointer to a pending reservation if it's a member of
 * the set, and NULL if not. Must be called holding i_es_lock.
 */
static struct pending_reservation *__get_pending(struct inode *inode,
						 ext4_lblk_t lclu)
{
	struct ext4_pending_tree *tree;
	struct rb_node *node;
	struct pending_reservation *pr = NULL;

	tree = &EXT4_I(inode)->i_pending_tree;
	node = (&tree->root)->rb_node;

	while (node) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		if (lclu < pr->lclu)
			node = node->rb_left;
		else if (lclu > pr->lclu)
			node = node->rb_right;
		else if (lclu == pr->lclu)
			return pr;
	}
	return NULL;
}
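
/*
 * This is the usual rbtree binary search keyed on the logical cluster
 * number. As a hypothetical walkthrough: with pending clusters {2, 5, 9}
 * rooted at 5, a lookup of 9 compares against 5, descends right, matches 9
 * and returns that node, while a lookup of 3 descends left to 2, then right
 * to a NULL child, and returns NULL.
 */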

/*
 * __insert_pending - adds a pending cluster reservation to the set of
 *                    pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster to be added
 * @prealloc - preallocated pending entry
 *
 * Returns 1 if a new pending reservation was inserted, 0 if the
 * reservation was already present, and -ENOMEM on allocation failure.
 */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
			    struct pending_reservation **prealloc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct pending_reservation *pr;
	ext4_lblk_t lclu;
	int ret = 0;

	lclu = EXT4_B2C(sbi, lblk);
	/* search to find parent for insertion */
	while (*p) {
		parent = *p;
		pr = rb_entry(parent, struct pending_reservation, rb_node);

		if (lclu < pr->lclu) {
			p = &(*p)->rb_left;
		} else if (lclu > pr->lclu) {
			p = &(*p)->rb_right;
		} else {
			/* pending reservation already inserted */
			goto out;
		}
	}

	if (likely(*prealloc == NULL)) {
		pr = __alloc_pending(false);
		if (!pr) {
			ret = -ENOMEM;
			goto out;
		}
	} else {
		pr = *prealloc;
		*prealloc = NULL;
	}
	pr->lclu = lclu;

	rb_link_node(&pr->rb_node, parent, p);
	rb_insert_color(&pr->rb_node, &tree->root);
	ret = 1;

out:
	return ret;
}
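
/*
 * Callers that must not fail while holding i_es_lock use the *prealloc
 * argument to carry an entry allocated outside the lock. A minimal sketch
 * of the pattern (mirroring ext4_es_insert_delayed_extent() below; locking
 * and retry details elided):
 *
 *	struct pending_reservation *pr = __alloc_pending(true);
 *	...
 *	write_lock(&EXT4_I(inode)->i_es_lock);
 *	ret = __insert_pending(inode, lblk, &pr);
 *	write_unlock(&EXT4_I(inode)->i_es_lock);
 *	if (pr)
 *		__free_pending(pr);
 *
 * On insertion __insert_pending() takes ownership of the preallocated entry
 * and NULLs *prealloc; if the entry was not consumed, the caller frees it.
 */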

/*
 * __remove_pending - removes a pending cluster reservation from the set
 *                    of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Does nothing if the pending reservation is not a member of the set.
 */
static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct pending_reservation *pr;
	struct ext4_pending_tree *tree;

	pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
	if (pr != NULL) {
		tree = &EXT4_I(inode)->i_pending_tree;
		rb_erase(&pr->rb_node, &tree->root);
		__free_pending(pr);
	}
}

/*
 * ext4_remove_pending - removes a pending cluster reservation from the set
 *                       of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Locking wrapper for external use of __remove_pending.
 */
void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	write_lock(&ei->i_es_lock);
	__remove_pending(inode, lblk);
	write_unlock(&ei->i_es_lock);
}

/*
 * ext4_is_pending - determine whether a cluster has a pending reservation
 *                   on it
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster
 *
 * Returns true if there's a pending reservation for the cluster in the
 * set of pending reservations, and false if not.
 */
bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	bool ret;

	read_lock(&ei->i_es_lock);
	ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
	read_unlock(&ei->i_es_lock);

	return ret;
}

/*
 * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
 *                                 status tree, adding a pending reservation
 *                                 where needed
 *
 * @inode - file containing the newly added block
 * @lblk - start logical block to be added
 * @len - length of blocks to be added
 * @lclu_allocated/end_allocated - indicates whether a physical cluster has
 *                                 been allocated for the logical cluster
 *                                 that contains the start/end block. Note that
 *                                 end_allocated should always be set to false
 *                                 if the start and the end block are in the
 *                                 same cluster
 */
void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
				   ext4_lblk_t len, bool lclu_allocated,
				   bool end_allocated)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err1 = 0, err2 = 0, err3 = 0;
	struct extent_status *es1 = NULL;
	struct extent_status *es2 = NULL;
	struct pending_reservation *pr1 = NULL;
	struct pending_reservation *pr2 = NULL;

	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
		return;

	es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);
	if (!len)
		return;

	WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
		     end_allocated);

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);

	ext4_es_insert_extent_check(inode, &newes);

retry:
	if (err1 && !es1)
		es1 = __es_alloc_extent(true);
	if ((err1 || err2) && !es2)
		es2 = __es_alloc_extent(true);
	if (err1 || err2 || err3 < 0) {
		if (lclu_allocated && !pr1)
			pr1 = __alloc_pending(true);
		if (end_allocated && !pr2)
			pr2 = __alloc_pending(true);
	}
	write_lock(&EXT4_I(inode)->i_es_lock);

	err1 = __es_remove_extent(inode, lblk, end, 0, NULL, NULL, es1);
	if (err1 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es1) {
		if (!es1->es_len)
			__es_free_extent(es1);
		es1 = NULL;
	}

	err2 = __es_insert_extent(inode, &newes, es2);
	if (err2 != 0)
		goto error;
	/* Free preallocated extent if it didn't get used. */
	if (es2) {
		if (!es2->es_len)
			__es_free_extent(es2);
		es2 = NULL;
	}

	if (lclu_allocated) {
		err3 = __insert_pending(inode, lblk, &pr1);
		if (err3 < 0)
			goto error;
		if (pr1) {
			__free_pending(pr1);
			pr1 = NULL;
		}
	}
	if (end_allocated) {
		err3 = __insert_pending(inode, end, &pr2);
		if (err3 < 0)
			goto error;
		if (pr2) {
			__free_pending(pr2);
			pr2 = NULL;
		}
	}
	ext4_es_inc_seq(inode);
error:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	if (err1 || err2 || err3 < 0)
		goto retry;

	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
					    end_allocated);
	ext4_es_print_tree(inode);
	ext4_print_pending_tree(inode);
	return;
}
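
/*
 * Worked example (hypothetical numbers): on a bigalloc filesystem with
 * s_cluster_ratio == 4, buffered writes over blocks 6..9 span cluster 1
 * (blocks 4..7) and cluster 2 (blocks 8..11). If cluster 1 already has a
 * physical allocation but cluster 2 does not, the caller would use:
 *
 *	ext4_es_insert_delayed_extent(inode, 6, 4, true, false);
 *
 * which records [6, 9] as delayed in the extent status tree and adds a
 * pending reservation only for cluster 1, the already-allocated start
 * cluster; end_allocated is false because cluster 2 has no allocation yet.
 */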

/*
 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
 *                    reservations for a specified block range depending
 *                    upon the presence or absence of delayed blocks
 *                    outside the range within clusters at the ends of the
 *                    range
 *
 * @inode - file containing the range
 * @lblk - logical block defining the start of range
 * @len - length of range in blocks
 * @prealloc - preallocated pending entry
 *
 * Used after a newly allocated extent is added to the extents status tree.
 * Requires that the extents in the range have either written or unwritten
 * status. Must be called while holding i_es_lock. Returns the number of
 * pending reservations inserted, 0 if reservations were only removed or
 * left unchanged, or -ENOMEM on allocation failure.
 */
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
			    ext4_lblk_t len,
			    struct pending_reservation **prealloc)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_lblk_t end = lblk + len - 1;
	ext4_lblk_t first, last;
	bool f_del = false, l_del = false;
	int pendings = 0;
	int ret = 0;

	if (len == 0)
		return 0;

	/*
	 * Two cases - block range within single cluster and block range
	 * spanning two or more clusters. Note that a cluster belonging
	 * to a range starting and/or ending on a cluster boundary is treated
	 * as if it does not contain a delayed extent. The new range may
	 * have allocated space for previously delayed blocks out to the
	 * cluster boundary, requiring that any pre-existing pending
	 * reservation be canceled. Because this code only looks at blocks
	 * outside the range, it should revise pending reservations
	 * correctly even if the extent represented by the range can't be
	 * inserted in the extents status tree due to ENOSPC.
	 */

	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
		first = EXT4_LBLK_CMASK(sbi, lblk);
		if (first != lblk)
			f_del = __es_scan_range(inode, &ext4_es_is_delayed,
						first, lblk - 1);
		if (f_del) {
			ret = __insert_pending(inode, first, prealloc);
			if (ret < 0)
				goto out;
			pendings += ret;
		} else {
			last = EXT4_LBLK_CMASK(sbi, end) +
			       sbi->s_cluster_ratio - 1;
			if (last != end)
				l_del = __es_scan_range(inode,
							&ext4_es_is_delayed,
							end + 1, last);
			if (l_del) {
				ret = __insert_pending(inode, last, prealloc);
				if (ret < 0)
					goto out;
				pendings += ret;
			} else
				__remove_pending(inode, last);
		}
	} else {
		first = EXT4_LBLK_CMASK(sbi, lblk);
		if (first != lblk)
			f_del = __es_scan_range(inode, &ext4_es_is_delayed,
						first, lblk - 1);
		if (f_del) {
			ret = __insert_pending(inode, first, prealloc);
			if (ret < 0)
				goto out;
			pendings += ret;
		} else
			__remove_pending(inode, first);

		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
		if (last != end)
			l_del = __es_scan_range(inode, &ext4_es_is_delayed,
						end + 1, last);
		if (l_del) {
			ret = __insert_pending(inode, last, prealloc);
			if (ret < 0)
				goto out;
			pendings += ret;
		} else
			__remove_pending(inode, last);
	}
out:
	return (ret < 0) ? ret : pendings;
}
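
/*
 * Worked example for the single-cluster case (hypothetical numbers,
 * s_cluster_ratio == 4): a newly written extent covering blocks 5..6 lies
 * entirely within cluster 1 (blocks 4..7), so first = 4 and last = 7. If
 * block 4 is still delayed, the cluster keeps or gains a pending
 * reservation; otherwise, if block 7 is delayed, the same applies; if
 * neither edge block is delayed, any pending reservation for cluster 1 is
 * canceled, since the allocation for 5..6 leaves no delayed blocks in the
 * cluster to account for.
 */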