1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * fs/ext4/extents_status.c
4 *
5 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
6 * Modified by
7 * Allison Henderson <achender@linux.vnet.ibm.com>
8 * Hugh Dickins <hughd@google.com>
9 * Zheng Liu <wenqing.lz@taobao.com>
10 *
11 * Ext4 extents status tree core functions.
12 */
13 #include <linux/list_sort.h>
14 #include <linux/proc_fs.h>
15 #include <linux/seq_file.h>
16 #include "ext4.h"
17
18 #include <trace/events/ext4.h>
19
20 /*
21 * According to a previous discussion at the Ext4 Developer Workshop, we
22 * will introduce a new structure called the io tree to track all extent
23 * status in order to solve some problems that we have met
24 * (e.g. the reservation space warning), and to provide extent-level locking.
25 * The delay extent tree is the first step to achieve this goal. It was
26 * originally built by Yongqiang Yang. At that time it was called the delay
27 * extent tree, whose only goal was to track delayed extents in memory to
28 * simplify the implementation of fiemap and bigalloc, and to introduce
29 * lseek SEEK_DATA/SEEK_HOLE support. That is why it was still called the
30 * delay extent tree in the first commit. But to better describe what it
31 * does, it has been renamed to the extent status tree.
32 *
33 * Step1:
34 * Currently the first step has been done. All delayed extents are
35 * tracked in the tree. A delayed extent is added to the tree when a
36 * delayed allocation is issued, and removed when the delayed extent is
37 * written out or invalidated. Therefore the implementations of fiemap
38 * and bigalloc are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
39 *
40 * The following comment describes the implementation of the extent
41 * status tree and future work.
42 *
43 * Step2:
44 * In this step all extent status is tracked by the extent status tree.
45 * Thus, we can first try to look up a block mapping in this tree before
46 * searching the extent tree. Hence, the single extent cache can be
47 * removed because the extent status tree can do a better job. Extents
48 * in the status tree are loaded on demand. Therefore, the extent status
49 * tree may not contain all of the extents in a file. Meanwhile we define
50 * a shrinker to reclaim memory from the extent status tree because a
51 * fragmented extent tree will make the status tree cost too much memory.
52 * Written/unwritten/hole extents in the tree will be reclaimed by this
53 * shrinker when we are under high memory pressure. Delayed extents will
54 * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them.
55 */
56
57 /*
58 * Extent status tree implementation for ext4.
59 *
60 *
61 * ==========================================================================
62 * Extent status tree tracks all extent status.
63 *
64 * 1. Why do we need to implement an extent status tree?
65 *
66 * Without the extent status tree, ext4 identifies a delayed extent by
67 * looking up the page cache, which leads to complicated, buggy, and
68 * inefficient code.
69 *
70 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know whether
71 * a block or a range of blocks belongs to a delayed extent.
72 *
73 * Let us have a look at how they work without the extent status tree.
74 * -- FIEMAP
75 * FIEMAP looks up the page cache to distinguish delayed allocations from holes.
76 *
77 * -- SEEK_HOLE/DATA
78 * SEEK_HOLE/DATA has the same problem as FIEMAP.
79 *
80 * -- bigalloc
81 * bigalloc looks up the page cache to figure out whether a block is
82 * already under delayed allocation, in order to determine whether a
83 * quota reservation is needed for the cluster.
84 *
85 * -- writeout
86 * Writeout looks up the whole page cache to see if a buffer is
87 * mapped. If there are not very many delayed buffers, this is
88 * time consuming.
89 *
90 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
91 * bigalloc and writeout can figure out whether a block or a range of
92 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
93 * by searching the extent status tree.
94 *
95 *
96 * ==========================================================================
97 * 2. Ext4 extent status tree implementation
98 *
99 * -- extent
100 * An extent is a range of blocks which are contiguous both logically and
101 * physically. Unlike an extent in the extent tree, this extent is an
102 * in-memory struct and has no corresponding on-disk data. There is no
103 * limit on the length of an extent, so an extent can contain as many
104 * blocks as are contiguous logically and physically.
105 *
106 * -- extent status tree
107 * Every inode has an extent status tree and all allocated blocks
108 * are added to the tree with different statuses. The extents in the
109 * tree are ordered by logical block number.
110 *
111 * -- operations on an extent status tree
112 * There are three important operations on an extent status tree: finding
113 * the next extent, adding an extent (a range of blocks), and removing one.
114 *
115 * -- race on an extent status tree
116 * The extent status tree is protected by inode->i_es_lock.
117 *
118 * -- memory consumption
119 * A fragmented extent tree will make the extent status tree cost too much
120 * memory. Hence, we will reclaim written/unwritten/hole extents from
121 * the tree under heavy memory pressure.
122 *
123 * ==========================================================================
124 * 3. Assurance of Ext4 extent status tree consistency
125 *
126 * When mapping blocks, Ext4 queries the extent status tree first and should
127 * always trust that the extent status tree is consistent and up to date.
128 * Therefore, it is important to adhere to the following rules when creating,
129 * modifying and removing extents.
130 *
131 * 1. Apart from fastcommit replay, when Ext4 creates or queries block mappings,
132 * the extent information should always be processed through the extent
133 * status tree instead of being organized manually through the on-disk
134 * extent tree.
135 *
136 * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
137 * exclusively and update the extent status tree atomically. If the extents
138 * to be modified are large enough to exceed the range that a single
139 * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
140 * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
141 * does):
142 *
143 * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
144 * exclusion against page faults, as well as reads and writes that may
145 * concurrently modify the extent status tree.
146 * b) Evict all page cache in the affected range; it is recommended to
147 * rebuild or drop the extent status tree after modifying the on-disk
148 * extent tree. This ensures exclusion against concurrent writebacks
149 * that do not hold those locks but only hold a folio lock.
150 *
151 * 3. Based on the rules above, when querying block mappings, Ext4 should at
152 * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
153 * specified querying range.
154 *
155 * ==========================================================================
156 * 4. Performance analysis
157 *
158 * -- overhead
159 * 1. There is a cached extent for write access, so if writes are
160 * not very random, adding-space operations run in O(1) time.
161 *
162 * -- gain
163 * 2. The code is much simpler, more readable, more maintainable and
164 * more efficient.
165 *
166 *
167 * ==========================================================================
168 * 5. TODO list
169 *
170 * -- Refactor delayed space reservation
171 *
172 * -- Extent-level locking
173 */
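/*
 * Illustrative sketch of the locking pattern described by rule 2 above
 * for a large range removal spanning multiple transactions (roughly what
 * ext4_punch_hole() does). This is a simplified outline under stated
 * assumptions, not the exact ext4 call sequence:
 *
 *	inode_lock(inode);				// i_rwsem, exclusive
 *	filemap_invalidate_lock(inode->i_mapping);	// invalidate_lock
 *	truncate_pagecache_range(inode, start, end);	// evict affected folios
 *	down_write(&EXT4_I(inode)->i_data_sem);
 *	// ... modify the on-disk extent tree; i_data_sem may be dropped
 *	// and retaken to restart the transaction ...
 *	ext4_es_remove_extent(inode, lblk, len);	// drop/rebuild the es tree
 *	up_write(&EXT4_I(inode)->i_data_sem);
 *	filemap_invalidate_unlock(inode->i_mapping);
 *	inode_unlock(inode);
 */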
174
175 static struct kmem_cache *ext4_es_cachep;
176 static struct kmem_cache *ext4_pending_cachep;
177
178 static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
179 struct extent_status *prealloc);
180 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
181 ext4_lblk_t end, int *reserved,
182 struct extent_status *prealloc);
183 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
184 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
185 struct ext4_inode_info *locked_ei);
186 static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
187 ext4_lblk_t len,
188 struct pending_reservation **prealloc);
189
190 int __init ext4_init_es(void)
191 {
192 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
193 if (ext4_es_cachep == NULL)
194 return -ENOMEM;
195 return 0;
196 }
197
198 void ext4_exit_es(void)
199 {
200 kmem_cache_destroy(ext4_es_cachep);
201 }
202
203 void ext4_es_init_tree(struct ext4_es_tree *tree)
204 {
205 tree->root = RB_ROOT;
206 tree->cache_es = NULL;
207 }
208
209 #ifdef ES_DEBUG__
210 static void ext4_es_print_tree(struct inode *inode)
211 {
212 struct ext4_es_tree *tree;
213 struct rb_node *node;
214
215 printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
216 tree = &EXT4_I(inode)->i_es_tree;
217 node = rb_first(&tree->root);
218 while (node) {
219 struct extent_status *es;
220 es = rb_entry(node, struct extent_status, rb_node);
221 printk(KERN_DEBUG " [%u/%u) %llu %x",
222 es->es_lblk, es->es_len,
223 ext4_es_pblock(es), ext4_es_status(es));
224 node = rb_next(node);
225 }
226 printk(KERN_DEBUG "\n");
227 }
228 #else
229 #define ext4_es_print_tree(inode)
230 #endif
231
232 static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
233 {
234 BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
235 return es->es_lblk + es->es_len - 1;
236 }
237
238 /*
239 * Search through the tree for an extent with a given offset. If
240 * it can't be found, try to find the next extent.
241 */
242 static struct extent_status *__es_tree_search(struct rb_root *root,
243 ext4_lblk_t lblk)
244 {
245 struct rb_node *node = root->rb_node;
246 struct extent_status *es = NULL;
247
248 while (node) {
249 es = rb_entry(node, struct extent_status, rb_node);
250 if (lblk < es->es_lblk)
251 node = node->rb_left;
252 else if (lblk > ext4_es_end(es))
253 node = node->rb_right;
254 else
255 return es;
256 }
257
258 if (es && lblk < es->es_lblk)
259 return es;
260
261 if (es && lblk > ext4_es_end(es)) {
262 node = rb_next(&es->rb_node);
263 return node ? rb_entry(node, struct extent_status, rb_node) :
264 NULL;
265 }
266
267 return NULL;
268 }
269
270 /*
271 * ext4_es_find_extent_range - find extent with specified status within block
272 * range or next extent following block range in
273 * extents status tree
274 *
275 * @inode - file containing the range
276 * @matching_fn - pointer to function that matches extents with desired status
277 * @lblk - logical block defining start of range
278 * @end - logical block defining end of range
279 * @es - extent found, if any
280 *
281 * Find the first extent within the block range specified by @lblk and @end
282 * in the extents status tree that satisfies @matching_fn. If a match
283 * is found, it's returned in @es. If not, and a matching extent is found
284 * beyond the block range, it's returned in @es. If no match is found, an
285 * extent is returned in @es whose es_lblk, es_len, and es_pblk components
286 * are 0.
287 */
288 static void __es_find_extent_range(struct inode *inode,
289 int (*matching_fn)(struct extent_status *es),
290 ext4_lblk_t lblk, ext4_lblk_t end,
291 struct extent_status *es)
292 {
293 struct ext4_es_tree *tree = NULL;
294 struct extent_status *es1 = NULL;
295 struct rb_node *node;
296
297 WARN_ON(es == NULL);
298 WARN_ON(end < lblk);
299
300 tree = &EXT4_I(inode)->i_es_tree;
301
302 /* see if the extent has been cached */
303 es->es_lblk = es->es_len = es->es_pblk = 0;
304 es1 = READ_ONCE(tree->cache_es);
305 if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
306 es_debug("%u cached by [%u/%u) %llu %x\n",
307 lblk, es1->es_lblk, es1->es_len,
308 ext4_es_pblock(es1), ext4_es_status(es1));
309 goto out;
310 }
311
312 es1 = __es_tree_search(&tree->root, lblk);
313
314 out:
315 if (es1 && !matching_fn(es1)) {
316 while ((node = rb_next(&es1->rb_node)) != NULL) {
317 es1 = rb_entry(node, struct extent_status, rb_node);
318 if (es1->es_lblk > end) {
319 es1 = NULL;
320 break;
321 }
322 if (matching_fn(es1))
323 break;
324 }
325 }
326
327 if (es1 && matching_fn(es1)) {
328 WRITE_ONCE(tree->cache_es, es1);
329 es->es_lblk = es1->es_lblk;
330 es->es_len = es1->es_len;
331 es->es_pblk = es1->es_pblk;
332 }
333
334 }
335
336 /*
337 * Locking for __es_find_extent_range() for external use
338 */
339 void ext4_es_find_extent_range(struct inode *inode,
340 int (*matching_fn)(struct extent_status *es),
341 ext4_lblk_t lblk, ext4_lblk_t end,
342 struct extent_status *es)
343 {
344 es->es_lblk = es->es_len = es->es_pblk = 0;
345
346 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
347 return;
348
349 trace_ext4_es_find_extent_range_enter(inode, lblk);
350
351 read_lock(&EXT4_I(inode)->i_es_lock);
352 __es_find_extent_range(inode, matching_fn, lblk, end, es);
353 read_unlock(&EXT4_I(inode)->i_es_lock);
354
355 trace_ext4_es_find_extent_range_exit(inode, es);
356 }
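/*
 * Example usage (illustrative, simplified from typical callers): find the
 * first delayed extent within a logical block range. ext4_es_is_delayed()
 * is one of the status predicates from extents_status.h that can serve as
 * @matching_fn; whether a particular caller uses exactly this pattern is
 * an assumption here.
 *
 *	struct extent_status es;
 *
 *	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, lblk, end, &es);
 *	if (es.es_len && es.es_lblk <= end)
 *		;	// a delayed extent starting at es.es_lblk lies in range
 */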
357
358 /*
359 * __es_scan_range - search block range for block with specified status
360 * in extents status tree
361 *
362 * @inode - file containing the range
363 * @matching_fn - pointer to function that matches extents with desired status
364 * @lblk - logical block defining start of range
365 * @end - logical block defining end of range
366 *
367 * Returns true if at least one block in the specified block range satisfies
368 * the criterion specified by @matching_fn, and false if not. If at least
369 * one extent has the specified status, then there is at least one block
370 * in the cluster with that status. Should only be called by code that has
371 * taken i_es_lock.
372 */
373 static bool __es_scan_range(struct inode *inode,
374 int (*matching_fn)(struct extent_status *es),
375 ext4_lblk_t start, ext4_lblk_t end)
376 {
377 struct extent_status es;
378
379 __es_find_extent_range(inode, matching_fn, start, end, &es);
380 if (es.es_len == 0)
381 return false; /* no matching extent in the tree */
382 else if (es.es_lblk <= start &&
383 start < es.es_lblk + es.es_len)
384 return true;
385 else if (start <= es.es_lblk && es.es_lblk <= end)
386 return true;
387 else
388 return false;
389 }
390 /*
391 * Locking for __es_scan_range() for external use
392 */
393 bool ext4_es_scan_range(struct inode *inode,
394 int (*matching_fn)(struct extent_status *es),
395 ext4_lblk_t lblk, ext4_lblk_t end)
396 {
397 bool ret;
398
399 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
400 return false;
401
402 read_lock(&EXT4_I(inode)->i_es_lock);
403 ret = __es_scan_range(inode, matching_fn, lblk, end);
404 read_unlock(&EXT4_I(inode)->i_es_lock);
405
406 return ret;
407 }
408
409 /*
410 * __es_scan_clu - search cluster for block with specified status in
411 * extents status tree
412 *
413 * @inode - file containing the cluster
414 * @matching_fn - pointer to function that matches extents with desired status
415 * @lblk - logical block in cluster to be searched
416 *
417 * Returns true if at least one extent in the cluster containing @lblk
418 * satisfies the criterion specified by @matching_fn, and false if not. If at
419 * least one extent has the specified status, then there is at least one block
420 * in the cluster with that status. Should only be called by code that has
421 * taken i_es_lock.
422 */
423 static bool __es_scan_clu(struct inode *inode,
424 int (*matching_fn)(struct extent_status *es),
425 ext4_lblk_t lblk)
426 {
427 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
428 ext4_lblk_t lblk_start, lblk_end;
429
430 lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
431 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
432
433 return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
434 }
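/*
 * Worked example of the cluster rounding above, assuming a bigalloc file
 * system with 16 blocks per cluster (s_cluster_ratio == 16): for lblk 37,
 * EXT4_LBLK_CMASK() rounds down to lblk_start 32 and lblk_end becomes
 * 32 + 16 - 1 = 47, so the whole cluster containing block 37 is scanned.
 */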
435
436 /*
437 * Locking for __es_scan_clu() for external use
438 */
439 bool ext4_es_scan_clu(struct inode *inode,
440 int (*matching_fn)(struct extent_status *es),
441 ext4_lblk_t lblk)
442 {
443 bool ret;
444
445 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
446 return false;
447
448 read_lock(&EXT4_I(inode)->i_es_lock);
449 ret = __es_scan_clu(inode, matching_fn, lblk);
450 read_unlock(&EXT4_I(inode)->i_es_lock);
451
452 return ret;
453 }
454
455 static void ext4_es_list_add(struct inode *inode)
456 {
457 struct ext4_inode_info *ei = EXT4_I(inode);
458 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
459
460 if (!list_empty(&ei->i_es_list))
461 return;
462
463 spin_lock(&sbi->s_es_lock);
464 if (list_empty(&ei->i_es_list)) {
465 list_add_tail(&ei->i_es_list, &sbi->s_es_list);
466 sbi->s_es_nr_inode++;
467 }
468 spin_unlock(&sbi->s_es_lock);
469 }
470
471 static void ext4_es_list_del(struct inode *inode)
472 {
473 struct ext4_inode_info *ei = EXT4_I(inode);
474 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
475
476 spin_lock(&sbi->s_es_lock);
477 if (!list_empty(&ei->i_es_list)) {
478 list_del_init(&ei->i_es_list);
479 sbi->s_es_nr_inode--;
480 WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
481 }
482 spin_unlock(&sbi->s_es_lock);
483 }
484
485 static inline struct pending_reservation *__alloc_pending(bool nofail)
486 {
487 if (!nofail)
488 return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
489
490 return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
491 }
492
493 static inline void __free_pending(struct pending_reservation *pr)
494 {
495 kmem_cache_free(ext4_pending_cachep, pr);
496 }
497
498 /*
499 * Returns true if we cannot fail to allocate memory for this extent_status
500 * entry and cannot reclaim it until its status changes.
501 */
502 static inline bool ext4_es_must_keep(struct extent_status *es)
503 {
504 /* fiemap, bigalloc, and seek_data/hole need to use it. */
505 if (ext4_es_is_delayed(es))
506 return true;
507
508 return false;
509 }
510
511 static inline struct extent_status *__es_alloc_extent(bool nofail)
512 {
513 if (!nofail)
514 return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
515
516 return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
517 }
518
519 static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
520 ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
521 {
522 es->es_lblk = lblk;
523 es->es_len = len;
524 es->es_pblk = pblk;
525
526 /* We never try to reclaim a must kept extent, so we don't count it. */
527 if (!ext4_es_must_keep(es)) {
528 if (!EXT4_I(inode)->i_es_shk_nr++)
529 ext4_es_list_add(inode);
530 percpu_counter_inc(&EXT4_SB(inode->i_sb)->
531 s_es_stats.es_stats_shk_cnt);
532 }
533
534 EXT4_I(inode)->i_es_all_nr++;
535 percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
536 }
537
538 static inline void __es_free_extent(struct extent_status *es)
539 {
540 kmem_cache_free(ext4_es_cachep, es);
541 }
542
543 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
544 {
545 EXT4_I(inode)->i_es_all_nr--;
546 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
547
548 /* Decrease the shrink counter when we can reclaim the extent. */
549 if (!ext4_es_must_keep(es)) {
550 BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
551 if (!--EXT4_I(inode)->i_es_shk_nr)
552 ext4_es_list_del(inode);
553 percpu_counter_dec(&EXT4_SB(inode->i_sb)->
554 s_es_stats.es_stats_shk_cnt);
555 }
556
557 __es_free_extent(es);
558 }
559
560 /*
561 * Check whether or not two extents can be merged
562 * Condition:
563 * - logical block number is contiguous
564 * - physical block number is contiguous
565 * - status is equal
566 */
567 static int ext4_es_can_be_merged(struct extent_status *es1,
568 struct extent_status *es2)
569 {
570 if (ext4_es_type(es1) != ext4_es_type(es2))
571 return 0;
572
573 if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
574 pr_warn("ES assertion failed when merging extents. "
575 "The sum of lengths of es1 (%d) and es2 (%d) "
576 "is bigger than allowed file size (%d)\n",
577 es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
578 WARN_ON(1);
579 return 0;
580 }
581
582 if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
583 return 0;
584
585 if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
586 (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
587 return 1;
588
589 if (ext4_es_is_hole(es1))
590 return 1;
591
592 /* we need to check delayed extent */
593 if (ext4_es_is_delayed(es1))
594 return 1;
595
596 return 0;
597 }
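/*
 * Worked example of the merge conditions above: a written extent
 * [100/10) at pblk 5000 and a written extent [110/5) at pblk 5010 can be
 * merged because 100 + 10 == 110, 5000 + 10 == 5010, and the status
 * matches. If the second extent instead started at pblk 6000, the
 * physical blocks would not be contiguous and the extents would be kept
 * separate. Hole and delayed extents only need logical contiguity and
 * matching status, since they carry no physical block.
 */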
598
599 static struct extent_status *
600 ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
601 {
602 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
603 struct extent_status *es1;
604 struct rb_node *node;
605
606 node = rb_prev(&es->rb_node);
607 if (!node)
608 return es;
609
610 es1 = rb_entry(node, struct extent_status, rb_node);
611 if (ext4_es_can_be_merged(es1, es)) {
612 es1->es_len += es->es_len;
613 if (ext4_es_is_referenced(es))
614 ext4_es_set_referenced(es1);
615 rb_erase(&es->rb_node, &tree->root);
616 ext4_es_free_extent(inode, es);
617 es = es1;
618 }
619
620 return es;
621 }
622
623 static struct extent_status *
624 ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
625 {
626 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
627 struct extent_status *es1;
628 struct rb_node *node;
629
630 node = rb_next(&es->rb_node);
631 if (!node)
632 return es;
633
634 es1 = rb_entry(node, struct extent_status, rb_node);
635 if (ext4_es_can_be_merged(es, es1)) {
636 es->es_len += es1->es_len;
637 if (ext4_es_is_referenced(es1))
638 ext4_es_set_referenced(es);
639 rb_erase(node, &tree->root);
640 ext4_es_free_extent(inode, es1);
641 }
642
643 return es;
644 }
645
646 #ifdef ES_AGGRESSIVE_TEST
647 #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
648
649 static void ext4_es_insert_extent_ext_check(struct inode *inode,
650 struct extent_status *es)
651 {
652 struct ext4_ext_path *path = NULL;
653 struct ext4_extent *ex;
654 ext4_lblk_t ee_block;
655 ext4_fsblk_t ee_start;
656 unsigned short ee_len;
657 int depth, ee_status, es_status;
658
659 path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
660 if (IS_ERR(path))
661 return;
662
663 depth = ext_depth(inode);
664 ex = path[depth].p_ext;
665
666 if (ex) {
667
668 ee_block = le32_to_cpu(ex->ee_block);
669 ee_start = ext4_ext_pblock(ex);
670 ee_len = ext4_ext_get_actual_len(ex);
671
672 ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
673 es_status = ext4_es_is_unwritten(es) ? 1 : 0;
674
675 /*
676 * Make sure ex and es do not overlap when we try to insert
677 * a delayed/hole extent.
678 */
679 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
680 if (in_range(es->es_lblk, ee_block, ee_len)) {
681 pr_warn("ES insert assertion failed for "
682 "inode: %lu we can find an extent "
683 "at block [%d/%d/%llu/%c], but we "
684 "want to add a delayed/hole extent "
685 "[%d/%d/%llu/%x]\n",
686 inode->i_ino, ee_block, ee_len,
687 ee_start, ee_status ? 'u' : 'w',
688 es->es_lblk, es->es_len,
689 ext4_es_pblock(es), ext4_es_status(es));
690 }
691 goto out;
692 }
693
694 /*
695 * We don't check ee_block == es->es_lblk, etc. because es
696 * might be a part of whole extent, vice versa.
697 */
698 if (es->es_lblk < ee_block ||
699 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
700 pr_warn("ES insert assertion failed for inode: %lu "
701 "ex_status [%d/%d/%llu/%c] != "
702 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
703 ee_block, ee_len, ee_start,
704 ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
705 ext4_es_pblock(es), es_status ? 'u' : 'w');
706 goto out;
707 }
708
709 if (ee_status ^ es_status) {
710 pr_warn("ES insert assertion failed for inode: %lu "
711 "ex_status [%d/%d/%llu/%c] != "
712 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
713 ee_block, ee_len, ee_start,
714 ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
715 ext4_es_pblock(es), es_status ? 'u' : 'w');
716 }
717 } else {
718 /*
719 * We can't find an extent on disk. So we need to make sure
720 * that we don't want to add a written/unwritten extent.
721 */
722 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
723 pr_warn("ES insert assertion failed for inode: %lu "
724 "can't find an extent at block %d but we want "
725 "to add a written/unwritten extent "
726 "[%d/%d/%llu/%x]\n", inode->i_ino,
727 es->es_lblk, es->es_lblk, es->es_len,
728 ext4_es_pblock(es), ext4_es_status(es));
729 }
730 }
731 out:
732 ext4_free_ext_path(path);
733 }
734
735 static void ext4_es_insert_extent_ind_check(struct inode *inode,
736 struct extent_status *es)
737 {
738 struct ext4_map_blocks map;
739 int retval;
740
741 /*
742 * Here we call ext4_ind_map_blocks to look up a block mapping because
743 * the 'Indirect' structure is defined in indirect.c, so we can't
744 * access the direct/indirect tree from outside it. It is too ugly to
745 * define this function in indirect.c.
746 */
747
748 map.m_lblk = es->es_lblk;
749 map.m_len = es->es_len;
750
751 retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
752 if (retval > 0) {
753 if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
754 /*
755 * We want to add a delayed/hole extent but this
756 * block has been allocated.
757 */
758 pr_warn("ES insert assertion failed for inode: %lu "
759 "We can find blocks but we want to add a "
760 "delayed/hole extent [%d/%d/%llu/%x]\n",
761 inode->i_ino, es->es_lblk, es->es_len,
762 ext4_es_pblock(es), ext4_es_status(es));
763 return;
764 } else if (ext4_es_is_written(es)) {
765 if (retval != es->es_len) {
766 pr_warn("ES insert assertion failed for "
767 "inode: %lu retval %d != es_len %d\n",
768 inode->i_ino, retval, es->es_len);
769 return;
770 }
771 if (map.m_pblk != ext4_es_pblock(es)) {
772 pr_warn("ES insert assertion failed for "
773 "inode: %lu m_pblk %llu != "
774 "es_pblk %llu\n",
775 inode->i_ino, map.m_pblk,
776 ext4_es_pblock(es));
777 return;
778 }
779 } else {
780 /*
781 * We don't need to check for unwritten extents because
782 * indirect-based files don't have them.
783 */
784 BUG();
785 }
786 } else if (retval == 0) {
787 if (ext4_es_is_written(es)) {
788 pr_warn("ES insert assertion failed for inode: %lu "
789 "We can't find the block but we want to add "
790 "a written extent [%d/%d/%llu/%x]\n",
791 inode->i_ino, es->es_lblk, es->es_len,
792 ext4_es_pblock(es), ext4_es_status(es));
793 return;
794 }
795 }
796 }
797
798 static inline void ext4_es_insert_extent_check(struct inode *inode,
799 struct extent_status *es)
800 {
801 /*
802 * We don't need to worry about the race condition because
803 * caller takes i_data_sem locking.
804 */
805 BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
806 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
807 ext4_es_insert_extent_ext_check(inode, es);
808 else
809 ext4_es_insert_extent_ind_check(inode, es);
810 }
811 #else
812 static inline void ext4_es_insert_extent_check(struct inode *inode,
813 struct extent_status *es)
814 {
815 }
816 #endif
817
818 static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
819 struct extent_status *prealloc)
820 {
821 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
822 struct rb_node **p = &tree->root.rb_node;
823 struct rb_node *parent = NULL;
824 struct extent_status *es;
825
826 while (*p) {
827 parent = *p;
828 es = rb_entry(parent, struct extent_status, rb_node);
829
830 if (newes->es_lblk < es->es_lblk) {
831 if (ext4_es_can_be_merged(newes, es)) {
832 /*
833 * Here we can modify es_lblk directly
834 * because it isn't overlapped.
835 */
836 es->es_lblk = newes->es_lblk;
837 es->es_len += newes->es_len;
838 if (ext4_es_is_written(es) ||
839 ext4_es_is_unwritten(es))
840 ext4_es_store_pblock(es,
841 newes->es_pblk);
842 es = ext4_es_try_to_merge_left(inode, es);
843 goto out;
844 }
845 p = &(*p)->rb_left;
846 } else if (newes->es_lblk > ext4_es_end(es)) {
847 if (ext4_es_can_be_merged(es, newes)) {
848 es->es_len += newes->es_len;
849 es = ext4_es_try_to_merge_right(inode, es);
850 goto out;
851 }
852 p = &(*p)->rb_right;
853 } else {
854 BUG();
855 return -EINVAL;
856 }
857 }
858
859 if (prealloc)
860 es = prealloc;
861 else
862 es = __es_alloc_extent(false);
863 if (!es)
864 return -ENOMEM;
865 ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
866 newes->es_pblk);
867
868 rb_link_node(&es->rb_node, parent, p);
869 rb_insert_color(&es->rb_node, &tree->root);
870
871 out:
872 tree->cache_es = es;
873 return 0;
874 }
875
876 /*
877 * ext4_es_insert_extent() adds information to an inode's extent
878 * status tree.
879 */
880 void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
881 ext4_lblk_t len, ext4_fsblk_t pblk,
882 unsigned int status, bool delalloc_reserve_used)
883 {
884 struct extent_status newes;
885 ext4_lblk_t end = lblk + len - 1;
886 int err1 = 0, err2 = 0, err3 = 0;
887 int resv_used = 0, pending = 0;
888 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
889 struct extent_status *es1 = NULL;
890 struct extent_status *es2 = NULL;
891 struct pending_reservation *pr = NULL;
892 bool revise_pending = false;
893
894 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
895 return;
896
897 es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
898 lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);
899
900 if (!len)
901 return;
902
903 BUG_ON(end < lblk);
904 WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
905
906 newes.es_lblk = lblk;
907 newes.es_len = len;
908 ext4_es_store_pblock_status(&newes, pblk, status);
909 trace_ext4_es_insert_extent(inode, &newes);
910
911 ext4_es_insert_extent_check(inode, &newes);
912
913 revise_pending = sbi->s_cluster_ratio > 1 &&
914 test_opt(inode->i_sb, DELALLOC) &&
915 (status & (EXTENT_STATUS_WRITTEN |
916 EXTENT_STATUS_UNWRITTEN));
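/*
 * The first pass below takes i_es_lock with nothing preallocated. If
 * __es_remove_extent(), __es_insert_extent() or __revise_pending()
 * fails for lack of memory, the corresponding error is kept, the lock
 * is dropped, the needed objects are preallocated with __GFP_NOFAIL,
 * and the whole sequence is retried.
 */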
917 retry:
918 if (err1 && !es1)
919 es1 = __es_alloc_extent(true);
920 if ((err1 || err2) && !es2)
921 es2 = __es_alloc_extent(true);
922 if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
923 pr = __alloc_pending(true);
924 write_lock(&EXT4_I(inode)->i_es_lock);
925
926 err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
927 if (err1 != 0)
928 goto error;
929 /* Free preallocated extent if it didn't get used. */
930 if (es1) {
931 if (!es1->es_len)
932 __es_free_extent(es1);
933 es1 = NULL;
934 }
935
936 err2 = __es_insert_extent(inode, &newes, es2);
937 if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
938 err2 = 0;
939 if (err2 != 0)
940 goto error;
941 /* Free preallocated extent if it didn't get used. */
942 if (es2) {
943 if (!es2->es_len)
944 __es_free_extent(es2);
945 es2 = NULL;
946 }
947
948 if (revise_pending) {
949 err3 = __revise_pending(inode, lblk, len, &pr);
950 if (err3 < 0)
951 goto error;
952 if (pr) {
953 __free_pending(pr);
954 pr = NULL;
955 }
956 pending = err3;
957 }
958 error:
959 write_unlock(&EXT4_I(inode)->i_es_lock);
960 /*
961 * Reduce the reserved cluster count to reflect successful deferred
962 * allocation of delayed allocated clusters or direct allocation of
963 * clusters discovered to be delayed allocated. Once allocated, a
964 * cluster is not included in the reserved count.
965 *
966 * When directly allocating (from fallocate, filemap, DIO, or clusters
967 * allocated when delalloc has been disabled by ext4_nonda_switch())
968 * an extent that either 1) contains delayed blocks but starts with
969 * non-delayed allocated blocks (e.g. a hole) or 2) contains non-delayed
970 * allocated blocks which belong to delayed allocated clusters when the
971 * bigalloc feature is enabled, quota has already been claimed by
972 * ext4_mb_new_blocks(), so release the quota reservations made for
973 * any previously delayed allocated clusters instead of claiming them
974 * again.
975 */
976 resv_used += pending;
977 if (resv_used)
978 ext4_da_update_reserve_space(inode, resv_used,
979 delalloc_reserve_used);
980
981 if (err1 || err2 || err3 < 0)
982 goto retry;
983
984 ext4_es_print_tree(inode);
985 return;
986 }
987
988 /*
989 * ext4_es_cache_extent() inserts information into the extent status
990 * tree if and only if there isn't information about the range in
991 * question already.
992 */
993 void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
994 ext4_lblk_t len, ext4_fsblk_t pblk,
995 unsigned int status)
996 {
997 struct extent_status *es;
998 struct extent_status newes;
999 ext4_lblk_t end = lblk + len - 1;
1000
1001 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1002 return;
1003
1004 newes.es_lblk = lblk;
1005 newes.es_len = len;
1006 ext4_es_store_pblock_status(&newes, pblk, status);
1007 trace_ext4_es_cache_extent(inode, &newes);
1008
1009 if (!len)
1010 return;
1011
1012 BUG_ON(end < lblk);
1013
1014 write_lock(&EXT4_I(inode)->i_es_lock);
1015
1016 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
1017 if (!es || es->es_lblk > end)
1018 __es_insert_extent(inode, &newes, NULL);
1019 write_unlock(&EXT4_I(inode)->i_es_lock);
1020 }
1021
1022 /*
1023 * ext4_es_lookup_extent() looks up an extent in extent status tree.
1024 *
1025 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
1026 *
1027 * Return: 1 if found, 0 if not.
1028 */
1029 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
1030 ext4_lblk_t *next_lblk,
1031 struct extent_status *es)
1032 {
1033 struct ext4_es_tree *tree;
1034 struct ext4_es_stats *stats;
1035 struct extent_status *es1 = NULL;
1036 struct rb_node *node;
1037 int found = 0;
1038
1039 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1040 return 0;
1041
1042 trace_ext4_es_lookup_extent_enter(inode, lblk);
1043 es_debug("lookup extent in block %u\n", lblk);
1044
1045 tree = &EXT4_I(inode)->i_es_tree;
1046 read_lock(&EXT4_I(inode)->i_es_lock);
1047
1048 /* first see if the extent has been cached */
1049 es->es_lblk = es->es_len = es->es_pblk = 0;
1050 es1 = READ_ONCE(tree->cache_es);
1051 if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
1052 es_debug("%u cached by [%u/%u)\n",
1053 lblk, es1->es_lblk, es1->es_len);
1054 found = 1;
1055 goto out;
1056 }
1057
1058 node = tree->root.rb_node;
1059 while (node) {
1060 es1 = rb_entry(node, struct extent_status, rb_node);
1061 if (lblk < es1->es_lblk)
1062 node = node->rb_left;
1063 else if (lblk > ext4_es_end(es1))
1064 node = node->rb_right;
1065 else {
1066 found = 1;
1067 break;
1068 }
1069 }
1070
1071 out:
1072 stats = &EXT4_SB(inode->i_sb)->s_es_stats;
1073 if (found) {
1074 BUG_ON(!es1);
1075 es->es_lblk = es1->es_lblk;
1076 es->es_len = es1->es_len;
1077 es->es_pblk = es1->es_pblk;
1078 if (!ext4_es_is_referenced(es1))
1079 ext4_es_set_referenced(es1);
1080 percpu_counter_inc(&stats->es_stats_cache_hits);
1081 if (next_lblk) {
1082 node = rb_next(&es1->rb_node);
1083 if (node) {
1084 es1 = rb_entry(node, struct extent_status,
1085 rb_node);
1086 *next_lblk = es1->es_lblk;
1087 } else
1088 *next_lblk = 0;
1089 }
1090 } else {
1091 percpu_counter_inc(&stats->es_stats_cache_misses);
1092 }
1093
1094 read_unlock(&EXT4_I(inode)->i_es_lock);
1095
1096 trace_ext4_es_lookup_extent_exit(inode, es, found);
1097 return found;
1098 }
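/*
 * Example usage (illustrative): this is roughly how ext4_map_blocks()
 * consults the cache before reading the on-disk extent tree; the exact
 * handling in ext4_map_blocks() is more involved than shown here.
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
 *		// es describes the cached status of the block, e.g.
 *		// ext4_es_is_written(&es) or ext4_es_is_delayed(&es)
 *	}
 */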
1099
1100 struct rsvd_count {
1101 int ndelayed;
1102 bool first_do_lblk_found;
1103 ext4_lblk_t first_do_lblk;
1104 ext4_lblk_t last_do_lblk;
1105 struct extent_status *left_es;
1106 bool partial;
1107 ext4_lblk_t lclu;
1108 };
1109
1110 /*
1111 * init_rsvd - initialize reserved count data before removing block range
1112 * in file from extent status tree
1113 *
1114 * @inode - file containing range
1115 * @lblk - first block in range
1116 * @es - pointer to first extent in range
1117 * @rc - pointer to reserved count data
1118 *
1119 * Assumes es is not NULL
1120 */
1121 static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
1122 struct extent_status *es, struct rsvd_count *rc)
1123 {
1124 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1125 struct rb_node *node;
1126
1127 rc->ndelayed = 0;
1128
1129 /*
1130 * for bigalloc, note the first delayed block in the range has not
1131 * been found, record the extent containing the block to the left of
1132 * the region to be removed, if any, and note that there's no partial
1133 * cluster to track
1134 */
1135 if (sbi->s_cluster_ratio > 1) {
1136 rc->first_do_lblk_found = false;
1137 if (lblk > es->es_lblk) {
1138 rc->left_es = es;
1139 } else {
1140 node = rb_prev(&es->rb_node);
1141 rc->left_es = node ? rb_entry(node,
1142 struct extent_status,
1143 rb_node) : NULL;
1144 }
1145 rc->partial = false;
1146 }
1147 }
1148
1149 /*
1150 * count_rsvd - count the clusters containing delayed blocks in a range
1151 * within an extent and add to the running tally in rsvd_count
1152 *
1153 * @inode - file containing extent
1154 * @lblk - first block in range
1155 * @len - length of range in blocks
1156 * @es - pointer to extent containing clusters to be counted
1157 * @rc - pointer to reserved count data
1158 *
1159 * Tracks partial clusters found at the beginning and end of extents so
1160 * they aren't overcounted when they span adjacent extents
1161 */
1162 static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1163 struct extent_status *es, struct rsvd_count *rc)
1164 {
1165 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1166 ext4_lblk_t i, end, nclu;
1167
1168 if (!ext4_es_is_delayed(es))
1169 return;
1170
1171 WARN_ON(len <= 0);
1172
1173 if (sbi->s_cluster_ratio == 1) {
1174 rc->ndelayed += (int) len;
1175 return;
1176 }
1177
1178 /* bigalloc */
1179
1180 i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1181 end = lblk + (ext4_lblk_t) len - 1;
1182 end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1183
1184 /* record the first block of the first delayed extent seen */
1185 if (!rc->first_do_lblk_found) {
1186 rc->first_do_lblk = i;
1187 rc->first_do_lblk_found = true;
1188 }
1189
1190 /* update the last lblk in the region seen so far */
1191 rc->last_do_lblk = end;
1192
1193 /*
1194 * if we're tracking a partial cluster and the current extent
1195 * doesn't start with it, count it and stop tracking
1196 */
1197 if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1198 rc->ndelayed++;
1199 rc->partial = false;
1200 }
1201
1202 /*
1203 * if the first cluster doesn't start on a cluster boundary but
1204 * ends on one, count it
1205 */
1206 if (EXT4_LBLK_COFF(sbi, i) != 0) {
1207 if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1208 rc->ndelayed++;
1209 rc->partial = false;
1210 i = EXT4_LBLK_CFILL(sbi, i) + 1;
1211 }
1212 }
1213
1214 /*
1215 * if the current cluster starts on a cluster boundary, count the
1216 * number of whole delayed clusters in the extent
1217 */
1218 if ((i + sbi->s_cluster_ratio - 1) <= end) {
1219 nclu = (end - i + 1) >> sbi->s_cluster_bits;
1220 rc->ndelayed += nclu;
1221 i += nclu << sbi->s_cluster_bits;
1222 }
1223
1224 /*
1225 * start tracking a partial cluster if there's a partial at the end
1226 * of the current extent and we're not already tracking one
1227 */
1228 if (!rc->partial && i <= end) {
1229 rc->partial = true;
1230 rc->lclu = EXT4_B2C(sbi, i);
1231 }
1232 }
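/*
 * Worked example of the counting above, assuming 16 blocks per cluster
 * and a delayed extent covering blocks 30..70 of the removed range: the
 * partial leading cluster (blocks 30-31) is counted once, the two whole
 * clusters (blocks 32-47 and 48-63) add two more, and the trailing
 * blocks 64-70 are only recorded in rc->lclu as a partial cluster so a
 * following extent in the same cluster is not counted twice.
 */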
1233
1234 /*
1235 * __pr_tree_search - search for a pending cluster reservation
1236 *
1237 * @root - root of pending reservation tree
1238 * @lclu - logical cluster to search for
1239 *
1240 * Returns the pending reservation for the cluster identified by @lclu
1241 * if found. If not, returns a reservation for the next cluster if any,
1242 * and if not, returns NULL.
1243 */
1244 static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1245 ext4_lblk_t lclu)
1246 {
1247 struct rb_node *node = root->rb_node;
1248 struct pending_reservation *pr = NULL;
1249
1250 while (node) {
1251 pr = rb_entry(node, struct pending_reservation, rb_node);
1252 if (lclu < pr->lclu)
1253 node = node->rb_left;
1254 else if (lclu > pr->lclu)
1255 node = node->rb_right;
1256 else
1257 return pr;
1258 }
1259 if (pr && lclu < pr->lclu)
1260 return pr;
1261 if (pr && lclu > pr->lclu) {
1262 node = rb_next(&pr->rb_node);
1263 return node ? rb_entry(node, struct pending_reservation,
1264 rb_node) : NULL;
1265 }
1266 return NULL;
1267 }
1268
1269 /*
1270 * get_rsvd - calculates and returns the number of cluster reservations to be
1271 * released when removing a block range from the extent status tree
1272 * and releases any pending reservations within the range
1273 *
1274 * @inode - file containing block range
1275 * @end - last block in range
1276 * @right_es - pointer to extent containing next block beyond end or NULL
1277 * @rc - pointer to reserved count data
1278 *
1279 * The number of reservations to be released is equal to the number of
1280 * clusters containing delayed blocks within the range, minus the number of
1281 * clusters still containing delayed blocks at the ends of the range, and
1282 * minus the number of pending reservations within the range.
1283 */
1284 static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1285 struct extent_status *right_es,
1286 struct rsvd_count *rc)
1287 {
1288 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1289 struct pending_reservation *pr;
1290 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1291 struct rb_node *node;
1292 ext4_lblk_t first_lclu, last_lclu;
1293 bool left_delayed, right_delayed, count_pending;
1294 struct extent_status *es;
1295
1296 if (sbi->s_cluster_ratio > 1) {
1297 /* count any remaining partial cluster */
1298 if (rc->partial)
1299 rc->ndelayed++;
1300
1301 if (rc->ndelayed == 0)
1302 return 0;
1303
1304 first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1305 last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1306
1307 /*
1308 * decrease the delayed count by the number of clusters at the
1309 * ends of the range that still contain delayed blocks -
1310 * these clusters still need to be reserved
1311 */
1312 left_delayed = right_delayed = false;
1313
1314 es = rc->left_es;
1315 while (es && ext4_es_end(es) >=
1316 EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1317 if (ext4_es_is_delayed(es)) {
1318 rc->ndelayed--;
1319 left_delayed = true;
1320 break;
1321 }
1322 node = rb_prev(&es->rb_node);
1323 if (!node)
1324 break;
1325 es = rb_entry(node, struct extent_status, rb_node);
1326 }
1327 if (right_es && (!left_delayed || first_lclu != last_lclu)) {
1328 if (end < ext4_es_end(right_es)) {
1329 es = right_es;
1330 } else {
1331 node = rb_next(&right_es->rb_node);
1332 es = node ? rb_entry(node, struct extent_status,
1333 rb_node) : NULL;
1334 }
1335 while (es && es->es_lblk <=
1336 EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1337 if (ext4_es_is_delayed(es)) {
1338 rc->ndelayed--;
1339 right_delayed = true;
1340 break;
1341 }
1342 node = rb_next(&es->rb_node);
1343 if (!node)
1344 break;
1345 es = rb_entry(node, struct extent_status,
1346 rb_node);
1347 }
1348 }
1349
1350 /*
1351 * Determine the block range that should be searched for
1352 * pending reservations, if any. Clusters on the ends of the
1353 * original removed range containing delayed blocks are
1354 * excluded. They've already been accounted for and it's not
1355 * possible to determine if an associated pending reservation
1356 * should be released with the information available in the
1357 * extents status tree.
1358 */
1359 if (first_lclu == last_lclu) {
1360 if (left_delayed | right_delayed)
1361 count_pending = false;
1362 else
1363 count_pending = true;
1364 } else {
1365 if (left_delayed)
1366 first_lclu++;
1367 if (right_delayed)
1368 last_lclu--;
1369 if (first_lclu <= last_lclu)
1370 count_pending = true;
1371 else
1372 count_pending = false;
1373 }
1374
1375 /*
1376 * a pending reservation found between first_lclu and last_lclu
1377 * represents an allocated cluster that contained at least one
1378 * delayed block, so the delayed total must be reduced by one
1379 * for each pending reservation found and released
1380 */
1381 if (count_pending) {
1382 pr = __pr_tree_search(&tree->root, first_lclu);
1383 while (pr && pr->lclu <= last_lclu) {
1384 rc->ndelayed--;
1385 node = rb_next(&pr->rb_node);
1386 rb_erase(&pr->rb_node, &tree->root);
1387 __free_pending(pr);
1388 if (!node)
1389 break;
1390 pr = rb_entry(node, struct pending_reservation,
1391 rb_node);
1392 }
1393 }
1394 }
1395 return rc->ndelayed;
1396 }
1397
1398
1399 /*
1400 * __es_remove_extent - removes block range from extent status tree
1401 *
1402 * @inode - file containing range
1403 * @lblk - first block in range
1404 * @end - last block in range
1405 * @reserved - number of cluster reservations released
1406 * @prealloc - pre-allocated es to avoid memory allocation failures
1407 *
1408 * If @reserved is not NULL and delayed allocation is enabled, counts
1409 * block/cluster reservations freed by removing range and if bigalloc
1410 * enabled cancels pending reservations as needed. Returns 0 on success,
1411 * error code on failure.
1412 */
1413 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1414 ext4_lblk_t end, int *reserved,
1415 struct extent_status *prealloc)
1416 {
1417 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1418 struct rb_node *node;
1419 struct extent_status *es;
1420 struct extent_status orig_es;
1421 ext4_lblk_t len1, len2;
1422 ext4_fsblk_t block;
1423 int err = 0;
1424 bool count_reserved = true;
1425 struct rsvd_count rc;
1426
1427 if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1428 count_reserved = false;
1429
1430 es = __es_tree_search(&tree->root, lblk);
1431 if (!es)
1432 goto out;
1433 if (es->es_lblk > end)
1434 goto out;
1435
1436 /* Simply invalidate cache_es. */
1437 tree->cache_es = NULL;
1438 if (count_reserved)
1439 init_rsvd(inode, lblk, es, &rc);
1440
1441 orig_es.es_lblk = es->es_lblk;
1442 orig_es.es_len = es->es_len;
1443 orig_es.es_pblk = es->es_pblk;
1444
1445 len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
1446 len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
1447 if (len1 > 0)
1448 es->es_len = len1;
1449 if (len2 > 0) {
1450 if (len1 > 0) {
1451 struct extent_status newes;
1452
1453 newes.es_lblk = end + 1;
1454 newes.es_len = len2;
1455 block = 0x7FDEADBEEFULL;
1456 if (ext4_es_is_written(&orig_es) ||
1457 ext4_es_is_unwritten(&orig_es))
1458 block = ext4_es_pblock(&orig_es) +
1459 orig_es.es_len - len2;
1460 ext4_es_store_pblock_status(&newes, block,
1461 ext4_es_status(&orig_es));
1462 err = __es_insert_extent(inode, &newes, prealloc);
1463 if (err) {
1464 if (!ext4_es_must_keep(&newes))
1465 return 0;
1466
1467 es->es_lblk = orig_es.es_lblk;
1468 es->es_len = orig_es.es_len;
1469 goto out;
1470 }
1471 } else {
1472 es->es_lblk = end + 1;
1473 es->es_len = len2;
1474 if (ext4_es_is_written(es) ||
1475 ext4_es_is_unwritten(es)) {
1476 block = orig_es.es_pblk + orig_es.es_len - len2;
1477 ext4_es_store_pblock(es, block);
1478 }
1479 }
1480 if (count_reserved)
1481 count_rsvd(inode, orig_es.es_lblk + len1,
1482 orig_es.es_len - len1 - len2, &orig_es, &rc);
1483 goto out_get_reserved;
1484 }
1485
1486 if (len1 > 0) {
1487 if (count_reserved)
1488 count_rsvd(inode, lblk, orig_es.es_len - len1,
1489 &orig_es, &rc);
1490 node = rb_next(&es->rb_node);
1491 if (node)
1492 es = rb_entry(node, struct extent_status, rb_node);
1493 else
1494 es = NULL;
1495 }
1496
1497 while (es && ext4_es_end(es) <= end) {
1498 if (count_reserved)
1499 count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1500 node = rb_next(&es->rb_node);
1501 rb_erase(&es->rb_node, &tree->root);
1502 ext4_es_free_extent(inode, es);
1503 if (!node) {
1504 es = NULL;
1505 break;
1506 }
1507 es = rb_entry(node, struct extent_status, rb_node);
1508 }
1509
1510 if (es && es->es_lblk < end + 1) {
1511 ext4_lblk_t orig_len = es->es_len;
1512
1513 len1 = ext4_es_end(es) - end;
1514 if (count_reserved)
1515 count_rsvd(inode, es->es_lblk, orig_len - len1,
1516 es, &rc);
1517 es->es_lblk = end + 1;
1518 es->es_len = len1;
1519 if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
1520 block = es->es_pblk + orig_len - len1;
1521 ext4_es_store_pblock(es, block);
1522 }
1523 }
1524
1525 out_get_reserved:
1526 if (count_reserved)
1527 *reserved = get_rsvd(inode, end, es, &rc);
1528 out:
1529 return err;
1530 }
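/*
 * Worked example of the splitting above: removing blocks 120..129 from a
 * single written extent [100/50) leaves len1 == 20 (blocks 100-119 kept
 * in the original node) and len2 == 20, so a new extent [130/20) is
 * inserted with its physical block shifted by orig_es.es_len - len2.
 */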
1531
1532 /*
1533 * ext4_es_remove_extent - removes block range from extent status tree
1534 *
1535 * @inode - file containing range
1536 * @lblk - first block in range
1537 * @len - number of blocks to remove
1538 *
1539 * Reduces block/cluster reservation count and for bigalloc cancels pending
1540 * reservations as needed.
1541 */
1542 void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1543 ext4_lblk_t len)
1544 {
1545 ext4_lblk_t end;
1546 int err = 0;
1547 int reserved = 0;
1548 struct extent_status *es = NULL;
1549
1550 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
1551 return;
1552
1553 trace_ext4_es_remove_extent(inode, lblk, len);
1554 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1555 lblk, len, inode->i_ino);
1556
1557 if (!len)
1558 return;
1559
1560 end = lblk + len - 1;
1561 BUG_ON(end < lblk);
1562
1563 retry:
1564 if (err && !es)
1565 es = __es_alloc_extent(true);
1566 /*
1567 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
1568 * so that we are sure __es_shrink() is done with the inode before it
1569 * is reclaimed.
1570 */
1571 write_lock(&EXT4_I(inode)->i_es_lock);
1572 err = __es_remove_extent(inode, lblk, end, &reserved, es);
1573 /* Free preallocated extent if it didn't get used. */
1574 if (es) {
1575 if (!es->es_len)
1576 __es_free_extent(es);
1577 es = NULL;
1578 }
1579 write_unlock(&EXT4_I(inode)->i_es_lock);
1580 if (err)
1581 goto retry;
1582
1583 ext4_es_print_tree(inode);
1584 ext4_da_release_space(inode, reserved);
1585 }
1586
1587 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
1588 struct ext4_inode_info *locked_ei)
1589 {
1590 struct ext4_inode_info *ei;
1591 struct ext4_es_stats *es_stats;
1592 ktime_t start_time;
1593 u64 scan_time;
1594 int nr_to_walk;
1595 int nr_shrunk = 0;
1596 int retried = 0, nr_skipped = 0;
1597
1598 es_stats = &sbi->s_es_stats;
1599 start_time = ktime_get();
1600
1601 retry:
1602 spin_lock(&sbi->s_es_lock);
1603 nr_to_walk = sbi->s_es_nr_inode;
1604 while (nr_to_walk-- > 0) {
1605 if (list_empty(&sbi->s_es_list)) {
1606 spin_unlock(&sbi->s_es_lock);
1607 goto out;
1608 }
1609 ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
1610 i_es_list);
1611 /* Move the inode to the tail */
1612 list_move_tail(&ei->i_es_list, &sbi->s_es_list);
1613
1614 /*
1615 * Normally we try hard to avoid shrinking precached inodes,
1616 * but we will as a last resort.
1617 */
1618 if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1619 EXT4_STATE_EXT_PRECACHED)) {
1620 nr_skipped++;
1621 continue;
1622 }
1623
1624 if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1625 nr_skipped++;
1626 continue;
1627 }
1628 /*
1629 * Now we hold i_es_lock which protects us from inode reclaim
1630 * freeing inode under us
1631 */
1632 spin_unlock(&sbi->s_es_lock);
1633
1634 nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1635 write_unlock(&ei->i_es_lock);
1636
1637 if (nr_to_scan <= 0)
1638 goto out;
1639 spin_lock(&sbi->s_es_lock);
1640 }
1641 spin_unlock(&sbi->s_es_lock);
1642
1643 /*
1644 * If we skipped any inodes, and we weren't able to make any
1645 * forward progress, try again to scan precached inodes.
1646 */
1647 if ((nr_shrunk == 0) && nr_skipped && !retried) {
1648 retried++;
1649 goto retry;
1650 }
1651
1652 if (locked_ei && nr_shrunk == 0)
1653 nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1654
1655 out:
1656 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1657 if (likely(es_stats->es_stats_scan_time))
1658 es_stats->es_stats_scan_time = (scan_time +
1659 es_stats->es_stats_scan_time*3) / 4;
1660 else
1661 es_stats->es_stats_scan_time = scan_time;
1662 if (scan_time > es_stats->es_stats_max_scan_time)
1663 es_stats->es_stats_max_scan_time = scan_time;
1664 if (likely(es_stats->es_stats_shrunk))
1665 es_stats->es_stats_shrunk = (nr_shrunk +
1666 es_stats->es_stats_shrunk*3) / 4;
1667 else
1668 es_stats->es_stats_shrunk = nr_shrunk;
1669
1670 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1671 nr_skipped, retried);
1672 return nr_shrunk;
1673 }
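/*
 * The averages above are exponential moving averages weighting history
 * 3:1 against the latest sample. For example, with a previous
 * es_stats_scan_time of 800 ns and a new scan_time of 1600 ns, the
 * stored value becomes (1600 + 800 * 3) / 4 = 1000 ns.
 */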
1674
1675 static unsigned long ext4_es_count(struct shrinker *shrink,
1676 struct shrink_control *sc)
1677 {
1678 unsigned long nr;
1679 struct ext4_sb_info *sbi;
1680
1681 sbi = shrink->private_data;
1682 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1683 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1684 return nr;
1685 }
1686
1687 static unsigned long ext4_es_scan(struct shrinker *shrink,
1688 struct shrink_control *sc)
1689 {
1690 struct ext4_sb_info *sbi = shrink->private_data;
1691 int nr_to_scan = sc->nr_to_scan;
1692 int ret, nr_shrunk;
1693
1694 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1695 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1696
1697 nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1698
1699 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1700 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1701 return nr_shrunk;
1702 }
1703
1704 int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1705 {
1706 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
1707 struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1708 struct ext4_inode_info *ei, *max = NULL;
1709 unsigned int inode_cnt = 0;
1710
1711 if (v != SEQ_START_TOKEN)
1712 return 0;
1713
1714 /* here we just find an inode that has the max nr. of objects */
1715 spin_lock(&sbi->s_es_lock);
1716 list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1717 inode_cnt++;
1718 if (max && max->i_es_all_nr < ei->i_es_all_nr)
1719 max = ei;
1720 else if (!max)
1721 max = ei;
1722 }
1723 spin_unlock(&sbi->s_es_lock);
1724
1725 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1726 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1727 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1728 seq_printf(seq, " %lld/%lld cache hits/misses\n",
1729 percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1730 percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1731 if (inode_cnt)
1732 seq_printf(seq, " %d inodes on list\n", inode_cnt);
1733
1734 seq_printf(seq, "average:\n %llu us scan time\n",
1735 div_u64(es_stats->es_stats_scan_time, 1000));
1736 seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
1737 if (inode_cnt)
1738 seq_printf(seq,
1739 "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1740 " %llu us max scan time\n",
1741 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1742 div_u64(es_stats->es_stats_max_scan_time, 1000));
1743
1744 return 0;
1745 }
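/*
 * The seq_file handler above backs the per-filesystem es_shrinker_info
 * file under /proc/fs/ext4/<device>/.  Example output (all numbers are
 * hypothetical):
 *
 *	stats:
 *	 1024 objects
 *	 512 reclaimable objects
 *	 300/40 cache hits/misses
 *	 12 inodes on list
 *	average:
 *	 15 us scan time
 *	 32 shrunk objects
 *	maximum:
 *	 131 inode (256 objects, 128 reclaimable)
 *	 120 us max scan time
 */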
1746
1747 int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1748 {
1749 int err;
1750
1751 /* Make sure we have enough bits for physical block number */
1752 BUILD_BUG_ON(ES_SHIFT < 48);
1753 INIT_LIST_HEAD(&sbi->s_es_list);
1754 sbi->s_es_nr_inode = 0;
1755 spin_lock_init(&sbi->s_es_lock);
1756 sbi->s_es_stats.es_stats_shrunk = 0;
1757 err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1758 GFP_KERNEL);
1759 if (err)
1760 return err;
1761 err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1762 GFP_KERNEL);
1763 if (err)
1764 goto err1;
1765 sbi->s_es_stats.es_stats_scan_time = 0;
1766 sbi->s_es_stats.es_stats_max_scan_time = 0;
1767 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1768 if (err)
1769 goto err2;
1770 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1771 if (err)
1772 goto err3;
1773
1774 sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
1775 if (!sbi->s_es_shrinker) {
1776 err = -ENOMEM;
1777 goto err4;
1778 }
1779
1780 sbi->s_es_shrinker->scan_objects = ext4_es_scan;
1781 sbi->s_es_shrinker->count_objects = ext4_es_count;
1782 sbi->s_es_shrinker->private_data = sbi;
1783
1784 shrinker_register(sbi->s_es_shrinker);
1785
1786 return 0;
1787 err4:
1788 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1789 err3:
1790 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1791 err2:
1792 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1793 err1:
1794 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1795 return err;
1796 }
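/*
 * The BUILD_BUG_ON() above enforces the packing assumption of struct
 * extent_status: the status flags are stored in the topmost bits of
 * es_pblk, so ES_SHIFT must leave at least 48 low bits for the physical
 * block number (with 4 KiB blocks, 2^48 blocks covers the 1 EiB ext4
 * limit).
 */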
1797
1798 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1799 {
1800 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1801 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1802 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1803 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1804 shrinker_free(sbi->s_es_shrinker);
1805 }
1806
1807 /*
1808 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1809 * most *nr_to_scan extents, update *nr_to_scan accordingly.
1810 *
1811 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1812 * Increment *nr_shrunk by the number of reclaimed extents. Also update
1813 * ei->i_es_shrink_lblk to where we should continue scanning.
1814 */
1815 static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1816 int *nr_to_scan, int *nr_shrunk)
1817 {
1818 struct inode *inode = &ei->vfs_inode;
1819 struct ext4_es_tree *tree = &ei->i_es_tree;
1820 struct extent_status *es;
1821 struct rb_node *node;
1822
1823 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1824 if (!es)
1825 goto out_wrap;
1826
1827 while (*nr_to_scan > 0) {
1828 if (es->es_lblk > end) {
1829 ei->i_es_shrink_lblk = end + 1;
1830 return 0;
1831 }
1832
1833 (*nr_to_scan)--;
1834 node = rb_next(&es->rb_node);
1835
1836 if (ext4_es_must_keep(es))
1837 goto next;
1838 if (ext4_es_is_referenced(es)) {
1839 ext4_es_clear_referenced(es);
1840 goto next;
1841 }
1842
1843 rb_erase(&es->rb_node, &tree->root);
1844 ext4_es_free_extent(inode, es);
1845 (*nr_shrunk)++;
1846 next:
1847 if (!node)
1848 goto out_wrap;
1849 es = rb_entry(node, struct extent_status, rb_node);
1850 }
1851 ei->i_es_shrink_lblk = es->es_lblk;
1852 return 1;
1853 out_wrap:
1854 ei->i_es_shrink_lblk = 0;
1855 return 0;
1856 }
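/*
 * The loop above gives cached extents a second chance before they are
 * reclaimed: extents that must be kept (delayed extents) are never freed,
 * and extents with the referenced bit set merely have that bit cleared on
 * this pass, so an entry is actually erased and freed only if it has not
 * been looked up again since the previous scan.
 */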
1857
1858 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1859 {
1860 struct inode *inode = &ei->vfs_inode;
1861 int nr_shrunk = 0;
1862 ext4_lblk_t start = ei->i_es_shrink_lblk;
1863 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1864 DEFAULT_RATELIMIT_BURST);
1865
1866 if (ei->i_es_shk_nr == 0)
1867 return 0;
1868
1869 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1870 __ratelimit(&_rs))
1871 ext4_warning(inode->i_sb, "forced shrink of precached extents");
1872
1873 if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1874 start != 0)
1875 es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1876
1877 ei->i_es_tree.cache_es = NULL;
1878 return nr_shrunk;
1879 }
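/*
 * es_reclaim_extents() resumes scanning at ei->i_es_shrink_lblk and wraps
 * around, so every extent in the tree is eventually visited even when the
 * nr_to_scan budget keeps running out.  For example, if the previous scan
 * stopped at logical block 1000, the next call scans [1000, EXT_MAX_BLOCKS]
 * first and, if the end of the tree is reached with budget to spare,
 * continues with [0, 999].
 */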
1880
1881 /*
1882 * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
1883 * discretionary entries from the extent status cache. (Some entries
1884 * must be present for proper operations.)
1885 */
1886 void ext4_clear_inode_es(struct inode *inode)
1887 {
1888 struct ext4_inode_info *ei = EXT4_I(inode);
1889 struct extent_status *es;
1890 struct ext4_es_tree *tree;
1891 struct rb_node *node;
1892
1893 write_lock(&ei->i_es_lock);
1894 tree = &EXT4_I(inode)->i_es_tree;
1895 tree->cache_es = NULL;
1896 node = rb_first(&tree->root);
1897 while (node) {
1898 es = rb_entry(node, struct extent_status, rb_node);
1899 node = rb_next(node);
1900 if (!ext4_es_must_keep(es)) {
1901 rb_erase(&es->rb_node, &tree->root);
1902 ext4_es_free_extent(inode, es);
1903 }
1904 }
1905 ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1906 write_unlock(&ei->i_es_lock);
1907 }
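/*
 * This function is reached via the EXT4_IOC_CLEAR_ES_CACHE ioctl.  A rough
 * userspace sketch (illustrative only, assuming the ioctl constant is
 * available to the caller and fd refers to an open file on ext4):
 *
 *	if (ioctl(fd, EXT4_IOC_CLEAR_ES_CACHE) < 0)
 *		perror("EXT4_IOC_CLEAR_ES_CACHE");
 */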
1908
1909 #ifdef ES_DEBUG__
1910 static void ext4_print_pending_tree(struct inode *inode)
1911 {
1912 struct ext4_pending_tree *tree;
1913 struct rb_node *node;
1914 struct pending_reservation *pr;
1915
1916 printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
1917 tree = &EXT4_I(inode)->i_pending_tree;
1918 node = rb_first(&tree->root);
1919 while (node) {
1920 pr = rb_entry(node, struct pending_reservation, rb_node);
1921 printk(KERN_DEBUG " %u", pr->lclu);
1922 node = rb_next(node);
1923 }
1924 printk(KERN_DEBUG "\n");
1925 }
1926 #else
1927 #define ext4_print_pending_tree(inode)
1928 #endif
1929
1930 int __init ext4_init_pending(void)
1931 {
1932 ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
1933 if (ext4_pending_cachep == NULL)
1934 return -ENOMEM;
1935 return 0;
1936 }
1937
1938 void ext4_exit_pending(void)
1939 {
1940 kmem_cache_destroy(ext4_pending_cachep);
1941 }
1942
1943 void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1944 {
1945 tree->root = RB_ROOT;
1946 }
1947
1948 /*
1949 * __get_pending - retrieve a pointer to a pending reservation
1950 *
1951 * @inode - file containing the pending cluster reservation
1952 * @lclu - logical cluster of interest
1953 *
1954 * Returns a pointer to a pending reservation if it's a member of
1955 * the set, and NULL if not. Must be called holding i_es_lock.
1956 */
1957 static struct pending_reservation *__get_pending(struct inode *inode,
1958 ext4_lblk_t lclu)
1959 {
1960 struct ext4_pending_tree *tree;
1961 struct rb_node *node;
1962 struct pending_reservation *pr = NULL;
1963
1964 tree = &EXT4_I(inode)->i_pending_tree;
1965 node = (&tree->root)->rb_node;
1966
1967 while (node) {
1968 pr = rb_entry(node, struct pending_reservation, rb_node);
1969 if (lclu < pr->lclu)
1970 node = node->rb_left;
1971 else if (lclu > pr->lclu)
1972 node = node->rb_right;
1973 else if (lclu == pr->lclu)
1974 return pr;
1975 }
1976 return NULL;
1977 }
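/*
 * Illustrative caller of the lookup above; i_es_lock must already be held,
 * exactly as ext4_is_pending() further below does:
 *
 *	read_lock(&EXT4_I(inode)->i_es_lock);
 *	pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
 *	read_unlock(&EXT4_I(inode)->i_es_lock);
 */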
1978
1979 /*
1980 * __insert_pending - adds a pending cluster reservation to the set of
1981 * pending reservations
1982 *
1983 * @inode - file containing the cluster
1984 * @lblk - logical block in the cluster to be added
1985 * @prealloc - preallocated pending entry
1986 *
1987 * Returns 1 on successful insertion, 0 if the pending reservation is
1988 * already in the set, and -ENOMEM on failure.
1989 */
1990 static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
1991 struct pending_reservation **prealloc)
1992 {
1993 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1994 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1995 struct rb_node **p = &tree->root.rb_node;
1996 struct rb_node *parent = NULL;
1997 struct pending_reservation *pr;
1998 ext4_lblk_t lclu;
1999 int ret = 0;
2000
2001 lclu = EXT4_B2C(sbi, lblk);
2002 /* search to find parent for insertion */
2003 while (*p) {
2004 parent = *p;
2005 pr = rb_entry(parent, struct pending_reservation, rb_node);
2006
2007 if (lclu < pr->lclu) {
2008 p = &(*p)->rb_left;
2009 } else if (lclu > pr->lclu) {
2010 p = &(*p)->rb_right;
2011 } else {
2012 /* pending reservation already inserted */
2013 goto out;
2014 }
2015 }
2016
2017 if (likely(*prealloc == NULL)) {
2018 pr = __alloc_pending(false);
2019 if (!pr) {
2020 ret = -ENOMEM;
2021 goto out;
2022 }
2023 } else {
2024 pr = *prealloc;
2025 *prealloc = NULL;
2026 }
2027 pr->lclu = lclu;
2028
2029 rb_link_node(&pr->rb_node, parent, p);
2030 rb_insert_color(&pr->rb_node, &tree->root);
2031 ret = 1;
2032
2033 out:
2034 return ret;
2035 }
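/*
 * When insertion must not fail under the lock, callers preallocate the
 * pending_reservation beforehand (outside i_es_lock, where sleeping is
 * allowed), let __insert_pending() consume it, and free any leftover
 * afterwards, as ext4_es_insert_delayed_extent() below does on its retry
 * path.  A minimal sketch of the pattern:
 *
 *	pr = __alloc_pending(true);
 *	write_lock(&EXT4_I(inode)->i_es_lock);
 *	err = __insert_pending(inode, lblk, &pr);
 *	write_unlock(&EXT4_I(inode)->i_es_lock);
 *	if (pr)
 *		__free_pending(pr);
 */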
2036
2037 /*
2038 * __remove_pending - removes a pending cluster reservation from the set
2039 * of pending reservations
2040 *
2041 * @inode - file containing the cluster
2042 * @lblk - logical block in the pending cluster reservation to be removed
2043 *
2044 * Silently does nothing if the pending reservation is not a member of the set.
2045 */
2046 static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
2047 {
2048 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2049 struct pending_reservation *pr;
2050 struct ext4_pending_tree *tree;
2051
2052 pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
2053 if (pr != NULL) {
2054 tree = &EXT4_I(inode)->i_pending_tree;
2055 rb_erase(&pr->rb_node, &tree->root);
2056 __free_pending(pr);
2057 }
2058 }
2059
2060 /*
2061 * ext4_remove_pending - removes a pending cluster reservation from the set
2062 * of pending reservations
2063 *
2064 * @inode - file containing the cluster
2065 * @lblk - logical block in the pending cluster reservation to be removed
2066 *
2067 * Locking for external use of __remove_pending.
2068 */
2069 void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
2070 {
2071 struct ext4_inode_info *ei = EXT4_I(inode);
2072
2073 write_lock(&ei->i_es_lock);
2074 __remove_pending(inode, lblk);
2075 write_unlock(&ei->i_es_lock);
2076 }
2077
2078 /*
2079 * ext4_is_pending - determine whether a cluster has a pending reservation
2080 * on it
2081 *
2082 * @inode - file containing the cluster
2083 * @lblk - logical block in the cluster
2084 *
2085 * Returns true if there's a pending reservation for the cluster in the
2086 * set of pending reservations, and false if not.
2087 */
2088 bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
2089 {
2090 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2091 struct ext4_inode_info *ei = EXT4_I(inode);
2092 bool ret;
2093
2094 read_lock(&ei->i_es_lock);
2095 ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
2096 read_unlock(&ei->i_es_lock);
2097
2098 return ret;
2099 }
2100
2101 /*
2102 * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
2103 * status tree, adding a pending reservation
2104 * where needed
2105 *
2106 * @inode - file containing the newly added block
2107 * @lblk - start logical block to be added
2108 * @len - length of blocks to be added
2109 * @lclu_allocated/end_allocated - indicates whether a physical cluster has
2110 * been allocated for the logical cluster
2111 * that contains the start/end block. Note that
2112 * end_allocated should always be set to false
2113 * if the start and the end block are in the
2114 * same cluster
2115 */
2116 void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
2117 ext4_lblk_t len, bool lclu_allocated,
2118 bool end_allocated)
2119 {
2120 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2121 struct extent_status newes;
2122 ext4_lblk_t end = lblk + len - 1;
2123 int err1 = 0, err2 = 0, err3 = 0;
2124 struct extent_status *es1 = NULL;
2125 struct extent_status *es2 = NULL;
2126 struct pending_reservation *pr1 = NULL;
2127 struct pending_reservation *pr2 = NULL;
2128
2129 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
2130 return;
2131
2132 es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
2133 lblk, len, inode->i_ino);
2134 if (!len)
2135 return;
2136
2137 WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
2138 end_allocated);
2139
2140 newes.es_lblk = lblk;
2141 newes.es_len = len;
2142 ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
2143 trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
2144 end_allocated);
2145
2146 ext4_es_insert_extent_check(inode, &newes);
2147
2148 retry:
2149 if (err1 && !es1)
2150 es1 = __es_alloc_extent(true);
2151 if ((err1 || err2) && !es2)
2152 es2 = __es_alloc_extent(true);
2153 if (err1 || err2 || err3 < 0) {
2154 if (lclu_allocated && !pr1)
2155 pr1 = __alloc_pending(true);
2156 if (end_allocated && !pr2)
2157 pr2 = __alloc_pending(true);
2158 }
2159 write_lock(&EXT4_I(inode)->i_es_lock);
2160
2161 err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
2162 if (err1 != 0)
2163 goto error;
2164 /* Free preallocated extent if it didn't get used. */
2165 if (es1) {
2166 if (!es1->es_len)
2167 __es_free_extent(es1);
2168 es1 = NULL;
2169 }
2170
2171 err2 = __es_insert_extent(inode, &newes, es2);
2172 if (err2 != 0)
2173 goto error;
2174 /* Free preallocated extent if it didn't get used. */
2175 if (es2) {
2176 if (!es2->es_len)
2177 __es_free_extent(es2);
2178 es2 = NULL;
2179 }
2180
2181 if (lclu_allocated) {
2182 err3 = __insert_pending(inode, lblk, &pr1);
2183 if (err3 < 0)
2184 goto error;
2185 if (pr1) {
2186 __free_pending(pr1);
2187 pr1 = NULL;
2188 }
2189 }
2190 if (end_allocated) {
2191 err3 = __insert_pending(inode, end, &pr2);
2192 if (err3 < 0)
2193 goto error;
2194 if (pr2) {
2195 __free_pending(pr2);
2196 pr2 = NULL;
2197 }
2198 }
2199 error:
2200 write_unlock(&EXT4_I(inode)->i_es_lock);
2201 if (err1 || err2 || err3 < 0)
2202 goto retry;
2203
2204 ext4_es_print_tree(inode);
2205 ext4_print_pending_tree(inode);
2206 return;
2207 }
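/*
 * The retry loop above follows the usual extents status preallocation
 * pattern: allocations that may sleep (__es_alloc_extent(true),
 * __alloc_pending(true)) are performed before taking i_es_lock, and if
 * __es_remove_extent(), __es_insert_extent() or __insert_pending() still
 * fail under the lock, the lock is dropped and the whole sequence is
 * retried with freshly preallocated structures.
 */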
2208
2209 /*
2210 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
2211 * reservations for a specified block range depending
2212 * upon the presence or absence of delayed blocks
2213 * outside the range within clusters at the ends of the
2214 * range
2215 *
2216 * @inode - file containing the range
2217 * @lblk - logical block defining the start of range
2218 * @len - length of range in blocks
2219 * @prealloc - preallocated pending entry
2220 *
2221 * Used after a newly allocated extent is added to the extents status tree.
2222 * Requires that the extents in the range have either written or unwritten
2223 * status. Must be called while holding i_es_lock. Returns the number of
2224 * pending reservations inserted, 0 if only removals (or no changes) were
2225 * made, and -ENOMEM on failure.
2226 */
2227 static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
2228 ext4_lblk_t len,
2229 struct pending_reservation **prealloc)
2230 {
2231 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2232 ext4_lblk_t end = lblk + len - 1;
2233 ext4_lblk_t first, last;
2234 bool f_del = false, l_del = false;
2235 int pendings = 0;
2236 int ret = 0;
2237
2238 if (len == 0)
2239 return 0;
2240
2241 /*
2242 * Two cases - block range within single cluster and block range
2243 * spanning two or more clusters. Note that a cluster belonging
2244 * to a range starting and/or ending on a cluster boundary is treated
2245 * as if it does not contain a delayed extent. The new range may
2246 * have allocated space for previously delayed blocks out to the
2247 * cluster boundary, requiring that any pre-existing pending
2248 * reservation be canceled. Because this code only looks at blocks
2249 * outside the range, it should revise pending reservations
2250 * correctly even if the extent represented by the range can't be
2251 * inserted in the extents status tree due to ENOSPC.
2252 */
2253
2254 if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
2255 first = EXT4_LBLK_CMASK(sbi, lblk);
2256 if (first != lblk)
2257 f_del = __es_scan_range(inode, &ext4_es_is_delayed,
2258 first, lblk - 1);
2259 if (f_del) {
2260 ret = __insert_pending(inode, first, prealloc);
2261 if (ret < 0)
2262 goto out;
2263 pendings += ret;
2264 } else {
2265 last = EXT4_LBLK_CMASK(sbi, end) +
2266 sbi->s_cluster_ratio - 1;
2267 if (last != end)
2268 l_del = __es_scan_range(inode,
2269 &ext4_es_is_delayed,
2270 end + 1, last);
2271 if (l_del) {
2272 ret = __insert_pending(inode, last, prealloc);
2273 if (ret < 0)
2274 goto out;
2275 pendings += ret;
2276 } else
2277 __remove_pending(inode, last);
2278 }
2279 } else {
2280 first = EXT4_LBLK_CMASK(sbi, lblk);
2281 if (first != lblk)
2282 f_del = __es_scan_range(inode, &ext4_es_is_delayed,
2283 first, lblk - 1);
2284 if (f_del) {
2285 ret = __insert_pending(inode, first, prealloc);
2286 if (ret < 0)
2287 goto out;
2288 pendings += ret;
2289 } else
2290 __remove_pending(inode, first);
2291
2292 last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
2293 if (last != end)
2294 l_del = __es_scan_range(inode, &ext4_es_is_delayed,
2295 end + 1, last);
2296 if (l_del) {
2297 ret = __insert_pending(inode, last, prealloc);
2298 if (ret < 0)
2299 goto out;
2300 pendings += ret;
2301 } else
2302 __remove_pending(inode, last);
2303 }
2304 out:
2305 return (ret < 0) ? ret : pendings;
2306 }
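/*
 * Worked example for the logic above, assuming a hypothetical cluster
 * ratio of 16 blocks per cluster: a written extent covering blocks 40..49
 * spans clusters 2 and 3, so the two-cluster branch is taken.  For the
 * first cluster, first = 32 and blocks 32..39 are scanned for delayed
 * extents outside the new range; if any are found a pending reservation
 * is inserted for that cluster, otherwise any existing one is removed.
 * For the last cluster, last = 63 and blocks 50..63 are handled the same
 * way.
 */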
2307