// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on a LRU list based on free
 * space recency.  Two passes are used: the first prioritizes discarding
 * extents and the second trims the bitmaps, which gives regions the best
 * opportunity to coalesce.  The block_groups are maintained on multiple lists
 * to allow for multiple passes with different discard filter requirements.  A
 * delayed work item is used to manage discarding with a timeout determined by
 * the maximum of the delay incurred by the iops rate limit, the delay incurred
 * by the byte rate limit, and the maximum delay of BTRFS_DISCARD_MAX_DELAY_MSEC.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a fully free block group
 * after forgetting it.  When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list.  Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence.  We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions.  This can cause
 * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
 * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap.  This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY		(120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY	(10ULL * NSEC_PER_SEC)

#define BTRFS_DISCARD_MIN_DELAY_MSEC	(1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC	(1000UL)
#define BTRFS_DISCARD_MAX_IOPS		(1000U)

/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
	0,
	BTRFS_ASYNC_DISCARD_MAX_FILTER,
	BTRFS_ASYNC_DISCARD_MIN_FILTER
};

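/* Return the discard list that @block_group is currently filed under. */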
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  const struct btrfs_block_group *block_group)
{
	return &discard_ctl->discard_list[block_group->discard_index];
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_fs_info *fs_info = container_of(discard_ctl,
						     struct btrfs_fs_info,
						     discard_ctl);

	return (!(fs_info->sb->s_flags & SB_RDONLY) &&
		test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}

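/*
 * Queue @block_group on its discard list, resetting its eligible time and
 * cursor if it is newly added or coming off the unused list.  Takes a block
 * group reference for the list if it was not already queued.  Caller must
 * hold discard_ctl->lock.
 */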
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				  struct btrfs_block_group *block_group)
{
	lockdep_assert_held(&discard_ctl->lock);

	if (list_empty(&block_group->discard_list) ||
	    block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
			block_group->discard_index = BTRFS_DISCARD_INDEX_START;
		block_group->discard_eligible_time = (ktime_get_ns() +
						      BTRFS_DISCARD_DELAY);
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	}
	if (list_empty(&block_group->discard_list))
		btrfs_get_block_group(block_group);

	list_move_tail(&block_group->discard_list,
		       get_discard_list(discard_ctl, block_group));
}

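/*
 * Locked wrapper around __add_to_discard_list() that only queues data block
 * groups and only while discard is running.
 */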
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	if (!btrfs_run_discard_work(discard_ctl))
		return;

	spin_lock(&discard_ctl->lock);
	__add_to_discard_list(discard_ctl, block_group);
	spin_unlock(&discard_ctl->lock);
}

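/*
 * Queue a now unused @block_group on the special unused discard list with the
 * shorter BTRFS_DISCARD_UNUSED_DELAY, taking a block group reference if it was
 * not already queued on a discard list.
 */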
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	bool queued;

	spin_lock(&discard_ctl->lock);

	queued = !list_empty(&block_group->discard_list);

	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	list_del_init(&block_group->discard_list);

	block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
	block_group->discard_eligible_time = (ktime_get_ns() +
					      BTRFS_DISCARD_UNUSED_DELAY);
	block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
	if (!queued)
		btrfs_get_block_group(block_group);
	list_add_tail(&block_group->discard_list,
		      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

	spin_unlock(&discard_ctl->lock);
}

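/*
 * Remove @block_group from any discard list and drop the list's reference.
 * Returns true if the block group was the one currently being discarded, in
 * which case discard_ctl->block_group is cleared as well.
 */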
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool running = false;
	bool queued = false;

	spin_lock(&discard_ctl->lock);

	if (block_group == discard_ctl->block_group) {
		running = true;
		discard_ctl->block_group = NULL;
	}

	block_group->discard_eligible_time = 0;
	queued = !list_empty(&block_group->discard_list);
	list_del_init(&block_group->discard_list);
	if (queued)
		btrfs_put_block_group(block_group);

	spin_unlock(&discard_ctl->lock);

	return running;
}

/*
 * Find the block_group that's up next for discarding.
 *
 * @discard_ctl: discard control
 * @now: current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding, checking the discard_eligible_time of each block_group.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *ret_block_group = NULL, *block_group;
	int i;

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		struct list_head *discard_list = &discard_ctl->discard_list[i];

		if (!list_empty(discard_list)) {
			block_group = list_first_entry(discard_list,
						       struct btrfs_block_group,
						       discard_list);

			if (!ret_block_group)
				ret_block_group = block_group;

			if (ret_block_group->discard_eligible_time < now)
				break;

			if (ret_block_group->discard_eligible_time >
			    block_group->discard_eligible_time)
				ret_block_group = block_group;
		}
	}

	return ret_block_group;
}

/*
 * Look up the next block group and set it for use.
 *
 * @discard_ctl: discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now: time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here.  Variables related to
 * @discard_state are reset here as needed (e.g. @discard_cursor).  @discard_state
 * and @discard_index are remembered as they may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index, u64 now)
{
	struct btrfs_block_group *block_group;

	spin_lock(&discard_ctl->lock);
again:
	block_group = find_next_block_group(discard_ctl, now);

	if (block_group && now >= block_group->discard_eligible_time) {
		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    block_group->used != 0) {
			if (btrfs_is_block_group_data_only(block_group)) {
				__add_to_discard_list(discard_ctl, block_group);
				/*
				 * The block group must have been moved to
				 * another discard list even if discard was
				 * disabled in the meantime or a transaction
				 * abort happened, otherwise we can end up in
				 * an infinite loop, always jumping to the
				 * 'again' label and getting this block group
				 * over and over in case there are no other
				 * block groups in the discard lists.
				 */
				ASSERT(block_group->discard_index !=
				       BTRFS_DISCARD_INDEX_UNUSED);
			} else {
				list_del_init(&block_group->discard_list);
				btrfs_put_block_group(block_group);
			}
			goto again;
		}
		if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			block_group->discard_cursor = block_group->start;
			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
		}
	}
	if (block_group) {
		btrfs_get_block_group(block_group);
		discard_ctl->block_group = block_group;
		*discard_state = block_group->discard_state;
		*discard_index = block_group->discard_index;
	}
	spin_unlock(&discard_ctl->lock);

	return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size.  Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
	    bytes >= discard_minlen[block_group->discard_index - 1]) {
		int i;

		remove_from_discard_list(discard_ctl, block_group);

		for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
		     i++) {
			if (bytes >= discard_minlen[i]) {
				block_group->discard_index = i;
				add_to_discard_list(discard_ctl, block_group);
				break;
			}
		}
	}
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index.  If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	block_group->discard_index++;
	if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
		block_group->discard_index = 1;
		return;
	}

	add_to_discard_list(discard_ctl, block_group);
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists.  If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (remove_from_discard_list(discard_ctl, block_group)) {
		cancel_delayed_work_sync(&discard_ctl->work);
		btrfs_discard_schedule_work(discard_ctl, true);
	}
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used == 0)
		add_to_discard_unused_list(discard_ctl, block_group);
	else
		add_to_discard_list(discard_ctl, block_group);

	if (!delayed_work_pending(&discard_ctl->work))
		btrfs_discard_schedule_work(discard_ctl, false);
}

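/*
 * Schedule the delayed discard work for the next eligible block group.  The
 * delay starts from delay_ms (derived from the iops limit), is raised to
 * satisfy kbps_limit based on the size of the previous discard, and is raised
 * again if the block group is not yet eligible.  With @override, time already
 * spent waiting since the previous discard is credited against the delay.
 * Caller must hold discard_ctl->lock.
 */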
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
					  u64 now, bool override)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_run_discard_work(discard_ctl))
		return;
	if (!override && delayed_work_pending(&discard_ctl->work))
		return;

	block_group = find_next_block_group(discard_ctl, now);
	if (block_group) {
		u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
		u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

		/*
		 * A single delayed workqueue item is responsible for
		 * discarding, so we can manage the bytes rate limit by keeping
		 * track of the previous discard.
		 */
		if (kbps_limit && discard_ctl->prev_discard) {
			u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
			u64 bps_delay = div64_u64(discard_ctl->prev_discard *
						  NSEC_PER_SEC, bps_limit);

			delay = max(delay, bps_delay);
		}

		/*
		 * This timeout is to hopefully prevent immediate discarding
		 * in a recently allocated block group.
		 */
		if (now < block_group->discard_eligible_time) {
			u64 bg_timeout = block_group->discard_eligible_time - now;

			delay = max(delay, bg_timeout);
		}

		if (override && discard_ctl->prev_discard) {
			u64 elapsed = now - discard_ctl->prev_discard_time;

			if (delay > elapsed)
				delay -= elapsed;
			else
				delay = 0;
		}

		mod_delayed_work(discard_ctl->discard_workers,
				 &discard_ctl->work, nsecs_to_jiffies(delay));
	}
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item.  @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit.  This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);
	__btrfs_discard_schedule_work(discard_ctl, now, override);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list.  If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path.  Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used == 0) {
		if (btrfs_is_free_space_trimmed(block_group))
			btrfs_mark_bg_unused(block_group);
		else
			add_to_discard_unused_list(discard_ctl, block_group);
	} else {
		btrfs_update_discard_index(discard_ctl, block_group);
	}
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region.  It does this in a two-pass fashion: first extents and second
 * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
	struct btrfs_discard_ctl *discard_ctl;
	struct btrfs_block_group *block_group;
	enum btrfs_discard_state discard_state;
	int discard_index = 0;
	u64 trimmed = 0;
	u64 minlen = 0;
	u64 now = ktime_get_ns();

	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

	block_group = peek_discard_list(discard_ctl, &discard_state,
					&discard_index, now);
	if (!block_group)
		return;
	if (!btrfs_run_discard_work(discard_ctl)) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		return;
	}
	if (now < block_group->discard_eligible_time) {
		spin_lock(&discard_ctl->lock);
		btrfs_put_block_group(block_group);
		discard_ctl->block_group = NULL;
		spin_unlock(&discard_ctl->lock);
		btrfs_discard_schedule_work(discard_ctl, false);
		return;
	}

	/* Perform discarding */
	minlen = discard_minlen[discard_index];

	if (discard_state == BTRFS_DISCARD_BITMAPS) {
		u64 maxlen = 0;

		/*
		 * Use the previous level's minimum discard length as the max
		 * length filter.  In the case something is added to make a
		 * region go beyond the max filter, the entire bitmap is set
		 * back to BTRFS_TRIM_STATE_UNTRIMMED.
		 */
		if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
			maxlen = discard_minlen[discard_index - 1];

		btrfs_trim_block_group_bitmaps(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, maxlen, true);
		discard_ctl->discard_bitmap_bytes += trimmed;
	} else {
		btrfs_trim_block_group_extents(block_group, &trimmed,
					       block_group->discard_cursor,
					       btrfs_block_group_end(block_group),
					       minlen, true);
		discard_ctl->discard_extent_bytes += trimmed;
	}

	/* Determine next steps for a block_group */
	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
		if (discard_state == BTRFS_DISCARD_BITMAPS) {
			btrfs_finish_discard_pass(discard_ctl, block_group);
		} else {
			block_group->discard_cursor = block_group->start;
			spin_lock(&discard_ctl->lock);
			if (block_group->discard_state !=
			    BTRFS_DISCARD_RESET_CURSOR)
				block_group->discard_state =
							BTRFS_DISCARD_BITMAPS;
			spin_unlock(&discard_ctl->lock);
		}
	}

	now = ktime_get_ns();
	spin_lock(&discard_ctl->lock);
	discard_ctl->prev_discard = trimmed;
	discard_ctl->prev_discard_time = now;
	btrfs_put_block_group(block_group);
	discard_ctl->block_group = NULL;
	__btrfs_discard_schedule_work(discard_ctl, now, false);
	spin_unlock(&discard_ctl->lock);
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay which is based on the total number of
 * discardable_extents.  Clamp this between the lower_limit (iops_limit or 1ms)
 * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	s32 discardable_extents;
	s64 discardable_bytes;
	u32 iops_limit;
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;

	discardable_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!discardable_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The following is to fix a potential -1 discrepancy that we're not
	 * sure how to reproduce.  But given that this is the only place that
	 * utilizes these numbers and this is only called from
	 * btrfs_finish_extent_commit() which is synchronized, we can correct
	 * here.
	 */
	if (discardable_extents < 0)
		atomic_add(-discardable_extents,
			   &discard_ctl->discardable_extents);

	discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (discardable_bytes < 0)
		atomic64_add(-discardable_bytes,
			     &discard_ctl->discardable_bytes);

	if (discardable_extents <= 0) {
		spin_unlock(&discard_ctl->lock);
		return;
	}

	iops_limit = READ_ONCE(discard_ctl->iops_limit);

	if (iops_limit) {
		delay = MSEC_PER_SEC / iops_limit;
	} else {
		/*
		 * Unset iops_limit means go as fast as possible, so allow a
		 * delay of 0.
		 */
		delay = 0;
		min_delay = 0;
	}

	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay_ms = delay;

	spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl.  It maintains a current
 * counter and a previous counter passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
	struct btrfs_free_space_ctl *ctl;
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
	    !btrfs_is_block_group_data_only(block_group))
		return;

	ctl = block_group->free_space_ctl;
	discard_ctl = &block_group->fs_info->discard_ctl;

	lockdep_assert_held(&ctl->tree_lock);
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed.  In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit.  This
 * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group, *next;

	spin_lock(&fs_info->unused_bgs_lock);
	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
				 bg_list) {
		list_del_init(&block_group->bg_list);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
		/*
		 * This put is for the get done by btrfs_mark_bg_unused.
		 * Queueing discard incremented it for discard's reference.
		 */
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path.  As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
	struct btrfs_block_group *block_group, *next;
	int i;

	spin_lock(&discard_ctl->lock);
	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
		list_for_each_entry_safe(block_group, next,
					 &discard_ctl->discard_list[i],
					 discard_list) {
			list_del_init(&block_group->discard_list);
			spin_unlock(&discard_ctl->lock);
			if (block_group->used == 0)
				btrfs_mark_bg_unused(block_group);
			spin_lock(&discard_ctl->lock);
			btrfs_put_block_group(block_group);
		}
	}
	spin_unlock(&discard_ctl->lock);
}

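/*
 * Start async discard at mount or remount time: punt the unused_bgs list to
 * the discard lists and allow the discard work to run.  If DISCARD_ASYNC is
 * not enabled, tear down any leftover discard state instead.
 */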
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_cleanup(fs_info);
		return;
	}

	btrfs_discard_punt_unused_bgs_list(fs_info);

	set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

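/* Stop async discard from issuing any further work. */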
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

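/* Initialize the discard control structure, its lists, work item and limits. */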
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	discard_ctl->prev_discard_time = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}

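/*
 * Stop the discard work, cancel any pending delayed work and empty the
 * discard lists, sending fully free block groups to the unused_bgs path.
 */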
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
	btrfs_discard_stop(fs_info);
	cancel_delayed_work_sync(&fs_info->discard_ctl.work);
	btrfs_discard_purge_list(&fs_info->discard_ctl);
}