1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/writeback.h>
14 #include <linux/compat.h>
15 #include <linux/slab.h>
16 #include <linux/btrfs.h>
17 #include <linux/uio.h>
18 #include <linux/iversion.h>
19 #include <linux/fsverity.h>
20 #include "ctree.h"
21 #include "direct-io.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "btrfs_inode.h"
25 #include "tree-log.h"
26 #include "locking.h"
27 #include "qgroup.h"
28 #include "compression.h"
29 #include "delalloc-space.h"
30 #include "reflink.h"
31 #include "subpage.h"
32 #include "fs.h"
33 #include "accessors.h"
34 #include "extent-tree.h"
35 #include "file-item.h"
36 #include "ioctl.h"
37 #include "file.h"
38 #include "super.h"
39 #include "print-tree.h"
40
41 /*
42 * Unlock folio after btrfs_file_write() is done with it.
43 */
44 static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
45 u64 pos, u64 copied)
46 {
47 u64 block_start = round_down(pos, fs_info->sectorsize);
48 u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
49
50 ASSERT(block_len <= U32_MAX);
51 /*
52 * Folio checked is some magic around finding folios that have been
53 * modified without going through btrfs_dirty_folio(). Clear it here.
54 * There should be no need to mark the pages accessed, as
55 * prepare_one_folio() should have already marked them accessed
56 * via find_or_create_page().
57 */
58 btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
59 folio_unlock(folio);
60 folio_put(folio);
61 }
62
63 /*
64 * After copy_folio_from_iter_atomic(), update the following things for delalloc:
65 * - Mark newly dirtied folio as DELALLOC in the io tree.
66 * Used to advise which range is to be written back.
67 * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
68 * - Update inode size for past EOF write
69 */
70 int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
71 size_t write_bytes, struct extent_state **cached, bool noreserve)
72 {
73 struct btrfs_fs_info *fs_info = inode->root->fs_info;
74 int ret = 0;
75 u64 num_bytes;
76 u64 start_pos;
77 u64 end_of_last_block;
78 u64 end_pos = pos + write_bytes;
79 loff_t isize = i_size_read(&inode->vfs_inode);
80 unsigned int extra_bits = 0;
81
82 if (write_bytes == 0)
83 return 0;
84
85 if (noreserve)
86 extra_bits |= EXTENT_NORESERVE;
87
88 start_pos = round_down(pos, fs_info->sectorsize);
89 num_bytes = round_up(write_bytes + pos - start_pos,
90 fs_info->sectorsize);
91 ASSERT(num_bytes <= U32_MAX);
92 ASSERT(folio_pos(folio) <= pos &&
93 folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
94
95 end_of_last_block = start_pos + num_bytes - 1;
96
97 /*
98 * The pages may already be dirty, so clear out the old accounting
99 * before we set things up properly.
100 */
101 clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
102 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
103 cached);
104
105 ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
106 extra_bits, cached);
107 if (ret)
108 return ret;
109
110 btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
111 btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
112 btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
113
114 /*
115 * We've only changed i_size in RAM and haven't updated the
116 * on-disk i_size. There is no need to log the inode at
117 * this time.
118 */
119 if (end_pos > isize)
120 i_size_write(&inode->vfs_inode, end_pos);
121 return 0;
122 }
123
124 /*
125 * this is very complex, but the basic idea is to drop all extents
126 * in the range start - end. hint_block is filled in with a block number
127 * that would be a good hint to the block allocator for this file.
128 *
129 * If an extent intersects the range but is not entirely inside the range
130 * it is either truncated or split. Anything entirely inside the range
131 * is deleted from the tree.
132 *
133 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
134 * to deal with that. We set the field 'bytes_found' of the arguments structure
135 * with the number of allocated bytes found in the target range, so that the
136 * caller can update the inode's number of bytes in an atomic way when
137 * replacing extents in a range to avoid races with stat(2).
138 */
139 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
140 struct btrfs_root *root, struct btrfs_inode *inode,
141 struct btrfs_drop_extents_args *args)
142 {
143 struct btrfs_fs_info *fs_info = root->fs_info;
144 struct extent_buffer *leaf;
145 struct btrfs_file_extent_item *fi;
146 struct btrfs_key key;
147 struct btrfs_key new_key;
148 u64 ino = btrfs_ino(inode);
149 u64 search_start = args->start;
150 u64 disk_bytenr = 0;
151 u64 num_bytes = 0;
152 u64 extent_offset = 0;
153 u64 extent_end = 0;
154 u64 last_end = args->start;
155 int del_nr = 0;
156 int del_slot = 0;
157 int extent_type;
158 int recow;
159 int ret;
160 int modify_tree = -1;
161 int update_refs;
162 int found = 0;
163 struct btrfs_path *path = args->path;
164
165 args->bytes_found = 0;
166 args->extent_inserted = false;
167
168 /* Must always have a path if ->replace_extent is true */
169 ASSERT(!(args->replace_extent && !args->path));
170
171 if (!path) {
172 path = btrfs_alloc_path();
173 if (!path) {
174 ret = -ENOMEM;
175 goto out;
176 }
177 }
178
179 if (args->drop_cache)
180 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
181
182 if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
183 modify_tree = 0;
184
185 update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
186 while (1) {
187 recow = 0;
188 ret = btrfs_lookup_file_extent(trans, root, path, ino,
189 search_start, modify_tree);
190 if (ret < 0)
191 break;
192 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
193 leaf = path->nodes[0];
194 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
195 if (key.objectid == ino &&
196 key.type == BTRFS_EXTENT_DATA_KEY)
197 path->slots[0]--;
198 }
199 ret = 0;
200 next_slot:
201 leaf = path->nodes[0];
202 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
203 if (WARN_ON(del_nr > 0)) {
204 btrfs_print_leaf(leaf);
205 ret = -EINVAL;
206 break;
207 }
208 ret = btrfs_next_leaf(root, path);
209 if (ret < 0)
210 break;
211 if (ret > 0) {
212 ret = 0;
213 break;
214 }
215 leaf = path->nodes[0];
216 recow = 1;
217 }
218
219 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
220
221 if (key.objectid > ino)
222 break;
223 if (WARN_ON_ONCE(key.objectid < ino) ||
224 key.type < BTRFS_EXTENT_DATA_KEY) {
225 ASSERT(del_nr == 0);
226 path->slots[0]++;
227 goto next_slot;
228 }
229 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
230 break;
231
232 fi = btrfs_item_ptr(leaf, path->slots[0],
233 struct btrfs_file_extent_item);
234 extent_type = btrfs_file_extent_type(leaf, fi);
235
236 if (extent_type == BTRFS_FILE_EXTENT_REG ||
237 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
238 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
239 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
240 extent_offset = btrfs_file_extent_offset(leaf, fi);
241 extent_end = key.offset +
242 btrfs_file_extent_num_bytes(leaf, fi);
243 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
244 extent_end = key.offset +
245 btrfs_file_extent_ram_bytes(leaf, fi);
246 } else {
247 /* can't happen */
248 BUG();
249 }
250
251 /*
252 * Don't skip extent items representing 0 byte lengths. They
253 * used to be created (due to a bug) when we hit an -ENOSPC
254 * condition while punching holes. So if we find one here, just
255 * ensure we delete it, otherwise we would insert a new file
256 * extent item with the same key (offset) as that zero length
257 * file extent item in the call to setup_items_for_insert()
258 * later in this function.
259 */
260 if (extent_end == key.offset && extent_end >= search_start) {
261 last_end = extent_end;
262 goto delete_extent_item;
263 }
264
265 if (extent_end <= search_start) {
266 path->slots[0]++;
267 goto next_slot;
268 }
269
270 found = 1;
271 search_start = max(key.offset, args->start);
272 if (recow || !modify_tree) {
273 modify_tree = -1;
274 btrfs_release_path(path);
275 continue;
276 }
277
278 /*
279 * | - range to drop - |
280 * | -------- extent -------- |
281 */
282 if (args->start > key.offset && args->end < extent_end) {
283 if (WARN_ON(del_nr > 0)) {
284 btrfs_print_leaf(leaf);
285 ret = -EINVAL;
286 break;
287 }
288 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
289 ret = -EOPNOTSUPP;
290 break;
291 }
292
293 memcpy(&new_key, &key, sizeof(new_key));
294 new_key.offset = args->start;
295 ret = btrfs_duplicate_item(trans, root, path,
296 &new_key);
297 if (ret == -EAGAIN) {
298 btrfs_release_path(path);
299 continue;
300 }
301 if (ret < 0)
302 break;
303
304 leaf = path->nodes[0];
305 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
306 struct btrfs_file_extent_item);
307 btrfs_set_file_extent_num_bytes(leaf, fi,
308 args->start - key.offset);
309
310 fi = btrfs_item_ptr(leaf, path->slots[0],
311 struct btrfs_file_extent_item);
312
313 extent_offset += args->start - key.offset;
314 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
315 btrfs_set_file_extent_num_bytes(leaf, fi,
316 extent_end - args->start);
317
318 if (update_refs && disk_bytenr > 0) {
319 struct btrfs_ref ref = {
320 .action = BTRFS_ADD_DELAYED_REF,
321 .bytenr = disk_bytenr,
322 .num_bytes = num_bytes,
323 .parent = 0,
324 .owning_root = btrfs_root_id(root),
325 .ref_root = btrfs_root_id(root),
326 };
327 btrfs_init_data_ref(&ref, new_key.objectid,
328 args->start - extent_offset,
329 0, false);
330 ret = btrfs_inc_extent_ref(trans, &ref);
331 if (ret) {
332 btrfs_abort_transaction(trans, ret);
333 break;
334 }
335 }
336 key.offset = args->start;
337 }
338 /*
339 * From here on out we will have actually dropped something, so
340 * last_end can be updated.
341 */
342 last_end = extent_end;
343
344 /*
345 * | ---- range to drop ----- |
346 * | -------- extent -------- |
347 */
348 if (args->start <= key.offset && args->end < extent_end) {
349 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
350 ret = -EOPNOTSUPP;
351 break;
352 }
353
354 memcpy(&new_key, &key, sizeof(new_key));
355 new_key.offset = args->end;
356 btrfs_set_item_key_safe(trans, path, &new_key);
357
358 extent_offset += args->end - key.offset;
359 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
360 btrfs_set_file_extent_num_bytes(leaf, fi,
361 extent_end - args->end);
362 if (update_refs && disk_bytenr > 0)
363 args->bytes_found += args->end - key.offset;
364 break;
365 }
366
367 search_start = extent_end;
368 /*
369 * | ---- range to drop ----- |
370 * | -------- extent -------- |
371 */
372 if (args->start > key.offset && args->end >= extent_end) {
373 if (WARN_ON(del_nr > 0)) {
374 btrfs_print_leaf(leaf);
375 ret = -EINVAL;
376 break;
377 }
378 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
379 ret = -EOPNOTSUPP;
380 break;
381 }
382
383 btrfs_set_file_extent_num_bytes(leaf, fi,
384 args->start - key.offset);
385 if (update_refs && disk_bytenr > 0)
386 args->bytes_found += extent_end - args->start;
387 if (args->end == extent_end)
388 break;
389
390 path->slots[0]++;
391 goto next_slot;
392 }
393
394 /*
395 * | ---- range to drop ----- |
396 * | ------ extent ------ |
397 */
398 if (args->start <= key.offset && args->end >= extent_end) {
399 delete_extent_item:
400 if (del_nr == 0) {
401 del_slot = path->slots[0];
402 del_nr = 1;
403 } else {
404 if (WARN_ON(del_slot + del_nr != path->slots[0])) {
405 btrfs_print_leaf(leaf);
406 ret = -EINVAL;
407 break;
408 }
409 del_nr++;
410 }
411
412 if (update_refs &&
413 extent_type == BTRFS_FILE_EXTENT_INLINE) {
414 args->bytes_found += extent_end - key.offset;
415 extent_end = ALIGN(extent_end,
416 fs_info->sectorsize);
417 } else if (update_refs && disk_bytenr > 0) {
418 struct btrfs_ref ref = {
419 .action = BTRFS_DROP_DELAYED_REF,
420 .bytenr = disk_bytenr,
421 .num_bytes = num_bytes,
422 .parent = 0,
423 .owning_root = btrfs_root_id(root),
424 .ref_root = btrfs_root_id(root),
425 };
426 btrfs_init_data_ref(&ref, key.objectid,
427 key.offset - extent_offset,
428 0, false);
429 ret = btrfs_free_extent(trans, &ref);
430 if (ret) {
431 btrfs_abort_transaction(trans, ret);
432 break;
433 }
434 args->bytes_found += extent_end - key.offset;
435 }
436
437 if (args->end == extent_end)
438 break;
439
440 if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
441 path->slots[0]++;
442 goto next_slot;
443 }
444
445 ret = btrfs_del_items(trans, root, path, del_slot,
446 del_nr);
447 if (ret) {
448 btrfs_abort_transaction(trans, ret);
449 break;
450 }
451
452 del_nr = 0;
453 del_slot = 0;
454
455 btrfs_release_path(path);
456 continue;
457 }
458
459 BUG();
460 }
461
462 if (!ret && del_nr > 0) {
463 /*
464 * Set path->slots[0] to the first slot, so that after the delete,
465 * if items are moved off from our leaf to its immediate left or
466 * right neighbor leaves, we end up with a correct and adjusted
467 * path->slots[0] for our insertion (if args->replace_extent).
468 */
469 path->slots[0] = del_slot;
470 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
471 if (ret)
472 btrfs_abort_transaction(trans, ret);
473 }
474
475 leaf = path->nodes[0];
476 /*
477 * If btrfs_del_items() was called, it might have deleted a leaf, in
478 * which case it unlocked our path, so check path->locks[0] matches a
479 * write lock.
480 */
481 if (!ret && args->replace_extent &&
482 path->locks[0] == BTRFS_WRITE_LOCK &&
483 btrfs_leaf_free_space(leaf) >=
484 sizeof(struct btrfs_item) + args->extent_item_size) {
485
486 key.objectid = ino;
487 key.type = BTRFS_EXTENT_DATA_KEY;
488 key.offset = args->start;
489 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
490 struct btrfs_key slot_key;
491
492 btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
493 if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
494 path->slots[0]++;
495 }
496 btrfs_setup_item_for_insert(trans, root, path, &key,
497 args->extent_item_size);
498 args->extent_inserted = true;
499 }
500
501 if (!args->path)
502 btrfs_free_path(path);
503 else if (!args->extent_inserted)
504 btrfs_release_path(path);
505 out:
506 args->drop_end = found ? min(args->end, last_end) : args->end;
507
508 return ret;
509 }
510
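/*
 * Helper for btrfs_mark_extent_written(): check whether the file extent item
 * at @slot of @leaf is a regular, uncompressed and unencrypted extent backed
 * by @bytenr with an offset consistent with @orig_offset, so that it can be
 * merged with a neighbouring extent. If it is, and it matches any range
 * constraint passed in through @start/@end, store its range in @start and
 * @end and return 1. Otherwise return 0.
 */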
511 static int extent_mergeable(struct extent_buffer *leaf, int slot,
512 u64 objectid, u64 bytenr, u64 orig_offset,
513 u64 *start, u64 *end)
514 {
515 struct btrfs_file_extent_item *fi;
516 struct btrfs_key key;
517 u64 extent_end;
518
519 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
520 return 0;
521
522 btrfs_item_key_to_cpu(leaf, &key, slot);
523 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
524 return 0;
525
526 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
527 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
528 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
529 btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
530 btrfs_file_extent_compression(leaf, fi) ||
531 btrfs_file_extent_encryption(leaf, fi) ||
532 btrfs_file_extent_other_encoding(leaf, fi))
533 return 0;
534
535 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
536 if ((*start && *start != key.offset) || (*end && *end != extent_end))
537 return 0;
538
539 *start = key.offset;
540 *end = extent_end;
541 return 1;
542 }
543
544 /*
545 * Mark the extent in the range start - end as written.
546 *
547 * This changes the extent type from 'pre-allocated' to 'regular'. If only
548 * part of the extent is marked as written, the extent will be split into
549 * two or three.
550 */
551 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
552 struct btrfs_inode *inode, u64 start, u64 end)
553 {
554 struct btrfs_root *root = inode->root;
555 struct extent_buffer *leaf;
556 struct btrfs_path *path;
557 struct btrfs_file_extent_item *fi;
558 struct btrfs_ref ref = { 0 };
559 struct btrfs_key key;
560 struct btrfs_key new_key;
561 u64 bytenr;
562 u64 num_bytes;
563 u64 extent_end;
564 u64 orig_offset;
565 u64 other_start;
566 u64 other_end;
567 u64 split;
568 int del_nr = 0;
569 int del_slot = 0;
570 int recow;
571 int ret = 0;
572 u64 ino = btrfs_ino(inode);
573
574 path = btrfs_alloc_path();
575 if (!path)
576 return -ENOMEM;
577 again:
578 recow = 0;
579 split = start;
580 key.objectid = ino;
581 key.type = BTRFS_EXTENT_DATA_KEY;
582 key.offset = split;
583
584 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
585 if (ret < 0)
586 goto out;
587 if (ret > 0 && path->slots[0] > 0)
588 path->slots[0]--;
589
590 leaf = path->nodes[0];
591 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
592 if (key.objectid != ino ||
593 key.type != BTRFS_EXTENT_DATA_KEY) {
594 ret = -EINVAL;
595 btrfs_abort_transaction(trans, ret);
596 goto out;
597 }
598 fi = btrfs_item_ptr(leaf, path->slots[0],
599 struct btrfs_file_extent_item);
600 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
601 ret = -EINVAL;
602 btrfs_abort_transaction(trans, ret);
603 goto out;
604 }
605 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
606 if (key.offset > start || extent_end < end) {
607 ret = -EINVAL;
608 btrfs_abort_transaction(trans, ret);
609 goto out;
610 }
611
612 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
613 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
614 orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
615 memcpy(&new_key, &key, sizeof(new_key));
616
617 if (start == key.offset && end < extent_end) {
618 other_start = 0;
619 other_end = start;
620 if (extent_mergeable(leaf, path->slots[0] - 1,
621 ino, bytenr, orig_offset,
622 &other_start, &other_end)) {
623 new_key.offset = end;
624 btrfs_set_item_key_safe(trans, path, &new_key);
625 fi = btrfs_item_ptr(leaf, path->slots[0],
626 struct btrfs_file_extent_item);
627 btrfs_set_file_extent_generation(leaf, fi,
628 trans->transid);
629 btrfs_set_file_extent_num_bytes(leaf, fi,
630 extent_end - end);
631 btrfs_set_file_extent_offset(leaf, fi,
632 end - orig_offset);
633 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
634 struct btrfs_file_extent_item);
635 btrfs_set_file_extent_generation(leaf, fi,
636 trans->transid);
637 btrfs_set_file_extent_num_bytes(leaf, fi,
638 end - other_start);
639 goto out;
640 }
641 }
642
643 if (start > key.offset && end == extent_end) {
644 other_start = end;
645 other_end = 0;
646 if (extent_mergeable(leaf, path->slots[0] + 1,
647 ino, bytenr, orig_offset,
648 &other_start, &other_end)) {
649 fi = btrfs_item_ptr(leaf, path->slots[0],
650 struct btrfs_file_extent_item);
651 btrfs_set_file_extent_num_bytes(leaf, fi,
652 start - key.offset);
653 btrfs_set_file_extent_generation(leaf, fi,
654 trans->transid);
655 path->slots[0]++;
656 new_key.offset = start;
657 btrfs_set_item_key_safe(trans, path, &new_key);
658
659 fi = btrfs_item_ptr(leaf, path->slots[0],
660 struct btrfs_file_extent_item);
661 btrfs_set_file_extent_generation(leaf, fi,
662 trans->transid);
663 btrfs_set_file_extent_num_bytes(leaf, fi,
664 other_end - start);
665 btrfs_set_file_extent_offset(leaf, fi,
666 start - orig_offset);
667 goto out;
668 }
669 }
670
671 while (start > key.offset || end < extent_end) {
672 if (key.offset == start)
673 split = end;
674
675 new_key.offset = split;
676 ret = btrfs_duplicate_item(trans, root, path, &new_key);
677 if (ret == -EAGAIN) {
678 btrfs_release_path(path);
679 goto again;
680 }
681 if (ret < 0) {
682 btrfs_abort_transaction(trans, ret);
683 goto out;
684 }
685
686 leaf = path->nodes[0];
687 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
688 struct btrfs_file_extent_item);
689 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
690 btrfs_set_file_extent_num_bytes(leaf, fi,
691 split - key.offset);
692
693 fi = btrfs_item_ptr(leaf, path->slots[0],
694 struct btrfs_file_extent_item);
695
696 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
697 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
698 btrfs_set_file_extent_num_bytes(leaf, fi,
699 extent_end - split);
700
701 ref.action = BTRFS_ADD_DELAYED_REF;
702 ref.bytenr = bytenr;
703 ref.num_bytes = num_bytes;
704 ref.parent = 0;
705 ref.owning_root = btrfs_root_id(root);
706 ref.ref_root = btrfs_root_id(root);
707 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
708 ret = btrfs_inc_extent_ref(trans, &ref);
709 if (ret) {
710 btrfs_abort_transaction(trans, ret);
711 goto out;
712 }
713
714 if (split == start) {
715 key.offset = start;
716 } else {
717 if (start != key.offset) {
718 ret = -EINVAL;
719 btrfs_abort_transaction(trans, ret);
720 goto out;
721 }
722 path->slots[0]--;
723 extent_end = end;
724 }
725 recow = 1;
726 }
727
728 other_start = end;
729 other_end = 0;
730
731 ref.action = BTRFS_DROP_DELAYED_REF;
732 ref.bytenr = bytenr;
733 ref.num_bytes = num_bytes;
734 ref.parent = 0;
735 ref.owning_root = btrfs_root_id(root);
736 ref.ref_root = btrfs_root_id(root);
737 btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);
738 if (extent_mergeable(leaf, path->slots[0] + 1,
739 ino, bytenr, orig_offset,
740 &other_start, &other_end)) {
741 if (recow) {
742 btrfs_release_path(path);
743 goto again;
744 }
745 extent_end = other_end;
746 del_slot = path->slots[0] + 1;
747 del_nr++;
748 ret = btrfs_free_extent(trans, &ref);
749 if (ret) {
750 btrfs_abort_transaction(trans, ret);
751 goto out;
752 }
753 }
754 other_start = 0;
755 other_end = start;
756 if (extent_mergeable(leaf, path->slots[0] - 1,
757 ino, bytenr, orig_offset,
758 &other_start, &other_end)) {
759 if (recow) {
760 btrfs_release_path(path);
761 goto again;
762 }
763 key.offset = other_start;
764 del_slot = path->slots[0];
765 del_nr++;
766 ret = btrfs_free_extent(trans, &ref);
767 if (ret) {
768 btrfs_abort_transaction(trans, ret);
769 goto out;
770 }
771 }
772 if (del_nr == 0) {
773 fi = btrfs_item_ptr(leaf, path->slots[0],
774 struct btrfs_file_extent_item);
775 btrfs_set_file_extent_type(leaf, fi,
776 BTRFS_FILE_EXTENT_REG);
777 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
778 } else {
779 fi = btrfs_item_ptr(leaf, del_slot - 1,
780 struct btrfs_file_extent_item);
781 btrfs_set_file_extent_type(leaf, fi,
782 BTRFS_FILE_EXTENT_REG);
783 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
784 btrfs_set_file_extent_num_bytes(leaf, fi,
785 extent_end - key.offset);
786
787 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
788 if (ret < 0) {
789 btrfs_abort_transaction(trans, ret);
790 goto out;
791 }
792 }
793 out:
794 btrfs_free_path(path);
795 return ret;
796 }
797
798 /*
799 * On error, return an unlocked folio and the error value.
800 * On success, return a locked folio and 0.
801 */
802 static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
803 u64 len, bool force_uptodate)
804 {
805 u64 clamp_start = max_t(u64, pos, folio_pos(folio));
806 u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
807 const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
808 int ret = 0;
809
810 if (folio_test_uptodate(folio))
811 return 0;
812
813 if (!force_uptodate &&
814 IS_ALIGNED(clamp_start, blocksize) &&
815 IS_ALIGNED(clamp_end, blocksize))
816 return 0;
817
818 ret = btrfs_read_folio(NULL, folio);
819 if (ret)
820 return ret;
821 folio_lock(folio);
822 if (!folio_test_uptodate(folio)) {
823 folio_unlock(folio);
824 return -EIO;
825 }
826
827 /*
828 * Since btrfs_read_folio() will unlock the folio before it returns,
829 * there is a window where btrfs_release_folio() can be called to
830 * release the page. Here we check both inode mapping and page
831 * private to make sure the page was not released.
832 *
833 * The private flag check is essential for subpage as we need to store
834 * an extra bitmap using folio private.
835 */
836 if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
837 folio_unlock(folio);
838 return -EAGAIN;
839 }
840 return 0;
841 }
842
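/*
 * Compute the gfp mask used to allocate folios for a buffered write. For
 * nowait writes, drop direct reclaim and use GFP_NOWAIT so the allocation
 * fails fast instead of blocking.
 */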
843 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
844 {
845 gfp_t gfp;
846
847 gfp = btrfs_alloc_write_mask(inode->i_mapping);
848 if (nowait) {
849 gfp &= ~__GFP_DIRECT_RECLAIM;
850 gfp |= GFP_NOWAIT;
851 }
852
853 return gfp;
854 }
855
856 /*
857 * Get folio into the page cache and lock it.
858 */
859 static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
860 loff_t pos, size_t write_bytes,
861 bool force_uptodate, bool nowait)
862 {
863 unsigned long index = pos >> PAGE_SHIFT;
864 gfp_t mask = get_prepare_gfp_flags(inode, nowait);
865 fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
866 struct folio *folio;
867 int ret = 0;
868
869 again:
870 folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
871 if (IS_ERR(folio)) {
872 if (nowait)
873 ret = -EAGAIN;
874 else
875 ret = PTR_ERR(folio);
876 return ret;
877 }
878 /* Only page sized folios are supported for now. */
879 ASSERT(folio_order(folio) == 0);
880 ret = set_folio_extent_mapped(folio);
881 if (ret < 0) {
882 folio_unlock(folio);
883 folio_put(folio);
884 return ret;
885 }
886 ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
887 if (ret) {
888 /* The folio is already unlocked. */
889 folio_put(folio);
890 if (!nowait && ret == -EAGAIN) {
891 ret = 0;
892 goto again;
893 }
894 return ret;
895 }
896 *folio_ret = folio;
897 return 0;
898 }
899
900 /*
901 * Locks the extent and properly waits for data=ordered extents to finish
902 * before allowing the folios to be modified if needed.
903 *
904 * Return:
905 * 1 - the extent is locked
906 * 0 - the extent is not locked, and everything is OK
907 * -EAGAIN - need to prepare the folios again
908 */
909 static noinline int
910 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
911 loff_t pos, size_t write_bytes,
912 u64 *lockstart, u64 *lockend, bool nowait,
913 struct extent_state **cached_state)
914 {
915 struct btrfs_fs_info *fs_info = inode->root->fs_info;
916 u64 start_pos;
917 u64 last_pos;
918 int ret = 0;
919
920 start_pos = round_down(pos, fs_info->sectorsize);
921 last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
922
923 if (start_pos < inode->vfs_inode.i_size) {
924 struct btrfs_ordered_extent *ordered;
925
926 if (nowait) {
927 if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
928 cached_state)) {
929 folio_unlock(folio);
930 folio_put(folio);
931 return -EAGAIN;
932 }
933 } else {
934 lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
935 }
936
937 ordered = btrfs_lookup_ordered_range(inode, start_pos,
938 last_pos - start_pos + 1);
939 if (ordered &&
940 ordered->file_offset + ordered->num_bytes > start_pos &&
941 ordered->file_offset <= last_pos) {
942 unlock_extent(&inode->io_tree, start_pos, last_pos,
943 cached_state);
944 folio_unlock(folio);
945 folio_put(folio);
946 btrfs_start_ordered_extent(ordered);
947 btrfs_put_ordered_extent(ordered);
948 return -EAGAIN;
949 }
950 if (ordered)
951 btrfs_put_ordered_extent(ordered);
952
953 *lockstart = start_pos;
954 *lockend = last_pos;
955 ret = 1;
956 }
957
958 /*
959 * We should be called after prepare_one_folio() which should have locked
960 * all pages in the range.
961 */
962 WARN_ON(!folio_test_locked(folio));
963
964 return ret;
965 }
966
967 /*
968 * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
969 *
970 * @pos: File offset.
971 * @write_bytes: The length to write, will be updated to the nocow writeable
972 * range.
973 *
974 * This function will flush ordered extents in the range to ensure proper
975 * nocow checks.
976 *
977 * Return:
978 * > 0 If we can nocow, and updates @write_bytes.
979 * 0 If we can't do a nocow write.
980 * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
981 * root is in progress.
982 * < 0 If an error happened.
983 *
984 * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
985 */
986 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
987 size_t *write_bytes, bool nowait)
988 {
989 struct btrfs_fs_info *fs_info = inode->root->fs_info;
990 struct btrfs_root *root = inode->root;
991 struct extent_state *cached_state = NULL;
992 u64 lockstart, lockend;
993 u64 num_bytes;
994 int ret;
995
996 if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
997 return 0;
998
999 if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1000 return -EAGAIN;
1001
1002 lockstart = round_down(pos, fs_info->sectorsize);
1003 lockend = round_up(pos + *write_bytes,
1004 fs_info->sectorsize) - 1;
1005 num_bytes = lockend - lockstart + 1;
1006
1007 if (nowait) {
1008 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1009 &cached_state)) {
1010 btrfs_drew_write_unlock(&root->snapshot_lock);
1011 return -EAGAIN;
1012 }
1013 } else {
1014 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1015 &cached_state);
1016 }
1017 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait);
1018 if (ret <= 0)
1019 btrfs_drew_write_unlock(&root->snapshot_lock);
1020 else
1021 *write_bytes = min_t(size_t, *write_bytes ,
1022 num_bytes - pos + lockstart);
1023 unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1024
1025 return ret;
1026 }
1027
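/* Release the snapshot lock taken by a successful btrfs_check_nocow_lock(). */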
1028 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1029 {
1030 btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1031 }
1032
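/*
 * Common checks before starting a write: bail out early for NOWAIT writes
 * that would require COW, remove privileges, update mtime/ctime and the
 * inode version, and expand a hole if the write starts beyond i_size.
 */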
1033 int btrfs_write_check(struct kiocb *iocb, size_t count)
1034 {
1035 struct file *file = iocb->ki_filp;
1036 struct inode *inode = file_inode(file);
1037 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1038 loff_t pos = iocb->ki_pos;
1039 int ret;
1040 loff_t oldsize;
1041
1042 /*
1043 * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1044 * prealloc flags, as without those flags we always have to COW. We will
1045 * later check if we can really do a NOCOW write into the target range
1046 * (using can_nocow_extent() at btrfs_get_blocks_direct_write()).
1047 */
1048 if ((iocb->ki_flags & IOCB_NOWAIT) &&
1049 !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1050 return -EAGAIN;
1051
1052 ret = file_remove_privs(file);
1053 if (ret)
1054 return ret;
1055
1056 /*
1057 * We reserve space for updating the inode when we reserve space for the
1058 * extent we are going to write, so we will get an ENOSPC error there if
1059 * we run out of space. We don't need to start yet another transaction to
1060 * update the inode as we will update it when we finish writing the data.
1061 */
1062 if (!IS_NOCMTIME(inode)) {
1063 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1064 inode_inc_iversion(inode);
1065 }
1066
1067 oldsize = i_size_read(inode);
1068 if (pos > oldsize) {
1069 /* Expand hole size to cover write data, preventing empty gap */
1070 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1071
1072 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1073 if (ret)
1074 return ret;
1075 }
1076
1077 return 0;
1078 }
1079
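/*
 * Buffered write path: for each page sized chunk of the iov, reserve data
 * and metadata space (or metadata only for NOCOW writes), prepare and lock
 * the folio, copy the user data into it and mark the range as delalloc with
 * btrfs_dirty_folio(). Reservations are trimmed or released again when the
 * copy is short or fails.
 */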
1080 ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
1081 {
1082 struct file *file = iocb->ki_filp;
1083 loff_t pos;
1084 struct inode *inode = file_inode(file);
1085 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1086 struct extent_changeset *data_reserved = NULL;
1087 u64 release_bytes = 0;
1088 u64 lockstart;
1089 u64 lockend;
1090 size_t num_written = 0;
1091 ssize_t ret;
1092 loff_t old_isize;
1093 unsigned int ilock_flags = 0;
1094 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1095 unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1096 bool only_release_metadata = false;
1097
1098 if (nowait)
1099 ilock_flags |= BTRFS_ILOCK_TRY;
1100
1101 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1102 if (ret < 0)
1103 return ret;
1104
1105 /*
1106 * We can only trust the isize with the inode lock held, or it can race
1107 * with other buffered writes and cause an incorrect call of
1108 * pagecache_isize_extended() to overwrite existing data.
1109 */
1110 old_isize = i_size_read(inode);
1111
1112 ret = generic_write_checks(iocb, i);
1113 if (ret <= 0)
1114 goto out;
1115
1116 ret = btrfs_write_check(iocb, ret);
1117 if (ret < 0)
1118 goto out;
1119
1120 pos = iocb->ki_pos;
1121 while (iov_iter_count(i) > 0) {
1122 struct extent_state *cached_state = NULL;
1123 size_t offset = offset_in_page(pos);
1124 size_t sector_offset;
1125 size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
1126 size_t reserve_bytes;
1127 size_t copied;
1128 size_t dirty_sectors;
1129 size_t num_sectors;
1130 struct folio *folio = NULL;
1131 int extents_locked;
1132 bool force_page_uptodate = false;
1133
1134 /*
1135 * Fault in pages before locking them in prepare_one_folio()
1136 * to avoid a recursive lock.
1137 */
1138 if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1139 ret = -EFAULT;
1140 break;
1141 }
1142
1143 only_release_metadata = false;
1144 sector_offset = pos & (fs_info->sectorsize - 1);
1145
1146 extent_changeset_release(data_reserved);
1147 ret = btrfs_check_data_free_space(BTRFS_I(inode),
1148 &data_reserved, pos,
1149 write_bytes, nowait);
1150 if (ret < 0) {
1151 int can_nocow;
1152
1153 if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1154 ret = -EAGAIN;
1155 break;
1156 }
1157
1158 /*
1159 * If we don't have to COW at the offset, reserve
1160 * metadata only. write_bytes may get smaller than
1161 * requested here.
1162 */
1163 can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1164 &write_bytes, nowait);
1165 if (can_nocow < 0)
1166 ret = can_nocow;
1167 if (can_nocow > 0)
1168 ret = 0;
1169 if (ret)
1170 break;
1171 only_release_metadata = true;
1172 }
1173
1174 reserve_bytes = round_up(write_bytes + sector_offset,
1175 fs_info->sectorsize);
1176 WARN_ON(reserve_bytes == 0);
1177 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1178 reserve_bytes,
1179 reserve_bytes, nowait);
1180 if (ret) {
1181 if (!only_release_metadata)
1182 btrfs_free_reserved_data_space(BTRFS_I(inode),
1183 data_reserved, pos,
1184 write_bytes);
1185 else
1186 btrfs_check_nocow_unlock(BTRFS_I(inode));
1187
1188 if (nowait && ret == -ENOSPC)
1189 ret = -EAGAIN;
1190 break;
1191 }
1192
1193 release_bytes = reserve_bytes;
1194 again:
1195 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1196 if (ret) {
1197 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1198 break;
1199 }
1200
1201 ret = prepare_one_folio(inode, &folio, pos, write_bytes,
1202 force_page_uptodate, false);
1203 if (ret) {
1204 btrfs_delalloc_release_extents(BTRFS_I(inode),
1205 reserve_bytes);
1206 break;
1207 }
1208
1209 extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
1210 folio, pos, write_bytes, &lockstart,
1211 &lockend, nowait, &cached_state);
1212 if (extents_locked < 0) {
1213 if (!nowait && extents_locked == -EAGAIN)
1214 goto again;
1215
1216 btrfs_delalloc_release_extents(BTRFS_I(inode),
1217 reserve_bytes);
1218 ret = extents_locked;
1219 break;
1220 }
1221
1222 copied = copy_folio_from_iter_atomic(folio,
1223 offset_in_folio(folio, pos), write_bytes, i);
1224 flush_dcache_folio(folio);
1225
1226 /*
1227 * If we get a partial write, we can end up with a partially
1228 * uptodate page. If the sector size < page size we can handle
1229 * it, but if it's not sector aligned it can cause a lot of
1230 * complexity, so make sure that doesn't happen by forcing a
1231 * retry of this copy.
1232 */
1233 if (unlikely(copied < write_bytes)) {
1234 if (!folio_test_uptodate(folio)) {
1235 iov_iter_revert(i, copied);
1236 copied = 0;
1237 }
1238 }
1239
1240 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1241 dirty_sectors = round_up(copied + sector_offset,
1242 fs_info->sectorsize);
1243 dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1244
1245 if (copied == 0) {
1246 force_page_uptodate = true;
1247 dirty_sectors = 0;
1248 } else {
1249 force_page_uptodate = false;
1250 }
1251
1252 if (num_sectors > dirty_sectors) {
1253 /* release everything except the sectors we dirtied */
1254 release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1255 if (only_release_metadata) {
1256 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1257 release_bytes, true);
1258 } else {
1259 u64 release_start = round_up(pos + copied,
1260 fs_info->sectorsize);
1261 btrfs_delalloc_release_space(BTRFS_I(inode),
1262 data_reserved, release_start,
1263 release_bytes, true);
1264 }
1265 }
1266
1267 release_bytes = round_up(copied + sector_offset,
1268 fs_info->sectorsize);
1269
1270 ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
1271 &cached_state, only_release_metadata);
1272
1273 /*
1274 * If we have not locked the extent range, because the range's
1275 * start offset is >= i_size, we might still have a non-NULL
1276 * cached extent state, acquired while marking the extent range
1277 * as delalloc through btrfs_dirty_folio(). Therefore free any
1278 * possible cached extent state to avoid a memory leak.
1279 */
1280 if (extents_locked)
1281 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1282 lockend, &cached_state);
1283 else
1284 free_extent_state(cached_state);
1285
1286 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1287 if (ret) {
1288 btrfs_drop_folio(fs_info, folio, pos, copied);
1289 break;
1290 }
1291
1292 release_bytes = 0;
1293 if (only_release_metadata)
1294 btrfs_check_nocow_unlock(BTRFS_I(inode));
1295
1296 btrfs_drop_folio(fs_info, folio, pos, copied);
1297
1298 cond_resched();
1299
1300 pos += copied;
1301 num_written += copied;
1302 }
1303
1304 if (release_bytes) {
1305 if (only_release_metadata) {
1306 btrfs_check_nocow_unlock(BTRFS_I(inode));
1307 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1308 release_bytes, true);
1309 } else {
1310 btrfs_delalloc_release_space(BTRFS_I(inode),
1311 data_reserved,
1312 round_down(pos, fs_info->sectorsize),
1313 release_bytes, true);
1314 }
1315 }
1316
1317 extent_changeset_free(data_reserved);
1318 if (num_written > 0) {
1319 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1320 iocb->ki_pos += num_written;
1321 }
1322 out:
1323 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1324 return num_written ? num_written : ret;
1325 }
1326
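/*
 * Write out data that is already encoded (for example compressed) as
 * described by @encoded. Partial encoded writes are not supported, so fail
 * with -EFBIG if generic_write_checks_count() would truncate the write.
 */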
1327 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1328 const struct btrfs_ioctl_encoded_io_args *encoded)
1329 {
1330 struct file *file = iocb->ki_filp;
1331 struct inode *inode = file_inode(file);
1332 loff_t count;
1333 ssize_t ret;
1334
1335 btrfs_inode_lock(BTRFS_I(inode), 0);
1336 count = encoded->len;
1337 ret = generic_write_checks_count(iocb, &count);
1338 if (ret == 0 && count != encoded->len) {
1339 /*
1340 * The write got truncated by generic_write_checks_count(). We
1341 * can't do a partial encoded write.
1342 */
1343 ret = -EFBIG;
1344 }
1345 if (ret || encoded->len == 0)
1346 goto out;
1347
1348 ret = btrfs_write_check(iocb, encoded->len);
1349 if (ret < 0)
1350 goto out;
1351
1352 ret = btrfs_do_encoded_write(iocb, from, encoded);
1353 out:
1354 btrfs_inode_unlock(BTRFS_I(inode), 0);
1355 return ret;
1356 }
1357
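/*
 * Entry point for the write paths: dispatch to the encoded, direct or
 * buffered write implementation, then handle O_SYNC/O_DSYNC semantics via
 * generic_write_sync().
 */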
1358 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1359 const struct btrfs_ioctl_encoded_io_args *encoded)
1360 {
1361 struct file *file = iocb->ki_filp;
1362 struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1363 ssize_t num_written, num_sync;
1364
1365 /*
1366 * If the fs flips read-only due to some supposedly impossible error,
1367 * even though we have opened the file as writable, we have to stop this
1368 * write operation to ensure consistency.
1369 */
1370 if (BTRFS_FS_ERROR(inode->root->fs_info))
1371 return -EROFS;
1372
1373 if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1374 return -EOPNOTSUPP;
1375
1376 if (encoded) {
1377 num_written = btrfs_encoded_write(iocb, from, encoded);
1378 num_sync = encoded->len;
1379 } else if (iocb->ki_flags & IOCB_DIRECT) {
1380 num_written = btrfs_direct_write(iocb, from);
1381 num_sync = num_written;
1382 } else {
1383 num_written = btrfs_buffered_write(iocb, from);
1384 num_sync = num_written;
1385 }
1386
1387 btrfs_set_inode_last_sub_trans(inode);
1388
1389 if (num_sync > 0) {
1390 num_sync = generic_write_sync(iocb, num_sync);
1391 if (num_sync < 0)
1392 num_written = num_sync;
1393 }
1394
1395 return num_written;
1396 }
1397
1398 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1399 {
1400 return btrfs_do_write_iter(iocb, from, NULL);
1401 }
1402
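/*
 * Called when the last reference to an open file is dropped: free the per
 * file private data and, if BTRFS_INODE_FLUSH_ON_CLOSE was set by a truncate
 * to zero size, flush any newly written data (see the comment below).
 */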
1403 int btrfs_release_file(struct inode *inode, struct file *filp)
1404 {
1405 struct btrfs_file_private *private = filp->private_data;
1406
1407 if (private) {
1408 kfree(private->filldir_buf);
1409 free_extent_state(private->llseek_cached_state);
1410 kfree(private);
1411 filp->private_data = NULL;
1412 }
1413
1414 /*
1415 * Set by setattr when we are about to truncate a file from a non-zero
1416 * size to a zero size. This tries to flush down new bytes that may
1417 * have been written if the application were using truncate to replace
1418 * a file in place.
1419 */
1420 if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1421 &BTRFS_I(inode)->runtime_flags))
1422 filemap_flush(inode->i_mapping);
1423 return 0;
1424 }
1425
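/*
 * Start writeback for the given range of the inode, batching the IO under a
 * block plug (see the comment below for why).
 */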
1426 static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
1427 {
1428 int ret;
1429 struct blk_plug plug;
1430
1431 /*
1432 * This is only called in fsync, which would do synchronous writes, so
1433 * a plug can merge adjacent IOs as much as possible. Especially in the
1434 * case of multiple disks using a raid profile, a large IO can be split
1435 * into several segments of stripe length (currently 64K).
1436 */
1437 blk_start_plug(&plug);
1438 ret = btrfs_fdatawrite_range(inode, start, end);
1439 blk_finish_plug(&plug);
1440
1441 return ret;
1442 }
1443
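/*
 * Return true if fsync can skip logging the inode: either the inode is
 * already in the current log with no new ordered extents, or it was not
 * modified since the last transaction commit (see the comment below for the
 * fast fsync caveat).
 */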
1444 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1445 {
1446 struct btrfs_inode *inode = ctx->inode;
1447 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1448
1449 if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
1450 list_empty(&ctx->ordered_extents))
1451 return true;
1452
1453 /*
1454 * If we are doing a fast fsync we cannot bail out if the inode's
1455 * last_trans is <= the last committed transaction, because we only
1456 * update the last_trans of the inode during ordered extent completion,
1457 * and for a fast fsync we don't wait for that, we only wait for the
1458 * writeback to complete.
1459 */
1460 if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1461 (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1462 list_empty(&ctx->ordered_extents)))
1463 return true;
1464
1465 return false;
1466 }
1467
1468 /*
1469 * fsync call for both files and directories. This logs the inode into
1470 * the tree log instead of forcing full commits whenever possible.
1471 *
1472 * It needs to call filemap_fdatawait so that all ordered extent updates
1473 * in the metadata btree are up to date for copying to the log.
1474 *
1475 * It drops the inode mutex before doing the tree log commit. This is an
1476 * important optimization for directories because holding the mutex prevents
1477 * new operations on the dir while we write to disk.
1478 */
1479 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1480 {
1481 struct dentry *dentry = file_dentry(file);
1482 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
1483 struct btrfs_root *root = inode->root;
1484 struct btrfs_fs_info *fs_info = root->fs_info;
1485 struct btrfs_trans_handle *trans;
1486 struct btrfs_log_ctx ctx;
1487 int ret = 0, err;
1488 u64 len;
1489 bool full_sync;
1490 bool skip_ilock = false;
1491
1492 if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
1493 skip_ilock = true;
1494 current->journal_info = NULL;
1495 btrfs_assert_inode_locked(inode);
1496 }
1497
1498 trace_btrfs_sync_file(file, datasync);
1499
1500 btrfs_init_log_ctx(&ctx, inode);
1501
1502 /*
1503 * Always set the range to a full range, otherwise we can get into
1504 * several problems, from missing file extent items to represent holes
1505 * when not using the NO_HOLES feature, to log tree corruption due to
1506 * races between hole detection during logging and completion of ordered
1507 * extents outside the range, to missing checksums due to ordered extents
1508 * for which we flushed only a subset of their pages.
1509 */
1510 start = 0;
1511 end = LLONG_MAX;
1512 len = (u64)LLONG_MAX + 1;
1513
1514 /*
1515 * We write the dirty pages in the range and wait until they complete
1516 * outside of the ->i_mutex. This way multiple tasks can flush the
1517 * dirty pages concurrently, improving performance. See
1518 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1519 */
1520 ret = start_ordered_ops(inode, start, end);
1521 if (ret)
1522 goto out;
1523
1524 if (skip_ilock)
1525 down_write(&inode->i_mmap_lock);
1526 else
1527 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
1528
1529 atomic_inc(&root->log_batch);
1530
1531 /*
1532 * Before we acquired the inode's lock and the mmap lock, someone may
1533 * have dirtied more pages in the target range. We need to make sure
1534 * that writeback for any such pages does not start while we are logging
1535 * the inode, because if it does, any of the following might happen when
1536 * we are not doing a full inode sync:
1537 *
1538 * 1) We log an extent after its writeback finishes but before its
1539 * checksums are added to the csum tree, leading to -EIO errors
1540 * when attempting to read the extent after a log replay.
1541 *
1542 * 2) We can end up logging an extent before its writeback finishes.
1543 * Therefore after the log replay we will have a file extent item
1544 * pointing to an unwritten extent (and no data checksums as well).
1545 *
1546 * So trigger writeback for any eventual new dirty pages and then we
1547 * wait for all ordered extents to complete below.
1548 */
1549 ret = start_ordered_ops(inode, start, end);
1550 if (ret) {
1551 if (skip_ilock)
1552 up_write(&inode->i_mmap_lock);
1553 else
1554 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1555 goto out;
1556 }
1557
1558 /*
1559 * Always check for the full sync flag while holding the inode's lock,
1560 * to avoid races with other tasks. The flag must be either set all the
1561 * time during logging or always off all the time while logging.
1562 * We check the flag here after starting delalloc above, because when
1563 * running delalloc the full sync flag may be set if we need to drop
1564 * extra extent map ranges due to temporary memory allocation failures.
1565 */
1566 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1567
1568 /*
1569 * We have to do this here to avoid the priority inversion of waiting on
1570 * IO of a lower priority task while holding a transaction open.
1571 *
1572 * For a full fsync we wait for the ordered extents to complete while
1573 * for a fast fsync we wait just for writeback to complete, and then
1574 * attach the ordered extents to the transaction so that a transaction
1575 * commit waits for their completion, to avoid data loss if we fsync,
1576 * the current transaction commits before the ordered extents complete
1577 * and a power failure happens right after that.
1578 *
1579 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1580 * logical address recorded in the ordered extent may change. We need
1581 * to wait for the IO to stabilize the logical address.
1582 */
1583 if (full_sync || btrfs_is_zoned(fs_info)) {
1584 ret = btrfs_wait_ordered_range(inode, start, len);
1585 clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
1586 } else {
1587 /*
1588 * Get our ordered extents as soon as possible to avoid doing
1589 * checksum lookups in the csum tree, and use instead the
1590 * checksums attached to the ordered extents.
1591 */
1592 btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
1593 ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
1594 if (ret)
1595 goto out_release_extents;
1596
1597 /*
1598 * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1599 * starting and waiting for writeback, because for buffered IO
1600 * it may have been set during the end IO callback
1601 * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1602 * case an error happened and we need to wait for ordered
1603 * extents to complete so that any extent maps that point to
1604 * unwritten locations are dropped and we don't log them.
1605 */
1606 if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
1607 ret = btrfs_wait_ordered_range(inode, start, len);
1608 }
1609
1610 if (ret)
1611 goto out_release_extents;
1612
1613 atomic_inc(&root->log_batch);
1614
1615 if (skip_inode_logging(&ctx)) {
1616 /*
1617 * We've had everything committed since the last time we were
1618 * modified so clear this flag in case it was set for whatever
1619 * reason, it's no longer relevant.
1620 */
1621 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
1622 /*
1623 * An ordered extent might have started before and completed
1624 * already with io errors, in which case the inode was not
1625 * updated and we end up here. So check the inode's mapping
1626 * for any errors that might have happened since we last
1627 * called fsync.
1628 */
1629 ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
1630 goto out_release_extents;
1631 }
1632
1633 btrfs_init_log_ctx_scratch_eb(&ctx);
1634
1635 /*
1636 * We use start here because we will need to wait on the IO to complete
1637 * in btrfs_sync_log, which could require joining a transaction (for
1638 * example checking cross references in the nocow path). If we use join
1639 * here we could get into a situation where we're waiting on IO to
1640 * happen that is blocked on a transaction trying to commit. With start
1641 * we inc the extwriter counter, so we wait for all extwriters to exit
1642 * before we start blocking joiners. This comment is to keep somebody
1643 * from thinking they are super smart and changing this to
1644 * btrfs_join_transaction *cough*Josef*cough*.
1645 */
1646 trans = btrfs_start_transaction(root, 0);
1647 if (IS_ERR(trans)) {
1648 ret = PTR_ERR(trans);
1649 goto out_release_extents;
1650 }
1651 trans->in_fsync = true;
1652
1653 ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1654 /*
1655 * Scratch eb no longer needed, release before syncing log or commit
1656 * transaction, to avoid holding unnecessary memory during such long
1657 * operations.
1658 */
1659 if (ctx.scratch_eb) {
1660 free_extent_buffer(ctx.scratch_eb);
1661 ctx.scratch_eb = NULL;
1662 }
1663 btrfs_release_log_ctx_extents(&ctx);
1664 if (ret < 0) {
1665 /* Fallthrough and commit/free transaction. */
1666 ret = BTRFS_LOG_FORCE_COMMIT;
1667 }
1668
1669 /* we've logged all the items and now have a consistent
1670 * version of the file in the log. It is possible that
1671 * someone will come in and modify the file, but that's
1672 * fine because the log is consistent on disk, and we
1673 * have references to all of the file's extents
1674 *
1675 * It is possible that someone will come in and log the
1676 * file again, but that will end up using the synchronization
1677 * inside btrfs_sync_log to keep things safe.
1678 */
1679 if (skip_ilock)
1680 up_write(&inode->i_mmap_lock);
1681 else
1682 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1683
1684 if (ret == BTRFS_NO_LOG_SYNC) {
1685 ret = btrfs_end_transaction(trans);
1686 goto out;
1687 }
1688
1689 /* We successfully logged the inode, attempt to sync the log. */
1690 if (!ret) {
1691 ret = btrfs_sync_log(trans, root, &ctx);
1692 if (!ret) {
1693 ret = btrfs_end_transaction(trans);
1694 goto out;
1695 }
1696 }
1697
1698 /*
1699 * At this point we need to commit the transaction because we had
1700 * btrfs_need_log_full_commit() or some other error.
1701 *
1702 * If we didn't do a full sync we have to stop the trans handle, wait on
1703 * the ordered extents, start it again and commit the transaction. If
1704 * we attempt to wait on the ordered extents here we could deadlock with
1705 * something like fallocate() that is holding the extent lock trying to
1706 * start a transaction while some other thread is trying to commit the
1707 * transaction while we (fsync) are currently holding the transaction
1708 * open.
1709 */
1710 if (!full_sync) {
1711 ret = btrfs_end_transaction(trans);
1712 if (ret)
1713 goto out;
1714 ret = btrfs_wait_ordered_range(inode, start, len);
1715 if (ret)
1716 goto out;
1717
1718 /*
1719 * This is safe to use here because we're only interested in
1720 * making sure the transaction that had the ordered extents is
1721 * committed. We aren't waiting on anything past this point,
1722 * we're purely getting the transaction and committing it.
1723 */
1724 trans = btrfs_attach_transaction_barrier(root);
1725 if (IS_ERR(trans)) {
1726 ret = PTR_ERR(trans);
1727
1728 /*
1729 * We committed the transaction and there's no currently
1730 * running transaction, this means everything we care
1731 * about made it to disk and we are done.
1732 */
1733 if (ret == -ENOENT)
1734 ret = 0;
1735 goto out;
1736 }
1737 }
1738
1739 ret = btrfs_commit_transaction(trans);
1740 out:
1741 free_extent_buffer(ctx.scratch_eb);
1742 ASSERT(list_empty(&ctx.list));
1743 ASSERT(list_empty(&ctx.conflict_inodes));
1744 err = file_check_and_advance_wb_err(file);
1745 if (!ret)
1746 ret = err;
1747 return ret > 0 ? -EIO : ret;
1748
1749 out_release_extents:
1750 btrfs_release_log_ctx_extents(&ctx);
1751 if (skip_ilock)
1752 up_write(&inode->i_mmap_lock);
1753 else
1754 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
1755 goto out;
1756 }
1757
1758 /*
1759 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
1760 * called from a page fault handler when a page is first dirtied. Hence we must
1761 * be careful to check for EOF conditions here. We set the page up correctly
1762 * for a written page which means we get ENOSPC checking when writing into
1763 * holes and correct delalloc and unwritten extent mapping on filesystems that
1764 * support these features.
1765 *
1766 * We are not allowed to take the i_mutex here so we have to play games to
1767 * protect against truncate races as the page could now be beyond EOF. Because
1768 * truncate_setsize() writes the inode size before removing pages, once we have
1769 * the page lock we can determine safely if the page is beyond EOF. If it is not
1770 * beyond EOF, then the page is guaranteed safe against truncation until we
1771 * unlock the page.
1772 */
1773 static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
1774 {
1775 struct page *page = vmf->page;
1776 struct folio *folio = page_folio(page);
1777 struct inode *inode = file_inode(vmf->vma->vm_file);
1778 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1779 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1780 struct btrfs_ordered_extent *ordered;
1781 struct extent_state *cached_state = NULL;
1782 struct extent_changeset *data_reserved = NULL;
1783 unsigned long zero_start;
1784 loff_t size;
1785 size_t fsize = folio_size(folio);
1786 vm_fault_t ret;
1787 int ret2;
1788 int reserved = 0;
1789 u64 reserved_space;
1790 u64 page_start;
1791 u64 page_end;
1792 u64 end;
1793
1794 ASSERT(folio_order(folio) == 0);
1795
1796 reserved_space = fsize;
1797
1798 sb_start_pagefault(inode->i_sb);
1799 page_start = folio_pos(folio);
1800 page_end = page_start + folio_size(folio) - 1;
1801 end = page_end;
1802
1803 /*
1804 * Reserving delalloc space after obtaining the page lock can lead to
1805 * deadlock. For example, if a dirty page is locked by this function
1806 * and the call to btrfs_delalloc_reserve_space() ends up triggering
1807 * dirty page write out, then the btrfs_writepages() function could
1808 * end up waiting indefinitely to get a lock on the page currently
1809 * being processed by btrfs_page_mkwrite() function.
1810 */
1811 ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
1812 page_start, reserved_space);
1813 if (!ret2) {
1814 ret2 = file_update_time(vmf->vma->vm_file);
1815 reserved = 1;
1816 }
1817 if (ret2) {
1818 ret = vmf_error(ret2);
1819 if (reserved)
1820 goto out;
1821 goto out_noreserve;
1822 }
1823
1824 /* Make the VM retry the fault. */
1825 ret = VM_FAULT_NOPAGE;
1826 again:
1827 down_read(&BTRFS_I(inode)->i_mmap_lock);
1828 folio_lock(folio);
1829 size = i_size_read(inode);
1830
1831 if ((folio->mapping != inode->i_mapping) ||
1832 (page_start >= size)) {
1833 /* Page got truncated out from underneath us. */
1834 goto out_unlock;
1835 }
1836 folio_wait_writeback(folio);
1837
1838 lock_extent(io_tree, page_start, page_end, &cached_state);
1839 ret2 = set_folio_extent_mapped(folio);
1840 if (ret2 < 0) {
1841 ret = vmf_error(ret2);
1842 unlock_extent(io_tree, page_start, page_end, &cached_state);
1843 goto out_unlock;
1844 }
1845
1846 /*
1847 * We can't set the delalloc bits if there are pending ordered
1848 * extents. Drop our locks and wait for them to finish.
1849 */
1850 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize);
1851 if (ordered) {
1852 unlock_extent(io_tree, page_start, page_end, &cached_state);
1853 folio_unlock(folio);
1854 up_read(&BTRFS_I(inode)->i_mmap_lock);
1855 btrfs_start_ordered_extent(ordered);
1856 btrfs_put_ordered_extent(ordered);
1857 goto again;
1858 }
1859
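	/*
	 * If this is the folio containing i_size, we only need delalloc space
	 * for the part of the folio that is inside the file, so trim the
	 * reservation to the sector aligned end of the file and release the
	 * rest of the space reserved above.
	 */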
1860 if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
1861 reserved_space = round_up(size - page_start, fs_info->sectorsize);
1862 if (reserved_space < fsize) {
1863 end = page_start + reserved_space - 1;
1864 btrfs_delalloc_release_space(BTRFS_I(inode),
1865 data_reserved, page_start,
1866 fsize - reserved_space, true);
1867 }
1868 }
1869
1870 /*
1871 	 * page_mkwrite gets called when the page is first dirtied after it's
1872 	 * faulted in, but write(2) could also dirty a page and set delalloc
1873 	 * bits. Thus in this case, for space accounting reasons, we still need
1874 	 * to clear any delalloc bits within this page range since we have to
1875 	 * reserve data and metadata space before lock_page() (see above comments).
1876 */
1877 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
1878 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1879 EXTENT_DEFRAG, &cached_state);
1880
1881 ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
1882 &cached_state);
1883 if (ret2) {
1884 unlock_extent(io_tree, page_start, page_end, &cached_state);
1885 ret = VM_FAULT_SIGBUS;
1886 goto out_unlock;
1887 }
1888
1889 /* Page is wholly or partially inside EOF. */
1890 if (page_start + folio_size(folio) > size)
1891 zero_start = offset_in_folio(folio, size);
1892 else
1893 zero_start = fsize;
1894
1895 if (zero_start != fsize)
1896 folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
1897
1898 btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
1899 btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
1900 btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
1901
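	/*
	 * Record that the inode was modified in the current log transaction so
	 * a later fsync can tell it has changes that still need to be logged.
	 */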
1902 btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
1903
1904 unlock_extent(io_tree, page_start, page_end, &cached_state);
1905 up_read(&BTRFS_I(inode)->i_mmap_lock);
1906
1907 btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
1908 sb_end_pagefault(inode->i_sb);
1909 extent_changeset_free(data_reserved);
1910 return VM_FAULT_LOCKED;
1911
1912 out_unlock:
1913 folio_unlock(folio);
1914 up_read(&BTRFS_I(inode)->i_mmap_lock);
1915 out:
1916 btrfs_delalloc_release_extents(BTRFS_I(inode), fsize);
1917 btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
1918 reserved_space, (ret != 0));
1919 out_noreserve:
1920 sb_end_pagefault(inode->i_sb);
1921 extent_changeset_free(data_reserved);
1922 return ret;
1923 }
1924
1925 static const struct vm_operations_struct btrfs_file_vm_ops = {
1926 .fault = filemap_fault,
1927 .map_pages = filemap_map_pages,
1928 .page_mkwrite = btrfs_page_mkwrite,
1929 };
1930
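/*
 * mmap() support: faults are served by the generic filemap helpers and the
 * first write to a mapped page goes through btrfs_page_mkwrite() above. A
 * mapping without a read_folio operation cannot be faulted in, so reject it.
 */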
1931 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1932 {
1933 struct address_space *mapping = filp->f_mapping;
1934
1935 if (!mapping->a_ops->read_folio)
1936 return -ENOEXEC;
1937
1938 file_accessed(filp);
1939 vma->vm_ops = &btrfs_file_vm_ops;
1940
1941 return 0;
1942 }
1943
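/*
 * Return 1 if the file extent item at @slot is a hole (a regular extent with
 * a zero disk_bytenr) that either ends at @start or starts at @end, i.e. it
 * can be merged with a new hole covering the range [start, end). Return 0
 * otherwise.
 */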
1944 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
1945 int slot, u64 start, u64 end)
1946 {
1947 struct btrfs_file_extent_item *fi;
1948 struct btrfs_key key;
1949
1950 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1951 return 0;
1952
1953 btrfs_item_key_to_cpu(leaf, &key, slot);
1954 if (key.objectid != btrfs_ino(inode) ||
1955 key.type != BTRFS_EXTENT_DATA_KEY)
1956 return 0;
1957
1958 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1959
1960 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
1961 return 0;
1962
1963 if (btrfs_file_extent_disk_bytenr(leaf, fi))
1964 return 0;
1965
1966 if (key.offset == end)
1967 return 1;
1968 if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
1969 return 1;
1970 return 0;
1971 }
1972
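/*
 * Make sure the range [offset, end) is represented by a hole: extend an
 * adjacent hole extent item if possible, otherwise insert a new one. With the
 * NO_HOLES incompat feature no item is inserted, but the extent maps are
 * still updated so the hole is visible to the fast fsync path.
 */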
1973 static int fill_holes(struct btrfs_trans_handle *trans,
1974 struct btrfs_inode *inode,
1975 struct btrfs_path *path, u64 offset, u64 end)
1976 {
1977 struct btrfs_fs_info *fs_info = trans->fs_info;
1978 struct btrfs_root *root = inode->root;
1979 struct extent_buffer *leaf;
1980 struct btrfs_file_extent_item *fi;
1981 struct extent_map *hole_em;
1982 struct btrfs_key key;
1983 int ret;
1984
1985 if (btrfs_fs_incompat(fs_info, NO_HOLES))
1986 goto out;
1987
1988 key.objectid = btrfs_ino(inode);
1989 key.type = BTRFS_EXTENT_DATA_KEY;
1990 key.offset = offset;
1991
1992 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1993 if (ret <= 0) {
1994 /*
1995 * We should have dropped this offset, so if we find it then
1996 * something has gone horribly wrong.
1997 */
1998 if (ret == 0)
1999 ret = -EINVAL;
2000 return ret;
2001 }
2002
2003 leaf = path->nodes[0];
2004 if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2005 u64 num_bytes;
2006
2007 path->slots[0]--;
2008 fi = btrfs_item_ptr(leaf, path->slots[0],
2009 struct btrfs_file_extent_item);
2010 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2011 end - offset;
2012 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2013 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2014 btrfs_set_file_extent_offset(leaf, fi, 0);
2015 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2016 goto out;
2017 }
2018
2019 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2020 u64 num_bytes;
2021
2022 key.offset = offset;
2023 btrfs_set_item_key_safe(trans, path, &key);
2024 fi = btrfs_item_ptr(leaf, path->slots[0],
2025 struct btrfs_file_extent_item);
2026 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2027 offset;
2028 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2029 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2030 btrfs_set_file_extent_offset(leaf, fi, 0);
2031 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2032 goto out;
2033 }
2034 btrfs_release_path(path);
2035
2036 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2037 end - offset);
2038 if (ret)
2039 return ret;
2040
2041 out:
2042 btrfs_release_path(path);
2043
2044 hole_em = alloc_extent_map();
2045 if (!hole_em) {
2046 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2047 btrfs_set_inode_full_sync(inode);
2048 } else {
2049 hole_em->start = offset;
2050 hole_em->len = end - offset;
2051 hole_em->ram_bytes = hole_em->len;
2052
2053 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
2054 hole_em->disk_num_bytes = 0;
2055 hole_em->generation = trans->transid;
2056
2057 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2058 free_extent_map(hole_em);
2059 if (ret)
2060 btrfs_set_inode_full_sync(inode);
2061 }
2062
2063 return 0;
2064 }
2065
2066 /*
2067  * Find a hole extent on the given inode and change start/len to the end of
2068  * the hole extent (a hole/vacuum extent whose em->start <= start &&
2069  * em->start + em->len > start).
2070 * When a hole extent is found, return 1 and modify start/len.
2071 */
2072 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2073 {
2074 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2075 struct extent_map *em;
2076 int ret = 0;
2077
2078 em = btrfs_get_extent(inode, NULL,
2079 round_down(*start, fs_info->sectorsize),
2080 round_up(*len, fs_info->sectorsize));
2081 if (IS_ERR(em))
2082 return PTR_ERR(em);
2083
2084 	/* Hole or vacuum extent (the latter only exists in no-holes mode). */
2085 if (em->disk_bytenr == EXTENT_MAP_HOLE) {
2086 ret = 1;
2087 *len = em->start + em->len > *start + *len ?
2088 0 : *start + *len - em->start - em->len;
2089 *start = em->start + em->len;
2090 }
2091 free_extent_map(em);
2092 return ret;
2093 }
2094
2095 static void btrfs_punch_hole_lock_range(struct inode *inode,
2096 const u64 lockstart,
2097 const u64 lockend,
2098 struct extent_state **cached_state)
2099 {
2100 /*
2101 	 * For the subpage case, if the range is not at a page boundary, we could
2102 	 * have pages at the leading/trailing part of the range.
2103 	 * This could lead to an infinite loop since filemap_range_has_page()
2104 	 * will always return true.
2105 * So here we need to do extra page alignment for
2106 * filemap_range_has_page().
2107 *
2108 * And do not decrease page_lockend right now, as it can be 0.
2109 */
2110 const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2111 const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE);
2112
2113 while (1) {
2114 truncate_pagecache_range(inode, lockstart, lockend);
2115
2116 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2117 cached_state);
2118 /* The same page or adjacent pages. */
2119 if (page_lockend <= page_lockstart)
2120 break;
2121 /*
2122 * We can't have ordered extents in the range, nor dirty/writeback
2123 * pages, because we have locked the inode's VFS lock in exclusive
2124 * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2125 * we have flushed all delalloc in the range and we have waited
2126 * for any ordered extents in the range to complete.
2127 * We can race with anyone reading pages from this range, so after
2128 * locking the range check if we have pages in the range, and if
2129 * we do, unlock the range and retry.
2130 */
2131 if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2132 page_lockend - 1))
2133 break;
2134
2135 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2136 cached_state);
2137 }
2138
2139 btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2140 }
2141
2142 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2143 struct btrfs_inode *inode,
2144 struct btrfs_path *path,
2145 struct btrfs_replace_extent_info *extent_info,
2146 const u64 replace_len,
2147 const u64 bytes_to_drop)
2148 {
2149 struct btrfs_fs_info *fs_info = trans->fs_info;
2150 struct btrfs_root *root = inode->root;
2151 struct btrfs_file_extent_item *extent;
2152 struct extent_buffer *leaf;
2153 struct btrfs_key key;
2154 int slot;
2155 int ret;
2156
2157 if (replace_len == 0)
2158 return 0;
2159
2160 if (extent_info->disk_offset == 0 &&
2161 btrfs_fs_incompat(fs_info, NO_HOLES)) {
2162 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2163 return 0;
2164 }
2165
2166 key.objectid = btrfs_ino(inode);
2167 key.type = BTRFS_EXTENT_DATA_KEY;
2168 key.offset = extent_info->file_offset;
2169 ret = btrfs_insert_empty_item(trans, root, path, &key,
2170 sizeof(struct btrfs_file_extent_item));
2171 if (ret)
2172 return ret;
2173 leaf = path->nodes[0];
2174 slot = path->slots[0];
2175 write_extent_buffer(leaf, extent_info->extent_buf,
2176 btrfs_item_ptr_offset(leaf, slot),
2177 sizeof(struct btrfs_file_extent_item));
2178 extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2179 ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2180 btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2181 btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2182 if (extent_info->is_new_extent)
2183 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2184 btrfs_release_path(path);
2185
2186 ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2187 replace_len);
2188 if (ret)
2189 return ret;
2190
2191 /* If it's a hole, nothing more needs to be done. */
2192 if (extent_info->disk_offset == 0) {
2193 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2194 return 0;
2195 }
2196
2197 btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2198
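	/*
	 * For a freshly allocated extent, add the reserved extent item on the
	 * first insertion; otherwise (e.g. when cloning an existing extent)
	 * add a delayed ref to increment the existing extent's reference count.
	 */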
2199 if (extent_info->is_new_extent && extent_info->insertions == 0) {
2200 key.objectid = extent_info->disk_offset;
2201 key.type = BTRFS_EXTENT_ITEM_KEY;
2202 key.offset = extent_info->disk_len;
2203 ret = btrfs_alloc_reserved_file_extent(trans, root,
2204 btrfs_ino(inode),
2205 extent_info->file_offset,
2206 extent_info->qgroup_reserved,
2207 &key);
2208 } else {
2209 struct btrfs_ref ref = {
2210 .action = BTRFS_ADD_DELAYED_REF,
2211 .bytenr = extent_info->disk_offset,
2212 .num_bytes = extent_info->disk_len,
2213 .owning_root = btrfs_root_id(root),
2214 .ref_root = btrfs_root_id(root),
2215 };
2216 u64 ref_offset;
2217
2218 ref_offset = extent_info->file_offset - extent_info->data_offset;
2219 btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);
2220 ret = btrfs_inc_extent_ref(trans, &ref);
2221 }
2222
2223 extent_info->insertions++;
2224
2225 return ret;
2226 }
2227
2228 /*
2229 * The respective range must have been previously locked, as well as the inode.
2230 * The end offset is inclusive (last byte of the range).
2231 * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2232 * the file range with an extent.
2233 * When not punching a hole, we don't want to end up in a state where we dropped
2234 * extents without inserting a new one, so we must abort the transaction to avoid
2235 * a corruption.
2236 */
2237 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2238 struct btrfs_path *path, const u64 start,
2239 const u64 end,
2240 struct btrfs_replace_extent_info *extent_info,
2241 struct btrfs_trans_handle **trans_out)
2242 {
2243 struct btrfs_drop_extents_args drop_args = { 0 };
2244 struct btrfs_root *root = inode->root;
2245 struct btrfs_fs_info *fs_info = root->fs_info;
2246 u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2247 u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2248 struct btrfs_trans_handle *trans = NULL;
2249 struct btrfs_block_rsv *rsv;
2250 unsigned int rsv_count;
2251 u64 cur_offset;
2252 u64 len = end - start;
2253 int ret = 0;
2254
2255 if (end <= start)
2256 return -EINVAL;
2257
2258 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2259 if (!rsv) {
2260 ret = -ENOMEM;
2261 goto out;
2262 }
2263 rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2264 rsv->failfast = true;
2265
2266 /*
2267 * 1 - update the inode
2268 * 1 - removing the extents in the range
2269 * 1 - adding the hole extent if no_holes isn't set or if we are
2270 * replacing the range with a new extent
2271 */
2272 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2273 rsv_count = 3;
2274 else
2275 rsv_count = 2;
2276
2277 trans = btrfs_start_transaction(root, rsv_count);
2278 if (IS_ERR(trans)) {
2279 ret = PTR_ERR(trans);
2280 trans = NULL;
2281 goto out_free;
2282 }
2283
2284 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2285 min_size, false);
2286 if (WARN_ON(ret))
2287 goto out_trans;
2288 trans->block_rsv = rsv;
2289
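	/*
	 * Drop the existing file extent items in the range in chunks, filling
	 * the resulting holes (or inserting the replacement extent) as we go
	 * and refilling the transaction reservation between iterations.
	 */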
2290 cur_offset = start;
2291 drop_args.path = path;
2292 drop_args.end = end + 1;
2293 drop_args.drop_cache = true;
2294 while (cur_offset < end) {
2295 drop_args.start = cur_offset;
2296 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2297 /* If we are punching a hole decrement the inode's byte count */
2298 if (!extent_info)
2299 btrfs_update_inode_bytes(inode, 0,
2300 drop_args.bytes_found);
2301 if (ret != -ENOSPC) {
2302 /*
2303 * The only time we don't want to abort is if we are
2304 * attempting to clone a partial inline extent, in which
2305 * case we'll get EOPNOTSUPP. However if we aren't
2306 			 * cloning we need to abort no matter what, because if we
2307 * got EOPNOTSUPP via prealloc then we messed up and
2308 * need to abort.
2309 */
2310 if (ret &&
2311 (ret != -EOPNOTSUPP ||
2312 (extent_info && extent_info->is_new_extent)))
2313 btrfs_abort_transaction(trans, ret);
2314 break;
2315 }
2316
2317 trans->block_rsv = &fs_info->trans_block_rsv;
2318
2319 if (!extent_info && cur_offset < drop_args.drop_end &&
2320 cur_offset < ino_size) {
2321 ret = fill_holes(trans, inode, path, cur_offset,
2322 drop_args.drop_end);
2323 if (ret) {
2324 /*
2325 * If we failed then we didn't insert our hole
2326 * entries for the area we dropped, so now the
2327 * fs is corrupted, so we must abort the
2328 * transaction.
2329 */
2330 btrfs_abort_transaction(trans, ret);
2331 break;
2332 }
2333 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2334 /*
2335 * We are past the i_size here, but since we didn't
2336 * insert holes we need to clear the mapped area so we
2337 * know to not set disk_i_size in this area until a new
2338 * file extent is inserted here.
2339 */
2340 ret = btrfs_inode_clear_file_extent_range(inode,
2341 cur_offset,
2342 drop_args.drop_end - cur_offset);
2343 if (ret) {
2344 /*
2345 * We couldn't clear our area, so we could
2346 * presumably adjust up and corrupt the fs, so
2347 * we need to abort.
2348 */
2349 btrfs_abort_transaction(trans, ret);
2350 break;
2351 }
2352 }
2353
2354 if (extent_info &&
2355 drop_args.drop_end > extent_info->file_offset) {
2356 u64 replace_len = drop_args.drop_end -
2357 extent_info->file_offset;
2358
2359 ret = btrfs_insert_replace_extent(trans, inode, path,
2360 extent_info, replace_len,
2361 drop_args.bytes_found);
2362 if (ret) {
2363 btrfs_abort_transaction(trans, ret);
2364 break;
2365 }
2366 extent_info->data_len -= replace_len;
2367 extent_info->data_offset += replace_len;
2368 extent_info->file_offset += replace_len;
2369 }
2370
2371 /*
2372 * We are releasing our handle on the transaction, balance the
2373 * dirty pages of the btree inode and flush delayed items, and
2374 * then get a new transaction handle, which may now point to a
2375 * new transaction in case someone else may have committed the
2376 * transaction we used to replace/drop file extent items. So
2377 * bump the inode's iversion and update mtime and ctime except
2378 * if we are called from a dedupe context. This is because a
2379 * power failure/crash may happen after the transaction is
2380 * committed and before we finish replacing/dropping all the
2381 * file extent items we need.
2382 */
2383 inode_inc_iversion(&inode->vfs_inode);
2384
2385 if (!extent_info || extent_info->update_times)
2386 inode_set_mtime_to_ts(&inode->vfs_inode,
2387 inode_set_ctime_current(&inode->vfs_inode));
2388
2389 ret = btrfs_update_inode(trans, inode);
2390 if (ret)
2391 break;
2392
2393 btrfs_end_transaction(trans);
2394 btrfs_btree_balance_dirty(fs_info);
2395
2396 trans = btrfs_start_transaction(root, rsv_count);
2397 if (IS_ERR(trans)) {
2398 ret = PTR_ERR(trans);
2399 trans = NULL;
2400 break;
2401 }
2402
2403 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2404 rsv, min_size, false);
2405 if (WARN_ON(ret))
2406 break;
2407 trans->block_rsv = rsv;
2408
2409 cur_offset = drop_args.drop_end;
2410 len = end - cur_offset;
2411 if (!extent_info && len) {
2412 ret = find_first_non_hole(inode, &cur_offset, &len);
2413 if (unlikely(ret < 0))
2414 break;
2415 if (ret && !len) {
2416 ret = 0;
2417 break;
2418 }
2419 }
2420 }
2421
2422 /*
2423 * If we were cloning, force the next fsync to be a full one since we
2424 	 * replaced (or just dropped in the case of cloning holes when
2425 * NO_HOLES is enabled) file extent items and did not setup new extent
2426 * maps for the replacement extents (or holes).
2427 */
2428 if (extent_info && !extent_info->is_new_extent)
2429 btrfs_set_inode_full_sync(inode);
2430
2431 if (ret)
2432 goto out_trans;
2433
2434 trans->block_rsv = &fs_info->trans_block_rsv;
2435 /*
2436 	 * If we are using the NO_HOLES feature we might already have had a
2437 	 * hole that overlaps a part of the region [lockstart, lockend] and
2438 * ends at (or beyond) lockend. Since we have no file extent items to
2439 * represent holes, drop_end can be less than lockend and so we must
2440 * make sure we have an extent map representing the existing hole (the
2441 * call to __btrfs_drop_extents() might have dropped the existing extent
2442 * map representing the existing hole), otherwise the fast fsync path
2443 * will not record the existence of the hole region
2444 * [existing_hole_start, lockend].
2445 */
2446 if (drop_args.drop_end <= end)
2447 drop_args.drop_end = end + 1;
2448 /*
2449 * Don't insert file hole extent item if it's for a range beyond eof
2450 * (because it's useless) or if it represents a 0 bytes range (when
2451 * cur_offset == drop_end).
2452 */
2453 if (!extent_info && cur_offset < ino_size &&
2454 cur_offset < drop_args.drop_end) {
2455 ret = fill_holes(trans, inode, path, cur_offset,
2456 drop_args.drop_end);
2457 if (ret) {
2458 /* Same comment as above. */
2459 btrfs_abort_transaction(trans, ret);
2460 goto out_trans;
2461 }
2462 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2463 /* See the comment in the loop above for the reasoning here. */
2464 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2465 drop_args.drop_end - cur_offset);
2466 if (ret) {
2467 btrfs_abort_transaction(trans, ret);
2468 goto out_trans;
2469 }
2470
2471 }
2472 if (extent_info) {
2473 ret = btrfs_insert_replace_extent(trans, inode, path,
2474 extent_info, extent_info->data_len,
2475 drop_args.bytes_found);
2476 if (ret) {
2477 btrfs_abort_transaction(trans, ret);
2478 goto out_trans;
2479 }
2480 }
2481
2482 out_trans:
2483 if (!trans)
2484 goto out_free;
2485
2486 trans->block_rsv = &fs_info->trans_block_rsv;
2487 if (ret)
2488 btrfs_end_transaction(trans);
2489 else
2490 *trans_out = trans;
2491 out_free:
2492 btrfs_free_block_rsv(fs_info, rsv);
2493 out:
2494 return ret;
2495 }
2496
2497 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2498 {
2499 struct inode *inode = file_inode(file);
2500 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2501 struct btrfs_root *root = BTRFS_I(inode)->root;
2502 struct extent_state *cached_state = NULL;
2503 struct btrfs_path *path;
2504 struct btrfs_trans_handle *trans = NULL;
2505 u64 lockstart;
2506 u64 lockend;
2507 u64 tail_start;
2508 u64 tail_len;
2509 u64 orig_start = offset;
2510 int ret = 0;
2511 bool same_block;
2512 u64 ino_size;
2513 bool truncated_block = false;
2514 bool updated_inode = false;
2515
2516 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2517
2518 ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
2519 if (ret)
2520 goto out_only_mutex;
2521
2522 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2523 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2524 if (ret < 0)
2525 goto out_only_mutex;
2526 if (ret && !len) {
2527 /* Already in a large hole */
2528 ret = 0;
2529 goto out_only_mutex;
2530 }
2531
2532 ret = file_modified(file);
2533 if (ret)
2534 goto out_only_mutex;
2535
2536 lockstart = round_up(offset, fs_info->sectorsize);
2537 lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2538 same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2539 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2540 /*
2541 * We needn't truncate any block which is beyond the end of the file
2542 * because we are sure there is no data there.
2543 */
2544 /*
2545 * Only do this if we are in the same block and we aren't doing the
2546 * entire block.
2547 */
2548 if (same_block && len < fs_info->sectorsize) {
2549 if (offset < ino_size) {
2550 truncated_block = true;
2551 ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2552 0);
2553 } else {
2554 ret = 0;
2555 }
2556 goto out_only_mutex;
2557 }
2558
2559 /* zero back part of the first block */
2560 if (offset < ino_size) {
2561 truncated_block = true;
2562 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2563 if (ret) {
2564 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2565 return ret;
2566 }
2567 }
2568
2569 	/* Check the aligned pages after the first unaligned page. If
2570 	 * offset != orig_start, the first unaligned page and several
2571 	 * following pages are already within holes, so the extra
2572 	 * check can be skipped. */
2573 if (offset == orig_start) {
2574 		/* After truncating the block, check for a hole again. */
2575 len = offset + len - lockstart;
2576 offset = lockstart;
2577 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2578 if (ret < 0)
2579 goto out_only_mutex;
2580 if (ret && !len) {
2581 ret = 0;
2582 goto out_only_mutex;
2583 }
2584 lockstart = offset;
2585 }
2586
2587 	/* Check if the unaligned tail part is in a hole. */
2588 tail_start = lockend + 1;
2589 tail_len = offset + len - tail_start;
2590 if (tail_len) {
2591 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2592 if (unlikely(ret < 0))
2593 goto out_only_mutex;
2594 if (!ret) {
2595 /* zero the front end of the last page */
2596 if (tail_start + tail_len < ino_size) {
2597 truncated_block = true;
2598 ret = btrfs_truncate_block(BTRFS_I(inode),
2599 tail_start + tail_len,
2600 0, 1);
2601 if (ret)
2602 goto out_only_mutex;
2603 }
2604 }
2605 }
2606
2607 if (lockend < lockstart) {
2608 ret = 0;
2609 goto out_only_mutex;
2610 }
2611
2612 btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2613
2614 path = btrfs_alloc_path();
2615 if (!path) {
2616 ret = -ENOMEM;
2617 goto out;
2618 }
2619
2620 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2621 lockend, NULL, &trans);
2622 btrfs_free_path(path);
2623 if (ret)
2624 goto out;
2625
2626 ASSERT(trans != NULL);
2627 inode_inc_iversion(inode);
2628 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
2629 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2630 updated_inode = true;
2631 btrfs_end_transaction(trans);
2632 btrfs_btree_balance_dirty(fs_info);
2633 out:
2634 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2635 &cached_state);
2636 out_only_mutex:
2637 if (!updated_inode && truncated_block && !ret) {
2638 /*
2639 * If we only end up zeroing part of a page, we still need to
2640 * update the inode item, so that all the time fields are
2641 * updated as well as the necessary btrfs inode in memory fields
2642 * for detecting, at fsync time, if the inode isn't yet in the
2643 * log tree or it's there but not up to date.
2644 */
2645 struct timespec64 now = inode_set_ctime_current(inode);
2646
2647 inode_inc_iversion(inode);
2648 inode_set_mtime_to_ts(inode, now);
2649 trans = btrfs_start_transaction(root, 1);
2650 if (IS_ERR(trans)) {
2651 ret = PTR_ERR(trans);
2652 } else {
2653 int ret2;
2654
2655 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2656 ret2 = btrfs_end_transaction(trans);
2657 if (!ret)
2658 ret = ret2;
2659 }
2660 }
2661 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2662 return ret;
2663 }
2664
2665 /* Helper structure to record which range is already reserved */
2666 struct falloc_range {
2667 struct list_head list;
2668 u64 start;
2669 u64 len;
2670 };
2671
2672 /*
2673 * Helper function to add falloc range
2674 *
2675  * Caller should have locked the larger extent range containing
2676  * [start, start + len)
2677 */
2678 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2679 {
2680 struct falloc_range *range = NULL;
2681
2682 if (!list_empty(head)) {
2683 /*
2684 		 * As fallocate iterates in increasing file offset order, we only need to check
2685 * the last range.
2686 */
2687 range = list_last_entry(head, struct falloc_range, list);
2688 if (range->start + range->len == start) {
2689 range->len += len;
2690 return 0;
2691 }
2692 }
2693
2694 range = kmalloc(sizeof(*range), GFP_KERNEL);
2695 if (!range)
2696 return -ENOMEM;
2697 range->start = start;
2698 range->len = len;
2699 list_add_tail(&range->list, head);
2700 return 0;
2701 }
2702
2703 static int btrfs_fallocate_update_isize(struct inode *inode,
2704 const u64 end,
2705 const int mode)
2706 {
2707 struct btrfs_trans_handle *trans;
2708 struct btrfs_root *root = BTRFS_I(inode)->root;
2709 int ret;
2710 int ret2;
2711
2712 if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2713 return 0;
2714
2715 trans = btrfs_start_transaction(root, 1);
2716 if (IS_ERR(trans))
2717 return PTR_ERR(trans);
2718
2719 inode_set_ctime_current(inode);
2720 i_size_write(inode, end);
2721 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2722 ret = btrfs_update_inode(trans, BTRFS_I(inode));
2723 ret2 = btrfs_end_transaction(trans);
2724
2725 return ret ? ret : ret2;
2726 }
2727
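/*
 * Classification of the block containing a zero-range boundary, as returned
 * by btrfs_zero_range_check_range_boundary(): the boundary block is backed by
 * a written extent, a prealloc extent, or a hole.
 */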
2728 enum {
2729 RANGE_BOUNDARY_WRITTEN_EXTENT,
2730 RANGE_BOUNDARY_PREALLOC_EXTENT,
2731 RANGE_BOUNDARY_HOLE,
2732 };
2733
2734 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2735 u64 offset)
2736 {
2737 const u64 sectorsize = inode->root->fs_info->sectorsize;
2738 struct extent_map *em;
2739 int ret;
2740
2741 offset = round_down(offset, sectorsize);
2742 em = btrfs_get_extent(inode, NULL, offset, sectorsize);
2743 if (IS_ERR(em))
2744 return PTR_ERR(em);
2745
2746 if (em->disk_bytenr == EXTENT_MAP_HOLE)
2747 ret = RANGE_BOUNDARY_HOLE;
2748 else if (em->flags & EXTENT_FLAG_PREALLOC)
2749 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2750 else
2751 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2752
2753 free_extent_map(em);
2754 return ret;
2755 }
2756
2757 static int btrfs_zero_range(struct inode *inode,
2758 loff_t offset,
2759 loff_t len,
2760 const int mode)
2761 {
2762 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2763 struct extent_map *em;
2764 struct extent_changeset *data_reserved = NULL;
2765 int ret;
2766 u64 alloc_hint = 0;
2767 const u64 sectorsize = fs_info->sectorsize;
2768 u64 alloc_start = round_down(offset, sectorsize);
2769 u64 alloc_end = round_up(offset + len, sectorsize);
2770 u64 bytes_to_reserve = 0;
2771 bool space_reserved = false;
2772
2773 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start,
2774 alloc_end - alloc_start);
2775 if (IS_ERR(em)) {
2776 ret = PTR_ERR(em);
2777 goto out;
2778 }
2779
2780 /*
2781 * Avoid hole punching and extent allocation for some cases. More cases
2782 * could be considered, but these are unlikely common and we keep things
2783 * as simple as possible for now. Also, intentionally, if the target
2784 * range contains one or more prealloc extents together with regular
2785 * extents and holes, we drop all the existing extents and allocate a
2786 * new prealloc extent, so that we get a larger contiguous disk extent.
2787 */
2788 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
2789 const u64 em_end = em->start + em->len;
2790
2791 if (em_end >= offset + len) {
2792 /*
2793 * The whole range is already a prealloc extent,
2794 * do nothing except updating the inode's i_size if
2795 * needed.
2796 */
2797 free_extent_map(em);
2798 ret = btrfs_fallocate_update_isize(inode, offset + len,
2799 mode);
2800 goto out;
2801 }
2802 /*
2803 * Part of the range is already a prealloc extent, so operate
2804 * only on the remaining part of the range.
2805 */
2806 alloc_start = em_end;
2807 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2808 len = offset + len - alloc_start;
2809 offset = alloc_start;
2810 alloc_hint = extent_map_block_start(em) + em->len;
2811 }
2812 free_extent_map(em);
2813
2814 if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2815 BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2816 em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize);
2817 if (IS_ERR(em)) {
2818 ret = PTR_ERR(em);
2819 goto out;
2820 }
2821
2822 if (em->flags & EXTENT_FLAG_PREALLOC) {
2823 free_extent_map(em);
2824 ret = btrfs_fallocate_update_isize(inode, offset + len,
2825 mode);
2826 goto out;
2827 }
2828 if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
2829 free_extent_map(em);
2830 ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2831 0);
2832 if (!ret)
2833 ret = btrfs_fallocate_update_isize(inode,
2834 offset + len,
2835 mode);
2836 return ret;
2837 }
2838 free_extent_map(em);
2839 alloc_start = round_down(offset, sectorsize);
2840 alloc_end = alloc_start + sectorsize;
2841 goto reserve_space;
2842 }
2843
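	/*
	 * The range spans more than one block: allocate only for the block
	 * aligned middle part here and handle the unaligned head and tail
	 * below, by either zeroing them in place or widening the allocation
	 * range to cover them.
	 */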
2844 alloc_start = round_up(offset, sectorsize);
2845 alloc_end = round_down(offset + len, sectorsize);
2846
2847 /*
2848 * For unaligned ranges, check the pages at the boundaries, they might
2849 * map to an extent, in which case we need to partially zero them, or
2850 * they might map to a hole, in which case we need our allocation range
2851 * to cover them.
2852 */
2853 if (!IS_ALIGNED(offset, sectorsize)) {
2854 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2855 offset);
2856 if (ret < 0)
2857 goto out;
2858 if (ret == RANGE_BOUNDARY_HOLE) {
2859 alloc_start = round_down(offset, sectorsize);
2860 ret = 0;
2861 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2862 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2863 if (ret)
2864 goto out;
2865 } else {
2866 ret = 0;
2867 }
2868 }
2869
2870 if (!IS_ALIGNED(offset + len, sectorsize)) {
2871 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2872 offset + len);
2873 if (ret < 0)
2874 goto out;
2875 if (ret == RANGE_BOUNDARY_HOLE) {
2876 alloc_end = round_up(offset + len, sectorsize);
2877 ret = 0;
2878 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2879 ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
2880 0, 1);
2881 if (ret)
2882 goto out;
2883 } else {
2884 ret = 0;
2885 }
2886 }
2887
2888 reserve_space:
2889 if (alloc_start < alloc_end) {
2890 struct extent_state *cached_state = NULL;
2891 const u64 lockstart = alloc_start;
2892 const u64 lockend = alloc_end - 1;
2893
2894 bytes_to_reserve = alloc_end - alloc_start;
2895 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2896 bytes_to_reserve);
2897 if (ret < 0)
2898 goto out;
2899 space_reserved = true;
2900 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2901 &cached_state);
2902 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
2903 alloc_start, bytes_to_reserve);
2904 if (ret) {
2905 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
2906 lockend, &cached_state);
2907 goto out;
2908 }
2909 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
2910 alloc_end - alloc_start,
2911 fs_info->sectorsize,
2912 offset + len, &alloc_hint);
2913 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2914 &cached_state);
2915 /* btrfs_prealloc_file_range releases reserved space on error */
2916 if (ret) {
2917 space_reserved = false;
2918 goto out;
2919 }
2920 }
2921 ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
2922 out:
2923 if (ret && space_reserved)
2924 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
2925 alloc_start, bytes_to_reserve);
2926 extent_changeset_free(data_reserved);
2927
2928 return ret;
2929 }
2930
2931 static long btrfs_fallocate(struct file *file, int mode,
2932 loff_t offset, loff_t len)
2933 {
2934 struct inode *inode = file_inode(file);
2935 struct extent_state *cached_state = NULL;
2936 struct extent_changeset *data_reserved = NULL;
2937 struct falloc_range *range;
2938 struct falloc_range *tmp;
2939 LIST_HEAD(reserve_list);
2940 u64 cur_offset;
2941 u64 last_byte;
2942 u64 alloc_start;
2943 u64 alloc_end;
2944 u64 alloc_hint = 0;
2945 u64 locked_end;
2946 u64 actual_end = 0;
2947 u64 data_space_needed = 0;
2948 u64 data_space_reserved = 0;
2949 u64 qgroup_reserved = 0;
2950 struct extent_map *em;
2951 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
2952 int ret;
2953
2954 /* Do not allow fallocate in ZONED mode */
2955 if (btrfs_is_zoned(inode_to_fs_info(inode)))
2956 return -EOPNOTSUPP;
2957
2958 alloc_start = round_down(offset, blocksize);
2959 alloc_end = round_up(offset + len, blocksize);
2960 cur_offset = alloc_start;
2961
2962 	/* Make sure we aren't being given some crap mode. */
2963 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
2964 FALLOC_FL_ZERO_RANGE))
2965 return -EOPNOTSUPP;
2966
2967 if (mode & FALLOC_FL_PUNCH_HOLE)
2968 return btrfs_punch_hole(file, offset, len);
2969
2970 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2971
2972 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
2973 ret = inode_newsize_ok(inode, offset + len);
2974 if (ret)
2975 goto out;
2976 }
2977
2978 ret = file_modified(file);
2979 if (ret)
2980 goto out;
2981
2982 /*
2983 * TODO: Move these two operations after we have checked
2984 * accurate reserved space, or fallocate can still fail but
2985 * with page truncated or size expanded.
2986 *
2987 * But that's a minor problem and won't do much harm BTW.
2988 */
2989 if (alloc_start > inode->i_size) {
2990 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
2991 alloc_start);
2992 if (ret)
2993 goto out;
2994 } else if (offset + len > inode->i_size) {
2995 /*
2996 * If we are fallocating from the end of the file onward we
2997 * need to zero out the end of the block if i_size lands in the
2998 * middle of a block.
2999 */
3000 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3001 if (ret)
3002 goto out;
3003 }
3004
3005 /*
3006 * We have locked the inode at the VFS level (in exclusive mode) and we
3007 * have locked the i_mmap_lock lock (in exclusive mode). Now before
3008 	 * locking the file range, flush all delalloc in the range and wait for
3009 * all ordered extents in the range to complete. After this we can lock
3010 * the file range and, due to the previous locking we did, we know there
3011 * can't be more delalloc or ordered extents in the range.
3012 */
3013 ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
3014 alloc_end - alloc_start);
3015 if (ret)
3016 goto out;
3017
3018 if (mode & FALLOC_FL_ZERO_RANGE) {
3019 ret = btrfs_zero_range(inode, offset, len, mode);
3020 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3021 return ret;
3022 }
3023
3024 locked_end = alloc_end - 1;
3025 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3026 &cached_state);
3027
3028 btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3029
3030 /* First, check if we exceed the qgroup limit */
3031 while (cur_offset < alloc_end) {
3032 em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset,
3033 alloc_end - cur_offset);
3034 if (IS_ERR(em)) {
3035 ret = PTR_ERR(em);
3036 break;
3037 }
3038 last_byte = min(extent_map_end(em), alloc_end);
3039 actual_end = min_t(u64, extent_map_end(em), offset + len);
3040 last_byte = ALIGN(last_byte, blocksize);
3041 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
3042 (cur_offset >= inode->i_size &&
3043 !(em->flags & EXTENT_FLAG_PREALLOC))) {
3044 const u64 range_len = last_byte - cur_offset;
3045
3046 ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3047 if (ret < 0) {
3048 free_extent_map(em);
3049 break;
3050 }
3051 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3052 &data_reserved, cur_offset, range_len);
3053 if (ret < 0) {
3054 free_extent_map(em);
3055 break;
3056 }
3057 qgroup_reserved += range_len;
3058 data_space_needed += range_len;
3059 }
3060 free_extent_map(em);
3061 cur_offset = last_byte;
3062 }
3063
3064 if (!ret && data_space_needed > 0) {
3065 /*
3066 * We are safe to reserve space here as we can't have delalloc
3067 * in the range, see above.
3068 */
3069 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3070 data_space_needed);
3071 if (!ret)
3072 data_space_reserved = data_space_needed;
3073 }
3074
3075 /*
3076 	 * If ret is still 0, it means we're OK to fallocate.
3077 	 * Otherwise just clean up the list and exit.
3078 */
3079 list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3080 if (!ret) {
3081 ret = btrfs_prealloc_file_range(inode, mode,
3082 range->start,
3083 range->len, blocksize,
3084 offset + len, &alloc_hint);
3085 /*
3086 * btrfs_prealloc_file_range() releases space even
3087 * if it returns an error.
3088 */
3089 data_space_reserved -= range->len;
3090 qgroup_reserved -= range->len;
3091 } else if (data_space_reserved > 0) {
3092 btrfs_free_reserved_data_space(BTRFS_I(inode),
3093 data_reserved, range->start,
3094 range->len);
3095 data_space_reserved -= range->len;
3096 qgroup_reserved -= range->len;
3097 } else if (qgroup_reserved > 0) {
3098 btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3099 range->start, range->len, NULL);
3100 qgroup_reserved -= range->len;
3101 }
3102 list_del(&range->list);
3103 kfree(range);
3104 }
3105 if (ret < 0)
3106 goto out_unlock;
3107
3108 /*
3109 * We didn't need to allocate any more space, but we still extended the
3110 * size of the file so we need to update i_size and the inode item.
3111 */
3112 ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3113 out_unlock:
3114 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3115 &cached_state);
3116 out:
3117 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3118 extent_changeset_free(data_reserved);
3119 return ret;
3120 }
3121
3122 /*
3123 * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3124 * that has unflushed and/or flushing delalloc. There might be other adjacent
3125 * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3126 * looping while it gets adjacent subranges, and merging them together.
3127 */
3128 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3129 struct extent_state **cached_state,
3130 bool *search_io_tree,
3131 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3132 {
3133 u64 len = end + 1 - start;
3134 u64 delalloc_len = 0;
3135 struct btrfs_ordered_extent *oe;
3136 u64 oe_start;
3137 u64 oe_end;
3138
3139 /*
3140 * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3141 * means we have delalloc (dirty pages) for which writeback has not
3142 * started yet.
3143 */
3144 if (*search_io_tree) {
3145 spin_lock(&inode->lock);
3146 if (inode->delalloc_bytes > 0) {
3147 spin_unlock(&inode->lock);
3148 *delalloc_start_ret = start;
3149 delalloc_len = count_range_bits(&inode->io_tree,
3150 delalloc_start_ret, end,
3151 len, EXTENT_DELALLOC, 1,
3152 cached_state);
3153 } else {
3154 spin_unlock(&inode->lock);
3155 }
3156 }
3157
3158 if (delalloc_len > 0) {
3159 /*
3160 * If delalloc was found then *delalloc_start_ret has a sector size
3161 * aligned value (rounded down).
3162 */
3163 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3164
3165 if (*delalloc_start_ret == start) {
3166 /* Delalloc for the whole range, nothing more to do. */
3167 if (*delalloc_end_ret == end)
3168 return true;
3169 /* Else trim our search range for ordered extents. */
3170 start = *delalloc_end_ret + 1;
3171 len = end + 1 - start;
3172 }
3173 } else {
3174 /* No delalloc, future calls don't need to search again. */
3175 *search_io_tree = false;
3176 }
3177
3178 /*
3179 * Now also check if there's any ordered extent in the range.
3180 * We do this because:
3181 *
3182 * 1) When delalloc is flushed, the file range is locked, we clear the
3183 * EXTENT_DELALLOC bit from the io tree and create an extent map and
3184 * an ordered extent for the write. So we might just have been called
3185 * after delalloc is flushed and before the ordered extent completes
3186 * and inserts the new file extent item in the subvolume's btree;
3187 *
3188 * 2) We may have an ordered extent created by flushing delalloc for a
3189 * subrange that starts before the subrange we found marked with
3190 * EXTENT_DELALLOC in the io tree.
3191 *
3192 * We could also use the extent map tree to find such delalloc that is
3193 * being flushed, but using the ordered extents tree is more efficient
3194 * because it's usually much smaller as ordered extents are removed from
3195 	 * the tree once they complete. With the extent maps, we may have them
3196 * in the extent map tree for a very long time, and they were either
3197 * created by previous writes or loaded by read operations.
3198 */
3199 oe = btrfs_lookup_first_ordered_range(inode, start, len);
3200 if (!oe)
3201 return (delalloc_len > 0);
3202
3203 /* The ordered extent may span beyond our search range. */
3204 oe_start = max(oe->file_offset, start);
3205 oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3206
3207 btrfs_put_ordered_extent(oe);
3208
3209 /* Don't have unflushed delalloc, return the ordered extent range. */
3210 if (delalloc_len == 0) {
3211 *delalloc_start_ret = oe_start;
3212 *delalloc_end_ret = oe_end;
3213 return true;
3214 }
3215
3216 /*
3217 * We have both unflushed delalloc (io_tree) and an ordered extent.
3218 	 * If the ranges are adjacent, return a combined range, otherwise
3219 * return the leftmost range.
3220 */
3221 if (oe_start < *delalloc_start_ret) {
3222 if (oe_end < *delalloc_start_ret)
3223 *delalloc_end_ret = oe_end;
3224 *delalloc_start_ret = oe_start;
3225 } else if (*delalloc_end_ret + 1 == oe_start) {
3226 *delalloc_end_ret = oe_end;
3227 }
3228
3229 return true;
3230 }
3231
3232 /*
3233 * Check if there's delalloc in a given range.
3234 *
3235 * @inode: The inode.
3236 * @start: The start offset of the range. It does not need to be
3237 * sector size aligned.
3238 * @end: The end offset (inclusive value) of the search range.
3239 * It does not need to be sector size aligned.
3240 * @cached_state: Extent state record used for speeding up delalloc
3241 * searches in the inode's io_tree. Can be NULL.
3242 * @delalloc_start_ret: Output argument, set to the start offset of the
3243 * subrange found with delalloc (may not be sector size
3244 * aligned).
3245  * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
3246 * of the subrange found with delalloc.
3247 *
3248 * Returns true if a subrange with delalloc is found within the given range, and
3249 * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3250 * end offsets of the subrange.
3251 */
3252 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3253 struct extent_state **cached_state,
3254 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3255 {
3256 u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3257 u64 prev_delalloc_end = 0;
3258 bool search_io_tree = true;
3259 bool ret = false;
3260
3261 while (cur_offset <= end) {
3262 u64 delalloc_start;
3263 u64 delalloc_end;
3264 bool delalloc;
3265
3266 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3267 cached_state, &search_io_tree,
3268 &delalloc_start,
3269 &delalloc_end);
3270 if (!delalloc)
3271 break;
3272
3273 if (prev_delalloc_end == 0) {
3274 /* First subrange found. */
3275 *delalloc_start_ret = max(delalloc_start, start);
3276 *delalloc_end_ret = delalloc_end;
3277 ret = true;
3278 } else if (delalloc_start == prev_delalloc_end + 1) {
3279 /* Subrange adjacent to the previous one, merge them. */
3280 *delalloc_end_ret = delalloc_end;
3281 } else {
3282 /* Subrange not adjacent to the previous one, exit. */
3283 break;
3284 }
3285
3286 prev_delalloc_end = delalloc_end;
3287 cur_offset = delalloc_end + 1;
3288 cond_resched();
3289 }
3290
3291 return ret;
3292 }
3293
3294 /*
3295 * Check if there's a hole or delalloc range in a range representing a hole (or
3296 * prealloc extent) found in the inode's subvolume btree.
3297 *
3298 * @inode: The inode.
3299 * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3300 * @start: Start offset of the hole region. It does not need to be sector
3301 * size aligned.
3302 * @end: End offset (inclusive value) of the hole region. It does not
3303 * need to be sector size aligned.
3304 * @start_ret: Return parameter, used to set the start of the subrange in the
3305 * hole that matches the search criteria (seek mode), if such
3306 * subrange is found (return value of the function is true).
3307 * The value returned here may not be sector size aligned.
3308 *
3309 * Returns true if a subrange matching the given seek mode is found, and if one
3310 * is found, it updates @start_ret with the start of the subrange.
3311 */
3312 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3313 struct extent_state **cached_state,
3314 u64 start, u64 end, u64 *start_ret)
3315 {
3316 u64 delalloc_start;
3317 u64 delalloc_end;
3318 bool delalloc;
3319
3320 delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3321 &delalloc_start, &delalloc_end);
3322 if (delalloc && whence == SEEK_DATA) {
3323 *start_ret = delalloc_start;
3324 return true;
3325 }
3326
3327 if (delalloc && whence == SEEK_HOLE) {
3328 /*
3329 		 * We found delalloc but it starts after our start offset. So we
3330 * have a hole between our start offset and the delalloc start.
3331 */
3332 if (start < delalloc_start) {
3333 *start_ret = start;
3334 return true;
3335 }
3336 /*
3337 * Delalloc range starts at our start offset.
3338 * If the delalloc range's length is smaller than our range,
3339 * then it means we have a hole that starts where the delalloc
3340 * subrange ends.
3341 */
3342 if (delalloc_end < end) {
3343 *start_ret = delalloc_end + 1;
3344 return true;
3345 }
3346
3347 /* There's delalloc for the whole range. */
3348 return false;
3349 }
3350
3351 if (!delalloc && whence == SEEK_HOLE) {
3352 *start_ret = start;
3353 return true;
3354 }
3355
3356 /*
3357 * No delalloc in the range and we are seeking for data. The caller has
3358 * to iterate to the next extent item in the subvolume btree.
3359 */
3360 return false;
3361 }
3362
3363 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3364 {
3365 struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3366 struct btrfs_file_private *private;
3367 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3368 struct extent_state *cached_state = NULL;
3369 struct extent_state **delalloc_cached_state;
3370 const loff_t i_size = i_size_read(&inode->vfs_inode);
3371 const u64 ino = btrfs_ino(inode);
3372 struct btrfs_root *root = inode->root;
3373 struct btrfs_path *path;
3374 struct btrfs_key key;
3375 u64 last_extent_end;
3376 u64 lockstart;
3377 u64 lockend;
3378 u64 start;
3379 int ret;
3380 bool found = false;
3381
3382 if (i_size == 0 || offset >= i_size)
3383 return -ENXIO;
3384
3385 /*
3386 * Quick path. If the inode has no prealloc extents and its number of
3387 * bytes used matches its i_size, then it can not have holes.
3388 */
3389 if (whence == SEEK_HOLE &&
3390 !(inode->flags & BTRFS_INODE_PREALLOC) &&
3391 inode_get_bytes(&inode->vfs_inode) == i_size)
3392 return i_size;
3393
3394 spin_lock(&inode->lock);
3395 private = file->private_data;
3396 spin_unlock(&inode->lock);
3397
3398 if (private && private->owner_task != current) {
3399 /*
3400 * Not allocated by us, don't use it as its cached state is used
3401 		 * by the task that allocated it, and we want neither to
3402 		 * mess with it nor to get incorrect results because it reflects an
3403 * invalid state for the current task.
3404 */
3405 private = NULL;
3406 } else if (!private) {
3407 private = kzalloc(sizeof(*private), GFP_KERNEL);
3408 /*
3409 * No worries if memory allocation failed.
3410 * The private structure is used only for speeding up multiple
3411 * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3412 * so everything will still be correct.
3413 */
3414 if (private) {
3415 bool free = false;
3416
3417 private->owner_task = current;
3418
3419 spin_lock(&inode->lock);
3420 if (file->private_data)
3421 free = true;
3422 else
3423 file->private_data = private;
3424 spin_unlock(&inode->lock);
3425
3426 if (free) {
3427 kfree(private);
3428 private = NULL;
3429 }
3430 }
3431 }
3432
3433 if (private)
3434 delalloc_cached_state = &private->llseek_cached_state;
3435 else
3436 delalloc_cached_state = NULL;
3437
3438 /*
3439 * offset can be negative, in this case we start finding DATA/HOLE from
3440 * the very start of the file.
3441 */
3442 start = max_t(loff_t, 0, offset);
3443
3444 lockstart = round_down(start, fs_info->sectorsize);
3445 lockend = round_up(i_size, fs_info->sectorsize);
3446 if (lockend <= lockstart)
3447 lockend = lockstart + fs_info->sectorsize;
3448 lockend--;
3449
3450 path = btrfs_alloc_path();
3451 if (!path)
3452 return -ENOMEM;
3453 path->reada = READA_FORWARD;
3454
3455 key.objectid = ino;
3456 key.type = BTRFS_EXTENT_DATA_KEY;
3457 key.offset = start;
3458
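	/*
	 * Tracks the end offset of the last extent item processed, so any gap
	 * between it and the next item's offset is an implicit hole (possible
	 * with the NO_HOLES feature) that must also be searched.
	 */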
3459 last_extent_end = lockstart;
3460
3461 lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3462
3463 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3464 if (ret < 0) {
3465 goto out;
3466 } else if (ret > 0 && path->slots[0] > 0) {
3467 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3468 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3469 path->slots[0]--;
3470 }
3471
3472 while (start < i_size) {
3473 struct extent_buffer *leaf = path->nodes[0];
3474 struct btrfs_file_extent_item *extent;
3475 u64 extent_end;
3476 u8 type;
3477
3478 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3479 ret = btrfs_next_leaf(root, path);
3480 if (ret < 0)
3481 goto out;
3482 else if (ret > 0)
3483 break;
3484
3485 leaf = path->nodes[0];
3486 }
3487
3488 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3489 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3490 break;
3491
3492 extent_end = btrfs_file_extent_end(path);
3493
3494 /*
3495 * In the first iteration we may have a slot that points to an
3496 * extent that ends before our start offset, so skip it.
3497 */
3498 if (extent_end <= start) {
3499 path->slots[0]++;
3500 continue;
3501 }
3502
3503 /* We have an implicit hole; the NO_HOLES feature is likely set. */
3504 if (last_extent_end < key.offset) {
3505 u64 search_start = last_extent_end;
3506 u64 found_start;
3507
3508 /*
3509 * First iteration, @start matches @offset and it's
3510 * within the hole.
3511 */
3512 if (start == offset)
3513 search_start = offset;
3514
3515 found = find_desired_extent_in_hole(inode, whence,
3516 delalloc_cached_state,
3517 search_start,
3518 key.offset - 1,
3519 &found_start);
3520 if (found) {
3521 start = found_start;
3522 break;
3523 }
3524 /*
3525 * Didn't find data or a hole (due to delalloc) in the implicit
3526 * hole range, so we need to analyze the current extent item.
3527 */
3528 }
3529
3530 extent = btrfs_item_ptr(leaf, path->slots[0],
3531 struct btrfs_file_extent_item);
3532 type = btrfs_file_extent_type(leaf, extent);
3533
3534 /*
3535 * Can't access the extent's disk_bytenr field if this is an
3536 * inline extent, since for inline extents that offset is where
3537 * the extent data itself starts.
3538 */
3539 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3540 (type == BTRFS_FILE_EXTENT_REG &&
3541 btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3542 /*
3543 * Explicit hole or prealloc extent, search for delalloc.
3544 * A prealloc extent is treated like a hole.
3545 */
3546 u64 search_start = key.offset;
3547 u64 found_start;
3548
3549 /*
3550 * First iteration, @start matches @offset and it's
3551 * within the hole.
3552 */
3553 if (start == offset)
3554 search_start = offset;
3555
3556 found = find_desired_extent_in_hole(inode, whence,
3557 delalloc_cached_state,
3558 search_start,
3559 extent_end - 1,
3560 &found_start);
3561 if (found) {
3562 start = found_start;
3563 break;
3564 }
3565 /*
3566 * Didn't find data or a hole (due to delalloc) in the hole
3567 * range (explicit hole or prealloc extent), so we need to
3568 * check the next extent item.
3569 */
3570 } else {
3571 /*
3572 * Found a regular or inline extent.
3573 * If we are seeking data, adjust the start offset and
3574 * stop; we're done.
3575 */
3576 if (whence == SEEK_DATA) {
3577 start = max_t(u64, key.offset, offset);
3578 found = true;
3579 break;
3580 }
3581 /*
3582 * Otherwise we are seeking a hole, so check the next
3583 * file extent item.
3584 */
3585 }
3586
3587 start = extent_end;
3588 last_extent_end = extent_end;
3589 path->slots[0]++;
3590 if (fatal_signal_pending(current)) {
3591 ret = -EINTR;
3592 goto out;
3593 }
3594 cond_resched();
3595 }
3596
3597 /* We have an implicit hole from the last extent found up to i_size. */
3598 if (!found && start < i_size) {
3599 found = find_desired_extent_in_hole(inode, whence,
3600 delalloc_cached_state, start,
3601 i_size - 1, &start);
3602 if (!found)
3603 start = i_size;
3604 }
3605
3606 out:
3607 unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3608 btrfs_free_path(path);
3609
3610 if (ret < 0)
3611 return ret;
3612
3613 if (whence == SEEK_DATA && start >= i_size)
3614 return -ENXIO;
3615
3616 return min_t(loff_t, start, i_size);
3617 }
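
/*
 * Illustrative only, not part of the btrfs code: a minimal userspace sketch
 * of the SEEK_DATA/SEEK_HOLE semantics that find_desired_extent() provides.
 * It relies solely on the documented lseek() contract (ENXIO past EOF or
 * when no more data exists, and an implicit hole always present at EOF);
 * the helper name and output format are made up for the example.
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	static void dump_layout(int fd, off_t file_size)
 *	{
 *		off_t cur = 0;
 *
 *		while (cur < file_size) {
 *			off_t data = lseek(fd, cur, SEEK_DATA);
 *			off_t hole;
 *
 *			if (data < 0) {
 *				if (errno == ENXIO)
 *					printf("hole %lld..%lld\n",
 *					       (long long)cur,
 *					       (long long)file_size);
 *				return;
 *			}
 *			hole = lseek(fd, data, SEEK_HOLE);
 *			printf("data %lld..%lld\n",
 *			       (long long)data, (long long)hole);
 *			cur = hole;
 *		}
 *	}
 */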
3618
3619 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3620 {
3621 struct inode *inode = file->f_mapping->host;
3622
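/*
 * SEEK_SET, SEEK_CUR and SEEK_END only need the file position and i_size,
 * so the generic helper handles them. SEEK_DATA and SEEK_HOLE need to look
 * at the inode's extent layout, which we do under a shared inode lock.
 */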
3623 switch (whence) {
3624 default:
3625 return generic_file_llseek(file, offset, whence);
3626 case SEEK_DATA:
3627 case SEEK_HOLE:
3628 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3629 offset = find_desired_extent(file, offset, whence);
3630 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3631 break;
3632 }
3633
3634 if (offset < 0)
3635 return offset;
3636
3637 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3638 }
3639
3640 static int btrfs_file_open(struct inode *inode, struct file *filp)
3641 {
3642 int ret;
3643
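/*
 * btrfs supports both RWF_NOWAIT I/O and O_DIRECT, so advertise these
 * capabilities to the VFS before running the fsverity and generic open
 * checks.
 */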
3644 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
3645
3646 ret = fsverity_file_open(inode, filp);
3647 if (ret)
3648 return ret;
3649 return generic_file_open(inode, filp);
3650 }
3651
3652 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3653 {
3654 ssize_t ret = 0;
3655
3656 if (iocb->ki_flags & IOCB_DIRECT) {
3657 ret = btrfs_direct_read(iocb, to);
3658 if (ret < 0 || !iov_iter_count(to) ||
3659 iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3660 return ret;
3661 }
3662
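/*
 * Buffered read path. For O_DIRECT this also serves any remainder that
 * the direct read above did not cover (a short direct read before EOF),
 * continuing from the updated iocb->ki_pos.
 */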
3663 return filemap_read(iocb, to, ret);
3664 }
3665
3666 const struct file_operations btrfs_file_operations = {
3667 .llseek = btrfs_file_llseek,
3668 .read_iter = btrfs_file_read_iter,
3669 .splice_read = filemap_splice_read,
3670 .write_iter = btrfs_file_write_iter,
3671 .splice_write = iter_file_splice_write,
3672 .mmap = btrfs_file_mmap,
3673 .open = btrfs_file_open,
3674 .release = btrfs_release_file,
3675 .get_unmapped_area = thp_get_unmapped_area,
3676 .fsync = btrfs_sync_file,
3677 .fallocate = btrfs_fallocate,
3678 .unlocked_ioctl = btrfs_ioctl,
3679 #ifdef CONFIG_COMPAT
3680 .compat_ioctl = btrfs_compat_ioctl,
3681 #endif
3682 .remap_file_range = btrfs_remap_file_range,
3683 .uring_cmd = btrfs_uring_cmd,
3684 .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
3685 };
3686
3687 int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
3688 {
3689 struct address_space *mapping = inode->vfs_inode.i_mapping;
3690 int ret;
3691
3692 /*
3693 * With compression we will find and lock a dirty page, clear its dirty
3694 * bit, set up an async extent, and immediately return with the entire
3695 * range locked but with none of the pages actually marked as under
3696 * writeback. So we can't just filemap_write_and_wait_range() and
3697 * expect it to work, since it will just kick off a thread to do the
3698 * actual work. We need to call filemap_fdatawrite_range() _again_
3699 * since it will wait on the page lock, which won't be unlocked until
3700 * after the pages have been marked as writeback, and so we're good to
3701 * go from there. We have to do this, otherwise we'll miss the ordered
3702 * extents and that results in badness. Please Josef, do not think you
3703 * know better and pull this out at some point in the future, it is
3704 * right and you are wrong.
3705 */
3706 ret = filemap_fdatawrite_range(mapping, start, end);
3707 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
3708 ret = filemap_fdatawrite_range(mapping, start, end);
3709
3710 return ret;
3711 }
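
/*
 * Minimal sketch of the intended caller pattern (illustrative, not a caller
 * taken from this file; the function name is made up): start writeback with
 * btrfs_fdatawrite_range() so the async extent double submit above is
 * honored, then wait for the range.
 *
 *	static int example_flush_and_wait(struct btrfs_inode *inode,
 *					  loff_t start, loff_t end)
 *	{
 *		int ret = btrfs_fdatawrite_range(inode, start, end);
 *
 *		if (ret)
 *			return ret;
 *		return filemap_fdatawait_range(inode->vfs_inode.i_mapping,
 *					       start, end);
 *	}
 */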
3712