1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <crypto/hash.h>
7 #include <linux/kernel.h>
8 #include <linux/bio.h>
9 #include <linux/blk-cgroup.h>
10 #include <linux/file.h>
11 #include <linux/fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <linux/time.h>
15 #include <linux/init.h>
16 #include <linux/string.h>
17 #include <linux/backing-dev.h>
18 #include <linux/writeback.h>
19 #include <linux/compat.h>
20 #include <linux/xattr.h>
21 #include <linux/posix_acl.h>
22 #include <linux/falloc.h>
23 #include <linux/slab.h>
24 #include <linux/ratelimit.h>
25 #include <linux/btrfs.h>
26 #include <linux/blkdev.h>
27 #include <linux/posix_acl_xattr.h>
28 #include <linux/uio.h>
29 #include <linux/magic.h>
30 #include <linux/iversion.h>
31 #include <linux/swap.h>
32 #include <linux/migrate.h>
33 #include <linux/sched/mm.h>
34 #include <linux/iomap.h>
35 #include <linux/unaligned.h>
36 #include <linux/fsverity.h>
37 #include "misc.h"
38 #include "ctree.h"
39 #include "disk-io.h"
40 #include "transaction.h"
41 #include "btrfs_inode.h"
42 #include "ordered-data.h"
43 #include "xattr.h"
44 #include "tree-log.h"
45 #include "bio.h"
46 #include "compression.h"
47 #include "locking.h"
48 #include "props.h"
49 #include "qgroup.h"
50 #include "delalloc-space.h"
51 #include "block-group.h"
52 #include "space-info.h"
53 #include "zoned.h"
54 #include "subpage.h"
55 #include "inode-item.h"
56 #include "fs.h"
57 #include "accessors.h"
58 #include "extent-tree.h"
59 #include "root-tree.h"
60 #include "defrag.h"
61 #include "dir-item.h"
62 #include "file-item.h"
63 #include "uuid-tree.h"
64 #include "ioctl.h"
65 #include "file.h"
66 #include "acl.h"
67 #include "relocation.h"
68 #include "verity.h"
69 #include "super.h"
70 #include "orphan.h"
71 #include "backref.h"
72 #include "raid-stripe-tree.h"
73 #include "fiemap.h"
74
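/*
 * Arguments used to look up an inode: the inode number plus the root it
 * belongs to (the root alone is not enough, since inode numbers are per-root).
 */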
75 struct btrfs_iget_args {
76 u64 ino;
77 struct btrfs_root *root;
78 };
79
80 struct btrfs_rename_ctx {
81 /* Output field. Stores the index number of the old directory entry. */
82 u64 index;
83 };
84
85 /*
86 * Used by data_reloc_print_warning_inode() to pass needed info for filename
87 * resolution and output of error message.
88 */
89 struct data_reloc_warn {
90 struct btrfs_path path;
91 struct btrfs_fs_info *fs_info;
92 u64 extent_item_size;
93 u64 logical;
94 int mirror_num;
95 };
96
97 /*
98 * For the file_extent_tree, we want to hold the inode lock when we lookup and
99 * update the disk_i_size, but lockdep will complain because in our io_tree we
100 * hold the tree lock and get the inode lock when setting delalloc. These two things
101 * are unrelated, so make a class for the file_extent_tree so we don't get the
102 * two locking patterns mixed up.
103 */
104 static struct lock_class_key file_extent_tree_class;
105
106 static const struct inode_operations btrfs_dir_inode_operations;
107 static const struct inode_operations btrfs_symlink_inode_operations;
108 static const struct inode_operations btrfs_special_inode_operations;
109 static const struct inode_operations btrfs_file_inode_operations;
110 static const struct address_space_operations btrfs_aops;
111 static const struct file_operations btrfs_dir_file_operations;
112
113 static struct kmem_cache *btrfs_inode_cachep;
114
115 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
116 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
117
118 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
119 struct folio *locked_folio, u64 start,
120 u64 end, struct writeback_control *wbc,
121 bool pages_dirty);
122
123 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
124 u64 root, void *warn_ctx)
125 {
126 struct data_reloc_warn *warn = warn_ctx;
127 struct btrfs_fs_info *fs_info = warn->fs_info;
128 struct extent_buffer *eb;
129 struct btrfs_inode_item *inode_item;
130 struct inode_fs_paths *ipath = NULL;
131 struct btrfs_root *local_root;
132 struct btrfs_key key;
133 unsigned int nofs_flag;
134 u32 nlink;
135 int ret;
136
137 local_root = btrfs_get_fs_root(fs_info, root, true);
138 if (IS_ERR(local_root)) {
139 ret = PTR_ERR(local_root);
140 goto err;
141 }
142
143 /* This makes the path point to (inum INODE_ITEM ioff). */
144 key.objectid = inum;
145 key.type = BTRFS_INODE_ITEM_KEY;
146 key.offset = 0;
147
148 ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
149 if (ret) {
150 btrfs_put_root(local_root);
151 btrfs_release_path(&warn->path);
152 goto err;
153 }
154
155 eb = warn->path.nodes[0];
156 inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
157 nlink = btrfs_inode_nlink(eb, inode_item);
158 btrfs_release_path(&warn->path);
159
160 nofs_flag = memalloc_nofs_save();
161 ipath = init_ipath(4096, local_root, &warn->path);
162 memalloc_nofs_restore(nofs_flag);
163 if (IS_ERR(ipath)) {
164 btrfs_put_root(local_root);
165 ret = PTR_ERR(ipath);
166 ipath = NULL;
167 /*
168 * -ENOMEM, not a critical error, just output a generic error
169 * without filename.
170 */
171 btrfs_warn(fs_info,
172 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
173 warn->logical, warn->mirror_num, root, inum, offset);
174 return ret;
175 }
176 ret = paths_from_inode(inum, ipath);
177 if (ret < 0)
178 goto err;
179
180 /*
181 * We deliberately ignore the fact that ipath might have been too
182 * small to hold all of the paths here.
183 */
184 for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
185 btrfs_warn(fs_info,
186 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
187 warn->logical, warn->mirror_num, root, inum, offset,
188 fs_info->sectorsize, nlink,
189 (char *)(unsigned long)ipath->fspath->val[i]);
190 }
191
192 btrfs_put_root(local_root);
193 free_ipath(ipath);
194 return 0;
195
196 err:
197 btrfs_warn(fs_info,
198 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
199 warn->logical, warn->mirror_num, root, inum, offset, ret);
200
201 free_ipath(ipath);
202 return ret;
203 }
204
205 /*
206 * Do extra user-friendly error output (e.g. lookup all the affected files).
207 *
208 * Return true if we succeeded doing the backref lookup.
209 * Return false if such lookup failed, and we have to fall back to the old error message.
210 */
211 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
212 const u8 *csum, const u8 *csum_expected,
213 int mirror_num)
214 {
215 struct btrfs_fs_info *fs_info = inode->root->fs_info;
216 struct btrfs_path path = { 0 };
217 struct btrfs_key found_key = { 0 };
218 struct extent_buffer *eb;
219 struct btrfs_extent_item *ei;
220 const u32 csum_size = fs_info->csum_size;
221 u64 logical;
222 u64 flags;
223 u32 item_size;
224 int ret;
225
226 mutex_lock(&fs_info->reloc_mutex);
227 logical = btrfs_get_reloc_bg_bytenr(fs_info);
228 mutex_unlock(&fs_info->reloc_mutex);
229
230 if (logical == U64_MAX) {
231 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
232 btrfs_warn_rl(fs_info,
233 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
234 btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
235 CSUM_FMT_VALUE(csum_size, csum),
236 CSUM_FMT_VALUE(csum_size, csum_expected),
237 mirror_num);
238 return;
239 }
240
241 logical += file_off;
242 btrfs_warn_rl(fs_info,
243 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
244 btrfs_root_id(inode->root),
245 btrfs_ino(inode), file_off, logical,
246 CSUM_FMT_VALUE(csum_size, csum),
247 CSUM_FMT_VALUE(csum_size, csum_expected),
248 mirror_num);
249
250 ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
251 if (ret < 0) {
252 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
253 logical, ret);
254 return;
255 }
256 eb = path.nodes[0];
257 ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
258 item_size = btrfs_item_size(eb, path.slots[0]);
259 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
260 unsigned long ptr = 0;
261 u64 ref_root;
262 u8 ref_level;
263
264 while (true) {
265 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
266 item_size, &ref_root,
267 &ref_level);
268 if (ret < 0) {
269 btrfs_warn_rl(fs_info,
270 "failed to resolve tree backref for logical %llu: %d",
271 logical, ret);
272 break;
273 }
274 if (ret > 0)
275 break;
276
277 btrfs_warn_rl(fs_info,
278 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
279 logical, mirror_num,
280 (ref_level ? "node" : "leaf"),
281 ref_level, ref_root);
282 }
283 btrfs_release_path(&path);
284 } else {
285 struct btrfs_backref_walk_ctx ctx = { 0 };
286 struct data_reloc_warn reloc_warn = { 0 };
287
288 btrfs_release_path(&path);
289
290 ctx.bytenr = found_key.objectid;
291 ctx.extent_item_pos = logical - found_key.objectid;
292 ctx.fs_info = fs_info;
293
294 reloc_warn.logical = logical;
295 reloc_warn.extent_item_size = found_key.offset;
296 reloc_warn.mirror_num = mirror_num;
297 reloc_warn.fs_info = fs_info;
298
299 iterate_extent_inodes(&ctx, true,
300 data_reloc_print_warning_inode, &reloc_warn);
301 }
302 }
303
304 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
305 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
306 {
307 struct btrfs_root *root = inode->root;
308 const u32 csum_size = root->fs_info->csum_size;
309
310 /* For data reloc tree, it's better to do a backref lookup instead. */
311 if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
312 return print_data_reloc_error(inode, logical_start, csum,
313 csum_expected, mirror_num);
314
315 /* Output without objectid, which is more meaningful */
316 if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
317 btrfs_warn_rl(root->fs_info,
318 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
319 btrfs_root_id(root), btrfs_ino(inode),
320 logical_start,
321 CSUM_FMT_VALUE(csum_size, csum),
322 CSUM_FMT_VALUE(csum_size, csum_expected),
323 mirror_num);
324 } else {
325 btrfs_warn_rl(root->fs_info,
326 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
327 btrfs_root_id(root), btrfs_ino(inode),
328 logical_start,
329 CSUM_FMT_VALUE(csum_size, csum),
330 CSUM_FMT_VALUE(csum_size, csum_expected),
331 mirror_num);
332 }
333 }
334
335 /*
336 * Lock inode i_rwsem based on arguments passed.
337 *
338 * ilock_flags can have the following bits set:
339 *
340 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
341 * BTRFS_ILOCK_TRY - try to acquire the lock, if it fails on the first
342 * attempt return -EAGAIN
343 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
344 */
345 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
346 {
347 if (ilock_flags & BTRFS_ILOCK_SHARED) {
348 if (ilock_flags & BTRFS_ILOCK_TRY) {
349 if (!inode_trylock_shared(&inode->vfs_inode))
350 return -EAGAIN;
351 else
352 return 0;
353 }
354 inode_lock_shared(&inode->vfs_inode);
355 } else {
356 if (ilock_flags & BTRFS_ILOCK_TRY) {
357 if (!inode_trylock(&inode->vfs_inode))
358 return -EAGAIN;
359 else
360 return 0;
361 }
362 inode_lock(&inode->vfs_inode);
363 }
364 if (ilock_flags & BTRFS_ILOCK_MMAP)
365 down_write(&inode->i_mmap_lock);
366 return 0;
367 }
368
369 /*
370 * Unlock inode i_rwsem.
371 *
372 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
373 * to decide whether the lock acquired is shared or exclusive.
374 */
375 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
376 {
377 if (ilock_flags & BTRFS_ILOCK_MMAP)
378 up_write(&inode->i_mmap_lock);
379 if (ilock_flags & BTRFS_ILOCK_SHARED)
380 inode_unlock_shared(&inode->vfs_inode);
381 else
382 inode_unlock(&inode->vfs_inode);
383 }
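/*
 * Minimal usage sketch (hypothetical caller): try to take the lock in shared
 * mode without blocking, and release it with the same flags:
 *
 *	if (btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY) == -EAGAIN)
 *		return -EAGAIN;
 *	...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */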
384
385 /*
386 * Clean up all submitted ordered extents in the specified range to handle
387 * errors from the btrfs_run_delalloc_range() callback.
388 *
389 * NOTE: the caller must ensure that, when an error happens, it does not call
390 * extent_clear_unlock_delalloc() to clear both EXTENT_DO_ACCOUNTING and
391 * EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
392 * to be released, which we want to happen only when finishing the ordered
393 * extent (btrfs_finish_ordered_io()).
394 */
395 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
396 u64 offset, u64 bytes)
397 {
398 unsigned long index = offset >> PAGE_SHIFT;
399 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
400 struct folio *folio;
401
402 while (index <= end_index) {
403 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
404 index++;
405 if (IS_ERR(folio))
406 continue;
407
408 /*
409 * Here we just clear all Ordered bits for every page in the
410 * range, then btrfs_mark_ordered_io_finished() will handle
411 * the ordered extent accounting for the range.
412 */
413 btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
414 offset, bytes);
415 folio_put(folio);
416 }
417
418 return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
419 }
420
421 static int btrfs_dirty_inode(struct btrfs_inode *inode);
422
423 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
424 struct btrfs_new_inode_args *args)
425 {
426 int err;
427
428 if (args->default_acl) {
429 err = __btrfs_set_acl(trans, args->inode, args->default_acl,
430 ACL_TYPE_DEFAULT);
431 if (err)
432 return err;
433 }
434 if (args->acl) {
435 err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
436 if (err)
437 return err;
438 }
439 if (!args->default_acl && !args->acl)
440 cache_no_acl(args->inode);
441 return btrfs_xattr_security_init(trans, args->inode, args->dir,
442 &args->dentry->d_name);
443 }
444
445 /*
446 * This does all the hard work for inserting an inline extent into
447 * the btree. The caller should have done a btrfs_drop_extents() so that
448 * no overlapping inline items exist in the btree.
449 */
450 static int insert_inline_extent(struct btrfs_trans_handle *trans,
451 struct btrfs_path *path,
452 struct btrfs_inode *inode, bool extent_inserted,
453 size_t size, size_t compressed_size,
454 int compress_type,
455 struct folio *compressed_folio,
456 bool update_i_size)
457 {
458 struct btrfs_root *root = inode->root;
459 struct extent_buffer *leaf;
460 const u32 sectorsize = trans->fs_info->sectorsize;
461 char *kaddr;
462 unsigned long ptr;
463 struct btrfs_file_extent_item *ei;
464 int ret;
465 size_t cur_size = size;
466 u64 i_size;
467
468 /*
469 * The decompressed size must still be no larger than a sector. Under
470 * heavy race, we can have size == 0 passed in, but that shouldn't be a
471 * big deal and we can continue the insertion.
472 */
473 ASSERT(size <= sectorsize);
474
475 /*
476 * The compressed size also needs to be no larger than a sector.
477 * That's also why we only need one page as the parameter.
478 */
479 if (compressed_folio)
480 ASSERT(compressed_size <= sectorsize);
481 else
482 ASSERT(compressed_size == 0);
483
484 if (compressed_size && compressed_folio)
485 cur_size = compressed_size;
486
487 if (!extent_inserted) {
488 struct btrfs_key key;
489 size_t datasize;
490
491 key.objectid = btrfs_ino(inode);
492 key.type = BTRFS_EXTENT_DATA_KEY;
493 key.offset = 0;
494
495 datasize = btrfs_file_extent_calc_inline_size(cur_size);
496 ret = btrfs_insert_empty_item(trans, root, path, &key,
497 datasize);
498 if (ret)
499 goto fail;
500 }
501 leaf = path->nodes[0];
502 ei = btrfs_item_ptr(leaf, path->slots[0],
503 struct btrfs_file_extent_item);
504 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
505 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
506 btrfs_set_file_extent_encryption(leaf, ei, 0);
507 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
508 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
509 ptr = btrfs_file_extent_inline_start(ei);
510
511 if (compress_type != BTRFS_COMPRESS_NONE) {
512 kaddr = kmap_local_folio(compressed_folio, 0);
513 write_extent_buffer(leaf, kaddr, ptr, compressed_size);
514 kunmap_local(kaddr);
515
516 btrfs_set_file_extent_compression(leaf, ei,
517 compress_type);
518 } else {
519 struct folio *folio;
520
521 folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
522 ASSERT(!IS_ERR(folio));
523 btrfs_set_file_extent_compression(leaf, ei, 0);
524 kaddr = kmap_local_folio(folio, 0);
525 write_extent_buffer(leaf, kaddr, ptr, size);
526 kunmap_local(kaddr);
527 folio_put(folio);
528 }
529 btrfs_release_path(path);
530
531 /*
532 * We align size to sectorsize for inline extents just for simplicity's
533 * sake.
534 */
535 ret = btrfs_inode_set_file_extent_range(inode, 0,
536 ALIGN(size, root->fs_info->sectorsize));
537 if (ret)
538 goto fail;
539
540 /*
541 * We're an inline extent, so nobody can extend the file past i_size
542 * without locking a page we already have locked.
543 *
544 * We must do any i_size and inode updates before we unlock the pages.
545 * Otherwise we could end up racing with unlink.
546 */
547 i_size = i_size_read(&inode->vfs_inode);
548 if (update_i_size && size > i_size) {
549 i_size_write(&inode->vfs_inode, size);
550 i_size = size;
551 }
552 inode->disk_i_size = i_size;
553
554 fail:
555 return ret;
556 }
557
558 static bool can_cow_file_range_inline(struct btrfs_inode *inode,
559 u64 offset, u64 size,
560 size_t compressed_size)
561 {
562 struct btrfs_fs_info *fs_info = inode->root->fs_info;
563 u64 data_len = (compressed_size ?: size);
564
565 /* Inline extents must start at offset 0. */
566 if (offset != 0)
567 return false;
568
569 /* Inline extents are limited to sectorsize. */
570 if (size > fs_info->sectorsize)
571 return false;
572
573 /* We do not allow a non-compressed extent to be as large as block size. */
574 if (data_len >= fs_info->sectorsize)
575 return false;
576
577 /* We cannot exceed the maximum inline data size. */
578 if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
579 return false;
580
581 /* We cannot exceed the user specified max_inline size. */
582 if (data_len > fs_info->max_inline)
583 return false;
584
585 /* Inline extents must be the entirety of the file. */
586 if (size < i_size_read(&inode->vfs_inode))
587 return false;
588
589 return true;
590 }
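/*
 * For illustration, on a 4K sectorsize filesystem the checks above allow
 * inlining a 500 byte file written at offset 0 (assuming the max_inline
 * setting permits it), while a full 4096 byte uncompressed write is rejected
 * because its data length is not smaller than the block size.
 */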
591
592 /*
593 * Conditionally insert an inline extent into the file. This
594 * does the checks required to make sure the data is small enough
595 * to fit as an inline extent.
596 *
597 * If being used directly, you must have already checked we're allowed to cow
598 * the range by getting true from can_cow_file_range_inline().
599 */
600 static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
601 u64 size, size_t compressed_size,
602 int compress_type,
603 struct folio *compressed_folio,
604 bool update_i_size)
605 {
606 struct btrfs_drop_extents_args drop_args = { 0 };
607 struct btrfs_root *root = inode->root;
608 struct btrfs_fs_info *fs_info = root->fs_info;
609 struct btrfs_trans_handle *trans;
610 u64 data_len = (compressed_size ?: size);
611 int ret;
612 struct btrfs_path *path;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617
618 trans = btrfs_join_transaction(root);
619 if (IS_ERR(trans)) {
620 btrfs_free_path(path);
621 return PTR_ERR(trans);
622 }
623 trans->block_rsv = &inode->block_rsv;
624
625 drop_args.path = path;
626 drop_args.start = 0;
627 drop_args.end = fs_info->sectorsize;
628 drop_args.drop_cache = true;
629 drop_args.replace_extent = true;
630 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
631 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
632 if (ret) {
633 btrfs_abort_transaction(trans, ret);
634 goto out;
635 }
636
637 ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
638 size, compressed_size, compress_type,
639 compressed_folio, update_i_size);
640 if (ret && ret != -ENOSPC) {
641 btrfs_abort_transaction(trans, ret);
642 goto out;
643 } else if (ret == -ENOSPC) {
644 ret = 1;
645 goto out;
646 }
647
648 btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
649 ret = btrfs_update_inode(trans, inode);
650 if (ret && ret != -ENOSPC) {
651 btrfs_abort_transaction(trans, ret);
652 goto out;
653 } else if (ret == -ENOSPC) {
654 ret = 1;
655 goto out;
656 }
657
658 btrfs_set_inode_full_sync(inode);
659 out:
660 /*
661 * Don't forget to free the reserved space: an inlined extent
662 * won't count as a data extent, so free the reservation directly here.
663 * And at reserve time, it's always aligned to page size, so
664 * just free one page here.
665 */
666 btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
667 btrfs_free_path(path);
668 btrfs_end_transaction(trans);
669 return ret;
670 }
671
672 static noinline int cow_file_range_inline(struct btrfs_inode *inode,
673 struct folio *locked_folio,
674 u64 offset, u64 end,
675 size_t compressed_size,
676 int compress_type,
677 struct folio *compressed_folio,
678 bool update_i_size)
679 {
680 struct extent_state *cached = NULL;
681 unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
682 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
683 u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
684 int ret;
685
686 if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
687 return 1;
688
689 lock_extent(&inode->io_tree, offset, end, &cached);
690 ret = __cow_file_range_inline(inode, size, compressed_size,
691 compress_type, compressed_folio,
692 update_i_size);
693 if (ret > 0) {
694 unlock_extent(&inode->io_tree, offset, end, &cached);
695 return ret;
696 }
697
698 /*
699 * In the successful case (ret == 0 here), cow_file_range will return 1.
700 *
701 * Quite a bit further up the callstack in extent_writepage(), ret == 1
702 * is treated as a short circuited success and does not unlock the folio,
703 * so we must do it here.
704 *
705 * In the failure case, the locked_folio does get unlocked by
706 * btrfs_folio_end_all_writers, which asserts that it is still locked
707 * at that point, so we must *not* unlock it here.
708 *
709 * The other two callsites in compress_file_range do not have a
710 * locked_folio, so they are not relevant to this logic.
711 */
712 if (ret == 0)
713 locked_folio = NULL;
714
715 extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
716 clear_flags, PAGE_UNLOCK |
717 PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
718 return ret;
719 }
720
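/*
 * One range produced by compress_file_range() and queued for submission on
 * the owning async_chunk. For a range that stays uncompressed, compress_type
 * is BTRFS_COMPRESS_NONE and no folios are attached.
 */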
721 struct async_extent {
722 u64 start;
723 u64 ram_size;
724 u64 compressed_size;
725 struct folio **folios;
726 unsigned long nr_folios;
727 int compress_type;
728 struct list_head list;
729 };
730
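/*
 * One unit of async compression work, covering up to 512K of the delalloc
 * range handed to run_delalloc_compressed().
 */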
731 struct async_chunk {
732 struct btrfs_inode *inode;
733 struct folio *locked_folio;
734 u64 start;
735 u64 end;
736 blk_opf_t write_flags;
737 struct list_head extents;
738 struct cgroup_subsys_state *blkcg_css;
739 struct btrfs_work work;
740 struct async_cow *async_cow;
741 };
742
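/*
 * Top level object for one run_delalloc_compressed() call; freed once the
 * last chunk drops its reference (num_chunks reaching zero).
 */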
743 struct async_cow {
744 atomic_t num_chunks;
745 struct async_chunk chunks[];
746 };
747
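/*
 * Allocate an async_extent describing [start, start + ram_size) and queue it
 * on the chunk's extent list; the chunk takes ownership of @folios.
 */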
748 static noinline int add_async_extent(struct async_chunk *cow,
749 u64 start, u64 ram_size,
750 u64 compressed_size,
751 struct folio **folios,
752 unsigned long nr_folios,
753 int compress_type)
754 {
755 struct async_extent *async_extent;
756
757 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
758 if (!async_extent)
759 return -ENOMEM;
760 async_extent->start = start;
761 async_extent->ram_size = ram_size;
762 async_extent->compressed_size = compressed_size;
763 async_extent->folios = folios;
764 async_extent->nr_folios = nr_folios;
765 async_extent->compress_type = compress_type;
766 list_add_tail(&async_extent->list, &cow->extents);
767 return 0;
768 }
769
770 /*
771 * Check if the inode needs to be submitted to compression, based on mount
772 * options, defragmentation, properties or heuristics.
773 */
774 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
775 u64 end)
776 {
777 struct btrfs_fs_info *fs_info = inode->root->fs_info;
778
779 if (!btrfs_inode_can_compress(inode)) {
780 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
781 KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
782 btrfs_ino(inode));
783 return 0;
784 }
785 /*
786 * Only enable sector perfect compression for experimental builds.
787 *
788 * This is a big feature change for subpage cases, and can hit
789 * different corner cases, so limit this feature to experimental
790 * builds for now.
791 *
792 * ETA for moving this out of experimental builds is 6.15.
793 */
794 if (fs_info->sectorsize < PAGE_SIZE &&
795 !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
796 if (!PAGE_ALIGNED(start) ||
797 !PAGE_ALIGNED(end + 1))
798 return 0;
799 }
800
801 /* force compress */
802 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
803 return 1;
804 /* defrag ioctl */
805 if (inode->defrag_compress)
806 return 1;
807 /* bad compression ratios */
808 if (inode->flags & BTRFS_INODE_NOCOMPRESS)
809 return 0;
810 if (btrfs_test_opt(fs_info, COMPRESS) ||
811 inode->flags & BTRFS_INODE_COMPRESS ||
812 inode->prop_compress)
813 return btrfs_compress_heuristic(inode, start, end);
814 return 0;
815 }
816
817 static inline void inode_should_defrag(struct btrfs_inode *inode,
818 u64 start, u64 end, u64 num_bytes, u32 small_write)
819 {
820 /* If this is a small write inside eof, kick off a defrag */
821 if (num_bytes < small_write &&
822 (start > 0 || end + 1 < inode->disk_i_size))
823 btrfs_add_inode_defrag(inode, small_write);
824 }
825
826 static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
827 {
828 unsigned long end_index = end >> PAGE_SHIFT;
829 struct folio *folio;
830 int ret = 0;
831
832 for (unsigned long index = start >> PAGE_SHIFT;
833 index <= end_index; index++) {
834 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
835 if (IS_ERR(folio)) {
836 if (!ret)
837 ret = PTR_ERR(folio);
838 continue;
839 }
840 btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
841 end + 1 - start);
842 folio_put(folio);
843 }
844 return ret;
845 }
846
847 /*
848 * Work queue callback to start compression on a file and pages.
849 *
850 * This is done inside an ordered work queue, and the compression is spread
851 * across many cpus. The actual IO submission is step two, and the ordered work
852 * queue takes care of making sure that happens in the same order things were
853 * put onto the queue by writepages and friends.
854 *
855 * If this code finds it can't get good compression, it puts an entry onto the
856 * work queue to write the uncompressed bytes. This makes sure that both
857 * compressed inodes and uncompressed inodes are written in the same order that
858 * the flusher thread sent them down.
859 */
860 static void compress_file_range(struct btrfs_work *work)
861 {
862 struct async_chunk *async_chunk =
863 container_of(work, struct async_chunk, work);
864 struct btrfs_inode *inode = async_chunk->inode;
865 struct btrfs_fs_info *fs_info = inode->root->fs_info;
866 struct address_space *mapping = inode->vfs_inode.i_mapping;
867 u64 blocksize = fs_info->sectorsize;
868 u64 start = async_chunk->start;
869 u64 end = async_chunk->end;
870 u64 actual_end;
871 u64 i_size;
872 int ret = 0;
873 struct folio **folios;
874 unsigned long nr_folios;
875 unsigned long total_compressed = 0;
876 unsigned long total_in = 0;
877 unsigned int poff;
878 int i;
879 int compress_type = fs_info->compress_type;
880 int compress_level = fs_info->compress_level;
881
882 inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
883
884 /*
885 * We need to call clear_page_dirty_for_io on each page in the range.
886 * Otherwise applications with the file mmap'd can wander in and change
887 * the page contents while we are compressing them.
888 */
889 ret = extent_range_clear_dirty_for_io(inode, start, end);
890
891 /*
892 * All the folios should have been locked, thus no failure is expected.
893 *
894 * And even if some folios are missing, btrfs_compress_folios()
895 * would handle them correctly, so here just do an ASSERT() check for
896 * early logic errors.
897 */
898 ASSERT(ret == 0);
899
900 /*
901 * We need to save i_size before now because it could change in between
902 * us evaluating the size and assigning it. This is because we lock and
903 * unlock the page in truncate and fallocate, and then modify the i_size
904 * later on.
905 *
906 * The barriers are to emulate READ_ONCE, remove that once i_size_read
907 * does that for us.
908 */
909 barrier();
910 i_size = i_size_read(&inode->vfs_inode);
911 barrier();
912 actual_end = min_t(u64, i_size, end + 1);
913 again:
914 folios = NULL;
915 nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
916 nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
917
918 /*
919 * we don't want to send crud past the end of i_size through
920 * compression, that's just a waste of CPU time. So, if the
921 * end of the file is before the start of our current
922 * requested range of bytes, we bail out to the uncompressed
923 * cleanup code that can deal with all of this.
924 *
925 * It isn't really the fastest way to fix things, but this is a
926 * very uncommon corner.
927 */
928 if (actual_end <= start)
929 goto cleanup_and_bail_uncompressed;
930
931 total_compressed = actual_end - start;
932
933 /*
934 * Skip compression for a small file range (<= blocksize) that
935 * isn't an inline extent, since it doesn't save disk space at all.
936 */
937 if (total_compressed <= blocksize &&
938 (start > 0 || end + 1 < inode->disk_i_size))
939 goto cleanup_and_bail_uncompressed;
940
941 total_compressed = min_t(unsigned long, total_compressed,
942 BTRFS_MAX_UNCOMPRESSED);
943 total_in = 0;
944 ret = 0;
945
946 /*
947 * We do compression for mount -o compress and when the inode has not
948 * been flagged as NOCOMPRESS. This flag can change at any time if we
949 * discover bad compression ratios.
950 */
951 if (!inode_need_compress(inode, start, end))
952 goto cleanup_and_bail_uncompressed;
953
954 folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
955 if (!folios) {
956 /*
957 * Memory allocation failure is not a fatal error, we can fall
958 * back to uncompressed code.
959 */
960 goto cleanup_and_bail_uncompressed;
961 }
962
963 if (inode->defrag_compress) {
964 compress_type = inode->defrag_compress;
965 compress_level = inode->defrag_compress_level;
966 } else if (inode->prop_compress) {
967 compress_type = inode->prop_compress;
968 }
969
970 /* Compression level is applied here. */
971 ret = btrfs_compress_folios(compress_type, compress_level,
972 mapping, start, folios, &nr_folios, &total_in,
973 &total_compressed);
974 if (ret)
975 goto mark_incompressible;
976
977 /*
978 * Zero the tail end of the last page, as we might be sending it down
979 * to disk.
980 */
981 poff = offset_in_page(total_compressed);
982 if (poff)
983 folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
984
985 /*
986 * Try to create an inline extent.
987 *
988 * If we didn't compress the entire range, try to create an uncompressed
989 * inline extent, else a compressed one.
990 *
991 * Check cow_file_range() for why we don't even try to create inline
992 * extent for the subpage case.
993 */
994 if (total_in < actual_end)
995 ret = cow_file_range_inline(inode, NULL, start, end, 0,
996 BTRFS_COMPRESS_NONE, NULL, false);
997 else
998 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
999 compress_type, folios[0], false);
1000 if (ret <= 0) {
1001 if (ret < 0)
1002 mapping_set_error(mapping, -EIO);
1003 goto free_pages;
1004 }
1005
1006 /*
1007 * We aren't doing an inline extent. Round the compressed size up to a
1008 * block size boundary so the allocator does sane things.
1009 */
1010 total_compressed = ALIGN(total_compressed, blocksize);
1011
1012 /*
1013 * One last check to make sure the compression is really a win: compare
1014 * the amount of data read with the blocks needed on disk, compression must
1015 * free at least one sector.
1016 */
1017 total_in = round_up(total_in, fs_info->sectorsize);
1018 if (total_compressed + blocksize > total_in)
1019 goto mark_incompressible;
1020
1021 /*
1022 * The async work queues will take care of doing actual allocation on
1023 * disk for these compressed pages, and will submit the bios.
1024 */
1025 ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
1026 nr_folios, compress_type);
1027 BUG_ON(ret);
1028 if (start + total_in < end) {
1029 start += total_in;
1030 cond_resched();
1031 goto again;
1032 }
1033 return;
1034
1035 mark_incompressible:
1036 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1037 inode->flags |= BTRFS_INODE_NOCOMPRESS;
1038 cleanup_and_bail_uncompressed:
1039 ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1040 BTRFS_COMPRESS_NONE);
1041 BUG_ON(ret);
1042 free_pages:
1043 if (folios) {
1044 for (i = 0; i < nr_folios; i++) {
1045 WARN_ON(folios[i]->mapping);
1046 btrfs_free_compr_folio(folios[i]);
1047 }
1048 kfree(folios);
1049 }
1050 }
1051
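/* Release the compressed folios (if any) attached to an async_extent. */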
1052 static void free_async_extent_pages(struct async_extent *async_extent)
1053 {
1054 int i;
1055
1056 if (!async_extent->folios)
1057 return;
1058
1059 for (i = 0; i < async_extent->nr_folios; i++) {
1060 WARN_ON(async_extent->folios[i]->mapping);
1061 btrfs_free_compr_folio(async_extent->folios[i]);
1062 }
1063 kfree(async_extent->folios);
1064 async_extent->nr_folios = 0;
1065 async_extent->folios = NULL;
1066 }
1067
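/*
 * Write out the range without compression: run regular COW allocation and
 * submit the IO for the involved folios via run_delalloc_cow().
 */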
1068 static void submit_uncompressed_range(struct btrfs_inode *inode,
1069 struct async_extent *async_extent,
1070 struct folio *locked_folio)
1071 {
1072 u64 start = async_extent->start;
1073 u64 end = async_extent->start + async_extent->ram_size - 1;
1074 int ret;
1075 struct writeback_control wbc = {
1076 .sync_mode = WB_SYNC_ALL,
1077 .range_start = start,
1078 .range_end = end,
1079 .no_cgroup_owner = 1,
1080 };
1081
1082 wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1083 ret = run_delalloc_cow(inode, locked_folio, start, end,
1084 &wbc, false);
1085 wbc_detach_inode(&wbc);
1086 if (ret < 0) {
1087 if (locked_folio)
1088 btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
1089 start, async_extent->ram_size);
1090 btrfs_err_rl(inode->root->fs_info,
1091 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
1092 __func__, btrfs_root_id(inode->root),
1093 btrfs_ino(inode), start, async_extent->ram_size, ret);
1094 }
1095 }
1096
1097 static void submit_one_async_extent(struct async_chunk *async_chunk,
1098 struct async_extent *async_extent,
1099 u64 *alloc_hint)
1100 {
1101 struct btrfs_inode *inode = async_chunk->inode;
1102 struct extent_io_tree *io_tree = &inode->io_tree;
1103 struct btrfs_root *root = inode->root;
1104 struct btrfs_fs_info *fs_info = root->fs_info;
1105 struct btrfs_ordered_extent *ordered;
1106 struct btrfs_file_extent file_extent;
1107 struct btrfs_key ins;
1108 struct folio *locked_folio = NULL;
1109 struct extent_state *cached = NULL;
1110 struct extent_map *em;
1111 int ret = 0;
1112 bool free_pages = false;
1113 u64 start = async_extent->start;
1114 u64 end = async_extent->start + async_extent->ram_size - 1;
1115
1116 if (async_chunk->blkcg_css)
1117 kthread_associate_blkcg(async_chunk->blkcg_css);
1118
1119 /*
1120 * If async_chunk->locked_folio is in the async_extent range, we need to
1121 * handle it.
1122 */
1123 if (async_chunk->locked_folio) {
1124 u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
1125 u64 locked_folio_end = locked_folio_start +
1126 folio_size(async_chunk->locked_folio) - 1;
1127
1128 if (!(start >= locked_folio_end || end <= locked_folio_start))
1129 locked_folio = async_chunk->locked_folio;
1130 }
1131
1132 if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1133 ASSERT(!async_extent->folios);
1134 ASSERT(async_extent->nr_folios == 0);
1135 submit_uncompressed_range(inode, async_extent, locked_folio);
1136 free_pages = true;
1137 goto done;
1138 }
1139
1140 ret = btrfs_reserve_extent(root, async_extent->ram_size,
1141 async_extent->compressed_size,
1142 async_extent->compressed_size,
1143 0, *alloc_hint, &ins, 1, 1);
1144 if (ret) {
1145 /*
1146 * We can't reserve contiguous space for the compressed size.
1147 * Unlikely, but it's possible that we could have enough
1148 * non-contiguous space for the uncompressed size instead. So
1149 * fall back to uncompressed.
1150 */
1151 submit_uncompressed_range(inode, async_extent, locked_folio);
1152 free_pages = true;
1153 goto done;
1154 }
1155
1156 lock_extent(io_tree, start, end, &cached);
1157
1158 /* Here we're doing allocation and writeback of the compressed pages */
1159 file_extent.disk_bytenr = ins.objectid;
1160 file_extent.disk_num_bytes = ins.offset;
1161 file_extent.ram_bytes = async_extent->ram_size;
1162 file_extent.num_bytes = async_extent->ram_size;
1163 file_extent.offset = 0;
1164 file_extent.compression = async_extent->compress_type;
1165
1166 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
1167 if (IS_ERR(em)) {
1168 ret = PTR_ERR(em);
1169 goto out_free_reserve;
1170 }
1171 free_extent_map(em);
1172
1173 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1174 1 << BTRFS_ORDERED_COMPRESSED);
1175 if (IS_ERR(ordered)) {
1176 btrfs_drop_extent_map_range(inode, start, end, false);
1177 ret = PTR_ERR(ordered);
1178 goto out_free_reserve;
1179 }
1180 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1181
1182 /* Clear dirty, set writeback and unlock the pages. */
1183 extent_clear_unlock_delalloc(inode, start, end,
1184 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
1185 PAGE_UNLOCK | PAGE_START_WRITEBACK);
1186 btrfs_submit_compressed_write(ordered,
1187 async_extent->folios, /* compressed_folios */
1188 async_extent->nr_folios,
1189 async_chunk->write_flags, true);
1190 *alloc_hint = ins.objectid + ins.offset;
1191 done:
1192 if (async_chunk->blkcg_css)
1193 kthread_associate_blkcg(NULL);
1194 if (free_pages)
1195 free_async_extent_pages(async_extent);
1196 kfree(async_extent);
1197 return;
1198
1199 out_free_reserve:
1200 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1201 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1202 mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1203 extent_clear_unlock_delalloc(inode, start, end,
1204 NULL, &cached,
1205 EXTENT_LOCKED | EXTENT_DELALLOC |
1206 EXTENT_DELALLOC_NEW |
1207 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1208 PAGE_UNLOCK | PAGE_START_WRITEBACK |
1209 PAGE_END_WRITEBACK);
1210 free_async_extent_pages(async_extent);
1211 if (async_chunk->blkcg_css)
1212 kthread_associate_blkcg(NULL);
1213 btrfs_debug(fs_info,
1214 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1215 btrfs_root_id(root), btrfs_ino(inode), start,
1216 async_extent->ram_size, ret);
1217 kfree(async_extent);
1218 }
1219
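/*
 * Return a disk bytenr to use as an allocation hint, based on an existing
 * extent mapping near @start (or the first mapping of the inode), so new
 * allocations land close to the inode's existing extents. Returns 0 if no
 * suitable mapping is found.
 */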
1220 u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1221 u64 num_bytes)
1222 {
1223 struct extent_map_tree *em_tree = &inode->extent_tree;
1224 struct extent_map *em;
1225 u64 alloc_hint = 0;
1226
1227 read_lock(&em_tree->lock);
1228 em = search_extent_mapping(em_tree, start, num_bytes);
1229 if (em) {
1230 /*
1231 * if block start isn't an actual block number then find the
1232 * first block in this inode and use that as a hint. If that
1233 * block is also bogus then just don't worry about it.
1234 */
1235 if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
1236 free_extent_map(em);
1237 em = search_extent_mapping(em_tree, 0, 0);
1238 if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
1239 alloc_hint = extent_map_block_start(em);
1240 if (em)
1241 free_extent_map(em);
1242 } else {
1243 alloc_hint = extent_map_block_start(em);
1244 free_extent_map(em);
1245 }
1246 }
1247 read_unlock(&em_tree->lock);
1248
1249 return alloc_hint;
1250 }
1251
1252 /*
1253 * When extent_io.c finds a delayed allocation range in the file,
1254 * the callbacks end up in this code. The basic idea is to
1255 * allocate extents on disk for the range, and create ordered data structs
1256 * in ram to track those extents.
1257 *
1258 * locked_folio is the folio that writepage had locked already. We use
1259 * it to make sure we don't do extra locks or unlocks.
1260 *
1261 * When this function fails, it unlocks all pages except @locked_folio.
1262 *
1263 * When this function successfully creates an inline extent, it returns 1 and
1264 * unlocks all pages including locked_folio and starts I/O on them.
1265 * (In reality inline extents are limited to a single page, so locked_folio is
1266 * the only page handled anyway).
1267 *
1268 * When this function succeeds and creates a normal extent, the page locking
1269 * status depends on the passed in flags:
1270 *
1271 * - If @keep_locked is set, all pages are kept locked.
1272 * - Else all pages except for @locked_folio are unlocked.
1273 *
1274 * When a failure happens in the second or later iteration of the
1275 * while-loop, the ordered extents created in previous iterations are cleaned up.
1276 */
1277 static noinline int cow_file_range(struct btrfs_inode *inode,
1278 struct folio *locked_folio, u64 start,
1279 u64 end, u64 *done_offset,
1280 bool keep_locked, bool no_inline)
1281 {
1282 struct btrfs_root *root = inode->root;
1283 struct btrfs_fs_info *fs_info = root->fs_info;
1284 struct extent_state *cached = NULL;
1285 u64 alloc_hint = 0;
1286 u64 orig_start = start;
1287 u64 num_bytes;
1288 u64 cur_alloc_size = 0;
1289 u64 min_alloc_size;
1290 u64 blocksize = fs_info->sectorsize;
1291 struct btrfs_key ins;
1292 struct extent_map *em;
1293 unsigned clear_bits;
1294 unsigned long page_ops;
1295 int ret = 0;
1296
1297 if (btrfs_is_free_space_inode(inode)) {
1298 ret = -EINVAL;
1299 goto out_unlock;
1300 }
1301
1302 num_bytes = ALIGN(end - start + 1, blocksize);
1303 num_bytes = max(blocksize, num_bytes);
1304 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1305
1306 inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1307
1308 if (!no_inline) {
1309 /* lets try to make an inline extent */
1310 ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
1311 BTRFS_COMPRESS_NONE, NULL, false);
1312 if (ret <= 0) {
1313 /*
1314 * We succeeded, return 1 so the caller knows we're done
1315 * with this page and already handled the IO.
1316 *
1317 * If there was an error then cow_file_range_inline() has
1318 * already done the cleanup.
1319 */
1320 if (ret == 0)
1321 ret = 1;
1322 goto done;
1323 }
1324 }
1325
1326 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
1327
1328 /*
1329 * We're not doing compressed IO, don't unlock the first page (which
1330 * the caller expects to stay locked), don't clear any dirty bits and
1331 * don't set any writeback bits.
1332 *
1333 * Do set the Ordered (Private2) bit so we know this page was properly
1334 * setup for writepage.
1335 */
1336 page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1337 page_ops |= PAGE_SET_ORDERED;
1338
1339 /*
1340 * Relocation relies on the relocated extents to have exactly the same
1341 * size as the original extents. Normally writeback for relocation data
1342 * extents follows a NOCOW path because relocation preallocates the
1343 * extents. However, due to an operation such as scrub turning a block
1344 * group to RO mode, it may fallback to COW mode, so we must make sure
1345 * an extent allocated during COW has exactly the requested size and can
1346 * not be split into smaller extents, otherwise relocation breaks and
1347 * fails during the stage where it updates the bytenr of file extent
1348 * items.
1349 */
1350 if (btrfs_is_data_reloc_root(root))
1351 min_alloc_size = num_bytes;
1352 else
1353 min_alloc_size = fs_info->sectorsize;
1354
1355 while (num_bytes > 0) {
1356 struct btrfs_ordered_extent *ordered;
1357 struct btrfs_file_extent file_extent;
1358
1359 ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
1360 min_alloc_size, 0, alloc_hint,
1361 &ins, 1, 1);
1362 if (ret == -EAGAIN) {
1363 /*
1364 * btrfs_reserve_extent only returns -EAGAIN for zoned
1365 * file systems, which is an indication that there are
1366 * no active zones to allocate from at the moment.
1367 *
1368 * If this is the first loop iteration, wait for at
1369 * least one zone to finish before retrying the
1370 * allocation. Otherwise ask the caller to write out
1371 * the already allocated blocks before coming back to
1372 * us, or return -ENOSPC if it can't handle retries.
1373 */
1374 ASSERT(btrfs_is_zoned(fs_info));
1375 if (start == orig_start) {
1376 wait_on_bit_io(&inode->root->fs_info->flags,
1377 BTRFS_FS_NEED_ZONE_FINISH,
1378 TASK_UNINTERRUPTIBLE);
1379 continue;
1380 }
1381 if (done_offset) {
1382 /*
1383 * Move @end to the end of the processed range,
1384 * and exit the loop to unlock the processed extents.
1385 */
1386 end = start - 1;
1387 ret = 0;
1388 break;
1389 }
1390 ret = -ENOSPC;
1391 }
1392 if (ret < 0)
1393 goto out_unlock;
1394 cur_alloc_size = ins.offset;
1395
1396 file_extent.disk_bytenr = ins.objectid;
1397 file_extent.disk_num_bytes = ins.offset;
1398 file_extent.num_bytes = ins.offset;
1399 file_extent.ram_bytes = ins.offset;
1400 file_extent.offset = 0;
1401 file_extent.compression = BTRFS_COMPRESS_NONE;
1402
1403 /*
1404 * Locked range will be released either during error clean up or
1405 * after the whole range is finished.
1406 */
1407 lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
1408 &cached);
1409
1410 em = btrfs_create_io_em(inode, start, &file_extent,
1411 BTRFS_ORDERED_REGULAR);
1412 if (IS_ERR(em)) {
1413 unlock_extent(&inode->io_tree, start,
1414 start + cur_alloc_size - 1, &cached);
1415 ret = PTR_ERR(em);
1416 goto out_reserve;
1417 }
1418 free_extent_map(em);
1419
1420 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1421 1 << BTRFS_ORDERED_REGULAR);
1422 if (IS_ERR(ordered)) {
1423 unlock_extent(&inode->io_tree, start,
1424 start + cur_alloc_size - 1, &cached);
1425 ret = PTR_ERR(ordered);
1426 goto out_drop_extent_cache;
1427 }
1428
1429 if (btrfs_is_data_reloc_root(root)) {
1430 ret = btrfs_reloc_clone_csums(ordered);
1431
1432 /*
1433 * Only drop cache here, and process as normal.
1434 *
1435 * We must not allow extent_clear_unlock_delalloc()
1436 * at out_unlock label to free meta of this ordered
1437 * extent, as its meta should be freed by
1438 * btrfs_finish_ordered_io().
1439 *
1440 * So we must continue until @start is increased to
1441 * skip current ordered extent.
1442 */
1443 if (ret)
1444 btrfs_drop_extent_map_range(inode, start,
1445 start + cur_alloc_size - 1,
1446 false);
1447 }
1448 btrfs_put_ordered_extent(ordered);
1449
1450 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1451
1452 if (num_bytes < cur_alloc_size)
1453 num_bytes = 0;
1454 else
1455 num_bytes -= cur_alloc_size;
1456 alloc_hint = ins.objectid + ins.offset;
1457 start += cur_alloc_size;
1458 cur_alloc_size = 0;
1459
1460 /*
1461 * On btrfs_reloc_clone_csums() error: since start is increased,
1462 * extent_clear_unlock_delalloc() at the out_unlock label won't
1463 * free the metadata of the current ordered extent, so we're OK to exit.
1464 */
1465 if (ret)
1466 goto out_unlock;
1467 }
1468 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
1469 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
1470 done:
1471 if (done_offset)
1472 *done_offset = end;
1473 return ret;
1474
1475 out_drop_extent_cache:
1476 btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
1477 out_reserve:
1478 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1479 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1480 out_unlock:
1481 /*
1482 * Now, we have three regions to clean up:
1483 *
1484 * |-------(1)----|---(2)---|-------------(3)----------|
1485 * `- orig_start `- start `- start + cur_alloc_size `- end
1486 *
1487 * We process each region below.
1488 */
1489
1490 /*
1491 * For the range (1). We have already instantiated the ordered extents
1492 * for this region, thus we need to cleanup those ordered extents.
1493 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
1494 * are also handled by the ordered extents cleanup.
1495 *
1496 * So here we only clear the EXTENT_LOCKED and EXTENT_DELALLOC flags, and
1497 * finish the writeback of the involved folios, which will never be submitted.
1498 */
1499 if (orig_start < start) {
1500 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
1501 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1502
1503 if (!locked_folio)
1504 mapping_set_error(inode->vfs_inode.i_mapping, ret);
1505
1506 btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
1507 extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1508 locked_folio, NULL, clear_bits, page_ops);
1509 }
1510
1511 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1512 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1513 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1514
1515 /*
1516 * For the range (2). If we reserved an extent for our delalloc range
1517 * (or a subrange) and failed to create the respective ordered extent,
1518 * then it means that when we reserved the extent we decremented the
1519 * extent's size from the data space_info's bytes_may_use counter and
1520 * incremented the space_info's bytes_reserved counter by the same
1521 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1522 * to decrement again the data space_info's bytes_may_use counter,
1523 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1524 */
1525 if (cur_alloc_size) {
1526 extent_clear_unlock_delalloc(inode, start,
1527 start + cur_alloc_size - 1,
1528 locked_folio, &cached, clear_bits,
1529 page_ops);
1530 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
1531 }
1532
1533 /*
1534 * For the range (3). We never touched the region. In addition to the
1535 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1536 * space_info's bytes_may_use counter, reserved in
1537 * btrfs_check_data_free_space().
1538 */
1539 if (start + cur_alloc_size < end) {
1540 clear_bits |= EXTENT_CLEAR_DATA_RESV;
1541 extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
1542 end, locked_folio,
1543 &cached, clear_bits, page_ops);
1544 btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
1545 end - start - cur_alloc_size + 1, NULL);
1546 }
1547 btrfs_err_rl(fs_info,
1548 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
1549 __func__, btrfs_root_id(inode->root),
1550 btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
1551 return ret;
1552 }
1553
1554 /*
1555 * Phase two of compressed writeback. This is the ordered portion of the code,
1556 * which only gets called in the order the work was queued. We walk all the
1557 * async extents created by compress_file_range and send them down to the disk.
1558 *
1559 * If called with @do_free == true then it'll try to finish the work and free
1560 * the work struct eventually.
1561 */
1562 static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
1563 {
1564 struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1565 work);
1566 struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1567 struct async_extent *async_extent;
1568 unsigned long nr_pages;
1569 u64 alloc_hint = 0;
1570
1571 if (do_free) {
1572 struct async_cow *async_cow;
1573
1574 btrfs_add_delayed_iput(async_chunk->inode);
1575 if (async_chunk->blkcg_css)
1576 css_put(async_chunk->blkcg_css);
1577
1578 async_cow = async_chunk->async_cow;
1579 if (atomic_dec_and_test(&async_cow->num_chunks))
1580 kvfree(async_cow);
1581 return;
1582 }
1583
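	/*
	 * Number of pages covered by this chunk, used below to throttle the
	 * amount of in-flight async delalloc pages.
	 */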
1584 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1585 PAGE_SHIFT;
1586
1587 while (!list_empty(&async_chunk->extents)) {
1588 async_extent = list_entry(async_chunk->extents.next,
1589 struct async_extent, list);
1590 list_del(&async_extent->list);
1591 submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1592 }
1593
1594 /* atomic_sub_return implies a barrier */
1595 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1596 5 * SZ_1M)
1597 cond_wake_up_nomb(&fs_info->async_submit_wait);
1598 }
1599
1600 static bool run_delalloc_compressed(struct btrfs_inode *inode,
1601 struct folio *locked_folio, u64 start,
1602 u64 end, struct writeback_control *wbc)
1603 {
1604 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1605 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1606 struct async_cow *ctx;
1607 struct async_chunk *async_chunk;
1608 unsigned long nr_pages;
1609 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1610 int i;
1611 unsigned nofs_flag;
1612 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1613
1614 nofs_flag = memalloc_nofs_save();
1615 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1616 memalloc_nofs_restore(nofs_flag);
1617 if (!ctx)
1618 return false;
1619
1620 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1621
1622 async_chunk = ctx->chunks;
1623 atomic_set(&ctx->num_chunks, num_chunks);
1624
1625 for (i = 0; i < num_chunks; i++) {
1626 u64 cur_end = min(end, start + SZ_512K - 1);
1627
1628 /*
1629 * igrab is called higher up in the call chain, take only the
1630 * lightweight reference for the callback lifetime
1631 */
1632 ihold(&inode->vfs_inode);
1633 async_chunk[i].async_cow = ctx;
1634 async_chunk[i].inode = inode;
1635 async_chunk[i].start = start;
1636 async_chunk[i].end = cur_end;
1637 async_chunk[i].write_flags = write_flags;
1638 INIT_LIST_HEAD(&async_chunk[i].extents);
1639
1640 /*
1641 * The locked_folio comes all the way from writepage and it's
1642 * the original folio we were actually given. As we spread
1643 * this large delalloc region across multiple async_chunk
1644 * structs, only the first struct needs a pointer to
1645 * locked_folio.
1646 *
1647 * This way we don't need racy decisions about who is supposed
1648 * to unlock it.
1649 */
1650 if (locked_folio) {
1651 /*
1652 * Depending on the compressibility, the pages might or
1653 * might not go through async. We want all of them to
1654 * be accounted against wbc once. Let's do it here
1655 * before the paths diverge. wbc accounting is used
1656 * only for foreign writeback detection and doesn't
1657 * need full accuracy. Just account the whole thing
1658 * against the first page.
1659 */
1660 wbc_account_cgroup_owner(wbc, locked_folio,
1661 cur_end - start);
1662 async_chunk[i].locked_folio = locked_folio;
1663 locked_folio = NULL;
1664 } else {
1665 async_chunk[i].locked_folio = NULL;
1666 }
1667
1668 if (blkcg_css != blkcg_root_css) {
1669 css_get(blkcg_css);
1670 async_chunk[i].blkcg_css = blkcg_css;
1671 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1672 } else {
1673 async_chunk[i].blkcg_css = NULL;
1674 }
1675
1676 btrfs_init_work(&async_chunk[i].work, compress_file_range,
1677 submit_compressed_extents);
1678
1679 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1680 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1681
1682 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1683
1684 start = cur_end + 1;
1685 }
1686 return true;
1687 }
1688
1689 /*
1690 * Run the delalloc range from start to end, and write back any dirty pages
1691 * covered by the range.
1692 */
1693 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1694 struct folio *locked_folio, u64 start,
1695 u64 end, struct writeback_control *wbc,
1696 bool pages_dirty)
1697 {
1698 u64 done_offset = end;
1699 int ret;
1700
1701 while (start <= end) {
1702 ret = cow_file_range(inode, locked_folio, start, end,
1703 &done_offset, true, false);
1704 if (ret)
1705 return ret;
1706 extent_write_locked_range(&inode->vfs_inode, locked_folio,
1707 start, done_offset, wbc, pages_dirty);
1708 start = done_offset + 1;
1709 }
1710
1711 return 1;
1712 }
1713
1714 static int fallback_to_cow(struct btrfs_inode *inode,
1715 struct folio *locked_folio, const u64 start,
1716 const u64 end)
1717 {
1718 const bool is_space_ino = btrfs_is_free_space_inode(inode);
1719 const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1720 const u64 range_bytes = end + 1 - start;
1721 struct extent_io_tree *io_tree = &inode->io_tree;
1722 struct extent_state *cached_state = NULL;
1723 u64 range_start = start;
1724 u64 count;
1725 int ret;
1726
1727 /*
1728 * If EXTENT_NORESERVE is set it means that when the buffered write was
1729 * made we did not have enough available data space and therefore we did
1730 * not reserve data space for it, since we thought we could do NOCOW for
1731 * the respective file range (either there is a prealloc extent or the
1732 * inode has the NOCOW bit set).
1733 *
1734 * However when we need to fall back to COW mode (because for example the
1735 * block group for the corresponding extent was turned to RO mode by a
1736 * scrub or relocation) we need to do the following:
1737 *
1738 * 1) We increment the bytes_may_use counter of the data space info.
1739 * If COW succeeds, it allocates a new data extent and after doing
1740 * that it decrements the space info's bytes_may_use counter and
1741 * increments its bytes_reserved counter by the same amount (we do
1742 * this at btrfs_add_reserved_bytes()). So we need to increment the
1743 * bytes_may_use counter to compensate (when space is reserved at
1744 * buffered write time, the bytes_may_use counter is incremented);
1745 *
1746 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1747 * that if the COW path fails for any reason, it decrements (through
1748 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1749 * data space info, which we incremented in the step above.
1750 *
1751 * If we need to fall back to COW and the inode corresponds to a free
1752 * space cache inode or an inode of the data relocation tree, we must
1753 * also increment bytes_may_use of the data space_info for the same
1754 * reason. Space caches and relocated data extents always get a prealloc
1755 * extent for them, however scrub or balance may have set the block
1756 * group that contains that extent to RO mode and therefore force COW
1757 * when starting writeback.
1758 */
1759 lock_extent(io_tree, start, end, &cached_state);
1760 count = count_range_bits(io_tree, &range_start, end, range_bytes,
1761 EXTENT_NORESERVE, 0, NULL);
1762 if (count > 0 || is_space_ino || is_reloc_ino) {
1763 u64 bytes = count;
1764 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1765 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1766
1767 if (is_space_ino || is_reloc_ino)
1768 bytes = range_bytes;
1769
1770 spin_lock(&sinfo->lock);
1771 btrfs_space_info_update_bytes_may_use(sinfo, bytes);
1772 spin_unlock(&sinfo->lock);
1773
1774 if (count > 0)
1775 clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1776 NULL);
1777 }
1778 unlock_extent(io_tree, start, end, &cached_state);
1779
1780 /*
1781 * Don't try to create inline extents, as mixing an inline extent that
1782 * is written out and unlocked directly with a regular NOCOW extent
1783 * doesn't work.
1784 */
1785 ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
1786 true);
1787 ASSERT(ret != 1);
1788 return ret;
1789 }
1790
1791 struct can_nocow_file_extent_args {
1792 /* Input fields. */
1793
1794 /* Start file offset of the range we want to NOCOW. */
1795 u64 start;
1796 /* End file offset (inclusive) of the range we want to NOCOW. */
1797 u64 end;
1798 bool writeback_path;
1799 /*
1800 * Free the path passed to can_nocow_file_extent() once it's not needed
1801 * anymore.
1802 */
1803 bool free_path;
1804
1805 /*
1806 * Output fields. Only set when can_nocow_file_extent() returns 1.
1807 * The expected file extent for the NOCOW write.
1808 */
1809 struct btrfs_file_extent file_extent;
1810 };
1811
1812 /*
1813 * Check if we can NOCOW the file extent that the path points to.
1814 * This function may return with the path released, so the caller should check
1815 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1816 *
1817 * Returns: < 0 on error
1818 * 0 if we can not NOCOW
1819 * 1 if we can NOCOW
1820 */
1821 static int can_nocow_file_extent(struct btrfs_path *path,
1822 struct btrfs_key *key,
1823 struct btrfs_inode *inode,
1824 struct can_nocow_file_extent_args *args)
1825 {
1826 const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1827 struct extent_buffer *leaf = path->nodes[0];
1828 struct btrfs_root *root = inode->root;
1829 struct btrfs_file_extent_item *fi;
1830 struct btrfs_root *csum_root;
1831 u64 io_start;
1832 u64 extent_end;
1833 u8 extent_type;
1834 int can_nocow = 0;
1835 int ret = 0;
1836 bool nowait = path->nowait;
1837
1838 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1839 extent_type = btrfs_file_extent_type(leaf, fi);
1840
1841 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1842 goto out;
1843
1844 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1845 extent_type == BTRFS_FILE_EXTENT_REG)
1846 goto out;
1847
1848 /*
1849 * If the extent was created before the generation where the last snapshot
1850 * for its subvolume was created, then this implies the extent is shared,
1851 * hence we must COW.
1852 */
1853 if (btrfs_file_extent_generation(leaf, fi) <=
1854 btrfs_root_last_snapshot(&root->root_item))
1855 goto out;
1856
1857 /* An explicit hole, must COW. */
1858 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
1859 goto out;
1860
1861 /* Compressed/encrypted/encoded extents must be COWed. */
1862 if (btrfs_file_extent_compression(leaf, fi) ||
1863 btrfs_file_extent_encryption(leaf, fi) ||
1864 btrfs_file_extent_other_encoding(leaf, fi))
1865 goto out;
1866
1867 extent_end = btrfs_file_extent_end(path);
1868
1869 args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1870 args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1871 args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1872 args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
1873 args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
1874
1875 /*
1876 * The following checks can be expensive, as they need to take other
1877 * locks and do btree or rbtree searches, so release the path to avoid
1878 * blocking other tasks for too long.
1879 */
1880 btrfs_release_path(path);
1881
1882 ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
1883 args->file_extent.disk_bytenr, path);
1884 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1885 if (ret != 0)
1886 goto out;
1887
1888 if (args->free_path) {
1889 /*
1890 * We don't need the path anymore, plus through the
1891 * btrfs_lookup_csums_list() call below we will end up allocating
1892 * another path. So free the path to avoid unnecessary extra
1893 * memory usage.
1894 */
1895 btrfs_free_path(path);
1896 path = NULL;
1897 }
1898
1899 /* If there are pending snapshots for this root, we must COW. */
1900 if (args->writeback_path && !is_freespace_inode &&
1901 atomic_read(&root->snapshot_force_cow))
1902 goto out;
1903
1904 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
1905 args->file_extent.offset += args->start - key->offset;
1906 io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
1907
1908 /*
1909 * Force COW if csums exist in the range. This ensures that csums for a
1910 * given extent are either valid or do not exist.
1911 */
1912
1913 csum_root = btrfs_csum_root(root->fs_info, io_start);
1914 ret = btrfs_lookup_csums_list(csum_root, io_start,
1915 io_start + args->file_extent.num_bytes - 1,
1916 NULL, nowait);
1917 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1918 if (ret != 0)
1919 goto out;
1920
1921 can_nocow = 1;
1922 out:
1923 if (args->free_path && path)
1924 btrfs_free_path(path);
1925
1926 return ret < 0 ? ret : can_nocow;
1927 }
1928
1929 /*
1930 * Cleanup the dirty folios which will never be submitted due to error.
1931 *
1932 * When running a delalloc range, we may need to split the ranges (due to
1933 * fragmentation or NOCOW). If we hit an error in the later part, we will error
1934 * out and the previously processed range will never be submitted, thus
1935 * we have to cleanup those folios by clearing their dirty flag, starting and
1936 * finishing the writeback.
1937 */
1938 static void cleanup_dirty_folios(struct btrfs_inode *inode,
1939 struct folio *locked_folio,
1940 u64 start, u64 end, int error)
1941 {
1942 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1943 struct address_space *mapping = inode->vfs_inode.i_mapping;
1944 pgoff_t start_index = start >> PAGE_SHIFT;
1945 pgoff_t end_index = end >> PAGE_SHIFT;
1946 u32 len;
1947
1948 ASSERT(end + 1 - start < U32_MAX);
1949 ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
1950 IS_ALIGNED(end + 1, fs_info->sectorsize));
1951 len = end + 1 - start;
1952
1953 /*
1954 * Handle the locked folio first.
1955 * The btrfs_folio_clamp_*() helpers can handle ranges outside of the folio.
1956 */
1957 btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
1958
1959 for (pgoff_t index = start_index; index <= end_index; index++) {
1960 struct folio *folio;
1961
1962 /* Already handled at the beginning. */
1963 if (index == locked_folio->index)
1964 continue;
1965 folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
1966 /* Cache already dropped, no need to do any cleanup. */
1967 if (IS_ERR(folio))
1968 continue;
1969 btrfs_folio_clamp_finish_io(fs_info, folio, start, len);
1970 folio_unlock(folio);
1971 folio_put(folio);
1972 }
1973 mapping_set_error(mapping, error);
1974 }
1975
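/*
 * Run NOCOW writeback for a single contiguous range starting at @file_pos.
 *
 * This creates the ordered extent (and, for preallocated extents, the io
 * extent map), clones csums when writing for the data relocation root, and
 * clears the delalloc and locked bits for the range, unlocking the folios.
 */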
1976 static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
1977 struct extent_state **cached,
1978 struct can_nocow_file_extent_args *nocow_args,
1979 u64 file_pos, bool is_prealloc)
1980 {
1981 struct btrfs_ordered_extent *ordered;
1982 u64 len = nocow_args->file_extent.num_bytes;
1983 u64 end = file_pos + len - 1;
1984 int ret = 0;
1985
1986 lock_extent(&inode->io_tree, file_pos, end, cached);
1987
1988 if (is_prealloc) {
1989 struct extent_map *em;
1990
1991 em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
1992 BTRFS_ORDERED_PREALLOC);
1993 if (IS_ERR(em)) {
1994 unlock_extent(&inode->io_tree, file_pos, end, cached);
1995 return PTR_ERR(em);
1996 }
1997 free_extent_map(em);
1998 }
1999
2000 ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
2001 is_prealloc
2002 ? (1 << BTRFS_ORDERED_PREALLOC)
2003 : (1 << BTRFS_ORDERED_NOCOW));
2004 if (IS_ERR(ordered)) {
2005 if (is_prealloc)
2006 btrfs_drop_extent_map_range(inode, file_pos, end, false);
2007 unlock_extent(&inode->io_tree, file_pos, end, cached);
2008 return PTR_ERR(ordered);
2009 }
2010
2011 if (btrfs_is_data_reloc_root(inode->root))
2012 /*
2013 * Errors are handled later, as we must prevent
2014 * extent_clear_unlock_delalloc() in the error handler from freeing
2015 * metadata of the created ordered extent.
2016 */
2017 ret = btrfs_reloc_clone_csums(ordered);
2018 btrfs_put_ordered_extent(ordered);
2019
2020 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
2021 EXTENT_LOCKED | EXTENT_DELALLOC |
2022 EXTENT_CLEAR_DATA_RESV,
2023 PAGE_UNLOCK | PAGE_SET_ORDERED);
2024 /*
2025 * On error, we need to cleanup the ordered extents we created.
2026 *
2027 * We do not clear the folio Dirty flags because they are set and
2028 * cleared by the caller.
2029 */
2030 if (ret < 0)
2031 btrfs_cleanup_ordered_extents(inode, file_pos, end);
2032 return ret;
2033 }
2034
2035 /*
2036 * Called during NOCOW writeback. This checks for snapshots or COW copies
2037 * of the extents that exist in the file, and COWs the file as required.
2038 *
2039 * If no cow copies or snapshots exist, we write directly to the existing
2040 * blocks on disk
2041 */
2042 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
2043 struct folio *locked_folio,
2044 const u64 start, const u64 end)
2045 {
2046 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2047 struct btrfs_root *root = inode->root;
2048 struct btrfs_path *path;
2049 u64 cow_start = (u64)-1;
2050 /*
2051 * If not 0, represents the inclusive end of the last fallback_to_cow()
2052 * range. Only for error handling.
2053 */
2054 u64 cow_end = 0;
2055 u64 cur_offset = start;
2056 int ret;
2057 bool check_prev = true;
2058 u64 ino = btrfs_ino(inode);
2059 struct can_nocow_file_extent_args nocow_args = { 0 };
2060
2061 /*
2062 * Normally on a zoned device we're only doing COW writes, but relocation
2063 * on a zoned filesystem serializes I/O so that we're only writing
2064 * sequentially and can end up here as well.
2065 */
2066 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
2067
2068 path = btrfs_alloc_path();
2069 if (!path) {
2070 ret = -ENOMEM;
2071 goto error;
2072 }
2073
2074 nocow_args.end = end;
2075 nocow_args.writeback_path = true;
2076
2077 while (cur_offset <= end) {
2078 struct btrfs_block_group *nocow_bg = NULL;
2079 struct btrfs_key found_key;
2080 struct btrfs_file_extent_item *fi;
2081 struct extent_buffer *leaf;
2082 struct extent_state *cached_state = NULL;
2083 u64 extent_end;
2084 int extent_type;
2085
2086 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2087 cur_offset, 0);
2088 if (ret < 0)
2089 goto error;
2090
2091 /*
2092 * If there is no extent for our range when doing the initial
2093 * search, then go back to the previous slot as it will be the
2094 * one containing the search offset
2095 */
2096 if (ret > 0 && path->slots[0] > 0 && check_prev) {
2097 leaf = path->nodes[0];
2098 btrfs_item_key_to_cpu(leaf, &found_key,
2099 path->slots[0] - 1);
2100 if (found_key.objectid == ino &&
2101 found_key.type == BTRFS_EXTENT_DATA_KEY)
2102 path->slots[0]--;
2103 }
2104 check_prev = false;
2105 next_slot:
2106 /* Go to next leaf if we have exhausted the current one */
2107 leaf = path->nodes[0];
2108 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2109 ret = btrfs_next_leaf(root, path);
2110 if (ret < 0)
2111 goto error;
2112 if (ret > 0)
2113 break;
2114 leaf = path->nodes[0];
2115 }
2116
2117 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2118
2119 /* Didn't find anything for our INO */
2120 if (found_key.objectid > ino)
2121 break;
2122 /*
2123 * Keep searching until we find an EXTENT_DATA item or there are no
2124 * more extents for this inode
2125 */
2126 if (WARN_ON_ONCE(found_key.objectid < ino) ||
2127 found_key.type < BTRFS_EXTENT_DATA_KEY) {
2128 path->slots[0]++;
2129 goto next_slot;
2130 }
2131
2132 /* Found key is not EXTENT_DATA_KEY or starts after req range */
2133 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2134 found_key.offset > end)
2135 break;
2136
2137 /*
2138 * If the found extent starts after requested offset, then
2139 * adjust cur_offset to be right before this extent begins.
2140 */
2141 if (found_key.offset > cur_offset) {
2142 if (cow_start == (u64)-1)
2143 cow_start = cur_offset;
2144 cur_offset = found_key.offset;
2145 goto next_slot;
2146 }
2147
2148 /*
2149 * Found extent which begins before our range and potentially
2150 * intersects it
2151 */
2152 fi = btrfs_item_ptr(leaf, path->slots[0],
2153 struct btrfs_file_extent_item);
2154 extent_type = btrfs_file_extent_type(leaf, fi);
2155 /* If this is triggered then we have a memory corruption. */
2156 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2157 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2158 ret = -EUCLEAN;
2159 goto error;
2160 }
2161 extent_end = btrfs_file_extent_end(path);
2162
2163 /*
2164 * If the extent we got ends before our current offset, skip to
2165 * the next extent.
2166 */
2167 if (extent_end <= cur_offset) {
2168 path->slots[0]++;
2169 goto next_slot;
2170 }
2171
2172 nocow_args.start = cur_offset;
2173 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2174 if (ret < 0)
2175 goto error;
2176 if (ret == 0)
2177 goto must_cow;
2178
2179 ret = 0;
2180 nocow_bg = btrfs_inc_nocow_writers(fs_info,
2181 nocow_args.file_extent.disk_bytenr +
2182 nocow_args.file_extent.offset);
2183 if (!nocow_bg) {
2184 must_cow:
2185 /*
2186 * If we can't perform NOCOW writeback for the range,
2187 * then record the beginning of the range that needs to
2188 * be COWed. It will be written out before the next
2189 * NOCOW range if we find one, or when exiting this
2190 * loop.
2191 */
2192 if (cow_start == (u64)-1)
2193 cow_start = cur_offset;
2194 cur_offset = extent_end;
2195 if (cur_offset > end)
2196 break;
2197 if (!path->nodes[0])
2198 continue;
2199 path->slots[0]++;
2200 goto next_slot;
2201 }
2202
2203 /*
2204 * COW the range from cow_start to found_key.offset - 1, as the key
2205 * contains the beginning of the first extent that can be NOCOW'ed,
2206 * which follows a range that needs to be COW'ed.
2207 */
2208 if (cow_start != (u64)-1) {
2209 ret = fallback_to_cow(inode, locked_folio, cow_start,
2210 found_key.offset - 1);
2211 if (ret) {
2212 cow_end = found_key.offset - 1;
2213 btrfs_dec_nocow_writers(nocow_bg);
2214 goto error;
2215 }
2216 cow_start = (u64)-1;
2217 }
2218
2219 ret = nocow_one_range(inode, locked_folio, &cached_state,
2220 &nocow_args, cur_offset,
2221 extent_type == BTRFS_FILE_EXTENT_PREALLOC);
2222 btrfs_dec_nocow_writers(nocow_bg);
2223 if (ret < 0)
2224 goto error;
2225 cur_offset = extent_end;
2226 }
2227 btrfs_release_path(path);
2228
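/*
 * If the last extent we processed ended before @end, or we never found a
 * NOCOW candidate at all, COW the remaining tail of the range.
 */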
2229 if (cur_offset <= end && cow_start == (u64)-1)
2230 cow_start = cur_offset;
2231
2232 if (cow_start != (u64)-1) {
2233 ret = fallback_to_cow(inode, locked_folio, cow_start, end);
2234 if (ret) {
2235 cow_end = end;
2236 goto error;
2237 }
2238 cow_start = (u64)-1;
2239 }
2240
2241 btrfs_free_path(path);
2242 return 0;
2243
2244 error:
2245 /*
2246 * There are several error cases:
2247 *
2248 * 1) Failed without falling back to COW
2249 * start cur_offset end
2250 * |/////////////| |
2251 *
2252 * In this case, cow_start should be (u64)-1.
2253 *
2254 * For range [start, cur_offset) the folios are already unlocked (except
2255 * @locked_folio), EXTENT_DELALLOC already removed.
2256 * Need to clear the dirty flags and finish the ordered extents.
2257 *
2258 * 2) Failed with error before calling fallback_to_cow()
2259 *
2260 * start cow_start end
2261 * |/////////////| |
2262 *
2263 * In this case, only @cow_start is set, @cur_offset is between
2264 * [cow_start, end)
2265 *
2266 * It's mostly the same as case 1), just replace @cur_offset with
2267 * @cow_start.
2268 *
2269 * 3) Failed with error from fallback_to_cow()
2270 *
2271 * start cow_start cow_end end
2272 * |/////////////|-----------| |
2273 *
2274 * In this case, both @cow_start and @cow_end are set.
2275 *
2276 * For range [start, cow_start) it's the same as case 1).
2277 * But for range [cow_start, cow_end), all the cleanup is handled by
2278 * cow_file_range(), we should not touch anything in that range.
2279 *
2280 * So for all above cases, if @cow_start is set, cleanup ordered extents
2281 * for range [start, @cow_start), otherwise cleanup range [start, @cur_offset).
2282 */
2283 if (cow_start != (u64)-1)
2284 cur_offset = cow_start;
2285
2286 if (cur_offset > start) {
2287 btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
2288 cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
2289 }
2290
2291 /*
2292 * If an error happened while a COW region is outstanding, cur_offset
2293 * needs to be reset to @cow_end + 1 to skip the COW range, as
2294 * cow_file_range() will do the proper cleanup on error.
2295 */
2296 if (cow_end)
2297 cur_offset = cow_end + 1;
2298
2299 /*
2300 * We need to lock the extent here because we're clearing DELALLOC and
2301 * we're not locked at this point.
2302 */
2303 if (cur_offset < end) {
2304 struct extent_state *cached = NULL;
2305
2306 lock_extent(&inode->io_tree, cur_offset, end, &cached);
2307 extent_clear_unlock_delalloc(inode, cur_offset, end,
2308 locked_folio, &cached,
2309 EXTENT_LOCKED | EXTENT_DELALLOC |
2310 EXTENT_DEFRAG |
2311 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2312 PAGE_START_WRITEBACK |
2313 PAGE_END_WRITEBACK);
2314 btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
2315 }
2316 btrfs_free_path(path);
2317 btrfs_err_rl(fs_info,
2318 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
2319 __func__, btrfs_root_id(inode->root),
2320 btrfs_ino(inode), start, end + 1 - start, ret);
2321 return ret;
2322 }
2323
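/*
 * Return true if the delalloc range should go through the NOCOW path, that
 * is, the inode has the NODATACOW or PREALLOC flag set and the range is not
 * marked for defrag (defragged ranges must always be COWed).
 */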
2324 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2325 {
2326 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2327 if (inode->defrag_bytes &&
2328 test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
2329 return false;
2330 return true;
2331 }
2332 return false;
2333 }
2334
2335 /*
2336 * Function to process delayed allocation (create CoW) for ranges which are
2337 * being touched for the first time.
2338 */
2339 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
2340 u64 start, u64 end, struct writeback_control *wbc)
2341 {
2342 const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2343 int ret;
2344
2345 /*
2346 * The range must cover part of the @locked_folio, or a return of 1
2347 * can confuse the caller.
2348 */
2349 ASSERT(!(end <= folio_pos(locked_folio) ||
2350 start >= folio_pos(locked_folio) + folio_size(locked_folio)));
2351
2352 if (should_nocow(inode, start, end)) {
2353 ret = run_delalloc_nocow(inode, locked_folio, start, end);
2354 return ret;
2355 }
2356
2357 if (btrfs_inode_can_compress(inode) &&
2358 inode_need_compress(inode, start, end) &&
2359 run_delalloc_compressed(inode, locked_folio, start, end, wbc))
2360 return 1;
2361
2362 if (zoned)
2363 ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
2364 true);
2365 else
2366 ret = cow_file_range(inode, locked_folio, start, end, NULL,
2367 false, false);
2368 return ret;
2369 }
2370
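/*
 * Handle a delalloc extent state being split. If splitting the range makes
 * it need more outstanding extents for metadata reservation purposes than
 * the original range did, account for the extra outstanding extent here.
 */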
2371 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2372 struct extent_state *orig, u64 split)
2373 {
2374 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2375 u64 size;
2376
2377 lockdep_assert_held(&inode->io_tree.lock);
2378
2379 /* not delalloc, ignore it */
2380 if (!(orig->state & EXTENT_DELALLOC))
2381 return;
2382
2383 size = orig->end - orig->start + 1;
2384 if (size > fs_info->max_extent_size) {
2385 u32 num_extents;
2386 u64 new_size;
2387
2388 /*
2389 * See the explanation in btrfs_merge_delalloc_extent, the same
2390 * applies here, just in reverse.
2391 */
2392 new_size = orig->end - split + 1;
2393 num_extents = count_max_extents(fs_info, new_size);
2394 new_size = split - orig->start;
2395 num_extents += count_max_extents(fs_info, new_size);
2396 if (count_max_extents(fs_info, size) >= num_extents)
2397 return;
2398 }
2399
2400 spin_lock(&inode->lock);
2401 btrfs_mod_outstanding_extents(inode, 1);
2402 spin_unlock(&inode->lock);
2403 }
2404
2405 /*
2406 * Handle merged delayed allocation extents so we can keep track of new extents
2407 * that are just merged onto old extents, such as when we are doing sequential
2408 * writes, so we can properly account for the metadata space we'll need.
2409 */
2410 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2411 struct extent_state *other)
2412 {
2413 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2414 u64 new_size, old_size;
2415 u32 num_extents;
2416
2417 lockdep_assert_held(&inode->io_tree.lock);
2418
2419 /* not delalloc, ignore it */
2420 if (!(other->state & EXTENT_DELALLOC))
2421 return;
2422
2423 if (new->start > other->start)
2424 new_size = new->end - other->start + 1;
2425 else
2426 new_size = other->end - new->start + 1;
2427
2428 /* we're not bigger than the max, unreserve the space and go */
2429 if (new_size <= fs_info->max_extent_size) {
2430 spin_lock(&inode->lock);
2431 btrfs_mod_outstanding_extents(inode, -1);
2432 spin_unlock(&inode->lock);
2433 return;
2434 }
2435
2436 /*
2437 * We have to add up either side to figure out how many extents were
2438 * accounted for before we merged into one big extent. If the number of
2439 * extents we accounted for is <= the amount we need for the new range
2440 * then we can return, otherwise drop. Think of it like this
2441 *
2442 * [ 4k][MAX_SIZE]
2443 *
2444 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2445 * need 2 outstanding extents, on one side we have 1 and the other side
2446 * we have 1 so they are == and we can return. But in this case
2447 *
2448 * [MAX_SIZE+4k][MAX_SIZE+4k]
2449 *
2450 * Each range on their own accounts for 2 extents, but merged together
2451 * they are only 3 extents worth of accounting, so we need to drop in
2452 * this case.
2453 */
2454 old_size = other->end - other->start + 1;
2455 num_extents = count_max_extents(fs_info, old_size);
2456 old_size = new->end - new->start + 1;
2457 num_extents += count_max_extents(fs_info, old_size);
2458 if (count_max_extents(fs_info, new_size) >= num_extents)
2459 return;
2460
2461 spin_lock(&inode->lock);
2462 btrfs_mod_outstanding_extents(inode, -1);
2463 spin_unlock(&inode->lock);
2464 }
2465
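/*
 * Add the inode to its root's list of inodes with pending delalloc, and add
 * the root to the fs-wide list of delalloc roots if this is the root's first
 * such inode.
 */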
2466 static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
2467 {
2468 struct btrfs_root *root = inode->root;
2469 struct btrfs_fs_info *fs_info = root->fs_info;
2470
2471 spin_lock(&root->delalloc_lock);
2472 ASSERT(list_empty(&inode->delalloc_inodes));
2473 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2474 root->nr_delalloc_inodes++;
2475 if (root->nr_delalloc_inodes == 1) {
2476 spin_lock(&fs_info->delalloc_root_lock);
2477 ASSERT(list_empty(&root->delalloc_root));
2478 list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
2479 spin_unlock(&fs_info->delalloc_root_lock);
2480 }
2481 spin_unlock(&root->delalloc_lock);
2482 }
2483
2484 void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
2485 {
2486 struct btrfs_root *root = inode->root;
2487 struct btrfs_fs_info *fs_info = root->fs_info;
2488
2489 lockdep_assert_held(&root->delalloc_lock);
2490
2491 /*
2492 * We may be called after the inode was already deleted from the list,
2493 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
2494 * and then later through btrfs_clear_delalloc_extent() while the inode
2495 * still has ->delalloc_bytes > 0.
2496 */
2497 if (!list_empty(&inode->delalloc_inodes)) {
2498 list_del_init(&inode->delalloc_inodes);
2499 root->nr_delalloc_inodes--;
2500 if (!root->nr_delalloc_inodes) {
2501 ASSERT(list_empty(&root->delalloc_inodes));
2502 spin_lock(&fs_info->delalloc_root_lock);
2503 ASSERT(!list_empty(&root->delalloc_root));
2504 list_del_init(&root->delalloc_root);
2505 spin_unlock(&fs_info->delalloc_root_lock);
2506 }
2507 }
2508 }
2509
2510 /*
2511 * Properly track delayed allocation bytes in the inode and maintain the
2512 * list of inodes that have pending delalloc work to be done.
2513 */
2514 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2515 u32 bits)
2516 {
2517 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2518
2519 lockdep_assert_held(&inode->io_tree.lock);
2520
2521 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2522 WARN_ON(1);
2523 /*
2524 * set_bit and clear bit hooks normally require _irqsave/restore
2525 * but in this case, we are only testing for the DELALLOC
2526 * bit, which is only set or cleared with irqs on
2527 */
2528 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2529 u64 len = state->end + 1 - state->start;
2530 u64 prev_delalloc_bytes;
2531 u32 num_extents = count_max_extents(fs_info, len);
2532
2533 spin_lock(&inode->lock);
2534 btrfs_mod_outstanding_extents(inode, num_extents);
2535 spin_unlock(&inode->lock);
2536
2537 /* For sanity tests */
2538 if (btrfs_is_testing(fs_info))
2539 return;
2540
2541 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2542 fs_info->delalloc_batch);
2543 spin_lock(&inode->lock);
2544 prev_delalloc_bytes = inode->delalloc_bytes;
2545 inode->delalloc_bytes += len;
2546 if (bits & EXTENT_DEFRAG)
2547 inode->defrag_bytes += len;
2548 spin_unlock(&inode->lock);
2549
2550 /*
2551 * We don't need to be under the protection of the inode's lock,
2552 * because we are called while holding the inode's io_tree lock
2553 * and are therefore protected against concurrent calls of this
2554 * function and btrfs_clear_delalloc_extent().
2555 */
2556 if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
2557 btrfs_add_delalloc_inode(inode);
2558 }
2559
2560 if (!(state->state & EXTENT_DELALLOC_NEW) &&
2561 (bits & EXTENT_DELALLOC_NEW)) {
2562 spin_lock(&inode->lock);
2563 inode->new_delalloc_bytes += state->end + 1 - state->start;
2564 spin_unlock(&inode->lock);
2565 }
2566 }
2567
2568 /*
2569 * Once a range is no longer delalloc this function ensures that proper
2570 * accounting happens.
2571 */
2572 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2573 struct extent_state *state, u32 bits)
2574 {
2575 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2576 u64 len = state->end + 1 - state->start;
2577 u32 num_extents = count_max_extents(fs_info, len);
2578
2579 lockdep_assert_held(&inode->io_tree.lock);
2580
2581 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2582 spin_lock(&inode->lock);
2583 inode->defrag_bytes -= len;
2584 spin_unlock(&inode->lock);
2585 }
2586
2587 /*
2588 * set_bit and clear bit hooks normally require _irqsave/restore
2589 * but in this case, we are only testing for the DELALLOC
2590 * bit, which is only set or cleared with irqs on
2591 */
2592 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2593 struct btrfs_root *root = inode->root;
2594 u64 new_delalloc_bytes;
2595
2596 spin_lock(&inode->lock);
2597 btrfs_mod_outstanding_extents(inode, -num_extents);
2598 spin_unlock(&inode->lock);
2599
2600 /*
2601 * We don't reserve metadata space for space cache inodes so we
2602 * don't need to call delalloc_release_metadata if there is an
2603 * error.
2604 */
2605 if (bits & EXTENT_CLEAR_META_RESV &&
2606 root != fs_info->tree_root)
2607 btrfs_delalloc_release_metadata(inode, len, true);
2608
2609 /* For sanity tests. */
2610 if (btrfs_is_testing(fs_info))
2611 return;
2612
2613 if (!btrfs_is_data_reloc_root(root) &&
2614 !btrfs_is_free_space_inode(inode) &&
2615 !(state->state & EXTENT_NORESERVE) &&
2616 (bits & EXTENT_CLEAR_DATA_RESV))
2617 btrfs_free_reserved_data_space_noquota(fs_info, len);
2618
2619 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2620 fs_info->delalloc_batch);
2621 spin_lock(&inode->lock);
2622 inode->delalloc_bytes -= len;
2623 new_delalloc_bytes = inode->delalloc_bytes;
2624 spin_unlock(&inode->lock);
2625
2626 /*
2627 * We don't need to be under the protection of the inode's lock,
2628 * because we are called while holding the inode's io_tree lock
2629 * and are therefore protected against concurrent calls of this
2630 * function and btrfs_set_delalloc_extent().
2631 */
2632 if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
2633 spin_lock(&root->delalloc_lock);
2634 btrfs_del_delalloc_inode(inode);
2635 spin_unlock(&root->delalloc_lock);
2636 }
2637 }
2638
2639 if ((state->state & EXTENT_DELALLOC_NEW) &&
2640 (bits & EXTENT_DELALLOC_NEW)) {
2641 spin_lock(&inode->lock);
2642 ASSERT(inode->new_delalloc_bytes >= len);
2643 inode->new_delalloc_bytes -= len;
2644 if (bits & EXTENT_ADD_INODE_BYTES)
2645 inode_add_bytes(&inode->vfs_inode, len);
2646 spin_unlock(&inode->lock);
2647 }
2648 }
2649
2650 /*
2651 * Given a list of ordered sums, record them in the inode. This happens
2652 * at IO completion time based on sums calculated at bio submission time.
2653 */
2654 static int add_pending_csums(struct btrfs_trans_handle *trans,
2655 struct list_head *list)
2656 {
2657 struct btrfs_ordered_sum *sum;
2658 struct btrfs_root *csum_root = NULL;
2659 int ret;
2660
2661 list_for_each_entry(sum, list, list) {
2662 trans->adding_csums = true;
2663 if (!csum_root)
2664 csum_root = btrfs_csum_root(trans->fs_info,
2665 sum->logical);
2666 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2667 trans->adding_csums = false;
2668 if (ret)
2669 return ret;
2670 }
2671 return 0;
2672 }
2673
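/*
 * For each hole (extent map with a disk_bytenr of EXTENT_MAP_HOLE) in the
 * range [start, start + len), set the EXTENT_DELALLOC_NEW bit, marking the
 * parts of the range that will increase the inode's number of bytes once
 * the respective ordered extents complete.
 */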
2674 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2675 const u64 start,
2676 const u64 len,
2677 struct extent_state **cached_state)
2678 {
2679 u64 search_start = start;
2680 const u64 end = start + len - 1;
2681
2682 while (search_start < end) {
2683 const u64 search_len = end - search_start + 1;
2684 struct extent_map *em;
2685 u64 em_len;
2686 int ret = 0;
2687
2688 em = btrfs_get_extent(inode, NULL, search_start, search_len);
2689 if (IS_ERR(em))
2690 return PTR_ERR(em);
2691
2692 if (em->disk_bytenr != EXTENT_MAP_HOLE)
2693 goto next;
2694
2695 em_len = em->len;
2696 if (em->start < search_start)
2697 em_len -= search_start - em->start;
2698 if (em_len > search_len)
2699 em_len = search_len;
2700
2701 ret = set_extent_bit(&inode->io_tree, search_start,
2702 search_start + em_len - 1,
2703 EXTENT_DELALLOC_NEW, cached_state);
2704 next:
2705 search_start = extent_map_end(em);
2706 free_extent_map(em);
2707 if (ret)
2708 return ret;
2709 }
2710 return 0;
2711 }
2712
2713 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2714 unsigned int extra_bits,
2715 struct extent_state **cached_state)
2716 {
2717 WARN_ON(PAGE_ALIGNED(end));
2718
2719 if (start >= i_size_read(&inode->vfs_inode) &&
2720 !(inode->flags & BTRFS_INODE_PREALLOC)) {
2721 /*
2722 * There can't be any extents following eof in this case so just
2723 * set the delalloc new bit for the range directly.
2724 */
2725 extra_bits |= EXTENT_DELALLOC_NEW;
2726 } else {
2727 int ret;
2728
2729 ret = btrfs_find_new_delalloc_bytes(inode, start,
2730 end + 1 - start,
2731 cached_state);
2732 if (ret)
2733 return ret;
2734 }
2735
2736 return set_extent_bit(&inode->io_tree, start, end,
2737 EXTENT_DELALLOC | extra_bits, cached_state);
2738 }
2739
2740 /* See btrfs_writepage_cow_fixup() for details on why this is required. */
2741 struct btrfs_writepage_fixup {
2742 struct folio *folio;
2743 struct btrfs_inode *inode;
2744 struct btrfs_work work;
2745 };
2746
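/*
 * Worker for btrfs_writepage_cow_fixup(). It reserves data and metadata
 * space, waits for any ordered extent covering the folio and then marks the
 * folio range as delalloc so that a later writepage can do proper COW.
 */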
2747 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2748 {
2749 struct btrfs_writepage_fixup *fixup =
2750 container_of(work, struct btrfs_writepage_fixup, work);
2751 struct btrfs_ordered_extent *ordered;
2752 struct extent_state *cached_state = NULL;
2753 struct extent_changeset *data_reserved = NULL;
2754 struct folio *folio = fixup->folio;
2755 struct btrfs_inode *inode = fixup->inode;
2756 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2757 u64 page_start = folio_pos(folio);
2758 u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
2759 int ret = 0;
2760 bool free_delalloc_space = true;
2761
2762 /*
2763 * This is similar to page_mkwrite, we need to reserve the space before
2764 * we take the folio lock.
2765 */
2766 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2767 folio_size(folio));
2768 again:
2769 folio_lock(folio);
2770
2771 /*
2772 * Before we queued this fixup, we took a reference on the folio.
2773 * folio->mapping may go NULL, but it shouldn't be moved to a different
2774 * address space.
2775 */
2776 if (!folio->mapping || !folio_test_dirty(folio) ||
2777 !folio_test_checked(folio)) {
2778 /*
2779 * Unfortunately this is a little tricky, either
2780 *
2781 * 1) We got here and our folio had already been dealt with and
2782 * we reserved our space, thus ret == 0, so we need to just
2783 * drop our space reservation and bail. This can happen the
2784 * first time we come into the fixup worker, or could happen
2785 * while waiting for the ordered extent.
2786 * 2) Our folio was already dealt with, but we happened to get an
2787 * ENOSPC above from the btrfs_delalloc_reserve_space. In
2788 * this case we obviously don't have anything to release, but
2789 * because the folio was already dealt with we don't want to
2790 * mark the folio with an error, so make sure we're resetting
2791 * ret to 0. This is why we have this check _before_ the ret
2792 * check, because we do not want to have a surprise ENOSPC
2793 * when the folio was already properly dealt with.
2794 */
2795 if (!ret) {
2796 btrfs_delalloc_release_extents(inode, folio_size(folio));
2797 btrfs_delalloc_release_space(inode, data_reserved,
2798 page_start, folio_size(folio),
2799 true);
2800 }
2801 ret = 0;
2802 goto out_page;
2803 }
2804
2805 /*
2806 * We can't mess with the folio state unless it is locked, so now that
2807 * it is locked bail if we failed to make our space reservation.
2808 */
2809 if (ret)
2810 goto out_page;
2811
2812 lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2813
2814 /* already ordered? We're done */
2815 if (folio_test_ordered(folio))
2816 goto out_reserved;
2817
2818 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2819 if (ordered) {
2820 unlock_extent(&inode->io_tree, page_start, page_end,
2821 &cached_state);
2822 folio_unlock(folio);
2823 btrfs_start_ordered_extent(ordered);
2824 btrfs_put_ordered_extent(ordered);
2825 goto again;
2826 }
2827
2828 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2829 &cached_state);
2830 if (ret)
2831 goto out_reserved;
2832
2833 /*
2834 * Everything went as planned, we're now the owner of a dirty page with
2835 * delayed allocation bits set and space reserved for our COW
2836 * destination.
2837 *
2838 * The page was dirty when we started, nothing should have cleaned it.
2839 */
2840 BUG_ON(!folio_test_dirty(folio));
2841 free_delalloc_space = false;
2842 out_reserved:
2843 btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2844 if (free_delalloc_space)
2845 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2846 PAGE_SIZE, true);
2847 unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2848 out_page:
2849 if (ret) {
2850 /*
2851 * We hit ENOSPC or other errors. Update the mapping and page
2852 * to reflect the errors and clean the page.
2853 */
2854 mapping_set_error(folio->mapping, ret);
2855 btrfs_mark_ordered_io_finished(inode, folio, page_start,
2856 folio_size(folio), !ret);
2857 folio_clear_dirty_for_io(folio);
2858 }
2859 btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
2860 folio_unlock(folio);
2861 folio_put(folio);
2862 kfree(fixup);
2863 extent_changeset_free(data_reserved);
2864 /*
2865 * As a precaution, do a delayed iput in case it would be the last iput
2866 * that could need flushing space. Recursing back to fixup worker would
2867 * deadlock.
2868 */
2869 btrfs_add_delayed_iput(inode);
2870 }
2871
2872 /*
2873 * There are a few paths in the higher layers of the kernel that directly
2874 * set the folio dirty bit without asking the filesystem if it is a
2875 * good idea. This causes problems because we want to make sure COW
2876 * properly happens and the data=ordered rules are followed.
2877 *
2878 * In our case any range that doesn't have the ORDERED bit set
2879 * hasn't been properly setup for IO. We kick off an async process
2880 * to fix it up. The async helper will wait for ordered extents, set
2881 * the delalloc bit and make it safe to write the folio.
2882 */
2883 int btrfs_writepage_cow_fixup(struct folio *folio)
2884 {
2885 struct inode *inode = folio->mapping->host;
2886 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2887 struct btrfs_writepage_fixup *fixup;
2888
2889 /* This folio has ordered extent covering it already */
2890 if (folio_test_ordered(folio))
2891 return 0;
2892
2893 /*
2894 * For experimental builds, we error out instead of returning EAGAIN.
2895 *
2896 * We should not hit such out-of-band dirty folios anymore.
2897 */
2898 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
2899 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
2900 btrfs_err_rl(fs_info,
2901 "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
2902 BTRFS_I(inode)->root->root_key.objectid,
2903 btrfs_ino(BTRFS_I(inode)),
2904 folio_pos(folio));
2905 return -EUCLEAN;
2906 }
2907
2908 /*
2909 * folio_checked is set below when we create a fixup worker for this
2910 * folio, don't try to create another one if we're already
2911 * folio_test_checked.
2912 *
2913 * The extent_io writepage code will redirty the folio if we send back
2914 * EAGAIN.
2915 */
2916 if (folio_test_checked(folio))
2917 return -EAGAIN;
2918
2919 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2920 if (!fixup)
2921 return -EAGAIN;
2922
2923 /*
2924 * We are already holding a reference to this inode from
2925 * write_cache_pages. We need to hold it because the space reservation
2926 * takes place outside of the folio lock, and we can't trust
2927 * folio->mapping outside of the folio lock.
2928 */
2929 ihold(inode);
2930 btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
2931 folio_get(folio);
2932 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
2933 fixup->folio = folio;
2934 fixup->inode = BTRFS_I(inode);
2935 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2936
2937 return -EAGAIN;
2938 }
2939
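/*
 * Insert a file extent item for a newly written extent at @file_pos,
 * dropping any existing file extent items in the covered range first, and
 * then account the new data extent via btrfs_alloc_reserved_file_extent().
 */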
2940 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2941 struct btrfs_inode *inode, u64 file_pos,
2942 struct btrfs_file_extent_item *stack_fi,
2943 const bool update_inode_bytes,
2944 u64 qgroup_reserved)
2945 {
2946 struct btrfs_root *root = inode->root;
2947 const u64 sectorsize = root->fs_info->sectorsize;
2948 struct btrfs_path *path;
2949 struct extent_buffer *leaf;
2950 struct btrfs_key ins;
2951 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2952 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2953 u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2954 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2955 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2956 struct btrfs_drop_extents_args drop_args = { 0 };
2957 int ret;
2958
2959 path = btrfs_alloc_path();
2960 if (!path)
2961 return -ENOMEM;
2962
2963 /*
2964 * we may be replacing one extent in the tree with another.
2965 * The new extent is pinned in the extent map, and we don't want
2966 * to drop it from the cache until it is completely in the btree.
2967 *
2968 * So, tell btrfs_drop_extents to leave this extent in the cache.
2969 * The caller is expected to unpin it and allow it to be merged
2970 * with the others.
2971 */
2972 drop_args.path = path;
2973 drop_args.start = file_pos;
2974 drop_args.end = file_pos + num_bytes;
2975 drop_args.replace_extent = true;
2976 drop_args.extent_item_size = sizeof(*stack_fi);
2977 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2978 if (ret)
2979 goto out;
2980
2981 if (!drop_args.extent_inserted) {
2982 ins.objectid = btrfs_ino(inode);
2983 ins.type = BTRFS_EXTENT_DATA_KEY;
2984 ins.offset = file_pos;
2985
2986 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2987 sizeof(*stack_fi));
2988 if (ret)
2989 goto out;
2990 }
2991 leaf = path->nodes[0];
2992 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2993 write_extent_buffer(leaf, stack_fi,
2994 btrfs_item_ptr_offset(leaf, path->slots[0]),
2995 sizeof(struct btrfs_file_extent_item));
2996
2997 btrfs_release_path(path);
2998
2999 /*
3000 * If we dropped an inline extent here, we know the range it covered was
3001 * not marked with the EXTENT_DELALLOC_NEW bit, so we update the number
3002 * of bytes only for that range containing the inline extent.
3003 * The remainder of the range will be processed when clearing the
3004 * EXTENT_DELALLOC bit through the ordered extent completion.
3005 */
3006 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
3007 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
3008
3009 inline_size = drop_args.bytes_found - inline_size;
3010 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
3011 drop_args.bytes_found -= inline_size;
3012 num_bytes -= sectorsize;
3013 }
3014
3015 if (update_inode_bytes)
3016 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
3017
3018 ins.objectid = disk_bytenr;
3019 ins.type = BTRFS_EXTENT_ITEM_KEY;
3020 ins.offset = disk_num_bytes;
3021
3022 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
3023 if (ret)
3024 goto out;
3025
3026 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
3027 file_pos - offset,
3028 qgroup_reserved, &ins);
3029 out:
3030 btrfs_free_path(path);
3031
3032 return ret;
3033 }
3034
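/*
 * Decrement the delalloc_bytes counter of the block group containing the
 * range [start, start + len), called once the ordered extent covering it
 * has been turned into a file extent item.
 */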
3035 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
3036 u64 start, u64 len)
3037 {
3038 struct btrfs_block_group *cache;
3039
3040 cache = btrfs_lookup_block_group(fs_info, start);
3041 ASSERT(cache);
3042
3043 spin_lock(&cache->lock);
3044 cache->delalloc_bytes -= len;
3045 spin_unlock(&cache->lock);
3046
3047 btrfs_put_block_group(cache);
3048 }
3049
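/*
 * Build a file extent item from the given ordered extent, honouring
 * truncation and compression, and insert it into the subvolume tree through
 * insert_reserved_file_extent().
 */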
3050 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
3051 struct btrfs_ordered_extent *oe)
3052 {
3053 struct btrfs_file_extent_item stack_fi;
3054 bool update_inode_bytes;
3055 u64 num_bytes = oe->num_bytes;
3056 u64 ram_bytes = oe->ram_bytes;
3057
3058 memset(&stack_fi, 0, sizeof(stack_fi));
3059 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
3060 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
3061 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
3062 oe->disk_num_bytes);
3063 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
3064 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
3065 num_bytes = oe->truncated_len;
3066 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
3067 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
3068 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
3069 /* Encryption and other encodings are reserved and all 0 */
3070
3071 /*
3072 * For delalloc, when completing an ordered extent we update the inode's
3073 * bytes when clearing the range in the inode's io tree, so pass false
3074 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3075 * except if the ordered extent was truncated.
3076 */
3077 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3078 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3079 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3080
3081 return insert_reserved_file_extent(trans, oe->inode,
3082 oe->file_offset, &stack_fi,
3083 update_inode_bytes, oe->qgroup_rsv);
3084 }
3085
3086 /*
3087 * As ordered data IO finishes, this gets called so we can finish
3088 * an ordered extent if the range of bytes in the file it covers are
3089 * fully written.
3090 */
3091 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3092 {
3093 struct btrfs_inode *inode = ordered_extent->inode;
3094 struct btrfs_root *root = inode->root;
3095 struct btrfs_fs_info *fs_info = root->fs_info;
3096 struct btrfs_trans_handle *trans = NULL;
3097 struct extent_io_tree *io_tree = &inode->io_tree;
3098 struct extent_state *cached_state = NULL;
3099 u64 start, end;
3100 int compress_type = 0;
3101 int ret = 0;
3102 u64 logical_len = ordered_extent->num_bytes;
3103 bool freespace_inode;
3104 bool truncated = false;
3105 bool clear_reserved_extent = true;
3106 unsigned int clear_bits = EXTENT_DEFRAG;
3107
3108 start = ordered_extent->file_offset;
3109 end = start + ordered_extent->num_bytes - 1;
3110
3111 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3112 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3113 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3114 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3115 clear_bits |= EXTENT_DELALLOC_NEW;
3116
3117 freespace_inode = btrfs_is_free_space_inode(inode);
3118 if (!freespace_inode)
3119 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3120
3121 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3122 ret = -EIO;
3123 goto out;
3124 }
3125
3126 if (btrfs_is_zoned(fs_info))
3127 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3128 ordered_extent->disk_num_bytes);
3129
3130 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3131 truncated = true;
3132 logical_len = ordered_extent->truncated_len;
3133 /* Truncated the entire extent, don't bother adding */
3134 if (!logical_len)
3135 goto out;
3136 }
3137
3138 /*
3139 * If it's a COW write we need to lock the extent range as we will be
3140 * inserting/replacing file extent items and unpinning an extent map.
3141 * This must be taken before joining a transaction, as it's a higher
3142 * level lock (like the inode's VFS lock), otherwise we can run into an
3143 * ABBA deadlock with other tasks (transactions work like a lock,
3144 * depending on their current state).
3145 */
3146 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3147 clear_bits |= EXTENT_LOCKED;
3148 lock_extent(io_tree, start, end, &cached_state);
3149 }
3150
3151 if (freespace_inode)
3152 trans = btrfs_join_transaction_spacecache(root);
3153 else
3154 trans = btrfs_join_transaction(root);
3155 if (IS_ERR(trans)) {
3156 ret = PTR_ERR(trans);
3157 trans = NULL;
3158 goto out;
3159 }
3160
3161 trans->block_rsv = &inode->block_rsv;
3162
3163 ret = btrfs_insert_raid_extent(trans, ordered_extent);
3164 if (ret) {
3165 btrfs_abort_transaction(trans, ret);
3166 goto out;
3167 }
3168
3169 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3170 /* Logic error */
3171 ASSERT(list_empty(&ordered_extent->list));
3172 if (!list_empty(&ordered_extent->list)) {
3173 ret = -EINVAL;
3174 btrfs_abort_transaction(trans, ret);
3175 goto out;
3176 }
3177
3178 btrfs_inode_safe_disk_i_size_write(inode, 0);
3179 ret = btrfs_update_inode_fallback(trans, inode);
3180 if (ret) {
3181 /* -ENOMEM or corruption */
3182 btrfs_abort_transaction(trans, ret);
3183 }
3184 goto out;
3185 }
3186
3187 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3188 compress_type = ordered_extent->compress_type;
3189 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3190 BUG_ON(compress_type);
3191 ret = btrfs_mark_extent_written(trans, inode,
3192 ordered_extent->file_offset,
3193 ordered_extent->file_offset +
3194 logical_len);
3195 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3196 ordered_extent->disk_num_bytes);
3197 } else {
3198 BUG_ON(root == fs_info->tree_root);
3199 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3200 if (!ret) {
3201 clear_reserved_extent = false;
3202 btrfs_release_delalloc_bytes(fs_info,
3203 ordered_extent->disk_bytenr,
3204 ordered_extent->disk_num_bytes);
3205 }
3206 }
3207 if (ret < 0) {
3208 btrfs_abort_transaction(trans, ret);
3209 goto out;
3210 }
3211
3212 ret = unpin_extent_cache(inode, ordered_extent->file_offset,
3213 ordered_extent->num_bytes, trans->transid);
3214 if (ret < 0) {
3215 btrfs_abort_transaction(trans, ret);
3216 goto out;
3217 }
3218
3219 ret = add_pending_csums(trans, &ordered_extent->list);
3220 if (ret) {
3221 btrfs_abort_transaction(trans, ret);
3222 goto out;
3223 }
3224
3225 /*
3226 * If this is a new delalloc range, clear its new delalloc flag to
3227 * update the inode's number of bytes. This needs to be done first
3228 * before updating the inode item.
3229 */
3230 if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3231 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3232 clear_extent_bit(&inode->io_tree, start, end,
3233 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3234 &cached_state);
3235
3236 btrfs_inode_safe_disk_i_size_write(inode, 0);
3237 ret = btrfs_update_inode_fallback(trans, inode);
3238 if (ret) { /* -ENOMEM or corruption */
3239 btrfs_abort_transaction(trans, ret);
3240 goto out;
3241 }
3242 out:
3243 clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3244 &cached_state);
3245
3246 if (trans)
3247 btrfs_end_transaction(trans);
3248
3249 if (ret || truncated) {
3250 u64 unwritten_start = start;
3251
3252 /*
3253 * If we failed to finish this ordered extent for any reason we
3254 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3255 * extent, and mark the inode with the error if it wasn't
3256 * already set. Any error during writeback would have already
3257 * set the mapping error, so we need to set it if we're the ones
3258 * marking this ordered extent as failed.
3259 */
3260 if (ret)
3261 btrfs_mark_ordered_extent_error(ordered_extent);
3262
3263 if (truncated)
3264 unwritten_start += logical_len;
3265 clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3266
3267 /*
3268 * Drop extent maps for the part of the extent we didn't write.
3269 *
3270 * We have an exception here for the free_space_inode. This is
3271 * because when we do btrfs_get_extent() on the free space inode
3272 * we will search the commit root. If this is a new block group
3273 * we won't find anything, and we will trip over the assert in
3274 * writepage where we do ASSERT(em->block_start !=
3275 * EXTENT_MAP_HOLE).
3276 *
3277 * Theoretically we could also skip this for any NOCOW extent as
3278 * we don't mess with the extent map tree in the NOCOW case, but
3279 * for now simply skip this if we are the free space inode.
3280 */
3281 if (!btrfs_is_free_space_inode(inode))
3282 btrfs_drop_extent_map_range(inode, unwritten_start,
3283 end, false);
3284
3285 /*
3286 * If the ordered extent had an IOERR or something else went
3287 * wrong we need to return the space for this ordered extent
3288 * back to the allocator. We only free the extent in the
3289 * truncated case if we didn't write out the extent at all.
3290 *
3291 * If we made it past insert_reserved_file_extent before we
3292 * errored out then we don't need to do this as the accounting
3293 * has already been done.
3294 */
3295 if ((ret || !logical_len) &&
3296 clear_reserved_extent &&
3297 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3298 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3299 /*
3300 * Discard the range before returning it back to the
3301 * free space pool
3302 */
3303 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3304 btrfs_discard_extent(fs_info,
3305 ordered_extent->disk_bytenr,
3306 ordered_extent->disk_num_bytes,
3307 NULL);
3308 btrfs_free_reserved_extent(fs_info,
3309 ordered_extent->disk_bytenr,
3310 ordered_extent->disk_num_bytes, 1);
3311 /*
3312 * Actually free the qgroup rsv which was released when
3313 * the ordered extent was created.
3314 */
3315 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
3316 ordered_extent->qgroup_rsv,
3317 BTRFS_QGROUP_RSV_DATA);
3318 }
3319 }
3320
3321 /*
3322 * This needs to be done to make sure anybody waiting knows we are done
3323 * updating everything for this ordered extent.
3324 */
3325 btrfs_remove_ordered_extent(inode, ordered_extent);
3326
3327 /* once for us */
3328 btrfs_put_ordered_extent(ordered_extent);
3329 /* once for the tree */
3330 btrfs_put_ordered_extent(ordered_extent);
3331
3332 return ret;
3333 }
3334
3335 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3336 {
3337 if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
3338 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3339 list_empty(&ordered->bioc_list))
3340 btrfs_finish_ordered_zoned(ordered);
3341 return btrfs_finish_one_ordered(ordered);
3342 }
3343
3344 /*
3345 * Verify the checksum for a single sector without any extra actions that depend
3346 * on the type of I/O.
3347 */
3348 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3349 u32 pgoff, u8 *csum, const u8 * const csum_expected)
3350 {
3351 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3352 char *kaddr;
3353
3354 ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3355
3356 shash->tfm = fs_info->csum_shash;
3357
3358 kaddr = kmap_local_page(page) + pgoff;
3359 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3360 kunmap_local(kaddr);
3361
3362 if (memcmp(csum, csum_expected, fs_info->csum_size))
3363 return -EIO;
3364 return 0;
3365 }
3366
3367 /*
3368 * Verify the checksum of a single data sector.
3369 *
3370 * @bbio: btrfs_io_bio which contains the csum
3371 * @dev: device the sector is on
3372 * @bio_offset: offset to the beginning of the bio (in bytes)
3373 * @bv: bio_vec to check
3374 *
3375 * Check if the checksum on a data block is valid. When a checksum mismatch is
3376 * detected, report the error and fill the corrupted range with zero.
3377 *
3378 * Return %true if the sector is ok or had no checksum to start with, else %false.
3379 */
3380 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3381 u32 bio_offset, struct bio_vec *bv)
3382 {
3383 struct btrfs_inode *inode = bbio->inode;
3384 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3385 u64 file_offset = bbio->file_offset + bio_offset;
3386 u64 end = file_offset + bv->bv_len - 1;
3387 u8 *csum_expected;
3388 u8 csum[BTRFS_CSUM_SIZE];
3389
3390 ASSERT(bv->bv_len == fs_info->sectorsize);
3391
3392 if (!bbio->csum)
3393 return true;
3394
3395 if (btrfs_is_data_reloc_root(inode->root) &&
3396 test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3397 NULL)) {
3398 /* Skip the range without csum for data reloc inode */
3399 clear_extent_bits(&inode->io_tree, file_offset, end,
3400 EXTENT_NODATASUM);
3401 return true;
3402 }
3403
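/*
 * bbio->csum stores one checksum per sector, so index it by this
 * sector's position within the bio.
 */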
3404 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3405 fs_info->csum_size;
3406 if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3407 csum_expected))
3408 goto zeroit;
3409 return true;
3410
3411 zeroit:
3412 btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3413 bbio->mirror_num);
3414 if (dev)
3415 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3416 memzero_bvec(bv);
3417 return false;
3418 }
3419
3420 /*
3421 * Perform a delayed iput on @inode.
3422 *
3423 * @inode: The inode we want to perform iput on
3424 *
3425 * This function uses the generic vfs_inode::i_count to track whether we should
3426 * just decrement it (in case it's > 1) or if this is the last iput then link
3427 * the inode to the delayed iput machinery. Delayed iputs are processed at
3428 * transaction commit time/superblock commit/cleaner kthread.
3429 */
3430 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3431 {
3432 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3433 unsigned long flags;
3434
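/*
 * Fast path: if this is not the last reference, just drop i_count and
 * return without queueing a delayed iput.
 */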
3435 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3436 return;
3437
3438 WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
3439 atomic_inc(&fs_info->nr_delayed_iputs);
3440 /*
3441 * Need to be irq safe here because we can be called from either an irq
3442 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3443 * context.
3444 */
3445 spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3446 ASSERT(list_empty(&inode->delayed_iput));
3447 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3448 spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3449 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3450 wake_up_process(fs_info->cleaner_kthread);
3451 }
3452
3453 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3454 struct btrfs_inode *inode)
3455 {
3456 list_del_init(&inode->delayed_iput);
3457 spin_unlock_irq(&fs_info->delayed_iput_lock);
3458 iput(&inode->vfs_inode);
3459 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3460 wake_up(&fs_info->delayed_iputs_wait);
3461 spin_lock_irq(&fs_info->delayed_iput_lock);
3462 }
3463
3464 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3465 struct btrfs_inode *inode)
3466 {
3467 if (!list_empty(&inode->delayed_iput)) {
3468 spin_lock_irq(&fs_info->delayed_iput_lock);
3469 if (!list_empty(&inode->delayed_iput))
3470 run_delayed_iput_locked(fs_info, inode);
3471 spin_unlock_irq(&fs_info->delayed_iput_lock);
3472 }
3473 }
3474
3475 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3476 {
3477 /*
3478 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3479 * calls btrfs_add_delayed_iput() and that needs to lock
3480 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3481 * prevent a deadlock.
3482 */
3483 spin_lock_irq(&fs_info->delayed_iput_lock);
3484 while (!list_empty(&fs_info->delayed_iputs)) {
3485 struct btrfs_inode *inode;
3486
3487 inode = list_first_entry(&fs_info->delayed_iputs,
3488 struct btrfs_inode, delayed_iput);
3489 run_delayed_iput_locked(fs_info, inode);
3490 if (need_resched()) {
3491 spin_unlock_irq(&fs_info->delayed_iput_lock);
3492 cond_resched();
3493 spin_lock_irq(&fs_info->delayed_iput_lock);
3494 }
3495 }
3496 spin_unlock_irq(&fs_info->delayed_iput_lock);
3497 }
3498
3499 /*
3500 * Wait for all delayed iputs to be flushed.
3501 *
3502 * @fs_info: the filesystem
3503 *
3504 * This waits, in killable mode, for any delayed iputs that are currently
3505 * running. Once they are all done we return, unless we are killed, in
3506 * which case we return EINTR. This helps user operations like fallocate
3507 * that might get blocked on the iputs.
3508 *
3509 * Return EINTR if we were killed, 0 if nothing's pending
3510 */
3511 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3512 {
3513 int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3514 atomic_read(&fs_info->nr_delayed_iputs) == 0);
3515 if (ret)
3516 return -EINTR;
3517 return 0;
3518 }
3519
3520 /*
3521 * This creates an orphan entry for the given inode in case something goes wrong
3522 * in the middle of an unlink.
3523 */
3524 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3525 struct btrfs_inode *inode)
3526 {
3527 int ret;
3528
3529 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3530 if (ret && ret != -EEXIST) {
3531 btrfs_abort_transaction(trans, ret);
3532 return ret;
3533 }
3534
3535 return 0;
3536 }
3537
3538 /*
3539 * We have done the delete so we can go ahead and remove the orphan item for
3540 * this particular inode.
3541 */
3542 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3543 struct btrfs_inode *inode)
3544 {
3545 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3546 }
3547
3548 /*
3549 * this cleans up any orphans that may be left on the list from the last use
3550 * of this root.
3551 */
3552 int btrfs_orphan_cleanup(struct btrfs_root *root)
3553 {
3554 struct btrfs_fs_info *fs_info = root->fs_info;
3555 struct btrfs_path *path;
3556 struct extent_buffer *leaf;
3557 struct btrfs_key key, found_key;
3558 struct btrfs_trans_handle *trans;
3559 u64 last_objectid = 0;
3560 int ret = 0, nr_unlink = 0;
3561
3562 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3563 return 0;
3564
3565 path = btrfs_alloc_path();
3566 if (!path) {
3567 ret = -ENOMEM;
3568 goto out;
3569 }
3570 path->reada = READA_BACK;
3571
3572 key.objectid = BTRFS_ORPHAN_OBJECTID;
3573 key.type = BTRFS_ORPHAN_ITEM_KEY;
3574 key.offset = (u64)-1;
3575
3576 while (1) {
3577 struct btrfs_inode *inode;
3578
3579 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3580 if (ret < 0)
3581 goto out;
3582
3583 /*
3584 * ret == 0 means we found what we were searching for, which is
3585 * weird, but possible, so only mess with the path if we didn't
3586 * find the key, and see if we have stuff that matches.
3587 */
3588 if (ret > 0) {
3589 ret = 0;
3590 if (path->slots[0] == 0)
3591 break;
3592 path->slots[0]--;
3593 }
3594
3595 /* pull out the item */
3596 leaf = path->nodes[0];
3597 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3598
3599 /* make sure the item matches what we want */
3600 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3601 break;
3602 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3603 break;
3604
3605 /* release the path since we're done with it */
3606 btrfs_release_path(path);
3607
3608 /*
3609 * this is where we are basically btrfs_lookup, without the
3610 * crossing root thing. we store the inode number in the
3611 * offset of the orphan item.
3612 */
3613
3614 if (found_key.offset == last_objectid) {
3615 /*
3616 * We found the same inode as before. This means we were
3617 * not able to remove its items via eviction triggered
3618 * by an iput(). A transaction abort may have happened,
3619 * due to -ENOSPC for example, so try to grab the error
3620 * that led to a transaction abort, if any.
3621 */
3622 btrfs_err(fs_info,
3623 "Error removing orphan entry, stopping orphan cleanup");
3624 ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3625 goto out;
3626 }
3627
3628 last_objectid = found_key.offset;
3629
3630 found_key.objectid = found_key.offset;
3631 found_key.type = BTRFS_INODE_ITEM_KEY;
3632 found_key.offset = 0;
3633 inode = btrfs_iget(last_objectid, root);
3634 if (IS_ERR(inode)) {
3635 ret = PTR_ERR(inode);
3636 inode = NULL;
3637 if (ret != -ENOENT)
3638 goto out;
3639 }
3640
3641 if (!inode && root == fs_info->tree_root) {
3642 struct btrfs_root *dead_root;
3643 int is_dead_root = 0;
3644
3645 /*
3646 * This is an orphan in the tree root. Currently these
3647 * could come from 2 sources:
3648 * a) a root (snapshot/subvolume) deletion in progress
3649 * b) a free space cache inode
3650 * We need to distinguish those two, as the orphan item
3651 * for a root must not get deleted before the deletion
3652 * of the snapshot/subvolume's tree completes.
3653 *
3654 * btrfs_find_orphan_roots() ran before us, which has
3655 * found all deleted roots and loaded them into
3656 * fs_info->fs_roots_radix. So here we can find if an
3657 * orphan item corresponds to a deleted root by looking
3658 * up the root from that radix tree.
3659 */
3660
3661 spin_lock(&fs_info->fs_roots_radix_lock);
3662 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3663 (unsigned long)found_key.objectid);
3664 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3665 is_dead_root = 1;
3666 spin_unlock(&fs_info->fs_roots_radix_lock);
3667
3668 if (is_dead_root) {
3669 /* prevent this orphan from being found again */
3670 key.offset = found_key.objectid - 1;
3671 continue;
3672 }
3673
3674 }
3675
3676 /*
3677 * If we have an inode with links, there are a couple of
3678 * possibilities:
3679 *
3680 * 1. We were halfway through creating fsverity metadata for the
3681 * file. In that case, the orphan item represents incomplete
3682 * fsverity metadata which must be cleaned up with
3683 * btrfs_drop_verity_items and deleting the orphan item.
3684 *
3685 * 2. Old kernels (before v3.12) used to create an
3686 * orphan item for truncate indicating that there were possibly
3687 * extent items past i_size that needed to be deleted. In v3.12,
3688 * truncate was changed to update i_size in sync with the extent
3689 * items, but the (useless) orphan item was still created. Since
3690 * v4.18, we don't create the orphan item for truncate at all.
3691 *
3692 * So, this item could mean that we need to do a truncate, but
3693 * only if this filesystem was last used on a pre-v3.12 kernel
3694 * and was not cleanly unmounted. The odds of that are quite
3695 * slim, and it's a pain to do the truncate now, so just delete
3696 * the orphan item.
3697 *
3698 * It's also possible that this orphan item was supposed to be
3699 * deleted but wasn't. The inode number may have been reused,
3700 * but either way, we can delete the orphan item.
3701 */
3702 if (!inode || inode->vfs_inode.i_nlink) {
3703 if (inode) {
3704 ret = btrfs_drop_verity_items(inode);
3705 iput(&inode->vfs_inode);
3706 inode = NULL;
3707 if (ret)
3708 goto out;
3709 }
3710 trans = btrfs_start_transaction(root, 1);
3711 if (IS_ERR(trans)) {
3712 ret = PTR_ERR(trans);
3713 goto out;
3714 }
3715 btrfs_debug(fs_info, "auto deleting %Lu",
3716 found_key.objectid);
3717 ret = btrfs_del_orphan_item(trans, root,
3718 found_key.objectid);
3719 btrfs_end_transaction(trans);
3720 if (ret)
3721 goto out;
3722 continue;
3723 }
3724
3725 nr_unlink++;
3726
3727 /* this will do delete_inode and everything for us */
3728 iput(&inode->vfs_inode);
3729 }
3730 /* release the path since we're done with it */
3731 btrfs_release_path(path);
3732
3733 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3734 trans = btrfs_join_transaction(root);
3735 if (!IS_ERR(trans))
3736 btrfs_end_transaction(trans);
3737 }
3738
3739 if (nr_unlink)
3740 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3741
3742 out:
3743 if (ret)
3744 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3745 btrfs_free_path(path);
3746 return ret;
3747 }
3748
3749 /*
3750 * very simple check to peek ahead in the leaf looking for xattrs. If we
3751 * don't find any xattrs, we know there can't be any acls.
3752 *
3753 * slot is the slot the inode is in, objectid is the objectid of the inode
3754 */
3755 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3756 int slot, u64 objectid,
3757 int *first_xattr_slot)
3758 {
3759 u32 nritems = btrfs_header_nritems(leaf);
3760 struct btrfs_key found_key;
3761 static u64 xattr_access = 0;
3762 static u64 xattr_default = 0;
3763 int scanned = 0;
3764
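/*
 * Xattr item key offsets are the hash of the xattr name, so compute the
 * hashes for the two POSIX ACL names once and cache them.
 */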
3765 if (!xattr_access) {
3766 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3767 strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3768 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3769 strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3770 }
3771
3772 slot++;
3773 *first_xattr_slot = -1;
3774 while (slot < nritems) {
3775 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3776
3777 /* we found a different objectid, there must not be acls */
3778 if (found_key.objectid != objectid)
3779 return 0;
3780
3781 /* we found an xattr, assume we've got an acl */
3782 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3783 if (*first_xattr_slot == -1)
3784 *first_xattr_slot = slot;
3785 if (found_key.offset == xattr_access ||
3786 found_key.offset == xattr_default)
3787 return 1;
3788 }
3789
3790 /*
3791 * we found a key greater than an xattr key, there can't
3792 * be any acls later on
3793 */
3794 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3795 return 0;
3796
3797 slot++;
3798 scanned++;
3799
3800 /*
3801 * The leaf layout goes inode item, inode backrefs, xattrs, extents, so
3802 * if there are a ton of hard links to an inode there can be a lot of
3803 * backrefs. Don't waste time searching too hard; this is just an
3804 * optimization.
3805 */
3806 if (scanned >= 8)
3807 break;
3808 }
3809 /*
3810 * We hit the end of the leaf before we found an xattr or something
3811 * larger than an xattr. We have to assume the inode has acls.
3812 */
3813 if (*first_xattr_slot == -1)
3814 *first_xattr_slot = slot;
3815 return 1;
3816 }
3817
3818 static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
3819 {
3820 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3821
3822 if (WARN_ON_ONCE(inode->file_extent_tree))
3823 return 0;
3824 if (btrfs_fs_incompat(fs_info, NO_HOLES))
3825 return 0;
3826 if (!S_ISREG(inode->vfs_inode.i_mode))
3827 return 0;
3828 if (btrfs_is_free_space_inode(inode))
3829 return 0;
3830
3831 inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
3832 if (!inode->file_extent_tree)
3833 return -ENOMEM;
3834
3835 extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
3836 /* Lockdep class is set only for the file extent tree. */
3837 lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
3838
3839 return 0;
3840 }
3841
3842 static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
3843 {
3844 struct btrfs_root *root = inode->root;
3845 struct btrfs_inode *existing;
3846 const u64 ino = btrfs_ino(inode);
3847 int ret;
3848
3849 if (inode_unhashed(&inode->vfs_inode))
3850 return 0;
3851
3852 if (prealloc) {
3853 ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
3854 if (ret)
3855 return ret;
3856 }
3857
3858 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
3859
3860 if (xa_is_err(existing)) {
3861 ret = xa_err(existing);
3862 ASSERT(ret != -EINVAL);
3863 ASSERT(ret != -ENOMEM);
3864 return ret;
3865 } else if (existing) {
3866 WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
3867 }
3868
3869 return 0;
3870 }
3871
3872 /*
3873 * Read a locked inode from the btree into the in-memory inode and add it to
3874 * its root list/tree.
3875 *
3876 * On failure clean up the inode.
3877 */
3878 static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
3879 {
3880 struct btrfs_root *root = inode->root;
3881 struct btrfs_fs_info *fs_info = root->fs_info;
3882 struct extent_buffer *leaf;
3883 struct btrfs_inode_item *inode_item;
3884 struct inode *vfs_inode = &inode->vfs_inode;
3885 struct btrfs_key location;
3886 unsigned long ptr;
3887 int maybe_acls;
3888 u32 rdev;
3889 int ret;
3890 bool filled = false;
3891 int first_xattr_slot;
3892
3893 ret = btrfs_init_file_extent_tree(inode);
3894 if (ret)
3895 goto out;
3896
3897 ret = btrfs_fill_inode(inode, &rdev);
3898 if (!ret)
3899 filled = true;
3900
3901 ASSERT(path);
3902
3903 btrfs_get_inode_key(inode, &location);
3904
3905 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3906 if (ret) {
3907 /*
3908 * ret > 0 can come from btrfs_search_slot called by
3909 * btrfs_lookup_inode(), this means the inode was not found.
3910 */
3911 if (ret > 0)
3912 ret = -ENOENT;
3913 goto out;
3914 }
3915
3916 leaf = path->nodes[0];
3917
3918 if (filled)
3919 goto cache_index;
3920
3921 inode_item = btrfs_item_ptr(leaf, path->slots[0],
3922 struct btrfs_inode_item);
3923 vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3924 set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
3925 i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
3926 i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
3927 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3928 btrfs_inode_set_file_extent_range(inode, 0,
3929 round_up(i_size_read(vfs_inode), fs_info->sectorsize));
3930
3931 inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
3932 btrfs_timespec_nsec(leaf, &inode_item->atime));
3933
3934 inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
3935 btrfs_timespec_nsec(leaf, &inode_item->mtime));
3936
3937 inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3938 btrfs_timespec_nsec(leaf, &inode_item->ctime));
3939
3940 inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
3941 inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
3942
3943 inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
3944 inode->generation = btrfs_inode_generation(leaf, inode_item);
3945 inode->last_trans = btrfs_inode_transid(leaf, inode_item);
3946
3947 inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
3948 vfs_inode->i_generation = inode->generation;
3949 vfs_inode->i_rdev = 0;
3950 rdev = btrfs_inode_rdev(leaf, inode_item);
3951
3952 if (S_ISDIR(vfs_inode->i_mode))
3953 inode->index_cnt = (u64)-1;
3954
3955 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3956 &inode->flags, &inode->ro_flags);
3957 btrfs_update_inode_mapping_flags(inode);
3958
3959 cache_index:
3960 /*
3961 * If we were modified in the current generation and evicted from memory
3962 * and then re-read we need to do a full sync since we don't have any
3963 * idea about which extents were modified before we were evicted from
3964 * cache.
3965 *
3966 * This is required for both inode re-read from disk and delayed inode
3967 * in the delayed_nodes xarray.
3968 */
3969 if (inode->last_trans == btrfs_get_fs_generation(fs_info))
3970 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
3971
3972 /*
3973 * We don't persist the id of the transaction where an unlink operation
3974 * against the inode was last made. So here we assume the inode might
3975 * have been evicted, and therefore the exact value of last_unlink_trans
3976 * lost, and set it to last_trans to avoid metadata inconsistencies
3977 * between the inode and its parent if the inode is fsync'ed and the log
3978 * replayed. For example, in the scenario:
3979 *
3980 * touch mydir/foo
3981 * ln mydir/foo mydir/bar
3982 * sync
3983 * unlink mydir/bar
3984 * echo 2 > /proc/sys/vm/drop_caches # evicts inode
3985 * xfs_io -c fsync mydir/foo
3986 * <power failure>
3987 * mount fs, triggers fsync log replay
3988 *
3989 * We must make sure that when we fsync our inode foo we also log its
3990 * parent inode, otherwise after log replay the parent still has the
3991 * dentry with the "bar" name but our inode foo has a link count of 1
3992 * and doesn't have an inode ref with the name "bar" anymore.
3993 *
3994 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3995 * but it guarantees correctness at the expense of occasional full
3996 * transaction commits on fsync if our inode is a directory, or if our
3997 * inode is not a directory, logging its parent unnecessarily.
3998 */
3999 inode->last_unlink_trans = inode->last_trans;
4000
4001 /*
4002 * Same logic as for last_unlink_trans. We don't persist the generation
4003 * of the last transaction where this inode was used for a reflink
4004 * operation, so after eviction and reloading the inode we must be
4005 * pessimistic and assume the last transaction that modified the inode.
4006 */
4007 inode->last_reflink_trans = inode->last_trans;
4008
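/*
 * Peek at the item right after the inode item: for an inode with a
 * single link this is its inode ref (or extref), which lets us cache
 * dir_index without a separate lookup.
 */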
4009 path->slots[0]++;
4010 if (vfs_inode->i_nlink != 1 ||
4011 path->slots[0] >= btrfs_header_nritems(leaf))
4012 goto cache_acl;
4013
4014 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
4015 if (location.objectid != btrfs_ino(inode))
4016 goto cache_acl;
4017
4018 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4019 if (location.type == BTRFS_INODE_REF_KEY) {
4020 struct btrfs_inode_ref *ref;
4021
4022 ref = (struct btrfs_inode_ref *)ptr;
4023 inode->dir_index = btrfs_inode_ref_index(leaf, ref);
4024 } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
4025 struct btrfs_inode_extref *extref;
4026
4027 extref = (struct btrfs_inode_extref *)ptr;
4028 inode->dir_index = btrfs_inode_extref_index(leaf, extref);
4029 }
4030 cache_acl:
4031 /*
4032 * try to precache a NULL acl entry for files that don't have
4033 * any xattrs or acls
4034 */
4035 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
4036 btrfs_ino(inode), &first_xattr_slot);
4037 if (first_xattr_slot != -1) {
4038 path->slots[0] = first_xattr_slot;
4039 ret = btrfs_load_inode_props(inode, path);
4040 if (ret)
4041 btrfs_err(fs_info,
4042 "error loading props for ino %llu (root %llu): %d",
4043 btrfs_ino(inode), btrfs_root_id(root), ret);
4044 }
4045
4046 if (!maybe_acls)
4047 cache_no_acl(vfs_inode);
4048
4049 switch (vfs_inode->i_mode & S_IFMT) {
4050 case S_IFREG:
4051 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4052 vfs_inode->i_fop = &btrfs_file_operations;
4053 vfs_inode->i_op = &btrfs_file_inode_operations;
4054 break;
4055 case S_IFDIR:
4056 vfs_inode->i_fop = &btrfs_dir_file_operations;
4057 vfs_inode->i_op = &btrfs_dir_inode_operations;
4058 break;
4059 case S_IFLNK:
4060 vfs_inode->i_op = &btrfs_symlink_inode_operations;
4061 inode_nohighmem(vfs_inode);
4062 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4063 break;
4064 default:
4065 vfs_inode->i_op = &btrfs_special_inode_operations;
4066 init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
4067 break;
4068 }
4069
4070 btrfs_sync_inode_flags_to_i_flags(inode);
4071
4072 ret = btrfs_add_inode_to_root(inode, true);
4073 if (ret)
4074 goto out;
4075
4076 return 0;
4077 out:
4078 iget_failed(vfs_inode);
4079 return ret;
4080 }
4081
4082 /*
4083 * given a leaf and an inode, copy the inode fields into the leaf
4084 */
4085 static void fill_inode_item(struct btrfs_trans_handle *trans,
4086 struct extent_buffer *leaf,
4087 struct btrfs_inode_item *item,
4088 struct inode *inode)
4089 {
4090 struct btrfs_map_token token;
4091 u64 flags;
4092
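/*
 * Use a map token so the long run of field updates below reuses the
 * cached extent buffer mapping instead of redoing it for every setter.
 */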
4093 btrfs_init_map_token(&token, leaf);
4094
4095 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4096 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4097 btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
4098 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4099 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4100
4101 btrfs_set_token_timespec_sec(&token, &item->atime,
4102 inode_get_atime_sec(inode));
4103 btrfs_set_token_timespec_nsec(&token, &item->atime,
4104 inode_get_atime_nsec(inode));
4105
4106 btrfs_set_token_timespec_sec(&token, &item->mtime,
4107 inode_get_mtime_sec(inode));
4108 btrfs_set_token_timespec_nsec(&token, &item->mtime,
4109 inode_get_mtime_nsec(inode));
4110
4111 btrfs_set_token_timespec_sec(&token, &item->ctime,
4112 inode_get_ctime_sec(inode));
4113 btrfs_set_token_timespec_nsec(&token, &item->ctime,
4114 inode_get_ctime_nsec(inode));
4115
4116 btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
4117 btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4118
4119 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
4120 btrfs_set_token_inode_generation(&token, item,
4121 BTRFS_I(inode)->generation);
4122 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4123 btrfs_set_token_inode_transid(&token, item, trans->transid);
4124 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4125 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4126 BTRFS_I(inode)->ro_flags);
4127 btrfs_set_token_inode_flags(&token, item, flags);
4128 btrfs_set_token_inode_block_group(&token, item, 0);
4129 }
4130
4131 /*
4132 * copy everything in the in-memory inode into the btree.
4133 */
4134 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4135 struct btrfs_inode *inode)
4136 {
4137 struct btrfs_inode_item *inode_item;
4138 struct btrfs_path *path;
4139 struct extent_buffer *leaf;
4140 struct btrfs_key key;
4141 int ret;
4142
4143 path = btrfs_alloc_path();
4144 if (!path)
4145 return -ENOMEM;
4146
4147 btrfs_get_inode_key(inode, &key);
4148 ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
4149 if (ret) {
4150 if (ret > 0)
4151 ret = -ENOENT;
4152 goto failed;
4153 }
4154
4155 leaf = path->nodes[0];
4156 inode_item = btrfs_item_ptr(leaf, path->slots[0],
4157 struct btrfs_inode_item);
4158
4159 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4160 btrfs_set_inode_last_trans(trans, inode);
4161 ret = 0;
4162 failed:
4163 btrfs_free_path(path);
4164 return ret;
4165 }
4166
4167 /*
4168 * copy everything in the in-memory inode into the btree.
4169 */
4170 int btrfs_update_inode(struct btrfs_trans_handle *trans,
4171 struct btrfs_inode *inode)
4172 {
4173 struct btrfs_root *root = inode->root;
4174 struct btrfs_fs_info *fs_info = root->fs_info;
4175 int ret;
4176
4177 /*
4178 * If the inode is a free space inode, we can deadlock during commit
4179 * if we put it into the delayed code.
4180 *
4181 * The data relocation inode should also be directly updated
4182 * without delay
4183 */
4184 if (!btrfs_is_free_space_inode(inode)
4185 && !btrfs_is_data_reloc_root(root)
4186 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4187 btrfs_update_root_times(trans, root);
4188
4189 ret = btrfs_delayed_update_inode(trans, inode);
4190 if (!ret)
4191 btrfs_set_inode_last_trans(trans, inode);
4192 return ret;
4193 }
4194
4195 return btrfs_update_inode_item(trans, inode);
4196 }
4197
4198 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4199 struct btrfs_inode *inode)
4200 {
4201 int ret;
4202
4203 ret = btrfs_update_inode(trans, inode);
4204 if (ret == -ENOSPC)
4205 return btrfs_update_inode_item(trans, inode);
4206 return ret;
4207 }
4208
4209 /*
4210 * unlink helper that gets used here in inode.c and in the tree logging
4211 * recovery code. It removes a link in a directory with a given name, and
4212 * also drops the back refs in the inode to the directory
4213 */
4214 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4215 struct btrfs_inode *dir,
4216 struct btrfs_inode *inode,
4217 const struct fscrypt_str *name,
4218 struct btrfs_rename_ctx *rename_ctx)
4219 {
4220 struct btrfs_root *root = dir->root;
4221 struct btrfs_fs_info *fs_info = root->fs_info;
4222 struct btrfs_path *path;
4223 int ret = 0;
4224 struct btrfs_dir_item *di;
4225 u64 index;
4226 u64 ino = btrfs_ino(inode);
4227 u64 dir_ino = btrfs_ino(dir);
4228
4229 path = btrfs_alloc_path();
4230 if (!path) {
4231 ret = -ENOMEM;
4232 goto out;
4233 }
4234
4235 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4236 if (IS_ERR_OR_NULL(di)) {
4237 ret = di ? PTR_ERR(di) : -ENOENT;
4238 goto err;
4239 }
4240 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4241 if (ret)
4242 goto err;
4243 btrfs_release_path(path);
4244
4245 /*
4246 * If we don't have the dir index cached, we have to get it by looking
4247 * up the inode ref. Since that lookup gives us the inode ref anyway,
4248 * remove it directly; there is no point in deferring its deletion.
4249 *
4250 * But if we do have the dir index cached, there is no need to search
4251 * for the inode ref. Since the inode ref is close to the inode item,
4252 * it is better to delay its deletion and do it together with the
4253 * inode item update.
4254 */
4255 if (inode->dir_index) {
4256 ret = btrfs_delayed_delete_inode_ref(inode);
4257 if (!ret) {
4258 index = inode->dir_index;
4259 goto skip_backref;
4260 }
4261 }
4262
4263 ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4264 if (ret) {
4265 btrfs_info(fs_info,
4266 "failed to delete reference to %.*s, inode %llu parent %llu",
4267 name->len, name->name, ino, dir_ino);
4268 btrfs_abort_transaction(trans, ret);
4269 goto err;
4270 }
4271 skip_backref:
4272 if (rename_ctx)
4273 rename_ctx->index = index;
4274
4275 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4276 if (ret) {
4277 btrfs_abort_transaction(trans, ret);
4278 goto err;
4279 }
4280
4281 /*
4282 * If we are in a rename context, we don't need to update anything in the
4283 * log. That will be done later during the rename by btrfs_log_new_name().
4284 * Besides that, doing it here would only cause extra unnecessary btree
4285 * operations on the log tree, increasing latency for applications.
4286 */
4287 if (!rename_ctx) {
4288 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4289 btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4290 }
4291
4292 /*
4293 * If we have a pending delayed iput we could end up with the final iput
4294 * being run in btrfs-cleaner context. If we have enough of these built
4295 * up we can end up burning a lot of time in btrfs-cleaner without any
4296 * way to throttle the unlinks. Since we're currently holding a ref on
4297 * the inode we can run the delayed iput here without any issues as the
4298 * final iput won't be done until after we drop the ref we're currently
4299 * holding.
4300 */
4301 btrfs_run_delayed_iput(fs_info, inode);
4302 err:
4303 btrfs_free_path(path);
4304 if (ret)
4305 goto out;
4306
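/*
 * A directory's i_size accounts each name twice: once for the dir item
 * and once for the dir index item, hence the "* 2".
 */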
4307 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4308 inode_inc_iversion(&inode->vfs_inode);
4309 inode_set_ctime_current(&inode->vfs_inode);
4310 inode_inc_iversion(&dir->vfs_inode);
4311 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4312 ret = btrfs_update_inode(trans, dir);
4313 out:
4314 return ret;
4315 }
4316
4317 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4318 struct btrfs_inode *dir, struct btrfs_inode *inode,
4319 const struct fscrypt_str *name)
4320 {
4321 int ret;
4322
4323 ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4324 if (!ret) {
4325 drop_nlink(&inode->vfs_inode);
4326 ret = btrfs_update_inode(trans, inode);
4327 }
4328 return ret;
4329 }
4330
4331 /*
4332 * helper to start transaction for unlink and rmdir.
4333 *
4334 * Unlink and rmdir are special in btrfs: they do not always free space, so
4335 * if we cannot make our reservations the normal way, try and see if there is
4336 * plenty of slack room in the global reserve to migrate; otherwise we cannot
4337 * allow the unlink to occur.
4338 */
4339 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4340 {
4341 struct btrfs_root *root = dir->root;
4342
4343 return btrfs_start_transaction_fallback_global_rsv(root,
4344 BTRFS_UNLINK_METADATA_UNITS);
4345 }
4346
4347 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4348 {
4349 struct btrfs_trans_handle *trans;
4350 struct inode *inode = d_inode(dentry);
4351 int ret;
4352 struct fscrypt_name fname;
4353
4354 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4355 if (ret)
4356 return ret;
4357
4358 /* This needs to handle no-key deletions later on */
4359
4360 trans = __unlink_start_trans(BTRFS_I(dir));
4361 if (IS_ERR(trans)) {
4362 ret = PTR_ERR(trans);
4363 goto fscrypt_free;
4364 }
4365
4366 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4367 false);
4368
4369 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4370 &fname.disk_name);
4371 if (ret)
4372 goto end_trans;
4373
4374 if (inode->i_nlink == 0) {
4375 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4376 if (ret)
4377 goto end_trans;
4378 }
4379
4380 end_trans:
4381 btrfs_end_transaction(trans);
4382 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4383 fscrypt_free:
4384 fscrypt_free_filename(&fname);
4385 return ret;
4386 }
4387
4388 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4389 struct btrfs_inode *dir, struct dentry *dentry)
4390 {
4391 struct btrfs_root *root = dir->root;
4392 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4393 struct btrfs_path *path;
4394 struct extent_buffer *leaf;
4395 struct btrfs_dir_item *di;
4396 struct btrfs_key key;
4397 u64 index;
4398 int ret;
4399 u64 objectid;
4400 u64 dir_ino = btrfs_ino(dir);
4401 struct fscrypt_name fname;
4402
4403 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4404 if (ret)
4405 return ret;
4406
4407 /* This needs to handle no-key deletions later on */
4408
4409 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4410 objectid = btrfs_root_id(inode->root);
4411 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4412 objectid = inode->ref_root_id;
4413 } else {
4414 WARN_ON(1);
4415 fscrypt_free_filename(&fname);
4416 return -EINVAL;
4417 }
4418
4419 path = btrfs_alloc_path();
4420 if (!path) {
4421 ret = -ENOMEM;
4422 goto out;
4423 }
4424
4425 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4426 &fname.disk_name, -1);
4427 if (IS_ERR_OR_NULL(di)) {
4428 ret = di ? PTR_ERR(di) : -ENOENT;
4429 goto out;
4430 }
4431
4432 leaf = path->nodes[0];
4433 btrfs_dir_item_key_to_cpu(leaf, di, &key);
4434 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4435 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4436 if (ret) {
4437 btrfs_abort_transaction(trans, ret);
4438 goto out;
4439 }
4440 btrfs_release_path(path);
4441
4442 /*
4443 * This is a placeholder inode for a subvolume we didn't have a
4444 * reference to at the time of the snapshot creation. In the meantime
4445 * we could have renamed the real subvol link into our snapshot, so
4446 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4447 * Instead simply lookup the dir_index_item for this entry so we can
4448 * remove it. Otherwise we know we have a ref to the root and we can
4449 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4450 */
4451 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4452 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4453 if (IS_ERR(di)) {
4454 ret = PTR_ERR(di);
4455 btrfs_abort_transaction(trans, ret);
4456 goto out;
4457 }
4458
4459 leaf = path->nodes[0];
4460 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4461 index = key.offset;
4462 btrfs_release_path(path);
4463 } else {
4464 ret = btrfs_del_root_ref(trans, objectid,
4465 btrfs_root_id(root), dir_ino,
4466 &index, &fname.disk_name);
4467 if (ret) {
4468 btrfs_abort_transaction(trans, ret);
4469 goto out;
4470 }
4471 }
4472
4473 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4474 if (ret) {
4475 btrfs_abort_transaction(trans, ret);
4476 goto out;
4477 }
4478
4479 btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4480 inode_inc_iversion(&dir->vfs_inode);
4481 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4482 ret = btrfs_update_inode_fallback(trans, dir);
4483 if (ret)
4484 btrfs_abort_transaction(trans, ret);
4485 out:
4486 btrfs_free_path(path);
4487 fscrypt_free_filename(&fname);
4488 return ret;
4489 }
4490
4491 /*
4492 * Helper to check if the subvolume references other subvolumes or if it's
4493 * default.
4494 */
4495 static noinline int may_destroy_subvol(struct btrfs_root *root)
4496 {
4497 struct btrfs_fs_info *fs_info = root->fs_info;
4498 struct btrfs_path *path;
4499 struct btrfs_dir_item *di;
4500 struct btrfs_key key;
4501 struct fscrypt_str name = FSTR_INIT("default", 7);
4502 u64 dir_id;
4503 int ret;
4504
4505 path = btrfs_alloc_path();
4506 if (!path)
4507 return -ENOMEM;
4508
4509 /* Make sure this root isn't set as the default subvol */
4510 dir_id = btrfs_super_root_dir(fs_info->super_copy);
4511 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4512 dir_id, &name, 0);
4513 if (di && !IS_ERR(di)) {
4514 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4515 if (key.objectid == btrfs_root_id(root)) {
4516 ret = -EPERM;
4517 btrfs_err(fs_info,
4518 "deleting default subvolume %llu is not allowed",
4519 key.objectid);
4520 goto out;
4521 }
4522 btrfs_release_path(path);
4523 }
4524
4525 key.objectid = btrfs_root_id(root);
4526 key.type = BTRFS_ROOT_REF_KEY;
4527 key.offset = (u64)-1;
4528
4529 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4530 if (ret < 0)
4531 goto out;
4532 if (ret == 0) {
4533 /*
4534 * Key with offset -1 found, there would have to exist a root
4535 * with such id, but this is out of valid range.
4536 */
4537 ret = -EUCLEAN;
4538 goto out;
4539 }
4540
4541 ret = 0;
4542 if (path->slots[0] > 0) {
4543 path->slots[0]--;
4544 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4545 if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
4546 ret = -ENOTEMPTY;
4547 }
4548 out:
4549 btrfs_free_path(path);
4550 return ret;
4551 }
4552
4553 /* Delete all dentries for inodes belonging to the root */
4554 static void btrfs_prune_dentries(struct btrfs_root *root)
4555 {
4556 struct btrfs_fs_info *fs_info = root->fs_info;
4557 struct btrfs_inode *inode;
4558 u64 min_ino = 0;
4559
4560 if (!BTRFS_FS_ERROR(fs_info))
4561 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4562
4563 inode = btrfs_find_first_inode(root, min_ino);
4564 while (inode) {
4565 if (atomic_read(&inode->vfs_inode.i_count) > 1)
4566 d_prune_aliases(&inode->vfs_inode);
4567
4568 min_ino = btrfs_ino(inode) + 1;
4569 /*
4570 * btrfs_drop_inode() will have it removed from the inode
4571 * cache when its usage count hits zero.
4572 */
4573 iput(&inode->vfs_inode);
4574 cond_resched();
4575 inode = btrfs_find_first_inode(root, min_ino);
4576 }
4577 }
4578
4579 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4580 {
4581 struct btrfs_root *root = dir->root;
4582 struct btrfs_fs_info *fs_info = root->fs_info;
4583 struct inode *inode = d_inode(dentry);
4584 struct btrfs_root *dest = BTRFS_I(inode)->root;
4585 struct btrfs_trans_handle *trans;
4586 struct btrfs_block_rsv block_rsv;
4587 u64 root_flags;
4588 u64 qgroup_reserved = 0;
4589 int ret;
4590
4591 down_write(&fs_info->subvol_sem);
4592
4593 /*
4594 * Don't allow deleting a subvolume with send in progress. This is
4595 * inside the inode lock so the error handling that has to drop the bit
4596 * again is not run concurrently.
4597 */
4598 spin_lock(&dest->root_item_lock);
4599 if (dest->send_in_progress) {
4600 spin_unlock(&dest->root_item_lock);
4601 btrfs_warn(fs_info,
4602 "attempt to delete subvolume %llu during send",
4603 btrfs_root_id(dest));
4604 ret = -EPERM;
4605 goto out_up_write;
4606 }
4607 if (atomic_read(&dest->nr_swapfiles)) {
4608 spin_unlock(&dest->root_item_lock);
4609 btrfs_warn(fs_info,
4610 "attempt to delete subvolume %llu with active swapfile",
4611 btrfs_root_id(root));
4612 ret = -EPERM;
4613 goto out_up_write;
4614 }
4615 root_flags = btrfs_root_flags(&dest->root_item);
4616 btrfs_set_root_flags(&dest->root_item,
4617 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4618 spin_unlock(&dest->root_item_lock);
4619
4620 ret = may_destroy_subvol(dest);
4621 if (ret)
4622 goto out_undead;
4623
4624 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4625 /*
4626 * One for dir inode,
4627 * two for dir entries,
4628 * two for root ref/backref.
4629 */
4630 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4631 if (ret)
4632 goto out_undead;
4633 qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4634
4635 trans = btrfs_start_transaction(root, 0);
4636 if (IS_ERR(trans)) {
4637 ret = PTR_ERR(trans);
4638 goto out_release;
4639 }
4640 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4641 qgroup_reserved = 0;
4642 trans->block_rsv = &block_rsv;
4643 trans->bytes_reserved = block_rsv.size;
4644
4645 btrfs_record_snapshot_destroy(trans, dir);
4646
4647 ret = btrfs_unlink_subvol(trans, dir, dentry);
4648 if (ret) {
4649 btrfs_abort_transaction(trans, ret);
4650 goto out_end_trans;
4651 }
4652
4653 ret = btrfs_record_root_in_trans(trans, dest);
4654 if (ret) {
4655 btrfs_abort_transaction(trans, ret);
4656 goto out_end_trans;
4657 }
4658
4659 memset(&dest->root_item.drop_progress, 0,
4660 sizeof(dest->root_item.drop_progress));
4661 btrfs_set_root_drop_level(&dest->root_item, 0);
4662 btrfs_set_root_refs(&dest->root_item, 0);
4663
4664 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4665 ret = btrfs_insert_orphan_item(trans,
4666 fs_info->tree_root,
4667 btrfs_root_id(dest));
4668 if (ret) {
4669 btrfs_abort_transaction(trans, ret);
4670 goto out_end_trans;
4671 }
4672 }
4673
4674 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4675 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
4676 if (ret && ret != -ENOENT) {
4677 btrfs_abort_transaction(trans, ret);
4678 goto out_end_trans;
4679 }
4680 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4681 ret = btrfs_uuid_tree_remove(trans,
4682 dest->root_item.received_uuid,
4683 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4684 btrfs_root_id(dest));
4685 if (ret && ret != -ENOENT) {
4686 btrfs_abort_transaction(trans, ret);
4687 goto out_end_trans;
4688 }
4689 }
4690
4691 free_anon_bdev(dest->anon_dev);
4692 dest->anon_dev = 0;
4693 out_end_trans:
4694 trans->block_rsv = NULL;
4695 trans->bytes_reserved = 0;
4696 ret = btrfs_end_transaction(trans);
4697 inode->i_flags |= S_DEAD;
4698 out_release:
4699 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4700 if (qgroup_reserved)
4701 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4702 out_undead:
4703 if (ret) {
4704 spin_lock(&dest->root_item_lock);
4705 root_flags = btrfs_root_flags(&dest->root_item);
4706 btrfs_set_root_flags(&dest->root_item,
4707 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4708 spin_unlock(&dest->root_item_lock);
4709 }
4710 out_up_write:
4711 up_write(&fs_info->subvol_sem);
4712 if (!ret) {
4713 d_invalidate(dentry);
4714 btrfs_prune_dentries(dest);
4715 ASSERT(dest->send_in_progress == 0);
4716 }
4717
4718 return ret;
4719 }
4720
4721 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4722 {
4723 struct inode *inode = d_inode(dentry);
4724 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4725 int ret = 0;
4726 struct btrfs_trans_handle *trans;
4727 u64 last_unlink_trans;
4728 struct fscrypt_name fname;
4729
4730 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4731 return -ENOTEMPTY;
4732 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4733 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4734 btrfs_err(fs_info,
4735 "extent tree v2 doesn't support snapshot deletion yet");
4736 return -EOPNOTSUPP;
4737 }
4738 return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4739 }
4740
4741 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4742 if (ret)
4743 return ret;
4744
4745 /* This needs to handle no-key deletions later on */
4746
4747 trans = __unlink_start_trans(BTRFS_I(dir));
4748 if (IS_ERR(trans)) {
4749 ret = PTR_ERR(trans);
4750 goto out_notrans;
4751 }
4752
4753 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4754 ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4755 goto out;
4756 }
4757
4758 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4759 if (ret)
4760 goto out;
4761
4762 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4763
4764 /* now the directory is empty */
4765 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4766 &fname.disk_name);
4767 if (!ret) {
4768 btrfs_i_size_write(BTRFS_I(inode), 0);
4769 /*
4770 * Propagate the last_unlink_trans value of the deleted dir to
4771 * its parent directory. This is to prevent an unrecoverable
4772 * log tree in the case we do something like this:
4773 * 1) create dir foo
4774 * 2) create snapshot under dir foo
4775 * 3) delete the snapshot
4776 * 4) rmdir foo
4777 * 5) mkdir foo
4778 * 6) fsync foo or some file inside foo
4779 */
4780 if (last_unlink_trans >= trans->transid)
4781 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4782 }
4783 out:
4784 btrfs_end_transaction(trans);
4785 out_notrans:
4786 btrfs_btree_balance_dirty(fs_info);
4787 fscrypt_free_filename(&fname);
4788
4789 return ret;
4790 }
4791
4792 /*
4793 * Read, zero a chunk and write a block.
4794 *
4795 * @inode - inode that we're zeroing
4796 * @from - the offset to start zeroing
4797 * @len - the length to zero, 0 to zero the entire range relative to the
4798 * offset
4799 * @front - zero up to the offset instead of from the offset on
4800 *
4801 * This will find the block for the "from" offset, COW the block and zero the
4802 * part we want to zero. This is used with truncate and hole punching.
4803 */
4804 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4805 int front)
4806 {
4807 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4808 struct address_space *mapping = inode->vfs_inode.i_mapping;
4809 struct extent_io_tree *io_tree = &inode->io_tree;
4810 struct btrfs_ordered_extent *ordered;
4811 struct extent_state *cached_state = NULL;
4812 struct extent_changeset *data_reserved = NULL;
4813 bool only_release_metadata = false;
4814 u32 blocksize = fs_info->sectorsize;
4815 pgoff_t index = from >> PAGE_SHIFT;
4816 unsigned offset = from & (blocksize - 1);
4817 struct folio *folio;
4818 gfp_t mask = btrfs_alloc_write_mask(mapping);
4819 size_t write_bytes = blocksize;
4820 int ret = 0;
4821 u64 block_start;
4822 u64 block_end;
4823
4824 if (IS_ALIGNED(offset, blocksize) &&
4825 (!len || IS_ALIGNED(len, blocksize)))
4826 goto out;
4827
4828 block_start = round_down(from, blocksize);
4829 block_end = block_start + blocksize - 1;
4830
4831 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4832 blocksize, false);
4833 if (ret < 0) {
4834 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4835 /* For nocow case, no need to reserve data space */
4836 only_release_metadata = true;
4837 } else {
4838 goto out;
4839 }
4840 }
4841 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4842 if (ret < 0) {
4843 if (!only_release_metadata)
4844 btrfs_free_reserved_data_space(inode, data_reserved,
4845 block_start, blocksize);
4846 goto out;
4847 }
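/*
 * We may come back here if the folio got invalidated under us or if an
 * ordered extent still covers this block and we had to wait for it.
 */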
4848 again:
4849 folio = __filemap_get_folio(mapping, index,
4850 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
4851 if (IS_ERR(folio)) {
4852 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4853 blocksize, true);
4854 btrfs_delalloc_release_extents(inode, blocksize);
4855 ret = -ENOMEM;
4856 goto out;
4857 }
4858
4859 if (!folio_test_uptodate(folio)) {
4860 ret = btrfs_read_folio(NULL, folio);
4861 folio_lock(folio);
4862 if (folio->mapping != mapping) {
4863 folio_unlock(folio);
4864 folio_put(folio);
4865 goto again;
4866 }
4867 if (!folio_test_uptodate(folio)) {
4868 ret = -EIO;
4869 goto out_unlock;
4870 }
4871 }
4872
4873 /*
4874 * We unlock the page after the io is completed and then re-lock it
4875 * above. release_folio() could have come in between that and cleared
4876 * folio private, but left the page in the mapping. Set the page mapped
4877 * here to make sure it's properly set for the subpage stuff.
4878 */
4879 ret = set_folio_extent_mapped(folio);
4880 if (ret < 0)
4881 goto out_unlock;
4882
4883 folio_wait_writeback(folio);
4884
4885 lock_extent(io_tree, block_start, block_end, &cached_state);
4886
4887 ordered = btrfs_lookup_ordered_extent(inode, block_start);
4888 if (ordered) {
4889 unlock_extent(io_tree, block_start, block_end, &cached_state);
4890 folio_unlock(folio);
4891 folio_put(folio);
4892 btrfs_start_ordered_extent(ordered);
4893 btrfs_put_ordered_extent(ordered);
4894 goto again;
4895 }
4896
4897 clear_extent_bit(&inode->io_tree, block_start, block_end,
4898 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4899 &cached_state);
4900
4901 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4902 &cached_state);
4903 if (ret) {
4904 unlock_extent(io_tree, block_start, block_end, &cached_state);
4905 goto out_unlock;
4906 }
4907
4908 if (offset != blocksize) {
4909 if (!len)
4910 len = blocksize - offset;
4911 if (front)
4912 folio_zero_range(folio, block_start - folio_pos(folio),
4913 offset);
4914 else
4915 folio_zero_range(folio,
4916 (block_start - folio_pos(folio)) + offset,
4917 len);
4918 }
4919 btrfs_folio_clear_checked(fs_info, folio, block_start,
4920 block_end + 1 - block_start);
4921 btrfs_folio_set_dirty(fs_info, folio, block_start,
4922 block_end + 1 - block_start);
4923 unlock_extent(io_tree, block_start, block_end, &cached_state);
4924
4925 if (only_release_metadata)
4926 set_extent_bit(&inode->io_tree, block_start, block_end,
4927 EXTENT_NORESERVE, NULL);
4928
4929 out_unlock:
4930 if (ret) {
4931 if (only_release_metadata)
4932 btrfs_delalloc_release_metadata(inode, blocksize, true);
4933 else
4934 btrfs_delalloc_release_space(inode, data_reserved,
4935 block_start, blocksize, true);
4936 }
4937 btrfs_delalloc_release_extents(inode, blocksize);
4938 folio_unlock(folio);
4939 folio_put(folio);
4940 out:
4941 if (only_release_metadata)
4942 btrfs_check_nocow_unlock(inode);
4943 extent_changeset_free(data_reserved);
4944 return ret;
4945 }
4946
4947 static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
4948 {
4949 struct btrfs_root *root = inode->root;
4950 struct btrfs_fs_info *fs_info = root->fs_info;
4951 struct btrfs_trans_handle *trans;
4952 struct btrfs_drop_extents_args drop_args = { 0 };
4953 int ret;
4954
4955 /*
4956 * If NO_HOLES is enabled, we don't need to do anything.
4957 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4958 * or btrfs_update_inode() will be called, which guarantees that the next
4959 * fsync will know this inode was changed and needs to be logged.
4960 */
4961 if (btrfs_fs_incompat(fs_info, NO_HOLES))
4962 return 0;
4963
4964 /*
4965 * 1 - for the one we're dropping
4966 * 1 - for the one we're adding
4967 * 1 - for updating the inode.
4968 */
4969 trans = btrfs_start_transaction(root, 3);
4970 if (IS_ERR(trans))
4971 return PTR_ERR(trans);
4972
4973 drop_args.start = offset;
4974 drop_args.end = offset + len;
4975 drop_args.drop_cache = true;
4976
4977 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4978 if (ret) {
4979 btrfs_abort_transaction(trans, ret);
4980 btrfs_end_transaction(trans);
4981 return ret;
4982 }
4983
4984 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4985 if (ret) {
4986 btrfs_abort_transaction(trans, ret);
4987 } else {
4988 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4989 btrfs_update_inode(trans, inode);
4990 }
4991 btrfs_end_transaction(trans);
4992 return ret;
4993 }
4994
4995 /*
4996 * This function puts in dummy file extents for the area we're creating a hole
4997 * for. So if we are truncating this file to a larger size we need to insert
4998 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4999 * for the range between oldsize and size.
5000 */
5001 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
5002 {
5003 struct btrfs_root *root = inode->root;
5004 struct btrfs_fs_info *fs_info = root->fs_info;
5005 struct extent_io_tree *io_tree = &inode->io_tree;
5006 struct extent_map *em = NULL;
5007 struct extent_state *cached_state = NULL;
5008 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
5009 u64 block_end = ALIGN(size, fs_info->sectorsize);
5010 u64 last_byte;
5011 u64 cur_offset;
5012 u64 hole_size;
5013 int ret = 0;
5014
5015 /*
5016 * If our size started in the middle of a block we need to zero out the
5017 * rest of the block before we expand the i_size, otherwise we could
5018 * expose stale data.
5019 */
5020 ret = btrfs_truncate_block(inode, oldsize, 0, 0);
5021 if (ret)
5022 return ret;
5023
5024 if (size <= hole_start)
5025 return 0;
5026
5027 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
5028 &cached_state);
5029 cur_offset = hole_start;
5030 while (1) {
5031 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
5032 if (IS_ERR(em)) {
5033 ret = PTR_ERR(em);
5034 em = NULL;
5035 break;
5036 }
5037 last_byte = min(extent_map_end(em), block_end);
5038 last_byte = ALIGN(last_byte, fs_info->sectorsize);
5039 hole_size = last_byte - cur_offset;
5040
5041 if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
5042 struct extent_map *hole_em;
5043
5044 ret = maybe_insert_hole(inode, cur_offset, hole_size);
5045 if (ret)
5046 break;
5047
5048 ret = btrfs_inode_set_file_extent_range(inode,
5049 cur_offset, hole_size);
5050 if (ret)
5051 break;
5052
5053 hole_em = alloc_extent_map();
5054 if (!hole_em) {
5055 btrfs_drop_extent_map_range(inode, cur_offset,
5056 cur_offset + hole_size - 1,
5057 false);
5058 btrfs_set_inode_full_sync(inode);
5059 goto next;
5060 }
5061 hole_em->start = cur_offset;
5062 hole_em->len = hole_size;
5063
5064 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
5065 hole_em->disk_num_bytes = 0;
5066 hole_em->ram_bytes = hole_size;
5067 hole_em->generation = btrfs_get_fs_generation(fs_info);
5068
5069 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
5070 free_extent_map(hole_em);
5071 } else {
5072 ret = btrfs_inode_set_file_extent_range(inode,
5073 cur_offset, hole_size);
5074 if (ret)
5075 break;
5076 }
5077 next:
5078 free_extent_map(em);
5079 em = NULL;
5080 cur_offset = last_byte;
5081 if (cur_offset >= block_end)
5082 break;
5083 }
5084 free_extent_map(em);
5085 unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5086 return ret;
5087 }
5088
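/*
 * Apply a new i_size coming from setattr: an expanding truncate fills the new
 * range with hole extents via btrfs_cont_expand(), a shrinking one goes
 * through btrfs_truncate().
 */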
5089 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5090 {
5091 struct btrfs_root *root = BTRFS_I(inode)->root;
5092 struct btrfs_trans_handle *trans;
5093 loff_t oldsize = i_size_read(inode);
5094 loff_t newsize = attr->ia_size;
5095 int mask = attr->ia_valid;
5096 int ret;
5097
5098 /*
5099 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5100 * special case where we need to update the times despite not having
5101 * these flags set. For all other operations the VFS sets these flags
5102 * explicitly if it wants a timestamp update.
5103 */
5104 if (newsize != oldsize) {
5105 inode_inc_iversion(inode);
5106 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5107 inode_set_mtime_to_ts(inode,
5108 inode_set_ctime_current(inode));
5109 }
5110 }
5111
5112 if (newsize > oldsize) {
5113 /*
5114 * Don't do an expanding truncate while snapshotting is ongoing.
5115 * This is to ensure the snapshot captures a fully consistent
5116 * state of this file - if the snapshot captures this expanding
5117 * truncation, it must capture all writes that happened before
5118 * this truncation.
5119 */
5120 btrfs_drew_write_lock(&root->snapshot_lock);
5121 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5122 if (ret) {
5123 btrfs_drew_write_unlock(&root->snapshot_lock);
5124 return ret;
5125 }
5126
5127 trans = btrfs_start_transaction(root, 1);
5128 if (IS_ERR(trans)) {
5129 btrfs_drew_write_unlock(&root->snapshot_lock);
5130 return PTR_ERR(trans);
5131 }
5132
5133 i_size_write(inode, newsize);
5134 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5135 pagecache_isize_extended(inode, oldsize, newsize);
5136 ret = btrfs_update_inode(trans, BTRFS_I(inode));
5137 btrfs_drew_write_unlock(&root->snapshot_lock);
5138 btrfs_end_transaction(trans);
5139 } else {
5140 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
5141
5142 if (btrfs_is_zoned(fs_info)) {
5143 ret = btrfs_wait_ordered_range(BTRFS_I(inode),
5144 ALIGN(newsize, fs_info->sectorsize),
5145 (u64)-1);
5146 if (ret)
5147 return ret;
5148 }
5149
5150 /*
5151 * We're truncating a file that used to have good data down to
5152 * zero. Make sure any new writes to the file get on disk
5153 * on close.
5154 */
5155 if (newsize == 0)
5156 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5157 &BTRFS_I(inode)->runtime_flags);
5158
5159 truncate_setsize(inode, newsize);
5160
5161 inode_dio_wait(inode);
5162
5163 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5164 if (ret && inode->i_nlink) {
5165 int err;
5166
5167 /*
5168 * Truncate failed, so fix up the in-memory size. We
5169 * adjusted disk_i_size down as we removed extents, so
5170 * wait for disk_i_size to be stable and then update the
5171 * in-memory size to match.
5172 */
5173 err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
5174 if (err)
5175 return err;
5176 i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5177 }
5178 }
5179
5180 return ret;
5181 }
5182
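/*
 * Inode operation backing setattr(): validate the change with
 * setattr_prepare(), handle size changes via btrfs_setsize() and persist the
 * remaining attribute updates.
 */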
5183 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5184 struct iattr *attr)
5185 {
5186 struct inode *inode = d_inode(dentry);
5187 struct btrfs_root *root = BTRFS_I(inode)->root;
5188 int err;
5189
5190 if (btrfs_root_readonly(root))
5191 return -EROFS;
5192
5193 err = setattr_prepare(idmap, dentry, attr);
5194 if (err)
5195 return err;
5196
5197 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5198 err = btrfs_setsize(inode, attr);
5199 if (err)
5200 return err;
5201 }
5202
5203 if (attr->ia_valid) {
5204 setattr_copy(idmap, inode, attr);
5205 inode_inc_iversion(inode);
5206 err = btrfs_dirty_inode(BTRFS_I(inode));
5207
5208 if (!err && attr->ia_valid & ATTR_MODE)
5209 err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5210 }
5211
5212 return err;
5213 }
5214
5215 /*
5216 * While truncating the inode pages during eviction, we get the VFS
5217 * calling btrfs_invalidate_folio() against each folio of the inode. This
5218 * is slow because the calls to btrfs_invalidate_folio() result in a
5219 * huge amount of calls to lock_extent() and clear_extent_bit(),
5220 * which keep merging and splitting extent_state structures over and over,
5221 * wasting lots of time.
5222 *
5223 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5224 * skip all those expensive operations on a per folio basis and do only
5225 * the ordered io finishing, while we release here the extent_map and
5226 * extent_state structures, without the excessive merging and splitting.
5227 */
5228 static void evict_inode_truncate_pages(struct inode *inode)
5229 {
5230 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5231 struct rb_node *node;
5232
5233 ASSERT(inode->i_state & I_FREEING);
5234 truncate_inode_pages_final(&inode->i_data);
5235
5236 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5237
5238 /*
5239 * Keep looping until we have no more ranges in the io tree.
5240 * We can have ongoing bios started by readahead that have
5241 * their endio callback (extent_io.c:end_bio_extent_readpage)
5242 * still in progress (they unlocked the pages in the bio but have not
5243 * yet unlocked the ranges in the io tree). This means some ranges can
5244 * still be locked while eviction has started, because before
5245 * submitting those bios, which are executed by a separate task (work
5246 * queue kthread), inode references (inode->i_count) were not taken
5247 * (they would be dropped in the end io callback of each bio).
5248 * Therefore here we effectively end up waiting for those bios and
5249 * anyone else holding locked ranges without having bumped the inode's
5250 * reference count - if we don't do it, when they access the inode's
5251 * io_tree to unlock a range it may be too late, leading to a
5252 * use-after-free issue.
5253 */
5254 spin_lock(&io_tree->lock);
5255 while (!RB_EMPTY_ROOT(&io_tree->state)) {
5256 struct extent_state *state;
5257 struct extent_state *cached_state = NULL;
5258 u64 start;
5259 u64 end;
5260 unsigned state_flags;
5261
5262 node = rb_first(&io_tree->state);
5263 state = rb_entry(node, struct extent_state, rb_node);
5264 start = state->start;
5265 end = state->end;
5266 state_flags = state->state;
5267 spin_unlock(&io_tree->lock);
5268
5269 lock_extent(io_tree, start, end, &cached_state);
5270
5271 /*
5272 * If the range still has the DELALLOC flag, the extent didn't reach
5273 * disk, and its reserved space won't be freed by a delayed ref.
5274 * So we need to free its reserved space here.
5275 * (Refer to the comment in btrfs_invalidate_folio, case 2)
5276 *
5277 * Note, end is the bytenr of the last byte, so we need + 1 here.
5278 */
5279 if (state_flags & EXTENT_DELALLOC)
5280 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5281 end - start + 1, NULL);
5282
5283 clear_extent_bit(io_tree, start, end,
5284 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5285 &cached_state);
5286
5287 cond_resched();
5288 spin_lock(&io_tree->lock);
5289 }
5290 spin_unlock(&io_tree->lock);
5291 }
5292
5293 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5294 struct btrfs_block_rsv *rsv)
5295 {
5296 struct btrfs_fs_info *fs_info = root->fs_info;
5297 struct btrfs_trans_handle *trans;
5298 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5299 int ret;
5300
5301 /*
5302 * Eviction should be taking place at some place safe because of our
5303 * delayed iputs. However the normal flushing code will run delayed
5304 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5305 *
5306 * We reserve the delayed_refs_extra here again because we can't use
5307 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5308 * above. We reserve our extra bit here because we generate a ton of
5309 * delayed refs activity by truncating.
5310 *
5311 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5312 * if we fail to make this reservation we can re-try without the
5313 * delayed_refs_extra so we can make some forward progress.
5314 */
5315 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5316 BTRFS_RESERVE_FLUSH_EVICT);
5317 if (ret) {
5318 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5319 BTRFS_RESERVE_FLUSH_EVICT);
5320 if (ret) {
5321 btrfs_warn(fs_info,
5322 "could not allocate space for delete; will truncate on mount");
5323 return ERR_PTR(-ENOSPC);
5324 }
5325 delayed_refs_extra = 0;
5326 }
5327
5328 trans = btrfs_join_transaction(root);
5329 if (IS_ERR(trans))
5330 return trans;
5331
5332 if (delayed_refs_extra) {
5333 trans->block_rsv = &fs_info->trans_block_rsv;
5334 trans->bytes_reserved = delayed_refs_extra;
5335 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5336 delayed_refs_extra, true);
5337 }
5338 return trans;
5339 }
5340
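/*
 * Callback for when the VFS evicts the inode: release its page cache and
 * extent state and, if the inode was unlinked, truncate all of its items and
 * remove its orphan item, retrying while metadata space can be reserved.
 */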
5341 void btrfs_evict_inode(struct inode *inode)
5342 {
5343 struct btrfs_fs_info *fs_info;
5344 struct btrfs_trans_handle *trans;
5345 struct btrfs_root *root = BTRFS_I(inode)->root;
5346 struct btrfs_block_rsv *rsv = NULL;
5347 int ret;
5348
5349 trace_btrfs_inode_evict(inode);
5350
5351 if (!root) {
5352 fsverity_cleanup_inode(inode);
5353 clear_inode(inode);
5354 return;
5355 }
5356
5357 fs_info = inode_to_fs_info(inode);
5358 evict_inode_truncate_pages(inode);
5359
5360 if (inode->i_nlink &&
5361 ((btrfs_root_refs(&root->root_item) != 0 &&
5362 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
5363 btrfs_is_free_space_inode(BTRFS_I(inode))))
5364 goto out;
5365
5366 if (is_bad_inode(inode))
5367 goto out;
5368
5369 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5370 goto out;
5371
5372 if (inode->i_nlink > 0) {
5373 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5374 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
5375 goto out;
5376 }
5377
5378 /*
5379 * This makes sure the inode item in the tree is up to date and the space for
5380 * the inode update is released.
5381 */
5382 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5383 if (ret)
5384 goto out;
5385
5386 /*
5387 * This drops any pending insert or delete operations we have for this
5388 * inode. We could have a delayed dir index deletion queued up, but
5389 * we're removing the inode completely so that'll be taken care of in
5390 * the truncate.
5391 */
5392 btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5393
5394 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5395 if (!rsv)
5396 goto out;
5397 rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5398 rsv->failfast = true;
5399
5400 btrfs_i_size_write(BTRFS_I(inode), 0);
5401
5402 while (1) {
5403 struct btrfs_truncate_control control = {
5404 .inode = BTRFS_I(inode),
5405 .ino = btrfs_ino(BTRFS_I(inode)),
5406 .new_size = 0,
5407 .min_type = 0,
5408 };
5409
5410 trans = evict_refill_and_join(root, rsv);
5411 if (IS_ERR(trans))
5412 goto out;
5413
5414 trans->block_rsv = rsv;
5415
5416 ret = btrfs_truncate_inode_items(trans, root, &control);
5417 trans->block_rsv = &fs_info->trans_block_rsv;
5418 btrfs_end_transaction(trans);
5419 /*
5420 * We have not added new delayed items for our inode after we
5421 * have flushed its delayed items, so no need to throttle on
5422 * delayed items. However we have modified extent buffers.
5423 */
5424 btrfs_btree_balance_dirty_nodelay(fs_info);
5425 if (ret && ret != -ENOSPC && ret != -EAGAIN)
5426 goto out;
5427 else if (!ret)
5428 break;
5429 }
5430
5431 /*
5432 * Errors here aren't a big deal, it just means we leave orphan items in
5433 * the tree. They will be cleaned up on the next mount. If the inode
5434 * number gets reused, cleanup deletes the orphan item without doing
5435 * anything, and unlink reuses the existing orphan item.
5436 *
5437 * If it turns out that we are dropping too many of these, we might want
5438 * to add a mechanism for retrying these after a commit.
5439 */
5440 trans = evict_refill_and_join(root, rsv);
5441 if (!IS_ERR(trans)) {
5442 trans->block_rsv = rsv;
5443 btrfs_orphan_del(trans, BTRFS_I(inode));
5444 trans->block_rsv = &fs_info->trans_block_rsv;
5445 btrfs_end_transaction(trans);
5446 }
5447
5448 out:
5449 btrfs_free_block_rsv(fs_info, rsv);
5450 /*
5451 * If we didn't successfully delete, the orphan item will still be in
5452 * the tree and we'll retry on the next mount. Again, we might also want
5453 * to retry these periodically in the future.
5454 */
5455 btrfs_remove_delayed_node(BTRFS_I(inode));
5456 fsverity_cleanup_inode(inode);
5457 clear_inode(inode);
5458 }
5459
5460 /*
5461 * Return the key found in the dir entry in the location pointer, fill @type
5462 * with BTRFS_FT_*, and return 0.
5463 *
5464 * If no dir entries were found, returns -ENOENT.
5465 * If a corrupted location is found in the dir entry, returns -EUCLEAN.
5466 */
5467 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5468 struct btrfs_key *location, u8 *type)
5469 {
5470 struct btrfs_dir_item *di;
5471 struct btrfs_path *path;
5472 struct btrfs_root *root = dir->root;
5473 int ret = 0;
5474 struct fscrypt_name fname;
5475
5476 path = btrfs_alloc_path();
5477 if (!path)
5478 return -ENOMEM;
5479
5480 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5481 if (ret < 0)
5482 goto out;
5483 /*
5484 * fscrypt_setup_filename() should never return a positive value, but
5485 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5486 */
5487 ASSERT(ret == 0);
5488
5489 /* This needs to handle no-key deletions later on */
5490
5491 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5492 &fname.disk_name, 0);
5493 if (IS_ERR_OR_NULL(di)) {
5494 ret = di ? PTR_ERR(di) : -ENOENT;
5495 goto out;
5496 }
5497
5498 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5499 if (location->type != BTRFS_INODE_ITEM_KEY &&
5500 location->type != BTRFS_ROOT_ITEM_KEY) {
5501 ret = -EUCLEAN;
5502 btrfs_warn(root->fs_info,
5503 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5504 __func__, fname.disk_name.name, btrfs_ino(dir),
5505 location->objectid, location->type, location->offset);
5506 }
5507 if (!ret)
5508 *type = btrfs_dir_ftype(path->nodes[0], di);
5509 out:
5510 fscrypt_free_filename(&fname);
5511 btrfs_free_path(path);
5512 return ret;
5513 }
5514
5515 /*
5516 * when we hit a tree root in a directory, the btrfs part of the inode
5517 * needs to be changed to reflect the root directory of the tree root. This
5518 * is kind of like crossing a mount point.
5519 */
5520 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5521 struct btrfs_inode *dir,
5522 struct dentry *dentry,
5523 struct btrfs_key *location,
5524 struct btrfs_root **sub_root)
5525 {
5526 struct btrfs_path *path;
5527 struct btrfs_root *new_root;
5528 struct btrfs_root_ref *ref;
5529 struct extent_buffer *leaf;
5530 struct btrfs_key key;
5531 int ret;
5532 int err = 0;
5533 struct fscrypt_name fname;
5534
5535 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5536 if (ret)
5537 return ret;
5538
5539 path = btrfs_alloc_path();
5540 if (!path) {
5541 err = -ENOMEM;
5542 goto out;
5543 }
5544
5545 err = -ENOENT;
5546 key.objectid = btrfs_root_id(dir->root);
5547 key.type = BTRFS_ROOT_REF_KEY;
5548 key.offset = location->objectid;
5549
5550 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5551 if (ret) {
5552 if (ret < 0)
5553 err = ret;
5554 goto out;
5555 }
5556
5557 leaf = path->nodes[0];
5558 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5559 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5560 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5561 goto out;
5562
5563 ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5564 (unsigned long)(ref + 1), fname.disk_name.len);
5565 if (ret)
5566 goto out;
5567
5568 btrfs_release_path(path);
5569
5570 new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5571 if (IS_ERR(new_root)) {
5572 err = PTR_ERR(new_root);
5573 goto out;
5574 }
5575
5576 *sub_root = new_root;
5577 location->objectid = btrfs_root_dirid(&new_root->root_item);
5578 location->type = BTRFS_INODE_ITEM_KEY;
5579 location->offset = 0;
5580 err = 0;
5581 out:
5582 btrfs_free_path(path);
5583 fscrypt_free_filename(&fname);
5584 return err;
5585 }
5586
5587
5588
5589 static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
5590 {
5591 struct btrfs_root *root = inode->root;
5592 struct btrfs_inode *entry;
5593 bool empty = false;
5594
5595 xa_lock(&root->inodes);
5596 entry = __xa_erase(&root->inodes, btrfs_ino(inode));
5597 if (entry == inode)
5598 empty = xa_empty(&root->inodes);
5599 xa_unlock(&root->inodes);
5600
5601 if (empty && btrfs_root_refs(&root->root_item) == 0) {
5602 xa_lock(&root->inodes);
5603 empty = xa_empty(&root->inodes);
5604 xa_unlock(&root->inodes);
5605 if (empty)
5606 btrfs_add_dead_root(root);
5607 }
5608 }
5609
5610
5611 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5612 {
5613 struct btrfs_iget_args *args = p;
5614
5615 btrfs_set_inode_number(BTRFS_I(inode), args->ino);
5616 BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5617
5618 if (args->root && args->root == args->root->fs_info->tree_root &&
5619 args->ino != BTRFS_BTREE_INODE_OBJECTID)
5620 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5621 &BTRFS_I(inode)->runtime_flags);
5622 return 0;
5623 }
5624
5625 static int btrfs_find_actor(struct inode *inode, void *opaque)
5626 {
5627 struct btrfs_iget_args *args = opaque;
5628
5629 return args->ino == btrfs_ino(BTRFS_I(inode)) &&
5630 args->root == BTRFS_I(inode)->root;
5631 }
5632
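/*
 * Find the inode for (ino, root) in the VFS inode cache, or allocate a new
 * one. A newly allocated inode is returned with I_NEW set and must be read in
 * and unlocked by the caller.
 */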
5633 static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
5634 {
5635 struct inode *inode;
5636 struct btrfs_iget_args args;
5637 unsigned long hashval = btrfs_inode_hash(ino, root);
5638
5639 args.ino = ino;
5640 args.root = root;
5641
5642 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
5643 btrfs_init_locked_inode,
5644 (void *)&args);
5645 if (!inode)
5646 return NULL;
5647 return BTRFS_I(inode);
5648 }
5649
5650 /*
5651 * Get an inode object given its inode number and corresponding root. The path
5652 * is preallocated to prevent recursing back into iget through the allocator.
5653 */
5654 struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
5655 struct btrfs_path *path)
5656 {
5657 struct btrfs_inode *inode;
5658 int ret;
5659
5660 inode = btrfs_iget_locked(ino, root);
5661 if (!inode)
5662 return ERR_PTR(-ENOMEM);
5663
5664 if (!(inode->vfs_inode.i_state & I_NEW))
5665 return inode;
5666
5667 ret = btrfs_read_locked_inode(inode, path);
5668 if (ret)
5669 return ERR_PTR(ret);
5670
5671 unlock_new_inode(&inode->vfs_inode);
5672 return inode;
5673 }
5674
5675 /*
5676 * Get an inode object given its inode number and corresponding root.
5677 */
5678 struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
5679 {
5680 struct btrfs_inode *inode;
5681 struct btrfs_path *path;
5682 int ret;
5683
5684 inode = btrfs_iget_locked(ino, root);
5685 if (!inode)
5686 return ERR_PTR(-ENOMEM);
5687
5688 if (!(inode->vfs_inode.i_state & I_NEW))
5689 return inode;
5690
5691 path = btrfs_alloc_path();
5692 if (!path) {
5693 iget_failed(&inode->vfs_inode);
5694 return ERR_PTR(-ENOMEM);
5695 }
5696
5697 ret = btrfs_read_locked_inode(inode, path);
5698 btrfs_free_path(path);
5699 if (ret)
5700 return ERR_PTR(ret);
5701
5702 unlock_new_inode(&inode->vfs_inode);
5703 return inode;
5704 }
5705
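/*
 * Build an in-memory only placeholder directory inode, used when a directory
 * entry points at a subvolume root that cannot be resolved.
 */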
5706 static struct btrfs_inode *new_simple_dir(struct inode *dir,
5707 struct btrfs_key *key,
5708 struct btrfs_root *root)
5709 {
5710 struct timespec64 ts;
5711 struct inode *vfs_inode;
5712 struct btrfs_inode *inode;
5713
5714 vfs_inode = new_inode(dir->i_sb);
5715 if (!vfs_inode)
5716 return ERR_PTR(-ENOMEM);
5717
5718 inode = BTRFS_I(vfs_inode);
5719 inode->root = btrfs_grab_root(root);
5720 inode->ref_root_id = key->objectid;
5721 set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
5722 set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
5723
5724 btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
5725 /*
5726 * We only need lookup; the rest is read-only and there's no inode
5727 * associated with the dentry.
5728 */
5729 vfs_inode->i_op = &simple_dir_inode_operations;
5730 vfs_inode->i_opflags &= ~IOP_XATTR;
5731 vfs_inode->i_fop = &simple_dir_operations;
5732 vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5733
5734 ts = inode_set_ctime_current(vfs_inode);
5735 inode_set_mtime_to_ts(vfs_inode, ts);
5736 inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
5737 inode->i_otime_sec = ts.tv_sec;
5738 inode->i_otime_nsec = ts.tv_nsec;
5739
5740 vfs_inode->i_uid = dir->i_uid;
5741 vfs_inode->i_gid = dir->i_gid;
5742
5743 return inode;
5744 }
5745
5746 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5747 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5748 static_assert(BTRFS_FT_DIR == FT_DIR);
5749 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5750 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5751 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5752 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5753 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5754
5755 static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
5756 {
5757 return fs_umode_to_ftype(inode->vfs_inode.i_mode);
5758 }
5759
5760 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5761 {
5762 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
5763 struct btrfs_inode *inode;
5764 struct btrfs_root *root = BTRFS_I(dir)->root;
5765 struct btrfs_root *sub_root = root;
5766 struct btrfs_key location = { 0 };
5767 u8 di_type = 0;
5768 int ret = 0;
5769
5770 if (dentry->d_name.len > BTRFS_NAME_LEN)
5771 return ERR_PTR(-ENAMETOOLONG);
5772
5773 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5774 if (ret < 0)
5775 return ERR_PTR(ret);
5776
5777 if (location.type == BTRFS_INODE_ITEM_KEY) {
5778 inode = btrfs_iget(location.objectid, root);
5779 if (IS_ERR(inode))
5780 return ERR_CAST(inode);
5781
5782 /* Do extra check against inode mode with di_type */
5783 if (btrfs_inode_type(inode) != di_type) {
5784 btrfs_crit(fs_info,
5785 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5786 inode->vfs_inode.i_mode, btrfs_inode_type(inode),
5787 di_type);
5788 iput(&inode->vfs_inode);
5789 return ERR_PTR(-EUCLEAN);
5790 }
5791 return &inode->vfs_inode;
5792 }
5793
5794 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5795 &location, &sub_root);
5796 if (ret < 0) {
5797 if (ret != -ENOENT)
5798 inode = ERR_PTR(ret);
5799 else
5800 inode = new_simple_dir(dir, &location, root);
5801 } else {
5802 inode = btrfs_iget(location.objectid, sub_root);
5803 btrfs_put_root(sub_root);
5804
5805 if (IS_ERR(inode))
5806 return ERR_CAST(inode);
5807
5808 down_read(&fs_info->cleanup_work_sem);
5809 if (!sb_rdonly(inode->vfs_inode.i_sb))
5810 ret = btrfs_orphan_cleanup(sub_root);
5811 up_read(&fs_info->cleanup_work_sem);
5812 if (ret) {
5813 iput(&inode->vfs_inode);
5814 inode = ERR_PTR(ret);
5815 }
5816 }
5817
5818 if (IS_ERR(inode))
5819 return ERR_CAST(inode);
5820
5821 return &inode->vfs_inode;
5822 }
5823
5824 static int btrfs_dentry_delete(const struct dentry *dentry)
5825 {
5826 struct btrfs_root *root;
5827 struct inode *inode = d_inode(dentry);
5828
5829 if (!inode && !IS_ROOT(dentry))
5830 inode = d_inode(dentry->d_parent);
5831
5832 if (inode) {
5833 root = BTRFS_I(inode)->root;
5834 if (btrfs_root_refs(&root->root_item) == 0)
5835 return 1;
5836
5837 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5838 return 1;
5839 }
5840 return 0;
5841 }
5842
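/*
 * The ->lookup() directory inode operation: a missing entry results in a
 * negative dentry rather than an error.
 */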
5843 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5844 unsigned int flags)
5845 {
5846 struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5847
5848 if (inode == ERR_PTR(-ENOENT))
5849 inode = NULL;
5850 return d_splice_alias(inode, dentry);
5851 }
5852
5853 /*
5854 * Find the highest existing sequence number in a directory and then set the
5855 * in-memory index_cnt variable to the first free sequence number.
5856 */
5857 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5858 {
5859 struct btrfs_root *root = inode->root;
5860 struct btrfs_key key, found_key;
5861 struct btrfs_path *path;
5862 struct extent_buffer *leaf;
5863 int ret;
5864
5865 key.objectid = btrfs_ino(inode);
5866 key.type = BTRFS_DIR_INDEX_KEY;
5867 key.offset = (u64)-1;
5868
5869 path = btrfs_alloc_path();
5870 if (!path)
5871 return -ENOMEM;
5872
5873 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5874 if (ret < 0)
5875 goto out;
5876 /* FIXME: we should be able to handle this */
5877 if (ret == 0)
5878 goto out;
5879 ret = 0;
5880
5881 if (path->slots[0] == 0) {
5882 inode->index_cnt = BTRFS_DIR_START_INDEX;
5883 goto out;
5884 }
5885
5886 path->slots[0]--;
5887
5888 leaf = path->nodes[0];
5889 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5890
5891 if (found_key.objectid != btrfs_ino(inode) ||
5892 found_key.type != BTRFS_DIR_INDEX_KEY) {
5893 inode->index_cnt = BTRFS_DIR_START_INDEX;
5894 goto out;
5895 }
5896
5897 inode->index_cnt = found_key.offset + 1;
5898 out:
5899 btrfs_free_path(path);
5900 return ret;
5901 }
5902
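/*
 * Return the highest directory index in use (index_cnt - 1), computing
 * index_cnt first if it is not cached.
 */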
5903 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5904 {
5905 int ret = 0;
5906
5907 btrfs_inode_lock(dir, 0);
5908 if (dir->index_cnt == (u64)-1) {
5909 ret = btrfs_inode_delayed_dir_index_count(dir);
5910 if (ret) {
5911 ret = btrfs_set_inode_index_count(dir);
5912 if (ret)
5913 goto out;
5914 }
5915 }
5916
5917 /* index_cnt is the index number of the next new entry, so decrement it. */
5918 *index = dir->index_cnt - 1;
5919 out:
5920 btrfs_inode_unlock(dir, 0);
5921
5922 return ret;
5923 }
5924
5925 /*
5926 * All this infrastructure exists because dir_emit can fault, and we are holding
5927 * the tree lock when doing readdir. For now just allocate a buffer and copy
5928 * our information into that, and then dir_emit from the buffer. This is
5929 * similar to what NFS does, only we don't keep the buffer around in pagecache
5930 * because I'm afraid I'll mess that up. Long term we need to make filldir do
5931 * copy_to_user_inatomic so we don't have to worry about page faulting under the
5932 * tree lock.
5933 */
5934 static int btrfs_opendir(struct inode *inode, struct file *file)
5935 {
5936 struct btrfs_file_private *private;
5937 u64 last_index;
5938 int ret;
5939
5940 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5941 if (ret)
5942 return ret;
5943
5944 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5945 if (!private)
5946 return -ENOMEM;
5947 private->last_index = last_index;
5948 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5949 if (!private->filldir_buf) {
5950 kfree(private);
5951 return -ENOMEM;
5952 }
5953 file->private_data = private;
5954 return 0;
5955 }
5956
5957 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5958 {
5959 struct btrfs_file_private *private = file->private_data;
5960 int ret;
5961
5962 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5963 &private->last_index);
5964 if (ret)
5965 return ret;
5966
5967 return generic_file_llseek(file, offset, whence);
5968 }
5969
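/*
 * Layout of one buffered readdir entry: this header is immediately followed
 * by name_len bytes of the entry name in the filldir buffer.
 */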
5970 struct dir_entry {
5971 u64 ino;
5972 u64 offset;
5973 unsigned type;
5974 int name_len;
5975 };
5976
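/*
 * Emit the entries buffered in 'addr' to the VFS through dir_emit().
 * Returns 1 if dir_emit() asked us to stop, 0 when all buffered entries were
 * emitted.
 */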
5977 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5978 {
5979 while (entries--) {
5980 struct dir_entry *entry = addr;
5981 char *name = (char *)(entry + 1);
5982
5983 ctx->pos = get_unaligned(&entry->offset);
5984 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5985 get_unaligned(&entry->ino),
5986 get_unaligned(&entry->type)))
5987 return 1;
5988 addr += sizeof(struct dir_entry) +
5989 get_unaligned(&entry->name_len);
5990 ctx->pos++;
5991 }
5992 return 0;
5993 }
5994
5995 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5996 {
5997 struct inode *inode = file_inode(file);
5998 struct btrfs_root *root = BTRFS_I(inode)->root;
5999 struct btrfs_file_private *private = file->private_data;
6000 struct btrfs_dir_item *di;
6001 struct btrfs_key key;
6002 struct btrfs_key found_key;
6003 struct btrfs_path *path;
6004 void *addr;
6005 LIST_HEAD(ins_list);
6006 LIST_HEAD(del_list);
6007 int ret;
6008 char *name_ptr;
6009 int name_len;
6010 int entries = 0;
6011 int total_len = 0;
6012 bool put = false;
6013 struct btrfs_key location;
6014
6015 if (!dir_emit_dots(file, ctx))
6016 return 0;
6017
6018 path = btrfs_alloc_path();
6019 if (!path)
6020 return -ENOMEM;
6021
6022 addr = private->filldir_buf;
6023 path->reada = READA_FORWARD;
6024
6025 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
6026 &ins_list, &del_list);
6027
6028 again:
6029 key.type = BTRFS_DIR_INDEX_KEY;
6030 key.offset = ctx->pos;
6031 key.objectid = btrfs_ino(BTRFS_I(inode));
6032
6033 btrfs_for_each_slot(root, &key, &found_key, path, ret) {
6034 struct dir_entry *entry;
6035 struct extent_buffer *leaf = path->nodes[0];
6036 u8 ftype;
6037
6038 if (found_key.objectid != key.objectid)
6039 break;
6040 if (found_key.type != BTRFS_DIR_INDEX_KEY)
6041 break;
6042 if (found_key.offset < ctx->pos)
6043 continue;
6044 if (found_key.offset > private->last_index)
6045 break;
6046 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
6047 continue;
6048 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
6049 name_len = btrfs_dir_name_len(leaf, di);
6050 if ((total_len + sizeof(struct dir_entry) + name_len) >=
6051 PAGE_SIZE) {
6052 btrfs_release_path(path);
6053 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6054 if (ret)
6055 goto nopos;
6056 addr = private->filldir_buf;
6057 entries = 0;
6058 total_len = 0;
6059 goto again;
6060 }
6061
6062 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
6063 entry = addr;
6064 name_ptr = (char *)(entry + 1);
6065 read_extent_buffer(leaf, name_ptr,
6066 (unsigned long)(di + 1), name_len);
6067 put_unaligned(name_len, &entry->name_len);
6068 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
6069 btrfs_dir_item_key_to_cpu(leaf, di, &location);
6070 put_unaligned(location.objectid, &entry->ino);
6071 put_unaligned(found_key.offset, &entry->offset);
6072 entries++;
6073 addr += sizeof(struct dir_entry) + name_len;
6074 total_len += sizeof(struct dir_entry) + name_len;
6075 }
6076 /* Catch error encountered during iteration */
6077 if (ret < 0)
6078 goto err;
6079
6080 btrfs_release_path(path);
6081
6082 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6083 if (ret)
6084 goto nopos;
6085
6086 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6087 if (ret)
6088 goto nopos;
6089
6090 /*
6091 * Stop new entries from being returned after we return the last
6092 * entry.
6093 *
6094 * New directory entries are assigned a strictly increasing
6095 * offset. This means that new entries created during readdir
6096 * are *guaranteed* to be seen in the future by that readdir.
6097 * This has broken buggy programs which operate on names as
6098 * they're returned by readdir. Until we reuse freed offsets
6099 * we have this hack to stop new entries from being returned
6100 * under the assumption that they'll never reach this huge
6101 * offset.
6102 *
6103 * This is being careful not to overflow 32bit loff_t unless the
6104 * last entry requires it because doing so has broken 32bit apps
6105 * in the past.
6106 */
6107 if (ctx->pos >= INT_MAX)
6108 ctx->pos = LLONG_MAX;
6109 else
6110 ctx->pos = INT_MAX;
6111 nopos:
6112 ret = 0;
6113 err:
6114 if (put)
6115 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
6116 btrfs_free_path(path);
6117 return ret;
6118 }
6119
6120 /*
6121 * This is somewhat expensive, updating the tree every time the
6122 * inode changes. But it is most likely to find the inode in cache.
6123 * FIXME: needs more benchmarking... there are no reasons other than
6124 * performance to keep or drop this code.
6125 */
6126 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6127 {
6128 struct btrfs_root *root = inode->root;
6129 struct btrfs_fs_info *fs_info = root->fs_info;
6130 struct btrfs_trans_handle *trans;
6131 int ret;
6132
6133 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6134 return 0;
6135
6136 trans = btrfs_join_transaction(root);
6137 if (IS_ERR(trans))
6138 return PTR_ERR(trans);
6139
6140 ret = btrfs_update_inode(trans, inode);
6141 if (ret == -ENOSPC || ret == -EDQUOT) {
6142 /* whoops, lets try again with the full transaction */
6143 btrfs_end_transaction(trans);
6144 trans = btrfs_start_transaction(root, 1);
6145 if (IS_ERR(trans))
6146 return PTR_ERR(trans);
6147
6148 ret = btrfs_update_inode(trans, inode);
6149 }
6150 btrfs_end_transaction(trans);
6151 if (inode->delayed_node)
6152 btrfs_balance_delayed_items(fs_info);
6153
6154 return ret;
6155 }
6156
6157 /*
6158 * This is a copy of file_update_time. We need this so we can return error on
6159 * ENOSPC for updating the inode in the case of file write and mmap writes.
6160 */
6161 static int btrfs_update_time(struct inode *inode, int flags)
6162 {
6163 struct btrfs_root *root = BTRFS_I(inode)->root;
6164 bool dirty;
6165
6166 if (btrfs_root_readonly(root))
6167 return -EROFS;
6168
6169 dirty = inode_update_timestamps(inode, flags);
6170 return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6171 }
6172
6173 /*
6174 * Helper to find a free sequence number in a given directory. The current
6175 * code is very simple; later versions will do smarter things in the btree.
6176 */
6177 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6178 {
6179 int ret = 0;
6180
6181 if (dir->index_cnt == (u64)-1) {
6182 ret = btrfs_inode_delayed_dir_index_count(dir);
6183 if (ret) {
6184 ret = btrfs_set_inode_index_count(dir);
6185 if (ret)
6186 return ret;
6187 }
6188 }
6189
6190 *index = dir->index_cnt;
6191 dir->index_cnt++;
6192
6193 return ret;
6194 }
6195
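/*
 * Insert the newly allocated inode into the VFS inode hash so concurrent
 * lookups wait for it until it is fully set up.
 */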
6196 static int btrfs_insert_inode_locked(struct inode *inode)
6197 {
6198 struct btrfs_iget_args args;
6199
6200 args.ino = btrfs_ino(BTRFS_I(inode));
6201 args.root = BTRFS_I(inode)->root;
6202
6203 return insert_inode_locked4(inode,
6204 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6205 btrfs_find_actor, &args);
6206 }
6207
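/*
 * Prepare for creating a new inode before the transaction is started: set up
 * the filename and ACLs and compute how many tree items the transaction needs
 * to reserve.
 */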
6208 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6209 unsigned int *trans_num_items)
6210 {
6211 struct inode *dir = args->dir;
6212 struct inode *inode = args->inode;
6213 int ret;
6214
6215 if (!args->orphan) {
6216 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6217 &args->fname);
6218 if (ret)
6219 return ret;
6220 }
6221
6222 ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6223 if (ret) {
6224 fscrypt_free_filename(&args->fname);
6225 return ret;
6226 }
6227
6228 /* 1 to add inode item */
6229 *trans_num_items = 1;
6230 /* 1 to add compression property */
6231 if (BTRFS_I(dir)->prop_compress)
6232 (*trans_num_items)++;
6233 /* 1 to add default ACL xattr */
6234 if (args->default_acl)
6235 (*trans_num_items)++;
6236 /* 1 to add access ACL xattr */
6237 if (args->acl)
6238 (*trans_num_items)++;
6239 #ifdef CONFIG_SECURITY
6240 /* 1 to add LSM xattr */
6241 if (dir->i_security)
6242 (*trans_num_items)++;
6243 #endif
6244 if (args->orphan) {
6245 /* 1 to add orphan item */
6246 (*trans_num_items)++;
6247 } else {
6248 /*
6249 * 1 to add dir item
6250 * 1 to add dir index
6251 * 1 to update parent inode item
6252 *
6253 * No need for 1 unit for the inode ref item because it is
6254 * inserted in a batch together with the inode item at
6255 * btrfs_create_new_inode().
6256 */
6257 *trans_num_items += 3;
6258 }
6259 return 0;
6260 }
6261
6262 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6263 {
6264 posix_acl_release(args->acl);
6265 posix_acl_release(args->default_acl);
6266 fscrypt_free_filename(&args->fname);
6267 }
6268
6269 /*
6270 * Inherit flags from the parent inode.
6271 *
6272 * Currently only the compression flags and the cow flags are inherited.
6273 */
6274 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6275 {
6276 unsigned int flags;
6277
6278 flags = dir->flags;
6279
6280 if (flags & BTRFS_INODE_NOCOMPRESS) {
6281 inode->flags &= ~BTRFS_INODE_COMPRESS;
6282 inode->flags |= BTRFS_INODE_NOCOMPRESS;
6283 } else if (flags & BTRFS_INODE_COMPRESS) {
6284 inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6285 inode->flags |= BTRFS_INODE_COMPRESS;
6286 }
6287
6288 if (flags & BTRFS_INODE_NODATACOW) {
6289 inode->flags |= BTRFS_INODE_NODATACOW;
6290 if (S_ISREG(inode->vfs_inode.i_mode))
6291 inode->flags |= BTRFS_INODE_NODATASUM;
6292 }
6293
6294 btrfs_sync_inode_flags_to_i_flags(inode);
6295 }
6296
6297 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6298 struct btrfs_new_inode_args *args)
6299 {
6300 struct timespec64 ts;
6301 struct inode *dir = args->dir;
6302 struct inode *inode = args->inode;
6303 const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6304 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6305 struct btrfs_root *root;
6306 struct btrfs_inode_item *inode_item;
6307 struct btrfs_path *path;
6308 u64 objectid;
6309 struct btrfs_inode_ref *ref;
6310 struct btrfs_key key[2];
6311 u32 sizes[2];
6312 struct btrfs_item_batch batch;
6313 unsigned long ptr;
6314 int ret;
6315 bool xa_reserved = false;
6316
6317 path = btrfs_alloc_path();
6318 if (!path)
6319 return -ENOMEM;
6320
6321 if (!args->subvol)
6322 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6323 root = BTRFS_I(inode)->root;
6324
6325 ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
6326 if (ret)
6327 goto out;
6328
6329 ret = btrfs_get_free_objectid(root, &objectid);
6330 if (ret)
6331 goto out;
6332 btrfs_set_inode_number(BTRFS_I(inode), objectid);
6333
6334 ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
6335 if (ret)
6336 goto out;
6337 xa_reserved = true;
6338
6339 if (args->orphan) {
6340 /*
6341 * O_TMPFILE, set link count to 0, so that after this point, we
6342 * fill in an inode item with the correct link count.
6343 */
6344 set_nlink(inode, 0);
6345 } else {
6346 trace_btrfs_inode_request(dir);
6347
6348 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6349 if (ret)
6350 goto out;
6351 }
6352
6353 if (S_ISDIR(inode->i_mode))
6354 BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6355
6356 BTRFS_I(inode)->generation = trans->transid;
6357 inode->i_generation = BTRFS_I(inode)->generation;
6358
6359 /*
6360 * We don't have any capability xattrs set here yet, shortcut any
6361 * queries for the xattrs here. If we add them later via the inode
6362 * security init path or any other path this flag will be cleared.
6363 */
6364 set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
6365
6366 /*
6367 * Subvolumes don't inherit flags from their parent directory.
6368 * Originally this was probably by accident, but we probably can't
6369 * change it now without compatibility issues.
6370 */
6371 if (!args->subvol)
6372 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6373
6374 if (S_ISREG(inode->i_mode)) {
6375 if (btrfs_test_opt(fs_info, NODATASUM))
6376 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6377 if (btrfs_test_opt(fs_info, NODATACOW))
6378 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6379 BTRFS_INODE_NODATASUM;
6380 btrfs_update_inode_mapping_flags(BTRFS_I(inode));
6381 }
6382
6383 ret = btrfs_insert_inode_locked(inode);
6384 if (ret < 0) {
6385 if (!args->orphan)
6386 BTRFS_I(dir)->index_cnt--;
6387 goto out;
6388 }
6389
6390 /*
6391 * We could have gotten an inode number from somebody who was fsynced
6392 * and then removed in this same transaction, so let's just set full
6393 * sync since it will be a full sync anyway and this will blow away the
6394 * old info in the log.
6395 */
6396 btrfs_set_inode_full_sync(BTRFS_I(inode));
6397
6398 key[0].objectid = objectid;
6399 key[0].type = BTRFS_INODE_ITEM_KEY;
6400 key[0].offset = 0;
6401
6402 sizes[0] = sizeof(struct btrfs_inode_item);
6403
6404 if (!args->orphan) {
6405 /*
6406 * Start new inodes with an inode_ref. This is slightly more
6407 * efficient for small numbers of hard links since they will
6408 * be packed into one item. Extended refs will kick in if we
6409 * add more hard links than can fit in the ref item.
6410 */
6411 key[1].objectid = objectid;
6412 key[1].type = BTRFS_INODE_REF_KEY;
6413 if (args->subvol) {
6414 key[1].offset = objectid;
6415 sizes[1] = 2 + sizeof(*ref);
6416 } else {
6417 key[1].offset = btrfs_ino(BTRFS_I(dir));
6418 sizes[1] = name->len + sizeof(*ref);
6419 }
6420 }
6421
6422 batch.keys = &key[0];
6423 batch.data_sizes = &sizes[0];
6424 batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6425 batch.nr = args->orphan ? 1 : 2;
6426 ret = btrfs_insert_empty_items(trans, root, path, &batch);
6427 if (ret != 0) {
6428 btrfs_abort_transaction(trans, ret);
6429 goto discard;
6430 }
6431
6432 ts = simple_inode_init_ts(inode);
6433 BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
6434 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
6435
6436 /*
6437 * We're going to fill the inode item now, so at this point the inode
6438 * must be fully initialized.
6439 */
6440
6441 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6442 struct btrfs_inode_item);
6443 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6444 sizeof(*inode_item));
6445 fill_inode_item(trans, path->nodes[0], inode_item, inode);
6446
6447 if (!args->orphan) {
6448 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6449 struct btrfs_inode_ref);
6450 ptr = (unsigned long)(ref + 1);
6451 if (args->subvol) {
6452 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6453 btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6454 write_extent_buffer(path->nodes[0], "..", ptr, 2);
6455 } else {
6456 btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6457 name->len);
6458 btrfs_set_inode_ref_index(path->nodes[0], ref,
6459 BTRFS_I(inode)->dir_index);
6460 write_extent_buffer(path->nodes[0], name->name, ptr,
6461 name->len);
6462 }
6463 }
6464
6465 /*
6466 * We don't need the path anymore, plus inheriting properties, adding
6467 * ACLs, security xattrs, orphan item or adding the link, will result in
6468 * allocating yet another path. So just free our path.
6469 */
6470 btrfs_free_path(path);
6471 path = NULL;
6472
6473 if (args->subvol) {
6474 struct btrfs_inode *parent;
6475
6476 /*
6477 * Subvolumes inherit properties from their parent subvolume,
6478 * not the directory they were created in.
6479 */
6480 parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
6481 if (IS_ERR(parent)) {
6482 ret = PTR_ERR(parent);
6483 } else {
6484 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6485 parent);
6486 iput(&parent->vfs_inode);
6487 }
6488 } else {
6489 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6490 BTRFS_I(dir));
6491 }
6492 if (ret) {
6493 btrfs_err(fs_info,
6494 "error inheriting props for ino %llu (root %llu): %d",
6495 btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
6496 }
6497
6498 /*
6499 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6500 * probably a bug.
6501 */
6502 if (!args->subvol) {
6503 ret = btrfs_init_inode_security(trans, args);
6504 if (ret) {
6505 btrfs_abort_transaction(trans, ret);
6506 goto discard;
6507 }
6508 }
6509
6510 ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
6511 if (WARN_ON(ret)) {
6512 /* Shouldn't happen, we used xa_reserve() before. */
6513 btrfs_abort_transaction(trans, ret);
6514 goto discard;
6515 }
6516
6517 trace_btrfs_inode_new(inode);
6518 btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6519
6520 btrfs_update_root_times(trans, root);
6521
6522 if (args->orphan) {
6523 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6524 } else {
6525 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6526 0, BTRFS_I(inode)->dir_index);
6527 }
6528 if (ret) {
6529 btrfs_abort_transaction(trans, ret);
6530 goto discard;
6531 }
6532
6533 return 0;
6534
6535 discard:
6536 /*
6537 * discard_new_inode() calls iput(), but the caller owns the reference
6538 * to the inode.
6539 */
6540 ihold(inode);
6541 discard_new_inode(inode);
6542 out:
6543 if (xa_reserved)
6544 xa_release(&root->inodes, objectid);
6545
6546 btrfs_free_path(path);
6547 return ret;
6548 }
6549
6550 /*
6551 * Utility function to add 'inode' into 'parent_inode' with
6552 * a given name and a given sequence number.
6553 * If 'add_backref' is true, also insert a backref from the
6554 * inode to the parent directory.
6555 */
6556 int btrfs_add_link(struct btrfs_trans_handle *trans,
6557 struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6558 const struct fscrypt_str *name, int add_backref, u64 index)
6559 {
6560 int ret = 0;
6561 struct btrfs_key key;
6562 struct btrfs_root *root = parent_inode->root;
6563 u64 ino = btrfs_ino(inode);
6564 u64 parent_ino = btrfs_ino(parent_inode);
6565
6566 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6567 memcpy(&key, &inode->root->root_key, sizeof(key));
6568 } else {
6569 key.objectid = ino;
6570 key.type = BTRFS_INODE_ITEM_KEY;
6571 key.offset = 0;
6572 }
6573
6574 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6575 ret = btrfs_add_root_ref(trans, key.objectid,
6576 btrfs_root_id(root), parent_ino,
6577 index, name);
6578 } else if (add_backref) {
6579 ret = btrfs_insert_inode_ref(trans, root, name,
6580 ino, parent_ino, index);
6581 }
6582
6583 /* Nothing to clean up yet */
6584 if (ret)
6585 return ret;
6586
6587 ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6588 btrfs_inode_type(inode), index);
6589 if (ret == -EEXIST || ret == -EOVERFLOW)
6590 goto fail_dir_item;
6591 else if (ret) {
6592 btrfs_abort_transaction(trans, ret);
6593 return ret;
6594 }
6595
6596 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6597 name->len * 2);
6598 inode_inc_iversion(&parent_inode->vfs_inode);
6599 /*
6600 * If we are replaying a log tree, we do not want to update the mtime
6601 * and ctime of the parent directory with the current time, since the
6602 * log replay procedure is responsible for setting them to their correct
6603 * values (the ones the directory had when the fsync was done).
6604 */
6605 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6606 inode_set_mtime_to_ts(&parent_inode->vfs_inode,
6607 inode_set_ctime_current(&parent_inode->vfs_inode));
6608
6609 ret = btrfs_update_inode(trans, parent_inode);
6610 if (ret)
6611 btrfs_abort_transaction(trans, ret);
6612 return ret;
6613
6614 fail_dir_item:
6615 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6616 u64 local_index;
6617 int err;
6618 err = btrfs_del_root_ref(trans, key.objectid,
6619 btrfs_root_id(root), parent_ino,
6620 &local_index, name);
6621 if (err)
6622 btrfs_abort_transaction(trans, err);
6623 } else if (add_backref) {
6624 u64 local_index;
6625 int err;
6626
6627 err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6628 &local_index);
6629 if (err)
6630 btrfs_abort_transaction(trans, err);
6631 }
6632
6633 /* Return the original error code */
6634 return ret;
6635 }
6636
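/*
 * Common helper for mknod/create/mkdir: start a transaction sized by
 * btrfs_new_inode_prepare(), create the new inode and instantiate the dentry;
 * drops the inode reference on failure.
 */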
6637 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6638 struct inode *inode)
6639 {
6640 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6641 struct btrfs_root *root = BTRFS_I(dir)->root;
6642 struct btrfs_new_inode_args new_inode_args = {
6643 .dir = dir,
6644 .dentry = dentry,
6645 .inode = inode,
6646 };
6647 unsigned int trans_num_items;
6648 struct btrfs_trans_handle *trans;
6649 int err;
6650
6651 err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6652 if (err)
6653 goto out_inode;
6654
6655 trans = btrfs_start_transaction(root, trans_num_items);
6656 if (IS_ERR(trans)) {
6657 err = PTR_ERR(trans);
6658 goto out_new_inode_args;
6659 }
6660
6661 err = btrfs_create_new_inode(trans, &new_inode_args);
6662 if (!err)
6663 d_instantiate_new(dentry, inode);
6664
6665 btrfs_end_transaction(trans);
6666 btrfs_btree_balance_dirty(fs_info);
6667 out_new_inode_args:
6668 btrfs_new_inode_args_destroy(&new_inode_args);
6669 out_inode:
6670 if (err)
6671 iput(inode);
6672 return err;
6673 }
6674
6675 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6676 struct dentry *dentry, umode_t mode, dev_t rdev)
6677 {
6678 struct inode *inode;
6679
6680 inode = new_inode(dir->i_sb);
6681 if (!inode)
6682 return -ENOMEM;
6683 inode_init_owner(idmap, inode, dir, mode);
6684 inode->i_op = &btrfs_special_inode_operations;
6685 init_special_inode(inode, inode->i_mode, rdev);
6686 return btrfs_create_common(dir, dentry, inode);
6687 }
6688
6689 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6690 struct dentry *dentry, umode_t mode, bool excl)
6691 {
6692 struct inode *inode;
6693
6694 inode = new_inode(dir->i_sb);
6695 if (!inode)
6696 return -ENOMEM;
6697 inode_init_owner(idmap, inode, dir, mode);
6698 inode->i_fop = &btrfs_file_operations;
6699 inode->i_op = &btrfs_file_inode_operations;
6700 inode->i_mapping->a_ops = &btrfs_aops;
6701 return btrfs_create_common(dir, dentry, inode);
6702 }
6703
6704 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6705 struct dentry *dentry)
6706 {
6707 struct btrfs_trans_handle *trans = NULL;
6708 struct btrfs_root *root = BTRFS_I(dir)->root;
6709 struct inode *inode = d_inode(old_dentry);
6710 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
6711 struct fscrypt_name fname;
6712 u64 index;
6713 int err;
6714 int drop_inode = 0;
6715
6716 /* Do not allow hard links across different subvolumes of the same device. */
6717 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
6718 return -EXDEV;
6719
6720 if (inode->i_nlink >= BTRFS_LINK_MAX)
6721 return -EMLINK;
6722
6723 err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6724 if (err)
6725 goto fail;
6726
6727 err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6728 if (err)
6729 goto fail;
6730
6731 /*
6732 * 2 items for inode and inode ref
6733 * 2 items for dir items
6734 * 1 item for parent inode
6735 * 1 item for orphan item deletion if O_TMPFILE
6736 */
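	/*
	 * Illustration of the count above: 2 + 2 + 1 = 5 units for a normal
	 * link; when the target inode still has i_nlink == 0 (it was created
	 * with O_TMPFILE) one more unit is needed to delete its orphan item,
	 * hence the 5 vs 6 choice below.
	 */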
6737 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6738 if (IS_ERR(trans)) {
6739 err = PTR_ERR(trans);
6740 trans = NULL;
6741 goto fail;
6742 }
6743
6744 /* There are several dir indexes for this inode, clear the cache. */
6745 BTRFS_I(inode)->dir_index = 0ULL;
6746 inc_nlink(inode);
6747 inode_inc_iversion(inode);
6748 inode_set_ctime_current(inode);
6749 ihold(inode);
6750 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6751
6752 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6753 &fname.disk_name, 1, index);
6754
6755 if (err) {
6756 drop_inode = 1;
6757 } else {
6758 struct dentry *parent = dentry->d_parent;
6759
6760 err = btrfs_update_inode(trans, BTRFS_I(inode));
6761 if (err)
6762 goto fail;
6763 if (inode->i_nlink == 1) {
6764 /*
6765 * If new hard link count is 1, it's a file created
6766 * with open(2) O_TMPFILE flag.
6767 */
6768 err = btrfs_orphan_del(trans, BTRFS_I(inode));
6769 if (err)
6770 goto fail;
6771 }
6772 d_instantiate(dentry, inode);
6773 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6774 }
6775
6776 fail:
6777 fscrypt_free_filename(&fname);
6778 if (trans)
6779 btrfs_end_transaction(trans);
6780 if (drop_inode) {
6781 inode_dec_link_count(inode);
6782 iput(inode);
6783 }
6784 btrfs_btree_balance_dirty(fs_info);
6785 return err;
6786 }
6787
6788 static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6789 struct dentry *dentry, umode_t mode)
6790 {
6791 struct inode *inode;
6792
6793 inode = new_inode(dir->i_sb);
6794 if (!inode)
6795 return ERR_PTR(-ENOMEM);
6796 inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6797 inode->i_op = &btrfs_dir_inode_operations;
6798 inode->i_fop = &btrfs_dir_file_operations;
6799 return ERR_PTR(btrfs_create_common(dir, dentry, inode));
6800 }
6801
6802 static noinline int uncompress_inline(struct btrfs_path *path,
6803 struct folio *folio,
6804 struct btrfs_file_extent_item *item)
6805 {
6806 int ret;
6807 struct extent_buffer *leaf = path->nodes[0];
6808 const u32 blocksize = leaf->fs_info->sectorsize;
6809 char *tmp;
6810 size_t max_size;
6811 unsigned long inline_size;
6812 unsigned long ptr;
6813 int compress_type;
6814
6815 compress_type = btrfs_file_extent_compression(leaf, item);
6816 max_size = btrfs_file_extent_ram_bytes(leaf, item);
6817 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6818 tmp = kmalloc(inline_size, GFP_NOFS);
6819 if (!tmp)
6820 return -ENOMEM;
6821 ptr = btrfs_file_extent_inline_start(item);
6822
6823 read_extent_buffer(leaf, tmp, ptr, inline_size);
6824
6825 max_size = min_t(unsigned long, blocksize, max_size);
6826 ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
6827 max_size);
6828
6829 /*
6830 * The decompression code contains a memset to fill in any space between the end
6831 * of the uncompressed data and the end of max_size in case the decompressed
6832 * data ends up shorter than ram_bytes. That doesn't cover the hole between
6833 * the end of an inline extent and the beginning of the next block, so we
6834 * cover that region here.
6835 */
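	/*
	 * Example with illustrative numbers: for a 4K block size and an inline
	 * extent whose ram_bytes is 300, btrfs_decompress() fills bytes
	 * [0, 300) (max_size is clamped to 300 above), and the
	 * folio_zero_range() below clears bytes [300, 4096) so the rest of the
	 * block reads back as zeroes.
	 */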
6836
6837 if (max_size < blocksize)
6838 folio_zero_range(folio, max_size, blocksize - max_size);
6839 kfree(tmp);
6840 return ret;
6841 }
6842
6843 static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
6844 {
6845 const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
6846 struct btrfs_file_extent_item *fi;
6847 void *kaddr;
6848 size_t copy_size;
6849
6850 if (!folio || folio_test_uptodate(folio))
6851 return 0;
6852
6853 ASSERT(folio_pos(folio) == 0);
6854
6855 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6856 struct btrfs_file_extent_item);
6857 if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6858 return uncompress_inline(path, folio, fi);
6859
6860 copy_size = min_t(u64, blocksize,
6861 btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6862 kaddr = kmap_local_folio(folio, 0);
6863 read_extent_buffer(path->nodes[0], kaddr,
6864 btrfs_file_extent_inline_start(fi), copy_size);
6865 kunmap_local(kaddr);
6866 if (copy_size < blocksize)
6867 folio_zero_range(folio, copy_size, blocksize - copy_size);
6868 return 0;
6869 }
6870
6871 /*
6872 * Lookup the first extent overlapping a range in a file.
6873 *
6874 * @inode: file to search in
6875 * @folio: folio to read extent data into if the extent is inline
6876 * @start: file offset
6877 * @len: length of range starting at @start
6878 *
6879 * Return the first &struct extent_map which overlaps the given range, reading
6880 * it from the B-tree and caching it if necessary. Note that there may be more
6881 * extents which overlap the given range after the returned extent_map.
6882 *
6883 * If @folio is not NULL and the extent is inline, this also reads the extent
6884 * data directly into the folio and marks the extent up to date in the io_tree.
6885 *
6886 * Return: ERR_PTR on error, non-NULL extent_map on success.
6887 */
6888 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6889 struct folio *folio, u64 start, u64 len)
6890 {
6891 struct btrfs_fs_info *fs_info = inode->root->fs_info;
6892 int ret = 0;
6893 u64 extent_start = 0;
6894 u64 extent_end = 0;
6895 u64 objectid = btrfs_ino(inode);
6896 int extent_type = -1;
6897 struct btrfs_path *path = NULL;
6898 struct btrfs_root *root = inode->root;
6899 struct btrfs_file_extent_item *item;
6900 struct extent_buffer *leaf;
6901 struct btrfs_key found_key;
6902 struct extent_map *em = NULL;
6903 struct extent_map_tree *em_tree = &inode->extent_tree;
6904
6905 read_lock(&em_tree->lock);
6906 em = lookup_extent_mapping(em_tree, start, len);
6907 read_unlock(&em_tree->lock);
6908
6909 if (em) {
6910 if (em->start > start || em->start + em->len <= start)
6911 free_extent_map(em);
6912 else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
6913 free_extent_map(em);
6914 else
6915 goto out;
6916 }
6917 em = alloc_extent_map();
6918 if (!em) {
6919 ret = -ENOMEM;
6920 goto out;
6921 }
6922 em->start = EXTENT_MAP_HOLE;
6923 em->disk_bytenr = EXTENT_MAP_HOLE;
6924 em->len = (u64)-1;
6925
6926 path = btrfs_alloc_path();
6927 if (!path) {
6928 ret = -ENOMEM;
6929 goto out;
6930 }
6931
6932 /* Chances are we'll be called again, so go ahead and do readahead */
6933 path->reada = READA_FORWARD;
6934
6935 /*
6936 * The same explanation in load_free_space_cache applies here as well,
6937 * we only read when we're loading the free space cache, and at that
6938 * point the commit_root has everything we need.
6939 */
6940 if (btrfs_is_free_space_inode(inode)) {
6941 path->search_commit_root = 1;
6942 path->skip_locking = 1;
6943 }
6944
6945 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6946 if (ret < 0) {
6947 goto out;
6948 } else if (ret > 0) {
6949 if (path->slots[0] == 0)
6950 goto not_found;
6951 path->slots[0]--;
6952 ret = 0;
6953 }
6954
6955 leaf = path->nodes[0];
6956 item = btrfs_item_ptr(leaf, path->slots[0],
6957 struct btrfs_file_extent_item);
6958 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6959 if (found_key.objectid != objectid ||
6960 found_key.type != BTRFS_EXTENT_DATA_KEY) {
6961 /*
6962 * If we backed up past the first extent we want to move forward
6963 * and see if there is an extent in front of us, otherwise we'll
6964 * say there is a hole for our whole search range which can
6965 * cause problems.
6966 */
6967 extent_end = start;
6968 goto next;
6969 }
6970
6971 extent_type = btrfs_file_extent_type(leaf, item);
6972 extent_start = found_key.offset;
6973 extent_end = btrfs_file_extent_end(path);
6974 if (extent_type == BTRFS_FILE_EXTENT_REG ||
6975 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6976 /* Only a regular file can have regular/prealloc extents. */
6977 if (!S_ISREG(inode->vfs_inode.i_mode)) {
6978 ret = -EUCLEAN;
6979 btrfs_crit(fs_info,
6980 "regular/prealloc extent found for non-regular inode %llu",
6981 btrfs_ino(inode));
6982 goto out;
6983 }
6984 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6985 extent_start);
6986 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6987 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6988 path->slots[0],
6989 extent_start);
6990 }
6991 next:
6992 if (start >= extent_end) {
6993 path->slots[0]++;
6994 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6995 ret = btrfs_next_leaf(root, path);
6996 if (ret < 0)
6997 goto out;
6998 else if (ret > 0)
6999 goto not_found;
7000
7001 leaf = path->nodes[0];
7002 }
7003 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7004 if (found_key.objectid != objectid ||
7005 found_key.type != BTRFS_EXTENT_DATA_KEY)
7006 goto not_found;
7007 if (start + len <= found_key.offset)
7008 goto not_found;
7009 if (start > found_key.offset)
7010 goto next;
7011
7012 /* New extent overlaps with existing one */
7013 em->start = start;
7014 em->len = found_key.offset - start;
7015 em->disk_bytenr = EXTENT_MAP_HOLE;
7016 goto insert;
7017 }
7018
7019 btrfs_extent_item_to_extent_map(inode, path, item, em);
7020
7021 if (extent_type == BTRFS_FILE_EXTENT_REG ||
7022 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7023 goto insert;
7024 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7025 /*
7026 * Inline extent can only exist at file offset 0. This is
7027 * ensured by tree-checker and inline extent creation path.
7028 * Thus all members representing file offsets should be zero.
7029 */
7030 ASSERT(extent_start == 0);
7031 ASSERT(em->start == 0);
7032
7033 /*
7034 * btrfs_extent_item_to_extent_map() should have properly
7035 * initialized em members already.
7036 *
7037 * Other members are not utilized for inline extents.
7038 */
7039 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
7040 ASSERT(em->len == fs_info->sectorsize);
7041
7042 ret = read_inline_extent(path, folio);
7043 if (ret < 0)
7044 goto out;
7045 goto insert;
7046 }
7047 not_found:
7048 em->start = start;
7049 em->len = len;
7050 em->disk_bytenr = EXTENT_MAP_HOLE;
7051 insert:
7052 ret = 0;
7053 btrfs_release_path(path);
7054 if (em->start > start || extent_map_end(em) <= start) {
7055 btrfs_err(fs_info,
7056 "bad extent! em: [%llu %llu] passed [%llu %llu]",
7057 em->start, em->len, start, len);
7058 ret = -EIO;
7059 goto out;
7060 }
7061
7062 write_lock(&em_tree->lock);
7063 ret = btrfs_add_extent_mapping(inode, &em, start, len);
7064 write_unlock(&em_tree->lock);
7065 out:
7066 btrfs_free_path(path);
7067
7068 trace_btrfs_get_extent(root, inode, em);
7069
7070 if (ret) {
7071 free_extent_map(em);
7072 return ERR_PTR(ret);
7073 }
7074 return em;
7075 }
7076
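/*
 * Illustrative sketch only (hypothetical helper, not part of btrfs): walk
 * every extent map overlapping [start, start + len) using btrfs_get_extent()
 * above. The returned maps may describe holes (disk_bytenr == EXTENT_MAP_HOLE)
 * and each one must be dropped with free_extent_map().
 */
static inline int example_walk_file_extents(struct btrfs_inode *inode,
					    u64 start, u64 len)
{
	u64 cur = start;

	while (cur < start + len) {
		struct extent_map *em;

		em = btrfs_get_extent(inode, NULL, cur, start + len - cur);
		if (IS_ERR(em))
			return PTR_ERR(em);
		/* em covers [em->start, extent_map_end(em)), possibly a hole. */
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	return 0;
}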
7077 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7078 {
7079 struct btrfs_block_group *block_group;
7080 bool readonly = false;
7081
7082 block_group = btrfs_lookup_block_group(fs_info, bytenr);
7083 if (!block_group || block_group->ro)
7084 readonly = true;
7085 if (block_group)
7086 btrfs_put_block_group(block_group);
7087 return readonly;
7088 }
7089
7090 /*
7091 * Check if we can do nocow write into the range [@offset, @offset + @len)
7092 *
7093 * @offset: File offset
7094 * @len: The length to write, will be updated to the nocow writeable
7095 * range
7096 * @file_extent: (optional) Return the file extent found, including its
7097 * disk bytenr, offset and number of bytes
7098 * @nowait: Whether the search must not block (fail instead of waiting on locks)
7099 *
7100 * Return:
7101 * >0 and update @len if we can do nocow write
7102 * 0 if we can't do nocow write
7103 * <0 if error happened
7104 *
7105 * NOTE: This only checks the file extents, caller is responsible to wait for
7106 * any ordered extents.
7107 */
7108 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
7109 struct btrfs_file_extent *file_extent,
7110 bool nowait)
7111 {
7112 struct btrfs_root *root = inode->root;
7113 struct btrfs_fs_info *fs_info = root->fs_info;
7114 struct can_nocow_file_extent_args nocow_args = { 0 };
7115 struct btrfs_path *path;
7116 int ret;
7117 struct extent_buffer *leaf;
7118 struct extent_io_tree *io_tree = &inode->io_tree;
7119 struct btrfs_file_extent_item *fi;
7120 struct btrfs_key key;
7121 int found_type;
7122
7123 path = btrfs_alloc_path();
7124 if (!path)
7125 return -ENOMEM;
7126 path->nowait = nowait;
7127
7128 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7129 offset, 0);
7130 if (ret < 0)
7131 goto out;
7132
7133 if (ret == 1) {
7134 if (path->slots[0] == 0) {
7135 /* can't find the item, must cow */
7136 ret = 0;
7137 goto out;
7138 }
7139 path->slots[0]--;
7140 }
7141 ret = 0;
7142 leaf = path->nodes[0];
7143 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7144 if (key.objectid != btrfs_ino(inode) ||
7145 key.type != BTRFS_EXTENT_DATA_KEY) {
7146 /* not our file or wrong item type, must cow */
7147 goto out;
7148 }
7149
7150 if (key.offset > offset) {
7151 /* Wrong offset, must cow */
7152 goto out;
7153 }
7154
7155 if (btrfs_file_extent_end(path) <= offset)
7156 goto out;
7157
7158 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7159 found_type = btrfs_file_extent_type(leaf, fi);
7160
7161 nocow_args.start = offset;
7162 nocow_args.end = offset + *len - 1;
7163 nocow_args.free_path = true;
7164
7165 ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
7166 /* can_nocow_file_extent() has freed the path. */
7167 path = NULL;
7168
7169 if (ret != 1) {
7170 /* Treat errors as not being able to NOCOW. */
7171 ret = 0;
7172 goto out;
7173 }
7174
7175 ret = 0;
7176 if (btrfs_extent_readonly(fs_info,
7177 nocow_args.file_extent.disk_bytenr +
7178 nocow_args.file_extent.offset))
7179 goto out;
7180
7181 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
7182 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7183 u64 range_end;
7184
7185 range_end = round_up(offset + nocow_args.file_extent.num_bytes,
7186 root->fs_info->sectorsize) - 1;
7187 ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
7188 if (ret) {
7189 ret = -EAGAIN;
7190 goto out;
7191 }
7192 }
7193
7194 if (file_extent)
7195 memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
7196
7197 *len = nocow_args.file_extent.num_bytes;
7198 ret = 1;
7199 out:
7200 btrfs_free_path(path);
7201 return ret;
7202 }
7203
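/*
 * Illustrative sketch only (hypothetical helper, not part of btrfs): ask
 * whether a whole write range can be done NOCOW. Per the comment above
 * can_nocow_extent(), the caller must already have waited for any ordered
 * extents in the range.
 */
static inline bool example_range_is_nocow(struct btrfs_inode *inode,
					  u64 pos, u64 count)
{
	struct btrfs_file_extent fext;
	u64 len = count;
	int ret;

	ret = can_nocow_extent(inode, pos, &len, &fext, false);
	/* ret > 0 means the first 'len' bytes can be written NOCOW. */
	return ret > 0 && len >= count;
}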
7204 /* The callers of this must take lock_extent() */
7205 struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
7206 const struct btrfs_file_extent *file_extent,
7207 int type)
7208 {
7209 struct extent_map *em;
7210 int ret;
7211
7212 /*
7213 * Note the missing NOCOW type.
7214 *
7215 * For pure NOCOW writes, we should not create an io extent map, but
7216 * just reuse the existing one.
7217 * Only PREALLOC writes (NOCOW write into preallocated range) can
7218 * create an io extent map.
7219 */
7220 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7221 type == BTRFS_ORDERED_COMPRESSED ||
7222 type == BTRFS_ORDERED_REGULAR);
7223
7224 switch (type) {
7225 case BTRFS_ORDERED_PREALLOC:
7226 /* We're only referring to part of a larger preallocated extent. */
7227 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7228 break;
7229 case BTRFS_ORDERED_REGULAR:
7230 /* COW results in a new extent matching our file extent size. */
7231 ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
7232 ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
7233
7234 /* Since it's a new extent, we should not have any offset. */
7235 ASSERT(file_extent->offset == 0);
7236 break;
7237 case BTRFS_ORDERED_COMPRESSED:
7238 /* Must be compressed. */
7239 ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
7240
7241 /*
7242 * An encoded write can make us refer to part of the
7243 * uncompressed extent.
7244 */
7245 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7246 break;
7247 }
7248
7249 em = alloc_extent_map();
7250 if (!em)
7251 return ERR_PTR(-ENOMEM);
7252
7253 em->start = start;
7254 em->len = file_extent->num_bytes;
7255 em->disk_bytenr = file_extent->disk_bytenr;
7256 em->disk_num_bytes = file_extent->disk_num_bytes;
7257 em->ram_bytes = file_extent->ram_bytes;
7258 em->generation = -1;
7259 em->offset = file_extent->offset;
7260 em->flags |= EXTENT_FLAG_PINNED;
7261 if (type == BTRFS_ORDERED_COMPRESSED)
7262 extent_map_set_compression(em, file_extent->compression);
7263
7264 ret = btrfs_replace_extent_map_range(inode, em, true);
7265 if (ret) {
7266 free_extent_map(em);
7267 return ERR_PTR(ret);
7268 }
7269
7270 /* The em now holds 2 refs, callers need to call free_extent_map() once. */
7271 return em;
7272 }
7273
7274 /*
7275 * For release_folio() and invalidate_folio() we have a race window where
7276 * folio_end_writeback() is called but the subpage spinlock is not yet released.
7277 * If we continue to release/invalidate the page, we could cause use-after-free
7278 * for subpage spinlock. So this function is to spin and wait for subpage
7279 * spinlock.
7280 */
7281 static void wait_subpage_spinlock(struct folio *folio)
7282 {
7283 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
7284 struct btrfs_subpage *subpage;
7285
7286 if (!btrfs_is_subpage(fs_info, folio))
7287 return;
7288
7289 ASSERT(folio_test_private(folio) && folio_get_private(folio));
7290 subpage = folio_get_private(folio);
7291
7292 /*
7293 * This may look insane as we just acquire the spinlock and release it,
7294 * without doing anything. But we just want to make sure no one is
7295 * still holding the subpage spinlock.
7296 * And since the folio is neither dirty nor under writeback, and we hold
7297 * the folio lock, the only remaining holder of the spinlock can be the
7298 * endio function clearing the folio writeback bit.
7299 *
7300 * Here we just acquire the spinlock so that all existing callers
7301 * should exit and we're safe to release/invalidate the page.
7302 */
7303 spin_lock_irq(&subpage->lock);
7304 spin_unlock_irq(&subpage->lock);
7305 }
7306
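/*
 * Generic illustration of the "drain the lock" idiom used above (sketch only,
 * hypothetical helper, not btrfs code): acquiring and immediately releasing a
 * spinlock guarantees that every critical section which started before us has
 * finished, without needing the protected data ourselves.
 */
static inline void example_drain_spinlock(spinlock_t *lock)
{
	spin_lock(lock);	/* Wait for any current holder to exit. */
	spin_unlock(lock);	/* We never touch the protected state. */
}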
7307 static int btrfs_launder_folio(struct folio *folio)
7308 {
7309 return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
7310 folio_size(folio), NULL);
7311 }
7312
7313 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7314 {
7315 if (try_release_extent_mapping(folio, gfp_flags)) {
7316 wait_subpage_spinlock(folio);
7317 clear_folio_extent_mapped(folio);
7318 return true;
7319 }
7320 return false;
7321 }
7322
7323 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7324 {
7325 if (folio_test_writeback(folio) || folio_test_dirty(folio))
7326 return false;
7327 return __btrfs_release_folio(folio, gfp_flags);
7328 }
7329
7330 #ifdef CONFIG_MIGRATION
7331 static int btrfs_migrate_folio(struct address_space *mapping,
7332 struct folio *dst, struct folio *src,
7333 enum migrate_mode mode)
7334 {
7335 int ret = filemap_migrate_folio(mapping, dst, src, mode);
7336
7337 if (ret != MIGRATEPAGE_SUCCESS)
7338 return ret;
7339
7340 if (folio_test_ordered(src)) {
7341 folio_clear_ordered(src);
7342 folio_set_ordered(dst);
7343 }
7344
7345 return MIGRATEPAGE_SUCCESS;
7346 }
7347 #else
7348 #define btrfs_migrate_folio NULL
7349 #endif
7350
7351 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7352 size_t length)
7353 {
7354 struct btrfs_inode *inode = folio_to_inode(folio);
7355 struct btrfs_fs_info *fs_info = inode->root->fs_info;
7356 struct extent_io_tree *tree = &inode->io_tree;
7357 struct extent_state *cached_state = NULL;
7358 u64 page_start = folio_pos(folio);
7359 u64 page_end = page_start + folio_size(folio) - 1;
7360 u64 cur;
7361 int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7362
7363 /*
7364 * We have the folio locked so no new ordered extent can be created on
7365 * it, nor can any bio be submitted for this folio.
7366 *
7367 * But an already submitted bio can still be finished on this folio.
7368 * Furthermore, the endio function won't skip a folio that has Ordered
7369 * already cleared, so it's possible for endio and
7370 * invalidate_folio to do the same ordered extent accounting twice
7371 * on one folio.
7372 *
7373 * So here we wait for any submitted bios to finish, so that we won't
7374 * do double ordered extent accounting on the same folio.
7375 */
7376 folio_wait_writeback(folio);
7377 wait_subpage_spinlock(folio);
7378
7379 /*
7380 * For subpage case, we have call sites like
7381 * btrfs_punch_hole_lock_range() which passes range not aligned to
7382 * sectorsize.
7383 * If the range doesn't cover the full folio, we don't need to and
7384 * shouldn't clear page extent mapped, as folio->private can still
7385 * record subpage dirty bits for other part of the range.
7386 *
7387 * For cases that invalidate the full folio even the range doesn't
7388 * cover the full folio, like invalidating the last folio, we're
7389 * still safe to wait for ordered extent to finish.
7390 */
7391 if (!(offset == 0 && length == folio_size(folio))) {
7392 btrfs_release_folio(folio, GFP_NOFS);
7393 return;
7394 }
7395
7396 if (!inode_evicting)
7397 lock_extent(tree, page_start, page_end, &cached_state);
7398
7399 cur = page_start;
7400 while (cur < page_end) {
7401 struct btrfs_ordered_extent *ordered;
7402 u64 range_end;
7403 u32 range_len;
7404 u32 extra_flags = 0;
7405
7406 ordered = btrfs_lookup_first_ordered_range(inode, cur,
7407 page_end + 1 - cur);
7408 if (!ordered) {
7409 range_end = page_end;
7410 /*
7411 * No ordered extent covering this range, we are safe
7412 * to delete all extent states in the range.
7413 */
7414 extra_flags = EXTENT_CLEAR_ALL_BITS;
7415 goto next;
7416 }
7417 if (ordered->file_offset > cur) {
7418 /*
7419 * There is a range between [cur, oe->file_offset) not
7420 * covered by any ordered extent.
7421 * We are safe to delete all extent states, and handle
7422 * the ordered extent in the next iteration.
7423 */
7424 range_end = ordered->file_offset - 1;
7425 extra_flags = EXTENT_CLEAR_ALL_BITS;
7426 goto next;
7427 }
7428
7429 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
7430 page_end);
7431 ASSERT(range_end + 1 - cur < U32_MAX);
7432 range_len = range_end + 1 - cur;
7433 if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
7434 /*
7435 * If Ordered is cleared, it means endio has
7436 * already been executed for the range.
7437 * We can't delete the extent states as
7438 * btrfs_finish_ordered_io() may still use some of them.
7439 */
7440 goto next;
7441 }
7442 btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
7443
7444 /*
7445 * IO on this page will never be started, so we need to account
7446 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
7447 * here, must leave that up for the ordered extent completion.
7448 *
7449 * This will also unlock the range for incoming
7450 * btrfs_finish_ordered_io().
7451 */
7452 if (!inode_evicting)
7453 clear_extent_bit(tree, cur, range_end,
7454 EXTENT_DELALLOC |
7455 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7456 EXTENT_DEFRAG, &cached_state);
7457
7458 spin_lock_irq(&inode->ordered_tree_lock);
7459 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7460 ordered->truncated_len = min(ordered->truncated_len,
7461 cur - ordered->file_offset);
7462 spin_unlock_irq(&inode->ordered_tree_lock);
7463
7464 /*
7465 * If the ordered extent has finished, we're safe to delete all
7466 * the extent states of the range, otherwise
7467 * btrfs_finish_ordered_io() will get executed by endio for
7468 * other pages, so we can't delete extent states.
7469 */
7470 if (btrfs_dec_test_ordered_pending(inode, &ordered,
7471 cur, range_end + 1 - cur)) {
7472 btrfs_finish_ordered_io(ordered);
7473 /*
7474 * The ordered extent has finished, now we're again
7475 * safe to delete all extent states of the range.
7476 */
7477 extra_flags = EXTENT_CLEAR_ALL_BITS;
7478 }
7479 next:
7480 if (ordered)
7481 btrfs_put_ordered_extent(ordered);
7482 /*
7483 * Qgroup reserved space handler
7484 * Sector(s) here will be either:
7485 *
7486 * 1) Already written to disk or bio already finished
7487 * Then its QGROUP_RESERVED bit in io_tree is already cleared.
7488 * Qgroup will be handled by its qgroup_record then.
7489 * btrfs_qgroup_free_data() call will do nothing here.
7490 *
7491 * 2) Not written to disk yet
7492 * Then btrfs_qgroup_free_data() call will clear the
7493 * QGROUP_RESERVED bit of its io_tree, and free the qgroup
7494 * reserved data space.
7495 * Since the IO will never happen for this page.
7496 */
7497 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
7498 if (!inode_evicting) {
7499 clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
7500 EXTENT_DELALLOC | EXTENT_UPTODATE |
7501 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
7502 extra_flags, &cached_state);
7503 }
7504 cur = range_end + 1;
7505 }
7506 /*
7507 * We have iterated through all ordered extents of the page, the page
7508 * should not have Ordered anymore, or the above iteration
7509 * did something wrong.
7510 */
7511 ASSERT(!folio_test_ordered(folio));
7512 btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
7513 if (!inode_evicting)
7514 __btrfs_release_folio(folio, GFP_NOFS);
7515 clear_folio_extent_mapped(folio);
7516 }
7517
7518 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
7519 {
7520 struct btrfs_truncate_control control = {
7521 .inode = inode,
7522 .ino = btrfs_ino(inode),
7523 .min_type = BTRFS_EXTENT_DATA_KEY,
7524 .clear_extent_range = true,
7525 };
7526 struct btrfs_root *root = inode->root;
7527 struct btrfs_fs_info *fs_info = root->fs_info;
7528 struct btrfs_block_rsv *rsv;
7529 int ret;
7530 struct btrfs_trans_handle *trans;
7531 u64 mask = fs_info->sectorsize - 1;
7532 const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
7533
7534 if (!skip_writeback) {
7535 ret = btrfs_wait_ordered_range(inode,
7536 inode->vfs_inode.i_size & (~mask),
7537 (u64)-1);
7538 if (ret)
7539 return ret;
7540 }
7541
7542 /*
7543 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
7544 * things going on here:
7545 *
7546 * 1) We need to reserve space to update our inode.
7547 *
7548 * 2) We need to have something to cache all the space that is going to
7549 * be free'd up by the truncate operation, but also have some slack
7550 * space reserved in case it uses space during the truncate (thank you
7551 * very much snapshotting).
7552 *
7553 * And we need these to be separate. The fact is we can use a lot of
7554 * space doing the truncate, and we have no earthly idea how much space
7555 * we will use, so we need the truncate reservation to be separate so it
7556 * doesn't end up using space reserved for updating the inode. We also
7557 * need to be able to stop the transaction and start a new one, which
7558 * means we need to be able to update the inode several times, and we
7559 * have no way of knowing how many times that will be, so we can't just
7560 * reserve 1 item for the entirety of the operation, so that has to be
7561 * done separately as well.
7562 *
7563 * So that leaves us with
7564 *
7565 * 1) rsv - for the truncate reservation, which we will steal from the
7566 * transaction reservation.
7567 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
7568 * updating the inode.
7569 */
7570 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
7571 if (!rsv)
7572 return -ENOMEM;
7573 rsv->size = min_size;
7574 rsv->failfast = true;
7575
7576 /*
7577 * 1 for the truncate slack space
7578 * 1 for updating the inode.
7579 */
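	/*
	 * In other words: the transaction below reserves 2 metadata units; one
	 * of them is migrated into 'rsv' (the truncate slack) right after the
	 * transaction starts, leaving trans_block_rsv with the single unit
	 * needed for the final inode update.
	 */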
7580 trans = btrfs_start_transaction(root, 2);
7581 if (IS_ERR(trans)) {
7582 ret = PTR_ERR(trans);
7583 goto out;
7584 }
7585
7586 /* Migrate the slack space for the truncate to our reserve */
7587 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
7588 min_size, false);
7589 /*
7590 * We have reserved 2 metadata units when we started the transaction and
7591 * min_size matches 1 unit, so this should never fail, but if it does,
7592 * it's not critical and we just fail the truncation.
7593 */
7594 if (WARN_ON(ret)) {
7595 btrfs_end_transaction(trans);
7596 goto out;
7597 }
7598
7599 trans->block_rsv = rsv;
7600
7601 while (1) {
7602 struct extent_state *cached_state = NULL;
7603 const u64 new_size = inode->vfs_inode.i_size;
7604 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
7605
7606 control.new_size = new_size;
7607 lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7608 /*
7609 * We want to drop from the next block forward in case this new
7610 * size is not block aligned since we will be keeping the last
7611 * block of the extent just the way it is.
7612 */
7613 btrfs_drop_extent_map_range(inode,
7614 ALIGN(new_size, fs_info->sectorsize),
7615 (u64)-1, false);
7616
7617 ret = btrfs_truncate_inode_items(trans, root, &control);
7618
7619 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
7620 btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
7621
7622 unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7623
7624 trans->block_rsv = &fs_info->trans_block_rsv;
7625 if (ret != -ENOSPC && ret != -EAGAIN)
7626 break;
7627
7628 ret = btrfs_update_inode(trans, inode);
7629 if (ret)
7630 break;
7631
7632 btrfs_end_transaction(trans);
7633 btrfs_btree_balance_dirty(fs_info);
7634
7635 trans = btrfs_start_transaction(root, 2);
7636 if (IS_ERR(trans)) {
7637 ret = PTR_ERR(trans);
7638 trans = NULL;
7639 break;
7640 }
7641
7642 btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
7643 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
7644 rsv, min_size, false);
7645 /*
7646 * We have reserved 2 metadata units when we started the
7647 * transaction and min_size matches 1 unit, so this should never
7648 * fail, but if it does, it's not critical and we just fail the truncation.
7649 */
7650 if (WARN_ON(ret))
7651 break;
7652
7653 trans->block_rsv = rsv;
7654 }
7655
7656 /*
7657 * We can't call btrfs_truncate_block inside a trans handle as we could
7658 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
7659 * know we've truncated everything except the last little bit, and can
7660 * do btrfs_truncate_block and then update the disk_i_size.
7661 */
7662 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
7663 btrfs_end_transaction(trans);
7664 btrfs_btree_balance_dirty(fs_info);
7665
7666 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
7667 if (ret)
7668 goto out;
7669 trans = btrfs_start_transaction(root, 1);
7670 if (IS_ERR(trans)) {
7671 ret = PTR_ERR(trans);
7672 goto out;
7673 }
7674 btrfs_inode_safe_disk_i_size_write(inode, 0);
7675 }
7676
7677 if (trans) {
7678 int ret2;
7679
7680 trans->block_rsv = &fs_info->trans_block_rsv;
7681 ret2 = btrfs_update_inode(trans, inode);
7682 if (ret2 && !ret)
7683 ret = ret2;
7684
7685 ret2 = btrfs_end_transaction(trans);
7686 if (ret2 && !ret)
7687 ret = ret2;
7688 btrfs_btree_balance_dirty(fs_info);
7689 }
7690 out:
7691 btrfs_free_block_rsv(fs_info, rsv);
7692 /*
7693 * So if we truncate and then write and fsync we normally would just
7694 * write the extents that changed, which is a problem if we need to
7695 * first truncate that entire inode. So set this flag so we write out
7696 * all of the extents in the inode to the sync log so we're completely
7697 * safe.
7698 *
7699 * If no extents were dropped or trimmed we don't need to force the next
7700 * fsync to truncate all the inode's items from the log and re-log them
7701 * all. This means the truncate operation did not change the file size,
7702 * or changed it to a smaller size but there was only an implicit hole
7703 * between the old i_size and the new i_size, and there were no prealloc
7704 * extents beyond i_size to drop.
7705 */
7706 if (control.extents_found > 0)
7707 btrfs_set_inode_full_sync(inode);
7708
7709 return ret;
7710 }
7711
7712 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
7713 struct inode *dir)
7714 {
7715 struct inode *inode;
7716
7717 inode = new_inode(dir->i_sb);
7718 if (inode) {
7719 /*
7720 * Subvolumes don't inherit the sgid bit or the parent's gid if
7721 * the parent's sgid bit is set. This is probably a bug.
7722 */
7723 inode_init_owner(idmap, inode, NULL,
7724 S_IFDIR | (~current_umask() & S_IRWXUGO));
7725 inode->i_op = &btrfs_dir_inode_operations;
7726 inode->i_fop = &btrfs_dir_file_operations;
7727 }
7728 return inode;
7729 }
7730
7731 struct inode *btrfs_alloc_inode(struct super_block *sb)
7732 {
7733 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
7734 struct btrfs_inode *ei;
7735 struct inode *inode;
7736
7737 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
7738 if (!ei)
7739 return NULL;
7740
7741 ei->root = NULL;
7742 ei->generation = 0;
7743 ei->last_trans = 0;
7744 ei->last_sub_trans = 0;
7745 ei->logged_trans = 0;
7746 ei->delalloc_bytes = 0;
7747 ei->new_delalloc_bytes = 0;
7748 ei->defrag_bytes = 0;
7749 ei->disk_i_size = 0;
7750 ei->flags = 0;
7751 ei->ro_flags = 0;
7752 /*
7753 * ->index_cnt will be properly initialized later when creating a new
7754 * inode (btrfs_create_new_inode()) or when reading an existing inode
7755 * from disk (btrfs_read_locked_inode()).
7756 */
7757 ei->csum_bytes = 0;
7758 ei->dir_index = 0;
7759 ei->last_unlink_trans = 0;
7760 ei->last_reflink_trans = 0;
7761 ei->last_log_commit = 0;
7762
7763 spin_lock_init(&ei->lock);
7764 ei->outstanding_extents = 0;
7765 if (sb->s_magic != BTRFS_TEST_MAGIC)
7766 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
7767 BTRFS_BLOCK_RSV_DELALLOC);
7768 ei->runtime_flags = 0;
7769 ei->prop_compress = BTRFS_COMPRESS_NONE;
7770 ei->defrag_compress = BTRFS_COMPRESS_NONE;
7771
7772 ei->delayed_node = NULL;
7773
7774 ei->i_otime_sec = 0;
7775 ei->i_otime_nsec = 0;
7776
7777 inode = &ei->vfs_inode;
7778 extent_map_tree_init(&ei->extent_tree);
7779
7780 /* This io tree sets the valid inode. */
7781 extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
7782 ei->io_tree.inode = ei;
7783
7784 ei->file_extent_tree = NULL;
7785
7786 mutex_init(&ei->log_mutex);
7787 spin_lock_init(&ei->ordered_tree_lock);
7788 ei->ordered_tree = RB_ROOT;
7789 ei->ordered_tree_last = NULL;
7790 INIT_LIST_HEAD(&ei->delalloc_inodes);
7791 INIT_LIST_HEAD(&ei->delayed_iput);
7792 init_rwsem(&ei->i_mmap_lock);
7793
7794 return inode;
7795 }
7796
7797 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7798 void btrfs_test_destroy_inode(struct inode *inode)
7799 {
7800 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
7801 kfree(BTRFS_I(inode)->file_extent_tree);
7802 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7803 }
7804 #endif
7805
7806 void btrfs_free_inode(struct inode *inode)
7807 {
7808 kfree(BTRFS_I(inode)->file_extent_tree);
7809 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7810 }
7811
7812 void btrfs_destroy_inode(struct inode *vfs_inode)
7813 {
7814 struct btrfs_ordered_extent *ordered;
7815 struct btrfs_inode *inode = BTRFS_I(vfs_inode);
7816 struct btrfs_root *root = inode->root;
7817 bool freespace_inode;
7818
7819 WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
7820 WARN_ON(vfs_inode->i_data.nrpages);
7821 WARN_ON(inode->block_rsv.reserved);
7822 WARN_ON(inode->block_rsv.size);
7823 WARN_ON(inode->outstanding_extents);
7824 if (!S_ISDIR(vfs_inode->i_mode)) {
7825 WARN_ON(inode->delalloc_bytes);
7826 WARN_ON(inode->new_delalloc_bytes);
7827 WARN_ON(inode->csum_bytes);
7828 }
7829 if (!root || !btrfs_is_data_reloc_root(root))
7830 WARN_ON(inode->defrag_bytes);
7831
7832 /*
7833 * This can happen where we create an inode, but somebody else also
7834 * created the same inode and we need to destroy the one we already
7835 * created.
7836 */
7837 if (!root)
7838 return;
7839
7840 /*
7841 * If this is a free space inode do not take the ordered extents lockdep
7842 * map.
7843 */
7844 freespace_inode = btrfs_is_free_space_inode(inode);
7845
7846 while (1) {
7847 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7848 if (!ordered)
7849 break;
7850 else {
7851 btrfs_err(root->fs_info,
7852 "found ordered extent %llu %llu on inode cleanup",
7853 ordered->file_offset, ordered->num_bytes);
7854
7855 if (!freespace_inode)
7856 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
7857
7858 btrfs_remove_ordered_extent(inode, ordered);
7859 btrfs_put_ordered_extent(ordered);
7860 btrfs_put_ordered_extent(ordered);
7861 }
7862 }
7863 btrfs_qgroup_check_reserved_leak(inode);
7864 btrfs_del_inode_from_root(inode);
7865 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
7866 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
7867 btrfs_put_root(inode->root);
7868 }
7869
7870 int btrfs_drop_inode(struct inode *inode)
7871 {
7872 struct btrfs_root *root = BTRFS_I(inode)->root;
7873
7874 if (root == NULL)
7875 return 1;
7876
7877 /* The snapshot/subvolume tree is being deleted. */
7878 if (btrfs_root_refs(&root->root_item) == 0)
7879 return 1;
7880 else
7881 return generic_drop_inode(inode);
7882 }
7883
7884 static void init_once(void *foo)
7885 {
7886 struct btrfs_inode *ei = foo;
7887
7888 inode_init_once(&ei->vfs_inode);
7889 }
7890
7891 void __cold btrfs_destroy_cachep(void)
7892 {
7893 /*
7894 * Make sure all delayed rcu free inodes are flushed before we
7895 * destroy cache.
7896 */
7897 rcu_barrier();
7898 kmem_cache_destroy(btrfs_inode_cachep);
7899 }
7900
7901 int __init btrfs_init_cachep(void)
7902 {
7903 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7904 sizeof(struct btrfs_inode), 0,
7905 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
7906 init_once);
7907 if (!btrfs_inode_cachep)
7908 return -ENOMEM;
7909
7910 return 0;
7911 }
7912
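/*
 * Note (general slab behaviour, not specific to btrfs): the init_once()
 * constructor above runs only when the slab allocator first constructs an
 * object, not on every btrfs_alloc_inode() call, so it must only initialize
 * state that persists across free/alloc cycles, such as the embedded VFS
 * inode.
 */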
7913 static int btrfs_getattr(struct mnt_idmap *idmap,
7914 const struct path *path, struct kstat *stat,
7915 u32 request_mask, unsigned int flags)
7916 {
7917 u64 delalloc_bytes;
7918 u64 inode_bytes;
7919 struct inode *inode = d_inode(path->dentry);
7920 u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
7921 u32 bi_flags = BTRFS_I(inode)->flags;
7922 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
7923
7924 stat->result_mask |= STATX_BTIME;
7925 stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
7926 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
7927 if (bi_flags & BTRFS_INODE_APPEND)
7928 stat->attributes |= STATX_ATTR_APPEND;
7929 if (bi_flags & BTRFS_INODE_COMPRESS)
7930 stat->attributes |= STATX_ATTR_COMPRESSED;
7931 if (bi_flags & BTRFS_INODE_IMMUTABLE)
7932 stat->attributes |= STATX_ATTR_IMMUTABLE;
7933 if (bi_flags & BTRFS_INODE_NODUMP)
7934 stat->attributes |= STATX_ATTR_NODUMP;
7935 if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
7936 stat->attributes |= STATX_ATTR_VERITY;
7937
7938 stat->attributes_mask |= (STATX_ATTR_APPEND |
7939 STATX_ATTR_COMPRESSED |
7940 STATX_ATTR_IMMUTABLE |
7941 STATX_ATTR_NODUMP);
7942
7943 generic_fillattr(idmap, request_mask, inode, stat);
7944 stat->dev = BTRFS_I(inode)->root->anon_dev;
7945
7946 stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
7947 stat->result_mask |= STATX_SUBVOL;
7948
7949 spin_lock(&BTRFS_I(inode)->lock);
7950 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
7951 inode_bytes = inode_get_bytes(inode);
7952 spin_unlock(&BTRFS_I(inode)->lock);
7953 stat->blocks = (ALIGN(inode_bytes, blocksize) +
7954 ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
7955 return 0;
7956 }
7957
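/*
 * Illustrative sketch only (hypothetical helper, not part of btrfs): the
 * stat->blocks value computed in btrfs_getattr() above rounds both the
 * on-disk byte count and the still-dirty delalloc byte count up to the block
 * size and reports the sum in 512-byte sectors.
 */
static inline u64 example_stat_blocks(u64 inode_bytes, u64 delalloc_bytes,
				      u32 blocksize)
{
	/* e.g. 10000 + 5000 bytes with 4K blocks -> (12288 + 8192) / 512 = 40 */
	return (ALIGN(inode_bytes, blocksize) +
		ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
}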
7958 static int btrfs_rename_exchange(struct inode *old_dir,
7959 struct dentry *old_dentry,
7960 struct inode *new_dir,
7961 struct dentry *new_dentry)
7962 {
7963 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
7964 struct btrfs_trans_handle *trans;
7965 unsigned int trans_num_items;
7966 struct btrfs_root *root = BTRFS_I(old_dir)->root;
7967 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7968 struct inode *new_inode = new_dentry->d_inode;
7969 struct inode *old_inode = old_dentry->d_inode;
7970 struct btrfs_rename_ctx old_rename_ctx;
7971 struct btrfs_rename_ctx new_rename_ctx;
7972 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
7973 u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
7974 u64 old_idx = 0;
7975 u64 new_idx = 0;
7976 int ret;
7977 int ret2;
7978 bool need_abort = false;
7979 struct fscrypt_name old_fname, new_fname;
7980 struct fscrypt_str *old_name, *new_name;
7981
7982 /*
7983 * For non-subvolumes allow exchange only within one subvolume, in the
7984 * same inode namespace. Two subvolumes (represented as directories) can
7985 * be exchanged as they're a logical link and have a fixed inode number.
7986 */
7987 if (root != dest &&
7988 (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
7989 new_ino != BTRFS_FIRST_FREE_OBJECTID))
7990 return -EXDEV;
7991
7992 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
7993 if (ret)
7994 return ret;
7995
7996 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
7997 if (ret) {
7998 fscrypt_free_filename(&old_fname);
7999 return ret;
8000 }
8001
8002 old_name = &old_fname.disk_name;
8003 new_name = &new_fname.disk_name;
8004
8005 /* close the race window with snapshot create/destroy ioctl */
8006 if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8007 new_ino == BTRFS_FIRST_FREE_OBJECTID)
8008 down_read(&fs_info->subvol_sem);
8009
8010 /*
8011 * For each inode:
8012 * 1 to remove old dir item
8013 * 1 to remove old dir index
8014 * 1 to add new dir item
8015 * 1 to add new dir index
8016 * 1 to update parent inode
8017 *
8018 * If the parents are the same, we only need to account for one
8019 */
8020 trans_num_items = (old_dir == new_dir ? 9 : 10);
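	/*
	 * Illustration: the per-inode items listed above are 4 dir entry
	 * updates (remove and add a dir item and a dir index) for each of the
	 * two inodes, i.e. 8, plus 1 parent inode update when both parents
	 * are the same directory or 2 when they differ, giving 9 or 10.
	 */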
8021 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8022 /*
8023 * 1 to remove old root ref
8024 * 1 to remove old root backref
8025 * 1 to add new root ref
8026 * 1 to add new root backref
8027 */
8028 trans_num_items += 4;
8029 } else {
8030 /*
8031 * 1 to update inode item
8032 * 1 to remove old inode ref
8033 * 1 to add new inode ref
8034 */
8035 trans_num_items += 3;
8036 }
8037 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8038 trans_num_items += 4;
8039 else
8040 trans_num_items += 3;
8041 trans = btrfs_start_transaction(root, trans_num_items);
8042 if (IS_ERR(trans)) {
8043 ret = PTR_ERR(trans);
8044 goto out_notrans;
8045 }
8046
8047 if (dest != root) {
8048 ret = btrfs_record_root_in_trans(trans, dest);
8049 if (ret)
8050 goto out_fail;
8051 }
8052
8053 /*
8054 * We need to find a free sequence number both in the source and
8055 * in the destination directory for the exchange.
8056 */
8057 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8058 if (ret)
8059 goto out_fail;
8060 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8061 if (ret)
8062 goto out_fail;
8063
8064 BTRFS_I(old_inode)->dir_index = 0ULL;
8065 BTRFS_I(new_inode)->dir_index = 0ULL;
8066
8067 /* Reference for the source. */
8068 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8069 /* force full log commit if subvolume involved. */
8070 btrfs_set_log_full_commit(trans);
8071 } else {
8072 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8073 btrfs_ino(BTRFS_I(new_dir)),
8074 old_idx);
8075 if (ret)
8076 goto out_fail;
8077 need_abort = true;
8078 }
8079
8080 /* And now for the dest. */
8081 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8082 /* force full log commit if subvolume involved. */
8083 btrfs_set_log_full_commit(trans);
8084 } else {
8085 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8086 btrfs_ino(BTRFS_I(old_dir)),
8087 new_idx);
8088 if (ret) {
8089 if (need_abort)
8090 btrfs_abort_transaction(trans, ret);
8091 goto out_fail;
8092 }
8093 }
8094
8095 /* Update inode version and ctime/mtime. */
8096 inode_inc_iversion(old_dir);
8097 inode_inc_iversion(new_dir);
8098 inode_inc_iversion(old_inode);
8099 inode_inc_iversion(new_inode);
8100 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8101
8102 if (old_dentry->d_parent != new_dentry->d_parent) {
8103 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8104 BTRFS_I(old_inode), true);
8105 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8106 BTRFS_I(new_inode), true);
8107 }
8108
8109 /* src is a subvolume */
8110 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8111 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8112 if (ret) {
8113 btrfs_abort_transaction(trans, ret);
8114 goto out_fail;
8115 }
8116 } else { /* src is an inode */
8117 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8118 BTRFS_I(old_dentry->d_inode),
8119 old_name, &old_rename_ctx);
8120 if (ret) {
8121 btrfs_abort_transaction(trans, ret);
8122 goto out_fail;
8123 }
8124 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8125 if (ret) {
8126 btrfs_abort_transaction(trans, ret);
8127 goto out_fail;
8128 }
8129 }
8130
8131 /* dest is a subvolume */
8132 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8133 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8134 if (ret) {
8135 btrfs_abort_transaction(trans, ret);
8136 goto out_fail;
8137 }
8138 } else { /* dest is an inode */
8139 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8140 BTRFS_I(new_dentry->d_inode),
8141 new_name, &new_rename_ctx);
8142 if (ret) {
8143 btrfs_abort_transaction(trans, ret);
8144 goto out_fail;
8145 }
8146 ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
8147 if (ret) {
8148 btrfs_abort_transaction(trans, ret);
8149 goto out_fail;
8150 }
8151 }
8152
8153 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8154 new_name, 0, old_idx);
8155 if (ret) {
8156 btrfs_abort_transaction(trans, ret);
8157 goto out_fail;
8158 }
8159
8160 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8161 old_name, 0, new_idx);
8162 if (ret) {
8163 btrfs_abort_transaction(trans, ret);
8164 goto out_fail;
8165 }
8166
8167 if (old_inode->i_nlink == 1)
8168 BTRFS_I(old_inode)->dir_index = old_idx;
8169 if (new_inode->i_nlink == 1)
8170 BTRFS_I(new_inode)->dir_index = new_idx;
8171
8172 /*
8173 * Now pin the logs of the roots. We do it to ensure that no other task
8174 * can sync the logs while we are in progress with the rename, because
8175 * that could result in an inconsistency in case any of the inodes that
8176 * are part of this rename operation were logged before.
8177 */
8178 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8179 btrfs_pin_log_trans(root);
8180 if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8181 btrfs_pin_log_trans(dest);
8182
8183 /* Do the log updates for all inodes. */
8184 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8185 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8186 old_rename_ctx.index, new_dentry->d_parent);
8187 if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8188 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8189 new_rename_ctx.index, old_dentry->d_parent);
8190
8191 /* Now unpin the logs. */
8192 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8193 btrfs_end_log_trans(root);
8194 if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8195 btrfs_end_log_trans(dest);
8196 out_fail:
8197 ret2 = btrfs_end_transaction(trans);
8198 ret = ret ? ret : ret2;
8199 out_notrans:
8200 if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8201 old_ino == BTRFS_FIRST_FREE_OBJECTID)
8202 up_read(&fs_info->subvol_sem);
8203
8204 fscrypt_free_filename(&new_fname);
8205 fscrypt_free_filename(&old_fname);
8206 return ret;
8207 }
8208
8209 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8210 struct inode *dir)
8211 {
8212 struct inode *inode;
8213
8214 inode = new_inode(dir->i_sb);
8215 if (inode) {
8216 inode_init_owner(idmap, inode, dir,
8217 S_IFCHR | WHITEOUT_MODE);
8218 inode->i_op = &btrfs_special_inode_operations;
8219 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8220 }
8221 return inode;
8222 }
8223
8224 static int btrfs_rename(struct mnt_idmap *idmap,
8225 struct inode *old_dir, struct dentry *old_dentry,
8226 struct inode *new_dir, struct dentry *new_dentry,
8227 unsigned int flags)
8228 {
8229 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8230 struct btrfs_new_inode_args whiteout_args = {
8231 .dir = old_dir,
8232 .dentry = old_dentry,
8233 };
8234 struct btrfs_trans_handle *trans;
8235 unsigned int trans_num_items;
8236 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8237 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8238 struct inode *new_inode = d_inode(new_dentry);
8239 struct inode *old_inode = d_inode(old_dentry);
8240 struct btrfs_rename_ctx rename_ctx;
8241 u64 index = 0;
8242 int ret;
8243 int ret2;
8244 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8245 struct fscrypt_name old_fname, new_fname;
8246
8247 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8248 return -EPERM;
8249
8250 /* we only allow rename subvolume link between subvolumes */
8251 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8252 return -EXDEV;
8253
8254 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8255 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
8256 return -ENOTEMPTY;
8257
8258 if (S_ISDIR(old_inode->i_mode) && new_inode &&
8259 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8260 return -ENOTEMPTY;
8261
8262 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8263 if (ret)
8264 return ret;
8265
8266 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8267 if (ret) {
8268 fscrypt_free_filename(&old_fname);
8269 return ret;
8270 }
8271
8272 /* check for collisions, even if the name isn't there */
8273 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
8274 if (ret) {
8275 if (ret == -EEXIST) {
8276 /* We shouldn't get -EEXIST without a new_inode. */
8278 if (WARN_ON(!new_inode)) {
8279 goto out_fscrypt_names;
8280 }
8281 } else {
8282 /* maybe -EOVERFLOW */
8283 goto out_fscrypt_names;
8284 }
8285 }
8286 ret = 0;
8287
8288 /*
8289 * we're using rename to replace one file with another. Start IO on it
8290 * now so we don't add too much work to the end of the transaction
8291 */
8292 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8293 filemap_flush(old_inode->i_mapping);
8294
8295 if (flags & RENAME_WHITEOUT) {
8296 whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
8297 if (!whiteout_args.inode) {
8298 ret = -ENOMEM;
8299 goto out_fscrypt_names;
8300 }
8301 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
8302 if (ret)
8303 goto out_whiteout_inode;
8304 } else {
8305 /* 1 to update the old parent inode. */
8306 trans_num_items = 1;
8307 }
8308
8309 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8310 /* Close the race window with snapshot create/destroy ioctl */
8311 down_read(&fs_info->subvol_sem);
8312 /*
8313 * 1 to remove old root ref
8314 * 1 to remove old root backref
8315 * 1 to add new root ref
8316 * 1 to add new root backref
8317 */
8318 trans_num_items += 4;
8319 } else {
8320 /*
8321 * 1 to update inode
8322 * 1 to remove old inode ref
8323 * 1 to add new inode ref
8324 */
8325 trans_num_items += 3;
8326 }
8327 /*
8328 * 1 to remove old dir item
8329 * 1 to remove old dir index
8330 * 1 to add new dir item
8331 * 1 to add new dir index
8332 */
8333 trans_num_items += 4;
8334 /* 1 to update new parent inode if it's not the same as the old parent */
8335 if (new_dir != old_dir)
8336 trans_num_items++;
8337 if (new_inode) {
8338 /*
8339 * 1 to update inode
8340 * 1 to remove inode ref
8341 * 1 to remove dir item
8342 * 1 to remove dir index
8343 * 1 to possibly add orphan item
8344 */
8345 trans_num_items += 5;
8346 }
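	/*
	 * Illustration: a plain same-directory rename of a regular file with
	 * no whiteout and no existing target therefore reserves 1 + 3 + 4 = 8
	 * units; every optional case above (subvolume source, different
	 * parents, existing target, RENAME_WHITEOUT) only adds to that
	 * baseline.
	 */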
8347 trans = btrfs_start_transaction(root, trans_num_items);
8348 if (IS_ERR(trans)) {
8349 ret = PTR_ERR(trans);
8350 goto out_notrans;
8351 }
8352
8353 if (dest != root) {
8354 ret = btrfs_record_root_in_trans(trans, dest);
8355 if (ret)
8356 goto out_fail;
8357 }
8358
8359 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
8360 if (ret)
8361 goto out_fail;
8362
8363 BTRFS_I(old_inode)->dir_index = 0ULL;
8364 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8365 /* force full log commit if subvolume involved. */
8366 btrfs_set_log_full_commit(trans);
8367 } else {
8368 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
8369 old_ino, btrfs_ino(BTRFS_I(new_dir)),
8370 index);
8371 if (ret)
8372 goto out_fail;
8373 }
8374
8375 inode_inc_iversion(old_dir);
8376 inode_inc_iversion(new_dir);
8377 inode_inc_iversion(old_inode);
8378 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8379
8380 if (old_dentry->d_parent != new_dentry->d_parent)
8381 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8382 BTRFS_I(old_inode), true);
8383
8384 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8385 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8386 if (ret) {
8387 btrfs_abort_transaction(trans, ret);
8388 goto out_fail;
8389 }
8390 } else {
8391 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8392 BTRFS_I(d_inode(old_dentry)),
8393 &old_fname.disk_name, &rename_ctx);
8394 if (ret) {
8395 btrfs_abort_transaction(trans, ret);
8396 goto out_fail;
8397 }
8398 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8399 if (ret) {
8400 btrfs_abort_transaction(trans, ret);
8401 goto out_fail;
8402 }
8403 }
8404
8405 if (new_inode) {
8406 inode_inc_iversion(new_inode);
8407 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
8408 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8409 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8410 if (ret) {
8411 btrfs_abort_transaction(trans, ret);
8412 goto out_fail;
8413 }
8414 BUG_ON(new_inode->i_nlink == 0);
8415 } else {
8416 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8417 BTRFS_I(d_inode(new_dentry)),
8418 &new_fname.disk_name);
8419 if (ret) {
8420 btrfs_abort_transaction(trans, ret);
8421 goto out_fail;
8422 }
8423 }
8424 if (new_inode->i_nlink == 0) {
8425 ret = btrfs_orphan_add(trans,
8426 BTRFS_I(d_inode(new_dentry)));
8427 if (ret) {
8428 btrfs_abort_transaction(trans, ret);
8429 goto out_fail;
8430 }
8431 }
8432 }
8433
8434 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8435 &new_fname.disk_name, 0, index);
8436 if (ret) {
8437 btrfs_abort_transaction(trans, ret);
8438 goto out_fail;
8439 }
8440
8441 if (old_inode->i_nlink == 1)
8442 BTRFS_I(old_inode)->dir_index = index;
8443
8444 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8445 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8446 rename_ctx.index, new_dentry->d_parent);
8447
8448 if (flags & RENAME_WHITEOUT) {
8449 ret = btrfs_create_new_inode(trans, &whiteout_args);
8450 if (ret) {
8451 btrfs_abort_transaction(trans, ret);
8452 goto out_fail;
8453 } else {
8454 unlock_new_inode(whiteout_args.inode);
8455 iput(whiteout_args.inode);
8456 whiteout_args.inode = NULL;
8457 }
8458 }
8459 out_fail:
8460 ret2 = btrfs_end_transaction(trans);
8461 ret = ret ? ret : ret2;
8462 out_notrans:
8463 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8464 up_read(&fs_info->subvol_sem);
8465 if (flags & RENAME_WHITEOUT)
8466 btrfs_new_inode_args_destroy(&whiteout_args);
8467 out_whiteout_inode:
8468 if (flags & RENAME_WHITEOUT)
8469 iput(whiteout_args.inode);
8470 out_fscrypt_names:
8471 fscrypt_free_filename(&old_fname);
8472 fscrypt_free_filename(&new_fname);
8473 return ret;
8474 }
8475
8476 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
8477 struct dentry *old_dentry, struct inode *new_dir,
8478 struct dentry *new_dentry, unsigned int flags)
8479 {
8480 int ret;
8481
8482 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
8483 return -EINVAL;
8484
8485 if (flags & RENAME_EXCHANGE)
8486 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
8487 new_dentry);
8488 else
8489 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
8490 new_dentry, flags);
8491
8492 btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
8493
8494 return ret;
8495 }
8496
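/*
 * One queued delalloc flush: the target inode, a completion for the
 * submitter to wait on, a list hook and the work item handed to the flush
 * workqueue.
 */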
8497 struct btrfs_delalloc_work {
8498 struct inode *inode;
8499 struct completion completion;
8500 struct list_head list;
8501 struct btrfs_work work;
8502 };
8503
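/*
 * Work function for a queued delalloc flush: write back the inode's dirty
 * pages, flush once more if async extents were started in the meantime,
 * then drop the inode reference and signal the completion.
 */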
8504 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8505 {
8506 struct btrfs_delalloc_work *delalloc_work;
8507 struct inode *inode;
8508
8509 delalloc_work = container_of(work, struct btrfs_delalloc_work,
8510 work);
8511 inode = delalloc_work->inode;
8512 filemap_flush(inode->i_mapping);
8513 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8514 &BTRFS_I(inode)->runtime_flags))
8515 filemap_flush(inode->i_mapping);
8516
8517 iput(inode);
8518 complete(&delalloc_work->completion);
8519 }
8520
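/* Allocate and initialize a delalloc work item for @inode; NULL on failure. */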
8521 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
8522 {
8523 struct btrfs_delalloc_work *work;
8524
8525 work = kmalloc(sizeof(*work), GFP_NOFS);
8526 if (!work)
8527 return NULL;
8528
8529 init_completion(&work->completion);
8530 INIT_LIST_HEAD(&work->list);
8531 work->inode = inode;
8532 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
8533
8534 return work;
8535 }
8536
8537 /*
8538  * Some fairly slow code that needs optimization. This walks the list
8539 * of all the inodes with pending delalloc and forces them to disk.
8540 */
8541 static int start_delalloc_inodes(struct btrfs_root *root,
8542 struct writeback_control *wbc, bool snapshot,
8543 bool in_reclaim_context)
8544 {
8545 struct btrfs_delalloc_work *work, *next;
8546 LIST_HEAD(works);
8547 LIST_HEAD(splice);
8548 int ret = 0;
8549 bool full_flush = wbc->nr_to_write == LONG_MAX;
8550
8551 mutex_lock(&root->delalloc_mutex);
8552 spin_lock(&root->delalloc_lock);
8553 list_splice_init(&root->delalloc_inodes, &splice);
8554 while (!list_empty(&splice)) {
8555 struct btrfs_inode *inode;
8556 struct inode *tmp_inode;
8557
8558 inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes);
8559
8560 list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
8561
8562 if (in_reclaim_context &&
8563 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
8564 continue;
8565
8566 tmp_inode = igrab(&inode->vfs_inode);
8567 if (!tmp_inode) {
8568 cond_resched_lock(&root->delalloc_lock);
8569 continue;
8570 }
8571 spin_unlock(&root->delalloc_lock);
8572
8573 if (snapshot)
8574 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
8575 if (full_flush) {
8576 work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
8577 if (!work) {
8578 iput(&inode->vfs_inode);
8579 ret = -ENOMEM;
8580 goto out;
8581 }
8582 list_add_tail(&work->list, &works);
8583 btrfs_queue_work(root->fs_info->flush_workers,
8584 &work->work);
8585 } else {
8586 ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
8587 btrfs_add_delayed_iput(inode);
8588 if (ret || wbc->nr_to_write <= 0)
8589 goto out;
8590 }
8591 cond_resched();
8592 spin_lock(&root->delalloc_lock);
8593 }
8594 spin_unlock(&root->delalloc_lock);
8595
8596 out:
8597 list_for_each_entry_safe(work, next, &works, list) {
8598 list_del_init(&work->list);
8599 wait_for_completion(&work->completion);
8600 kfree(work);
8601 }
8602
8603 if (!list_empty(&splice)) {
8604 spin_lock(&root->delalloc_lock);
8605 list_splice_tail(&splice, &root->delalloc_inodes);
8606 spin_unlock(&root->delalloc_lock);
8607 }
8608 mutex_unlock(&root->delalloc_mutex);
8609 return ret;
8610 }
8611
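/*
 * Flush all delalloc of @root with no page limit, marking each inode with
 * the snapshot-flush flag. Returns -EROFS if the filesystem is in an error
 * state.
 */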
8612 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
8613 {
8614 struct writeback_control wbc = {
8615 .nr_to_write = LONG_MAX,
8616 .sync_mode = WB_SYNC_NONE,
8617 .range_start = 0,
8618 .range_end = LLONG_MAX,
8619 };
8620 struct btrfs_fs_info *fs_info = root->fs_info;
8621
8622 if (BTRFS_FS_ERROR(fs_info))
8623 return -EROFS;
8624
8625 return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
8626 }
8627
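/*
 * Flush delalloc across all roots that currently have delalloc inodes,
 * writing back at most @nr pages in total (LONG_MAX means a full flush of
 * every root).
 */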
8628 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
8629 bool in_reclaim_context)
8630 {
8631 struct writeback_control wbc = {
8632 .nr_to_write = nr,
8633 .sync_mode = WB_SYNC_NONE,
8634 .range_start = 0,
8635 .range_end = LLONG_MAX,
8636 };
8637 struct btrfs_root *root;
8638 LIST_HEAD(splice);
8639 int ret;
8640
8641 if (BTRFS_FS_ERROR(fs_info))
8642 return -EROFS;
8643
8644 mutex_lock(&fs_info->delalloc_root_mutex);
8645 spin_lock(&fs_info->delalloc_root_lock);
8646 list_splice_init(&fs_info->delalloc_roots, &splice);
8647 while (!list_empty(&splice)) {
8648 /*
8649 * Reset nr_to_write here so we know that we're doing a full
8650 * flush.
8651 */
8652 if (nr == LONG_MAX)
8653 wbc.nr_to_write = LONG_MAX;
8654
8655 root = list_first_entry(&splice, struct btrfs_root,
8656 delalloc_root);
8657 root = btrfs_grab_root(root);
8658 BUG_ON(!root);
8659 list_move_tail(&root->delalloc_root,
8660 &fs_info->delalloc_roots);
8661 spin_unlock(&fs_info->delalloc_root_lock);
8662
8663 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
8664 btrfs_put_root(root);
8665 if (ret < 0 || wbc.nr_to_write <= 0)
8666 goto out;
8667 spin_lock(&fs_info->delalloc_root_lock);
8668 }
8669 spin_unlock(&fs_info->delalloc_root_lock);
8670
8671 ret = 0;
8672 out:
8673 if (!list_empty(&splice)) {
8674 spin_lock(&fs_info->delalloc_root_lock);
8675 list_splice_tail(&splice, &fs_info->delalloc_roots);
8676 spin_unlock(&fs_info->delalloc_root_lock);
8677 }
8678 mutex_unlock(&fs_info->delalloc_root_mutex);
8679 return ret;
8680 }
8681
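/*
 * Create a symlink. The target string is stored as an uncompressed inline
 * extent, so its length is limited by the inline data size and the sector
 * size checks below.
 */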
8682 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
8683 struct dentry *dentry, const char *symname)
8684 {
8685 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
8686 struct btrfs_trans_handle *trans;
8687 struct btrfs_root *root = BTRFS_I(dir)->root;
8688 struct btrfs_path *path;
8689 struct btrfs_key key;
8690 struct inode *inode;
8691 struct btrfs_new_inode_args new_inode_args = {
8692 .dir = dir,
8693 .dentry = dentry,
8694 };
8695 unsigned int trans_num_items;
8696 int err;
8697 int name_len;
8698 int datasize;
8699 unsigned long ptr;
8700 struct btrfs_file_extent_item *ei;
8701 struct extent_buffer *leaf;
8702
8703 name_len = strlen(symname);
8704 /*
8705 * Symlinks utilize uncompressed inline extent data, which should not
8706 * reach block size.
8707 */
8708 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
8709 name_len >= fs_info->sectorsize)
8710 return -ENAMETOOLONG;
8711
8712 inode = new_inode(dir->i_sb);
8713 if (!inode)
8714 return -ENOMEM;
8715 inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
8716 inode->i_op = &btrfs_symlink_inode_operations;
8717 inode_nohighmem(inode);
8718 inode->i_mapping->a_ops = &btrfs_aops;
8719 btrfs_i_size_write(BTRFS_I(inode), name_len);
8720 inode_set_bytes(inode, name_len);
8721
8722 new_inode_args.inode = inode;
8723 err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
8724 if (err)
8725 goto out_inode;
8726 /* 1 additional item for the inline extent */
8727 trans_num_items++;
8728
8729 trans = btrfs_start_transaction(root, trans_num_items);
8730 if (IS_ERR(trans)) {
8731 err = PTR_ERR(trans);
8732 goto out_new_inode_args;
8733 }
8734
8735 err = btrfs_create_new_inode(trans, &new_inode_args);
8736 if (err)
8737 goto out;
8738
8739 path = btrfs_alloc_path();
8740 if (!path) {
8741 err = -ENOMEM;
8742 btrfs_abort_transaction(trans, err);
8743 discard_new_inode(inode);
8744 inode = NULL;
8745 goto out;
8746 }
8747 key.objectid = btrfs_ino(BTRFS_I(inode));
8748 key.type = BTRFS_EXTENT_DATA_KEY;
8749 key.offset = 0;
8750 datasize = btrfs_file_extent_calc_inline_size(name_len);
8751 err = btrfs_insert_empty_item(trans, root, path, &key,
8752 datasize);
8753 if (err) {
8754 btrfs_abort_transaction(trans, err);
8755 btrfs_free_path(path);
8756 discard_new_inode(inode);
8757 inode = NULL;
8758 goto out;
8759 }
8760 leaf = path->nodes[0];
8761 ei = btrfs_item_ptr(leaf, path->slots[0],
8762 struct btrfs_file_extent_item);
8763 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8764 btrfs_set_file_extent_type(leaf, ei,
8765 BTRFS_FILE_EXTENT_INLINE);
8766 btrfs_set_file_extent_encryption(leaf, ei, 0);
8767 btrfs_set_file_extent_compression(leaf, ei, 0);
8768 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8769 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8770
8771 ptr = btrfs_file_extent_inline_start(ei);
8772 write_extent_buffer(leaf, symname, ptr, name_len);
8773 btrfs_free_path(path);
8774
8775 d_instantiate_new(dentry, inode);
8776 err = 0;
8777 out:
8778 btrfs_end_transaction(trans);
8779 btrfs_btree_balance_dirty(fs_info);
8780 out_new_inode_args:
8781 btrfs_new_inode_args_destroy(&new_inode_args);
8782 out_inode:
8783 if (err)
8784 iput(inode);
8785 return err;
8786 }
8787
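/*
 * Insert a PREALLOC file extent item for the reserved extent @ins at
 * @file_offset. With a transaction handle in @trans_in the item is inserted
 * directly; otherwise btrfs_replace_file_extents() is used, which starts its
 * own transaction. Returns the transaction handle to continue with, or an
 * ERR_PTR.
 */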
8788 static struct btrfs_trans_handle *insert_prealloc_file_extent(
8789 struct btrfs_trans_handle *trans_in,
8790 struct btrfs_inode *inode,
8791 struct btrfs_key *ins,
8792 u64 file_offset)
8793 {
8794 struct btrfs_file_extent_item stack_fi;
8795 struct btrfs_replace_extent_info extent_info;
8796 struct btrfs_trans_handle *trans = trans_in;
8797 struct btrfs_path *path;
8798 u64 start = ins->objectid;
8799 u64 len = ins->offset;
8800 u64 qgroup_released = 0;
8801 int ret;
8802
8803 memset(&stack_fi, 0, sizeof(stack_fi));
8804
8805 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
8806 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
8807 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
8808 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
8809 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
8810 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
8811 /* Encryption and other encoding is reserved and all 0 */
8812
8813 ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
8814 if (ret < 0)
8815 return ERR_PTR(ret);
8816
8817 if (trans) {
8818 ret = insert_reserved_file_extent(trans, inode,
8819 file_offset, &stack_fi,
8820 true, qgroup_released);
8821 if (ret)
8822 goto free_qgroup;
8823 return trans;
8824 }
8825
8826 extent_info.disk_offset = start;
8827 extent_info.disk_len = len;
8828 extent_info.data_offset = 0;
8829 extent_info.data_len = len;
8830 extent_info.file_offset = file_offset;
8831 extent_info.extent_buf = (char *)&stack_fi;
8832 extent_info.is_new_extent = true;
8833 extent_info.update_times = true;
8834 extent_info.qgroup_reserved = qgroup_released;
8835 extent_info.insertions = 0;
8836
8837 path = btrfs_alloc_path();
8838 if (!path) {
8839 ret = -ENOMEM;
8840 goto free_qgroup;
8841 }
8842
8843 ret = btrfs_replace_file_extents(inode, path, file_offset,
8844 file_offset + len - 1, &extent_info,
8845 &trans);
8846 btrfs_free_path(path);
8847 if (ret)
8848 goto free_qgroup;
8849 return trans;
8850
8851 free_qgroup:
8852 /*
8853 * We have released qgroup data range at the beginning of the function,
8854 * and normally qgroup_released bytes will be freed when committing
8855 * transaction.
8856 * But if we error out early, we have to free what we have released
8857 * or we leak qgroup data reservation.
8858 */
8859 btrfs_qgroup_free_refroot(inode->root->fs_info,
8860 btrfs_root_id(inode->root), qgroup_released,
8861 BTRFS_QGROUP_RSV_DATA);
8862 return ERR_PTR(ret);
8863 }
8864
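/*
 * Preallocate the range [@start, @start + @num_bytes) chunk by chunk: each
 * iteration reserves an extent, inserts a PREALLOC file extent item, updates
 * the extent maps and may grow i_size (unless FALLOC_FL_KEEP_SIZE is set).
 * Uses @trans when given, otherwise each chunk gets its own transaction.
 */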
8865 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8866 u64 start, u64 num_bytes, u64 min_size,
8867 loff_t actual_len, u64 *alloc_hint,
8868 struct btrfs_trans_handle *trans)
8869 {
8870 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
8871 struct extent_map *em;
8872 struct btrfs_root *root = BTRFS_I(inode)->root;
8873 struct btrfs_key ins;
8874 u64 cur_offset = start;
8875 u64 clear_offset = start;
8876 u64 i_size;
8877 u64 cur_bytes;
8878 u64 last_alloc = (u64)-1;
8879 int ret = 0;
8880 bool own_trans = true;
8881 u64 end = start + num_bytes - 1;
8882
8883 if (trans)
8884 own_trans = false;
8885 while (num_bytes > 0) {
8886 cur_bytes = min_t(u64, num_bytes, SZ_256M);
8887 cur_bytes = max(cur_bytes, min_size);
8888 /*
8889 * If we are severely fragmented we could end up with really
8890 * small allocations, so if the allocator is returning small
8891  * chunks, let's make its job easier by only searching for those
8892 * sized chunks.
8893 */
8894 cur_bytes = min(cur_bytes, last_alloc);
8895 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
8896 min_size, 0, *alloc_hint, &ins, 1, 0);
8897 if (ret)
8898 break;
8899
8900 /*
8901 * We've reserved this space, and thus converted it from
8902 * ->bytes_may_use to ->bytes_reserved. Any error that happens
8903 * from here on out we will only need to clear our reservation
8904 * for the remaining unreserved area, so advance our
8905 * clear_offset by our extent size.
8906 */
8907 clear_offset += ins.offset;
8908
8909 last_alloc = ins.offset;
8910 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
8911 &ins, cur_offset);
8912 /*
8913 * Now that we inserted the prealloc extent we can finally
8914 * decrement the number of reservations in the block group.
8915 * If we did it before, we could race with relocation and have
8916 * relocation miss the reserved extent, making it fail later.
8917 */
8918 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
8919 if (IS_ERR(trans)) {
8920 ret = PTR_ERR(trans);
8921 btrfs_free_reserved_extent(fs_info, ins.objectid,
8922 ins.offset, 0);
8923 break;
8924 }
8925
8926 em = alloc_extent_map();
8927 if (!em) {
8928 btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
8929 cur_offset + ins.offset - 1, false);
8930 btrfs_set_inode_full_sync(BTRFS_I(inode));
8931 goto next;
8932 }
8933
8934 em->start = cur_offset;
8935 em->len = ins.offset;
8936 em->disk_bytenr = ins.objectid;
8937 em->offset = 0;
8938 em->disk_num_bytes = ins.offset;
8939 em->ram_bytes = ins.offset;
8940 em->flags |= EXTENT_FLAG_PREALLOC;
8941 em->generation = trans->transid;
8942
8943 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
8944 free_extent_map(em);
8945 next:
8946 num_bytes -= ins.offset;
8947 cur_offset += ins.offset;
8948 *alloc_hint = ins.objectid + ins.offset;
8949
8950 inode_inc_iversion(inode);
8951 inode_set_ctime_current(inode);
8952 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8953 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8954 (actual_len > inode->i_size) &&
8955 (cur_offset > inode->i_size)) {
8956 if (cur_offset > actual_len)
8957 i_size = actual_len;
8958 else
8959 i_size = cur_offset;
8960 i_size_write(inode, i_size);
8961 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
8962 }
8963
8964 ret = btrfs_update_inode(trans, BTRFS_I(inode));
8965
8966 if (ret) {
8967 btrfs_abort_transaction(trans, ret);
8968 if (own_trans)
8969 btrfs_end_transaction(trans);
8970 break;
8971 }
8972
8973 if (own_trans) {
8974 btrfs_end_transaction(trans);
8975 trans = NULL;
8976 }
8977 }
8978 if (clear_offset < end)
8979 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
8980 end - clear_offset + 1);
8981 return ret;
8982 }
8983
8984 int btrfs_prealloc_file_range(struct inode *inode, int mode,
8985 u64 start, u64 num_bytes, u64 min_size,
8986 loff_t actual_len, u64 *alloc_hint)
8987 {
8988 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8989 min_size, actual_len, alloc_hint,
8990 NULL);
8991 }
8992
8993 int btrfs_prealloc_file_range_trans(struct inode *inode,
8994 struct btrfs_trans_handle *trans, int mode,
8995 u64 start, u64 num_bytes, u64 min_size,
8996 loff_t actual_len, u64 *alloc_hint)
8997 {
8998 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8999 min_size, actual_len, alloc_hint, trans);
9000 }
9001
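/*
 * Permission check: write access to regular files, directories and symlinks
 * is denied on read-only roots (-EROFS) and on inodes flagged read-only
 * (-EACCES); everything else falls through to generic_permission().
 */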
9002 static int btrfs_permission(struct mnt_idmap *idmap,
9003 struct inode *inode, int mask)
9004 {
9005 struct btrfs_root *root = BTRFS_I(inode)->root;
9006 umode_t mode = inode->i_mode;
9007
9008 if (mask & MAY_WRITE &&
9009 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9010 if (btrfs_root_readonly(root))
9011 return -EROFS;
9012 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9013 return -EACCES;
9014 }
9015 return generic_permission(idmap, inode, mask);
9016 }
9017
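/*
 * Create an unlinked temporary file for O_TMPFILE. The inode is created as
 * an orphan (see new_inode_args.orphan) and attached to @file via
 * d_tmpfile().
 */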
9018 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9019 struct file *file, umode_t mode)
9020 {
9021 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
9022 struct btrfs_trans_handle *trans;
9023 struct btrfs_root *root = BTRFS_I(dir)->root;
9024 struct inode *inode;
9025 struct btrfs_new_inode_args new_inode_args = {
9026 .dir = dir,
9027 .dentry = file->f_path.dentry,
9028 .orphan = true,
9029 };
9030 unsigned int trans_num_items;
9031 int ret;
9032
9033 inode = new_inode(dir->i_sb);
9034 if (!inode)
9035 return -ENOMEM;
9036 inode_init_owner(idmap, inode, dir, mode);
9037 inode->i_fop = &btrfs_file_operations;
9038 inode->i_op = &btrfs_file_inode_operations;
9039 inode->i_mapping->a_ops = &btrfs_aops;
9040
9041 new_inode_args.inode = inode;
9042 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9043 if (ret)
9044 goto out_inode;
9045
9046 trans = btrfs_start_transaction(root, trans_num_items);
9047 if (IS_ERR(trans)) {
9048 ret = PTR_ERR(trans);
9049 goto out_new_inode_args;
9050 }
9051
9052 ret = btrfs_create_new_inode(trans, &new_inode_args);
9053
9054 /*
9055 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9056 * set it to 1 because d_tmpfile() will issue a warning if the count is
9057 * 0, through:
9058 *
9059 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9060 */
9061 set_nlink(inode, 1);
9062
9063 if (!ret) {
9064 d_tmpfile(file, inode);
9065 unlock_new_inode(inode);
9066 mark_inode_dirty(inode);
9067 }
9068
9069 btrfs_end_transaction(trans);
9070 btrfs_btree_balance_dirty(fs_info);
9071 out_new_inode_args:
9072 btrfs_new_inode_args_destroy(&new_inode_args);
9073 out_inode:
9074 if (ret)
9075 iput(inode);
9076 return finish_open_simple(file, ret);
9077 }
9078
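/*
 * Map an on-disk compression type to the BTRFS_ENCODED_IO_COMPRESSION_*
 * value used by the encoded I/O ioctls. For LZO the returned value also
 * encodes the sector size.
 */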
9079 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9080 int compress_type)
9081 {
9082 switch (compress_type) {
9083 case BTRFS_COMPRESS_NONE:
9084 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9085 case BTRFS_COMPRESS_ZLIB:
9086 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9087 case BTRFS_COMPRESS_LZO:
9088 /*
9089 * The LZO format depends on the sector size. 64K is the maximum
9090 * sector size that we support.
9091 */
9092 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9093 return -EINVAL;
9094 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9095 (fs_info->sectorsize_bits - 12);
9096 case BTRFS_COMPRESS_ZSTD:
9097 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9098 default:
9099 return -EUCLEAN;
9100 }
9101 }
9102
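/*
 * Inline-extent path of an encoded read: copy the (possibly compressed)
 * inline data out of the extent item into a temporary buffer, drop the
 * extent and inode locks, then copy it to @iter.
 */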
9103 static ssize_t btrfs_encoded_read_inline(
9104 struct kiocb *iocb,
9105 struct iov_iter *iter, u64 start,
9106 u64 lockend,
9107 struct extent_state **cached_state,
9108 u64 extent_start, size_t count,
9109 struct btrfs_ioctl_encoded_io_args *encoded,
9110 bool *unlocked)
9111 {
9112 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9113 struct btrfs_root *root = inode->root;
9114 struct btrfs_fs_info *fs_info = root->fs_info;
9115 struct extent_io_tree *io_tree = &inode->io_tree;
9116 struct btrfs_path *path;
9117 struct extent_buffer *leaf;
9118 struct btrfs_file_extent_item *item;
9119 u64 ram_bytes;
9120 unsigned long ptr;
9121 void *tmp;
9122 ssize_t ret;
9123 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9124
9125 path = btrfs_alloc_path();
9126 if (!path) {
9127 ret = -ENOMEM;
9128 goto out;
9129 }
9130
9131 path->nowait = nowait;
9132
9133 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9134 extent_start, 0);
9135 if (ret) {
9136 if (ret > 0) {
9137 /* The extent item disappeared? */
9138 ret = -EIO;
9139 }
9140 goto out;
9141 }
9142 leaf = path->nodes[0];
9143 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9144
9145 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9146 ptr = btrfs_file_extent_inline_start(item);
9147
9148 encoded->len = min_t(u64, extent_start + ram_bytes,
9149 inode->vfs_inode.i_size) - iocb->ki_pos;
9150 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9151 btrfs_file_extent_compression(leaf, item));
9152 if (ret < 0)
9153 goto out;
9154 encoded->compression = ret;
9155 if (encoded->compression) {
9156 size_t inline_size;
9157
9158 inline_size = btrfs_file_extent_inline_item_len(leaf,
9159 path->slots[0]);
9160 if (inline_size > count) {
9161 ret = -ENOBUFS;
9162 goto out;
9163 }
9164 count = inline_size;
9165 encoded->unencoded_len = ram_bytes;
9166 encoded->unencoded_offset = iocb->ki_pos - extent_start;
9167 } else {
9168 count = min_t(u64, count, encoded->len);
9169 encoded->len = count;
9170 encoded->unencoded_len = count;
9171 ptr += iocb->ki_pos - extent_start;
9172 }
9173
9174 tmp = kmalloc(count, GFP_NOFS);
9175 if (!tmp) {
9176 ret = -ENOMEM;
9177 goto out;
9178 }
9179 read_extent_buffer(leaf, tmp, ptr, count);
9180 btrfs_release_path(path);
9181 unlock_extent(io_tree, start, lockend, cached_state);
9182 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9183 *unlocked = true;
9184
9185 ret = copy_to_iter(tmp, count, iter);
9186 if (ret != count)
9187 ret = -EFAULT;
9188 kfree(tmp);
9189 out:
9190 btrfs_free_path(path);
9191 return ret;
9192 }
9193
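/*
 * Completion state shared by all bios of one encoded read: a completion for
 * synchronous callers or an io_uring context, a count of pending bios and
 * any error status observed.
 */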
9194 struct btrfs_encoded_read_private {
9195 struct completion *sync_reads;
9196 void *uring_ctx;
9197 refcount_t pending_refs;
9198 blk_status_t status;
9199 };
9200
9201 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9202 {
9203 struct btrfs_encoded_read_private *priv = bbio->private;
9204
9205 if (bbio->bio.bi_status) {
9206 /*
9207 * The memory barrier implied by the refcount_dec_and_test() here
9208 * pairs with the memory barrier implied by the refcount_dec_and_test()
9209 * in btrfs_encoded_read_regular_fill_pages() to ensure that
9210 * this write is observed before the load of status in
9211 * btrfs_encoded_read_regular_fill_pages().
9212 */
9213 WRITE_ONCE(priv->status, bbio->bio.bi_status);
9214 }
9215 if (refcount_dec_and_test(&priv->pending_refs)) {
9216 int err = blk_status_to_errno(READ_ONCE(priv->status));
9217
9218 if (priv->uring_ctx) {
9219 btrfs_uring_read_extent_endio(priv->uring_ctx, err);
9220 kfree(priv);
9221 } else {
9222 complete(priv->sync_reads);
9223 }
9224 }
9225 bio_put(&bbio->bio);
9226 }
9227
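/*
 * Read @disk_io_size bytes starting at @disk_bytenr into @pages. With an
 * io_uring context completion is signalled through the uring endio and
 * -EIOCBQUEUED is returned while bios are still in flight; otherwise the
 * call waits for all bios and returns their combined status.
 */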
9228 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9229 u64 disk_bytenr, u64 disk_io_size,
9230 struct page **pages, void *uring_ctx)
9231 {
9232 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9233 struct btrfs_encoded_read_private *priv, sync_priv;
9234 struct completion sync_reads;
9235 unsigned long i = 0;
9236 struct btrfs_bio *bbio;
9237 int ret;
9238
9239 /*
9240  * Fast path for synchronous reads, which complete within this call;
9241  * io_uring reads need the private data to live beyond it.
9242 */
9243 if (uring_ctx) {
9244 priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
9245 if (!priv)
9246 return -ENOMEM;
9247 } else {
9248 priv = &sync_priv;
9249 init_completion(&sync_reads);
9250 priv->sync_reads = &sync_reads;
9251 }
9252
9253 refcount_set(&priv->pending_refs, 1);
9254 priv->status = 0;
9255 priv->uring_ctx = uring_ctx;
9256
9257 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9258 btrfs_encoded_read_endio, priv);
9259 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9260 bbio->inode = inode;
9261
9262 do {
9263 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9264
9265 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9266 refcount_inc(&priv->pending_refs);
9267 btrfs_submit_bbio(bbio, 0);
9268
9269 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9270 btrfs_encoded_read_endio, priv);
9271 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9272 bbio->inode = inode;
9273 continue;
9274 }
9275
9276 i++;
9277 disk_bytenr += bytes;
9278 disk_io_size -= bytes;
9279 } while (disk_io_size);
9280
9281 refcount_inc(&priv->pending_refs);
9282 btrfs_submit_bbio(bbio, 0);
9283
9284 if (uring_ctx) {
9285 if (refcount_dec_and_test(&priv->pending_refs)) {
9286 ret = blk_status_to_errno(READ_ONCE(priv->status));
9287 btrfs_uring_read_extent_endio(uring_ctx, ret);
9288 kfree(priv);
9289 return ret;
9290 }
9291
9292 return -EIOCBQUEUED;
9293 } else {
9294 if (!refcount_dec_and_test(&priv->pending_refs))
9295 wait_for_completion_io(&sync_reads);
9296 /* See btrfs_encoded_read_endio() for ordering. */
9297 return blk_status_to_errno(READ_ONCE(priv->status));
9298 }
9299 }
9300
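/*
 * Regular-extent path of an encoded read: read the disk range into temporary
 * pages, drop the extent and inode locks, then copy the requested bytes to
 * @iter.
 */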
9301 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
9302 u64 start, u64 lockend,
9303 struct extent_state **cached_state,
9304 u64 disk_bytenr, u64 disk_io_size,
9305 size_t count, bool compressed, bool *unlocked)
9306 {
9307 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9308 struct extent_io_tree *io_tree = &inode->io_tree;
9309 struct page **pages;
9310 unsigned long nr_pages, i;
9311 u64 cur;
9312 size_t page_offset;
9313 ssize_t ret;
9314
9315 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
9316 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
9317 if (!pages)
9318 return -ENOMEM;
9319 ret = btrfs_alloc_page_array(nr_pages, pages, false);
9320 if (ret) {
9321 ret = -ENOMEM;
9322 goto out;
9323 }
9324
9325 ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
9326 disk_io_size, pages, NULL);
9327 if (ret)
9328 goto out;
9329
9330 unlock_extent(io_tree, start, lockend, cached_state);
9331 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9332 *unlocked = true;
9333
9334 if (compressed) {
9335 i = 0;
9336 page_offset = 0;
9337 } else {
9338 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
9339 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
9340 }
9341 cur = 0;
9342 while (cur < count) {
9343 size_t bytes = min_t(size_t, count - cur,
9344 PAGE_SIZE - page_offset);
9345
9346 if (copy_page_to_iter(pages[i], page_offset, bytes,
9347 iter) != bytes) {
9348 ret = -EFAULT;
9349 goto out;
9350 }
9351 i++;
9352 cur += bytes;
9353 page_offset = 0;
9354 }
9355 ret = count;
9356 out:
9357 for (i = 0; i < nr_pages; i++) {
9358 if (pages[i])
9359 __free_page(pages[i]);
9360 }
9361 kfree(pages);
9362 return ret;
9363 }
9364
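/*
 * First stage of an encoded read: look up the extent at the current file
 * position and serve inline extents and holes directly; for regular extents
 * fill in @disk_bytenr and @disk_io_size and return -EIOCBQUEUED so the
 * caller can issue the actual disk read.
 */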
9365 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
9366 struct btrfs_ioctl_encoded_io_args *encoded,
9367 struct extent_state **cached_state,
9368 u64 *disk_bytenr, u64 *disk_io_size)
9369 {
9370 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9371 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9372 struct extent_io_tree *io_tree = &inode->io_tree;
9373 ssize_t ret;
9374 size_t count = iov_iter_count(iter);
9375 u64 start, lockend;
9376 struct extent_map *em;
9377 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9378 bool unlocked = false;
9379
9380 file_accessed(iocb->ki_filp);
9381
9382 ret = btrfs_inode_lock(inode,
9383 BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
9384 if (ret)
9385 return ret;
9386
9387 if (iocb->ki_pos >= inode->vfs_inode.i_size) {
9388 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9389 return 0;
9390 }
9391 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
9392 /*
9393 * We don't know how long the extent containing iocb->ki_pos is, but if
9394 * it's compressed we know that it won't be longer than this.
9395 */
9396 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
9397
9398 if (nowait) {
9399 struct btrfs_ordered_extent *ordered;
9400
9401 if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
9402 start, lockend)) {
9403 ret = -EAGAIN;
9404 goto out_unlock_inode;
9405 }
9406
9407 if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
9408 ret = -EAGAIN;
9409 goto out_unlock_inode;
9410 }
9411
9412 ordered = btrfs_lookup_ordered_range(inode, start,
9413 lockend - start + 1);
9414 if (ordered) {
9415 btrfs_put_ordered_extent(ordered);
9416 unlock_extent(io_tree, start, lockend, cached_state);
9417 ret = -EAGAIN;
9418 goto out_unlock_inode;
9419 }
9420 } else {
9421 for (;;) {
9422 struct btrfs_ordered_extent *ordered;
9423
9424 ret = btrfs_wait_ordered_range(inode, start,
9425 lockend - start + 1);
9426 if (ret)
9427 goto out_unlock_inode;
9428
9429 lock_extent(io_tree, start, lockend, cached_state);
9430 ordered = btrfs_lookup_ordered_range(inode, start,
9431 lockend - start + 1);
9432 if (!ordered)
9433 break;
9434 btrfs_put_ordered_extent(ordered);
9435 unlock_extent(io_tree, start, lockend, cached_state);
9436 cond_resched();
9437 }
9438 }
9439
9440 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
9441 if (IS_ERR(em)) {
9442 ret = PTR_ERR(em);
9443 goto out_unlock_extent;
9444 }
9445
9446 if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9447 u64 extent_start = em->start;
9448
9449 /*
9450 * For inline extents we get everything we need out of the
9451 * extent item.
9452 */
9453 free_extent_map(em);
9454 em = NULL;
9455 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
9456 cached_state, extent_start,
9457 count, encoded, &unlocked);
9458 goto out_unlock_extent;
9459 }
9460
9461 /*
9462 * We only want to return up to EOF even if the extent extends beyond
9463 * that.
9464 */
9465 encoded->len = min_t(u64, extent_map_end(em),
9466 inode->vfs_inode.i_size) - iocb->ki_pos;
9467 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
9468 (em->flags & EXTENT_FLAG_PREALLOC)) {
9469 *disk_bytenr = EXTENT_MAP_HOLE;
9470 count = min_t(u64, count, encoded->len);
9471 encoded->len = count;
9472 encoded->unencoded_len = count;
9473 } else if (extent_map_is_compressed(em)) {
9474 *disk_bytenr = em->disk_bytenr;
9475 /*
9476 * Bail if the buffer isn't large enough to return the whole
9477 * compressed extent.
9478 */
9479 if (em->disk_num_bytes > count) {
9480 ret = -ENOBUFS;
9481 goto out_em;
9482 }
9483 *disk_io_size = em->disk_num_bytes;
9484 count = em->disk_num_bytes;
9485 encoded->unencoded_len = em->ram_bytes;
9486 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
9487 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9488 extent_map_compression(em));
9489 if (ret < 0)
9490 goto out_em;
9491 encoded->compression = ret;
9492 } else {
9493 *disk_bytenr = extent_map_block_start(em) + (start - em->start);
9494 if (encoded->len > count)
9495 encoded->len = count;
9496 /*
9497 * Don't read beyond what we locked. This also limits the page
9498 * allocations that we'll do.
9499 */
9500 *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
9501 count = start + *disk_io_size - iocb->ki_pos;
9502 encoded->len = count;
9503 encoded->unencoded_len = count;
9504 *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
9505 }
9506 free_extent_map(em);
9507 em = NULL;
9508
9509 if (*disk_bytenr == EXTENT_MAP_HOLE) {
9510 unlock_extent(io_tree, start, lockend, cached_state);
9511 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9512 unlocked = true;
9513 ret = iov_iter_zero(count, iter);
9514 if (ret != count)
9515 ret = -EFAULT;
9516 } else {
9517 ret = -EIOCBQUEUED;
9518 goto out_unlock_extent;
9519 }
9520
9521 out_em:
9522 free_extent_map(em);
9523 out_unlock_extent:
9524 /* Leave inode and extent locked if we need to do a read. */
9525 if (!unlocked && ret != -EIOCBQUEUED)
9526 unlock_extent(io_tree, start, lockend, cached_state);
9527 out_unlock_inode:
9528 if (!unlocked && ret != -EIOCBQUEUED)
9529 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9530 return ret;
9531 }
9532
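/*
 * Write already-encoded (compressed) data from @from as one extent at the
 * current file position. The encoding described by @encoded is validated,
 * the data is copied into temporary folios, and it is submitted either as an
 * inline extent (when it fits) or as a compressed ordered extent.
 */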
9533 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
9534 const struct btrfs_ioctl_encoded_io_args *encoded)
9535 {
9536 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9537 struct btrfs_root *root = inode->root;
9538 struct btrfs_fs_info *fs_info = root->fs_info;
9539 struct extent_io_tree *io_tree = &inode->io_tree;
9540 struct extent_changeset *data_reserved = NULL;
9541 struct extent_state *cached_state = NULL;
9542 struct btrfs_ordered_extent *ordered;
9543 struct btrfs_file_extent file_extent;
9544 int compression;
9545 size_t orig_count;
9546 u64 start, end;
9547 u64 num_bytes, ram_bytes, disk_num_bytes;
9548 unsigned long nr_folios, i;
9549 struct folio **folios;
9550 struct btrfs_key ins;
9551 bool extent_reserved = false;
9552 struct extent_map *em;
9553 ssize_t ret;
9554
9555 switch (encoded->compression) {
9556 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
9557 compression = BTRFS_COMPRESS_ZLIB;
9558 break;
9559 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
9560 compression = BTRFS_COMPRESS_ZSTD;
9561 break;
9562 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
9563 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
9564 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
9565 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
9566 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
9567 /* The sector size must match for LZO. */
9568 if (encoded->compression -
9569 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
9570 fs_info->sectorsize_bits)
9571 return -EINVAL;
9572 compression = BTRFS_COMPRESS_LZO;
9573 break;
9574 default:
9575 return -EINVAL;
9576 }
9577 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
9578 return -EINVAL;
9579
9580 /*
9581 * Compressed extents should always have checksums, so error out if we
9582  * have a NOCOW file or the inode was created while mounted with NODATASUM.
9583 */
9584 if (inode->flags & BTRFS_INODE_NODATASUM)
9585 return -EINVAL;
9586
9587 orig_count = iov_iter_count(from);
9588
9589 /* The extent size must be sane. */
9590 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
9591 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
9592 return -EINVAL;
9593
9594 /*
9595 * The compressed data must be smaller than the decompressed data.
9596 *
9597 * It's of course possible for data to compress to larger or the same
9598 * size, but the buffered I/O path falls back to no compression for such
9599 * data, and we don't want to break any assumptions by creating these
9600 * extents.
9601 *
9602 * Note that this is less strict than the current check we have that the
9603 * compressed data must be at least one sector smaller than the
9604 * decompressed data. We only want to enforce the weaker requirement
9605 * from old kernels that it is at least one byte smaller.
9606 */
9607 if (orig_count >= encoded->unencoded_len)
9608 return -EINVAL;
9609
9610 /* The extent must start on a sector boundary. */
9611 start = iocb->ki_pos;
9612 if (!IS_ALIGNED(start, fs_info->sectorsize))
9613 return -EINVAL;
9614
9615 /*
9616 * The extent must end on a sector boundary. However, we allow a write
9617 * which ends at or extends i_size to have an unaligned length; we round
9618 * up the extent size and set i_size to the unaligned end.
9619 */
9620 if (start + encoded->len < inode->vfs_inode.i_size &&
9621 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
9622 return -EINVAL;
9623
9624 /* Finally, the offset in the unencoded data must be sector-aligned. */
9625 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
9626 return -EINVAL;
9627
9628 num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
9629 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
9630 end = start + num_bytes - 1;
9631
9632 /*
9633 * If the extent cannot be inline, the compressed data on disk must be
9634 * sector-aligned. For convenience, we extend it with zeroes if it
9635 * isn't.
9636 */
9637 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
9638 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
9639 folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
9640 if (!folios)
9641 return -ENOMEM;
9642 for (i = 0; i < nr_folios; i++) {
9643 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
9644 char *kaddr;
9645
9646 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
9647 if (!folios[i]) {
9648 ret = -ENOMEM;
9649 goto out_folios;
9650 }
9651 kaddr = kmap_local_folio(folios[i], 0);
9652 if (copy_from_iter(kaddr, bytes, from) != bytes) {
9653 kunmap_local(kaddr);
9654 ret = -EFAULT;
9655 goto out_folios;
9656 }
9657 if (bytes < PAGE_SIZE)
9658 memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
9659 kunmap_local(kaddr);
9660 }
9661
9662 for (;;) {
9663 struct btrfs_ordered_extent *ordered;
9664
9665 ret = btrfs_wait_ordered_range(inode, start, num_bytes);
9666 if (ret)
9667 goto out_folios;
9668 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
9669 start >> PAGE_SHIFT,
9670 end >> PAGE_SHIFT);
9671 if (ret)
9672 goto out_folios;
9673 lock_extent(io_tree, start, end, &cached_state);
9674 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
9675 if (!ordered &&
9676 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
9677 break;
9678 if (ordered)
9679 btrfs_put_ordered_extent(ordered);
9680 unlock_extent(io_tree, start, end, &cached_state);
9681 cond_resched();
9682 }
9683
9684 /*
9685 * We don't use the higher-level delalloc space functions because our
9686 * num_bytes and disk_num_bytes are different.
9687 */
9688 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
9689 if (ret)
9690 goto out_unlock;
9691 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
9692 if (ret)
9693 goto out_free_data_space;
9694 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
9695 false);
9696 if (ret)
9697 goto out_qgroup_free_data;
9698
9699 /* Try an inline extent first. */
9700 if (encoded->unencoded_len == encoded->len &&
9701 encoded->unencoded_offset == 0 &&
9702 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
9703 ret = __cow_file_range_inline(inode, encoded->len,
9704 orig_count, compression, folios[0],
9705 true);
9706 if (ret <= 0) {
9707 if (ret == 0)
9708 ret = orig_count;
9709 goto out_delalloc_release;
9710 }
9711 }
9712
9713 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
9714 disk_num_bytes, 0, 0, &ins, 1, 1);
9715 if (ret)
9716 goto out_delalloc_release;
9717 extent_reserved = true;
9718
9719 file_extent.disk_bytenr = ins.objectid;
9720 file_extent.disk_num_bytes = ins.offset;
9721 file_extent.num_bytes = num_bytes;
9722 file_extent.ram_bytes = ram_bytes;
9723 file_extent.offset = encoded->unencoded_offset;
9724 file_extent.compression = compression;
9725 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
9726 if (IS_ERR(em)) {
9727 ret = PTR_ERR(em);
9728 goto out_free_reserved;
9729 }
9730 free_extent_map(em);
9731
9732 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
9733 (1 << BTRFS_ORDERED_ENCODED) |
9734 (1 << BTRFS_ORDERED_COMPRESSED));
9735 if (IS_ERR(ordered)) {
9736 btrfs_drop_extent_map_range(inode, start, end, false);
9737 ret = PTR_ERR(ordered);
9738 goto out_free_reserved;
9739 }
9740 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9741
9742 if (start + encoded->len > inode->vfs_inode.i_size)
9743 i_size_write(&inode->vfs_inode, start + encoded->len);
9744
9745 unlock_extent(io_tree, start, end, &cached_state);
9746
9747 btrfs_delalloc_release_extents(inode, num_bytes);
9748
9749 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
9750 ret = orig_count;
9751 goto out;
9752
9753 out_free_reserved:
9754 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9755 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
9756 out_delalloc_release:
9757 btrfs_delalloc_release_extents(inode, num_bytes);
9758 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
9759 out_qgroup_free_data:
9760 if (ret < 0)
9761 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
9762 out_free_data_space:
9763 /*
9764 * If btrfs_reserve_extent() succeeded, then we already decremented
9765 * bytes_may_use.
9766 */
9767 if (!extent_reserved)
9768 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
9769 out_unlock:
9770 unlock_extent(io_tree, start, end, &cached_state);
9771 out_folios:
9772 for (i = 0; i < nr_folios; i++) {
9773 if (folios[i])
9774 folio_put(folios[i]);
9775 }
9776 kvfree(folios);
9777 out:
9778 if (ret >= 0)
9779 iocb->ki_pos += encoded->len;
9780 return ret;
9781 }
9782
9783 #ifdef CONFIG_SWAP
9784 /*
9785 * Add an entry indicating a block group or device which is pinned by a
9786 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
9787 * negative errno on failure.
9788 */
9789 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
9790 bool is_block_group)
9791 {
9792 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9793 struct btrfs_swapfile_pin *sp, *entry;
9794 struct rb_node **p;
9795 struct rb_node *parent = NULL;
9796
9797 sp = kmalloc(sizeof(*sp), GFP_NOFS);
9798 if (!sp)
9799 return -ENOMEM;
9800 sp->ptr = ptr;
9801 sp->inode = inode;
9802 sp->is_block_group = is_block_group;
9803 sp->bg_extent_count = 1;
9804
9805 spin_lock(&fs_info->swapfile_pins_lock);
9806 p = &fs_info->swapfile_pins.rb_node;
9807 while (*p) {
9808 parent = *p;
9809 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
9810 if (sp->ptr < entry->ptr ||
9811 (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
9812 p = &(*p)->rb_left;
9813 } else if (sp->ptr > entry->ptr ||
9814 (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
9815 p = &(*p)->rb_right;
9816 } else {
9817 if (is_block_group)
9818 entry->bg_extent_count++;
9819 spin_unlock(&fs_info->swapfile_pins_lock);
9820 kfree(sp);
9821 return 1;
9822 }
9823 }
9824 rb_link_node(&sp->node, parent, p);
9825 rb_insert_color(&sp->node, &fs_info->swapfile_pins);
9826 spin_unlock(&fs_info->swapfile_pins_lock);
9827 return 0;
9828 }
9829
9830 /* Free all of the entries pinned by this swapfile. */
9831 static void btrfs_free_swapfile_pins(struct inode *inode)
9832 {
9833 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9834 struct btrfs_swapfile_pin *sp;
9835 struct rb_node *node, *next;
9836
9837 spin_lock(&fs_info->swapfile_pins_lock);
9838 node = rb_first(&fs_info->swapfile_pins);
9839 while (node) {
9840 next = rb_next(node);
9841 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
9842 if (sp->inode == inode) {
9843 rb_erase(&sp->node, &fs_info->swapfile_pins);
9844 if (sp->is_block_group) {
9845 btrfs_dec_block_group_swap_extents(sp->ptr,
9846 sp->bg_extent_count);
9847 btrfs_put_block_group(sp->ptr);
9848 }
9849 kfree(sp);
9850 }
9851 node = next;
9852 }
9853 spin_unlock(&fs_info->swapfile_pins_lock);
9854 }
9855
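/*
 * State accumulated while activating a swapfile: the physically contiguous
 * run currently being built (start, block_start, block_len) and the page
 * and extent totals reported back to the swap code.
 */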
9856 struct btrfs_swap_info {
9857 u64 start;
9858 u64 block_start;
9859 u64 block_len;
9860 u64 lowest_ppage;
9861 u64 highest_ppage;
9862 unsigned long nr_pages;
9863 int nr_extents;
9864 };
9865
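/*
 * Hand the physical run accumulated in @bsi to the swap code as swap
 * extents, never exceeding the maximum number of pages recorded in the swap
 * header.
 */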
9866 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
9867 struct btrfs_swap_info *bsi)
9868 {
9869 unsigned long nr_pages;
9870 unsigned long max_pages;
9871 u64 first_ppage, first_ppage_reported, next_ppage;
9872 int ret;
9873
9874 /*
9875 * Our swapfile may have had its size extended after the swap header was
9876 * written. In that case activating the swapfile should not go beyond
9877 * the max size set in the swap header.
9878 */
9879 if (bsi->nr_pages >= sis->max)
9880 return 0;
9881
9882 max_pages = sis->max - bsi->nr_pages;
9883 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
9884 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
9885
9886 if (first_ppage >= next_ppage)
9887 return 0;
9888 nr_pages = next_ppage - first_ppage;
9889 nr_pages = min(nr_pages, max_pages);
9890
9891 first_ppage_reported = first_ppage;
9892 if (bsi->start == 0)
9893 first_ppage_reported++;
9894 if (bsi->lowest_ppage > first_ppage_reported)
9895 bsi->lowest_ppage = first_ppage_reported;
9896 if (bsi->highest_ppage < (next_ppage - 1))
9897 bsi->highest_ppage = next_ppage - 1;
9898
9899 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
9900 if (ret < 0)
9901 return ret;
9902 bsi->nr_extents += ret;
9903 bsi->nr_pages += nr_pages;
9904 return 0;
9905 }
9906
9907 static void btrfs_swap_deactivate(struct file *file)
9908 {
9909 struct inode *inode = file_inode(file);
9910
9911 btrfs_free_swapfile_pins(inode);
9912 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
9913 }
9914
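/*
 * Activate a swapfile: walk its file extents, reject anything that would
 * require COW (holes, inline or compressed extents, shared extents, multiple
 * devices or striped profiles), pin the device and the block groups backing
 * the file, and register each contiguous physical run as a swap extent.
 */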
9915 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
9916 sector_t *span)
9917 {
9918 struct inode *inode = file_inode(file);
9919 struct btrfs_root *root = BTRFS_I(inode)->root;
9920 struct btrfs_fs_info *fs_info = root->fs_info;
9921 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9922 struct extent_state *cached_state = NULL;
9923 struct btrfs_chunk_map *map = NULL;
9924 struct btrfs_device *device = NULL;
9925 struct btrfs_swap_info bsi = {
9926 .lowest_ppage = (sector_t)-1ULL,
9927 };
9928 struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
9929 struct btrfs_path *path = NULL;
9930 int ret = 0;
9931 u64 isize;
9932 u64 prev_extent_end = 0;
9933
9934 /*
9935 * Acquire the inode's mmap lock to prevent races with memory mapped
9936 * writes, as they could happen after we flush delalloc below and before
9937 * we lock the extent range further below. The inode was already locked
9938 * up in the call chain.
9939  * further up in the call chain.
9940 btrfs_assert_inode_locked(BTRFS_I(inode));
9941 down_write(&BTRFS_I(inode)->i_mmap_lock);
9942
9943 /*
9944 * If the swap file was just created, make sure delalloc is done. If the
9945 * file changes again after this, the user is doing something stupid and
9946 * we don't really care.
9947 */
9948 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
9949 if (ret)
9950 goto out_unlock_mmap;
9951
9952 /*
9953 * The inode is locked, so these flags won't change after we check them.
9954 */
9955 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
9956 btrfs_warn(fs_info, "swapfile must not be compressed");
9957 ret = -EINVAL;
9958 goto out_unlock_mmap;
9959 }
9960 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
9961 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
9962 ret = -EINVAL;
9963 goto out_unlock_mmap;
9964 }
9965 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
9966 btrfs_warn(fs_info, "swapfile must not be checksummed");
9967 ret = -EINVAL;
9968 goto out_unlock_mmap;
9969 }
9970
9971 path = btrfs_alloc_path();
9972 backref_ctx = btrfs_alloc_backref_share_check_ctx();
9973 if (!path || !backref_ctx) {
9974 ret = -ENOMEM;
9975 goto out_unlock_mmap;
9976 }
9977
9978 /*
9979 * Balance or device remove/replace/resize can move stuff around from
9980 * under us. The exclop protection makes sure they aren't running/won't
9981 * run concurrently while we are mapping the swap extents, and
9982 * fs_info->swapfile_pins prevents them from running while the swap
9983 * file is active and moving the extents. Note that this also prevents
9984 * a concurrent device add which isn't actually necessary, but it's not
9985 * really worth the trouble to allow it.
9986 */
9987 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
9988 btrfs_warn(fs_info,
9989 "cannot activate swapfile while exclusive operation is running");
9990 ret = -EBUSY;
9991 goto out_unlock_mmap;
9992 }
9993
9994 /*
9995 * Prevent snapshot creation while we are activating the swap file.
9996 * We do not want to race with snapshot creation. If snapshot creation
9997 * already started before we bumped nr_swapfiles from 0 to 1 and
9998 * completes before the first write into the swap file after it is
9999  * activated, then that write would fall back to COW.
10000 */
10001 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10002 btrfs_exclop_finish(fs_info);
10003 btrfs_warn(fs_info,
10004 "cannot activate swapfile because snapshot creation is in progress");
10005 ret = -EINVAL;
10006 goto out_unlock_mmap;
10007 }
10008 /*
10009 * Snapshots can create extents which require COW even if NODATACOW is
10010 * set. We use this counter to prevent snapshots. We must increment it
10011 * before walking the extents because we don't want a concurrent
10012 * snapshot to run after we've already checked the extents.
10013 *
10014  * It is possible that the subvolume is marked for deletion but still not
10015 * removed yet. To prevent this race, we check the root status before
10016 * activating the swapfile.
10017 */
10018 spin_lock(&root->root_item_lock);
10019 if (btrfs_root_dead(root)) {
10020 spin_unlock(&root->root_item_lock);
10021
10022 btrfs_drew_write_unlock(&root->snapshot_lock);
10023 btrfs_exclop_finish(fs_info);
10024 btrfs_warn(fs_info,
10025 "cannot activate swapfile because subvolume %llu is being deleted",
10026 btrfs_root_id(root));
10027 ret = -EPERM;
10028 goto out_unlock_mmap;
10029 }
10030 atomic_inc(&root->nr_swapfiles);
10031 spin_unlock(&root->root_item_lock);
10032
10033 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10034
10035 lock_extent(io_tree, 0, isize - 1, &cached_state);
10036 while (prev_extent_end < isize) {
10037 struct btrfs_key key;
10038 struct extent_buffer *leaf;
10039 struct btrfs_file_extent_item *ei;
10040 struct btrfs_block_group *bg;
10041 u64 logical_block_start;
10042 u64 physical_block_start;
10043 u64 extent_gen;
10044 u64 disk_bytenr;
10045 u64 len;
10046
10047 key.objectid = btrfs_ino(BTRFS_I(inode));
10048 key.type = BTRFS_EXTENT_DATA_KEY;
10049 key.offset = prev_extent_end;
10050
10051 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
10052 if (ret < 0)
10053 goto out;
10054
10055 /*
10056  * If the key is not found it means we have an implicit hole (NO_HOLES
10057 * is enabled).
10058 */
10059 if (ret > 0) {
10060 btrfs_warn(fs_info, "swapfile must not have holes");
10061 ret = -EINVAL;
10062 goto out;
10063 }
10064
10065 leaf = path->nodes[0];
10066 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
10067
10068 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
10069 /*
10070 * It's unlikely we'll ever actually find ourselves
10071 * here, as a file small enough to fit inline won't be
10072 * big enough to store more than the swap header, but in
10073 * case something changes in the future, let's catch it
10074 * here rather than later.
10075 */
10076 btrfs_warn(fs_info, "swapfile must not be inline");
10077 ret = -EINVAL;
10078 goto out;
10079 }
10080
10081 if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
10082 btrfs_warn(fs_info, "swapfile must not be compressed");
10083 ret = -EINVAL;
10084 goto out;
10085 }
10086
10087 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
10088 if (disk_bytenr == 0) {
10089 btrfs_warn(fs_info, "swapfile must not have holes");
10090 ret = -EINVAL;
10091 goto out;
10092 }
10093
10094 logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
10095 extent_gen = btrfs_file_extent_generation(leaf, ei);
10096 prev_extent_end = btrfs_file_extent_end(path);
10097
10098 if (prev_extent_end > isize)
10099 len = isize - key.offset;
10100 else
10101 len = btrfs_file_extent_num_bytes(leaf, ei);
10102
10103 backref_ctx->curr_leaf_bytenr = leaf->start;
10104
10105 /*
10106 * Don't need the path anymore, release to avoid deadlocks when
10107 * calling btrfs_is_data_extent_shared() because when joining a
10108 * transaction it can block waiting for the current one's commit
10109 * which in turn may be trying to lock the same leaf to flush
10110 * delayed items for example.
10111 */
10112 btrfs_release_path(path);
10113
10114 ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
10115 extent_gen, backref_ctx);
10116 if (ret < 0) {
10117 goto out;
10118 } else if (ret > 0) {
10119 btrfs_warn(fs_info,
10120 "swapfile must not be copy-on-write");
10121 ret = -EINVAL;
10122 goto out;
10123 }
10124
10125 map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10126 if (IS_ERR(map)) {
10127 ret = PTR_ERR(map);
10128 goto out;
10129 }
10130
10131 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10132 btrfs_warn(fs_info,
10133 "swapfile must have single data profile");
10134 ret = -EINVAL;
10135 goto out;
10136 }
10137
10138 if (device == NULL) {
10139 device = map->stripes[0].dev;
10140 ret = btrfs_add_swapfile_pin(inode, device, false);
10141 if (ret == 1)
10142 ret = 0;
10143 else if (ret)
10144 goto out;
10145 } else if (device != map->stripes[0].dev) {
10146 btrfs_warn(fs_info, "swapfile must be on one device");
10147 ret = -EINVAL;
10148 goto out;
10149 }
10150
10151 physical_block_start = (map->stripes[0].physical +
10152 (logical_block_start - map->start));
10153 btrfs_free_chunk_map(map);
10154 map = NULL;
10155
10156 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10157 if (!bg) {
10158 btrfs_warn(fs_info,
10159 "could not find block group containing swapfile");
10160 ret = -EINVAL;
10161 goto out;
10162 }
10163
10164 if (!btrfs_inc_block_group_swap_extents(bg)) {
10165 btrfs_warn(fs_info,
10166 "block group for swapfile at %llu is read-only%s",
10167 bg->start,
10168 atomic_read(&fs_info->scrubs_running) ?
10169 " (scrub running)" : "");
10170 btrfs_put_block_group(bg);
10171 ret = -EINVAL;
10172 goto out;
10173 }
10174
10175 ret = btrfs_add_swapfile_pin(inode, bg, true);
10176 if (ret) {
10177 btrfs_put_block_group(bg);
10178 if (ret == 1)
10179 ret = 0;
10180 else
10181 goto out;
10182 }
10183
10184 if (bsi.block_len &&
10185 bsi.block_start + bsi.block_len == physical_block_start) {
10186 bsi.block_len += len;
10187 } else {
10188 if (bsi.block_len) {
10189 ret = btrfs_add_swap_extent(sis, &bsi);
10190 if (ret)
10191 goto out;
10192 }
10193 bsi.start = key.offset;
10194 bsi.block_start = physical_block_start;
10195 bsi.block_len = len;
10196 }
10197
10198 if (fatal_signal_pending(current)) {
10199 ret = -EINTR;
10200 goto out;
10201 }
10202
10203 cond_resched();
10204 }
10205
10206 if (bsi.block_len)
10207 ret = btrfs_add_swap_extent(sis, &bsi);
10208
10209 out:
10210 if (!IS_ERR_OR_NULL(map))
10211 btrfs_free_chunk_map(map);
10212
10213 unlock_extent(io_tree, 0, isize - 1, &cached_state);
10214
10215 if (ret)
10216 btrfs_swap_deactivate(file);
10217
10218 btrfs_drew_write_unlock(&root->snapshot_lock);
10219
10220 btrfs_exclop_finish(fs_info);
10221
10222 out_unlock_mmap:
10223 up_write(&BTRFS_I(inode)->i_mmap_lock);
10224 btrfs_free_backref_share_ctx(backref_ctx);
10225 btrfs_free_path(path);
10226 if (ret)
10227 return ret;
10228
10229 if (device)
10230 sis->bdev = device->bdev;
10231 *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10232 sis->max = bsi.nr_pages;
10233 sis->pages = bsi.nr_pages - 1;
10234 return bsi.nr_extents;
10235 }
10236 #else
10237 static void btrfs_swap_deactivate(struct file *file)
10238 {
10239 }
10240
10241 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10242 sector_t *span)
10243 {
10244 return -EOPNOTSUPP;
10245 }
10246 #endif
10247
10248 /*
10249 * Update the number of bytes used in the VFS' inode. When we replace extents in
10250 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10251 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10252 * always get a correct value.
10253 */
10254 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10255 const u64 add_bytes,
10256 const u64 del_bytes)
10257 {
10258 if (add_bytes == del_bytes)
10259 return;
10260
10261 spin_lock(&inode->lock);
10262 if (del_bytes > 0)
10263 inode_sub_bytes(&inode->vfs_inode, del_bytes);
10264 if (add_bytes > 0)
10265 inode_add_bytes(&inode->vfs_inode, add_bytes);
10266 spin_unlock(&inode->lock);
10267 }
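/*
 * Typical use (illustrative): when a range backed by old_len bytes of
 * extents is replaced with new_len bytes, e.g. by clone or fallocate's
 * zero range, the caller does one atomic update:
 *
 *	btrfs_update_inode_bytes(inode, new_len, old_len);
 */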
10268
10269 /*
10270 * Verify that there are no ordered extents for a given file range.
10271 *
10272 * @inode: The target inode.
10273 * @start: Start offset of the file range, should be sector size aligned.
10274 * @end: End offset (inclusive) of the file range, its value +1 should be
10275 * sector size aligned.
10276 *
10277 * This should typically be used in cases where we have locked the inode's
10278 * VFS lock in exclusive mode, locked the inode's i_mmap_lock in exclusive
10279 * mode, flushed all delalloc in the range, waited for all ordered extents
10280 * in the range to complete, and finally locked the file range in the
10281 * inode's io_tree.
10282 */
10283 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10284 {
10285 struct btrfs_root *root = inode->root;
10286 struct btrfs_ordered_extent *ordered;
10287
10288 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10289 return;
10290
10291 ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10292 if (ordered) {
10293 btrfs_err(root->fs_info,
10294 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10295 start, end, btrfs_ino(inode), btrfs_root_id(root),
10296 ordered->file_offset,
10297 ordered->file_offset + ordered->num_bytes - 1);
10298 btrfs_put_ordered_extent(ordered);
10299 }
10300
10301 ASSERT(ordered == NULL);
10302 }
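/*
 * Illustrative call sequence matching the expectations described above:
 *
 *	... lock the inode's VFS lock and i_mmap_lock in exclusive mode ...
 *	... flush delalloc and wait for ordered extents in [start, end] ...
 *	lock_extent(&inode->io_tree, start, end, &cached_state);
 *	btrfs_assert_inode_range_clean(inode, start, end);
 */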
10303
10304 /*
10305 * Find the first inode with a minimum number.
10306 *
10307 * @root: The root to search in.
10308 * @min_ino: The minimum inode number.
10309 *
10310 * Find the first inode in the @root with a number >= @min_ino and return it.
10311 * Returns NULL if no such inode is found.
10312 */
10313 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10314 {
10315 struct btrfs_inode *inode;
10316 unsigned long from = min_ino;
10317
10318 xa_lock(&root->inodes);
10319 while (true) {
10320 inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10321 if (!inode)
10322 break;
10323 if (igrab(&inode->vfs_inode))
10324 break;
10325
10326 from = btrfs_ino(inode) + 1;
10327 cond_resched_lock(&root->inodes.xa_lock);
10328 }
10329 xa_unlock(&root->inodes);
10330
10331 return inode;
10332 }
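/*
 * Callers typically iterate all inodes of a root with it (illustrative
 * sketch); the reference taken by igrab() must be dropped with iput():
 *
 *	u64 min_ino = 0;
 *	struct btrfs_inode *inode;
 *
 *	while ((inode = btrfs_find_first_inode(root, min_ino))) {
 *		min_ino = btrfs_ino(inode) + 1;
 *		... use inode ...
 *		iput(&inode->vfs_inode);
 *	}
 */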
10333
10334 static const struct inode_operations btrfs_dir_inode_operations = {
10335 .getattr = btrfs_getattr,
10336 .lookup = btrfs_lookup,
10337 .create = btrfs_create,
10338 .unlink = btrfs_unlink,
10339 .link = btrfs_link,
10340 .mkdir = btrfs_mkdir,
10341 .rmdir = btrfs_rmdir,
10342 .rename = btrfs_rename2,
10343 .symlink = btrfs_symlink,
10344 .setattr = btrfs_setattr,
10345 .mknod = btrfs_mknod,
10346 .listxattr = btrfs_listxattr,
10347 .permission = btrfs_permission,
10348 .get_inode_acl = btrfs_get_acl,
10349 .set_acl = btrfs_set_acl,
10350 .update_time = btrfs_update_time,
10351 .tmpfile = btrfs_tmpfile,
10352 .fileattr_get = btrfs_fileattr_get,
10353 .fileattr_set = btrfs_fileattr_set,
10354 };
10355
10356 static const struct file_operations btrfs_dir_file_operations = {
10357 .llseek = btrfs_dir_llseek,
10358 .read = generic_read_dir,
10359 .iterate_shared = btrfs_real_readdir,
10360 .open = btrfs_opendir,
10361 .unlocked_ioctl = btrfs_ioctl,
10362 #ifdef CONFIG_COMPAT
10363 .compat_ioctl = btrfs_compat_ioctl,
10364 #endif
10365 .release = btrfs_release_file,
10366 .fsync = btrfs_sync_file,
10367 };
10368
10369 /*
10370 * btrfs doesn't support the bmap operation because swapfiles
10371 * use bmap to make a mapping of extents in the file. They assume
10372 * these extents won't change over the life of the file and they
10373 * use the bmap result to do IO directly to the drive.
10374 *
10375 * A btrfs bmap call would return logical addresses that aren't
10376 * suitable for IO and that also change frequently as COW
10377 * operations happen. So, swapfile + btrfs == corruption.
10378 *
10379 * For now we're avoiding this by dropping bmap.
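 *
 * Swapfiles are instead supported through the ->swap_activate address space
 * operation (btrfs_swap_activate() above), which pins the file's extents and
 * hands the MM layer a direct physical mapping to do IO against.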
10380 */
10381 static const struct address_space_operations btrfs_aops = {
10382 .read_folio = btrfs_read_folio,
10383 .writepages = btrfs_writepages,
10384 .readahead = btrfs_readahead,
10385 .invalidate_folio = btrfs_invalidate_folio,
10386 .launder_folio = btrfs_launder_folio,
10387 .release_folio = btrfs_release_folio,
10388 .migrate_folio = btrfs_migrate_folio,
10389 .dirty_folio = filemap_dirty_folio,
10390 .error_remove_folio = generic_error_remove_folio,
10391 .swap_activate = btrfs_swap_activate,
10392 .swap_deactivate = btrfs_swap_deactivate,
10393 };
10394
10395 static const struct inode_operations btrfs_file_inode_operations = {
10396 .getattr = btrfs_getattr,
10397 .setattr = btrfs_setattr,
10398 .listxattr = btrfs_listxattr,
10399 .permission = btrfs_permission,
10400 .fiemap = btrfs_fiemap,
10401 .get_inode_acl = btrfs_get_acl,
10402 .set_acl = btrfs_set_acl,
10403 .update_time = btrfs_update_time,
10404 .fileattr_get = btrfs_fileattr_get,
10405 .fileattr_set = btrfs_fileattr_set,
10406 };
10407 static const struct inode_operations btrfs_special_inode_operations = {
10408 .getattr = btrfs_getattr,
10409 .setattr = btrfs_setattr,
10410 .permission = btrfs_permission,
10411 .listxattr = btrfs_listxattr,
10412 .get_inode_acl = btrfs_get_acl,
10413 .set_acl = btrfs_set_acl,
10414 .update_time = btrfs_update_time,
10415 };
10416 static const struct inode_operations btrfs_symlink_inode_operations = {
10417 .get_link = page_get_link,
10418 .getattr = btrfs_getattr,
10419 .setattr = btrfs_setattr,
10420 .permission = btrfs_permission,
10421 .listxattr = btrfs_listxattr,
10422 .update_time = btrfs_update_time,
10423 };
10424
10425 const struct dentry_operations btrfs_dentry_operations = {
10426 .d_delete = btrfs_dentry_delete,
10427 };
10428