// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/zpl.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/zfs_refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>
#include <linux/mm_compat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
 * called with the rangelock_t's rl_lock held, which avoids races.
 */
static void
zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
{
	znode_t *zp = arg;

	/*
	 * If in append mode, convert to writer and lock starting at the
	 * current end of file.
	 */
	if (new->lr_type == RL_APPEND) {
		new->lr_offset = zp->z_size;
		new->lr_type = RL_WRITER;
	}

	/*
	 * If we might grow the block size then lock the whole file range.
	 * NB: this test should match the check in zfs_grow_blocksize().
	 */
	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
	if (zp->z_size <= zp->z_blksz && end_size > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
		new->lr_offset = 0;
		new->lr_length = UINT64_MAX;
	}
}
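
/*
 * Illustrative example (editorial sketch, not upstream code): with
 * z_size == 100 and z_blksz == 128, an RL_APPEND request for 50 bytes
 * is converted by the callback above into an RL_WRITER lock at offset 0
 * of length UINT64_MAX, since the write would cross the end of the
 * single existing block and may therefore grow the block size.  An
 * RL_APPEND request that stays within the current block size is merely
 * converted to RL_WRITER starting at z_size.
 */
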
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_t *zp = buf;

	inode_init_once(ZTOI(zp));
	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_xattr_cached = NULL;
	zp->z_xattr_parent = 0;

	return (0);
}

static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_t *zp = buf;

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rw_destroy(&zp->z_xattr_lock);
	zfs_rangelock_fini(&zp->z_rangelock);

	ASSERT0P(zp->z_dirlocks);
	ASSERT0P(zp->z_acl_cached);
	ASSERT0P(zp->z_xattr_cached);
}

static int
zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_hold_t *zh = buf;

	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
	zh->zh_refcount = 0;

	return (0);
}

static void
zfs_znode_hold_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_hold_t *zh = buf;

	mutex_destroy(&zh->zh_lock);
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache.  The KMC_SLAB hint is used so the cache is
	 * backed by kmalloc() on the Linux slab, which is required for
	 * wait_on_bit() operations on the embedded inode to work properly.
	 */
	ASSERT0P(znode_cache);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL,
	    KMC_SLAB | KMC_RECLAIMABLE);

	ASSERT0P(znode_hold_cache);
	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{
	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;

	if (znode_hold_cache)
		kmem_cache_destroy(znode_hold_cache);
	znode_hold_cache = NULL;
}

/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed.  This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist.  Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * in to the correct AVL tree and finally the per-object lock is held.  In
 * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks.  This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks which just
 *    happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and
 *    never shared.  This minimizes lock contention without creating a
 *    large number of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 */
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

static boolean_t __maybe_unused
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		zh->zh_obj = obj;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	zh->zh_refcount++;
	ASSERT3S(zh->zh_refcount, >, 0);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	mutex_enter(&zh->zh_lock);

	return (zh);
}

void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	ASSERT3S(zh->zh_refcount, >, 0);
	if (--zh->zh_refcount == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}
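
/*
 * Typical usage of the hold interface (illustrative sketch only):
 *
 *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	... create, look up, or destroy the object's znode / SA handle ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 *
 * See zfs_mknode(), zfs_zget(), zfs_rezget(), zfs_znode_delete() and
 * zfs_zinactive() below for the real call sites.
 */
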
dev_t
zfs_cmpldev(uint64_t dev)
{
	return (dev);
}

static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT0P(zp->z_sa_hdl);
	ASSERT0P(zp->z_acl_cached);
	if (sa_hdl == NULL) {
		VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

void
zfs_inode_free(struct inode *ip)
{
	kmem_cache_free(znode_cache, ITOZ(ip));
}

/*
 * Called in multiple places when an inode should be destroyed.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
#ifndef HAVE_SOPS_FREE_INODE
	/*
	 * The inode needs to be freed in an RCU callback.  If we have
	 * super_operations->free_inode, the Linux kernel will do call_rcu
	 * for us.  But if we don't have it, since call_rcu is a GPL-only
	 * symbol, we can only free synchronously and accept the risk.
	 */
	zfs_inode_free(ip);
#endif
}

static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
		ip->i_op = &zpl_dir_inode_operations;
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is only stored in the SA for device files.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
		    &rdev, sizeof (rdev));
		zfs_fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

static void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
}

/*
 * Update the embedded inode given the znode.
 */
void
zfs_znode_update_vfs(znode_t *zp)
{
	struct inode	*ip;
	uint32_t	blksize;
	u_longlong_t	i_blocks;

	ASSERT(zp != NULL);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_mode = zp->z_mode;
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}

/*
 * Construct a znode+inode and initialize.
 *
 * This does not call dmu_set_user(); that is up to the caller to do,
 * in case you don't want to return the znode.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT0P(zp->z_dirlocks);
	ASSERT0P(zp->z_acl_cached);
	ASSERT0P(zp->z_xattr_cached);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ip, tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block.  This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed.  This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY0(insert_inode_locked(ip));

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}

/*
 * Safely mark an inode dirty.  Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is of O_TMPFILE
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- ACL related attributes
 *
 *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	projid = ZFS_DEFAULT_PROJID;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	dmu_buf_t	*db;
	inode_timespec_t now;
	uint64_t	gen, obj;
	int		bonuslen;
	int		dnodesize;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t	*zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
		/*
		 * With the ZFS_PROJID flag, we can easily know whether there
		 * is a project ID stored on disk or not.  See
		 * zpl_get_file_info().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit project ID from parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * No execs denied will be determined when zfs_mode_compute() is
	 * called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file.
	 *
	 * The order for DMU_OT_ZNODE is critical since it needs to be
	 * constructed in the old znode_phys_t format.  Don't change this
	 * ordering.
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs),
		    NULL, &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx));

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security().  Since
		 * the existing code is written such that zfs_mknode() cannot
		 * fail, retry until sufficient memory has been reclaimed.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type,
			    sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}
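
/*
 * Illustrative sketch of a caller's create path (simplified and
 * hypothetical; the real callers, e.g. zfs_create(), also handle
 * directory entries, logging, and error unwinding):
 *
 *	tx = dmu_tx_create(zfsvfs->z_os);
 *	... dmu_tx_hold_*() for the directory and the new object ...
 *	if ((error = dmu_tx_assign(tx, DMU_TX_WAIT)) == 0) {
 *		zfs_mknode(dzp, &va, tx, cr, 0, &zp, &acl_ids);
 *		... add the directory entry, log the create ...
 *		dmu_tx_commit(tx);
 *	}
 */
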
/*
 * Update in-core attributes.  It is assumed the caller will be doing an
 * sa_bulk_update to push the changes out.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
	xoptattr_t *xoap;
	boolean_t update_inode = B_FALSE;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		uint64_t times[2];
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
		    &times, sizeof (times), tx);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;
	znode_hold_t	*zh;
	int err;
	sa_handle_t	*hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */
		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If zp->z_unlinked is set, the znode is already marked
		 * for deletion and should not be discovered.  Check this
		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
		 *
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete.  The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback.  The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			if (zp->z_unlinked)
				err = SET_ERROR(ENOENT);
			else
				err = SET_ERROR(EAGAIN);
		} else {
			*zpp = zp;
			err = 0;
		}

		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);

		if (err == EAGAIN) {
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		return (err);
	}

	/*
	 * Not found; create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress.  This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}
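
/*
 * Illustrative usage (editorial sketch): a caller holds the returned
 * znode via the embedded inode reference taken by igrab()/new_inode()
 * and drops it with zrele() when done, e.g.
 *
 *	if (zfs_zget(zfsvfs, obj, &zp) == 0) {
 *		... use zp ...
 *		zrele(zp);
 *	}
 */
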
int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[11];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ts;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	znode_hold_t *zh;

	/*
	 * Skip ctldir znodes, otherwise they will always get invalidated.
	 * That causes odd behaviour for mounted snapdirs: on Linux >= 3.18,
	 * d_invalidate will detach the mountpoint and prevent anyone from
	 * automounting it again as long as someone is still using the
	 * detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT0P(zp->z_sa_hdl);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
		    &projid, 8);
		if (err != 0 && err != ENOENT) {
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return (SET_ERROR(err));
		}
	}

	zp->z_projid = projid;
	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&tmp_ts, atime);
	zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, mtime);
	zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&tmp_ts, ctime);
	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = B_FALSE;
	zfs_znode_update_vfs(zp);

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive().  The file will be removed either when it is
	 * removed on the send side and the next incremental stream is
	 * received or when the unlinked set gets processed.
	 */
	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY0(dmu_object_free(os, acl_obj, tx));
	}
	VERIFY0(dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only.  That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed.  The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
			mutex_exit(&zp->z_lock);
			zfs_znode_hold_exit(zfsvfs, zh);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Determine whether the znode's atime must be updated.  The logic mostly
 * duplicates the Linux kernel's relatime_need_update() functionality.
 * This function is only called if the underlying filesystem actually has
 * atime updates enabled.
 */
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
	inode_timespec_t now, tmp_atime, tmp_ts;

	gethrestime(&now);
	tmp_atime = zpl_inode_get_atime(ip);
	/*
	 * In relatime mode, only update the atime if the previous atime
	 * is earlier than either the ctime or mtime or if at least a day
	 * has passed since the last update of atime.
	 */
	tmp_ts = zpl_inode_get_mtime(ip);
	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
		return (B_TRUE);

	tmp_ts = zpl_inode_get_ctime(ip);
	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
		return (B_TRUE);

	if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
		return (B_TRUE);

	return (B_FALSE);
}
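
/*
 * Worked example (illustrative): with atime == 1000, mtime == 900,
 * ctime == 950 and now == 1500 (all in seconds), neither mtime nor ctime
 * is at or after atime, and less than a day has passed, so the function
 * returns B_FALSE and the atime update is skipped.  Once now reaches
 * atime + 86400, or the file is modified so mtime/ctime >= atime, it
 * returns B_TRUE.
 */
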
/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 *	Note: We don't update atime here, because we rely on Linux VFS to do
 *	atime updating.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
{
	inode_timespec_t now, tmp_ts;

	gethrestime(&now);

	zp->z_seq++;

	if (flag & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		ZFS_TIME_DECODE(&tmp_ts, mtime);
		zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
		if (ZTOZSB(zp)->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & ATTR_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		ZFS_TIME_DECODE(&tmp_ts, ctime);
		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
		if (ZTOZSB(zp)->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
	    size, 0, tx);

	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file
 *
 * RETURN:	0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property.  Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	zfs_rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t	off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		memset(pb + off, 0, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 * RETURN:	0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zfs_locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries.  This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zn_has_cached_data(zp, off, off + len - 1)) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 * RETURN:	0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		zfs_rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));

	dmu_tx_commit(tx);
	zfs_rangelock_exit(lr);

	return (0);
}
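
/*
 * Summary of how zfs_freesp() below dispatches to the helpers above
 * (illustrative, derived from the code that follows):
 *
 *	off > z_size	-> zfs_extend(off + len)
 *	len == 0	-> zfs_trunc(off)
 *	otherwise	-> zfs_free_range(off, len), then
 *			   zfs_extend(off + len) if the range extends
 *			   past the current end of file
 */
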
/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 * RETURN:	0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT0(error);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations.  For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}

void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	struct super_block *sb;
	zfsvfs_t	*zfsvfs;
	uint64_t	moid, obj, sa_obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		size;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT0(error);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		const char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY0(nvpair_value_uint64(elem, &val));
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT0(error);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
	ASSERT0(error);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT0(error);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT0(error);

	/*
	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = B_FALSE;
	rootzp->z_atime_dirty = B_FALSE;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT0(error);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT,
		    NULL);
	}

	VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids, zfs_init_idmap));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT0(error);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
	"(debug - leaks space into the unlinked set)");
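
/*
 * Both module parameters above are registered with mode 0644 and can be
 * set at module load time or, for example (illustrative), adjusted at
 * runtime through the standard sysfs interface:
 *
 *	echo 1 > /sys/module/zfs/parameters/zfs_unlink_suspend_progress
 */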