1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 27 * Copyright 2017 Nexenta Systems, Inc. 28 * Copyright (c) 2025, Klara, Inc. 29 */ 30 31 /* Portions Copyright 2007 Jeremy Teo */ 32 /* Portions Copyright 2010 Robert Milkowski */ 33 34 35 #include <sys/types.h> 36 #include <sys/param.h> 37 #include <sys/time.h> 38 #include <sys/sysmacros.h> 39 #include <sys/vfs.h> 40 #include <sys/file.h> 41 #include <sys/stat.h> 42 #include <sys/kmem.h> 43 #include <sys/taskq.h> 44 #include <sys/uio.h> 45 #include <sys/vmsystm.h> 46 #include <sys/atomic.h> 47 #include <sys/pathname.h> 48 #include <sys/cmn_err.h> 49 #include <sys/errno.h> 50 #include <sys/zfs_dir.h> 51 #include <sys/zfs_acl.h> 52 #include <sys/zfs_ioctl.h> 53 #include <sys/fs/zfs.h> 54 #include <sys/dmu.h> 55 #include <sys/dmu_objset.h> 56 #include <sys/spa.h> 57 #include <sys/txg.h> 58 #include <sys/dbuf.h> 59 #include <sys/zap.h> 60 #include <sys/sa.h> 61 #include <sys/policy.h> 62 #include <sys/sunddi.h> 63 #include <sys/sid.h> 64 #include <sys/zfs_ctldir.h> 65 #include <sys/zfs_fuid.h> 66 #include <sys/zfs_quota.h> 67 #include <sys/zfs_sa.h> 68 #include <sys/zfs_vnops.h> 69 #include <sys/zfs_rlock.h> 70 #include <sys/cred.h> 71 #include <sys/zpl.h> 72 #include <sys/zil.h> 73 #include <sys/sa_impl.h> 74 #include <linux/mm_compat.h> 75 76 /* 77 * Programming rules. 78 * 79 * Each vnode op performs some logical unit of work. To do this, the ZPL must 80 * properly lock its in-core state, create a DMU transaction, do the work, 81 * record this work in the intent log (ZIL), commit the DMU transaction, 82 * and wait for the intent log to commit if it is a synchronous operation. 83 * Moreover, the vnode ops must work in both normal and log replay context. 84 * The ordering of events is important to avoid deadlocks and references 85 * to freed memory. The example below illustrates the following Big Rules: 86 * 87 * (1) A check must be made in each zfs thread for a mounted file system. 88 * This is done avoiding races using zfs_enter(zfsvfs). 89 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes 90 * must be checked with zfs_verify_zp(zp). Both of these macros 91 * can return EIO from the calling function. 92 * 93 * (2) zrele() should always be the last thing except for zil_commit() (if 94 * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the 95 * last reference, the vnode/znode can be freed, so the zp may point to 96 * freed memory. Second, the last reference will call zfs_zinactive(), 97 * which may induce a lot of work -- pushing cached pages (which acquires 98 * range locks) and syncing out cached atime changes. Third, 99 * zfs_zinactive() may require a new tx, which could deadlock the system 100 * if you were already holding one. This deadlock occurs because the tx 101 * currently being operated on prevents a txg from syncing, which 102 * prevents the new tx from progressing, resulting in a deadlock. If you 103 * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() 104 * is a synonym for zrele(). 105 * 106 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 107 * as they can span dmu_tx_assign() calls. 108 * 109 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to 110 * dmu_tx_assign(). This is critical because we don't want to block 111 * while holding locks. 112 * 113 * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT. 114 * This reduces lock contention and CPU usage when we must wait (note 115 * that if throughput is constrained by the storage, nearly every 116 * transaction must wait). 117 * 118 * Note, in particular, that if a lock is sometimes acquired before 119 * the tx assigns, and sometimes after (e.g. z_lock), then failing 120 * to use a non-blocking assign can deadlock the system. The scenario: 121 * 122 * Thread A has grabbed a lock before calling dmu_tx_assign(). 123 * Thread B is in an already-assigned tx, and blocks for this lock. 124 * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in 125 * txg_wait_open() forever, because the previous txg can't quiesce 126 * until B's tx commits. 127 * 128 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is 129 * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try 130 * again. On subsequent calls to dmu_tx_assign(), pass 131 * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that 132 * this operation has already called dmu_tx_wait(). This will ensure 133 * that we don't retry forever, waiting a short bit each time. 134 * 135 * (5) If the operation succeeded, generate the intent log entry for it 136 * before dropping locks. This ensures that the ordering of events 137 * in the intent log matches the order in which they actually occurred. 138 * During ZIL replay the zfs_log_* functions will update the sequence 139 * number to indicate the zil transaction has replayed. 140 * 141 * (6) At the end of each vnode op, the DMU tx must always commit, 142 * regardless of whether there were any errors. 143 * 144 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 145 * to ensure that synchronous semantics are provided when necessary. 146 * 147 * In general, this is how things should be ordered in each vnode op: 148 * 149 * zfs_enter(zfsvfs); // exit if unmounted 150 * top: 151 * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) 152 * rw_enter(...); // grab any other locks you need 153 * tx = dmu_tx_create(...); // get DMU tx 154 * dmu_tx_hold_*(); // hold each object you might modify 155 * error = dmu_tx_assign(tx, 156 * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 157 * if (error) { 158 * rw_exit(...); // drop locks 159 * zfs_dirent_unlock(dl); // unlock directory entry 160 * zrele(...); // release held znodes 161 * if (error == ERESTART) { 162 * waited = B_TRUE; 163 * dmu_tx_wait(tx); 164 * dmu_tx_abort(tx); 165 * goto top; 166 * } 167 * dmu_tx_abort(tx); // abort DMU tx 168 * zfs_exit(zfsvfs); // finished in zfs 169 * return (error); // really out of space 170 * } 171 * error = do_real_work(); // do whatever this VOP does 172 * if (error == 0) 173 * zfs_log_*(...); // on success, make ZIL entry 174 * dmu_tx_commit(tx); // commit DMU tx -- error or not 175 * rw_exit(...); // drop locks 176 * zfs_dirent_unlock(dl); // unlock directory entry 177 * zrele(...); // release held znodes 178 * zil_commit(zilog, foid); // synchronous when necessary 179 * zfs_exit(zfsvfs); // finished in zfs 180 * return (error); // done, report error 181 */ 182 int 183 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) 184 { 185 (void) cr; 186 znode_t *zp = ITOZ(ip); 187 zfsvfs_t *zfsvfs = ITOZSB(ip); 188 int error; 189 190 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 191 return (error); 192 193 /* Honor ZFS_APPENDONLY file attribute */ 194 if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && 195 ((flag & O_APPEND) == 0)) { 196 zfs_exit(zfsvfs, FTAG); 197 return (SET_ERROR(EPERM)); 198 } 199 200 /* 201 * Keep a count of the synchronous opens in the znode. On first 202 * synchronous open we must convert all previous async transactions 203 * into sync to keep correct ordering. 204 */ 205 if (flag & O_SYNC) { 206 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) 207 zil_async_to_sync(zfsvfs->z_log, zp->z_id); 208 } 209 210 zfs_exit(zfsvfs, FTAG); 211 return (0); 212 } 213 214 int 215 zfs_close(struct inode *ip, int flag, cred_t *cr) 216 { 217 (void) cr; 218 znode_t *zp = ITOZ(ip); 219 zfsvfs_t *zfsvfs = ITOZSB(ip); 220 int error; 221 222 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 223 return (error); 224 225 /* Decrement the synchronous opens in the znode */ 226 if (flag & O_SYNC) 227 atomic_dec_32(&zp->z_sync_cnt); 228 229 zfs_exit(zfsvfs, FTAG); 230 return (0); 231 } 232 233 #if defined(_KERNEL) 234 235 static int zfs_fillpage(struct inode *ip, struct page *pp); 236 237 /* 238 * When a file is memory mapped, we must keep the IO data synchronized 239 * between the DMU cache and the memory mapped pages. Update all mapped 240 * pages with the contents of the coresponding dmu buffer. 241 */ 242 void 243 update_pages(znode_t *zp, int64_t start, int len, objset_t *os) 244 { 245 struct address_space *mp = ZTOI(zp)->i_mapping; 246 int64_t off = start & (PAGE_SIZE - 1); 247 248 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 249 uint64_t nbytes = MIN(PAGE_SIZE - off, len); 250 251 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 252 if (pp) { 253 if (mapping_writably_mapped(mp)) 254 flush_dcache_page(pp); 255 256 void *pb = kmap(pp); 257 int error = dmu_read(os, zp->z_id, start + off, 258 nbytes, pb + off, DMU_READ_PREFETCH); 259 kunmap(pp); 260 261 if (error) { 262 SetPageError(pp); 263 ClearPageUptodate(pp); 264 } else { 265 ClearPageError(pp); 266 SetPageUptodate(pp); 267 268 if (mapping_writably_mapped(mp)) 269 flush_dcache_page(pp); 270 271 mark_page_accessed(pp); 272 } 273 274 unlock_page(pp); 275 put_page(pp); 276 } 277 278 len -= nbytes; 279 off = 0; 280 } 281 } 282 283 /* 284 * When a file is memory mapped, we must keep the I/O data synchronized 285 * between the DMU cache and the memory mapped pages. Preferentially read 286 * from memory mapped pages, otherwise fallback to reading through the dmu. 287 */ 288 int 289 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) 290 { 291 struct inode *ip = ZTOI(zp); 292 struct address_space *mp = ip->i_mapping; 293 int64_t start = uio->uio_loffset; 294 int64_t off = start & (PAGE_SIZE - 1); 295 int len = nbytes; 296 int error = 0; 297 298 for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { 299 uint64_t bytes = MIN(PAGE_SIZE - off, len); 300 301 struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); 302 if (pp) { 303 304 /* 305 * If filemap_fault() retries there exists a window 306 * where the page will be unlocked and not up to date. 307 * In this case we must try and fill the page. 308 */ 309 if (unlikely(!PageUptodate(pp))) { 310 error = zfs_fillpage(ip, pp); 311 if (error) { 312 unlock_page(pp); 313 put_page(pp); 314 return (error); 315 } 316 } 317 318 ASSERT(PageUptodate(pp) || PageDirty(pp)); 319 320 unlock_page(pp); 321 322 void *pb = kmap(pp); 323 error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); 324 kunmap(pp); 325 326 if (mapping_writably_mapped(mp)) 327 flush_dcache_page(pp); 328 329 mark_page_accessed(pp); 330 put_page(pp); 331 } else { 332 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 333 uio, bytes, DMU_READ_PREFETCH); 334 } 335 336 len -= bytes; 337 off = 0; 338 339 if (error) 340 break; 341 } 342 343 return (error); 344 } 345 #endif /* _KERNEL */ 346 347 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; 348 349 /* 350 * Write the bytes to a file. 351 * 352 * IN: zp - znode of file to be written to 353 * data - bytes to write 354 * len - number of bytes to write 355 * pos - offset to start writing at 356 * 357 * OUT: resid - remaining bytes to write 358 * 359 * RETURN: 0 if success 360 * positive error code if failure. EIO is returned 361 * for a short write when residp isn't provided. 362 * 363 * Timestamps: 364 * zp - ctime|mtime updated if byte count > 0 365 */ 366 int 367 zfs_write_simple(znode_t *zp, const void *data, size_t len, 368 loff_t pos, size_t *residp) 369 { 370 fstrans_cookie_t cookie; 371 int error; 372 373 struct iovec iov; 374 iov.iov_base = (void *)data; 375 iov.iov_len = len; 376 377 zfs_uio_t uio; 378 zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); 379 380 cookie = spl_fstrans_mark(); 381 error = zfs_write(zp, &uio, 0, kcred); 382 spl_fstrans_unmark(cookie); 383 384 if (error == 0) { 385 if (residp != NULL) 386 *residp = zfs_uio_resid(&uio); 387 else if (zfs_uio_resid(&uio) != 0) 388 error = SET_ERROR(EIO); 389 } 390 391 return (error); 392 } 393 394 static void 395 zfs_rele_async_task(void *arg) 396 { 397 iput(arg); 398 } 399 400 void 401 zfs_zrele_async(znode_t *zp) 402 { 403 struct inode *ip = ZTOI(zp); 404 objset_t *os = ITOZSB(ip)->z_os; 405 406 ASSERT(atomic_read(&ip->i_count) > 0); 407 ASSERT(os != NULL); 408 409 /* 410 * If decrementing the count would put us at 0, we can't do it inline 411 * here, because that would be synchronous. Instead, dispatch an iput 412 * to run later. 413 * 414 * For more information on the dangers of a synchronous iput, see the 415 * header comment of this file. 416 */ 417 if (!atomic_add_unless(&ip->i_count, -1, 1)) { 418 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), 419 zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); 420 } 421 } 422 423 424 /* 425 * Lookup an entry in a directory, or an extended attribute directory. 426 * If it exists, return a held inode reference for it. 427 * 428 * IN: zdp - znode of directory to search. 429 * nm - name of entry to lookup. 430 * flags - LOOKUP_XATTR set if looking for an attribute. 431 * cr - credentials of caller. 432 * direntflags - directory lookup flags 433 * realpnp - returned pathname. 434 * 435 * OUT: zpp - znode of located entry, NULL if not found. 436 * 437 * RETURN: 0 on success, error code on failure. 438 * 439 * Timestamps: 440 * NA 441 */ 442 int 443 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, 444 int *direntflags, pathname_t *realpnp) 445 { 446 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 447 int error = 0; 448 449 /* 450 * Fast path lookup, however we must skip DNLC lookup 451 * for case folding or normalizing lookups because the 452 * DNLC code only stores the passed in name. This means 453 * creating 'a' and removing 'A' on a case insensitive 454 * file system would work, but DNLC still thinks 'a' 455 * exists and won't let you create it again on the next 456 * pass through fast path. 457 */ 458 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { 459 460 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 461 return (SET_ERROR(ENOTDIR)); 462 } else if (zdp->z_sa_hdl == NULL) { 463 return (SET_ERROR(EIO)); 464 } 465 466 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { 467 error = zfs_fastaccesschk_execute(zdp, cr); 468 if (!error) { 469 *zpp = zdp; 470 zhold(*zpp); 471 return (0); 472 } 473 return (error); 474 } 475 } 476 477 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) 478 return (error); 479 480 *zpp = NULL; 481 482 if (flags & LOOKUP_XATTR) { 483 /* 484 * We don't allow recursive attributes.. 485 * Maybe someday we will. 486 */ 487 if (zdp->z_pflags & ZFS_XATTR) { 488 zfs_exit(zfsvfs, FTAG); 489 return (SET_ERROR(EINVAL)); 490 } 491 492 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { 493 zfs_exit(zfsvfs, FTAG); 494 return (error); 495 } 496 497 /* 498 * Do we have permission to get into attribute directory? 499 */ 500 501 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, 502 B_TRUE, cr, zfs_init_idmap))) { 503 zrele(*zpp); 504 *zpp = NULL; 505 } 506 507 zfs_exit(zfsvfs, FTAG); 508 return (error); 509 } 510 511 if (!S_ISDIR(ZTOI(zdp)->i_mode)) { 512 zfs_exit(zfsvfs, FTAG); 513 return (SET_ERROR(ENOTDIR)); 514 } 515 516 /* 517 * Check accessibility of directory. 518 */ 519 520 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, 521 zfs_init_idmap))) { 522 zfs_exit(zfsvfs, FTAG); 523 return (error); 524 } 525 526 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 527 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 528 zfs_exit(zfsvfs, FTAG); 529 return (SET_ERROR(EILSEQ)); 530 } 531 532 error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); 533 if ((error == 0) && (*zpp)) 534 zfs_znode_update_vfs(*zpp); 535 536 zfs_exit(zfsvfs, FTAG); 537 return (error); 538 } 539 540 /* 541 * Perform a linear search in directory for the name of specific inode. 542 * Note we don't pass in the buffer size of name because it's hardcoded to 543 * NAME_MAX+1(256) in Linux. 544 * 545 * IN: dzp - znode of directory to search. 546 * zp - znode of the target 547 * 548 * OUT: name - dentry name of the target 549 * 550 * RETURN: 0 on success, error code on failure. 551 */ 552 int 553 zfs_get_name(znode_t *dzp, char *name, znode_t *zp) 554 { 555 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 556 int error = 0; 557 558 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 559 return (error); 560 561 if ((error = zfs_verify_zp(zp)) != 0) { 562 zfs_exit(zfsvfs, FTAG); 563 return (error); 564 } 565 566 /* ctldir should have got their name in zfs_vget */ 567 if (dzp->z_is_ctldir || zp->z_is_ctldir) { 568 zfs_exit(zfsvfs, FTAG); 569 return (ENOENT); 570 } 571 572 /* buffer len is hardcoded to 256 in Linux kernel */ 573 error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id, 574 ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN); 575 576 zfs_exit(zfsvfs, FTAG); 577 return (error); 578 } 579 580 /* 581 * Attempt to create a new entry in a directory. If the entry 582 * already exists, truncate the file if permissible, else return 583 * an error. Return the ip of the created or trunc'd file. 584 * 585 * IN: dzp - znode of directory to put new file entry in. 586 * name - name of new file entry. 587 * vap - attributes of new file. 588 * excl - flag indicating exclusive or non-exclusive mode. 589 * mode - mode to open file with. 590 * cr - credentials of caller. 591 * flag - file flag. 592 * vsecp - ACL to be set 593 * mnt_ns - user namespace of the mount 594 * 595 * OUT: zpp - znode of created or trunc'd entry. 596 * 597 * RETURN: 0 on success, error code on failure. 598 * 599 * Timestamps: 600 * dzp - ctime|mtime updated if new entry created 601 * zp - ctime|mtime always, atime if new 602 */ 603 int 604 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, 605 int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, 606 zidmap_t *mnt_ns) 607 { 608 znode_t *zp; 609 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 610 zilog_t *zilog; 611 objset_t *os; 612 zfs_dirlock_t *dl; 613 dmu_tx_t *tx; 614 int error; 615 uid_t uid; 616 gid_t gid; 617 zfs_acl_ids_t acl_ids; 618 boolean_t fuid_dirtied; 619 boolean_t have_acl = B_FALSE; 620 boolean_t waited = B_FALSE; 621 boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 622 623 /* 624 * If we have an ephemeral id, ACL, or XVATTR then 625 * make sure file system is at proper version 626 */ 627 628 gid = crgetgid(cr); 629 uid = crgetuid(cr); 630 631 if (zfsvfs->z_use_fuids == B_FALSE && 632 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 633 return (SET_ERROR(EINVAL)); 634 635 if (name == NULL) 636 return (SET_ERROR(EINVAL)); 637 638 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 639 return (error); 640 os = zfsvfs->z_os; 641 zilog = zfsvfs->z_log; 642 643 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 644 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 645 zfs_exit(zfsvfs, FTAG); 646 return (SET_ERROR(EILSEQ)); 647 } 648 649 if (vap->va_mask & ATTR_XVATTR) { 650 if ((error = secpolicy_xvattr((xvattr_t *)vap, 651 crgetuid(cr), cr, vap->va_mode)) != 0) { 652 zfs_exit(zfsvfs, FTAG); 653 return (error); 654 } 655 } 656 657 top: 658 *zpp = NULL; 659 if (*name == '\0') { 660 /* 661 * Null component name refers to the directory itself. 662 */ 663 zhold(dzp); 664 zp = dzp; 665 dl = NULL; 666 error = 0; 667 } else { 668 /* possible igrab(zp) */ 669 int zflg = 0; 670 671 if (flag & FIGNORECASE) 672 zflg |= ZCILOOK; 673 674 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 675 NULL, NULL); 676 if (error) { 677 if (have_acl) 678 zfs_acl_ids_free(&acl_ids); 679 if (strcmp(name, "..") == 0) 680 error = SET_ERROR(EISDIR); 681 zfs_exit(zfsvfs, FTAG); 682 return (error); 683 } 684 } 685 686 if (zp == NULL) { 687 uint64_t txtype; 688 uint64_t projid = ZFS_DEFAULT_PROJID; 689 690 /* 691 * Create a new file object and update the directory 692 * to reference it. 693 */ 694 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, 695 mnt_ns))) { 696 if (have_acl) 697 zfs_acl_ids_free(&acl_ids); 698 goto out; 699 } 700 701 /* 702 * We only support the creation of regular files in 703 * extended attribute directories. 704 */ 705 706 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { 707 if (have_acl) 708 zfs_acl_ids_free(&acl_ids); 709 error = SET_ERROR(EINVAL); 710 goto out; 711 } 712 713 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 714 cr, vsecp, &acl_ids, mnt_ns)) != 0) 715 goto out; 716 have_acl = B_TRUE; 717 718 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 719 projid = zfs_inherit_projid(dzp); 720 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 721 zfs_acl_ids_free(&acl_ids); 722 error = SET_ERROR(EDQUOT); 723 goto out; 724 } 725 726 tx = dmu_tx_create(os); 727 728 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 729 ZFS_SA_BASE_ATTR_SIZE); 730 731 fuid_dirtied = zfsvfs->z_fuid_dirty; 732 if (fuid_dirtied) 733 zfs_fuid_txhold(zfsvfs, tx); 734 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 735 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 736 if (!zfsvfs->z_use_sa && 737 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 738 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 739 0, acl_ids.z_aclp->z_acl_bytes); 740 } 741 742 error = dmu_tx_assign(tx, 743 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 744 if (error) { 745 zfs_dirent_unlock(dl); 746 if (error == ERESTART) { 747 waited = B_TRUE; 748 dmu_tx_wait(tx); 749 dmu_tx_abort(tx); 750 goto top; 751 } 752 zfs_acl_ids_free(&acl_ids); 753 dmu_tx_abort(tx); 754 zfs_exit(zfsvfs, FTAG); 755 return (error); 756 } 757 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 758 759 error = zfs_link_create(dl, zp, tx, ZNEW); 760 if (error != 0) { 761 /* 762 * Since, we failed to add the directory entry for it, 763 * delete the newly created dnode. 764 */ 765 zfs_znode_delete(zp, tx); 766 remove_inode_hash(ZTOI(zp)); 767 zfs_acl_ids_free(&acl_ids); 768 dmu_tx_commit(tx); 769 goto out; 770 } 771 772 if (fuid_dirtied) 773 zfs_fuid_sync(zfsvfs, tx); 774 775 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 776 if (flag & FIGNORECASE) 777 txtype |= TX_CI; 778 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 779 vsecp, acl_ids.z_fuidp, vap); 780 zfs_acl_ids_free(&acl_ids); 781 dmu_tx_commit(tx); 782 } else { 783 int aflags = (flag & O_APPEND) ? V_APPEND : 0; 784 785 if (have_acl) 786 zfs_acl_ids_free(&acl_ids); 787 788 /* 789 * A directory entry already exists for this name. 790 */ 791 /* 792 * Can't truncate an existing file if in exclusive mode. 793 */ 794 if (excl) { 795 error = SET_ERROR(EEXIST); 796 goto out; 797 } 798 /* 799 * Can't open a directory for writing. 800 */ 801 if (S_ISDIR(ZTOI(zp)->i_mode)) { 802 error = SET_ERROR(EISDIR); 803 goto out; 804 } 805 /* 806 * Verify requested access to file. 807 */ 808 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, 809 mnt_ns))) { 810 goto out; 811 } 812 813 mutex_enter(&dzp->z_lock); 814 dzp->z_seq++; 815 mutex_exit(&dzp->z_lock); 816 817 /* 818 * Truncate regular files if requested. 819 */ 820 if (S_ISREG(ZTOI(zp)->i_mode) && 821 (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { 822 /* we can't hold any locks when calling zfs_freesp() */ 823 if (dl) { 824 zfs_dirent_unlock(dl); 825 dl = NULL; 826 } 827 error = zfs_freesp(zp, 0, 0, mode, TRUE); 828 } 829 } 830 out: 831 832 if (dl) 833 zfs_dirent_unlock(dl); 834 835 if (error) { 836 if (zp) 837 zrele(zp); 838 } else { 839 zfs_znode_update_vfs(dzp); 840 zfs_znode_update_vfs(zp); 841 *zpp = zp; 842 } 843 844 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 845 error = zil_commit(zilog, 0); 846 847 zfs_exit(zfsvfs, FTAG); 848 return (error); 849 } 850 851 int 852 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, 853 int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, 854 zidmap_t *mnt_ns) 855 { 856 (void) excl, (void) mode, (void) flag; 857 znode_t *zp = NULL, *dzp = ITOZ(dip); 858 zfsvfs_t *zfsvfs = ITOZSB(dip); 859 objset_t *os; 860 dmu_tx_t *tx; 861 int error; 862 uid_t uid; 863 gid_t gid; 864 zfs_acl_ids_t acl_ids; 865 uint64_t projid = ZFS_DEFAULT_PROJID; 866 boolean_t fuid_dirtied; 867 boolean_t have_acl = B_FALSE; 868 boolean_t waited = B_FALSE; 869 870 /* 871 * If we have an ephemeral id, ACL, or XVATTR then 872 * make sure file system is at proper version 873 */ 874 875 gid = crgetgid(cr); 876 uid = crgetuid(cr); 877 878 if (zfsvfs->z_use_fuids == B_FALSE && 879 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 880 return (SET_ERROR(EINVAL)); 881 882 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 883 return (error); 884 os = zfsvfs->z_os; 885 886 if (vap->va_mask & ATTR_XVATTR) { 887 if ((error = secpolicy_xvattr((xvattr_t *)vap, 888 crgetuid(cr), cr, vap->va_mode)) != 0) { 889 zfs_exit(zfsvfs, FTAG); 890 return (error); 891 } 892 } 893 894 top: 895 *ipp = NULL; 896 897 /* 898 * Create a new file object and update the directory 899 * to reference it. 900 */ 901 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 902 if (have_acl) 903 zfs_acl_ids_free(&acl_ids); 904 goto out; 905 } 906 907 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, 908 cr, vsecp, &acl_ids, mnt_ns)) != 0) 909 goto out; 910 have_acl = B_TRUE; 911 912 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) 913 projid = zfs_inherit_projid(dzp); 914 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { 915 zfs_acl_ids_free(&acl_ids); 916 error = SET_ERROR(EDQUOT); 917 goto out; 918 } 919 920 tx = dmu_tx_create(os); 921 922 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 923 ZFS_SA_BASE_ATTR_SIZE); 924 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 925 926 fuid_dirtied = zfsvfs->z_fuid_dirty; 927 if (fuid_dirtied) 928 zfs_fuid_txhold(zfsvfs, tx); 929 if (!zfsvfs->z_use_sa && 930 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 931 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 932 0, acl_ids.z_aclp->z_acl_bytes); 933 } 934 error = dmu_tx_assign(tx, 935 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 936 if (error) { 937 if (error == ERESTART) { 938 waited = B_TRUE; 939 dmu_tx_wait(tx); 940 dmu_tx_abort(tx); 941 goto top; 942 } 943 zfs_acl_ids_free(&acl_ids); 944 dmu_tx_abort(tx); 945 zfs_exit(zfsvfs, FTAG); 946 return (error); 947 } 948 zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); 949 950 if (fuid_dirtied) 951 zfs_fuid_sync(zfsvfs, tx); 952 953 /* Add to unlinked set */ 954 zp->z_unlinked = B_TRUE; 955 zfs_unlinked_add(zp, tx); 956 zfs_acl_ids_free(&acl_ids); 957 dmu_tx_commit(tx); 958 out: 959 960 if (error) { 961 if (zp) 962 zrele(zp); 963 } else { 964 zfs_znode_update_vfs(dzp); 965 zfs_znode_update_vfs(zp); 966 *ipp = ZTOI(zp); 967 } 968 969 zfs_exit(zfsvfs, FTAG); 970 return (error); 971 } 972 973 /* 974 * Remove an entry from a directory. 975 * 976 * IN: dzp - znode of directory to remove entry from. 977 * name - name of entry to remove. 978 * cr - credentials of caller. 979 * flags - case flags. 980 * 981 * RETURN: 0 if success 982 * error code if failure 983 * 984 * Timestamps: 985 * dzp - ctime|mtime 986 * ip - ctime (if nlink > 0) 987 */ 988 989 static uint64_t null_xattr = 0; 990 991 int 992 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) 993 { 994 znode_t *zp; 995 znode_t *xzp; 996 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 997 zilog_t *zilog; 998 uint64_t acl_obj, xattr_obj; 999 uint64_t xattr_obj_unlinked = 0; 1000 uint64_t obj = 0; 1001 uint64_t links; 1002 zfs_dirlock_t *dl; 1003 dmu_tx_t *tx; 1004 boolean_t may_delete_now, delete_now = FALSE; 1005 boolean_t unlinked, toobig = FALSE; 1006 uint64_t txtype; 1007 pathname_t *realnmp = NULL; 1008 pathname_t realnm; 1009 int error; 1010 int zflg = ZEXISTS; 1011 boolean_t waited = B_FALSE; 1012 1013 if (name == NULL) 1014 return (SET_ERROR(EINVAL)); 1015 1016 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1017 return (error); 1018 zilog = zfsvfs->z_log; 1019 1020 if (flags & FIGNORECASE) { 1021 zflg |= ZCILOOK; 1022 pn_alloc(&realnm); 1023 realnmp = &realnm; 1024 } 1025 1026 top: 1027 xattr_obj = 0; 1028 xzp = NULL; 1029 /* 1030 * Attempt to lock directory; fail if entry doesn't exist. 1031 */ 1032 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1033 NULL, realnmp))) { 1034 if (realnmp) 1035 pn_free(realnmp); 1036 zfs_exit(zfsvfs, FTAG); 1037 return (error); 1038 } 1039 1040 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 1041 goto out; 1042 } 1043 1044 /* 1045 * Need to use rmdir for removing directories. 1046 */ 1047 if (S_ISDIR(ZTOI(zp)->i_mode)) { 1048 error = SET_ERROR(EPERM); 1049 goto out; 1050 } 1051 1052 mutex_enter(&zp->z_lock); 1053 may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && 1054 !zn_has_cached_data(zp, 0, LLONG_MAX); 1055 mutex_exit(&zp->z_lock); 1056 1057 /* 1058 * We may delete the znode now, or we may put it in the unlinked set; 1059 * it depends on whether we're the last link, and on whether there are 1060 * other holds on the inode. So we dmu_tx_hold() the right things to 1061 * allow for either case. 1062 */ 1063 obj = zp->z_id; 1064 tx = dmu_tx_create(zfsvfs->z_os); 1065 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1066 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1067 zfs_sa_upgrade_txholds(tx, zp); 1068 zfs_sa_upgrade_txholds(tx, dzp); 1069 if (may_delete_now) { 1070 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; 1071 /* if the file is too big, only hold_free a token amount */ 1072 dmu_tx_hold_free(tx, zp->z_id, 0, 1073 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); 1074 } 1075 1076 /* are there any extended attributes? */ 1077 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1078 &xattr_obj, sizeof (xattr_obj)); 1079 if (error == 0 && xattr_obj) { 1080 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 1081 ASSERT0(error); 1082 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1083 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 1084 } 1085 1086 mutex_enter(&zp->z_lock); 1087 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) 1088 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 1089 mutex_exit(&zp->z_lock); 1090 1091 /* charge as an update -- would be nice not to charge at all */ 1092 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1093 1094 /* 1095 * Mark this transaction as typically resulting in a net free of space 1096 */ 1097 dmu_tx_mark_netfree(tx); 1098 1099 error = dmu_tx_assign(tx, 1100 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 1101 if (error) { 1102 zfs_dirent_unlock(dl); 1103 if (error == ERESTART) { 1104 waited = B_TRUE; 1105 dmu_tx_wait(tx); 1106 dmu_tx_abort(tx); 1107 zrele(zp); 1108 if (xzp) 1109 zrele(xzp); 1110 goto top; 1111 } 1112 if (realnmp) 1113 pn_free(realnmp); 1114 dmu_tx_abort(tx); 1115 zrele(zp); 1116 if (xzp) 1117 zrele(xzp); 1118 zfs_exit(zfsvfs, FTAG); 1119 return (error); 1120 } 1121 1122 /* 1123 * Remove the directory entry. 1124 */ 1125 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); 1126 1127 if (error) { 1128 dmu_tx_commit(tx); 1129 goto out; 1130 } 1131 1132 if (unlinked) { 1133 /* 1134 * Hold z_lock so that we can make sure that the ACL obj 1135 * hasn't changed. Could have been deleted due to 1136 * zfs_sa_upgrade(). 1137 */ 1138 mutex_enter(&zp->z_lock); 1139 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 1140 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); 1141 delete_now = may_delete_now && !toobig && 1142 atomic_read(&ZTOI(zp)->i_count) == 1 && 1143 !zn_has_cached_data(zp, 0, LLONG_MAX) && 1144 xattr_obj == xattr_obj_unlinked && 1145 zfs_external_acl(zp) == acl_obj; 1146 VERIFY_IMPLY(xattr_obj_unlinked, xzp); 1147 } 1148 1149 if (delete_now) { 1150 if (xattr_obj_unlinked) { 1151 ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); 1152 mutex_enter(&xzp->z_lock); 1153 xzp->z_unlinked = B_TRUE; 1154 clear_nlink(ZTOI(xzp)); 1155 links = 0; 1156 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 1157 &links, sizeof (links), tx); 1158 ASSERT3U(error, ==, 0); 1159 mutex_exit(&xzp->z_lock); 1160 zfs_unlinked_add(xzp, tx); 1161 1162 if (zp->z_is_sa) 1163 error = sa_remove(zp->z_sa_hdl, 1164 SA_ZPL_XATTR(zfsvfs), tx); 1165 else 1166 error = sa_update(zp->z_sa_hdl, 1167 SA_ZPL_XATTR(zfsvfs), &null_xattr, 1168 sizeof (uint64_t), tx); 1169 ASSERT0(error); 1170 } 1171 /* 1172 * Add to the unlinked set because a new reference could be 1173 * taken concurrently resulting in a deferred destruction. 1174 */ 1175 zfs_unlinked_add(zp, tx); 1176 mutex_exit(&zp->z_lock); 1177 } else if (unlinked) { 1178 mutex_exit(&zp->z_lock); 1179 zfs_unlinked_add(zp, tx); 1180 } 1181 1182 txtype = TX_REMOVE; 1183 if (flags & FIGNORECASE) 1184 txtype |= TX_CI; 1185 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); 1186 1187 dmu_tx_commit(tx); 1188 out: 1189 if (realnmp) 1190 pn_free(realnmp); 1191 1192 zfs_dirent_unlock(dl); 1193 zfs_znode_update_vfs(dzp); 1194 zfs_znode_update_vfs(zp); 1195 1196 if (delete_now) 1197 zrele(zp); 1198 else 1199 zfs_zrele_async(zp); 1200 1201 if (xzp) { 1202 zfs_znode_update_vfs(xzp); 1203 zfs_zrele_async(xzp); 1204 } 1205 1206 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1207 error = zil_commit(zilog, 0); 1208 1209 zfs_exit(zfsvfs, FTAG); 1210 return (error); 1211 } 1212 1213 /* 1214 * Create a new directory and insert it into dzp using the name 1215 * provided. Return a pointer to the inserted directory. 1216 * 1217 * IN: dzp - znode of directory to add subdir to. 1218 * dirname - name of new directory. 1219 * vap - attributes of new directory. 1220 * cr - credentials of caller. 1221 * flags - case flags. 1222 * vsecp - ACL to be set 1223 * mnt_ns - user namespace of the mount 1224 * 1225 * OUT: zpp - znode of created directory. 1226 * 1227 * RETURN: 0 if success 1228 * error code if failure 1229 * 1230 * Timestamps: 1231 * dzp - ctime|mtime updated 1232 * zpp - ctime|mtime|atime updated 1233 */ 1234 int 1235 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, 1236 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) 1237 { 1238 znode_t *zp; 1239 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1240 zilog_t *zilog; 1241 zfs_dirlock_t *dl; 1242 uint64_t txtype; 1243 dmu_tx_t *tx; 1244 int error; 1245 int zf = ZNEW; 1246 uid_t uid; 1247 gid_t gid = crgetgid(cr); 1248 zfs_acl_ids_t acl_ids; 1249 boolean_t fuid_dirtied; 1250 boolean_t waited = B_FALSE; 1251 1252 ASSERT(S_ISDIR(vap->va_mode)); 1253 1254 /* 1255 * If we have an ephemeral id, ACL, or XVATTR then 1256 * make sure file system is at proper version 1257 */ 1258 1259 uid = crgetuid(cr); 1260 if (zfsvfs->z_use_fuids == B_FALSE && 1261 (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 1262 return (SET_ERROR(EINVAL)); 1263 1264 if (dirname == NULL) 1265 return (SET_ERROR(EINVAL)); 1266 1267 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1268 return (error); 1269 zilog = zfsvfs->z_log; 1270 1271 if (dzp->z_pflags & ZFS_XATTR) { 1272 zfs_exit(zfsvfs, FTAG); 1273 return (SET_ERROR(EINVAL)); 1274 } 1275 1276 if (zfsvfs->z_utf8 && u8_validate(dirname, 1277 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1278 zfs_exit(zfsvfs, FTAG); 1279 return (SET_ERROR(EILSEQ)); 1280 } 1281 if (flags & FIGNORECASE) 1282 zf |= ZCILOOK; 1283 1284 if (vap->va_mask & ATTR_XVATTR) { 1285 if ((error = secpolicy_xvattr((xvattr_t *)vap, 1286 crgetuid(cr), cr, vap->va_mode)) != 0) { 1287 zfs_exit(zfsvfs, FTAG); 1288 return (error); 1289 } 1290 } 1291 1292 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 1293 vsecp, &acl_ids, mnt_ns)) != 0) { 1294 zfs_exit(zfsvfs, FTAG); 1295 return (error); 1296 } 1297 /* 1298 * First make sure the new directory doesn't exist. 1299 * 1300 * Existence is checked first to make sure we don't return 1301 * EACCES instead of EEXIST which can cause some applications 1302 * to fail. 1303 */ 1304 top: 1305 *zpp = NULL; 1306 1307 if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, 1308 NULL, NULL))) { 1309 zfs_acl_ids_free(&acl_ids); 1310 zfs_exit(zfsvfs, FTAG); 1311 return (error); 1312 } 1313 1314 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, 1315 mnt_ns))) { 1316 zfs_acl_ids_free(&acl_ids); 1317 zfs_dirent_unlock(dl); 1318 zfs_exit(zfsvfs, FTAG); 1319 return (error); 1320 } 1321 1322 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { 1323 zfs_acl_ids_free(&acl_ids); 1324 zfs_dirent_unlock(dl); 1325 zfs_exit(zfsvfs, FTAG); 1326 return (SET_ERROR(EDQUOT)); 1327 } 1328 1329 /* 1330 * Add a new entry to the directory. 1331 */ 1332 tx = dmu_tx_create(zfsvfs->z_os); 1333 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 1334 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1335 fuid_dirtied = zfsvfs->z_fuid_dirty; 1336 if (fuid_dirtied) 1337 zfs_fuid_txhold(zfsvfs, tx); 1338 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 1339 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1340 acl_ids.z_aclp->z_acl_bytes); 1341 } 1342 1343 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1344 ZFS_SA_BASE_ATTR_SIZE); 1345 1346 error = dmu_tx_assign(tx, 1347 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 1348 if (error) { 1349 zfs_dirent_unlock(dl); 1350 if (error == ERESTART) { 1351 waited = B_TRUE; 1352 dmu_tx_wait(tx); 1353 dmu_tx_abort(tx); 1354 goto top; 1355 } 1356 zfs_acl_ids_free(&acl_ids); 1357 dmu_tx_abort(tx); 1358 zfs_exit(zfsvfs, FTAG); 1359 return (error); 1360 } 1361 1362 /* 1363 * Create new node. 1364 */ 1365 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 1366 1367 /* 1368 * Now put new name in parent dir. 1369 */ 1370 error = zfs_link_create(dl, zp, tx, ZNEW); 1371 if (error != 0) { 1372 zfs_znode_delete(zp, tx); 1373 remove_inode_hash(ZTOI(zp)); 1374 goto out; 1375 } 1376 1377 if (fuid_dirtied) 1378 zfs_fuid_sync(zfsvfs, tx); 1379 1380 *zpp = zp; 1381 1382 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); 1383 if (flags & FIGNORECASE) 1384 txtype |= TX_CI; 1385 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, 1386 acl_ids.z_fuidp, vap); 1387 1388 out: 1389 zfs_acl_ids_free(&acl_ids); 1390 1391 dmu_tx_commit(tx); 1392 1393 zfs_dirent_unlock(dl); 1394 1395 if (error != 0) { 1396 zrele(zp); 1397 } else { 1398 zfs_znode_update_vfs(dzp); 1399 zfs_znode_update_vfs(zp); 1400 1401 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1402 error = zil_commit(zilog, 0); 1403 1404 } 1405 zfs_exit(zfsvfs, FTAG); 1406 return (error); 1407 } 1408 1409 /* 1410 * Remove a directory subdir entry. If the current working 1411 * directory is the same as the subdir to be removed, the 1412 * remove will fail. 1413 * 1414 * IN: dzp - znode of directory to remove from. 1415 * name - name of directory to be removed. 1416 * cwd - inode of current working directory. 1417 * cr - credentials of caller. 1418 * flags - case flags 1419 * 1420 * RETURN: 0 on success, error code on failure. 1421 * 1422 * Timestamps: 1423 * dzp - ctime|mtime updated 1424 */ 1425 int 1426 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, 1427 int flags) 1428 { 1429 znode_t *zp; 1430 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1431 zilog_t *zilog; 1432 zfs_dirlock_t *dl; 1433 dmu_tx_t *tx; 1434 int error; 1435 int zflg = ZEXISTS; 1436 boolean_t waited = B_FALSE; 1437 1438 if (name == NULL) 1439 return (SET_ERROR(EINVAL)); 1440 1441 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 1442 return (error); 1443 zilog = zfsvfs->z_log; 1444 1445 if (flags & FIGNORECASE) 1446 zflg |= ZCILOOK; 1447 top: 1448 zp = NULL; 1449 1450 /* 1451 * Attempt to lock directory; fail if entry doesn't exist. 1452 */ 1453 if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, 1454 NULL, NULL))) { 1455 zfs_exit(zfsvfs, FTAG); 1456 return (error); 1457 } 1458 1459 if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { 1460 goto out; 1461 } 1462 1463 if (!S_ISDIR(ZTOI(zp)->i_mode)) { 1464 error = SET_ERROR(ENOTDIR); 1465 goto out; 1466 } 1467 1468 if (zp == cwd) { 1469 error = SET_ERROR(EINVAL); 1470 goto out; 1471 } 1472 1473 /* 1474 * Grab a lock on the directory to make sure that no one is 1475 * trying to add (or lookup) entries while we are removing it. 1476 */ 1477 rw_enter(&zp->z_name_lock, RW_WRITER); 1478 1479 /* 1480 * Grab a lock on the parent pointer to make sure we play well 1481 * with the treewalk and directory rename code. 1482 */ 1483 rw_enter(&zp->z_parent_lock, RW_WRITER); 1484 1485 tx = dmu_tx_create(zfsvfs->z_os); 1486 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 1487 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1488 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 1489 zfs_sa_upgrade_txholds(tx, zp); 1490 zfs_sa_upgrade_txholds(tx, dzp); 1491 dmu_tx_mark_netfree(tx); 1492 error = dmu_tx_assign(tx, 1493 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 1494 if (error) { 1495 rw_exit(&zp->z_parent_lock); 1496 rw_exit(&zp->z_name_lock); 1497 zfs_dirent_unlock(dl); 1498 if (error == ERESTART) { 1499 waited = B_TRUE; 1500 dmu_tx_wait(tx); 1501 dmu_tx_abort(tx); 1502 zrele(zp); 1503 goto top; 1504 } 1505 dmu_tx_abort(tx); 1506 zrele(zp); 1507 zfs_exit(zfsvfs, FTAG); 1508 return (error); 1509 } 1510 1511 error = zfs_link_destroy(dl, zp, tx, zflg, NULL); 1512 1513 if (error == 0) { 1514 uint64_t txtype = TX_RMDIR; 1515 if (flags & FIGNORECASE) 1516 txtype |= TX_CI; 1517 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, 1518 B_FALSE); 1519 } 1520 1521 dmu_tx_commit(tx); 1522 1523 rw_exit(&zp->z_parent_lock); 1524 rw_exit(&zp->z_name_lock); 1525 out: 1526 zfs_dirent_unlock(dl); 1527 1528 zfs_znode_update_vfs(dzp); 1529 zfs_znode_update_vfs(zp); 1530 zrele(zp); 1531 1532 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1533 error = zil_commit(zilog, 0); 1534 1535 zfs_exit(zfsvfs, FTAG); 1536 return (error); 1537 } 1538 1539 /* 1540 * Read directory entries from the given directory cursor position and emit 1541 * name and position for each entry. 1542 * 1543 * IN: ip - inode of directory to read. 1544 * ctx - directory entry context. 1545 * cr - credentials of caller. 1546 * 1547 * RETURN: 0 if success 1548 * error code if failure 1549 * 1550 * Timestamps: 1551 * ip - atime updated 1552 * 1553 * Note that the low 4 bits of the cookie returned by zap is always zero. 1554 * This allows us to use the low range for "special" directory entries: 1555 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 1556 * we use the offset 2 for the '.zfs' directory. 1557 */ 1558 int 1559 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) 1560 { 1561 (void) cr; 1562 znode_t *zp = ITOZ(ip); 1563 zfsvfs_t *zfsvfs = ITOZSB(ip); 1564 objset_t *os; 1565 zap_cursor_t zc; 1566 zap_attribute_t *zap; 1567 int error; 1568 uint8_t prefetch; 1569 uint8_t type; 1570 int done = 0; 1571 uint64_t parent; 1572 uint64_t offset; /* must be unsigned; checks for < 1 */ 1573 1574 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1575 return (error); 1576 1577 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1578 &parent, sizeof (parent))) != 0) 1579 goto out; 1580 1581 /* 1582 * Quit if directory has been removed (posix) 1583 */ 1584 if (zp->z_unlinked) 1585 goto out; 1586 1587 error = 0; 1588 os = zfsvfs->z_os; 1589 offset = ctx->pos; 1590 prefetch = zp->z_zn_prefetch; 1591 zap = zap_attribute_long_alloc(); 1592 1593 /* 1594 * Initialize the iterator cursor. 1595 */ 1596 if (offset <= 3) { 1597 /* 1598 * Start iteration from the beginning of the directory. 1599 */ 1600 zap_cursor_init(&zc, os, zp->z_id); 1601 } else { 1602 /* 1603 * The offset is a serialized cursor. 1604 */ 1605 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 1606 } 1607 1608 /* 1609 * Transform to file-system independent format 1610 */ 1611 while (!done) { 1612 uint64_t objnum; 1613 /* 1614 * Special case `.', `..', and `.zfs'. 1615 */ 1616 if (offset == 0) { 1617 (void) strcpy(zap->za_name, "."); 1618 zap->za_normalization_conflict = 0; 1619 objnum = zp->z_id; 1620 type = DT_DIR; 1621 } else if (offset == 1) { 1622 (void) strcpy(zap->za_name, ".."); 1623 zap->za_normalization_conflict = 0; 1624 objnum = parent; 1625 type = DT_DIR; 1626 } else if (offset == 2 && zfs_show_ctldir(zp)) { 1627 (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME); 1628 zap->za_normalization_conflict = 0; 1629 objnum = ZFSCTL_INO_ROOT; 1630 type = DT_DIR; 1631 } else { 1632 /* 1633 * Grab next entry. 1634 */ 1635 if ((error = zap_cursor_retrieve(&zc, zap))) { 1636 if (error == ENOENT) 1637 break; 1638 else 1639 goto update; 1640 } 1641 1642 /* 1643 * Allow multiple entries provided the first entry is 1644 * the object id. Non-zpl consumers may safely make 1645 * use of the additional space. 1646 * 1647 * XXX: This should be a feature flag for compatibility 1648 */ 1649 if (zap->za_integer_length != 8 || 1650 zap->za_num_integers == 0) { 1651 cmn_err(CE_WARN, "zap_readdir: bad directory " 1652 "entry, obj = %lld, offset = %lld, " 1653 "length = %d, num = %lld\n", 1654 (u_longlong_t)zp->z_id, 1655 (u_longlong_t)offset, 1656 zap->za_integer_length, 1657 (u_longlong_t)zap->za_num_integers); 1658 error = SET_ERROR(ENXIO); 1659 goto update; 1660 } 1661 1662 objnum = ZFS_DIRENT_OBJ(zap->za_first_integer); 1663 type = ZFS_DIRENT_TYPE(zap->za_first_integer); 1664 } 1665 1666 done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name), 1667 objnum, type); 1668 if (done) 1669 break; 1670 1671 if (prefetch) 1672 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); 1673 1674 /* 1675 * Move to the next entry, fill in the previous offset. 1676 */ 1677 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 1678 zap_cursor_advance(&zc); 1679 offset = zap_cursor_serialize(&zc); 1680 } else { 1681 offset += 1; 1682 } 1683 ctx->pos = offset; 1684 } 1685 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 1686 1687 update: 1688 zap_cursor_fini(&zc); 1689 zap_attribute_free(zap); 1690 if (error == ENOENT) 1691 error = 0; 1692 out: 1693 zfs_exit(zfsvfs, FTAG); 1694 1695 return (error); 1696 } 1697 1698 /* 1699 * Get the basic file attributes and place them in the provided kstat 1700 * structure. The inode is assumed to be the authoritative source 1701 * for most of the attributes. However, the znode currently has the 1702 * authoritative atime, blksize, and block count. 1703 * 1704 * IN: ip - inode of file. 1705 * 1706 * OUT: sp - kstat values. 1707 * 1708 * RETURN: 0 (always succeeds) 1709 */ 1710 int 1711 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1712 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, 1713 struct kstat *sp) 1714 #else 1715 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) 1716 #endif 1717 { 1718 znode_t *zp = ITOZ(ip); 1719 zfsvfs_t *zfsvfs = ITOZSB(ip); 1720 uint32_t blksize; 1721 u_longlong_t nblocks; 1722 int error; 1723 1724 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1725 return (error); 1726 1727 mutex_enter(&zp->z_lock); 1728 1729 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK 1730 zpl_generic_fillattr(user_ns, request_mask, ip, sp); 1731 #else 1732 zpl_generic_fillattr(user_ns, ip, sp); 1733 #endif 1734 /* 1735 * +1 link count for root inode with visible '.zfs' directory. 1736 */ 1737 if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) 1738 if (sp->nlink < ZFS_LINK_MAX) 1739 sp->nlink++; 1740 1741 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1742 sp->blksize = blksize; 1743 sp->blocks = nblocks; 1744 1745 if (unlikely(zp->z_blksz == 0)) { 1746 /* 1747 * Block size hasn't been set; suggest maximal I/O transfers. 1748 */ 1749 sp->blksize = zfsvfs->z_max_blksz; 1750 } 1751 1752 mutex_exit(&zp->z_lock); 1753 1754 /* 1755 * Required to prevent NFS client from detecting different inode 1756 * numbers of snapshot root dentry before and after snapshot mount. 1757 */ 1758 if (zfsvfs->z_issnap) { 1759 if (ip->i_sb->s_root->d_inode == ip) 1760 sp->ino = ZFSCTL_INO_SNAPDIRS - 1761 dmu_objset_id(zfsvfs->z_os); 1762 } 1763 1764 zfs_exit(zfsvfs, FTAG); 1765 1766 return (0); 1767 } 1768 1769 /* 1770 * For the operation of changing file's user/group/project, we need to 1771 * handle not only the main object that is assigned to the file directly, 1772 * but also the ones that are used by the file via hidden xattr directory. 1773 * 1774 * Because the xattr directory may contains many EA entries, as to it may 1775 * be impossible to change all of them via the transaction of changing the 1776 * main object's user/group/project attributes. Then we have to change them 1777 * via other multiple independent transactions one by one. It may be not good 1778 * solution, but we have no better idea yet. 1779 */ 1780 static int 1781 zfs_setattr_dir(znode_t *dzp) 1782 { 1783 struct inode *dxip = ZTOI(dzp); 1784 struct inode *xip = NULL; 1785 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1786 objset_t *os = zfsvfs->z_os; 1787 zap_cursor_t zc; 1788 zap_attribute_t *zap; 1789 zfs_dirlock_t *dl; 1790 znode_t *zp = NULL; 1791 dmu_tx_t *tx = NULL; 1792 uint64_t uid, gid; 1793 sa_bulk_attr_t bulk[4]; 1794 int count; 1795 int err; 1796 1797 zap = zap_attribute_alloc(); 1798 zap_cursor_init(&zc, os, dzp->z_id); 1799 while ((err = zap_cursor_retrieve(&zc, zap)) == 0) { 1800 count = 0; 1801 if (zap->za_integer_length != 8 || zap->za_num_integers != 1) { 1802 err = ENXIO; 1803 break; 1804 } 1805 1806 err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp, 1807 ZEXISTS, NULL, NULL); 1808 if (err == ENOENT) 1809 goto next; 1810 if (err) 1811 break; 1812 1813 xip = ZTOI(zp); 1814 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && 1815 KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && 1816 zp->z_projid == dzp->z_projid) 1817 goto next; 1818 1819 tx = dmu_tx_create(os); 1820 if (!(zp->z_pflags & ZFS_PROJID)) 1821 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1822 else 1823 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1824 1825 err = dmu_tx_assign(tx, DMU_TX_WAIT); 1826 if (err) 1827 break; 1828 1829 mutex_enter(&dzp->z_lock); 1830 1831 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { 1832 xip->i_uid = dxip->i_uid; 1833 uid = zfs_uid_read(dxip); 1834 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1835 &uid, sizeof (uid)); 1836 } 1837 1838 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { 1839 xip->i_gid = dxip->i_gid; 1840 gid = zfs_gid_read(dxip); 1841 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 1842 &gid, sizeof (gid)); 1843 } 1844 1845 1846 uint64_t projid = dzp->z_projid; 1847 if (zp->z_projid != projid) { 1848 if (!(zp->z_pflags & ZFS_PROJID)) { 1849 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 1850 if (unlikely(err == EEXIST)) { 1851 err = 0; 1852 } else if (err != 0) { 1853 goto sa_add_projid_err; 1854 } else { 1855 projid = ZFS_INVALID_PROJID; 1856 } 1857 } 1858 1859 if (projid != ZFS_INVALID_PROJID) { 1860 zp->z_projid = projid; 1861 SA_ADD_BULK_ATTR(bulk, count, 1862 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 1863 sizeof (zp->z_projid)); 1864 } 1865 } 1866 1867 sa_add_projid_err: 1868 mutex_exit(&dzp->z_lock); 1869 1870 if (likely(count > 0)) { 1871 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1872 dmu_tx_commit(tx); 1873 } else if (projid == ZFS_INVALID_PROJID) { 1874 dmu_tx_commit(tx); 1875 } else { 1876 dmu_tx_abort(tx); 1877 } 1878 tx = NULL; 1879 if (err != 0 && err != ENOENT) 1880 break; 1881 1882 next: 1883 if (zp) { 1884 zrele(zp); 1885 zp = NULL; 1886 zfs_dirent_unlock(dl); 1887 } 1888 zap_cursor_advance(&zc); 1889 } 1890 1891 if (tx) 1892 dmu_tx_abort(tx); 1893 if (zp) { 1894 zrele(zp); 1895 zfs_dirent_unlock(dl); 1896 } 1897 zap_cursor_fini(&zc); 1898 zap_attribute_free(zap); 1899 1900 return (err == ENOENT ? 0 : err); 1901 } 1902 1903 /* 1904 * Set the file attributes to the values contained in the 1905 * vattr structure. 1906 * 1907 * IN: zp - znode of file to be modified. 1908 * vap - new attribute values. 1909 * If ATTR_XVATTR set, then optional attrs are being set 1910 * flags - ATTR_UTIME set if non-default time values provided. 1911 * - ATTR_NOACLCHECK (CIFS context only). 1912 * cr - credentials of caller. 1913 * mnt_ns - user namespace of the mount 1914 * 1915 * RETURN: 0 if success 1916 * error code if failure 1917 * 1918 * Timestamps: 1919 * ip - ctime updated, mtime updated if size changed. 1920 */ 1921 int 1922 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) 1923 { 1924 struct inode *ip; 1925 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1926 objset_t *os; 1927 zilog_t *zilog; 1928 dmu_tx_t *tx; 1929 vattr_t oldva; 1930 xvattr_t *tmpxvattr; 1931 uint_t mask = vap->va_mask; 1932 uint_t saved_mask = 0; 1933 int trim_mask = 0; 1934 uint64_t new_mode; 1935 uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; 1936 uint64_t xattr_obj; 1937 uint64_t mtime[2], ctime[2], atime[2]; 1938 uint64_t projid = ZFS_INVALID_PROJID; 1939 znode_t *attrzp; 1940 int need_policy = FALSE; 1941 int err, err2 = 0; 1942 zfs_fuid_info_t *fuidp = NULL; 1943 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 1944 xoptattr_t *xoap; 1945 zfs_acl_t *aclp; 1946 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1947 boolean_t fuid_dirtied = B_FALSE; 1948 boolean_t handle_eadir = B_FALSE; 1949 sa_bulk_attr_t *bulk, *xattr_bulk; 1950 int count = 0, xattr_count = 0, bulks = 8; 1951 1952 if (mask == 0) 1953 return (0); 1954 1955 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1956 return (err); 1957 ip = ZTOI(zp); 1958 os = zfsvfs->z_os; 1959 1960 /* 1961 * If this is a xvattr_t, then get a pointer to the structure of 1962 * optional attributes. If this is NULL, then we have a vattr_t. 1963 */ 1964 xoap = xva_getxoptattr(xvap); 1965 if (xoap != NULL && (mask & ATTR_XVATTR)) { 1966 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { 1967 if (!dmu_objset_projectquota_enabled(os) || 1968 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { 1969 zfs_exit(zfsvfs, FTAG); 1970 return (SET_ERROR(ENOTSUP)); 1971 } 1972 1973 projid = xoap->xoa_projid; 1974 if (unlikely(projid == ZFS_INVALID_PROJID)) { 1975 zfs_exit(zfsvfs, FTAG); 1976 return (SET_ERROR(EINVAL)); 1977 } 1978 1979 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) 1980 projid = ZFS_INVALID_PROJID; 1981 else 1982 need_policy = TRUE; 1983 } 1984 1985 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && 1986 (xoap->xoa_projinherit != 1987 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && 1988 (!dmu_objset_projectquota_enabled(os) || 1989 (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { 1990 zfs_exit(zfsvfs, FTAG); 1991 return (SET_ERROR(ENOTSUP)); 1992 } 1993 } 1994 1995 zilog = zfsvfs->z_log; 1996 1997 /* 1998 * Make sure that if we have ephemeral uid/gid or xvattr specified 1999 * that file system is at proper version level 2000 */ 2001 2002 if (zfsvfs->z_use_fuids == B_FALSE && 2003 (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || 2004 ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || 2005 (mask & ATTR_XVATTR))) { 2006 zfs_exit(zfsvfs, FTAG); 2007 return (SET_ERROR(EINVAL)); 2008 } 2009 2010 if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { 2011 zfs_exit(zfsvfs, FTAG); 2012 return (SET_ERROR(EISDIR)); 2013 } 2014 2015 if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { 2016 zfs_exit(zfsvfs, FTAG); 2017 return (SET_ERROR(EINVAL)); 2018 } 2019 2020 tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); 2021 xva_init(tmpxvattr); 2022 2023 bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 2024 xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); 2025 2026 /* 2027 * Immutable files can only alter immutable bit and atime 2028 */ 2029 if ((zp->z_pflags & ZFS_IMMUTABLE) && 2030 ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || 2031 ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 2032 err = SET_ERROR(EPERM); 2033 goto out3; 2034 } 2035 2036 /* ZFS_READONLY will be handled in zfs_zaccess() */ 2037 2038 /* 2039 * Verify timestamps doesn't overflow 32 bits. 2040 * ZFS can handle large timestamps, but 32bit syscalls can't 2041 * handle times greater than 2039. This check should be removed 2042 * once large timestamps are fully supported. 2043 */ 2044 if (mask & (ATTR_ATIME | ATTR_MTIME)) { 2045 if (((mask & ATTR_ATIME) && 2046 TIMESPEC_OVERFLOW(&vap->va_atime)) || 2047 ((mask & ATTR_MTIME) && 2048 TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2049 err = SET_ERROR(EOVERFLOW); 2050 goto out3; 2051 } 2052 } 2053 2054 top: 2055 attrzp = NULL; 2056 aclp = NULL; 2057 2058 /* Can this be moved to before the top label? */ 2059 if (zfs_is_readonly(zfsvfs)) { 2060 err = SET_ERROR(EROFS); 2061 goto out3; 2062 } 2063 2064 /* 2065 * First validate permissions 2066 */ 2067 2068 if (mask & ATTR_SIZE) { 2069 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, 2070 mnt_ns); 2071 if (err) 2072 goto out3; 2073 2074 /* 2075 * XXX - Note, we are not providing any open 2076 * mode flags here (like FNDELAY), so we may 2077 * block if there are locks present... this 2078 * should be addressed in openat(). 2079 */ 2080 /* XXX - would it be OK to generate a log record here? */ 2081 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 2082 if (err) 2083 goto out3; 2084 } 2085 2086 if (mask & (ATTR_ATIME|ATTR_MTIME) || 2087 ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 2088 XVA_ISSET_REQ(xvap, XAT_READONLY) || 2089 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 2090 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 2091 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 2092 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 2093 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 2094 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 2095 skipaclchk, cr, mnt_ns); 2096 } 2097 2098 if (mask & (ATTR_UID|ATTR_GID)) { 2099 int idmask = (mask & (ATTR_UID|ATTR_GID)); 2100 int take_owner; 2101 int take_group; 2102 uid_t uid; 2103 gid_t gid; 2104 2105 /* 2106 * NOTE: even if a new mode is being set, 2107 * we may clear S_ISUID/S_ISGID bits. 2108 */ 2109 2110 if (!(mask & ATTR_MODE)) 2111 vap->va_mode = zp->z_mode; 2112 2113 /* 2114 * Take ownership or chgrp to group we are a member of 2115 */ 2116 2117 uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), 2118 vap->va_uid); 2119 gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), 2120 vap->va_gid); 2121 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); 2122 take_group = (mask & ATTR_GID) && 2123 zfs_groupmember(zfsvfs, gid, cr); 2124 2125 /* 2126 * If both ATTR_UID and ATTR_GID are set then take_owner and 2127 * take_group must both be set in order to allow taking 2128 * ownership. 2129 * 2130 * Otherwise, send the check through secpolicy_vnode_setattr() 2131 * 2132 */ 2133 2134 if (((idmask == (ATTR_UID|ATTR_GID)) && 2135 take_owner && take_group) || 2136 ((idmask == ATTR_UID) && take_owner) || 2137 ((idmask == ATTR_GID) && take_group)) { 2138 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 2139 skipaclchk, cr, mnt_ns) == 0) { 2140 /* 2141 * Remove setuid/setgid for non-privileged users 2142 */ 2143 (void) secpolicy_setid_clear(vap, cr); 2144 trim_mask = (mask & (ATTR_UID|ATTR_GID)); 2145 } else { 2146 need_policy = TRUE; 2147 } 2148 } else { 2149 need_policy = TRUE; 2150 } 2151 } 2152 2153 mutex_enter(&zp->z_lock); 2154 oldva.va_mode = zp->z_mode; 2155 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 2156 if (mask & ATTR_XVATTR) { 2157 /* 2158 * Update xvattr mask to include only those attributes 2159 * that are actually changing. 2160 * 2161 * the bits will be restored prior to actually setting 2162 * the attributes so the caller thinks they were set. 2163 */ 2164 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 2165 if (xoap->xoa_appendonly != 2166 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 2167 need_policy = TRUE; 2168 } else { 2169 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 2170 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); 2171 } 2172 } 2173 2174 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { 2175 if (xoap->xoa_projinherit != 2176 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { 2177 need_policy = TRUE; 2178 } else { 2179 XVA_CLR_REQ(xvap, XAT_PROJINHERIT); 2180 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); 2181 } 2182 } 2183 2184 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 2185 if (xoap->xoa_nounlink != 2186 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 2187 need_policy = TRUE; 2188 } else { 2189 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 2190 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); 2191 } 2192 } 2193 2194 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 2195 if (xoap->xoa_immutable != 2196 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 2197 need_policy = TRUE; 2198 } else { 2199 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 2200 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); 2201 } 2202 } 2203 2204 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 2205 if (xoap->xoa_nodump != 2206 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 2207 need_policy = TRUE; 2208 } else { 2209 XVA_CLR_REQ(xvap, XAT_NODUMP); 2210 XVA_SET_REQ(tmpxvattr, XAT_NODUMP); 2211 } 2212 } 2213 2214 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 2215 if (xoap->xoa_av_modified != 2216 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 2217 need_policy = TRUE; 2218 } else { 2219 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 2220 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); 2221 } 2222 } 2223 2224 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 2225 if ((!S_ISREG(ip->i_mode) && 2226 xoap->xoa_av_quarantined) || 2227 xoap->xoa_av_quarantined != 2228 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 2229 need_policy = TRUE; 2230 } else { 2231 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 2232 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); 2233 } 2234 } 2235 2236 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 2237 mutex_exit(&zp->z_lock); 2238 err = SET_ERROR(EPERM); 2239 goto out3; 2240 } 2241 2242 if (need_policy == FALSE && 2243 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 2244 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 2245 need_policy = TRUE; 2246 } 2247 } 2248 2249 mutex_exit(&zp->z_lock); 2250 2251 if (mask & ATTR_MODE) { 2252 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, 2253 mnt_ns) == 0) { 2254 err = secpolicy_setid_setsticky_clear(ip, vap, 2255 &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); 2256 if (err) 2257 goto out3; 2258 trim_mask |= ATTR_MODE; 2259 } else { 2260 need_policy = TRUE; 2261 } 2262 } 2263 2264 if (need_policy) { 2265 /* 2266 * If trim_mask is set then take ownership 2267 * has been granted or write_acl is present and user 2268 * has the ability to modify mode. In that case remove 2269 * UID|GID and or MODE from mask so that 2270 * secpolicy_vnode_setattr() doesn't revoke it. 2271 */ 2272 2273 if (trim_mask) { 2274 saved_mask = vap->va_mask; 2275 vap->va_mask &= ~trim_mask; 2276 } 2277 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, 2278 zfs_zaccess_unix, zp); 2279 if (err) 2280 goto out3; 2281 2282 if (trim_mask) 2283 vap->va_mask |= saved_mask; 2284 } 2285 2286 /* 2287 * secpolicy_vnode_setattr, or take ownership may have 2288 * changed va_mask 2289 */ 2290 mask = vap->va_mask; 2291 2292 if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { 2293 handle_eadir = B_TRUE; 2294 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2295 &xattr_obj, sizeof (xattr_obj)); 2296 2297 if (err == 0 && xattr_obj) { 2298 err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); 2299 if (err) 2300 goto out2; 2301 } 2302 if (mask & ATTR_UID) { 2303 new_kuid = zfs_fuid_create(zfsvfs, 2304 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 2305 if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && 2306 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, 2307 new_kuid)) { 2308 if (attrzp) 2309 zrele(attrzp); 2310 err = SET_ERROR(EDQUOT); 2311 goto out2; 2312 } 2313 } 2314 2315 if (mask & ATTR_GID) { 2316 new_kgid = zfs_fuid_create(zfsvfs, 2317 (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); 2318 if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && 2319 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, 2320 new_kgid)) { 2321 if (attrzp) 2322 zrele(attrzp); 2323 err = SET_ERROR(EDQUOT); 2324 goto out2; 2325 } 2326 } 2327 2328 if (projid != ZFS_INVALID_PROJID && 2329 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { 2330 if (attrzp) 2331 zrele(attrzp); 2332 err = EDQUOT; 2333 goto out2; 2334 } 2335 } 2336 tx = dmu_tx_create(os); 2337 2338 if (mask & ATTR_MODE) { 2339 uint64_t pmode = zp->z_mode; 2340 uint64_t acl_obj; 2341 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); 2342 2343 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && 2344 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 2345 err = EPERM; 2346 goto out; 2347 } 2348 2349 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) 2350 goto out; 2351 2352 mutex_enter(&zp->z_lock); 2353 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 2354 /* 2355 * Are we upgrading ACL from old V0 format 2356 * to V1 format? 2357 */ 2358 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 2359 zfs_znode_acl_version(zp) == 2360 ZFS_ACL_VERSION_INITIAL) { 2361 dmu_tx_hold_free(tx, acl_obj, 0, 2362 DMU_OBJECT_END); 2363 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2364 0, aclp->z_acl_bytes); 2365 } else { 2366 dmu_tx_hold_write(tx, acl_obj, 0, 2367 aclp->z_acl_bytes); 2368 } 2369 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2371 0, aclp->z_acl_bytes); 2372 } 2373 mutex_exit(&zp->z_lock); 2374 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2375 } else { 2376 if (((mask & ATTR_XVATTR) && 2377 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || 2378 (projid != ZFS_INVALID_PROJID && 2379 !(zp->z_pflags & ZFS_PROJID))) 2380 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2381 else 2382 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2383 } 2384 2385 if (attrzp) { 2386 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 2387 } 2388 2389 fuid_dirtied = zfsvfs->z_fuid_dirty; 2390 if (fuid_dirtied) 2391 zfs_fuid_txhold(zfsvfs, tx); 2392 2393 zfs_sa_upgrade_txholds(tx, zp); 2394 2395 err = dmu_tx_assign(tx, DMU_TX_WAIT); 2396 if (err) 2397 goto out; 2398 2399 count = 0; 2400 /* 2401 * Set each attribute requested. 2402 * We group settings according to the locks they need to acquire. 2403 * 2404 * Note: you cannot set ctime directly, although it will be 2405 * updated as a side-effect of calling this function. 2406 */ 2407 2408 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { 2409 /* 2410 * For the existed object that is upgraded from old system, 2411 * its on-disk layout has no slot for the project ID attribute. 2412 * But quota accounting logic needs to access related slots by 2413 * offset directly. So we need to adjust old objects' layout 2414 * to make the project ID to some unified and fixed offset. 2415 */ 2416 if (attrzp) 2417 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); 2418 if (err == 0) 2419 err = sa_add_projid(zp->z_sa_hdl, tx, projid); 2420 2421 if (unlikely(err == EEXIST)) 2422 err = 0; 2423 else if (err != 0) 2424 goto out; 2425 else 2426 projid = ZFS_INVALID_PROJID; 2427 } 2428 2429 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2430 mutex_enter(&zp->z_acl_lock); 2431 mutex_enter(&zp->z_lock); 2432 2433 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 2434 &zp->z_pflags, sizeof (zp->z_pflags)); 2435 2436 if (attrzp) { 2437 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2438 mutex_enter(&attrzp->z_acl_lock); 2439 mutex_enter(&attrzp->z_lock); 2440 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2441 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 2442 sizeof (attrzp->z_pflags)); 2443 if (projid != ZFS_INVALID_PROJID) { 2444 attrzp->z_projid = projid; 2445 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2446 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, 2447 sizeof (attrzp->z_projid)); 2448 } 2449 } 2450 2451 if (mask & (ATTR_UID|ATTR_GID)) { 2452 2453 if (mask & ATTR_UID) { 2454 ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); 2455 new_uid = zfs_uid_read(ZTOI(zp)); 2456 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 2457 &new_uid, sizeof (new_uid)); 2458 if (attrzp) { 2459 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2460 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 2461 sizeof (new_uid)); 2462 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); 2463 } 2464 } 2465 2466 if (mask & ATTR_GID) { 2467 ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); 2468 new_gid = zfs_gid_read(ZTOI(zp)); 2469 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 2470 NULL, &new_gid, sizeof (new_gid)); 2471 if (attrzp) { 2472 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2473 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 2474 sizeof (new_gid)); 2475 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); 2476 } 2477 } 2478 if (!(mask & ATTR_MODE)) { 2479 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 2480 NULL, &new_mode, sizeof (new_mode)); 2481 new_mode = zp->z_mode; 2482 } 2483 err = zfs_acl_chown_setattr(zp); 2484 ASSERT0(err); 2485 if (attrzp) { 2486 err = zfs_acl_chown_setattr(attrzp); 2487 ASSERT0(err); 2488 } 2489 } 2490 2491 if (mask & ATTR_MODE) { 2492 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 2493 &new_mode, sizeof (new_mode)); 2494 zp->z_mode = ZTOI(zp)->i_mode = new_mode; 2495 ASSERT3P(aclp, !=, NULL); 2496 err = zfs_aclset_common(zp, aclp, cr, tx); 2497 ASSERT0(err); 2498 if (zp->z_acl_cached) 2499 zfs_acl_free(zp->z_acl_cached); 2500 zp->z_acl_cached = aclp; 2501 aclp = NULL; 2502 } 2503 2504 if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { 2505 zp->z_atime_dirty = B_FALSE; 2506 inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); 2507 ZFS_TIME_ENCODE(&tmp_atime, atime); 2508 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 2509 &atime, sizeof (atime)); 2510 } 2511 2512 if (mask & (ATTR_MTIME | ATTR_SIZE)) { 2513 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 2514 zpl_inode_set_mtime_to_ts(ZTOI(zp), 2515 zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); 2516 2517 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 2518 mtime, sizeof (mtime)); 2519 } 2520 2521 if (mask & (ATTR_CTIME | ATTR_SIZE)) { 2522 ZFS_TIME_ENCODE(&vap->va_ctime, ctime); 2523 zpl_inode_set_ctime_to_ts(ZTOI(zp), 2524 zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); 2525 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 2526 ctime, sizeof (ctime)); 2527 } 2528 2529 if (projid != ZFS_INVALID_PROJID) { 2530 zp->z_projid = projid; 2531 SA_ADD_BULK_ATTR(bulk, count, 2532 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, 2533 sizeof (zp->z_projid)); 2534 } 2535 2536 if (attrzp && mask) { 2537 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 2538 SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 2539 sizeof (ctime)); 2540 } 2541 2542 /* 2543 * Do this after setting timestamps to prevent timestamp 2544 * update from toggling bit 2545 */ 2546 2547 if (xoap && (mask & ATTR_XVATTR)) { 2548 2549 /* 2550 * restore trimmed off masks 2551 * so that return masks can be set for caller. 2552 */ 2553 2554 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { 2555 XVA_SET_REQ(xvap, XAT_APPENDONLY); 2556 } 2557 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { 2558 XVA_SET_REQ(xvap, XAT_NOUNLINK); 2559 } 2560 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { 2561 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 2562 } 2563 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { 2564 XVA_SET_REQ(xvap, XAT_NODUMP); 2565 } 2566 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { 2567 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 2568 } 2569 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { 2570 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 2571 } 2572 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { 2573 XVA_SET_REQ(xvap, XAT_PROJINHERIT); 2574 } 2575 2576 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 2577 ASSERT(S_ISREG(ip->i_mode)); 2578 2579 zfs_xvattr_set(zp, xvap, tx); 2580 } 2581 2582 if (fuid_dirtied) 2583 zfs_fuid_sync(zfsvfs, tx); 2584 2585 if (mask != 0) 2586 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 2587 2588 mutex_exit(&zp->z_lock); 2589 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2590 mutex_exit(&zp->z_acl_lock); 2591 2592 if (attrzp) { 2593 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) 2594 mutex_exit(&attrzp->z_acl_lock); 2595 mutex_exit(&attrzp->z_lock); 2596 } 2597 out: 2598 if (err == 0 && xattr_count > 0) { 2599 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 2600 xattr_count, tx); 2601 ASSERT0(err2); 2602 } 2603 2604 if (aclp) 2605 zfs_acl_free(aclp); 2606 2607 if (fuidp) { 2608 zfs_fuid_info_free(fuidp); 2609 fuidp = NULL; 2610 } 2611 2612 if (err) { 2613 dmu_tx_abort(tx); 2614 if (attrzp) 2615 zrele(attrzp); 2616 if (err == ERESTART) 2617 goto top; 2618 } else { 2619 if (count > 0) 2620 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2621 dmu_tx_commit(tx); 2622 if (attrzp) { 2623 if (err2 == 0 && handle_eadir) 2624 err = zfs_setattr_dir(attrzp); 2625 zrele(attrzp); 2626 } 2627 zfs_znode_update_vfs(zp); 2628 } 2629 2630 out2: 2631 if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS) 2632 err = zil_commit(zilog, 0); 2633 2634 out3: 2635 kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); 2636 kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); 2637 kmem_free(tmpxvattr, sizeof (xvattr_t)); 2638 zfs_exit(zfsvfs, FTAG); 2639 return (err); 2640 } 2641 2642 typedef struct zfs_zlock { 2643 krwlock_t *zl_rwlock; /* lock we acquired */ 2644 znode_t *zl_znode; /* znode we held */ 2645 struct zfs_zlock *zl_next; /* next in list */ 2646 } zfs_zlock_t; 2647 2648 /* 2649 * Drop locks and release vnodes that were held by zfs_rename_lock(). 2650 */ 2651 static void 2652 zfs_rename_unlock(zfs_zlock_t **zlpp) 2653 { 2654 zfs_zlock_t *zl; 2655 2656 while ((zl = *zlpp) != NULL) { 2657 if (zl->zl_znode != NULL) 2658 zfs_zrele_async(zl->zl_znode); 2659 rw_exit(zl->zl_rwlock); 2660 *zlpp = zl->zl_next; 2661 kmem_free(zl, sizeof (*zl)); 2662 } 2663 } 2664 2665 /* 2666 * Search back through the directory tree, using the ".." entries. 2667 * Lock each directory in the chain to prevent concurrent renames. 2668 * Fail any attempt to move a directory into one of its own descendants. 2669 * XXX - z_parent_lock can overlap with map or grow locks 2670 */ 2671 static int 2672 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) 2673 { 2674 zfs_zlock_t *zl; 2675 znode_t *zp = tdzp; 2676 uint64_t rootid = ZTOZSB(zp)->z_root; 2677 uint64_t oidp = zp->z_id; 2678 krwlock_t *rwlp = &szp->z_parent_lock; 2679 krw_t rw = RW_WRITER; 2680 2681 /* 2682 * First pass write-locks szp and compares to zp->z_id. 2683 * Later passes read-lock zp and compare to zp->z_parent. 2684 */ 2685 do { 2686 if (!rw_tryenter(rwlp, rw)) { 2687 /* 2688 * Another thread is renaming in this path. 2689 * Note that if we are a WRITER, we don't have any 2690 * parent_locks held yet. 2691 */ 2692 if (rw == RW_READER && zp->z_id > szp->z_id) { 2693 /* 2694 * Drop our locks and restart 2695 */ 2696 zfs_rename_unlock(&zl); 2697 *zlpp = NULL; 2698 zp = tdzp; 2699 oidp = zp->z_id; 2700 rwlp = &szp->z_parent_lock; 2701 rw = RW_WRITER; 2702 continue; 2703 } else { 2704 /* 2705 * Wait for other thread to drop its locks 2706 */ 2707 rw_enter(rwlp, rw); 2708 } 2709 } 2710 2711 zl = kmem_alloc(sizeof (*zl), KM_SLEEP); 2712 zl->zl_rwlock = rwlp; 2713 zl->zl_znode = NULL; 2714 zl->zl_next = *zlpp; 2715 *zlpp = zl; 2716 2717 if (oidp == szp->z_id) /* We're a descendant of szp */ 2718 return (SET_ERROR(EINVAL)); 2719 2720 if (oidp == rootid) /* We've hit the top */ 2721 return (0); 2722 2723 if (rw == RW_READER) { /* i.e. not the first pass */ 2724 int error = zfs_zget(ZTOZSB(zp), oidp, &zp); 2725 if (error) 2726 return (error); 2727 zl->zl_znode = zp; 2728 } 2729 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), 2730 &oidp, sizeof (oidp)); 2731 rwlp = &zp->z_parent_lock; 2732 rw = RW_READER; 2733 2734 } while (zp->z_id != sdzp->z_id); 2735 2736 return (0); 2737 } 2738 2739 /* 2740 * Move an entry from the provided source directory to the target 2741 * directory. Change the entry name as indicated. 2742 * 2743 * IN: sdzp - Source directory containing the "old entry". 2744 * snm - Old entry name. 2745 * tdzp - Target directory to contain the "new entry". 2746 * tnm - New entry name. 2747 * cr - credentials of caller. 2748 * flags - case flags 2749 * rflags - RENAME_* flags 2750 * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). 2751 * mnt_ns - user namespace of the mount 2752 * 2753 * RETURN: 0 on success, error code on failure. 2754 * 2755 * Timestamps: 2756 * sdzp,tdzp - ctime|mtime updated 2757 */ 2758 int 2759 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, 2760 cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) 2761 { 2762 znode_t *szp, *tzp; 2763 zfsvfs_t *zfsvfs = ZTOZSB(sdzp); 2764 zilog_t *zilog; 2765 zfs_dirlock_t *sdl, *tdl; 2766 dmu_tx_t *tx; 2767 zfs_zlock_t *zl; 2768 int cmp, serr, terr; 2769 int error = 0; 2770 int zflg = 0; 2771 boolean_t waited = B_FALSE; 2772 /* Needed for whiteout inode creation. */ 2773 boolean_t fuid_dirtied; 2774 zfs_acl_ids_t acl_ids; 2775 boolean_t have_acl = B_FALSE; 2776 znode_t *wzp = NULL; 2777 2778 2779 if (snm == NULL || tnm == NULL) 2780 return (SET_ERROR(EINVAL)); 2781 2782 if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2783 return (SET_ERROR(EINVAL)); 2784 2785 /* Already checked by Linux VFS, but just to make sure. */ 2786 if (rflags & RENAME_EXCHANGE && 2787 (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) 2788 return (SET_ERROR(EINVAL)); 2789 2790 /* 2791 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the 2792 * right kind of vattr_t for the whiteout file. These are set 2793 * internally by ZFS so should never be incorrect. 2794 */ 2795 VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); 2796 VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); 2797 VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); 2798 2799 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) 2800 return (error); 2801 zilog = zfsvfs->z_log; 2802 2803 if ((error = zfs_verify_zp(tdzp)) != 0) { 2804 zfs_exit(zfsvfs, FTAG); 2805 return (error); 2806 } 2807 2808 /* 2809 * We check i_sb because snapshots and the ctldir must have different 2810 * super blocks. 2811 */ 2812 if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || 2813 zfsctl_is_node(ZTOI(tdzp))) { 2814 zfs_exit(zfsvfs, FTAG); 2815 return (SET_ERROR(EXDEV)); 2816 } 2817 2818 if (zfsvfs->z_utf8 && u8_validate(tnm, 2819 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2820 zfs_exit(zfsvfs, FTAG); 2821 return (SET_ERROR(EILSEQ)); 2822 } 2823 2824 if (flags & FIGNORECASE) 2825 zflg |= ZCILOOK; 2826 2827 top: 2828 szp = NULL; 2829 tzp = NULL; 2830 zl = NULL; 2831 2832 /* 2833 * This is to prevent the creation of links into attribute space 2834 * by renaming a linked file into/outof an attribute directory. 2835 * See the comment in zfs_link() for why this is considered bad. 2836 */ 2837 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 2838 zfs_exit(zfsvfs, FTAG); 2839 return (SET_ERROR(EINVAL)); 2840 } 2841 2842 /* 2843 * Lock source and target directory entries. To prevent deadlock, 2844 * a lock ordering must be defined. We lock the directory with 2845 * the smallest object id first, or if it's a tie, the one with 2846 * the lexically first name. 2847 */ 2848 if (sdzp->z_id < tdzp->z_id) { 2849 cmp = -1; 2850 } else if (sdzp->z_id > tdzp->z_id) { 2851 cmp = 1; 2852 } else { 2853 /* 2854 * First compare the two name arguments without 2855 * considering any case folding. 2856 */ 2857 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); 2858 2859 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); 2860 ASSERT(error == 0 || !zfsvfs->z_utf8); 2861 if (cmp == 0) { 2862 /* 2863 * POSIX: "If the old argument and the new argument 2864 * both refer to links to the same existing file, 2865 * the rename() function shall return successfully 2866 * and perform no other action." 2867 */ 2868 zfs_exit(zfsvfs, FTAG); 2869 return (0); 2870 } 2871 /* 2872 * If the file system is case-folding, then we may 2873 * have some more checking to do. A case-folding file 2874 * system is either supporting mixed case sensitivity 2875 * access or is completely case-insensitive. Note 2876 * that the file system is always case preserving. 2877 * 2878 * In mixed sensitivity mode case sensitive behavior 2879 * is the default. FIGNORECASE must be used to 2880 * explicitly request case insensitive behavior. 2881 * 2882 * If the source and target names provided differ only 2883 * by case (e.g., a request to rename 'tim' to 'Tim'), 2884 * we will treat this as a special case in the 2885 * case-insensitive mode: as long as the source name 2886 * is an exact match, we will allow this to proceed as 2887 * a name-change request. 2888 */ 2889 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 2890 (zfsvfs->z_case == ZFS_CASE_MIXED && 2891 flags & FIGNORECASE)) && 2892 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, 2893 &error) == 0) { 2894 /* 2895 * case preserving rename request, require exact 2896 * name matches 2897 */ 2898 zflg |= ZCIEXACT; 2899 zflg &= ~ZCILOOK; 2900 } 2901 } 2902 2903 /* 2904 * If the source and destination directories are the same, we should 2905 * grab the z_name_lock of that directory only once. 2906 */ 2907 if (sdzp == tdzp) { 2908 zflg |= ZHAVELOCK; 2909 rw_enter(&sdzp->z_name_lock, RW_READER); 2910 } 2911 2912 if (cmp < 0) { 2913 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, 2914 ZEXISTS | zflg, NULL, NULL); 2915 terr = zfs_dirent_lock(&tdl, 2916 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); 2917 } else { 2918 terr = zfs_dirent_lock(&tdl, 2919 tdzp, tnm, &tzp, zflg, NULL, NULL); 2920 serr = zfs_dirent_lock(&sdl, 2921 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, 2922 NULL, NULL); 2923 } 2924 2925 if (serr) { 2926 /* 2927 * Source entry invalid or not there. 2928 */ 2929 if (!terr) { 2930 zfs_dirent_unlock(tdl); 2931 if (tzp) 2932 zrele(tzp); 2933 } 2934 2935 if (sdzp == tdzp) 2936 rw_exit(&sdzp->z_name_lock); 2937 2938 if (strcmp(snm, "..") == 0) 2939 serr = EINVAL; 2940 zfs_exit(zfsvfs, FTAG); 2941 return (serr); 2942 } 2943 if (terr) { 2944 zfs_dirent_unlock(sdl); 2945 zrele(szp); 2946 2947 if (sdzp == tdzp) 2948 rw_exit(&sdzp->z_name_lock); 2949 2950 if (strcmp(tnm, "..") == 0) 2951 terr = EINVAL; 2952 zfs_exit(zfsvfs, FTAG); 2953 return (terr); 2954 } 2955 2956 /* 2957 * If we are using project inheritance, means if the directory has 2958 * ZFS_PROJINHERIT set, then its descendant directories will inherit 2959 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 2960 * such case, we only allow renames into our tree when the project 2961 * IDs are the same. 2962 */ 2963 if (tdzp->z_pflags & ZFS_PROJINHERIT && 2964 tdzp->z_projid != szp->z_projid) { 2965 error = SET_ERROR(EXDEV); 2966 goto out; 2967 } 2968 2969 /* 2970 * Must have write access at the source to remove the old entry 2971 * and write access at the target to create the new entry. 2972 * Note that if target and source are the same, this can be 2973 * done in a single check. 2974 */ 2975 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) 2976 goto out; 2977 2978 if (S_ISDIR(ZTOI(szp)->i_mode)) { 2979 /* 2980 * Check to make sure rename is valid. 2981 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 2982 */ 2983 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) 2984 goto out; 2985 } 2986 2987 /* 2988 * Does target exist? 2989 */ 2990 if (tzp) { 2991 if (rflags & RENAME_NOREPLACE) { 2992 error = SET_ERROR(EEXIST); 2993 goto out; 2994 } 2995 /* 2996 * Source and target must be the same type (unless exchanging). 2997 */ 2998 if (!(rflags & RENAME_EXCHANGE)) { 2999 boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; 3000 boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; 3001 3002 if (s_is_dir != t_is_dir) { 3003 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); 3004 goto out; 3005 } 3006 } 3007 /* 3008 * POSIX dictates that when the source and target 3009 * entries refer to the same file object, rename 3010 * must do nothing and exit without error. 3011 */ 3012 if (szp->z_id == tzp->z_id) { 3013 error = 0; 3014 goto out; 3015 } 3016 } else if (rflags & RENAME_EXCHANGE) { 3017 /* Target must exist for RENAME_EXCHANGE. */ 3018 error = SET_ERROR(ENOENT); 3019 goto out; 3020 } 3021 3022 /* Set up inode creation for RENAME_WHITEOUT. */ 3023 if (rflags & RENAME_WHITEOUT) { 3024 /* 3025 * Whiteout files are not regular files or directories, so to 3026 * match zfs_create() we do not inherit the project id. 3027 */ 3028 uint64_t wo_projid = ZFS_DEFAULT_PROJID; 3029 3030 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); 3031 if (error) 3032 goto out; 3033 3034 if (!have_acl) { 3035 error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, 3036 &acl_ids, mnt_ns); 3037 if (error) 3038 goto out; 3039 have_acl = B_TRUE; 3040 } 3041 3042 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { 3043 error = SET_ERROR(EDQUOT); 3044 goto out; 3045 } 3046 } 3047 3048 tx = dmu_tx_create(zfsvfs->z_os); 3049 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3050 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3051 dmu_tx_hold_zap(tx, sdzp->z_id, 3052 (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); 3053 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 3054 if (sdzp != tdzp) { 3055 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 3056 zfs_sa_upgrade_txholds(tx, tdzp); 3057 } 3058 if (tzp) { 3059 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 3060 zfs_sa_upgrade_txholds(tx, tzp); 3061 } 3062 if (rflags & RENAME_WHITEOUT) { 3063 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3064 ZFS_SA_BASE_ATTR_SIZE); 3065 3066 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); 3067 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 3068 if (!zfsvfs->z_use_sa && 3069 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3070 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3071 0, acl_ids.z_aclp->z_acl_bytes); 3072 } 3073 } 3074 fuid_dirtied = zfsvfs->z_fuid_dirty; 3075 if (fuid_dirtied) 3076 zfs_fuid_txhold(zfsvfs, tx); 3077 zfs_sa_upgrade_txholds(tx, szp); 3078 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3079 error = dmu_tx_assign(tx, 3080 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 3081 if (error) { 3082 if (zl != NULL) 3083 zfs_rename_unlock(&zl); 3084 zfs_dirent_unlock(sdl); 3085 zfs_dirent_unlock(tdl); 3086 3087 if (sdzp == tdzp) 3088 rw_exit(&sdzp->z_name_lock); 3089 3090 if (error == ERESTART) { 3091 waited = B_TRUE; 3092 dmu_tx_wait(tx); 3093 dmu_tx_abort(tx); 3094 zrele(szp); 3095 if (tzp) 3096 zrele(tzp); 3097 goto top; 3098 } 3099 dmu_tx_abort(tx); 3100 zrele(szp); 3101 if (tzp) 3102 zrele(tzp); 3103 zfs_exit(zfsvfs, FTAG); 3104 return (error); 3105 } 3106 3107 /* 3108 * Unlink the source. 3109 */ 3110 szp->z_pflags |= ZFS_AV_MODIFIED; 3111 if (tdzp->z_pflags & ZFS_PROJINHERIT) 3112 szp->z_pflags |= ZFS_PROJINHERIT; 3113 3114 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3115 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 3116 VERIFY0(error); 3117 3118 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); 3119 if (error) 3120 goto commit; 3121 3122 /* 3123 * Unlink the target. 3124 */ 3125 if (tzp) { 3126 int tzflg = zflg; 3127 3128 if (rflags & RENAME_EXCHANGE) { 3129 /* This inode will be re-linked soon. */ 3130 tzflg |= ZRENAMING; 3131 3132 tzp->z_pflags |= ZFS_AV_MODIFIED; 3133 if (sdzp->z_pflags & ZFS_PROJINHERIT) 3134 tzp->z_pflags |= ZFS_PROJINHERIT; 3135 3136 error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 3137 (void *)&tzp->z_pflags, sizeof (uint64_t), tx); 3138 ASSERT0(error); 3139 } 3140 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); 3141 if (error) 3142 goto commit_link_szp; 3143 } 3144 3145 /* 3146 * Create the new target links: 3147 * * We always link the target. 3148 * * RENAME_EXCHANGE: Link the old target to the source. 3149 * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. 3150 */ 3151 error = zfs_link_create(tdl, szp, tx, ZRENAMING); 3152 if (error) { 3153 /* 3154 * If we have removed the existing target, a subsequent call to 3155 * zfs_link_create() to add back the same entry, but with a new 3156 * dnode (szp), should not fail. 3157 */ 3158 ASSERT0P(tzp); 3159 goto commit_link_tzp; 3160 } 3161 3162 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3163 case RENAME_EXCHANGE: 3164 error = zfs_link_create(sdl, tzp, tx, ZRENAMING); 3165 /* 3166 * The same argument as zfs_link_create() failing for 3167 * szp applies here, since the source directory must 3168 * have had an entry we are replacing. 3169 */ 3170 ASSERT0(error); 3171 if (error) 3172 goto commit_unlink_td_szp; 3173 break; 3174 case RENAME_WHITEOUT: 3175 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); 3176 error = zfs_link_create(sdl, wzp, tx, ZNEW); 3177 if (error) { 3178 zfs_znode_delete(wzp, tx); 3179 remove_inode_hash(ZTOI(wzp)); 3180 goto commit_unlink_td_szp; 3181 } 3182 break; 3183 } 3184 3185 if (fuid_dirtied) 3186 zfs_fuid_sync(zfsvfs, tx); 3187 3188 switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { 3189 case RENAME_EXCHANGE: 3190 zfs_log_rename_exchange(zilog, tx, 3191 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3192 tdzp, tdl->dl_name, szp); 3193 break; 3194 case RENAME_WHITEOUT: 3195 zfs_log_rename_whiteout(zilog, tx, 3196 (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, 3197 tdzp, tdl->dl_name, szp, wzp); 3198 break; 3199 default: 3200 ASSERT0(rflags & ~RENAME_NOREPLACE); 3201 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), 3202 sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); 3203 break; 3204 } 3205 3206 commit: 3207 dmu_tx_commit(tx); 3208 out: 3209 if (have_acl) 3210 zfs_acl_ids_free(&acl_ids); 3211 3212 zfs_znode_update_vfs(sdzp); 3213 if (sdzp == tdzp) 3214 rw_exit(&sdzp->z_name_lock); 3215 3216 if (sdzp != tdzp) 3217 zfs_znode_update_vfs(tdzp); 3218 3219 zfs_znode_update_vfs(szp); 3220 zrele(szp); 3221 if (wzp) { 3222 zfs_znode_update_vfs(wzp); 3223 zrele(wzp); 3224 } 3225 if (tzp) { 3226 zfs_znode_update_vfs(tzp); 3227 zrele(tzp); 3228 } 3229 3230 if (zl != NULL) 3231 zfs_rename_unlock(&zl); 3232 3233 zfs_dirent_unlock(sdl); 3234 zfs_dirent_unlock(tdl); 3235 3236 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3237 error = zil_commit(zilog, 0); 3238 3239 zfs_exit(zfsvfs, FTAG); 3240 return (error); 3241 3242 /* 3243 * Clean-up path for broken link state. 3244 * 3245 * At this point we are in a (very) bad state, so we need to do our 3246 * best to correct the state. In particular, all of the nlinks are 3247 * wrong because we were destroying and creating links with ZRENAMING. 3248 * 3249 * In some form, all of these operations have to resolve the state: 3250 * 3251 * * link_destroy() *must* succeed. Fortunately, this is very likely 3252 * since we only just created it. 3253 * 3254 * * link_create()s are allowed to fail (though they shouldn't because 3255 * we only just unlinked them and are putting the entries back 3256 * during clean-up). But if they fail, we can just forcefully drop 3257 * the nlink value to (at the very least) avoid broken nlink values 3258 * -- though in the case of non-empty directories we will have to 3259 * panic (otherwise we'd have a leaked directory with a broken ..). 3260 */ 3261 commit_unlink_td_szp: 3262 VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); 3263 commit_link_tzp: 3264 if (tzp) { 3265 if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) 3266 VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); 3267 } 3268 commit_link_szp: 3269 if (zfs_link_create(sdl, szp, tx, ZRENAMING)) 3270 VERIFY0(zfs_drop_nlink(szp, tx, NULL)); 3271 goto commit; 3272 } 3273 3274 /* 3275 * Insert the indicated symbolic reference entry into the directory. 3276 * 3277 * IN: dzp - Directory to contain new symbolic link. 3278 * name - Name of directory entry in dip. 3279 * vap - Attributes of new entry. 3280 * link - Name for new symlink entry. 3281 * cr - credentials of caller. 3282 * flags - case flags 3283 * mnt_ns - user namespace of the mount 3284 * 3285 * OUT: zpp - Znode for new symbolic link. 3286 * 3287 * RETURN: 0 on success, error code on failure. 3288 * 3289 * Timestamps: 3290 * dip - ctime|mtime updated 3291 */ 3292 int 3293 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, 3294 znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) 3295 { 3296 znode_t *zp; 3297 zfs_dirlock_t *dl; 3298 dmu_tx_t *tx; 3299 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 3300 zilog_t *zilog; 3301 uint64_t len = strlen(link); 3302 int error; 3303 int zflg = ZNEW; 3304 zfs_acl_ids_t acl_ids; 3305 boolean_t fuid_dirtied; 3306 uint64_t txtype = TX_SYMLINK; 3307 boolean_t waited = B_FALSE; 3308 3309 ASSERT(S_ISLNK(vap->va_mode)); 3310 3311 if (name == NULL) 3312 return (SET_ERROR(EINVAL)); 3313 3314 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) 3315 return (error); 3316 zilog = zfsvfs->z_log; 3317 3318 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 3319 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3320 zfs_exit(zfsvfs, FTAG); 3321 return (SET_ERROR(EILSEQ)); 3322 } 3323 if (flags & FIGNORECASE) 3324 zflg |= ZCILOOK; 3325 3326 if (len > MAXPATHLEN) { 3327 zfs_exit(zfsvfs, FTAG); 3328 return (SET_ERROR(ENAMETOOLONG)); 3329 } 3330 3331 if ((error = zfs_acl_ids_create(dzp, 0, 3332 vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { 3333 zfs_exit(zfsvfs, FTAG); 3334 return (error); 3335 } 3336 top: 3337 *zpp = NULL; 3338 3339 /* 3340 * Attempt to lock directory; fail if entry already exists. 3341 */ 3342 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); 3343 if (error) { 3344 zfs_acl_ids_free(&acl_ids); 3345 zfs_exit(zfsvfs, FTAG); 3346 return (error); 3347 } 3348 3349 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { 3350 zfs_acl_ids_free(&acl_ids); 3351 zfs_dirent_unlock(dl); 3352 zfs_exit(zfsvfs, FTAG); 3353 return (error); 3354 } 3355 3356 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { 3357 zfs_acl_ids_free(&acl_ids); 3358 zfs_dirent_unlock(dl); 3359 zfs_exit(zfsvfs, FTAG); 3360 return (SET_ERROR(EDQUOT)); 3361 } 3362 tx = dmu_tx_create(zfsvfs->z_os); 3363 fuid_dirtied = zfsvfs->z_fuid_dirty; 3364 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 3365 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 3366 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 3367 ZFS_SA_BASE_ATTR_SIZE + len); 3368 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 3369 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3370 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 3371 acl_ids.z_aclp->z_acl_bytes); 3372 } 3373 if (fuid_dirtied) 3374 zfs_fuid_txhold(zfsvfs, tx); 3375 error = dmu_tx_assign(tx, 3376 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 3377 if (error) { 3378 zfs_dirent_unlock(dl); 3379 if (error == ERESTART) { 3380 waited = B_TRUE; 3381 dmu_tx_wait(tx); 3382 dmu_tx_abort(tx); 3383 goto top; 3384 } 3385 zfs_acl_ids_free(&acl_ids); 3386 dmu_tx_abort(tx); 3387 zfs_exit(zfsvfs, FTAG); 3388 return (error); 3389 } 3390 3391 /* 3392 * Create a new object for the symlink. 3393 * for version 4 ZPL datasets the symlink will be an SA attribute 3394 */ 3395 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 3396 3397 if (fuid_dirtied) 3398 zfs_fuid_sync(zfsvfs, tx); 3399 3400 mutex_enter(&zp->z_lock); 3401 if (zp->z_is_sa) 3402 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 3403 link, len, tx); 3404 else 3405 zfs_sa_symlink(zp, link, len, tx); 3406 mutex_exit(&zp->z_lock); 3407 3408 zp->z_size = len; 3409 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 3410 &zp->z_size, sizeof (zp->z_size), tx); 3411 /* 3412 * Insert the new object into the directory. 3413 */ 3414 error = zfs_link_create(dl, zp, tx, ZNEW); 3415 if (error != 0) { 3416 zfs_znode_delete(zp, tx); 3417 remove_inode_hash(ZTOI(zp)); 3418 } else { 3419 if (flags & FIGNORECASE) 3420 txtype |= TX_CI; 3421 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 3422 3423 zfs_znode_update_vfs(dzp); 3424 zfs_znode_update_vfs(zp); 3425 } 3426 3427 zfs_acl_ids_free(&acl_ids); 3428 3429 dmu_tx_commit(tx); 3430 3431 zfs_dirent_unlock(dl); 3432 3433 if (error == 0) { 3434 *zpp = zp; 3435 3436 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3437 error = zil_commit(zilog, 0); 3438 } else { 3439 zrele(zp); 3440 } 3441 3442 zfs_exit(zfsvfs, FTAG); 3443 return (error); 3444 } 3445 3446 /* 3447 * Return, in the buffer contained in the provided uio structure, 3448 * the symbolic path referred to by ip. 3449 * 3450 * IN: ip - inode of symbolic link 3451 * uio - structure to contain the link path. 3452 * cr - credentials of caller. 3453 * 3454 * RETURN: 0 if success 3455 * error code if failure 3456 * 3457 * Timestamps: 3458 * ip - atime updated 3459 */ 3460 int 3461 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) 3462 { 3463 (void) cr; 3464 znode_t *zp = ITOZ(ip); 3465 zfsvfs_t *zfsvfs = ITOZSB(ip); 3466 int error; 3467 3468 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3469 return (error); 3470 3471 mutex_enter(&zp->z_lock); 3472 if (zp->z_is_sa) 3473 error = sa_lookup_uio(zp->z_sa_hdl, 3474 SA_ZPL_SYMLINK(zfsvfs), uio); 3475 else 3476 error = zfs_sa_readlink(zp, uio); 3477 mutex_exit(&zp->z_lock); 3478 3479 zfs_exit(zfsvfs, FTAG); 3480 return (error); 3481 } 3482 3483 /* 3484 * Insert a new entry into directory tdzp referencing szp. 3485 * 3486 * IN: tdzp - Directory to contain new entry. 3487 * szp - znode of new entry. 3488 * name - name of new entry. 3489 * cr - credentials of caller. 3490 * flags - case flags. 3491 * 3492 * RETURN: 0 if success 3493 * error code if failure 3494 * 3495 * Timestamps: 3496 * tdzp - ctime|mtime updated 3497 * szp - ctime updated 3498 */ 3499 int 3500 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, 3501 int flags) 3502 { 3503 struct inode *sip = ZTOI(szp); 3504 znode_t *tzp; 3505 zfsvfs_t *zfsvfs = ZTOZSB(tdzp); 3506 zilog_t *zilog; 3507 zfs_dirlock_t *dl; 3508 dmu_tx_t *tx; 3509 int error; 3510 int zf = ZNEW; 3511 uint64_t parent; 3512 uid_t owner; 3513 boolean_t waited = B_FALSE; 3514 boolean_t is_tmpfile = 0; 3515 uint64_t txg; 3516 3517 is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); 3518 3519 ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); 3520 3521 if (name == NULL) 3522 return (SET_ERROR(EINVAL)); 3523 3524 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) 3525 return (error); 3526 zilog = zfsvfs->z_log; 3527 3528 /* 3529 * POSIX dictates that we return EPERM here. 3530 * Better choices include ENOTSUP or EISDIR. 3531 */ 3532 if (S_ISDIR(sip->i_mode)) { 3533 zfs_exit(zfsvfs, FTAG); 3534 return (SET_ERROR(EPERM)); 3535 } 3536 3537 if ((error = zfs_verify_zp(szp)) != 0) { 3538 zfs_exit(zfsvfs, FTAG); 3539 return (error); 3540 } 3541 3542 /* 3543 * If we are using project inheritance, means if the directory has 3544 * ZFS_PROJINHERIT set, then its descendant directories will inherit 3545 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under 3546 * such case, we only allow hard link creation in our tree when the 3547 * project IDs are the same. 3548 */ 3549 if (tdzp->z_pflags & ZFS_PROJINHERIT && 3550 tdzp->z_projid != szp->z_projid) { 3551 zfs_exit(zfsvfs, FTAG); 3552 return (SET_ERROR(EXDEV)); 3553 } 3554 3555 /* 3556 * We check i_sb because snapshots and the ctldir must have different 3557 * super blocks. 3558 */ 3559 if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { 3560 zfs_exit(zfsvfs, FTAG); 3561 return (SET_ERROR(EXDEV)); 3562 } 3563 3564 /* Prevent links to .zfs/shares files */ 3565 3566 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 3567 &parent, sizeof (uint64_t))) != 0) { 3568 zfs_exit(zfsvfs, FTAG); 3569 return (error); 3570 } 3571 if (parent == zfsvfs->z_shares_dir) { 3572 zfs_exit(zfsvfs, FTAG); 3573 return (SET_ERROR(EPERM)); 3574 } 3575 3576 if (zfsvfs->z_utf8 && u8_validate(name, 3577 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 3578 zfs_exit(zfsvfs, FTAG); 3579 return (SET_ERROR(EILSEQ)); 3580 } 3581 if (flags & FIGNORECASE) 3582 zf |= ZCILOOK; 3583 3584 /* 3585 * We do not support links between attributes and non-attributes 3586 * because of the potential security risk of creating links 3587 * into "normal" file space in order to circumvent restrictions 3588 * imposed in attribute space. 3589 */ 3590 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { 3591 zfs_exit(zfsvfs, FTAG); 3592 return (SET_ERROR(EINVAL)); 3593 } 3594 3595 owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), 3596 cr, ZFS_OWNER); 3597 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { 3598 zfs_exit(zfsvfs, FTAG); 3599 return (SET_ERROR(EPERM)); 3600 } 3601 3602 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, 3603 zfs_init_idmap))) { 3604 zfs_exit(zfsvfs, FTAG); 3605 return (error); 3606 } 3607 3608 top: 3609 /* 3610 * Attempt to lock directory; fail if entry already exists. 3611 */ 3612 error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); 3613 if (error) { 3614 zfs_exit(zfsvfs, FTAG); 3615 return (error); 3616 } 3617 3618 tx = dmu_tx_create(zfsvfs->z_os); 3619 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 3620 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); 3621 if (is_tmpfile) 3622 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 3623 3624 zfs_sa_upgrade_txholds(tx, szp); 3625 zfs_sa_upgrade_txholds(tx, tdzp); 3626 error = dmu_tx_assign(tx, 3627 (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); 3628 if (error) { 3629 zfs_dirent_unlock(dl); 3630 if (error == ERESTART) { 3631 waited = B_TRUE; 3632 dmu_tx_wait(tx); 3633 dmu_tx_abort(tx); 3634 goto top; 3635 } 3636 dmu_tx_abort(tx); 3637 zfs_exit(zfsvfs, FTAG); 3638 return (error); 3639 } 3640 /* unmark z_unlinked so zfs_link_create will not reject */ 3641 if (is_tmpfile) 3642 szp->z_unlinked = B_FALSE; 3643 error = zfs_link_create(dl, szp, tx, 0); 3644 3645 if (error == 0) { 3646 uint64_t txtype = TX_LINK; 3647 /* 3648 * tmpfile is created to be in z_unlinkedobj, so remove it. 3649 * Also, we don't log in ZIL, because all previous file 3650 * operation on the tmpfile are ignored by ZIL. Instead we 3651 * always wait for txg to sync to make sure all previous 3652 * operation are sync safe. 3653 */ 3654 if (is_tmpfile) { 3655 VERIFY0(zap_remove_int(zfsvfs->z_os, 3656 zfsvfs->z_unlinkedobj, szp->z_id, tx)); 3657 } else { 3658 if (flags & FIGNORECASE) 3659 txtype |= TX_CI; 3660 zfs_log_link(zilog, tx, txtype, tdzp, szp, name); 3661 } 3662 } else if (is_tmpfile) { 3663 /* restore z_unlinked since when linking failed */ 3664 szp->z_unlinked = B_TRUE; 3665 } 3666 txg = dmu_tx_get_txg(tx); 3667 dmu_tx_commit(tx); 3668 3669 zfs_dirent_unlock(dl); 3670 3671 if (error == 0) { 3672 if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3673 error = zil_commit(zilog, 0); 3674 3675 if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 3676 txg_wait_flag_t wait_flags = 3677 spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) == 3678 ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0; 3679 error = txg_wait_synced_flags( 3680 dmu_objset_pool(zfsvfs->z_os), txg, wait_flags); 3681 if (error != 0) { 3682 ASSERT3U(error, ==, ESHUTDOWN); 3683 error = SET_ERROR(EIO); 3684 } 3685 } 3686 } 3687 3688 zfs_znode_update_vfs(tdzp); 3689 zfs_znode_update_vfs(szp); 3690 zfs_exit(zfsvfs, FTAG); 3691 return (error); 3692 } 3693 3694 /* Finish page writeback. */ 3695 static inline void 3696 zfs_page_writeback_done(struct page *pp, int err) 3697 { 3698 if (err != 0) { 3699 /* 3700 * Writeback failed. Re-dirty the page. It was undirtied before 3701 * the IO was issued (in zfs_putpage() or write_cache_pages()). 3702 * The kernel only considers writeback for dirty pages; if we 3703 * don't do this, it is eligible for eviction without being 3704 * written out, which we definitely don't want. 3705 */ 3706 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO 3707 filemap_dirty_folio(page_mapping(pp), page_folio(pp)); 3708 #else 3709 __set_page_dirty_nobuffers(pp); 3710 #endif 3711 } 3712 3713 ClearPageError(pp); 3714 end_page_writeback(pp); 3715 } 3716 3717 /* 3718 * ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage() 3719 * for syncing writes. Called when the ZIL itx has been written to the log or 3720 * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure 3721 * is passed as `err`. 3722 */ 3723 static void 3724 zfs_putpage_commit_cb(void *arg, int err) 3725 { 3726 zfs_page_writeback_done(arg, err); 3727 } 3728 3729 /* 3730 * Push a page out to disk, once the page is on stable storage the 3731 * registered commit callback will be run as notification of completion. 3732 * 3733 * IN: ip - page mapped for inode. 3734 * pp - page to push (page is locked) 3735 * wbc - writeback control data 3736 * for_sync - does the caller intend to wait synchronously for the 3737 * page writeback to complete? 3738 * 3739 * RETURN: 0 if success 3740 * error code if failure 3741 * 3742 * Timestamps: 3743 * ip - ctime|mtime updated 3744 */ 3745 int 3746 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, 3747 boolean_t for_sync) 3748 { 3749 znode_t *zp = ITOZ(ip); 3750 zfsvfs_t *zfsvfs = ITOZSB(ip); 3751 loff_t offset; 3752 loff_t pgoff; 3753 unsigned int pglen; 3754 dmu_tx_t *tx; 3755 caddr_t va; 3756 int err = 0; 3757 uint64_t mtime[2], ctime[2]; 3758 inode_timespec_t tmp_ts; 3759 sa_bulk_attr_t bulk[3]; 3760 int cnt = 0; 3761 struct address_space *mapping; 3762 3763 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 3764 return (err); 3765 3766 ASSERT(PageLocked(pp)); 3767 3768 pgoff = page_offset(pp); /* Page byte-offset in file */ 3769 offset = i_size_read(ip); /* File length in bytes */ 3770 pglen = MIN(PAGE_SIZE, /* Page length in bytes */ 3771 P2ROUNDUP(offset, PAGE_SIZE)-pgoff); 3772 3773 /* Page is beyond end of file */ 3774 if (pgoff >= offset) { 3775 unlock_page(pp); 3776 zfs_exit(zfsvfs, FTAG); 3777 return (0); 3778 } 3779 3780 /* Truncate page length to end of file */ 3781 if (pgoff + pglen > offset) 3782 pglen = offset - pgoff; 3783 3784 #if 0 3785 /* 3786 * FIXME: Allow mmap writes past its quota. The correct fix 3787 * is to register a page_mkwrite() handler to count the page 3788 * against its quota when it is about to be dirtied. 3789 */ 3790 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, 3791 KUID_TO_SUID(ip->i_uid)) || 3792 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, 3793 KGID_TO_SGID(ip->i_gid)) || 3794 (zp->z_projid != ZFS_DEFAULT_PROJID && 3795 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 3796 zp->z_projid))) { 3797 err = EDQUOT; 3798 } 3799 #endif 3800 3801 /* 3802 * The ordering here is critical and must adhere to the following 3803 * rules in order to avoid deadlocking in either zfs_read() or 3804 * zfs_free_range() due to a lock inversion. 3805 * 3806 * 1) The page must be unlocked prior to acquiring the range lock. 3807 * This is critical because zfs_read() calls find_lock_page() 3808 * which may block on the page lock while holding the range lock. 3809 * 3810 * 2) Before setting or clearing write back on a page the range lock 3811 * must be held in order to prevent a lock inversion with the 3812 * zfs_free_range() function. 3813 * 3814 * This presents a problem because upon entering this function the 3815 * page lock is already held. To safely acquire the range lock the 3816 * page lock must be dropped. This creates a window where another 3817 * process could truncate, invalidate, dirty, or write out the page. 3818 * 3819 * Therefore, after successfully reacquiring the range and page locks 3820 * the current page state is checked. In the common case everything 3821 * will be as is expected and it can be written out. However, if 3822 * the page state has changed it must be handled accordingly. 3823 */ 3824 mapping = pp->mapping; 3825 redirty_page_for_writepage(wbc, pp); 3826 unlock_page(pp); 3827 3828 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 3829 pgoff, pglen, RL_WRITER); 3830 lock_page(pp); 3831 3832 /* Page mapping changed or it was no longer dirty, we're done */ 3833 if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { 3834 unlock_page(pp); 3835 zfs_rangelock_exit(lr); 3836 zfs_exit(zfsvfs, FTAG); 3837 return (0); 3838 } 3839 3840 /* Another process started write block if required */ 3841 if (PageWriteback(pp)) { 3842 unlock_page(pp); 3843 zfs_rangelock_exit(lr); 3844 3845 if (wbc->sync_mode != WB_SYNC_NONE) { 3846 if (PageWriteback(pp)) 3847 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT 3848 folio_wait_bit(page_folio(pp), PG_writeback); 3849 #else 3850 wait_on_page_bit(pp, PG_writeback); 3851 #endif 3852 } 3853 3854 zfs_exit(zfsvfs, FTAG); 3855 return (0); 3856 } 3857 3858 /* Clear the dirty flag the required locks are held */ 3859 if (!clear_page_dirty_for_io(pp)) { 3860 unlock_page(pp); 3861 zfs_rangelock_exit(lr); 3862 zfs_exit(zfsvfs, FTAG); 3863 return (0); 3864 } 3865 3866 /* 3867 * Counterpart for redirty_page_for_writepage() above. This page 3868 * was in fact not skipped and should not be counted as if it were. 3869 */ 3870 wbc->pages_skipped--; 3871 set_page_writeback(pp); 3872 unlock_page(pp); 3873 3874 tx = dmu_tx_create(zfsvfs->z_os); 3875 dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); 3876 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3877 zfs_sa_upgrade_txholds(tx, zp); 3878 3879 err = dmu_tx_assign(tx, DMU_TX_WAIT); 3880 if (err != 0) { 3881 dmu_tx_abort(tx); 3882 zfs_page_writeback_done(pp, err); 3883 zfs_rangelock_exit(lr); 3884 zfs_exit(zfsvfs, FTAG); 3885 3886 /* 3887 * Don't return error for an async writeback; we've re-dirtied 3888 * the page so it will be tried again some other time. 3889 */ 3890 return (for_sync ? err : 0); 3891 } 3892 3893 va = kmap(pp); 3894 ASSERT3U(pglen, <=, PAGE_SIZE); 3895 dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx, 3896 DMU_READ_PREFETCH); 3897 kunmap(pp); 3898 3899 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3900 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3901 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, 3902 &zp->z_pflags, 8); 3903 3904 /* Preserve the mtime and ctime provided by the inode */ 3905 tmp_ts = zpl_inode_get_mtime(ip); 3906 ZFS_TIME_ENCODE(&tmp_ts, mtime); 3907 tmp_ts = zpl_inode_get_ctime(ip); 3908 ZFS_TIME_ENCODE(&tmp_ts, ctime); 3909 zp->z_atime_dirty = B_FALSE; 3910 zp->z_seq++; 3911 3912 err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 3913 3914 /* 3915 * A note about for_sync vs wbc->sync_mode. 3916 * 3917 * for_sync indicates that this is a syncing writeback, that is, kernel 3918 * caller expects the data to be durably stored before being notified. 3919 * Often, but not always, the call was triggered by a userspace syncing 3920 * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE 3921 * means that that page should remain "locked" (in the writeback state) 3922 * until it is definitely on disk (ie zil_commit() or spa_sync()). 3923 * Otherwise, we can unlock and return as soon as it is on the 3924 * in-memory ZIL. 3925 * 3926 * wbc->sync_mode has similar meaning. wbc is passed from the kernel to 3927 * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE 3928 * indicates this a regular async writeback (eg a cache eviction) and 3929 * so does not need a durability guarantee, while WB_SYNC_ALL indicates 3930 * a syncing op that must be waited on (by convention, we test for 3931 * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over 3932 * performance should there ever be a new mode that we have not yet 3933 * added support for). 3934 * 3935 * So, why a separate for_sync field? This is because zpl_writepages() 3936 * calls zfs_putpage() multiple times for a single "logical" operation. 3937 * It wants all the individual pages to be for_sync==TRUE ie only 3938 * unlocked once durably stored, but it only wants one call to 3939 * zil_commit() at the very end, once all the pages are synced. So, 3940 * it repurposes sync_mode slightly to indicate who issue and wait for 3941 * the IO: for NONE, the caller to zfs_putpage() will do it, while for 3942 * ALL, zfs_putpage should do it. 3943 * 3944 * Summary: 3945 * for_sync: 0=unlock immediately; 1=unlock once on disk 3946 * sync_mode: NONE=caller will commit; ALL=we will commit 3947 */ 3948 boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE); 3949 3950 /* 3951 * We use for_sync as the "commit" arg to zfs_log_write() (arg 7) 3952 * because it is a policy flag that indicates "someone will call 3953 * zil_commit() soon". for_sync=TRUE means exactly that; the only 3954 * question is whether it will be us, or zpl_writepages(). 3955 */ 3956 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync, 3957 B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp); 3958 3959 if (!for_sync) { 3960 /* 3961 * Async writeback is logged and written to the DMU, so page 3962 * can now be unlocked. 3963 */ 3964 zfs_page_writeback_done(pp, 0); 3965 } 3966 3967 dmu_tx_commit(tx); 3968 3969 zfs_rangelock_exit(lr); 3970 3971 if (need_commit) { 3972 err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW); 3973 if (err != 0) { 3974 zfs_exit(zfsvfs, FTAG); 3975 return (err); 3976 } 3977 } 3978 3979 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); 3980 3981 zfs_exit(zfsvfs, FTAG); 3982 return (err); 3983 } 3984 3985 /* 3986 * Update the system attributes when the inode has been dirtied. For the 3987 * moment we only update the mode, atime, mtime, and ctime. 3988 */ 3989 int 3990 zfs_dirty_inode(struct inode *ip, int flags) 3991 { 3992 znode_t *zp = ITOZ(ip); 3993 zfsvfs_t *zfsvfs = ITOZSB(ip); 3994 dmu_tx_t *tx; 3995 uint64_t mode, atime[2], mtime[2], ctime[2]; 3996 inode_timespec_t tmp_ts; 3997 sa_bulk_attr_t bulk[4]; 3998 int error = 0; 3999 int cnt = 0; 4000 4001 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) 4002 return (0); 4003 4004 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4005 return (error); 4006 4007 #ifdef I_DIRTY_TIME 4008 /* 4009 * This is the lazytime semantic introduced in Linux 4.0 4010 * This flag will only be called from update_time when lazytime is set. 4011 * (Note, I_DIRTY_SYNC will also set if not lazytime) 4012 * Fortunately mtime and ctime are managed within ZFS itself, so we 4013 * only need to dirty atime. 4014 */ 4015 if (flags == I_DIRTY_TIME) { 4016 zp->z_atime_dirty = B_TRUE; 4017 goto out; 4018 } 4019 #endif 4020 4021 tx = dmu_tx_create(zfsvfs->z_os); 4022 4023 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4024 zfs_sa_upgrade_txholds(tx, zp); 4025 4026 error = dmu_tx_assign(tx, DMU_TX_WAIT); 4027 if (error) { 4028 dmu_tx_abort(tx); 4029 goto out; 4030 } 4031 4032 mutex_enter(&zp->z_lock); 4033 zp->z_atime_dirty = B_FALSE; 4034 4035 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 4036 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); 4037 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 4038 SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 4039 4040 /* Preserve the mode, mtime and ctime provided by the inode */ 4041 tmp_ts = zpl_inode_get_atime(ip); 4042 ZFS_TIME_ENCODE(&tmp_ts, atime); 4043 tmp_ts = zpl_inode_get_mtime(ip); 4044 ZFS_TIME_ENCODE(&tmp_ts, mtime); 4045 tmp_ts = zpl_inode_get_ctime(ip); 4046 ZFS_TIME_ENCODE(&tmp_ts, ctime); 4047 mode = ip->i_mode; 4048 4049 zp->z_mode = mode; 4050 4051 error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); 4052 mutex_exit(&zp->z_lock); 4053 4054 dmu_tx_commit(tx); 4055 out: 4056 zfs_exit(zfsvfs, FTAG); 4057 return (error); 4058 } 4059 4060 void 4061 zfs_inactive(struct inode *ip) 4062 { 4063 znode_t *zp = ITOZ(ip); 4064 zfsvfs_t *zfsvfs = ITOZSB(ip); 4065 uint64_t atime[2]; 4066 int error; 4067 int need_unlock = 0; 4068 4069 /* Only read lock if we haven't already write locked, e.g. rollback */ 4070 if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { 4071 need_unlock = 1; 4072 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4073 } 4074 if (zp->z_sa_hdl == NULL) { 4075 if (need_unlock) 4076 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4077 return; 4078 } 4079 4080 if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { 4081 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4082 4083 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4084 zfs_sa_upgrade_txholds(tx, zp); 4085 error = dmu_tx_assign(tx, DMU_TX_WAIT); 4086 if (error) { 4087 dmu_tx_abort(tx); 4088 } else { 4089 inode_timespec_t tmp_atime; 4090 tmp_atime = zpl_inode_get_atime(ip); 4091 ZFS_TIME_ENCODE(&tmp_atime, atime); 4092 mutex_enter(&zp->z_lock); 4093 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4094 (void *)&atime, sizeof (atime), tx); 4095 zp->z_atime_dirty = B_FALSE; 4096 mutex_exit(&zp->z_lock); 4097 dmu_tx_commit(tx); 4098 } 4099 } 4100 4101 zfs_zinactive(zp); 4102 if (need_unlock) 4103 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4104 } 4105 4106 /* 4107 * Fill pages with data from the disk. 4108 */ 4109 static int 4110 zfs_fillpage(struct inode *ip, struct page *pp) 4111 { 4112 znode_t *zp = ITOZ(ip); 4113 zfsvfs_t *zfsvfs = ITOZSB(ip); 4114 loff_t i_size = i_size_read(ip); 4115 u_offset_t io_off = page_offset(pp); 4116 size_t io_len = PAGE_SIZE; 4117 4118 ASSERT3U(io_off, <, i_size); 4119 4120 if (io_off + io_len > i_size) 4121 io_len = i_size - io_off; 4122 4123 void *va = kmap(pp); 4124 int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, 4125 io_len, va, DMU_READ_PREFETCH); 4126 if (io_len != PAGE_SIZE) 4127 memset((char *)va + io_len, 0, PAGE_SIZE - io_len); 4128 kunmap(pp); 4129 4130 if (error) { 4131 /* convert checksum errors into IO errors */ 4132 if (error == ECKSUM) 4133 error = SET_ERROR(EIO); 4134 4135 SetPageError(pp); 4136 ClearPageUptodate(pp); 4137 } else { 4138 ClearPageError(pp); 4139 SetPageUptodate(pp); 4140 } 4141 4142 return (error); 4143 } 4144 4145 /* 4146 * Uses zfs_fillpage to read data from the file and fill the page. 4147 * 4148 * IN: ip - inode of file to get data from. 4149 * pp - page to read 4150 * 4151 * RETURN: 0 on success, error code on failure. 4152 * 4153 * Timestamps: 4154 * vp - atime updated 4155 */ 4156 int 4157 zfs_getpage(struct inode *ip, struct page *pp) 4158 { 4159 zfsvfs_t *zfsvfs = ITOZSB(ip); 4160 znode_t *zp = ITOZ(ip); 4161 int error; 4162 loff_t i_size = i_size_read(ip); 4163 u_offset_t io_off = page_offset(pp); 4164 size_t io_len = PAGE_SIZE; 4165 4166 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4167 return (error); 4168 4169 ASSERT3U(io_off, <, i_size); 4170 4171 if (io_off + io_len > i_size) 4172 io_len = i_size - io_off; 4173 4174 /* 4175 * It is important to hold the rangelock here because it is possible 4176 * a Direct I/O write or block clone might be taking place at the same 4177 * time that a page is being faulted in through filemap_fault(). With 4178 * Direct I/O writes and block cloning db->db_data will be set to NULL 4179 * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the 4180 * rangelock is not held, then there is a race between faulting in a 4181 * page and writing out a Direct I/O write or block cloning. Without 4182 * the rangelock a NULL pointer dereference can occur in 4183 * dmu_read_impl() for db->db_data during the mempcy operation when 4184 * zfs_fillpage() calls dmu_read(). 4185 */ 4186 zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock, 4187 io_off, io_len, RL_READER); 4188 if (lr == NULL) { 4189 /* 4190 * It is important to drop the page lock before grabbing the 4191 * rangelock to avoid another deadlock between here and 4192 * zfs_write() -> update_pages(). update_pages() holds both the 4193 * rangelock and the page lock. 4194 */ 4195 get_page(pp); 4196 unlock_page(pp); 4197 lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, 4198 io_len, RL_READER); 4199 lock_page(pp); 4200 put_page(pp); 4201 } 4202 error = zfs_fillpage(ip, pp); 4203 zfs_rangelock_exit(lr); 4204 4205 if (error == 0) 4206 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); 4207 4208 zfs_exit(zfsvfs, FTAG); 4209 4210 return (error); 4211 } 4212 4213 /* 4214 * Check ZFS specific permissions to memory map a section of a file. 4215 * 4216 * IN: ip - inode of the file to mmap 4217 * off - file offset 4218 * addrp - start address in memory region 4219 * len - length of memory region 4220 * vm_flags- address flags 4221 * 4222 * RETURN: 0 if success 4223 * error code if failure 4224 */ 4225 int 4226 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, 4227 unsigned long vm_flags) 4228 { 4229 (void) addrp; 4230 znode_t *zp = ITOZ(ip); 4231 zfsvfs_t *zfsvfs = ITOZSB(ip); 4232 int error; 4233 4234 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4235 return (error); 4236 4237 if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && 4238 (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { 4239 zfs_exit(zfsvfs, FTAG); 4240 return (SET_ERROR(EPERM)); 4241 } 4242 4243 if ((vm_flags & (VM_READ | VM_EXEC)) && 4244 (zp->z_pflags & ZFS_AV_QUARANTINED)) { 4245 zfs_exit(zfsvfs, FTAG); 4246 return (SET_ERROR(EACCES)); 4247 } 4248 4249 if (off < 0 || len > MAXOFFSET_T - off) { 4250 zfs_exit(zfsvfs, FTAG); 4251 return (SET_ERROR(ENXIO)); 4252 } 4253 4254 zfs_exit(zfsvfs, FTAG); 4255 return (0); 4256 } 4257 4258 /* 4259 * Free or allocate space in a file. Currently, this function only 4260 * supports the `F_FREESP' command. However, this command is somewhat 4261 * misnamed, as its functionality includes the ability to allocate as 4262 * well as free space. 4263 * 4264 * IN: zp - znode of file to free data in. 4265 * cmd - action to take (only F_FREESP supported). 4266 * bfp - section of file to free/alloc. 4267 * flag - current file open mode flags. 4268 * offset - current file offset. 4269 * cr - credentials of caller. 4270 * 4271 * RETURN: 0 on success, error code on failure. 4272 * 4273 * Timestamps: 4274 * zp - ctime|mtime updated 4275 */ 4276 int 4277 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, 4278 offset_t offset, cred_t *cr) 4279 { 4280 (void) offset; 4281 zfsvfs_t *zfsvfs = ZTOZSB(zp); 4282 uint64_t off, len; 4283 int error; 4284 4285 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 4286 return (error); 4287 4288 if (cmd != F_FREESP) { 4289 zfs_exit(zfsvfs, FTAG); 4290 return (SET_ERROR(EINVAL)); 4291 } 4292 4293 /* 4294 * Callers might not be able to detect properly that we are read-only, 4295 * so check it explicitly here. 4296 */ 4297 if (zfs_is_readonly(zfsvfs)) { 4298 zfs_exit(zfsvfs, FTAG); 4299 return (SET_ERROR(EROFS)); 4300 } 4301 4302 if (bfp->l_len < 0) { 4303 zfs_exit(zfsvfs, FTAG); 4304 return (SET_ERROR(EINVAL)); 4305 } 4306 4307 /* 4308 * Permissions aren't checked on Solaris because on this OS 4309 * zfs_space() can only be called with an opened file handle. 4310 * On Linux we can get here through truncate_range() which 4311 * operates directly on inodes, so we need to check access rights. 4312 */ 4313 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 4314 zfs_init_idmap))) { 4315 zfs_exit(zfsvfs, FTAG); 4316 return (error); 4317 } 4318 4319 off = bfp->l_start; 4320 len = bfp->l_len; /* 0 means from off to end of file */ 4321 4322 error = zfs_freesp(zp, off, len, flag, TRUE); 4323 4324 zfs_exit(zfsvfs, FTAG); 4325 return (error); 4326 } 4327 4328 int 4329 zfs_fid(struct inode *ip, fid_t *fidp) 4330 { 4331 znode_t *zp = ITOZ(ip); 4332 zfsvfs_t *zfsvfs = ITOZSB(ip); 4333 uint32_t gen; 4334 uint64_t gen64; 4335 uint64_t object = zp->z_id; 4336 zfid_short_t *zfid; 4337 int size, i, error; 4338 4339 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 4340 return (error); 4341 4342 if (fidp->fid_len < SHORT_FID_LEN) { 4343 fidp->fid_len = SHORT_FID_LEN; 4344 zfs_exit(zfsvfs, FTAG); 4345 return (SET_ERROR(ENOSPC)); 4346 } 4347 4348 if ((error = zfs_verify_zp(zp)) != 0) { 4349 zfs_exit(zfsvfs, FTAG); 4350 return (error); 4351 } 4352 4353 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4354 &gen64, sizeof (uint64_t))) != 0) { 4355 zfs_exit(zfsvfs, FTAG); 4356 return (error); 4357 } 4358 4359 gen = (uint32_t)gen64; 4360 4361 size = SHORT_FID_LEN; 4362 4363 zfid = (zfid_short_t *)fidp; 4364 4365 zfid->zf_len = size; 4366 4367 for (i = 0; i < sizeof (zfid->zf_object); i++) 4368 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4369 4370 /* Must have a non-zero generation number to distinguish from .zfs */ 4371 if (gen == 0) 4372 gen = 1; 4373 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4374 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4375 4376 zfs_exit(zfsvfs, FTAG); 4377 return (0); 4378 } 4379 4380 #if defined(_KERNEL) 4381 EXPORT_SYMBOL(zfs_open); 4382 EXPORT_SYMBOL(zfs_close); 4383 EXPORT_SYMBOL(zfs_lookup); 4384 EXPORT_SYMBOL(zfs_create); 4385 EXPORT_SYMBOL(zfs_tmpfile); 4386 EXPORT_SYMBOL(zfs_remove); 4387 EXPORT_SYMBOL(zfs_mkdir); 4388 EXPORT_SYMBOL(zfs_rmdir); 4389 EXPORT_SYMBOL(zfs_readdir); 4390 EXPORT_SYMBOL(zfs_getattr_fast); 4391 EXPORT_SYMBOL(zfs_setattr); 4392 EXPORT_SYMBOL(zfs_rename); 4393 EXPORT_SYMBOL(zfs_symlink); 4394 EXPORT_SYMBOL(zfs_readlink); 4395 EXPORT_SYMBOL(zfs_link); 4396 EXPORT_SYMBOL(zfs_inactive); 4397 EXPORT_SYMBOL(zfs_space); 4398 EXPORT_SYMBOL(zfs_fid); 4399 EXPORT_SYMBOL(zfs_getpage); 4400 EXPORT_SYMBOL(zfs_putpage); 4401 EXPORT_SYMBOL(zfs_dirty_inode); 4402 EXPORT_SYMBOL(zfs_map); 4403 4404 module_param(zfs_delete_blocks, ulong, 0644); 4405 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); 4406 #endif 4407