1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2017 Nexenta Systems, Inc.
28 * Copyright (c) 2025, Klara, Inc.
29 */
30
31 /* Portions Copyright 2007 Jeremy Teo */
32 /* Portions Copyright 2010 Robert Milkowski */
33
34 #include <sys/param.h>
35 #include <sys/time.h>
36 #include <sys/systm.h>
37 #include <sys/sysmacros.h>
38 #include <sys/resource.h>
39 #include <security/mac/mac_framework.h>
40 #include <sys/vfs.h>
41 #include <sys/endian.h>
42 #include <sys/vm.h>
43 #include <sys/vnode.h>
44 #include <sys/smr.h>
45 #include <sys/dirent.h>
46 #include <sys/file.h>
47 #include <sys/stat.h>
48 #include <sys/kmem.h>
49 #include <sys/taskq.h>
50 #include <sys/uio.h>
51 #include <sys/atomic.h>
52 #include <sys/namei.h>
53 #include <sys/mman.h>
54 #include <sys/cmn_err.h>
55 #include <sys/kdb.h>
56 #include <sys/sysproto.h>
57 #include <sys/errno.h>
58 #include <sys/unistd.h>
59 #include <sys/zfs_dir.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/fs/zfs.h>
62 #include <sys/dmu.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dataset.h>
65 #include <sys/spa.h>
66 #include <sys/txg.h>
67 #include <sys/dbuf.h>
68 #include <sys/zap.h>
69 #include <sys/sa.h>
70 #include <sys/policy.h>
71 #include <sys/sunddi.h>
72 #include <sys/filio.h>
73 #include <sys/sid.h>
74 #include <sys/zfs_ctldir.h>
75 #include <sys/zfs_fuid.h>
76 #include <sys/zfs_quota.h>
77 #include <sys/zfs_sa.h>
78 #include <sys/zfs_rlock.h>
79 #include <sys/zfs_project.h>
80 #include <sys/bio.h>
81 #include <sys/buf.h>
82 #include <sys/sched.h>
83 #include <sys/acl.h>
84 #include <sys/vmmeter.h>
85 #include <vm/vm_param.h>
86 #include <sys/zil.h>
87 #include <sys/zfs_vnops.h>
88 #include <sys/module.h>
89 #include <sys/sysent.h>
90 #include <sys/dmu_impl.h>
91 #include <sys/brt.h>
92 #include <sys/zfeature.h>
93
94 #include <vm/vm_object.h>
95
96 #include <sys/extattr.h>
97 #include <sys/priv.h>
98
99 #ifndef VN_OPEN_INVFS
100 #define VN_OPEN_INVFS 0x0
101 #endif
102
103 VFS_SMR_DECLARE;
104
105 #if __FreeBSD_version >= 1400045
106 typedef uint64_t cookie_t;
107 #else
108 typedef ulong_t cookie_t;
109 #endif
110
111 static int zfs_check_attrname(const char *name);
112
113 /*
114 * Programming rules.
115 *
116 * Each vnode op performs some logical unit of work. To do this, the ZPL must
117 * properly lock its in-core state, create a DMU transaction, do the work,
118 * record this work in the intent log (ZIL), commit the DMU transaction,
119 * and wait for the intent log to commit if it is a synchronous operation.
120 * Moreover, the vnode ops must work in both normal and log replay context.
121 * The ordering of events is important to avoid deadlocks and references
122 * to freed memory. The example below illustrates the following Big Rules:
123 *
124 * (1) A check must be made in each zfs thread for a mounted file system.
125 * This is done avoiding races using zfs_enter(zfsvfs).
126 * A zfs_exit(zfsvfs) is needed before all returns. Any znodes
127 * must be checked with zfs_verify_zp(zp). Both of these macros
128 * can return EIO from the calling function.
129 *
130 * (2) VN_RELE() should always be the last thing except for zil_commit()
131 * (if necessary) and zfs_exit(). This is for 3 reasons:
132 * First, if it's the last reference, the vnode/znode
133 * can be freed, so the zp may point to freed memory. Second, the last
134 * reference will call zfs_zinactive(), which may induce a lot of work --
135 * pushing cached pages (which acquires range locks) and syncing out
136 * cached atime changes. Third, zfs_zinactive() may require a new tx,
137 * which could deadlock the system if you were already holding one.
138 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
139 *
140 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
141 * as they can span dmu_tx_assign() calls.
142 *
143 * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to
144 * dmu_tx_assign(). This is critical because we don't want to block
145 * while holding locks.
146 *
147 * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT.
148 * This reduces lock contention and CPU usage when we must wait (note
149 * that if throughput is constrained by the storage, nearly every
150 * transaction must wait).
151 *
152 * Note, in particular, that if a lock is sometimes acquired before
153 * the tx assigns, and sometimes after (e.g. z_lock), then failing
154 * to use a non-blocking assign can deadlock the system. The scenario:
155 *
156 * Thread A has grabbed a lock before calling dmu_tx_assign().
157 * Thread B is in an already-assigned tx, and blocks for this lock.
158 * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in
159 * txg_wait_open() forever, because the previous txg can't quiesce
160 * until B's tx commits.
161 *
162 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is
163 * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try
164 * again. On subsequent calls to dmu_tx_assign(), pass
165 * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that
166 * this operation has already called dmu_tx_wait(). This will ensure
167 * that we don't retry forever, waiting a short bit each time.
168 *
169 * (5) If the operation succeeded, generate the intent log entry for it
170 * before dropping locks. This ensures that the ordering of events
171 * in the intent log matches the order in which they actually occurred.
172 * During ZIL replay the zfs_log_* functions will update the sequence
173 * number to indicate the zil transaction has replayed.
174 *
175 * (6) At the end of each vnode op, the DMU tx must always commit,
176 * regardless of whether there were any errors.
177 *
178 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
179 * to ensure that synchronous semantics are provided when necessary.
180 *
181 * In general, this is how things should be ordered in each vnode op:
182 *
183 * zfs_enter(zfsvfs); // exit if unmounted
184 * top:
185 * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
186 * rw_enter(...); // grab any other locks you need
187 * tx = dmu_tx_create(...); // get DMU tx
188 * dmu_tx_hold_*(); // hold each object you might modify
189 * error = dmu_tx_assign(tx,
190 * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT);
191 * if (error) {
192 * rw_exit(...); // drop locks
193 * zfs_dirent_unlock(dl); // unlock directory entry
194 * VN_RELE(...); // release held vnodes
195 * if (error == ERESTART) {
196 * waited = B_TRUE;
197 * dmu_tx_wait(tx);
198 * dmu_tx_abort(tx);
199 * goto top;
200 * }
201 * dmu_tx_abort(tx); // abort DMU tx
202 * zfs_exit(zfsvfs); // finished in zfs
203 * return (error); // really out of space
204 * }
205 * error = do_real_work(); // do whatever this VOP does
206 * if (error == 0)
207 * zfs_log_*(...); // on success, make ZIL entry
208 * dmu_tx_commit(tx); // commit DMU tx -- error or not
209 * rw_exit(...); // drop locks
210 * zfs_dirent_unlock(dl); // unlock directory entry
211 * VN_RELE(...); // release held vnodes
212 * zil_commit(zilog, foid); // synchronous when necessary
213 * zfs_exit(zfsvfs); // finished in zfs
214 * return (error); // done, report error
215 */
216 static int
zfs_open(vnode_t ** vpp,int flag,cred_t * cr)217 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
218 {
219 (void) cr;
220 znode_t *zp = VTOZ(*vpp);
221 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
222 int error;
223
224 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
225 return (error);
226
227 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
228 ((flag & FAPPEND) == 0)) {
229 zfs_exit(zfsvfs, FTAG);
230 return (SET_ERROR(EPERM));
231 }
232
233 /*
234 * Keep a count of the synchronous opens in the znode. On first
235 * synchronous open we must convert all previous async transactions
236 * into sync to keep correct ordering.
237 * Skip it for snapshot, as it won't have any transactions.
238 */
239 if (!zfsvfs->z_issnap && (flag & O_SYNC)) {
240 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
241 zil_async_to_sync(zfsvfs->z_log, zp->z_id);
242 }
243
244 zfs_exit(zfsvfs, FTAG);
245 return (0);
246 }
247
248 static int
zfs_close(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr)249 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
250 {
251 (void) offset, (void) cr;
252 znode_t *zp = VTOZ(vp);
253 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
254 int error;
255
256 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
257 return (error);
258
259 /* Decrement the synchronous opens in the znode */
260 if (!zfsvfs->z_issnap && (flag & O_SYNC) && (count == 1))
261 atomic_dec_32(&zp->z_sync_cnt);
262
263 zfs_exit(zfsvfs, FTAG);
264 return (0);
265 }
266
267 static int
zfs_ioctl_getxattr(vnode_t * vp,zfsxattr_t * fsx)268 zfs_ioctl_getxattr(vnode_t *vp, zfsxattr_t *fsx)
269 {
270 znode_t *zp = VTOZ(vp);
271
272 memset(fsx, 0, sizeof (*fsx));
273 fsx->fsx_xflags = (zp->z_pflags & ZFS_PROJINHERIT) ?
274 FS_PROJINHERIT_FL : 0;
275 fsx->fsx_projid = zp->z_projid;
276
277 return (0);
278 }
279
280 static int
zfs_ioctl_setflags(vnode_t * vp,uint32_t ioctl_flags,xvattr_t * xva)281 zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva)
282 {
283 uint64_t zfs_flags = VTOZ(vp)->z_pflags;
284 xoptattr_t *xoap;
285
286 if (ioctl_flags & ~(FS_PROJINHERIT_FL))
287 return (SET_ERROR(EOPNOTSUPP));
288
289 xva_init(xva);
290 xoap = xva_getxoptattr(xva);
291
292 #define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \
293 if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \
294 ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \
295 XVA_SET_REQ(xva, (xflag)); \
296 (xfield) = ((ioctl_flags & (iflag)) != 0); \
297 } \
298 } while (0)
299
300 FLAG_CHANGE(FS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
301 xoap->xoa_projinherit);
302
303 #undef FLAG_CHANGE
304
305 return (0);
306 }
307
308 static int
zfs_ioctl_setxattr(vnode_t * vp,zfsxattr_t * fsx,cred_t * cr)309 zfs_ioctl_setxattr(vnode_t *vp, zfsxattr_t *fsx, cred_t *cr)
310 {
311 znode_t *zp = VTOZ(vp);
312 xvattr_t xva;
313 xoptattr_t *xoap;
314 int err;
315
316 if (!zpl_is_valid_projid(fsx->fsx_projid))
317 return (SET_ERROR(EINVAL));
318
319 err = zfs_ioctl_setflags(vp, fsx->fsx_xflags, &xva);
320 if (err)
321 return (err);
322
323 xoap = xva_getxoptattr(&xva);
324 XVA_SET_REQ(&xva, XAT_PROJID);
325 xoap->xoa_projid = fsx->fsx_projid;
326
327 err = zfs_setattr(zp, (vattr_t *)&xva, 0, cr, NULL);
328
329 return (err);
330 }
331
332 static int
zfs_ioctl(vnode_t * vp,ulong_t com,intptr_t data,int flag,cred_t * cred,int * rvalp)333 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
334 int *rvalp)
335 {
336 (void) flag, (void) cred, (void) rvalp;
337 loff_t off;
338 int error;
339
340 switch (com) {
341 case _FIOFFS:
342 {
343 return (0);
344
345 /*
346 * The following two ioctls are used by bfu. Faking out,
347 * necessary to avoid bfu errors.
348 */
349 }
350 case _FIOGDIO:
351 case _FIOSDIO:
352 {
353 return (0);
354 }
355
356 case F_SEEK_DATA:
357 case F_SEEK_HOLE:
358 {
359 off = *(offset_t *)data;
360 error = vn_lock(vp, LK_SHARED);
361 if (error)
362 return (error);
363 /* offset parameter is in/out */
364 error = zfs_holey(VTOZ(vp), com, &off);
365 VOP_UNLOCK(vp);
366 if (error)
367 return (error);
368 *(offset_t *)data = off;
369 return (0);
370 }
371 case ZFS_IOC_FSGETXATTR: {
372 zfsxattr_t *fsx = (zfsxattr_t *)data;
373 error = vn_lock(vp, LK_SHARED);
374 if (error)
375 return (error);
376 error = zfs_ioctl_getxattr(vp, fsx);
377 VOP_UNLOCK(vp);
378 return (error);
379 }
380 case ZFS_IOC_FSSETXATTR: {
381 zfsxattr_t *fsx = (zfsxattr_t *)data;
382 error = vn_lock(vp, LK_EXCLUSIVE);
383 if (error)
384 return (error);
385 vn_seqc_write_begin(vp);
386 error = zfs_ioctl_setxattr(vp, fsx, cred);
387 vn_seqc_write_end(vp);
388 VOP_UNLOCK(vp);
389 return (error);
390 }
391 case ZFS_IOC_REWRITE: {
392 zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
393 if ((flag & FWRITE) == 0)
394 return (SET_ERROR(EBADF));
395 error = vn_lock(vp, LK_SHARED);
396 if (error)
397 return (error);
398 error = zfs_rewrite(VTOZ(vp), args->off, args->len,
399 args->flags, args->arg);
400 VOP_UNLOCK(vp);
401 return (error);
402 }
403 }
404 return (SET_ERROR(ENOTTY));
405 }
406
407 static vm_page_t
page_busy(vnode_t * vp,int64_t start,int64_t off,int64_t nbytes)408 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
409 {
410 vm_object_t obj;
411 vm_page_t pp;
412 int64_t end;
413
414 /*
415 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
416 * aligned boundaries, if the range is not aligned. As a result a
417 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
418 * It may happen that all DEV_BSIZE subranges are marked clean and thus
419 * the whole page would be considered clean despite have some
420 * dirty data.
421 * For this reason we should shrink the range to DEV_BSIZE aligned
422 * boundaries before calling vm_page_clear_dirty.
423 */
424 end = rounddown2(off + nbytes, DEV_BSIZE);
425 off = roundup2(off, DEV_BSIZE);
426 nbytes = end - off;
427
428 obj = vp->v_object;
429 vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
430 VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
431 VM_ALLOC_IGN_SBUSY);
432 if (pp != NULL) {
433 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
434 vm_object_pip_add(obj, 1);
435 pmap_remove_write(pp);
436 if (nbytes != 0)
437 vm_page_clear_dirty(pp, off, nbytes);
438 }
439 return (pp);
440 }
441
442 static void
page_unbusy(vm_page_t pp)443 page_unbusy(vm_page_t pp)
444 {
445
446 vm_page_sunbusy(pp);
447 vm_object_pip_wakeup(pp->object);
448 }
449
450 static vm_page_t
page_hold(vnode_t * vp,int64_t start)451 page_hold(vnode_t *vp, int64_t start)
452 {
453 vm_object_t obj;
454 vm_page_t m;
455
456 obj = vp->v_object;
457 vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
458 VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
459 VM_ALLOC_NOBUSY);
460 return (m);
461 }
462
463 static void
page_unhold(vm_page_t pp)464 page_unhold(vm_page_t pp)
465 {
466 vm_page_unwire(pp, PQ_ACTIVE);
467 }
468
469 /*
470 * When a file is memory mapped, we must keep the IO data synchronized
471 * between the DMU cache and the memory mapped pages. What this means:
472 *
473 * On Write: If we find a memory mapped page, we write to *both*
474 * the page and the dmu buffer.
475 */
476 void
update_pages(znode_t * zp,int64_t start,int len,objset_t * os)477 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
478 {
479 vm_object_t obj;
480 struct sf_buf *sf;
481 vnode_t *vp = ZTOV(zp);
482 caddr_t va;
483 int off;
484
485 ASSERT3P(vp->v_mount, !=, NULL);
486 obj = vp->v_object;
487 ASSERT3P(obj, !=, NULL);
488
489 off = start & PAGEOFFSET;
490 vm_object_pip_add(obj, 1);
491 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
492 vm_page_t pp;
493 int nbytes = imin(PAGESIZE - off, len);
494
495 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
496 va = zfs_map_page(pp, &sf);
497 (void) dmu_read(os, zp->z_id, start + off, nbytes,
498 va + off, DMU_READ_PREFETCH);
499 zfs_unmap_page(sf);
500 page_unbusy(pp);
501 }
502 len -= nbytes;
503 off = 0;
504 }
505 vm_object_pip_wakeup(obj);
506 }
507
508 /*
509 * Read with UIO_NOCOPY flag means that sendfile(2) requests
510 * ZFS to populate a range of page cache pages with data.
511 *
512 * NOTE: this function could be optimized to pre-allocate
513 * all pages in advance, drain exclusive busy on all of them,
514 * map them into contiguous KVA region and populate them
515 * in one single dmu_read() call.
516 */
517 int
mappedread_sf(znode_t * zp,int nbytes,zfs_uio_t * uio)518 mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
519 {
520 vnode_t *vp = ZTOV(zp);
521 objset_t *os = zp->z_zfsvfs->z_os;
522 struct sf_buf *sf;
523 vm_object_t obj;
524 vm_page_t pp;
525 int64_t start;
526 caddr_t va;
527 int len = nbytes;
528 int error = 0;
529
530 ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY);
531 ASSERT3P(vp->v_mount, !=, NULL);
532 obj = vp->v_object;
533 ASSERT3P(obj, !=, NULL);
534 ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET);
535
536 for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
537 int bytes = MIN(PAGESIZE, len);
538
539 pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
540 VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
541 if (vm_page_none_valid(pp)) {
542 va = zfs_map_page(pp, &sf);
543 error = dmu_read(os, zp->z_id, start, bytes, va,
544 DMU_READ_PREFETCH);
545 if (bytes != PAGESIZE && error == 0)
546 memset(va + bytes, 0, PAGESIZE - bytes);
547 zfs_unmap_page(sf);
548 if (error == 0) {
549 vm_page_valid(pp);
550 vm_page_activate(pp);
551 vm_page_sunbusy(pp);
552 } else {
553 zfs_vmobject_wlock(obj);
554 if (!vm_page_wired(pp) && pp->valid == 0 &&
555 vm_page_busy_tryupgrade(pp))
556 vm_page_free(pp);
557 else {
558 vm_page_deactivate_noreuse(pp);
559 vm_page_sunbusy(pp);
560 }
561 zfs_vmobject_wunlock(obj);
562 }
563 } else {
564 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
565 vm_page_sunbusy(pp);
566 }
567 if (error)
568 break;
569 zfs_uio_advance(uio, bytes);
570 len -= bytes;
571 }
572 return (error);
573 }
574
575 /*
576 * When a file is memory mapped, we must keep the IO data synchronized
577 * between the DMU cache and the memory mapped pages. What this means:
578 *
579 * On Read: We "read" preferentially from memory mapped pages,
580 * else we default from the dmu buffer.
581 *
582 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
583 * the file is memory mapped.
584 */
585 int
mappedread(znode_t * zp,int nbytes,zfs_uio_t * uio)586 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
587 {
588 vnode_t *vp = ZTOV(zp);
589 vm_object_t obj;
590 int64_t start;
591 int len = nbytes;
592 int off;
593 int error = 0;
594
595 ASSERT3P(vp->v_mount, !=, NULL);
596 obj = vp->v_object;
597 ASSERT3P(obj, !=, NULL);
598
599 start = zfs_uio_offset(uio);
600 off = start & PAGEOFFSET;
601 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
602 vm_page_t pp;
603 uint64_t bytes = MIN(PAGESIZE - off, len);
604
605 if ((pp = page_hold(vp, start))) {
606 struct sf_buf *sf;
607 caddr_t va;
608
609 va = zfs_map_page(pp, &sf);
610 error = vn_io_fault_uiomove(va + off, bytes,
611 GET_UIO_STRUCT(uio));
612 zfs_unmap_page(sf);
613 page_unhold(pp);
614 } else {
615 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
616 uio, bytes, DMU_READ_PREFETCH);
617 }
618 len -= bytes;
619 off = 0;
620 if (error)
621 break;
622 }
623 return (error);
624 }
625
626 int
zfs_write_simple(znode_t * zp,const void * data,size_t len,loff_t pos,size_t * presid)627 zfs_write_simple(znode_t *zp, const void *data, size_t len,
628 loff_t pos, size_t *presid)
629 {
630 int error = 0;
631 ssize_t resid;
632
633 error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
634 UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
635
636 if (error) {
637 return (SET_ERROR(error));
638 } else if (presid == NULL) {
639 if (resid != 0) {
640 error = SET_ERROR(EIO);
641 }
642 } else {
643 *presid = resid;
644 }
645 return (error);
646 }
647
648 void
zfs_zrele_async(znode_t * zp)649 zfs_zrele_async(znode_t *zp)
650 {
651 vnode_t *vp = ZTOV(zp);
652 objset_t *os = ITOZSB(vp)->z_os;
653
654 VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
655 }
656
/*
 * Callback passed to vn_vget_ino_gen() by zfs_lookup_lock() for ".."
 * lookups.
 *
 *	IN:	mp	- mount point [UNUSED].
 *		arg	- the vnode to return, already referenced.
 *		lkflags	- lock flags to apply to that vnode.
 *
 *	OUT:	vpp	- set to the vnode passed in arg.
 *
 *	RETURN:	the result of vn_lock(); on failure the vnode reference is
 *		dropped.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *vp = arg;
	int error;

	*vpp = vp;
	error = vn_lock(vp, lkflags);
	if (error != 0)
		vrele(vp);
	return (error);
}
668
669 static int
zfs_lookup_lock(vnode_t * dvp,vnode_t * vp,const char * name,int lkflags)670 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
671 {
672 znode_t *zdp = VTOZ(dvp);
673 zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
674 int error;
675 int ltype;
676
677 if (zfsvfs->z_replay == B_FALSE)
678 ASSERT_VOP_LOCKED(dvp, __func__);
679
680 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
681 ASSERT3P(dvp, ==, vp);
682 vref(dvp);
683 ltype = lkflags & LK_TYPE_MASK;
684 if (ltype != VOP_ISLOCKED(dvp)) {
685 if (ltype == LK_EXCLUSIVE)
686 vn_lock(dvp, LK_UPGRADE | LK_RETRY);
687 else /* if (ltype == LK_SHARED) */
688 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
689
690 /*
691 * Relock for the "." case could leave us with
692 * reclaimed vnode.
693 */
694 if (VN_IS_DOOMED(dvp)) {
695 vrele(dvp);
696 return (SET_ERROR(ENOENT));
697 }
698 }
699 return (0);
700 } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
701 /*
702 * Note that in this case, dvp is the child vnode, and we
703 * are looking up the parent vnode - exactly reverse from
704 * normal operation. Unlocking dvp requires some rather
705 * tricky unlock/relock dance to prevent mp from being freed;
706 * use vn_vget_ino_gen() which takes care of all that.
707 *
708 * XXX Note that there is a time window when both vnodes are
709 * unlocked. It is possible, although highly unlikely, that
710 * during that window the parent-child relationship between
711 * the vnodes may change, for example, get reversed.
712 * In that case we would have a wrong lock order for the vnodes.
713 * All other filesystems seem to ignore this problem, so we
714 * do the same here.
715 * A potential solution could be implemented as follows:
716 * - using LK_NOWAIT when locking the second vnode and retrying
717 * if necessary
718 * - checking that the parent-child relationship still holds
719 * after locking both vnodes and retrying if it doesn't
720 */
721 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
722 return (error);
723 } else {
724 error = vn_lock(vp, lkflags);
725 if (error != 0)
726 vrele(vp);
727 return (error);
728 }
729 }
730
731 /*
732 * Lookup an entry in a directory, or an extended attribute directory.
733 * If it exists, return a held vnode reference for it.
734 *
735 * IN: dvp - vnode of directory to search.
736 * nm - name of entry to lookup.
737 * pnp - full pathname to lookup [UNUSED].
738 * flags - LOOKUP_XATTR set if looking for an attribute.
739 * rdir - root directory vnode [UNUSED].
740 * cr - credentials of caller.
741 * ct - caller context
742 *
743 * OUT: vpp - vnode of located entry, NULL if not found.
744 *
745 * RETURN: 0 on success, error code on failure.
746 *
747 * Timestamps:
748 * NA
749 */
750 static int
zfs_lookup(vnode_t * dvp,const char * nm,vnode_t ** vpp,struct componentname * cnp,int nameiop,cred_t * cr,int flags,boolean_t cached)751 zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
752 struct componentname *cnp, int nameiop, cred_t *cr, int flags,
753 boolean_t cached)
754 {
755 znode_t *zdp = VTOZ(dvp);
756 znode_t *zp;
757 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
758 seqc_t dvp_seqc;
759 int error = 0;
760
761 /*
762 * Fast path lookup, however we must skip DNLC lookup
763 * for case folding or normalizing lookups because the
764 * DNLC code only stores the passed in name. This means
765 * creating 'a' and removing 'A' on a case insensitive
766 * file system would work, but DNLC still thinks 'a'
767 * exists and won't let you create it again on the next
768 * pass through fast path.
769 */
770 if (!(flags & LOOKUP_XATTR)) {
771 if (dvp->v_type != VDIR) {
772 return (SET_ERROR(ENOTDIR));
773 } else if (zdp->z_sa_hdl == NULL) {
774 return (SET_ERROR(EIO));
775 }
776 }
777
778 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
779 const char *, nm);
780
781 if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
782 return (error);
783
784 dvp_seqc = vn_seqc_read_notmodify(dvp);
785
786 *vpp = NULL;
787
788 if (flags & LOOKUP_XATTR) {
789 /*
790 * If the xattr property is off, refuse the lookup request.
791 */
792 if (!(zfsvfs->z_flags & ZSB_XATTR)) {
793 zfs_exit(zfsvfs, FTAG);
794 return (SET_ERROR(EOPNOTSUPP));
795 }
796
797 /*
798 * We don't allow recursive attributes..
799 * Maybe someday we will.
800 */
801 if (zdp->z_pflags & ZFS_XATTR) {
802 zfs_exit(zfsvfs, FTAG);
803 return (SET_ERROR(EINVAL));
804 }
805
806 if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
807 zfs_exit(zfsvfs, FTAG);
808 return (error);
809 }
810 *vpp = ZTOV(zp);
811
812 /*
813 * Do we have permission to get into attribute directory?
814 */
815 if (flags & LOOKUP_NAMED_ATTR)
816 error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR,
817 B_FALSE, cr, NULL);
818 else
819 error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr,
820 NULL);
821 if (error) {
822 vrele(ZTOV(zp));
823 }
824
825 zfs_exit(zfsvfs, FTAG);
826 return (error);
827 }
828
829 /*
830 * Check accessibility of directory if we're not coming in via
831 * VOP_CACHEDLOOKUP.
832 */
833 if (!cached) {
834 #ifdef NOEXECCHECK
835 if ((cnp->cn_flags & NOEXECCHECK) != 0) {
836 cnp->cn_flags &= ~NOEXECCHECK;
837 } else
838 #endif
839 if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
840 NULL))) {
841 zfs_exit(zfsvfs, FTAG);
842 return (error);
843 }
844 }
845
846 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
847 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
848 zfs_exit(zfsvfs, FTAG);
849 return (SET_ERROR(EILSEQ));
850 }
851
852
853 /*
854 * First handle the special cases.
855 */
856 if ((cnp->cn_flags & ISDOTDOT) != 0) {
857 /*
858 * If we are a snapshot mounted under .zfs, return
859 * the vp for the snapshot directory.
860 */
861 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
862 struct componentname cn;
863 vnode_t *zfsctl_vp;
864 int ltype;
865
866 zfs_exit(zfsvfs, FTAG);
867 ltype = VOP_ISLOCKED(dvp);
868 VOP_UNLOCK(dvp);
869 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
870 &zfsctl_vp);
871 if (error == 0) {
872 cn.cn_nameptr = "snapshot";
873 cn.cn_namelen = strlen(cn.cn_nameptr);
874 cn.cn_nameiop = cnp->cn_nameiop;
875 cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
876 cn.cn_lkflags = cnp->cn_lkflags;
877 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
878 vput(zfsctl_vp);
879 }
880 vn_lock(dvp, ltype | LK_RETRY);
881 return (error);
882 }
883 }
884 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
885 zfs_exit(zfsvfs, FTAG);
886 if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED)
887 return (SET_ERROR(ENOENT));
888 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
889 return (SET_ERROR(ENOTSUP));
890 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
891 return (error);
892 }
893
894 /*
895 * The loop is retry the lookup if the parent-child relationship
896 * changes during the dot-dot locking complexities.
897 */
898 for (;;) {
899 uint64_t parent;
900
901 error = zfs_dirlook(zdp, nm, &zp);
902 if (error == 0)
903 *vpp = ZTOV(zp);
904
905 zfs_exit(zfsvfs, FTAG);
906 if (error != 0)
907 break;
908
909 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
910 if (error != 0) {
911 /*
912 * If we've got a locking error, then the vnode
913 * got reclaimed because of a force unmount.
914 * We never enter doomed vnodes into the name cache.
915 */
916 *vpp = NULL;
917 return (error);
918 }
919
920 if ((cnp->cn_flags & ISDOTDOT) == 0)
921 break;
922
923 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) {
924 vput(ZTOV(zp));
925 *vpp = NULL;
926 return (error);
927 }
928 if (zdp->z_sa_hdl == NULL) {
929 error = SET_ERROR(EIO);
930 } else {
931 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
932 &parent, sizeof (parent));
933 }
934 if (error != 0) {
935 zfs_exit(zfsvfs, FTAG);
936 vput(ZTOV(zp));
937 break;
938 }
939 if (zp->z_id == parent) {
940 zfs_exit(zfsvfs, FTAG);
941 break;
942 }
943 vput(ZTOV(zp));
944 }
945
946 if (error != 0)
947 *vpp = NULL;
948
949 /* Translate errors and add SAVENAME when needed. */
950 if (cnp->cn_flags & ISLASTCN) {
951 switch (nameiop) {
952 case CREATE:
953 case RENAME:
954 if (error == ENOENT) {
955 error = EJUSTRETURN;
956 #if __FreeBSD_version < 1400068
957 cnp->cn_flags |= SAVENAME;
958 #endif
959 break;
960 }
961 zfs_fallthrough;
962 case DELETE:
963 #if __FreeBSD_version < 1400068
964 if (error == 0)
965 cnp->cn_flags |= SAVENAME;
966 #endif
967 break;
968 }
969 }
970
971 if ((cnp->cn_flags & ISDOTDOT) != 0) {
972 /*
973 * FIXME: zfs_lookup_lock relocks vnodes and does nothing to
974 * handle races. In particular different callers may end up
975 * with different vnodes and will try to add conflicting
976 * entries to the namecache.
977 *
978 * While finding different result may be acceptable in face
979 * of concurrent modification, adding conflicting entries
980 * trips over an assert in the namecache.
981 *
982 * Ultimately let an entry through once everything settles.
983 */
984 if (!vn_seqc_consistent(dvp, dvp_seqc)) {
985 cnp->cn_flags &= ~MAKEENTRY;
986 }
987 }
988
989 /* Insert name into cache (as non-existent) if appropriate. */
990 if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
991 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
992 cache_enter(dvp, NULL, cnp);
993
994 /* Insert name into cache if appropriate. */
995 if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
996 error == 0 && (cnp->cn_flags & MAKEENTRY)) {
997 if (!(cnp->cn_flags & ISLASTCN) ||
998 (nameiop != DELETE && nameiop != RENAME)) {
999 cache_enter(dvp, *vpp, cnp);
1000 }
1001 }
1002
1003 return (error);
1004 }
1005
1006 static inline bool
is_nametoolong(zfsvfs_t * zfsvfs,const char * name)1007 is_nametoolong(zfsvfs_t *zfsvfs, const char *name)
1008 {
1009 size_t dlen = strlen(name);
1010 return ((!zfsvfs->z_longname && dlen >= ZAP_MAXNAMELEN) ||
1011 dlen >= ZAP_MAXNAMELEN_NEW);
1012 }
1013
1014 /*
1015 * Attempt to create a new entry in a directory. If the entry
1016 * already exists, truncate the file if permissible, else return
1017 * an error. Return the vp of the created or trunc'd file.
1018 *
1019 * IN: dvp - vnode of directory to put new file entry in.
1020 * name - name of new file entry.
1021 * vap - attributes of new file.
1022 * excl - flag indicating exclusive or non-exclusive mode.
1023 * mode - mode to open file with.
1024 * cr - credentials of caller.
1025 * flag - large file flag [UNUSED].
1026 * ct - caller context
1027 * vsecp - ACL to be set
1028 * mnt_ns - Unused on FreeBSD
1029 *
1030 * OUT: vpp - vnode of created or trunc'd entry.
1031 *
1032 * RETURN: 0 on success, error code on failure.
1033 *
1034 * Timestamps:
1035 * dvp - ctime|mtime updated if new entry created
1036 * vp - ctime|mtime always, atime if new
1037 */
1038 int
zfs_create(znode_t * dzp,const char * name,vattr_t * vap,int excl,int mode,znode_t ** zpp,cred_t * cr,int flag,vsecattr_t * vsecp,zidmap_t * mnt_ns)1039 zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
1040 znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1041 {
1042 (void) excl, (void) mode, (void) flag;
1043 znode_t *zp;
1044 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1045 zilog_t *zilog;
1046 objset_t *os;
1047 dmu_tx_t *tx;
1048 int error;
1049 uid_t uid = crgetuid(cr);
1050 gid_t gid = crgetgid(cr);
1051 uint64_t projid = ZFS_DEFAULT_PROJID;
1052 zfs_acl_ids_t acl_ids;
1053 boolean_t fuid_dirtied;
1054 uint64_t txtype;
1055
1056 if (is_nametoolong(zfsvfs, name))
1057 return (SET_ERROR(ENAMETOOLONG));
1058
1059 /*
1060 * If we have an ephemeral id, ACL, or XVATTR then
1061 * make sure file system is at proper version
1062 */
1063 if (zfsvfs->z_use_fuids == B_FALSE &&
1064 (vsecp || (vap->va_mask & AT_XVATTR) ||
1065 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1066 return (SET_ERROR(EINVAL));
1067
1068 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1069 return (error);
1070 os = zfsvfs->z_os;
1071 zilog = zfsvfs->z_log;
1072
1073 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1074 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1075 zfs_exit(zfsvfs, FTAG);
1076 return (SET_ERROR(EILSEQ));
1077 }
1078
1079 if (vap->va_mask & AT_XVATTR) {
1080 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
1081 crgetuid(cr), cr, vap->va_type)) != 0) {
1082 zfs_exit(zfsvfs, FTAG);
1083 return (error);
1084 }
1085 }
1086
1087 *zpp = NULL;
1088
1089 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1090 vap->va_mode &= ~S_ISVTX;
1091
1092 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1093 if (error) {
1094 zfs_exit(zfsvfs, FTAG);
1095 return (error);
1096 }
1097 ASSERT0P(zp);
1098
1099 /*
1100 * Create a new file object and update the directory
1101 * to reference it.
1102 */
1103 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
1104 goto out;
1105 }
1106
1107 /*
1108 * We only support the creation of regular files in
1109 * extended attribute directories.
1110 */
1111
1112 if ((dzp->z_pflags & ZFS_XATTR) &&
1113 (vap->va_type != VREG)) {
1114 error = SET_ERROR(EINVAL);
1115 goto out;
1116 }
1117
1118 if ((error = zfs_acl_ids_create(dzp, 0, vap,
1119 cr, vsecp, &acl_ids, NULL)) != 0)
1120 goto out;
1121
1122 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
1123 projid = zfs_inherit_projid(dzp);
1124 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
1125 zfs_acl_ids_free(&acl_ids);
1126 error = SET_ERROR(EDQUOT);
1127 goto out;
1128 }
1129
1130 getnewvnode_reserve();
1131
1132 tx = dmu_tx_create(os);
1133
1134 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1135 ZFS_SA_BASE_ATTR_SIZE);
1136
1137 fuid_dirtied = zfsvfs->z_fuid_dirty;
1138 if (fuid_dirtied)
1139 zfs_fuid_txhold(zfsvfs, tx);
1140 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1141 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1142 if (!zfsvfs->z_use_sa &&
1143 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1144 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1145 0, acl_ids.z_aclp->z_acl_bytes);
1146 }
1147 error = dmu_tx_assign(tx, DMU_TX_WAIT);
1148 if (error) {
1149 zfs_acl_ids_free(&acl_ids);
1150 dmu_tx_abort(tx);
1151 getnewvnode_drop_reserve();
1152 zfs_exit(zfsvfs, FTAG);
1153 return (error);
1154 }
1155 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1156
1157 error = zfs_link_create(dzp, name, zp, tx, ZNEW);
1158 if (error != 0) {
1159 /*
1160 * Since, we failed to add the directory entry for it,
1161 * delete the newly created dnode.
1162 */
1163 zfs_znode_delete(zp, tx);
1164 VOP_UNLOCK(ZTOV(zp));
1165 zrele(zp);
1166 zfs_acl_ids_free(&acl_ids);
1167 dmu_tx_commit(tx);
1168 getnewvnode_drop_reserve();
1169 goto out;
1170 }
1171
1172 if (fuid_dirtied)
1173 zfs_fuid_sync(zfsvfs, tx);
1174
1175 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1176 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1177 vsecp, acl_ids.z_fuidp, vap);
1178 zfs_acl_ids_free(&acl_ids);
1179 dmu_tx_commit(tx);
1180
1181 getnewvnode_drop_reserve();
1182
1183 out:
1184 VNASSERT(ZTOV(dzp)->v_holdcnt > 0 && ZTOV(dzp)->v_usecount > 0,
1185 ZTOV(dzp), ("%s: wrong ref counts", __func__));
1186 if (error == 0) {
1187 *zpp = zp;
1188 }
1189
1190 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1191 error = zil_commit(zilog, 0);
1192
1193 zfs_exit(zfsvfs, FTAG);
1194 return (error);
1195 }
1196
1197 /*
1198 * Remove an entry from a directory.
1199 *
1200 * IN: dvp - vnode of directory to remove entry from.
1201 * name - name of entry to remove.
1202 * cr - credentials of caller.
1203 * ct - caller context
1204 * flags - case flags
1205 *
1206 * RETURN: 0 on success, error code on failure.
1207 *
1208 * Timestamps:
1209 * dvp - ctime|mtime
1210 * vp - ctime (if nlink > 0)
1211 */
1212 static int
zfs_remove_(vnode_t * dvp,vnode_t * vp,const char * name,cred_t * cr)1213 zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1214 {
1215 znode_t *dzp = VTOZ(dvp);
1216 znode_t *zp;
1217 znode_t *xzp;
1218 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1219 zilog_t *zilog;
1220 uint64_t xattr_obj;
1221 uint64_t obj = 0;
1222 dmu_tx_t *tx;
1223 boolean_t unlinked;
1224 uint64_t txtype;
1225 int error;
1226
1227
1228 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1229 return (error);
1230 zp = VTOZ(vp);
1231 if ((error = zfs_verify_zp(zp)) != 0) {
1232 zfs_exit(zfsvfs, FTAG);
1233 return (error);
1234 }
1235 zilog = zfsvfs->z_log;
1236
1237 xattr_obj = 0;
1238 xzp = NULL;
1239
1240 if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1241 goto out;
1242 }
1243
1244 /*
1245 * Need to use rmdir for removing directories.
1246 */
1247 if (vp->v_type == VDIR) {
1248 error = SET_ERROR(EPERM);
1249 goto out;
1250 }
1251
1252 vnevent_remove(vp, dvp, name, ct);
1253
1254 obj = zp->z_id;
1255
1256 /* are there any extended attributes? */
1257 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1258 &xattr_obj, sizeof (xattr_obj));
1259 if (error == 0 && xattr_obj) {
1260 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1261 ASSERT0(error);
1262 }
1263
1264 /*
1265 * We may delete the znode now, or we may put it in the unlinked set;
1266 * it depends on whether we're the last link, and on whether there are
1267 * other holds on the vnode. So we dmu_tx_hold() the right things to
1268 * allow for either case.
1269 */
1270 tx = dmu_tx_create(zfsvfs->z_os);
1271 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1272 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1273 zfs_sa_upgrade_txholds(tx, zp);
1274 zfs_sa_upgrade_txholds(tx, dzp);
1275
1276 if (xzp) {
1277 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1278 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1279 }
1280
1281 /* charge as an update -- would be nice not to charge at all */
1282 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1283
1284 /*
1285 * Mark this transaction as typically resulting in a net free of space
1286 */
1287 dmu_tx_mark_netfree(tx);
1288
1289 error = dmu_tx_assign(tx, DMU_TX_WAIT);
1290 if (error) {
1291 dmu_tx_abort(tx);
1292 zfs_exit(zfsvfs, FTAG);
1293 return (error);
1294 }
1295
1296 /*
1297 * Remove the directory entry.
1298 */
1299 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1300
1301 if (error) {
1302 dmu_tx_commit(tx);
1303 goto out;
1304 }
1305
1306 if (unlinked) {
1307 zfs_unlinked_add(zp, tx);
1308 vp->v_vflag |= VV_NOSYNC;
1309 }
1310 /* XXX check changes to linux vnops */
1311 txtype = TX_REMOVE;
1312 zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
1313
1314 dmu_tx_commit(tx);
1315 out:
1316
1317 if (xzp)
1318 vrele(ZTOV(xzp));
1319
1320 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1321 error = zil_commit(zilog, 0);
1322
1323 zfs_exit(zfsvfs, FTAG);
1324 return (error);
1325 }
1326
1327
1328 static int
zfs_lookup_internal(znode_t * dzp,const char * name,vnode_t ** vpp,struct componentname * cnp,int nameiop)1329 zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
1330 struct componentname *cnp, int nameiop)
1331 {
1332 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1333 int error;
1334
1335 cnp->cn_nameptr = __DECONST(char *, name);
1336 cnp->cn_namelen = strlen(name);
1337 cnp->cn_nameiop = nameiop;
1338 cnp->cn_flags = ISLASTCN;
1339 #if __FreeBSD_version < 1400068
1340 cnp->cn_flags |= SAVENAME;
1341 #endif
1342 cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
1343 cnp->cn_cred = kcred;
1344 #if __FreeBSD_version < 1400037
1345 cnp->cn_thread = curthread;
1346 #endif
1347
1348 if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
1349 struct vop_lookup_args a;
1350
1351 a.a_gen.a_desc = &vop_lookup_desc;
1352 a.a_dvp = ZTOV(dzp);
1353 a.a_vpp = vpp;
1354 a.a_cnp = cnp;
1355 error = vfs_cache_lookup(&a);
1356 } else {
1357 error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0,
1358 B_FALSE);
1359 }
1360 #ifdef ZFS_DEBUG
1361 if (error) {
1362 printf("got error %d on name %s on op %d\n", error, name,
1363 nameiop);
1364 kdb_backtrace();
1365 }
1366 #endif
1367 return (error);
1368 }
1369
1370 int
zfs_remove(znode_t * dzp,const char * name,cred_t * cr,int flags)1371 zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
1372 {
1373 vnode_t *vp;
1374 int error;
1375 struct componentname cn;
1376
1377 if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1378 return (error);
1379
1380 error = zfs_remove_(ZTOV(dzp), vp, name, cr);
1381 vput(vp);
1382 return (error);
1383 }
1384 /*
1385 * Create a new directory and insert it into dvp using the name
1386 * provided. Return a pointer to the inserted directory.
1387 *
1388 * IN: dvp - vnode of directory to add subdir to.
1389 * dirname - name of new directory.
1390 * vap - attributes of new directory.
1391 * cr - credentials of caller.
1392 * ct - caller context
1393 * flags - case flags
1394 * vsecp - ACL to be set
1395 * mnt_ns - Unused on FreeBSD
1396 *
1397 * OUT: vpp - vnode of created directory.
1398 *
1399 * RETURN: 0 on success, error code on failure.
1400 *
1401 * Timestamps:
1402 * dvp - ctime|mtime updated
1403 * vp - ctime|mtime|atime updated
1404 */
1405 int
zfs_mkdir(znode_t * dzp,const char * dirname,vattr_t * vap,znode_t ** zpp,cred_t * cr,int flags,vsecattr_t * vsecp,zidmap_t * mnt_ns)1406 zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
1407 cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
1408 {
1409 (void) flags, (void) vsecp;
1410 znode_t *zp;
1411 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1412 zilog_t *zilog;
1413 uint64_t txtype;
1414 dmu_tx_t *tx;
1415 int error;
1416 uid_t uid = crgetuid(cr);
1417 gid_t gid = crgetgid(cr);
1418 zfs_acl_ids_t acl_ids;
1419 boolean_t fuid_dirtied;
1420
1421 ASSERT3U(vap->va_type, ==, VDIR);
1422
1423 if (is_nametoolong(zfsvfs, dirname))
1424 return (SET_ERROR(ENAMETOOLONG));
1425
1426 /*
1427 * If we have an ephemeral id, ACL, or XVATTR then
1428 * make sure file system is at proper version
1429 */
1430 if (zfsvfs->z_use_fuids == B_FALSE &&
1431 ((vap->va_mask & AT_XVATTR) ||
1432 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1433 return (SET_ERROR(EINVAL));
1434
1435 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1436 return (error);
1437 zilog = zfsvfs->z_log;
1438
1439 if (dzp->z_pflags & ZFS_XATTR) {
1440 zfs_exit(zfsvfs, FTAG);
1441 return (SET_ERROR(EINVAL));
1442 }
1443
1444 if (zfsvfs->z_utf8 && u8_validate(dirname,
1445 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1446 zfs_exit(zfsvfs, FTAG);
1447 return (SET_ERROR(EILSEQ));
1448 }
1449
1450 if (vap->va_mask & AT_XVATTR) {
1451 if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
1452 crgetuid(cr), cr, vap->va_type)) != 0) {
1453 zfs_exit(zfsvfs, FTAG);
1454 return (error);
1455 }
1456 }
1457
1458 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1459 NULL, &acl_ids, NULL)) != 0) {
1460 zfs_exit(zfsvfs, FTAG);
1461 return (error);
1462 }
1463
1464 /*
1465 * First make sure the new directory doesn't exist.
1466 *
1467 * Existence is checked first to make sure we don't return
1468 * EACCES instead of EEXIST which can cause some applications
1469 * to fail.
1470 */
1471 *zpp = NULL;
1472
1473 if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
1474 zfs_acl_ids_free(&acl_ids);
1475 zfs_exit(zfsvfs, FTAG);
1476 return (error);
1477 }
1478 ASSERT0P(zp);
1479
1480 if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
1481 mnt_ns))) {
1482 zfs_acl_ids_free(&acl_ids);
1483 zfs_exit(zfsvfs, FTAG);
1484 return (error);
1485 }
1486
1487 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
1488 zfs_acl_ids_free(&acl_ids);
1489 zfs_exit(zfsvfs, FTAG);
1490 return (SET_ERROR(EDQUOT));
1491 }
1492
1493 /*
1494 * Add a new entry to the directory.
1495 */
1496 getnewvnode_reserve();
1497 tx = dmu_tx_create(zfsvfs->z_os);
1498 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1499 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1500 fuid_dirtied = zfsvfs->z_fuid_dirty;
1501 if (fuid_dirtied)
1502 zfs_fuid_txhold(zfsvfs, tx);
1503 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1504 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1505 acl_ids.z_aclp->z_acl_bytes);
1506 }
1507
1508 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1509 ZFS_SA_BASE_ATTR_SIZE);
1510
1511 error = dmu_tx_assign(tx, DMU_TX_WAIT);
1512 if (error) {
1513 zfs_acl_ids_free(&acl_ids);
1514 dmu_tx_abort(tx);
1515 getnewvnode_drop_reserve();
1516 zfs_exit(zfsvfs, FTAG);
1517 return (error);
1518 }
1519
1520 /*
1521 * Create new node.
1522 */
1523 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1524
1525 /*
1526 * Now put new name in parent dir.
1527 */
1528 error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
1529 if (error != 0) {
1530 zfs_znode_delete(zp, tx);
1531 VOP_UNLOCK(ZTOV(zp));
1532 zrele(zp);
1533 goto out;
1534 }
1535
1536 if (fuid_dirtied)
1537 zfs_fuid_sync(zfsvfs, tx);
1538
1539 *zpp = zp;
1540
1541 txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
1542 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
1543 acl_ids.z_fuidp, vap);
1544
1545 out:
1546 zfs_acl_ids_free(&acl_ids);
1547
1548 dmu_tx_commit(tx);
1549
1550 getnewvnode_drop_reserve();
1551
1552 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1553 error = zil_commit(zilog, 0);
1554
1555 zfs_exit(zfsvfs, FTAG);
1556 return (error);
1557 }
1558
1559 /*
1560 * Remove a directory subdir entry. If the current working
1561 * directory is the same as the subdir to be removed, the
1562 * remove will fail.
1563 *
1564 * IN: dvp - vnode of directory to remove from.
1565 * name - name of directory to be removed.
1566 * cwd - vnode of current working directory.
1567 * cr - credentials of caller.
1568 * ct - caller context
1569 * flags - case flags
1570 *
1571 * RETURN: 0 on success, error code on failure.
1572 *
1573 * Timestamps:
1574 * dvp - ctime|mtime updated
1575 */
1576 static int
zfs_rmdir_(vnode_t * dvp,vnode_t * vp,const char * name,cred_t * cr)1577 zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
1578 {
1579 znode_t *dzp = VTOZ(dvp);
1580 znode_t *zp = VTOZ(vp);
1581 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1582 zilog_t *zilog;
1583 dmu_tx_t *tx;
1584 int error;
1585
1586 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
1587 return (error);
1588 if ((error = zfs_verify_zp(zp)) != 0) {
1589 zfs_exit(zfsvfs, FTAG);
1590 return (error);
1591 }
1592 zilog = zfsvfs->z_log;
1593
1594
1595 if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
1596 goto out;
1597 }
1598
1599 if (vp->v_type != VDIR) {
1600 error = SET_ERROR(ENOTDIR);
1601 goto out;
1602 }
1603
1604 vnevent_rmdir(vp, dvp, name, ct);
1605
1606 tx = dmu_tx_create(zfsvfs->z_os);
1607 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1608 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1609 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1610 zfs_sa_upgrade_txholds(tx, zp);
1611 zfs_sa_upgrade_txholds(tx, dzp);
1612 dmu_tx_mark_netfree(tx);
1613 error = dmu_tx_assign(tx, DMU_TX_WAIT);
1614 if (error) {
1615 dmu_tx_abort(tx);
1616 zfs_exit(zfsvfs, FTAG);
1617 return (error);
1618 }
1619
1620 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
1621
1622 if (error == 0) {
1623 uint64_t txtype = TX_RMDIR;
1624 zfs_log_remove(zilog, tx, txtype, dzp, name,
1625 ZFS_NO_OBJECT, B_FALSE);
1626 }
1627
1628 dmu_tx_commit(tx);
1629
1630 if (zfsvfs->z_use_namecache)
1631 cache_vop_rmdir(dvp, vp);
1632 out:
1633 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1634 error = zil_commit(zilog, 0);
1635
1636 zfs_exit(zfsvfs, FTAG);
1637 return (error);
1638 }
1639
1640 int
zfs_rmdir(znode_t * dzp,const char * name,znode_t * cwd,cred_t * cr,int flags)1641 zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
1642 {
1643 struct componentname cn;
1644 vnode_t *vp;
1645 int error;
1646
1647 if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
1648 return (error);
1649
1650 error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
1651 vput(vp);
1652 return (error);
1653 }
1654
1655 /*
1656 * Read as many directory entries as will fit into the provided
1657 * buffer from the given directory cursor position (specified in
1658 * the uio structure).
1659 *
1660 * IN: vp - vnode of directory to read.
1661 * uio - structure supplying read location, range info,
1662 * and return buffer.
1663 * cr - credentials of caller.
1664 * ct - caller context
1665 *
1666 * OUT: uio - updated offset and range, buffer filled.
1667 * eofp - set to true if end-of-file detected.
1668 * ncookies- number of entries in cookies
1669 * cookies - offsets to directory entries
1670 *
1671 * RETURN: 0 on success, error code on failure.
1672 *
1673 * Timestamps:
1674 * vp - atime updated
1675 *
1676 * Note that the low 4 bits of the cookie returned by zap is always zero.
1677 * This allows us to use the low range for "special" directory entries:
1678 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
1679 * we use the offset 2 for the '.zfs' directory.
1680 */
1681 static int
zfs_readdir(vnode_t * vp,zfs_uio_t * uio,cred_t * cr,int * eofp,int * ncookies,cookie_t ** cookies)1682 zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
1683 int *ncookies, cookie_t **cookies)
1684 {
1685 znode_t *zp = VTOZ(vp);
1686 iovec_t *iovp;
1687 dirent64_t *odp;
1688 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1689 objset_t *os;
1690 caddr_t outbuf;
1691 size_t bufsize;
1692 zap_cursor_t zc;
1693 zap_attribute_t *zap;
1694 uint_t bytes_wanted;
1695 uint64_t offset; /* must be unsigned; checks for < 1 */
1696 uint64_t parent;
1697 int local_eof;
1698 int outcount;
1699 int error;
1700 uint8_t prefetch;
1701 uint8_t type;
1702 int ncooks;
1703 cookie_t *cooks = NULL;
1704
1705 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1706 return (error);
1707
1708 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1709 &parent, sizeof (parent))) != 0) {
1710 zfs_exit(zfsvfs, FTAG);
1711 return (error);
1712 }
1713
1714 /*
1715 * If we are not given an eof variable,
1716 * use a local one.
1717 */
1718 if (eofp == NULL)
1719 eofp = &local_eof;
1720
1721 /*
1722 * Check for valid iov_len.
1723 */
1724 if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
1725 zfs_exit(zfsvfs, FTAG);
1726 return (SET_ERROR(EINVAL));
1727 }
1728
1729 /*
1730 * Quit if directory has been removed (posix)
1731 */
1732 if ((*eofp = (zp->z_unlinked != 0)) != 0) {
1733 zfs_exit(zfsvfs, FTAG);
1734 return (0);
1735 }
1736
1737 error = 0;
1738 os = zfsvfs->z_os;
1739 offset = zfs_uio_offset(uio);
1740 prefetch = zp->z_zn_prefetch;
1741 zap = zap_attribute_long_alloc();
1742
1743 /*
1744 * Initialize the iterator cursor.
1745 */
1746 if (offset <= 3) {
1747 /*
1748 * Start iteration from the beginning of the directory.
1749 */
1750 zap_cursor_init(&zc, os, zp->z_id);
1751 } else {
1752 /*
1753 * The offset is a serialized cursor.
1754 */
1755 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1756 }
1757
1758 /*
1759 * Get space to change directory entries into fs independent format.
1760 */
1761 iovp = GET_UIO_STRUCT(uio)->uio_iov;
1762 bytes_wanted = iovp->iov_len;
1763 if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
1764 bufsize = bytes_wanted;
1765 outbuf = kmem_alloc(bufsize, KM_SLEEP);
1766 odp = (struct dirent64 *)outbuf;
1767 } else {
1768 bufsize = bytes_wanted;
1769 outbuf = NULL;
1770 odp = (struct dirent64 *)iovp->iov_base;
1771 }
1772
1773 if (ncookies != NULL) {
1774 /*
1775 * Minimum entry size is dirent size and 1 byte for a file name.
1776 */
1777 ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
1778 sizeof (((struct dirent *)NULL)->d_name) + 1);
1779 cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK);
1780 *cookies = cooks;
1781 *ncookies = ncooks;
1782 }
1783
1784 /*
1785 * Transform to file-system independent format
1786 */
1787 outcount = 0;
1788 while (outcount < bytes_wanted) {
1789 ino64_t objnum;
1790 ushort_t reclen;
1791 off64_t *next = NULL;
1792
1793 /*
1794 * Special case `.', `..', and `.zfs'.
1795 */
1796 if (offset == 0) {
1797 (void) strcpy(zap->za_name, ".");
1798 zap->za_normalization_conflict = 0;
1799 objnum = zp->z_id;
1800 type = DT_DIR;
1801 } else if (offset == 1) {
1802 (void) strcpy(zap->za_name, "..");
1803 zap->za_normalization_conflict = 0;
1804 objnum = parent;
1805 type = DT_DIR;
1806 } else if (offset == 2 && zfs_show_ctldir(zp)) {
1807 (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
1808 zap->za_normalization_conflict = 0;
1809 objnum = ZFSCTL_INO_ROOT;
1810 type = DT_DIR;
1811 } else {
1812 /*
1813 * Grab next entry.
1814 */
1815 if ((error = zap_cursor_retrieve(&zc, zap))) {
1816 if ((*eofp = (error == ENOENT)) != 0)
1817 break;
1818 else
1819 goto update;
1820 }
1821
1822 if (zap->za_integer_length != 8 ||
1823 zap->za_num_integers != 1) {
1824 cmn_err(CE_WARN, "zap_readdir: bad directory "
1825 "entry, obj = %lld, offset = %lld\n",
1826 (u_longlong_t)zp->z_id,
1827 (u_longlong_t)offset);
1828 error = SET_ERROR(ENXIO);
1829 goto update;
1830 }
1831
1832 objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
1833 /*
1834 * MacOS X can extract the object type here such as:
1835 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1836 */
1837 type = ZFS_DIRENT_TYPE(zap->za_first_integer);
1838 }
1839
1840 reclen = DIRENT64_RECLEN(strlen(zap->za_name));
1841
1842 /*
1843 * Will this entry fit in the buffer?
1844 */
1845 if (outcount + reclen > bufsize) {
1846 /*
1847 * Did we manage to fit anything in the buffer?
1848 */
1849 if (!outcount) {
1850 error = SET_ERROR(EINVAL);
1851 goto update;
1852 }
1853 break;
1854 }
1855 /*
1856 * Add normal entry:
1857 */
1858 odp->d_ino = objnum;
1859 odp->d_reclen = reclen;
1860 odp->d_namlen = strlen(zap->za_name);
1861 /* NOTE: d_off is the offset for the *next* entry. */
1862 next = &odp->d_off;
1863 strlcpy(odp->d_name, zap->za_name, odp->d_namlen + 1);
1864 odp->d_type = type;
1865 dirent_terminate(odp);
1866 odp = (dirent64_t *)((intptr_t)odp + reclen);
1867
1868 outcount += reclen;
1869
1870 ASSERT3S(outcount, <=, bufsize);
1871
1872 if (prefetch)
1873 dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
1874
1875 /*
1876 * Move to the next entry, fill in the previous offset.
1877 */
1878 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1879 zap_cursor_advance(&zc);
1880 offset = zap_cursor_serialize(&zc);
1881 } else {
1882 offset += 1;
1883 }
1884
1885 /* Fill the offset right after advancing the cursor. */
1886 if (next != NULL)
1887 *next = offset;
1888 if (cooks != NULL) {
1889 *cooks++ = offset;
1890 ncooks--;
1891 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1892 }
1893 }
1894 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1895
1896 /* Subtract unused cookies */
1897 if (ncookies != NULL)
1898 *ncookies -= ncooks;
1899
1900 if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
1901 iovp->iov_base += outcount;
1902 iovp->iov_len -= outcount;
1903 zfs_uio_resid(uio) -= outcount;
1904 } else if ((error =
1905 zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
1906 /*
1907 * Reset the pointer.
1908 */
1909 offset = zfs_uio_offset(uio);
1910 }
1911
1912 update:
1913 zap_cursor_fini(&zc);
1914 zap_attribute_free(zap);
1915 if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
1916 kmem_free(outbuf, bufsize);
1917
1918 if (error == ENOENT)
1919 error = 0;
1920
1921 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1922
1923 zfs_uio_setoffset(uio, offset);
1924 zfs_exit(zfsvfs, FTAG);
1925 if (error != 0 && cookies != NULL) {
1926 free(*cookies, M_TEMP);
1927 *cookies = NULL;
1928 *ncookies = 0;
1929 }
1930 return (error);
1931 }
1932
1933 /*
1934 * Get the requested file attributes and place them in the provided
1935 * vattr structure.
1936 *
1937 * IN: vp - vnode of file.
1938 * vap - va_mask identifies requested attributes.
1939 * If AT_XVATTR set, then optional attrs are requested
1940 * flags - ATTR_NOACLCHECK (CIFS server context)
1941 * cr - credentials of caller.
1942 *
1943 * OUT: vap - attribute values.
1944 *
1945 * RETURN: 0 (always succeeds).
1946 */
1947 static int
zfs_getattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr)1948 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1949 {
1950 znode_t *zp = VTOZ(vp);
1951 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1952 int error = 0;
1953 uint32_t blksize;
1954 u_longlong_t nblocks;
1955 uint64_t mtime[2], ctime[2], crtime[2], rdev;
1956 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
1957 xoptattr_t *xoap = NULL;
1958 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1959 sa_bulk_attr_t bulk[4];
1960 int count = 0;
1961
1962 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1963 return (error);
1964
1965 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
1966
1967 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1968 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1969 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
1970 if (vp->v_type == VBLK || vp->v_type == VCHR)
1971 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1972 &rdev, 8);
1973
1974 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
1975 zfs_exit(zfsvfs, FTAG);
1976 return (error);
1977 }
1978
1979 /*
1980 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
1981 * Also, if we are the owner don't bother, since owner should
1982 * always be allowed to read basic attributes of file.
1983 */
1984 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
1985 (vap->va_uid != crgetuid(cr))) {
1986 if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
1987 skipaclchk, cr, NULL))) {
1988 zfs_exit(zfsvfs, FTAG);
1989 return (error);
1990 }
1991 }
1992
1993 /*
1994 * Return all attributes. It's cheaper to provide the answer
1995 * than to determine whether we were asked the question.
1996 */
1997
1998 vap->va_type = IFTOVT(zp->z_mode);
1999 vap->va_mode = zp->z_mode & ~S_IFMT;
2000 vn_fsid(vp, vap);
2001 vap->va_nodeid = zp->z_id;
2002 vap->va_nlink = zp->z_links;
2003 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
2004 zp->z_links < ZFS_LINK_MAX)
2005 vap->va_nlink++;
2006 vap->va_size = zp->z_size;
2007 if (vp->v_type == VBLK || vp->v_type == VCHR)
2008 vap->va_rdev = zfs_cmpldev(rdev);
2009 else
2010 vap->va_rdev = NODEV;
2011 vap->va_gen = zp->z_gen;
2012 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
2013 vap->va_filerev = zp->z_seq;
2014
2015 /*
2016 * Add in any requested optional attributes and the create time.
2017 * Also set the corresponding bits in the returned attribute bitmap.
2018 */
2019 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2020 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2021 xoap->xoa_archive =
2022 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2023 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2024 }
2025
2026 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2027 xoap->xoa_readonly =
2028 ((zp->z_pflags & ZFS_READONLY) != 0);
2029 XVA_SET_RTN(xvap, XAT_READONLY);
2030 }
2031
2032 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2033 xoap->xoa_system =
2034 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2035 XVA_SET_RTN(xvap, XAT_SYSTEM);
2036 }
2037
2038 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2039 xoap->xoa_hidden =
2040 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2041 XVA_SET_RTN(xvap, XAT_HIDDEN);
2042 }
2043
2044 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2045 xoap->xoa_nounlink =
2046 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2047 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2048 }
2049
2050 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2051 xoap->xoa_immutable =
2052 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2053 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2054 }
2055
2056 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2057 xoap->xoa_appendonly =
2058 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2059 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2060 }
2061
2062 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2063 xoap->xoa_nodump =
2064 ((zp->z_pflags & ZFS_NODUMP) != 0);
2065 XVA_SET_RTN(xvap, XAT_NODUMP);
2066 }
2067
2068 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2069 xoap->xoa_opaque =
2070 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2071 XVA_SET_RTN(xvap, XAT_OPAQUE);
2072 }
2073
2074 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2075 xoap->xoa_av_quarantined =
2076 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2077 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2078 }
2079
2080 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2081 xoap->xoa_av_modified =
2082 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2083 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2084 }
2085
2086 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2087 vp->v_type == VREG) {
2088 zfs_sa_get_scanstamp(zp, xvap);
2089 }
2090
2091 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2092 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2093 XVA_SET_RTN(xvap, XAT_REPARSE);
2094 }
2095 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2096 xoap->xoa_generation = zp->z_gen;
2097 XVA_SET_RTN(xvap, XAT_GEN);
2098 }
2099
2100 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2101 xoap->xoa_offline =
2102 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2103 XVA_SET_RTN(xvap, XAT_OFFLINE);
2104 }
2105
2106 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2107 xoap->xoa_sparse =
2108 ((zp->z_pflags & ZFS_SPARSE) != 0);
2109 XVA_SET_RTN(xvap, XAT_SPARSE);
2110 }
2111
2112 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2113 xoap->xoa_projinherit =
2114 ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
2115 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
2116 }
2117
2118 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2119 xoap->xoa_projid = zp->z_projid;
2120 XVA_SET_RTN(xvap, XAT_PROJID);
2121 }
2122 }
2123
2124 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2125 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2126 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2127 ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2128
2129
2130 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2131 vap->va_blksize = blksize;
2132 vap->va_bytes = nblocks << 9; /* nblocks * 512 */
2133
2134 if (zp->z_blksz == 0) {
2135 /*
2136 * Block size hasn't been set; suggest maximal I/O transfers.
2137 */
2138 vap->va_blksize = zfsvfs->z_max_blksz;
2139 }
2140
2141 zfs_exit(zfsvfs, FTAG);
2142 return (0);
2143 }
2144
2145 /*
2146 * For the operation of changing file's user/group/project, we need to
2147 * handle not only the main object that is assigned to the file directly,
2148 * but also the ones that are used by the file via hidden xattr directory.
2149 *
2150 * Because the xattr directory may contains many EA entries, as to it may
2151 * be impossible to change all of them via the transaction of changing the
2152 * main object's user/group/project attributes. Then we have to change them
2153 * via other multiple independent transactions one by one. It may be not good
2154 * solution, but we have no better idea yet.
2155 */
2156 static int
zfs_setattr_dir(znode_t * dzp)2157 zfs_setattr_dir(znode_t *dzp)
2158 {
2159 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2160 objset_t *os = zfsvfs->z_os;
2161 zap_cursor_t zc;
2162 zap_attribute_t *zap;
2163 znode_t *zp = NULL;
2164 dmu_tx_t *tx = NULL;
2165 uint64_t uid, gid;
2166 sa_bulk_attr_t bulk[4];
2167 int count;
2168 int err;
2169
2170 zap = zap_attribute_alloc();
2171 zap_cursor_init(&zc, os, dzp->z_id);
2172 while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
2173 count = 0;
2174 if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
2175 err = ENXIO;
2176 break;
2177 }
2178
2179 err = zfs_dirent_lookup(dzp, zap->za_name, &zp, ZEXISTS);
2180 if (err == ENOENT)
2181 goto next;
2182 if (err)
2183 break;
2184
2185 if (zp->z_uid == dzp->z_uid &&
2186 zp->z_gid == dzp->z_gid &&
2187 zp->z_projid == dzp->z_projid)
2188 goto next;
2189
2190 tx = dmu_tx_create(os);
2191 if (!(zp->z_pflags & ZFS_PROJID))
2192 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2193 else
2194 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2195
2196 err = dmu_tx_assign(tx, DMU_TX_WAIT);
2197 if (err)
2198 break;
2199
2200 vn_seqc_write_begin(ZTOV(zp));
2201 mutex_enter(&dzp->z_lock);
2202
2203 if (zp->z_uid != dzp->z_uid) {
2204 uid = dzp->z_uid;
2205 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2206 &uid, sizeof (uid));
2207 zp->z_uid = uid;
2208 }
2209
2210 if (zp->z_gid != dzp->z_gid) {
2211 gid = dzp->z_gid;
2212 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2213 &gid, sizeof (gid));
2214 zp->z_gid = gid;
2215 }
2216
2217 uint64_t projid = dzp->z_projid;
2218 if (zp->z_projid != projid) {
2219 if (!(zp->z_pflags & ZFS_PROJID)) {
2220 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2221 if (unlikely(err == EEXIST)) {
2222 err = 0;
2223 } else if (err != 0) {
2224 goto sa_add_projid_err;
2225 } else {
2226 projid = ZFS_INVALID_PROJID;
2227 }
2228 }
2229
2230 if (projid != ZFS_INVALID_PROJID) {
2231 zp->z_projid = projid;
2232 SA_ADD_BULK_ATTR(bulk, count,
2233 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2234 sizeof (zp->z_projid));
2235 }
2236 }
2237
2238 sa_add_projid_err:
2239 mutex_exit(&dzp->z_lock);
2240
2241 if (likely(count > 0)) {
2242 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2243 dmu_tx_commit(tx);
2244 } else if (projid == ZFS_INVALID_PROJID) {
2245 dmu_tx_commit(tx);
2246 } else {
2247 dmu_tx_abort(tx);
2248 }
2249 tx = NULL;
2250 vn_seqc_write_end(ZTOV(zp));
2251 if (err != 0 && err != ENOENT)
2252 break;
2253
2254 next:
2255 if (zp) {
2256 zrele(zp);
2257 zp = NULL;
2258 }
2259 zap_cursor_advance(&zc);
2260 }
2261
2262 if (tx)
2263 dmu_tx_abort(tx);
2264 if (zp) {
2265 zrele(zp);
2266 }
2267 zap_cursor_fini(&zc);
2268 zap_attribute_free(zap);
2269
2270 return (err == ENOENT ? 0 : err);
2271 }
2272
2273 /*
2274 * Set the file attributes to the values contained in the
2275 * vattr structure.
2276 *
2277 * IN: zp - znode of file to be modified.
2278 * vap - new attribute values.
2279 * If AT_XVATTR set, then optional attrs are being set
2280 * flags - ATTR_UTIME set if non-default time values provided.
2281 * - ATTR_NOACLCHECK (CIFS context only).
2282 * cr - credentials of caller.
2283 * mnt_ns - Unused on FreeBSD
2284 *
2285 * RETURN: 0 on success, error code on failure.
2286 *
2287 * Timestamps:
2288 * vp - ctime updated, mtime updated if size changed.
2289 */
2290 int
zfs_setattr(znode_t * zp,vattr_t * vap,int flags,cred_t * cr,zidmap_t * mnt_ns)2291 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
2292 {
2293 vnode_t *vp = ZTOV(zp);
2294 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2295 objset_t *os;
2296 zilog_t *zilog;
2297 dmu_tx_t *tx;
2298 vattr_t oldva;
2299 xvattr_t tmpxvattr;
2300 uint_t mask = vap->va_mask;
2301 uint_t saved_mask = 0;
2302 uint64_t saved_mode;
2303 int trim_mask = 0;
2304 uint64_t new_mode;
2305 uint64_t new_uid, new_gid;
2306 uint64_t xattr_obj;
2307 uint64_t mtime[2], ctime[2];
2308 uint64_t projid = ZFS_INVALID_PROJID;
2309 znode_t *attrzp;
2310 int need_policy = FALSE;
2311 int err, err2;
2312 zfs_fuid_info_t *fuidp = NULL;
2313 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2314 xoptattr_t *xoap;
2315 zfs_acl_t *aclp;
2316 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2317 boolean_t fuid_dirtied = B_FALSE;
2318 boolean_t handle_eadir = B_FALSE;
2319 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2320 int count = 0, xattr_count = 0;
2321
2322 if (mask == 0)
2323 return (0);
2324
2325 if (mask & AT_NOSET)
2326 return (SET_ERROR(EINVAL));
2327
2328 if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
2329 return (err);
2330
2331 os = zfsvfs->z_os;
2332 zilog = zfsvfs->z_log;
2333
2334 /*
2335 * Make sure that if we have ephemeral uid/gid or xvattr specified
2336 * that file system is at proper version level
2337 */
2338
2339 if (zfsvfs->z_use_fuids == B_FALSE &&
2340 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2341 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2342 (mask & AT_XVATTR))) {
2343 zfs_exit(zfsvfs, FTAG);
2344 return (SET_ERROR(EINVAL));
2345 }
2346
2347 if (mask & AT_SIZE && vp->v_type == VDIR) {
2348 zfs_exit(zfsvfs, FTAG);
2349 return (SET_ERROR(EISDIR));
2350 }
2351
2352 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2353 zfs_exit(zfsvfs, FTAG);
2354 return (SET_ERROR(EINVAL));
2355 }
2356
2357 /*
2358 * If this is an xvattr_t, then get a pointer to the structure of
2359 * optional attributes. If this is NULL, then we have a vattr_t.
2360 */
2361 xoap = xva_getxoptattr(xvap);
2362
2363 xva_init(&tmpxvattr);
2364
2365 /*
2366 * Immutable files can only alter immutable bit and atime
2367 */
2368 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2369 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2370 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2371 zfs_exit(zfsvfs, FTAG);
2372 return (SET_ERROR(EPERM));
2373 }
2374
2375 /*
2376 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2377 */
2378
2379 /*
2380 * Verify timestamps doesn't overflow 32 bits.
2381 * ZFS can handle large timestamps, but 32bit syscalls can't
2382 * handle times greater than 2039. This check should be removed
2383 * once large timestamps are fully supported.
2384 */
2385 if (mask & (AT_ATIME | AT_MTIME)) {
2386 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2387 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2388 zfs_exit(zfsvfs, FTAG);
2389 return (SET_ERROR(EOVERFLOW));
2390 }
2391 }
2392 if (xoap != NULL && (mask & AT_XVATTR)) {
2393 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2394 TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2395 zfs_exit(zfsvfs, FTAG);
2396 return (SET_ERROR(EOVERFLOW));
2397 }
2398
2399 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
2400 if (!dmu_objset_projectquota_enabled(os) ||
2401 (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
2402 zfs_exit(zfsvfs, FTAG);
2403 return (SET_ERROR(EOPNOTSUPP));
2404 }
2405
2406 projid = xoap->xoa_projid;
2407 if (unlikely(projid == ZFS_INVALID_PROJID)) {
2408 zfs_exit(zfsvfs, FTAG);
2409 return (SET_ERROR(EINVAL));
2410 }
2411
2412 if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
2413 projid = ZFS_INVALID_PROJID;
2414 else
2415 need_policy = TRUE;
2416 }
2417
2418 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
2419 (xoap->xoa_projinherit !=
2420 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
2421 (!dmu_objset_projectquota_enabled(os) ||
2422 (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
2423 zfs_exit(zfsvfs, FTAG);
2424 return (SET_ERROR(EOPNOTSUPP));
2425 }
2426 }
2427
2428 attrzp = NULL;
2429 aclp = NULL;
2430
2431 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2432 zfs_exit(zfsvfs, FTAG);
2433 return (SET_ERROR(EROFS));
2434 }
2435
2436 /*
2437 * First validate permissions
2438 */
2439
2440 if (mask & AT_SIZE) {
2441 /*
2442 * XXX - Note, we are not providing any open
2443 * mode flags here (like FNDELAY), so we may
2444 * block if there are locks present... this
2445 * should be addressed in openat().
2446 */
2447 /* XXX - would it be OK to generate a log record here? */
2448 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2449 if (err) {
2450 zfs_exit(zfsvfs, FTAG);
2451 return (err);
2452 }
2453 }
2454
2455 if (mask & (AT_ATIME|AT_MTIME) ||
2456 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2457 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2458 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2459 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2460 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2461 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2462 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2463 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2464 skipaclchk, cr, mnt_ns);
2465 }
2466
2467 if (mask & (AT_UID|AT_GID)) {
2468 int idmask = (mask & (AT_UID|AT_GID));
2469 int take_owner;
2470 int take_group;
2471
2472 /*
2473 * NOTE: even if a new mode is being set,
2474 * we may clear S_ISUID/S_ISGID bits.
2475 */
2476
2477 if (!(mask & AT_MODE))
2478 vap->va_mode = zp->z_mode;
2479
2480 /*
2481 * Take ownership or chgrp to group we are a member of
2482 */
2483
2484 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2485 take_group = (mask & AT_GID) &&
2486 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2487
2488 /*
2489 * If both AT_UID and AT_GID are set then take_owner and
2490 * take_group must both be set in order to allow taking
2491 * ownership.
2492 *
2493 * Otherwise, send the check through secpolicy_vnode_setattr()
2494 *
2495 */
2496
2497 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2498 ((idmask == AT_UID) && take_owner) ||
2499 ((idmask == AT_GID) && take_group)) {
2500 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2501 skipaclchk, cr, mnt_ns) == 0) {
2502 /*
2503 * Remove setuid/setgid for non-privileged users
2504 */
2505 secpolicy_setid_clear(vap, vp, cr);
2506 trim_mask = (mask & (AT_UID|AT_GID));
2507 } else {
2508 need_policy = TRUE;
2509 }
2510 } else {
2511 need_policy = TRUE;
2512 }
2513 }
2514
2515 oldva.va_mode = zp->z_mode;
2516 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2517 if (mask & AT_XVATTR) {
2518 /*
2519 * Update xvattr mask to include only those attributes
2520 * that are actually changing.
2521 *
2522 * the bits will be restored prior to actually setting
2523 * the attributes so the caller thinks they were set.
2524 */
2525 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2526 if (xoap->xoa_appendonly !=
2527 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2528 need_policy = TRUE;
2529 } else {
2530 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2531 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2532 }
2533 }
2534
2535 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
2536 if (xoap->xoa_projinherit !=
2537 ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
2538 need_policy = TRUE;
2539 } else {
2540 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
2541 XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
2542 }
2543 }
2544
2545 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2546 if (xoap->xoa_nounlink !=
2547 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2548 need_policy = TRUE;
2549 } else {
2550 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2551 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2552 }
2553 }
2554
2555 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2556 if (xoap->xoa_immutable !=
2557 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2558 need_policy = TRUE;
2559 } else {
2560 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2561 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2562 }
2563 }
2564
2565 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2566 if (xoap->xoa_nodump !=
2567 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2568 need_policy = TRUE;
2569 } else {
2570 XVA_CLR_REQ(xvap, XAT_NODUMP);
2571 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2572 }
2573 }
2574
2575 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2576 if (xoap->xoa_av_modified !=
2577 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2578 need_policy = TRUE;
2579 } else {
2580 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2581 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2582 }
2583 }
2584
2585 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2586 if ((vp->v_type != VREG &&
2587 xoap->xoa_av_quarantined) ||
2588 xoap->xoa_av_quarantined !=
2589 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2590 need_policy = TRUE;
2591 } else {
2592 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2593 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2594 }
2595 }
2596
2597 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2598 zfs_exit(zfsvfs, FTAG);
2599 return (SET_ERROR(EPERM));
2600 }
2601
2602 if (need_policy == FALSE &&
2603 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2604 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2605 need_policy = TRUE;
2606 }
2607 }
2608
2609 if (mask & AT_MODE) {
2610 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
2611 mnt_ns) == 0) {
2612 err = secpolicy_setid_setsticky_clear(vp, vap,
2613 &oldva, cr);
2614 if (err) {
2615 zfs_exit(zfsvfs, FTAG);
2616 return (err);
2617 }
2618 trim_mask |= AT_MODE;
2619 } else {
2620 need_policy = TRUE;
2621 }
2622 }
2623
2624 if (need_policy) {
2625 /*
2626 * If trim_mask is set then take ownership
2627 * has been granted or write_acl is present and user
2628 * has the ability to modify mode. In that case remove
2629 * UID|GID and or MODE from mask so that
2630 * secpolicy_vnode_setattr() doesn't revoke it.
2631 */
2632
2633 if (trim_mask) {
2634 saved_mask = vap->va_mask;
2635 vap->va_mask &= ~trim_mask;
2636 if (trim_mask & AT_MODE) {
2637 /*
2638 * Save the mode, as secpolicy_vnode_setattr()
2639 * will overwrite it with ova.va_mode.
2640 */
2641 saved_mode = vap->va_mode;
2642 }
2643 }
2644 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2645 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2646 if (err) {
2647 zfs_exit(zfsvfs, FTAG);
2648 return (err);
2649 }
2650
2651 if (trim_mask) {
2652 vap->va_mask |= saved_mask;
2653 if (trim_mask & AT_MODE) {
2654 /*
2655 * Recover the mode after
2656 * secpolicy_vnode_setattr().
2657 */
2658 vap->va_mode = saved_mode;
2659 }
2660 }
2661 }
2662
2663 /*
2664 * secpolicy_vnode_setattr, or take ownership may have
2665 * changed va_mask
2666 */
2667 mask = vap->va_mask;
2668
2669 if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
2670 handle_eadir = B_TRUE;
2671 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2672 &xattr_obj, sizeof (xattr_obj));
2673
2674 if (err == 0 && xattr_obj) {
2675 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2676 if (err == 0) {
2677 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
2678 if (err != 0)
2679 vrele(ZTOV(attrzp));
2680 }
2681 if (err)
2682 goto out2;
2683 }
2684 if (mask & AT_UID) {
2685 new_uid = zfs_fuid_create(zfsvfs,
2686 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2687 if (new_uid != zp->z_uid &&
2688 zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
2689 new_uid)) {
2690 if (attrzp)
2691 vput(ZTOV(attrzp));
2692 err = SET_ERROR(EDQUOT);
2693 goto out2;
2694 }
2695 }
2696
2697 if (mask & AT_GID) {
2698 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2699 cr, ZFS_GROUP, &fuidp);
2700 if (new_gid != zp->z_gid &&
2701 zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
2702 new_gid)) {
2703 if (attrzp)
2704 vput(ZTOV(attrzp));
2705 err = SET_ERROR(EDQUOT);
2706 goto out2;
2707 }
2708 }
2709
2710 if (projid != ZFS_INVALID_PROJID &&
2711 zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
2712 if (attrzp)
2713 vput(ZTOV(attrzp));
2714 err = SET_ERROR(EDQUOT);
2715 goto out2;
2716 }
2717 }
2718 tx = dmu_tx_create(os);
2719
2720 if (mask & AT_MODE) {
2721 uint64_t pmode = zp->z_mode;
2722 uint64_t acl_obj;
2723 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2724
2725 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
2726 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2727 err = SET_ERROR(EPERM);
2728 goto out;
2729 }
2730
2731 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
2732 goto out;
2733
2734 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2735 /*
2736 * Are we upgrading ACL from old V0 format
2737 * to V1 format?
2738 */
2739 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2740 zfs_znode_acl_version(zp) ==
2741 ZFS_ACL_VERSION_INITIAL) {
2742 dmu_tx_hold_free(tx, acl_obj, 0,
2743 DMU_OBJECT_END);
2744 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2745 0, aclp->z_acl_bytes);
2746 } else {
2747 dmu_tx_hold_write(tx, acl_obj, 0,
2748 aclp->z_acl_bytes);
2749 }
2750 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2751 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2752 0, aclp->z_acl_bytes);
2753 }
2754 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2755 } else {
2756 if (((mask & AT_XVATTR) &&
2757 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2758 (projid != ZFS_INVALID_PROJID &&
2759 !(zp->z_pflags & ZFS_PROJID)))
2760 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2761 else
2762 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2763 }
2764
2765 if (attrzp) {
2766 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
2767 }
2768
2769 fuid_dirtied = zfsvfs->z_fuid_dirty;
2770 if (fuid_dirtied)
2771 zfs_fuid_txhold(zfsvfs, tx);
2772
2773 zfs_sa_upgrade_txholds(tx, zp);
2774
2775 err = dmu_tx_assign(tx, DMU_TX_WAIT);
2776 if (err)
2777 goto out;
2778
2779 count = 0;
2780 /*
2781 * Set each attribute requested.
2782 * We group settings according to the locks they need to acquire.
2783 *
2784 * Note: you cannot set ctime directly, although it will be
2785 * updated as a side-effect of calling this function.
2786 */
2787
2788 if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
2789 /*
2790 * For the existed object that is upgraded from old system,
2791 * its on-disk layout has no slot for the project ID attribute.
2792 * But quota accounting logic needs to access related slots by
2793 * offset directly. So we need to adjust old objects' layout
2794 * to make the project ID to some unified and fixed offset.
2795 */
2796 if (attrzp)
2797 err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
2798 if (err == 0)
2799 err = sa_add_projid(zp->z_sa_hdl, tx, projid);
2800
2801 if (unlikely(err == EEXIST))
2802 err = 0;
2803 else if (err != 0)
2804 goto out;
2805 else
2806 projid = ZFS_INVALID_PROJID;
2807 }
2808
2809 if (mask & (AT_UID|AT_GID|AT_MODE))
2810 mutex_enter(&zp->z_acl_lock);
2811
2812 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
2813 &zp->z_pflags, sizeof (zp->z_pflags));
2814
2815 if (attrzp) {
2816 if (mask & (AT_UID|AT_GID|AT_MODE))
2817 mutex_enter(&attrzp->z_acl_lock);
2818 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2819 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
2820 sizeof (attrzp->z_pflags));
2821 if (projid != ZFS_INVALID_PROJID) {
2822 attrzp->z_projid = projid;
2823 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2824 SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
2825 sizeof (attrzp->z_projid));
2826 }
2827 }
2828
2829 if (mask & (AT_UID|AT_GID)) {
2830
2831 if (mask & AT_UID) {
2832 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2833 &new_uid, sizeof (new_uid));
2834 zp->z_uid = new_uid;
2835 if (attrzp) {
2836 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2837 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
2838 sizeof (new_uid));
2839 attrzp->z_uid = new_uid;
2840 }
2841 }
2842
2843 if (mask & AT_GID) {
2844 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
2845 NULL, &new_gid, sizeof (new_gid));
2846 zp->z_gid = new_gid;
2847 if (attrzp) {
2848 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2849 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
2850 sizeof (new_gid));
2851 attrzp->z_gid = new_gid;
2852 }
2853 }
2854 if (!(mask & AT_MODE)) {
2855 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
2856 NULL, &new_mode, sizeof (new_mode));
2857 new_mode = zp->z_mode;
2858 }
2859 err = zfs_acl_chown_setattr(zp);
2860 ASSERT0(err);
2861 if (attrzp) {
2862 vn_seqc_write_begin(ZTOV(attrzp));
2863 err = zfs_acl_chown_setattr(attrzp);
2864 vn_seqc_write_end(ZTOV(attrzp));
2865 ASSERT0(err);
2866 }
2867 }
2868
2869 if (mask & AT_MODE) {
2870 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
2871 &new_mode, sizeof (new_mode));
2872 zp->z_mode = new_mode;
2873 ASSERT3P(aclp, !=, NULL);
2874 err = zfs_aclset_common(zp, aclp, cr, tx);
2875 ASSERT0(err);
2876 if (zp->z_acl_cached)
2877 zfs_acl_free(zp->z_acl_cached);
2878 zp->z_acl_cached = aclp;
2879 aclp = NULL;
2880 }
2881
2882
2883 if (mask & AT_ATIME) {
2884 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
2885 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
2886 &zp->z_atime, sizeof (zp->z_atime));
2887 }
2888
2889 if (mask & AT_MTIME) {
2890 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
2891 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
2892 mtime, sizeof (mtime));
2893 }
2894
2895 if (projid != ZFS_INVALID_PROJID) {
2896 zp->z_projid = projid;
2897 SA_ADD_BULK_ATTR(bulk, count,
2898 SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
2899 sizeof (zp->z_projid));
2900 }
2901
2902 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2903 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
2904 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
2905 NULL, mtime, sizeof (mtime));
2906 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2907 &ctime, sizeof (ctime));
2908 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
2909 } else if (mask != 0) {
2910 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
2911 &ctime, sizeof (ctime));
2912 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
2913 if (attrzp) {
2914 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
2915 SA_ZPL_CTIME(zfsvfs), NULL,
2916 &ctime, sizeof (ctime));
2917 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
2918 mtime, ctime);
2919 }
2920 }
2921
2922 /*
2923 * Do this after setting timestamps to prevent timestamp
2924 * update from toggling bit
2925 */
2926
2927 if (xoap && (mask & AT_XVATTR)) {
2928
2929 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
2930 xoap->xoa_createtime = vap->va_birthtime;
2931 /*
2932 * restore trimmed off masks
2933 * so that return masks can be set for caller.
2934 */
2935
2936 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
2937 XVA_SET_REQ(xvap, XAT_APPENDONLY);
2938 }
2939 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
2940 XVA_SET_REQ(xvap, XAT_NOUNLINK);
2941 }
2942 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
2943 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
2944 }
2945 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
2946 XVA_SET_REQ(xvap, XAT_NODUMP);
2947 }
2948 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
2949 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
2950 }
2951 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
2952 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
2953 }
2954 if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
2955 XVA_SET_REQ(xvap, XAT_PROJINHERIT);
2956 }
2957
2958 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
2959 ASSERT3S(vp->v_type, ==, VREG);
2960
2961 zfs_xvattr_set(zp, xvap, tx);
2962 }
2963
2964 if (fuid_dirtied)
2965 zfs_fuid_sync(zfsvfs, tx);
2966
2967 if (mask != 0)
2968 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2969
2970 if (mask & (AT_UID|AT_GID|AT_MODE))
2971 mutex_exit(&zp->z_acl_lock);
2972
2973 if (attrzp) {
2974 if (mask & (AT_UID|AT_GID|AT_MODE))
2975 mutex_exit(&attrzp->z_acl_lock);
2976 }
2977 out:
2978 if (err == 0 && attrzp) {
2979 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
2980 xattr_count, tx);
2981 ASSERT0(err2);
2982 }
2983
2984 if (aclp)
2985 zfs_acl_free(aclp);
2986
2987 if (fuidp) {
2988 zfs_fuid_info_free(fuidp);
2989 fuidp = NULL;
2990 }
2991
2992 if (err) {
2993 dmu_tx_abort(tx);
2994 if (attrzp)
2995 vput(ZTOV(attrzp));
2996 } else {
2997 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
2998 dmu_tx_commit(tx);
2999 if (attrzp) {
3000 if (err2 == 0 && handle_eadir)
3001 err = zfs_setattr_dir(attrzp);
3002 vput(ZTOV(attrzp));
3003 }
3004 }
3005
3006 out2:
3007 if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS)
3008 err = zil_commit(zilog, 0);
3009
3010 zfs_exit(zfsvfs, FTAG);
3011 return (err);
3012 }
3013
3014 /*
3015 * Look up the directory entries corresponding to the source and target
3016 * directory/name pairs.
3017 */
3018 static int
zfs_rename_relock_lookup(znode_t * sdzp,const struct componentname * scnp,znode_t ** szpp,znode_t * tdzp,const struct componentname * tcnp,znode_t ** tzpp)3019 zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp,
3020 znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp,
3021 znode_t **tzpp)
3022 {
3023 zfsvfs_t *zfsvfs;
3024 znode_t *szp, *tzp;
3025 int error;
3026
3027 /*
3028 * Before using sdzp and tdzp we must ensure that they are live.
3029 * As a porting legacy from illumos we have two things to worry
3030 * about. One is typical for FreeBSD and it is that the vnode is
3031 * not reclaimed (doomed). The other is that the znode is live.
3032 * The current code can invalidate the znode without acquiring the
3033 * corresponding vnode lock if the object represented by the znode
3034 * and vnode is no longer valid after a rollback or receive operation.
3035 * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock
3036 * that protects the znodes from the invalidation.
3037 */
3038 zfsvfs = sdzp->z_zfsvfs;
3039 ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3040 if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
3041 return (error);
3042 if ((error = zfs_verify_zp(tdzp)) != 0) {
3043 zfs_exit(zfsvfs, FTAG);
3044 return (error);
3045 }
3046
3047 /*
3048 * Re-resolve svp to be certain it still exists and fetch the
3049 * correct vnode.
3050 */
3051 error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS);
3052 if (error != 0) {
3053 /* Source entry invalid or not there. */
3054 if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3055 (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3056 error = SET_ERROR(EINVAL);
3057 goto out;
3058 }
3059 *szpp = szp;
3060
3061 /*
3062 * Re-resolve tvp, if it disappeared we just carry on.
3063 */
3064 error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0);
3065 if (error != 0) {
3066 vrele(ZTOV(szp));
3067 if ((tcnp->cn_flags & ISDOTDOT) != 0)
3068 error = SET_ERROR(EINVAL);
3069 goto out;
3070 }
3071 *tzpp = tzp;
3072 out:
3073 zfs_exit(zfsvfs, FTAG);
3074 return (error);
3075 }
3076
/*
 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
 * fail to acquire any lock in the path we will drop all held locks,
 * acquire the new lock in a blocking fashion, and then release it and
 * restart the rename.  This acquire/release step ensures that we do not
 * spin on a lock waiting for release.  On error release all vnode locks
 * and decrement references the way tmpfs_rename() would do.
 *
 *	IN:	sdvp	- source directory vnode.
 *		tdvp	- target directory vnode; unlocked on entry here.
 *		scnp	- source entry name.
 *		tcnp	- target entry name.
 *	INOUT:	svpp	- source vnode; replaced by the vnode found by the
 *			  fresh lookup.
 *		tvpp	- target vnode, may point to NULL; replaced by the
 *			  vnode found by the fresh lookup, or by NULL.
 *
 *	RETURN:	0 with sdvp, tdvp, *svpp and (if non-NULL) *tvpp locked
 *		exclusively, or an error code with none of the locks taken
 *		here still held.
 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	int		error;

	/*
	 * Start from a clean slate: release the target-side locks.  The
	 * second test avoids unlocking the same vnode twice when *tvpp is
	 * tdvp itself.
	 */
	VOP_UNLOCK(tdvp);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp);

relock:
	/* The source directory is the only lock taken in blocking mode. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp);
		if (error != EBUSY)
			goto out;
		/*
		 * Contended: wait for tdvp with nothing else held, then
		 * drop it again and retry from the top.
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp);
		goto relock;
	}
	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);

	/* Both directories are locked; resolve the two names again. */
	error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp);
	if (error != 0) {
		VOP_UNLOCK(sdvp);
		VOP_UNLOCK(tdvp);
		goto out;
	}
	svp = ZTOV(szp);
	tvp = tzp != NULL ? ZTOV(tzp) : NULL;

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp);
		VOP_UNLOCK(tdvp);
		/* The target will be looked up again after the retry. */
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		/* Hand the freshly looked-up source vnode to the caller. */
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* svp is locked: swap it in for the caller's old reference. */
	vrele(*svpp);
	*svpp = nvp;

	/* Drop the caller's old target reference before replacing it. */
	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp);
			VOP_UNLOCK(tdvp);
			VOP_UNLOCK(*svpp);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			/*
			 * Got it after waiting: release both the lock and
			 * the reference, then start over.
			 */
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}
3191
3192 /*
3193 * Note that we must use VRELE_ASYNC in this function as it walks
3194 * up the directory tree and vrele may need to acquire an exclusive
3195 * lock if a last reference to a vnode is dropped.
3196 */
3197 static int
zfs_rename_check(znode_t * szp,znode_t * sdzp,znode_t * tdzp)3198 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3199 {
3200 zfsvfs_t *zfsvfs;
3201 znode_t *zp, *zp1;
3202 uint64_t parent;
3203 int error;
3204
3205 zfsvfs = tdzp->z_zfsvfs;
3206 if (tdzp == szp)
3207 return (SET_ERROR(EINVAL));
3208 if (tdzp == sdzp)
3209 return (0);
3210 if (tdzp->z_id == zfsvfs->z_root)
3211 return (0);
3212 zp = tdzp;
3213 for (;;) {
3214 ASSERT(!zp->z_unlinked);
3215 if ((error = sa_lookup(zp->z_sa_hdl,
3216 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3217 break;
3218
3219 if (parent == szp->z_id) {
3220 error = SET_ERROR(EINVAL);
3221 break;
3222 }
3223 if (parent == zfsvfs->z_root)
3224 break;
3225 if (parent == sdzp->z_id)
3226 break;
3227
3228 error = zfs_zget(zfsvfs, parent, &zp1);
3229 if (error != 0)
3230 break;
3231
3232 if (zp != tdzp)
3233 VN_RELE_ASYNC(ZTOV(zp),
3234 dsl_pool_zrele_taskq(
3235 dmu_objset_pool(zfsvfs->z_os)));
3236 zp = zp1;
3237 }
3238
3239 if (error == ENOTDIR)
3240 panic("checkpath: .. not a directory\n");
3241 if (zp != tdzp)
3242 VN_RELE_ASYNC(ZTOV(zp),
3243 dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3244 return (error);
3245 }
3246
/*
 * Forward declaration: zfs_do_rename() below calls zfs_do_rename_impl(),
 * which is defined after it in this file.
 */
static int
zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr, u_int at_flags);
3251
3252 /*
3253 * Move an entry from the provided source directory to the target
3254 * directory. Change the entry name as indicated.
3255 *
3256 * IN: sdvp - Source directory containing the "old entry".
3257 * scnp - Old entry name.
3258 * tdvp - Target directory to contain the "new entry".
3259 * tcnp - New entry name.
3260 * cr - credentials of caller.
3261 * at_flags - AT_RENAME_*
3262 * INOUT: svpp - Source file
3263 * tvpp - Target file, may point to NULL initially
3264 *
3265 * RETURN: 0 on success, error code on failure.
3266 *
3267 * Timestamps:
3268 * sdvp,tdvp - ctime|mtime updated
3269 */
3270 static int
zfs_do_rename(vnode_t * sdvp,vnode_t ** svpp,struct componentname * scnp,vnode_t * tdvp,vnode_t ** tvpp,struct componentname * tcnp,cred_t * cr,u_int at_flags)3271 zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3272 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3273 cred_t *cr, u_int at_flags)
3274 {
3275 int error;
3276
3277 ASSERT_VOP_ELOCKED(tdvp, __func__);
3278 if (*tvpp != NULL)
3279 ASSERT_VOP_ELOCKED(*tvpp, __func__);
3280
3281 /* Reject renames across filesystems. */
3282 if ((*svpp)->v_mount != tdvp->v_mount ||
3283 ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3284 error = SET_ERROR(EXDEV);
3285 goto out;
3286 }
3287
3288 if (zfsctl_is_node(tdvp)) {
3289 error = SET_ERROR(EXDEV);
3290 goto out;
3291 }
3292
3293 /*
3294 * Lock all four vnodes to ensure safety and semantics of renaming.
3295 */
3296 error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3297 if (error != 0) {
3298 /* no vnodes are locked in the case of error here */
3299 return (error);
3300 }
3301
3302 error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr,
3303 at_flags);
3304 VOP_UNLOCK(sdvp);
3305 VOP_UNLOCK(*svpp);
3306 out:
3307 if (*tvpp != NULL)
3308 VOP_UNLOCK(*tvpp);
3309 if (tdvp != *tvpp)
3310 VOP_UNLOCK(tdvp);
3311
3312 return (error);
3313 }
3314
3315 static int
zfs_do_rename_impl(vnode_t * sdvp,vnode_t ** svpp,struct componentname * scnp,vnode_t * tdvp,vnode_t ** tvpp,struct componentname * tcnp,cred_t * cr,u_int at_flags)3316 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3317 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3318 cred_t *cr, u_int at_flags)
3319 {
3320 dmu_tx_t *tx;
3321 zfsvfs_t *zfsvfs;
3322 zilog_t *zilog;
3323 znode_t *tdzp, *sdzp, *tzp, *szp;
3324 const char *snm = scnp->cn_nameptr;
3325 const char *tnm = tcnp->cn_nameptr;
3326 int error;
3327
3328 tdzp = VTOZ(tdvp);
3329 sdzp = VTOZ(sdvp);
3330 zfsvfs = tdzp->z_zfsvfs;
3331
3332 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3333 return (error);
3334 if ((error = zfs_verify_zp(sdzp)) != 0) {
3335 zfs_exit(zfsvfs, FTAG);
3336 return (error);
3337 }
3338 zilog = zfsvfs->z_log;
3339
3340 if (zfsvfs->z_utf8 && u8_validate(tnm,
3341 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3342 error = SET_ERROR(EILSEQ);
3343 goto out;
3344 }
3345
3346 /* If source and target are the same file, there is nothing to do. */
3347 if ((*svpp) == (*tvpp)) {
3348 error = 0;
3349 goto out;
3350 }
3351
3352 if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3353 ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3354 (*tvpp)->v_mountedhere != NULL)) {
3355 error = SET_ERROR(EXDEV);
3356 goto out;
3357 }
3358
3359 szp = VTOZ(*svpp);
3360 if ((error = zfs_verify_zp(szp)) != 0) {
3361 zfs_exit(zfsvfs, FTAG);
3362 return (error);
3363 }
3364 tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3365 if (tzp != NULL) {
3366 if ((error = zfs_verify_zp(tzp)) != 0) {
3367 zfs_exit(zfsvfs, FTAG);
3368 return (error);
3369 }
3370 }
3371
3372 /*
3373 * This is to prevent the creation of links into attribute space
3374 * by renaming a linked file into/outof an attribute directory.
3375 * See the comment in zfs_link() for why this is considered bad.
3376 */
3377 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3378 error = SET_ERROR(EINVAL);
3379 goto out;
3380 }
3381
3382 /*
3383 * If we are using project inheritance, means if the directory has
3384 * ZFS_PROJINHERIT set, then its descendant directories will inherit
3385 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3386 * such case, we only allow renames into our tree when the project
3387 * IDs are the same.
3388 */
3389 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3390 tdzp->z_projid != szp->z_projid) {
3391 error = SET_ERROR(EXDEV);
3392 goto out;
3393 }
3394
3395 /*
3396 * Must have write access at the source to remove the old entry
3397 * and write access at the target to create the new entry.
3398 * Note that if target and source are the same, this can be
3399 * done in a single check.
3400 */
3401 if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL)))
3402 goto out;
3403
3404 if ((*svpp)->v_type == VDIR) {
3405 /*
3406 * Avoid ".", "..", and aliases of "." for obvious reasons.
3407 */
3408 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3409 sdzp == szp ||
3410 (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3411 error = EINVAL;
3412 goto out;
3413 }
3414
3415 /*
3416 * Check to make sure rename is valid.
3417 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3418 */
3419 if ((error = zfs_rename_check(szp, sdzp, tdzp)))
3420 goto out;
3421 }
3422
3423 /*
3424 * Does target exist?
3425 */
3426 if (tzp) {
3427 if ((at_flags & AT_RENAME_NOREPLACE) != 0) {
3428 error = SET_ERROR(EEXIST);
3429 goto out;
3430 }
3431
3432 /*
3433 * Source and target must be the same type.
3434 */
3435 if ((*svpp)->v_type == VDIR) {
3436 if ((*tvpp)->v_type != VDIR) {
3437 error = SET_ERROR(ENOTDIR);
3438 goto out;
3439 } else {
3440 cache_purge(tdvp);
3441 if (sdvp != tdvp)
3442 cache_purge(sdvp);
3443 }
3444 } else {
3445 if ((*tvpp)->v_type == VDIR) {
3446 error = SET_ERROR(EISDIR);
3447 goto out;
3448 }
3449 }
3450 }
3451
3452 vn_seqc_write_begin(*svpp);
3453 vn_seqc_write_begin(sdvp);
3454 if (*tvpp != NULL)
3455 vn_seqc_write_begin(*tvpp);
3456 if (tdvp != *tvpp)
3457 vn_seqc_write_begin(tdvp);
3458
3459 vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3460 if (tzp)
3461 vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3462
3463 /*
3464 * notify the target directory if it is not the same
3465 * as source directory.
3466 */
3467 if (tdvp != sdvp) {
3468 vnevent_rename_dest_dir(tdvp, ct);
3469 }
3470
3471 tx = dmu_tx_create(zfsvfs->z_os);
3472 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3473 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3474 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3475 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3476 if (sdzp != tdzp) {
3477 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3478 zfs_sa_upgrade_txholds(tx, tdzp);
3479 }
3480 if (tzp) {
3481 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3482 zfs_sa_upgrade_txholds(tx, tzp);
3483 }
3484
3485 zfs_sa_upgrade_txholds(tx, szp);
3486 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3487 error = dmu_tx_assign(tx, DMU_TX_WAIT);
3488 if (error) {
3489 dmu_tx_abort(tx);
3490 goto out_seq;
3491 }
3492
3493 if (tzp) /* Attempt to remove the existing target */
3494 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3495
3496 if (error == 0) {
3497 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3498 if (error == 0) {
3499 szp->z_pflags |= ZFS_AV_MODIFIED;
3500
3501 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3502 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3503 ASSERT0(error);
3504
3505 error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3506 NULL);
3507 if (error == 0) {
3508 zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3509 snm, tdzp, tnm, szp);
3510 } else {
3511 /*
3512 * At this point, we have successfully created
3513 * the target name, but have failed to remove
3514 * the source name. Since the create was done
3515 * with the ZRENAMING flag, there are
3516 * complications; for one, the link count is
3517 * wrong. The easiest way to deal with this
3518 * is to remove the newly created target, and
3519 * return the original error. This must
3520 * succeed; fortunately, it is very unlikely to
3521 * fail, since we just created it.
3522 */
3523 VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx,
3524 ZRENAMING, NULL));
3525 }
3526 }
3527 if (error == 0) {
3528 cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
3529 }
3530 }
3531
3532 dmu_tx_commit(tx);
3533
3534 out_seq:
3535 vn_seqc_write_end(*svpp);
3536 vn_seqc_write_end(sdvp);
3537 if (*tvpp != NULL)
3538 vn_seqc_write_end(*tvpp);
3539 if (tdvp != *tvpp)
3540 vn_seqc_write_end(tdvp);
3541
3542 out:
3543 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3544 error = zil_commit(zilog, 0);
3545 zfs_exit(zfsvfs, FTAG);
3546
3547 return (error);
3548 }
3549
3550 int
zfs_rename(znode_t * sdzp,const char * sname,znode_t * tdzp,const char * tname,cred_t * cr,int flags,uint64_t rflags,u_int at_flags,vattr_t * wo_vap,zidmap_t * mnt_ns)3551 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
3552 cred_t *cr, int flags, uint64_t rflags, u_int at_flags, vattr_t *wo_vap,
3553 zidmap_t *mnt_ns)
3554 {
3555 struct componentname scn, tcn;
3556 vnode_t *sdvp, *tdvp;
3557 vnode_t *svp, *tvp;
3558 int error;
3559 svp = tvp = NULL;
3560
3561 if (is_nametoolong(tdzp->z_zfsvfs, tname))
3562 return (SET_ERROR(ENAMETOOLONG));
3563
3564 if (rflags != 0 || wo_vap != NULL)
3565 return (SET_ERROR(EINVAL));
3566
3567 sdvp = ZTOV(sdzp);
3568 tdvp = ZTOV(tdzp);
3569 error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
3570 if (sdzp->z_zfsvfs->z_replay == B_FALSE)
3571 VOP_UNLOCK(sdvp);
3572 if (error != 0)
3573 goto fail;
3574 VOP_UNLOCK(svp);
3575
3576 vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
3577 error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
3578 if (error == EJUSTRETURN)
3579 tvp = NULL;
3580 else if (error != 0) {
3581 VOP_UNLOCK(tdvp);
3582 goto fail;
3583 }
3584
3585 error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr,
3586 at_flags);
3587 fail:
3588 if (svp != NULL)
3589 vrele(svp);
3590 if (tvp != NULL)
3591 vrele(tvp);
3592
3593 return (error);
3594 }
3595
3596 /*
3597 * Insert the indicated symbolic reference entry into the directory.
3598 *
3599 * IN: dvp - Directory to contain new symbolic link.
3600 * link - Name for new symlink entry.
3601 * vap - Attributes of new entry.
3602 * cr - credentials of caller.
3603 * ct - caller context
3604 * flags - case flags
3605 * mnt_ns - Unused on FreeBSD
3606 *
3607 * RETURN: 0 on success, error code on failure.
3608 *
3609 * Timestamps:
3610 * dvp - ctime|mtime updated
3611 */
3612 int
zfs_symlink(znode_t * dzp,const char * name,vattr_t * vap,const char * link,znode_t ** zpp,cred_t * cr,int flags,zidmap_t * mnt_ns)3613 zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
3614 const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
3615 {
3616 (void) flags;
3617 znode_t *zp;
3618 dmu_tx_t *tx;
3619 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3620 zilog_t *zilog;
3621 uint64_t len = strlen(link);
3622 int error;
3623 zfs_acl_ids_t acl_ids;
3624 boolean_t fuid_dirtied;
3625 uint64_t txtype = TX_SYMLINK;
3626
3627 ASSERT3S(vap->va_type, ==, VLNK);
3628
3629 if (is_nametoolong(zfsvfs, name))
3630 return (SET_ERROR(ENAMETOOLONG));
3631
3632 if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
3633 return (error);
3634 zilog = zfsvfs->z_log;
3635
3636 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3637 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3638 zfs_exit(zfsvfs, FTAG);
3639 return (SET_ERROR(EILSEQ));
3640 }
3641
3642 if (len > MAXPATHLEN) {
3643 zfs_exit(zfsvfs, FTAG);
3644 return (SET_ERROR(ENAMETOOLONG));
3645 }
3646
3647 if ((error = zfs_acl_ids_create(dzp, 0,
3648 vap, cr, NULL, &acl_ids, NULL)) != 0) {
3649 zfs_exit(zfsvfs, FTAG);
3650 return (error);
3651 }
3652
3653 /*
3654 * Attempt to lock directory; fail if entry already exists.
3655 */
3656 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
3657 if (error) {
3658 zfs_acl_ids_free(&acl_ids);
3659 zfs_exit(zfsvfs, FTAG);
3660 return (error);
3661 }
3662
3663 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
3664 zfs_acl_ids_free(&acl_ids);
3665 zfs_exit(zfsvfs, FTAG);
3666 return (error);
3667 }
3668
3669 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
3670 zfs_acl_ids_free(&acl_ids);
3671 zfs_exit(zfsvfs, FTAG);
3672 return (SET_ERROR(EDQUOT));
3673 }
3674
3675 getnewvnode_reserve();
3676 tx = dmu_tx_create(zfsvfs->z_os);
3677 fuid_dirtied = zfsvfs->z_fuid_dirty;
3678 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3679 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3680 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3681 ZFS_SA_BASE_ATTR_SIZE + len);
3682 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3683 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3684 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3685 acl_ids.z_aclp->z_acl_bytes);
3686 }
3687 if (fuid_dirtied)
3688 zfs_fuid_txhold(zfsvfs, tx);
3689 error = dmu_tx_assign(tx, DMU_TX_WAIT);
3690 if (error) {
3691 zfs_acl_ids_free(&acl_ids);
3692 dmu_tx_abort(tx);
3693 getnewvnode_drop_reserve();
3694 zfs_exit(zfsvfs, FTAG);
3695 return (error);
3696 }
3697
3698 /*
3699 * Create a new object for the symlink.
3700 * for version 4 ZPL datasets the symlink will be an SA attribute
3701 */
3702 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3703
3704 if (fuid_dirtied)
3705 zfs_fuid_sync(zfsvfs, tx);
3706
3707 if (zp->z_is_sa)
3708 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3709 __DECONST(void *, link), len, tx);
3710 else
3711 zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
3712
3713 zp->z_size = len;
3714 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3715 &zp->z_size, sizeof (zp->z_size), tx);
3716 /*
3717 * Insert the new object into the directory.
3718 */
3719 error = zfs_link_create(dzp, name, zp, tx, ZNEW);
3720 if (error != 0) {
3721 zfs_znode_delete(zp, tx);
3722 VOP_UNLOCK(ZTOV(zp));
3723 zrele(zp);
3724 } else {
3725 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3726 }
3727
3728 zfs_acl_ids_free(&acl_ids);
3729
3730 dmu_tx_commit(tx);
3731
3732 getnewvnode_drop_reserve();
3733
3734 if (error == 0) {
3735 *zpp = zp;
3736
3737 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3738 error = zil_commit(zilog, 0);
3739 }
3740
3741 zfs_exit(zfsvfs, FTAG);
3742 return (error);
3743 }
3744
3745 /*
3746 * Return, in the buffer contained in the provided uio structure,
3747 * the symbolic path referred to by vp.
3748 *
3749 * IN: vp - vnode of symbolic link.
3750 * uio - structure to contain the link path.
3751 * cr - credentials of caller.
3752 * ct - caller context
3753 *
3754 * OUT: uio - structure containing the link path.
3755 *
3756 * RETURN: 0 on success, error code on failure.
3757 *
3758 * Timestamps:
3759 * vp - atime updated
3760 */
3761 static int
zfs_readlink(vnode_t * vp,zfs_uio_t * uio,cred_t * cr,caller_context_t * ct)3762 zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
3763 {
3764 (void) cr, (void) ct;
3765 znode_t *zp = VTOZ(vp);
3766 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3767 int error;
3768
3769 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3770 return (error);
3771
3772 if (zp->z_is_sa)
3773 error = sa_lookup_uio(zp->z_sa_hdl,
3774 SA_ZPL_SYMLINK(zfsvfs), uio);
3775 else
3776 error = zfs_sa_readlink(zp, uio);
3777
3778 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3779
3780 zfs_exit(zfsvfs, FTAG);
3781 return (error);
3782 }
3783
3784 /*
3785 * Insert a new entry into directory tdvp referencing svp.
3786 *
3787 * IN: tdvp - Directory to contain new entry.
3788 * svp - vnode of new entry.
3789 * name - name of new entry.
3790 * cr - credentials of caller.
3791 *
3792 * RETURN: 0 on success, error code on failure.
3793 *
3794 * Timestamps:
3795 * tdvp - ctime|mtime updated
3796 * svp - ctime updated
3797 */
3798 int
zfs_link(znode_t * tdzp,znode_t * szp,const char * name,cred_t * cr,int flags)3799 zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
3800 int flags)
3801 {
3802 (void) flags;
3803 znode_t *tzp;
3804 zfsvfs_t *zfsvfs = tdzp->z_zfsvfs;
3805 zilog_t *zilog;
3806 dmu_tx_t *tx;
3807 int error;
3808 uint64_t parent;
3809 uid_t owner;
3810
3811 ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR);
3812
3813 if (is_nametoolong(zfsvfs, name))
3814 return (SET_ERROR(ENAMETOOLONG));
3815
3816 if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
3817 return (error);
3818 zilog = zfsvfs->z_log;
3819
3820 /*
3821 * POSIX dictates that we return EPERM here.
3822 * Better choices include ENOTSUP or EISDIR.
3823 */
3824 if (ZTOV(szp)->v_type == VDIR) {
3825 zfs_exit(zfsvfs, FTAG);
3826 return (SET_ERROR(EPERM));
3827 }
3828
3829 if ((error = zfs_verify_zp(szp)) != 0) {
3830 zfs_exit(zfsvfs, FTAG);
3831 return (error);
3832 }
3833
3834 /*
3835 * If we are using project inheritance, means if the directory has
3836 * ZFS_PROJINHERIT set, then its descendant directories will inherit
3837 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
3838 * such case, we only allow hard link creation in our tree when the
3839 * project IDs are the same.
3840 */
3841 if (tdzp->z_pflags & ZFS_PROJINHERIT &&
3842 tdzp->z_projid != szp->z_projid) {
3843 zfs_exit(zfsvfs, FTAG);
3844 return (SET_ERROR(EXDEV));
3845 }
3846
3847 if (szp->z_pflags & (ZFS_APPENDONLY |
3848 ZFS_IMMUTABLE | ZFS_READONLY)) {
3849 zfs_exit(zfsvfs, FTAG);
3850 return (SET_ERROR(EPERM));
3851 }
3852
3853 /* Prevent links to .zfs/shares files */
3854
3855 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3856 &parent, sizeof (uint64_t))) != 0) {
3857 zfs_exit(zfsvfs, FTAG);
3858 return (error);
3859 }
3860 if (parent == zfsvfs->z_shares_dir) {
3861 zfs_exit(zfsvfs, FTAG);
3862 return (SET_ERROR(EPERM));
3863 }
3864
3865 if (zfsvfs->z_utf8 && u8_validate(name,
3866 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3867 zfs_exit(zfsvfs, FTAG);
3868 return (SET_ERROR(EILSEQ));
3869 }
3870
3871 /*
3872 * We do not support links between attributes and non-attributes
3873 * because of the potential security risk of creating links
3874 * into "normal" file space in order to circumvent restrictions
3875 * imposed in attribute space.
3876 */
3877 if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
3878 zfs_exit(zfsvfs, FTAG);
3879 return (SET_ERROR(EINVAL));
3880 }
3881
3882
3883 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3884 if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
3885 zfs_exit(zfsvfs, FTAG);
3886 return (SET_ERROR(EPERM));
3887 }
3888
3889 if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) {
3890 zfs_exit(zfsvfs, FTAG);
3891 return (error);
3892 }
3893
3894 /*
3895 * Attempt to lock directory; fail if entry already exists.
3896 */
3897 error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
3898 if (error) {
3899 zfs_exit(zfsvfs, FTAG);
3900 return (error);
3901 }
3902
3903 tx = dmu_tx_create(zfsvfs->z_os);
3904 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3905 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
3906 zfs_sa_upgrade_txholds(tx, szp);
3907 zfs_sa_upgrade_txholds(tx, tdzp);
3908 error = dmu_tx_assign(tx, DMU_TX_WAIT);
3909 if (error) {
3910 dmu_tx_abort(tx);
3911 zfs_exit(zfsvfs, FTAG);
3912 return (error);
3913 }
3914
3915 error = zfs_link_create(tdzp, name, szp, tx, 0);
3916
3917 if (error == 0) {
3918 uint64_t txtype = TX_LINK;
3919 zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
3920 }
3921
3922 dmu_tx_commit(tx);
3923
3924 if (error == 0) {
3925 vnevent_link(ZTOV(szp), ct);
3926 }
3927
3928 if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3929 error = zil_commit(zilog, 0);
3930
3931 zfs_exit(zfsvfs, FTAG);
3932 return (error);
3933 }
3934
3935 /*
3936 * Free or allocate space in a file. Currently, this function only
3937 * supports the `F_FREESP' command. However, this command is somewhat
3938 * misnamed, as its functionality includes the ability to allocate as
3939 * well as free space.
3940 *
3941 * IN: ip - inode of file to free data in.
3942 * cmd - action to take (only F_FREESP supported).
3943 * bfp - section of file to free/alloc.
3944 * flag - current file open mode flags.
3945 * offset - current file offset.
3946 * cr - credentials of caller.
3947 *
3948 * RETURN: 0 on success, error code on failure.
3949 *
3950 * Timestamps:
3951 * ip - ctime|mtime updated
3952 */
3953 int
zfs_space(znode_t * zp,int cmd,flock64_t * bfp,int flag,offset_t offset,cred_t * cr)3954 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
3955 offset_t offset, cred_t *cr)
3956 {
3957 (void) offset;
3958 zfsvfs_t *zfsvfs = ZTOZSB(zp);
3959 uint64_t off, len;
3960 int error;
3961
3962 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
3963 return (error);
3964
3965 if (cmd != F_FREESP) {
3966 zfs_exit(zfsvfs, FTAG);
3967 return (SET_ERROR(EINVAL));
3968 }
3969
3970 /*
3971 * Callers might not be able to detect properly that we are read-only,
3972 * so check it explicitly here.
3973 */
3974 if (zfs_is_readonly(zfsvfs)) {
3975 zfs_exit(zfsvfs, FTAG);
3976 return (SET_ERROR(EROFS));
3977 }
3978
3979 if (bfp->l_len < 0) {
3980 zfs_exit(zfsvfs, FTAG);
3981 return (SET_ERROR(EINVAL));
3982 }
3983
3984 /*
3985 * Permissions aren't checked on Solaris because on this OS
3986 * zfs_space() can only be called with an opened file handle.
3987 * On Linux we can get here through truncate_range() which
3988 * operates directly on inodes, so we need to check access rights.
3989 */
3990 if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) {
3991 zfs_exit(zfsvfs, FTAG);
3992 return (error);
3993 }
3994
3995 off = bfp->l_start;
3996 len = bfp->l_len; /* 0 means from off to end of file */
3997
3998 error = zfs_freesp(zp, off, len, flag, TRUE);
3999
4000 zfs_exit(zfsvfs, FTAG);
4001 return (error);
4002 }
4003
4004 static void
zfs_inactive(vnode_t * vp,cred_t * cr,caller_context_t * ct)4005 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4006 {
4007 (void) cr, (void) ct;
4008 znode_t *zp = VTOZ(vp);
4009 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4010 int error;
4011
4012 ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
4013 if (zp->z_sa_hdl == NULL) {
4014 /*
4015 * The fs has been unmounted, or we did a
4016 * suspend/resume and this file no longer exists.
4017 */
4018 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
4019 vrecycle(vp);
4020 return;
4021 }
4022
4023 if (zp->z_unlinked) {
4024 /*
4025 * Fast path to recycle a vnode of a removed file.
4026 */
4027 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
4028 vrecycle(vp);
4029 return;
4030 }
4031
4032 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4033 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4034
4035 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4036 zfs_sa_upgrade_txholds(tx, zp);
4037 error = dmu_tx_assign(tx, DMU_TX_WAIT);
4038 if (error) {
4039 dmu_tx_abort(tx);
4040 } else {
4041 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4042 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4043 zp->z_atime_dirty = 0;
4044 dmu_tx_commit(tx);
4045 }
4046 }
4047 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
4048 }
4049
4050
4051 _Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid),
4052 "struct zfid_short bigger than struct fid");
4053 _Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid),
4054 "struct zfid_long bigger than struct fid");
4055
4056 static int
zfs_fid(vnode_t * vp,fid_t * fidp,caller_context_t * ct)4057 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4058 {
4059 (void) ct;
4060 znode_t *zp = VTOZ(vp);
4061 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4062 uint32_t gen;
4063 uint64_t gen64;
4064 uint64_t object = zp->z_id;
4065 zfid_short_t *zfid;
4066 int size, i, error;
4067
4068 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4069 return (error);
4070
4071 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4072 &gen64, sizeof (uint64_t))) != 0) {
4073 zfs_exit(zfsvfs, FTAG);
4074 return (error);
4075 }
4076
4077 gen = (uint32_t)gen64;
4078
4079 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4080 fidp->fid_len = size;
4081
4082 zfid = (zfid_short_t *)fidp;
4083
4084 zfid->zf_len = size;
4085
4086 for (i = 0; i < sizeof (zfid->zf_object); i++)
4087 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4088
4089 /* Must have a non-zero generation number to distinguish from .zfs */
4090 if (gen == 0)
4091 gen = 1;
4092 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4093 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4094
4095 if (size == LONG_FID_LEN) {
4096 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4097 zfid_long_t *zlfid;
4098
4099 zlfid = (zfid_long_t *)fidp;
4100
4101 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4102 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4103
4104 /* XXX - this should be the generation number for the objset */
4105 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4106 zlfid->zf_setgen[i] = 0;
4107 }
4108
4109 zfs_exit(zfsvfs, FTAG);
4110 return (0);
4111 }
4112
4113 static int
zfs_pathconf(vnode_t * vp,int cmd,ulong_t * valp,cred_t * cr,caller_context_t * ct)4114 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4115 caller_context_t *ct)
4116 {
4117 znode_t *zp;
4118 zfsvfs_t *zfsvfs;
4119 uint_t blksize, iosize;
4120 int error;
4121
4122 switch (cmd) {
4123 case _PC_LINK_MAX:
4124 *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
4125 return (0);
4126
4127 case _PC_FILESIZEBITS:
4128 *valp = 64;
4129 return (0);
4130 case _PC_MIN_HOLE_SIZE:
4131 iosize = vp->v_mount->mnt_stat.f_iosize;
4132 if (vp->v_type == VREG) {
4133 zp = VTOZ(vp);
4134 blksize = zp->z_blksz;
4135 if (zp->z_size <= blksize)
4136 blksize = MAX(blksize, iosize);
4137 *valp = (int)blksize;
4138 return (0);
4139 }
4140 if (vp->v_type == VDIR) {
4141 *valp = (int)iosize;
4142 return (0);
4143 }
4144 return (EINVAL);
4145 case _PC_ACL_EXTENDED:
4146 #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
4147 zp = VTOZ(vp);
4148 zfsvfs = zp->z_zfsvfs;
4149 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4150 return (error);
4151 *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0;
4152 zfs_exit(zfsvfs, FTAG);
4153 #else
4154 *valp = 0;
4155 #endif
4156 return (0);
4157
4158 case _PC_ACL_NFS4:
4159 zp = VTOZ(vp);
4160 zfsvfs = zp->z_zfsvfs;
4161 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
4162 return (error);
4163 *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
4164 zfs_exit(zfsvfs, FTAG);
4165 return (0);
4166
4167 case _PC_ACL_PATH_MAX:
4168 *valp = ACL_MAX_ENTRIES;
4169 return (0);
4170
4171 default:
4172 return (EOPNOTSUPP);
4173 }
4174 }
4175
4176 static int
zfs_getpages(struct vnode * vp,vm_page_t * ma,int count,int * rbehind,int * rahead)4177 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
4178 int *rahead)
4179 {
4180 znode_t *zp = VTOZ(vp);
4181 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4182 zfs_locked_range_t *lr;
4183 vm_object_t object;
4184 off_t start, end, obj_size;
4185 uint_t blksz;
4186 int pgsin_b, pgsin_a;
4187 int error;
4188
4189 if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
4190 return (zfs_vm_pagerret_error);
4191
4192 object = ma[0]->object;
4193 start = IDX_TO_OFF(ma[0]->pindex);
4194 end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
4195
4196 /*
4197 * Lock a range covering all required and optional pages.
4198 * Note that we need to handle the case of the block size growing.
4199 */
4200 for (;;) {
4201 uint64_t len;
4202
4203 blksz = zp->z_blksz;
4204 len = roundup(end, blksz) - rounddown(start, blksz);
4205
4206 lr = zfs_rangelock_tryenter(&zp->z_rangelock,
4207 rounddown(start, blksz), len, RL_READER);
4208 if (lr == NULL) {
4209 /*
4210 * Avoid a deadlock with update_pages(). We need to
4211 * hold the range lock when copying from the DMU, so
4212 * give up the busy lock to allow update_pages() to
4213 * proceed. We might need to allocate new pages, which
4214 * isn't quite right since this allocation isn't subject
4215 * to the page fault handler's OOM logic, but this is
4216 * the best we can do for now.
4217 */
4218 for (int i = 0; i < count; i++)
4219 vm_page_xunbusy(ma[i]);
4220
4221 lr = zfs_rangelock_enter(&zp->z_rangelock,
4222 rounddown(start, blksz), len, RL_READER);
4223
4224 zfs_vmobject_wlock(object);
4225 (void) vm_page_grab_pages(object, OFF_TO_IDX(start),
4226 VM_ALLOC_NORMAL | VM_ALLOC_WAITOK,
4227 ma, count);
4228 if (!vm_page_all_valid(ma[count - 1])) {
4229 /*
4230 * Later in this function, we copy DMU data to
4231 * invalid pages only. The last page may not be
4232 * entirely filled though, if the file does not
4233 * end on a page boundary. Therefore, we zero
4234 * that last page here to make sure it does not
4235 * contain garbage after the end of file.
4236 */
4237 ASSERT(vm_page_none_valid(ma[count - 1]));
4238 vm_page_zero_invalid(ma[count - 1], FALSE);
4239 }
4240 zfs_vmobject_wunlock(object);
4241 }
4242 if (blksz == zp->z_blksz)
4243 break;
4244 zfs_rangelock_exit(lr);
4245 }
4246
4247 zfs_vmobject_wlock(object);
4248 obj_size = object->un_pager.vnp.vnp_size;
4249 zfs_vmobject_wunlock(object);
4250 if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
4251 zfs_rangelock_exit(lr);
4252 zfs_exit(zfsvfs, FTAG);
4253 return (zfs_vm_pagerret_bad);
4254 }
4255
4256 pgsin_b = 0;
4257 if (rbehind != NULL) {
4258 pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
4259 pgsin_b = MIN(*rbehind, pgsin_b);
4260 }
4261
4262 pgsin_a = 0;
4263 if (rahead != NULL) {
4264 pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
4265 if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
4266 pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
4267 pgsin_a = MIN(*rahead, pgsin_a);
4268 }
4269
4270 /*
4271 * NB: we need to pass the exact byte size of the data that we expect
4272 * to read after accounting for the file size. This is required because
4273 * ZFS will panic if we request DMU to read beyond the end of the last
4274 * allocated block.
4275 */
4276 for (int i = 0; i < count; i++) {
4277 int dummypgsin, count1, j, last_size;
4278
4279 if (vm_page_any_valid(ma[i])) {
4280 ASSERT(vm_page_all_valid(ma[i]));
4281 continue;
4282 }
4283 for (j = i + 1; j < count; j++) {
4284 if (vm_page_any_valid(ma[j])) {
4285 ASSERT(vm_page_all_valid(ma[j]));
4286 break;
4287 }
4288 }
4289 count1 = j - i;
4290 dummypgsin = 0;
4291 last_size = j == count ?
4292 MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE;
4293 error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1,
4294 i == 0 ? &pgsin_b : &dummypgsin,
4295 j == count ? &pgsin_a : &dummypgsin,
4296 last_size);
4297 if (error != 0)
4298 break;
4299 i += count1 - 1;
4300 }
4301
4302 zfs_rangelock_exit(lr);
4303 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4304
4305 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
4306
4307 zfs_exit(zfsvfs, FTAG);
4308
4309 if (error != 0)
4310 return (zfs_vm_pagerret_error);
4311
4312 VM_CNT_INC(v_vnodein);
4313 VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
4314 if (rbehind != NULL)
4315 *rbehind = pgsin_b;
4316 if (rahead != NULL)
4317 *rahead = pgsin_a;
4318 return (zfs_vm_pagerret_ok);
4319 }
4320
4321 #ifndef _SYS_SYSPROTO_H_
4322 struct vop_getpages_args {
4323 struct vnode *a_vp;
4324 vm_page_t *a_m;
4325 int a_count;
4326 int *a_rbehind;
4327 int *a_rahead;
4328 };
4329 #endif
4330
4331 static int
zfs_freebsd_getpages(struct vop_getpages_args * ap)4332 zfs_freebsd_getpages(struct vop_getpages_args *ap)
4333 {
4334
4335 return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4336 ap->a_rahead));
4337 }
4338
4339 typedef struct {
4340 uint_t pca_npages;
4341 vm_page_t pca_pages[];
4342 } putpage_commit_arg_t;
4343
4344 static void
zfs_putpage_commit_cb(void * arg,int err)4345 zfs_putpage_commit_cb(void *arg, int err)
4346 {
4347 putpage_commit_arg_t *pca = arg;
4348 vm_object_t object = pca->pca_pages[0]->object;
4349
4350 zfs_vmobject_wlock(object);
4351
4352 for (uint_t i = 0; i < pca->pca_npages; i++) {
4353 vm_page_t pp = pca->pca_pages[i];
4354
4355 if (err == 0) {
4356 /*
4357 * Writeback succeeded, so undirty the page. If it
4358 * fails, we leave it in the same state it was. That's
4359 * most likely dirty, so it will get tried again some
4360 * other time.
4361 */
4362 vm_page_undirty(pp);
4363 }
4364
4365 vm_page_sunbusy(pp);
4366 }
4367
4368 vm_object_pip_wakeupn(object, pca->pca_npages);
4369
4370 zfs_vmobject_wunlock(object);
4371
4372 kmem_free(pca,
4373 offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages]));
4374 }
4375
/*
 * Write a run of pages from vp's VM object back to the DMU.
 *
 *	vp	- vnode whose pages are being written
 *	ma	- pages to write; the file offset is taken from ma[0]->pindex
 *	len	- number of bytes to write; asserted to be a non-zero
 *		  multiple of the page size
 *	flags	- pager flags; zfs_vm_pagerput_sync or zfs_vm_pagerput_inval
 *		  request an immediate zil_commit()
 *	rtvals	- per-page result array, one slot for each page in ma
 *
 * Every rtvals[] slot starts as zfs_vm_pagerret_error.  Pages lying wholly
 * beyond the object's size are marked zfs_vm_pagerret_bad.  Pages that were
 * written are marked zfs_vm_pagerret_pend (commit requested; the log
 * callback finishes them) or zfs_vm_pagerret_ok (no commit requested; they
 * are undirtied here).
 *
 * Returns zfs_vm_pagerret_error if the filesystem cannot be entered, the
 * non-zero result of zil_commit() if the commit fails, and rtvals[0]
 * otherwise.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zfs_locked_range_t *lr;
	dmu_tx_t *tx;
	struct sf_buf *sf;
	vm_object_t object;
	vm_page_t m;
	caddr_t va;
	size_t tocopy;
	size_t lo_len;
	vm_ooffset_t lo_off;
	vm_ooffset_t off;
	uint_t blksz;
	int ncount;
	int pcount;
	int err;
	int i;

	object = vp->v_object;
	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	/*
	 * pcount is the number of pages handed to us; ncount is the number
	 * we will actually write (it may shrink below if the run crosses
	 * the end of the object).  Assume failure for every page up front.
	 */
	pcount = btoc(len);
	ncount = pcount;
	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
		return (zfs_vm_pagerret_error);

	/*
	 * Take the range lock as a writer over the request, widened to
	 * multiples of the file's block size.
	 */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);

	/* Trim the request so that it does not extend past the object size. */
	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only",
				    m));
				/*
				 * Only the first pgoff bytes of the last page
				 * are inside the object; drop the dirty state
				 * of the remainder.
				 */
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			/* The whole run starts at or past the object size. */
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	/*
	 * Decide whether the write must be committed to the ZIL before we
	 * return.  This is computed before any "goto out" because the exit
	 * path consults it.
	 */
	boolean_t commit = (flags & (zfs_vm_pagerput_sync |
	    zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS;

	if (ncount == 0)
		goto out;

	/* Over quota: leave the pages marked as errors and bail out. */
	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
	    zp->z_projid))) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	/*
	 * Files whose block size is smaller than a page are written one
	 * mapped page at a time with dmu_write(); otherwise the page array
	 * is handed to dmu_write_pages().  On the first path err keeps the
	 * zero it received from dmu_tx_assign() above.
	 */
	if (zp->z_blksz < PAGE_SIZE) {
		vm_ooffset_t woff = off;
		size_t wlen = len;
		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
			tocopy = MIN(PAGE_SIZE, wlen);
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		/* Record the content change: mtime, ctime and flags. */
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);

		if (commit) {
			/*
			 * Caller requested that we commit immediately. We set
			 * a callback on the log entry, to be called once its
			 * on disk after the call to zil_commit() below. The
			 * pages will be undirtied and unbusied there.
			 */
			putpage_commit_arg_t *pca = kmem_alloc(
			    offsetof(putpage_commit_arg_t, pca_pages[ncount]),
			    KM_SLEEP);
			pca->pca_npages = ncount;
			memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount);

			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
			    B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca);

			for (i = 0; i < ncount; i++)
				rtvals[i] = zfs_vm_pagerret_pend;
		} else {
			/*
			 * Caller just wants the page written back somewhere,
			 * but doesn't need it committed yet. We've already
			 * written it back to the DMU, so we just need to put
			 * it on the async log, then undirty the page and
			 * return.
			 *
			 * We cannot use a callback here, because it would keep
			 * the page busy (locked) until it is eventually
			 * written down at txg sync.
			 */
			zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len,
			    B_FALSE, B_FALSE, NULL, NULL);

			zfs_vmobject_wlock(object);
			for (i = 0; i < ncount; i++) {
				rtvals[i] = zfs_vm_pagerret_ok;
				vm_page_undirty(ma[i]);
			}
			zfs_vmobject_wunlock(object);
		}

		VM_CNT_INC(v_vnodeout);
		VM_CNT_ADD(v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_rangelock_exit(lr);
	if (commit) {
		err = zil_commit(zfsvfs->z_log, zp->z_id);
		if (err != 0) {
			/*
			 * NOTE(review): this returns zil_commit()'s result
			 * directly, whereas every other path returns a
			 * zfs_vm_pagerret_* value - confirm that callers
			 * tolerate the mix.
			 */
			zfs_exit(zfsvfs, FTAG);
			return (err);
		}
	}

	/* Note that len may have been trimmed (possibly to zero) above. */
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);

	zfs_exit(zfsvfs, FTAG);
	return (rtvals[0]);
}
4566
4567 #ifndef _SYS_SYSPROTO_H_
4568 struct vop_putpages_args {
4569 struct vnode *a_vp;
4570 vm_page_t *a_m;
4571 int a_count;
4572 int a_sync;
4573 int *a_rtvals;
4574 };
4575 #endif
4576
4577 static int
zfs_freebsd_putpages(struct vop_putpages_args * ap)4578 zfs_freebsd_putpages(struct vop_putpages_args *ap)
4579 {
4580
4581 return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4582 ap->a_rtvals));
4583 }
4584
4585 #ifndef _SYS_SYSPROTO_H_
4586 struct vop_bmap_args {
4587 struct vnode *a_vp;
4588 daddr_t a_bn;
4589 struct bufobj **a_bop;
4590 daddr_t *a_bnp;
4591 int *a_runp;
4592 int *a_runb;
4593 };
4594 #endif
4595
4596 static int
zfs_freebsd_bmap(struct vop_bmap_args * ap)4597 zfs_freebsd_bmap(struct vop_bmap_args *ap)
4598 {
4599
4600 if (ap->a_bop != NULL)
4601 *ap->a_bop = &ap->a_vp->v_bufobj;
4602 if (ap->a_bnp != NULL)
4603 *ap->a_bnp = ap->a_bn;
4604 if (ap->a_runp != NULL)
4605 *ap->a_runp = 0;
4606 if (ap->a_runb != NULL)
4607 *ap->a_runb = 0;
4608
4609 return (0);
4610 }
4611
4612 #ifndef _SYS_SYSPROTO_H_
4613 struct vop_open_args {
4614 struct vnode *a_vp;
4615 int a_mode;
4616 struct ucred *a_cred;
4617 struct thread *a_td;
4618 };
4619 #endif
4620
4621 static int
zfs_freebsd_open(struct vop_open_args * ap)4622 zfs_freebsd_open(struct vop_open_args *ap)
4623 {
4624 vnode_t *vp = ap->a_vp;
4625 znode_t *zp = VTOZ(vp);
4626 int error;
4627
4628 error = zfs_open(&vp, ap->a_mode, ap->a_cred);
4629 if (error == 0)
4630 vnode_create_vobject(vp, zp->z_size, ap->a_td);
4631 return (error);
4632 }
4633
#ifndef _SYS_SYSPROTO_H_
struct vop_close_args {
	struct vnode *a_vp;
	int a_fflag;
	struct ucred *a_cred;
	struct thread *a_td;
};
#endif

/*
 * VOP_CLOSE entry point.  Forwards the vnode, file flags and credentials
 * to zfs_close() with fixed values of 1 and 0 for its third and fourth
 * arguments, and returns its result.
 */
static int
zfs_freebsd_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct ucred *cred = ap->a_cred;

	return (zfs_close(vp, ap->a_fflag, 1, 0, cred));
}
4649
4650 #ifndef _SYS_SYSPROTO_H_
4651 struct vop_ioctl_args {
4652 struct vnode *a_vp;
4653 ulong_t a_command;
4654 caddr_t a_data;
4655 int a_fflag;
4656 struct ucred *cred;
4657 struct thread *td;
4658 };
4659 #endif
4660
4661 static int
zfs_freebsd_ioctl(struct vop_ioctl_args * ap)4662 zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
4663 {
4664
4665 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4666 ap->a_fflag, ap->a_cred, NULL));
4667 }
4668
4669 static int
ioflags(int ioflags)4670 ioflags(int ioflags)
4671 {
4672 int flags = 0;
4673
4674 if (ioflags & IO_APPEND)
4675 flags |= O_APPEND;
4676 if (ioflags & IO_NDELAY)
4677 flags |= O_NONBLOCK;
4678 if (ioflags & IO_DIRECT)
4679 flags |= O_DIRECT;
4680 if (ioflags & IO_SYNC)
4681 flags |= O_SYNC;
4682
4683 return (flags);
4684 }
4685
4686 #ifndef _SYS_SYSPROTO_H_
4687 struct vop_read_args {
4688 struct vnode *a_vp;
4689 struct uio *a_uio;
4690 int a_ioflag;
4691 struct ucred *a_cred;
4692 };
4693 #endif
4694
4695 static int
zfs_freebsd_read(struct vop_read_args * ap)4696 zfs_freebsd_read(struct vop_read_args *ap)
4697 {
4698 zfs_uio_t uio;
4699 int error = 0;
4700 zfs_uio_init(&uio, ap->a_uio);
4701 error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4702 ap->a_cred);
4703 /*
4704 * XXX We occasionally get an EFAULT for Direct I/O reads on
4705 * FreeBSD 13. This still needs to be resolved. The EFAULT comes
4706 * from:
4707 * zfs_uio_get__dio_pages_alloc() ->
4708 * zfs_uio_get_dio_pages_impl() ->
4709 * zfs_uio_iov_step() ->
4710 * zfs_uio_get_user_pages().
4711 * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O
4712 * read fails to map in the user pages (returning EFAULT) the
4713 * Direct I/O request is broken up into two separate IO requests
4714 * and issued separately using Direct I/O.
4715 */
4716 #ifdef ZFS_DEBUG
4717 if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) {
4718 #if 0
4719 printf("%s(%d): Direct I/O read returning EFAULT "
4720 "uio = %p, zfs_uio_offset(uio) = %lu "
4721 "zfs_uio_resid(uio) = %lu\n",
4722 __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio),
4723 zfs_uio_resid(&uio));
4724 #endif
4725 }
4726
4727 #endif
4728 return (error);
4729 }
4730
4731 #ifndef _SYS_SYSPROTO_H_
4732 struct vop_write_args {
4733 struct vnode *a_vp;
4734 struct uio *a_uio;
4735 int a_ioflag;
4736 struct ucred *a_cred;
4737 };
4738 #endif
4739
4740 static int
zfs_freebsd_write(struct vop_write_args * ap)4741 zfs_freebsd_write(struct vop_write_args *ap)
4742 {
4743 zfs_uio_t uio;
4744 zfs_uio_init(&uio, ap->a_uio);
4745 return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
4746 ap->a_cred));
4747 }
4748
4749 /*
4750 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
4751 * the comment above cache_fplookup for details.
4752 */
4753 static int
zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args * v)4754 zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
4755 {
4756 vnode_t *vp;
4757 znode_t *zp;
4758 uint64_t pflags;
4759
4760 vp = v->a_vp;
4761 zp = VTOZ_SMR(vp);
4762 if (__predict_false(zp == NULL))
4763 return (EAGAIN);
4764 pflags = atomic_load_64(&zp->z_pflags);
4765 if (pflags & ZFS_AV_QUARANTINED)
4766 return (EAGAIN);
4767 if (pflags & ZFS_XATTR)
4768 return (EAGAIN);
4769 if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
4770 return (EAGAIN);
4771 return (0);
4772 }
4773
4774 static int
zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args * v)4775 zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
4776 {
4777 vnode_t *vp;
4778 znode_t *zp;
4779 char *target;
4780
4781 vp = v->a_vp;
4782 zp = VTOZ_SMR(vp);
4783 if (__predict_false(zp == NULL)) {
4784 return (EAGAIN);
4785 }
4786
4787 target = atomic_load_consume_ptr(&zp->z_cached_symlink);
4788 if (target == NULL) {
4789 return (EAGAIN);
4790 }
4791 return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
4792 }
4793
4794 #ifndef _SYS_SYSPROTO_H_
4795 struct vop_access_args {
4796 struct vnode *a_vp;
4797 accmode_t a_accmode;
4798 struct ucred *a_cred;
4799 struct thread *a_td;
4800 };
4801 #endif
4802
4803 static int
zfs_freebsd_access(struct vop_access_args * ap)4804 zfs_freebsd_access(struct vop_access_args *ap)
4805 {
4806 vnode_t *vp = ap->a_vp;
4807 znode_t *zp = VTOZ(vp);
4808 accmode_t accmode;
4809 int error = 0;
4810
4811
4812 if (ap->a_accmode == VEXEC) {
4813 if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
4814 return (0);
4815 }
4816
4817 /*
4818 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4819 */
4820 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4821 if (accmode != 0) {
4822 #if __FreeBSD_version >= 1500040
4823 /* For named attributes, do the checks. */
4824 if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0)
4825 error = zfs_access(zp, accmode, V_NAMEDATTR,
4826 ap->a_cred);
4827 else
4828 #endif
4829 error = zfs_access(zp, accmode, 0, ap->a_cred);
4830 }
4831
4832 /*
4833 * VADMIN has to be handled by vaccess().
4834 */
4835 if (error == 0) {
4836 accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4837 if (accmode != 0) {
4838 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4839 zp->z_gid, accmode, ap->a_cred);
4840 }
4841 }
4842
4843 /*
4844 * For VEXEC, ensure that at least one execute bit is set for
4845 * non-directories.
4846 */
4847 if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4848 (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4849 error = EACCES;
4850 }
4851
4852 return (error);
4853 }
4854
/*
 * Reference copy of the VOP_LOOKUP argument structure, compiled only when
 * _SYS_SYSPROTO_H_ is not defined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_lookup_args {
	struct vnode *a_dvp;		/* directory to search */
	struct vnode **a_vpp;		/* where the result vnode is stored */
	struct componentname *a_cnp;	/* name component being looked up */
};
#endif
4862
4863 #if __FreeBSD_version >= 1500040
4864 static int
zfs_lookup_nameddir(struct vnode * dvp,struct componentname * cnp,struct vnode ** vpp)4865 zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp,
4866 struct vnode **vpp)
4867 {
4868 struct vnode *xvp;
4869 int error, flags;
4870
4871 *vpp = NULL;
4872 flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR;
4873 if ((cnp->cn_flags & CREATENAMED) != 0)
4874 flags |= CREATE_XATTR_DIR;
4875 error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags,
4876 B_FALSE);
4877 if (error == 0) {
4878 if ((cnp->cn_flags & LOCKLEAF) != 0)
4879 error = vn_lock(xvp, cnp->cn_lkflags);
4880 if (error == 0) {
4881 vn_irflag_set_cond(xvp, VIRF_NAMEDDIR);
4882 *vpp = xvp;
4883 } else {
4884 vrele(xvp);
4885 }
4886 }
4887 return (error);
4888 }
4889
4890 static ssize_t
zfs_readdir_named(struct vnode * vp,char * buf,ssize_t blen,off_t * offp,int * eofflagp,struct ucred * cred,struct thread * td)4891 zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp,
4892 int *eofflagp, struct ucred *cred, struct thread *td)
4893 {
4894 struct uio io;
4895 struct iovec iv;
4896 zfs_uio_t uio;
4897 int error;
4898
4899 io.uio_offset = *offp;
4900 io.uio_segflg = UIO_SYSSPACE;
4901 io.uio_rw = UIO_READ;
4902 io.uio_td = td;
4903 iv.iov_base = buf;
4904 iv.iov_len = blen;
4905 io.uio_iov = &iv;
4906 io.uio_iovcnt = 1;
4907 io.uio_resid = blen;
4908 zfs_uio_init(&uio, &io);
4909 error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL);
4910 if (error != 0)
4911 return (-1);
4912 *offp = io.uio_offset;
4913 return (blen - io.uio_resid);
4914 }
4915
/*
 * Report whether vp has at least one named attribute.
 *
 * Returns false when the mount does not have MNT_NAMEDATTR set, when the
 * named attribute directory cannot be looked up, or when that directory
 * holds no qualifying entry.  Returns true as soon as one qualifying entry
 * is found.
 */
static bool
zfs_has_namedattr(struct vnode *vp, struct ucred *cred)
{
	struct componentname cn;
	struct vnode *xvp;
	struct dirent *dp;
	off_t offs;
	ssize_t rsize;
	char *buf, *cp, *endcp;
	int eofflag, error;
	bool ret;

	MNT_ILOCK(vp->v_mount);
	if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) {
		MNT_IUNLOCK(vp->v_mount);
		return (false);
	}
	MNT_IUNLOCK(vp->v_mount);

	/* Now see if a named attribute directory exists. */
	/* Only the fields zfs_lookup_nameddir() reads are filled in. */
	cn.cn_flags = LOCKLEAF;
	cn.cn_lkflags = LK_SHARED;
	cn.cn_cred = cred;
	error = zfs_lookup_nameddir(vp, &cn, &xvp);
	if (error != 0)
		return (false);

	/* It exists, so see if there is any entry other than "." and "..". */
	buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);
	ret = false;
	offs = 0;
	do {
		/* Read the next DEV_BSIZE worth of entries. */
		rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag,
		    cred, curthread);
		if (rsize <= 0)
			break;
		cp = buf;
		endcp = &buf[rsize];
		while (cp < endcp) {
			dp = (struct dirent *)cp;
			/*
			 * An entry qualifies when it is in use, is a regular
			 * file (or of unknown type), does not carry a
			 * forbidden namespace prefix, and is not named "."
			 * or "..".
			 */
			if (dp->d_fileno != 0 && (dp->d_type == DT_REG ||
			    dp->d_type == DT_UNKNOWN) &&
			    !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) &&
			    ((dp->d_namlen == 1 && dp->d_name[0] != '.') ||
			    (dp->d_namlen == 2 && (dp->d_name[0] != '.' ||
			    dp->d_name[1] != '.')) || dp->d_namlen > 2)) {
				ret = true;
				break;
			}
			cp += dp->d_reclen;
		}
	} while (!ret && rsize > 0 && eofflag == 0);
	/* xvp was returned locked (LOCKLEAF); vput() unlocks and releases. */
	vput(xvp);
	free(buf, M_TEMP);
	return (ret);
}
4972
/*
 * Look up the component described by ap->a_cnp in ap->a_dvp, with support
 * for named attributes (FreeBSD >= 1500040).
 *
 *	ap	- VOP_LOOKUP arguments; the result vnode is stored in
 *		  *ap->a_vpp
 *	cached	- passed through unchanged as the last zfs_lookup() argument
 *
 * Returns 0 on success, ENOATTR for name sequences that are not allowed
 * with named attributes, or the error from zfs_lookup_nameddir() or
 * zfs_lookup().
 */
static int
zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];
	int error;
	struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp;
	bool is_nameddir, needs_nameddir, opennamed = false;

	/*
	 * These variables are used to handle the named attribute cases:
	 * opennamed - Is true when this is a call from open with O_NAMEDATTR
	 *    specified and it is the last component.
	 * is_nameddir - Is true when the directory is a named attribute dir.
	 * needs_nameddir - Is set when the lookup needs to look for/create
	 *    a named attribute directory.  It is only set when
	 *    is_nameddir is false and opennamed is true.
	 * xvp - Is the directory that the lookup needs to be done in.
	 *    Usually dvp, unless needs_nameddir is true where it is the
	 *    result of the first non-named directory lookup.
	 * Note that name caching must be disabled for named attribute
	 * handling.
	 */
	needs_nameddir = false;
	xvp = dvp;
	opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) ==
	    (OPENNAMED | ISLASTCN);
	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
	/* A named attribute directory may only be the parent of the leaf. */
	if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0)
		return (ENOATTR);
	/* ".." is rejected when opening a named attribute of a plain vnode. */
	if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0)
		return (ENOATTR);
	/* Keep named attribute lookups out of the name cache. */
	if (opennamed || is_nameddir)
		cnp->cn_flags &= ~MAKEENTRY;
	if (opennamed && !is_nameddir)
		needs_nameddir = true;
	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
	error = 0;
	*vpp = NULL;
	if (needs_nameddir) {
		/*
		 * Upgrade to an exclusive lock on dvp before looking up
		 * (and possibly creating) its named attribute directory.
		 */
		if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE)
			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
		error = zfs_lookup_nameddir(dvp, cnp, &xvp);
		if (error == 0)
			is_nameddir = true;
	}
	if (error == 0) {
		if (!needs_nameddir || cnp->cn_namelen != 1 ||
		    *cnp->cn_nameptr != '.') {
			/* Copy the component into a NUL-terminated buffer. */
			strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1,
			    sizeof (nm)));
			error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop,
			    cnp->cn_cred, 0, cached);
			/*
			 * Tag whatever was found inside a named attribute
			 * directory (other than "." and "..") as either a
			 * named attribute directory or a named attribute.
			 */
			if (is_nameddir && error == 0 &&
			    (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') &&
			    (cnp->cn_flags & ISDOTDOT) == 0) {
				if ((*vpp)->v_type == VDIR)
					vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
				else
					vn_irflag_set_cond(*vpp,
					    VIRF_NAMEDATTR);
			}
			/*
			 * Drop the intermediate named attribute directory
			 * unless it is itself the vnode being returned.
			 */
			if (needs_nameddir && xvp != *vpp)
				vput(xvp);
		} else {
			/*
			 * Lookup of "." when a named attribute dir is needed.
			 */
			*vpp = xvp;
		}
	}
	return (error);
}
5046 #else
5047 static int
zfs_freebsd_lookup(struct vop_lookup_args * ap,boolean_t cached)5048 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
5049 {
5050 struct componentname *cnp = ap->a_cnp;
5051 char nm[NAME_MAX + 1];
5052
5053 ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
5054 strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
5055
5056 return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
5057 cnp->cn_cred, 0, cached));
5058 }
5059 #endif
5060
5061 static int
zfs_freebsd_cachedlookup(struct vop_cachedlookup_args * ap)5062 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
5063 {
5064
5065 return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
5066 }
5067
5068 #ifndef _SYS_SYSPROTO_H_
5069 struct vop_lookup_args {
5070 struct vnode *a_dvp;
5071 struct vnode **a_vpp;
5072 struct componentname *a_cnp;
5073 };
5074 #endif
5075
5076 static int
zfs_cache_lookup(struct vop_lookup_args * ap)5077 zfs_cache_lookup(struct vop_lookup_args *ap)
5078 {
5079 zfsvfs_t *zfsvfs;
5080
5081 zfsvfs = ap->a_dvp->v_mount->mnt_data;
5082 #if __FreeBSD_version >= 1500040
5083 if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0)
5084 #else
5085 if (zfsvfs->z_use_namecache)
5086 #endif
5087 return (vfs_cache_lookup(ap));
5088 else
5089 return (zfs_freebsd_lookup(ap, B_FALSE));
5090 }
5091
#ifndef _SYS_SYSPROTO_H_
struct vop_create_args {
	struct vnode *a_dvp;
	struct vnode **a_vpp;
	struct componentname *a_cnp;
	struct vattr *a_vap;
};
#endif

/*
 * VOP_CREATE entry point.
 *
 * Creates the file named by ap->a_cnp in ap->a_dvp with the attributes in
 * ap->a_vap.  On FreeBSD >= 1500040, a create carrying OPENNAMED in a
 * directory that is not already a named attribute directory is redirected
 * into that directory's named attribute directory, and names created
 * inside a named attribute directory are validated with
 * zfs_check_attrname().
 *
 * On success *ap->a_vpp is set to the new vnode and 0 is returned;
 * otherwise *ap->a_vpp is NULL and the error is returned.
 */
static int
zfs_freebsd_create(struct vop_create_args *ap)
{
	zfsvfs_t *zfsvfs;
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	znode_t *zp = NULL;
	int rc, mode;
	struct vnode *dvp = ap->a_dvp;
#if __FreeBSD_version >= 1500040
	struct vnode *xvp;
	bool is_nameddir;
#endif

#if __FreeBSD_version < 1400068
	ASSERT(cnp->cn_flags & SAVENAME);
#endif

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;
	zfsvfs = ap->a_dvp->v_mount->mnt_data;
	*ap->a_vpp = NULL;

	rc = 0;
#if __FreeBSD_version >= 1500040
	/* xvp stays NULL unless a named attribute directory is looked up. */
	xvp = NULL;
	is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0;
	if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) {
		/* Needs a named attribute directory. */
		rc = zfs_lookup_nameddir(dvp, cnp, &xvp);
		if (rc == 0) {
			/* From here on, create inside that directory. */
			dvp = xvp;
			is_nameddir = true;
		}
	}
	if (is_nameddir && rc == 0)
		rc = zfs_check_attrname(cnp->cn_nameptr);
#endif

	if (rc == 0)
		rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode,
		    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
#if __FreeBSD_version >= 1500040
	/* Done with the intermediate named attribute directory, if any. */
	if (xvp != NULL)
		vput(xvp);
#endif
	if (rc == 0) {
		*ap->a_vpp = ZTOV(zp);
#if __FreeBSD_version >= 1500040
		if (is_nameddir)
			vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR);
#endif
	}
	/*
	 * The name cache entry is keyed on the directory the caller passed
	 * in (ap->a_dvp), not on the local dvp, which may have been
	 * redirected above.
	 */
	if (zfsvfs->z_use_namecache &&
	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);

	return (rc);
}
5160
5161 #ifndef _SYS_SYSPROTO_H_
5162 struct vop_remove_args {
5163 struct vnode *a_dvp;
5164 struct vnode *a_vp;
5165 struct componentname *a_cnp;
5166 };
5167 #endif
5168
5169 static int
zfs_freebsd_remove(struct vop_remove_args * ap)5170 zfs_freebsd_remove(struct vop_remove_args *ap)
5171 {
5172 int error = 0;
5173
5174 #if __FreeBSD_version < 1400068
5175 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5176 #endif
5177
5178 #if __FreeBSD_version >= 1500040
5179 if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0)
5180 error = zfs_check_attrname(ap->a_cnp->cn_nameptr);
5181 #endif
5182
5183 if (error == 0)
5184 error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5185 ap->a_cnp->cn_cred);
5186 return (error);
5187 }
5188
5189 #ifndef _SYS_SYSPROTO_H_
5190 struct vop_mkdir_args {
5191 struct vnode *a_dvp;
5192 struct vnode **a_vpp;
5193 struct componentname *a_cnp;
5194 struct vattr *a_vap;
5195 };
5196 #endif
5197
5198 static int
zfs_freebsd_mkdir(struct vop_mkdir_args * ap)5199 zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
5200 {
5201 vattr_t *vap = ap->a_vap;
5202 znode_t *zp = NULL;
5203 int rc;
5204
5205 #if __FreeBSD_version < 1400068
5206 ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5207 #endif
5208
5209 vattr_init_mask(vap);
5210 *ap->a_vpp = NULL;
5211
5212 rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
5213 ap->a_cnp->cn_cred, 0, NULL, NULL);
5214
5215 if (rc == 0)
5216 *ap->a_vpp = ZTOV(zp);
5217 return (rc);
5218 }
5219
5220 #ifndef _SYS_SYSPROTO_H_
5221 struct vop_rmdir_args {
5222 struct vnode *a_dvp;
5223 struct vnode *a_vp;
5224 struct componentname *a_cnp;
5225 };
5226 #endif
5227
5228 static int
zfs_freebsd_rmdir(struct vop_rmdir_args * ap)5229 zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
5230 {
5231 struct componentname *cnp = ap->a_cnp;
5232
5233 #if __FreeBSD_version < 1400068
5234 ASSERT(cnp->cn_flags & SAVENAME);
5235 #endif
5236
5237 return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5238 }
5239
5240 #ifndef _SYS_SYSPROTO_H_
5241 struct vop_readdir_args {
5242 struct vnode *a_vp;
5243 struct uio *a_uio;
5244 struct ucred *a_cred;
5245 int *a_eofflag;
5246 int *a_ncookies;
5247 cookie_t **a_cookies;
5248 };
5249 #endif
5250
5251 static int
zfs_freebsd_readdir(struct vop_readdir_args * ap)5252 zfs_freebsd_readdir(struct vop_readdir_args *ap)
5253 {
5254 zfs_uio_t uio;
5255 zfs_uio_init(&uio, ap->a_uio);
5256 return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
5257 ap->a_ncookies, ap->a_cookies));
5258 }
5259
5260 #ifndef _SYS_SYSPROTO_H_
5261 struct vop_fsync_args {
5262 struct vnode *a_vp;
5263 int a_waitfor;
5264 struct thread *a_td;
5265 };
5266 #endif
5267
5268 static int
zfs_freebsd_fsync(struct vop_fsync_args * ap)5269 zfs_freebsd_fsync(struct vop_fsync_args *ap)
5270 {
5271 vnode_t *vp = ap->a_vp;
5272 int err = 0;
5273
5274 /*
5275 * Push any dirty mmap()'d data out to the DMU and ZIL, ready for
5276 * zil_commit() to be called in zfs_fsync().
5277 */
5278 if (vp->v_object != NULL && vm_object_mightbedirty(vp->v_object)) {
5279 zfs_vmobject_wlock(vp->v_object);
5280 if (!vm_object_page_clean(vp->v_object, 0, 0, 0))
5281 err = SET_ERROR(EIO);
5282 zfs_vmobject_wunlock(vp->v_object);
5283 if (err) {
5284 /*
5285 * Unclear what state things are in. zfs_putpages()
5286 * will ensure the pages remain dirty if they haven't
5287 * been written down to the DMU, but because there may
5288 * be nothing logged, we can't assume that zfs_sync()
5289 * -> zil_commit() will give us a useful error. It's
5290 * safest if we just error out here.
5291 */
5292 return (err);
5293 }
5294 }
5295
5296 return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred));
5297 }
5298
5299 #ifndef _SYS_SYSPROTO_H_
5300 struct vop_getattr_args {
5301 struct vnode *a_vp;
5302 struct vattr *a_vap;
5303 struct ucred *a_cred;
5304 };
5305 #endif
5306
5307 static int
zfs_freebsd_getattr(struct vop_getattr_args * ap)5308 zfs_freebsd_getattr(struct vop_getattr_args *ap)
5309 {
5310 vattr_t *vap = ap->a_vap;
5311 xvattr_t xvap;
5312 ulong_t fflags = 0;
5313 int error;
5314
5315 xva_init(&xvap);
5316 xvap.xva_vattr = *vap;
5317 xvap.xva_vattr.va_mask |= AT_XVATTR;
5318
5319 /* Convert chflags into ZFS-type flags. */
5320 /* XXX: what about SF_SETTABLE?. */
5321 XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5322 XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5323 XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5324 XVA_SET_REQ(&xvap, XAT_NODUMP);
5325 XVA_SET_REQ(&xvap, XAT_READONLY);
5326 XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5327 XVA_SET_REQ(&xvap, XAT_SYSTEM);
5328 XVA_SET_REQ(&xvap, XAT_HIDDEN);
5329 XVA_SET_REQ(&xvap, XAT_REPARSE);
5330 XVA_SET_REQ(&xvap, XAT_OFFLINE);
5331 XVA_SET_REQ(&xvap, XAT_SPARSE);
5332
5333 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
5334 if (error != 0)
5335 return (error);
5336
5337 /* Convert ZFS xattr into chflags. */
5338 #define FLAG_CHECK(fflag, xflag, xfield) do { \
5339 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
5340 fflags |= (fflag); \
5341 } while (0)
5342 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5343 xvap.xva_xoptattrs.xoa_immutable);
5344 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5345 xvap.xva_xoptattrs.xoa_appendonly);
5346 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5347 xvap.xva_xoptattrs.xoa_nounlink);
5348 FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5349 xvap.xva_xoptattrs.xoa_archive);
5350 FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5351 xvap.xva_xoptattrs.xoa_nodump);
5352 FLAG_CHECK(UF_READONLY, XAT_READONLY,
5353 xvap.xva_xoptattrs.xoa_readonly);
5354 FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5355 xvap.xva_xoptattrs.xoa_system);
5356 FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5357 xvap.xva_xoptattrs.xoa_hidden);
5358 FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5359 xvap.xva_xoptattrs.xoa_reparse);
5360 FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5361 xvap.xva_xoptattrs.xoa_offline);
5362 FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5363 xvap.xva_xoptattrs.xoa_sparse);
5364
5365 #undef FLAG_CHECK
5366 *vap = xvap.xva_vattr;
5367 vap->va_flags = fflags;
5368
5369 #if __FreeBSD_version >= 1500040
5370 if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0)
5371 vap->va_bsdflags |= SFBSD_NAMEDATTR;
5372 #endif
5373 return (0);
5374 }
5375
#ifndef _SYS_SYSPROTO_H_
struct vop_setattr_args {
	struct vnode *a_vp;
	struct vattr *a_vap;
	struct ucred *a_cred;
};
#endif

/*
 * VOP_SETATTR entry point.
 *
 * Copies the requested attributes into an extended vattr and passes it to
 * zfs_setattr().  When va_flags is being set, the chflags bits are first
 * validated, permission-checked, and translated into the matching optional
 * ZFS attributes; only bits that differ from the znode's current z_pflags
 * are requested.  A birth time change is passed on as XAT_CREATETIME.
 *
 * Returns EOPNOTSUPP when flags are requested but the filesystem has
 * z_use_fuids == B_FALSE or an unsupported flag bit is set, EPERM or
 * another error when the permission checks fail, and otherwise the result
 * of zfs_setattr().
 */
static int
zfs_freebsd_setattr(struct vop_setattr_args *ap)
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cred = ap->a_cred;
	xvattr_t xvap;
	ulong_t fflags;
	uint64_t zflags;

	vattr_init_mask(vap);
	vap->va_mask &= ~AT_NOSET;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;

	/* Snapshot of the current ZFS flags, used for all comparisons below. */
	zflags = VTOZ(vp)->z_pflags;

	if (vap->va_flags != VNOVAL) {
		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
		int error;

		if (zfsvfs->z_use_fuids == B_FALSE)
			return (EOPNOTSUPP);

		fflags = vap->va_flags;
		/*
		 * XXX KDM
		 * We need to figure out whether it makes sense to allow
		 * UF_REPARSE through, since we don't really have other
		 * facilities to handle reparse points and zfs_setattr()
		 * doesn't currently allow setting that attribute anyway.
		 */
		/* Reject any bit outside the set that can be translated. */
		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
		    UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
		    UF_OFFLINE|UF_SPARSE)) != 0)
			return (EOPNOTSUPP);
		/*
		 * Unprivileged processes are not permitted to unset system
		 * flags, or modify flags if any system flags are set.
		 * Privileged non-jail processes may not modify system flags
		 * if securelevel > 0 and any existing system flags are set.
		 * Privileged jail processes behave like privileged non-jail
		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
		 * otherwise, they behave like unprivileged processes.
		 */
		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				error = securelevel_gt(cred, 0);
				if (error != 0)
					return (error);
			}
		} else {
			/*
			 * Callers may only modify the file flags on
			 * objects they have VADMIN rights for.
			 */
			if ((error = VOP_ACCESS(vp, VADMIN, cred,
			    curthread)) != 0)
				return (error);
			/* System flags already set: nothing may change. */
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY |
			    ZFS_NOUNLINK)) {
				return (EPERM);
			}
			/* System flags may not be newly set either. */
			if (fflags &
			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
				return (EPERM);
			}
		}

		/*
		 * FLAG_CHANGE requests an optional attribute only when the
		 * chflags bit and the current ZFS flag disagree, and sets
		 * the attribute's value from the chflags bit.
		 */
#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
		XVA_SET_REQ(&xvap, (xflag));				\
		(xfield) = ((fflags & (fflag)) != 0);			\
	}								\
} while (0)
		/* Convert chflags into ZFS-type flags. */
		/* XXX: what about SF_SETTABLE?. */
		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
		    xvap.xva_xoptattrs.xoa_immutable);
		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
		    xvap.xva_xoptattrs.xoa_appendonly);
		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
		    xvap.xva_xoptattrs.xoa_nounlink);
		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
		    xvap.xva_xoptattrs.xoa_archive);
		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
		    xvap.xva_xoptattrs.xoa_nodump);
		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
		    xvap.xva_xoptattrs.xoa_readonly);
		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
		    xvap.xva_xoptattrs.xoa_system);
		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
		    xvap.xva_xoptattrs.xoa_hidden);
		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
		    xvap.xva_xoptattrs.xoa_reparse);
		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
		    xvap.xva_xoptattrs.xoa_offline);
		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
		    xvap.xva_xoptattrs.xoa_sparse);
#undef	FLAG_CHANGE
	}
	/* A birth time change travels as the XAT_CREATETIME attribute. */
	if (vap->va_birthtime.tv_sec != VNOVAL) {
		xvap.xva_vattr.va_mask |= AT_XVATTR;
		XVA_SET_REQ(&xvap, XAT_CREATETIME);
	}
	return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL));
}
5496
/*
 * NOTE(review): the function below also reads ap->a_flags, which this
 * reference copy of the argument structure does not list - confirm the
 * field against the FreeBSD versions being targeted.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_rename_args {
	struct vnode *a_fdvp;
	struct vnode *a_fvp;
	struct componentname *a_fcnp;
	struct vnode *a_tdvp;
	struct vnode *a_tvp;
	struct componentname *a_tcnp;
};
#endif

/*
 * VOP_RENAME entry point.
 *
 * On FreeBSD >= 1500040, a rename out of a named attribute directory
 * validates both the source and the target name with
 * zfs_check_attrname().  Any flag other than AT_RENAME_NOREPLACE is
 * rejected with EOPNOTSUPP.  Otherwise the work is done by
 * zfs_do_rename().
 *
 * All four vnodes are released on every path.  Returns 0 or the first
 * error encountered.
 */
static int
zfs_freebsd_rename(struct vop_rename_args *ap)
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error = 0;

#if __FreeBSD_version < 1400068
	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
#endif

#if __FreeBSD_version >= 1500040
	if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) {
		error = zfs_check_attrname(ap->a_fcnp->cn_nameptr);
		if (error == 0)
			error = zfs_check_attrname(ap->a_tcnp->cn_nameptr);
	}
#endif

	if (error == 0 && (ap->a_flags & ~(AT_RENAME_NOREPLACE)) != 0)
		error = EOPNOTSUPP;

	if (error == 0) {
		/*
		 * fvp and tvp are passed by address, so zfs_do_rename()
		 * may update them; the releases below use the updated
		 * values.  Only vrele() is used on this path.
		 */
		error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
		    ap->a_tcnp, ap->a_fcnp->cn_cred, ap->a_flags);
		vrele(fdvp);
		vrele(fvp);
		vrele(tdvp);
		if (tvp != NULL)
			vrele(tvp);
	} else {
		/*
		 * zfs_do_rename() was never called, so the target vnodes
		 * are released with vput() here; when tdvp and tvp are the
		 * same vnode, one of the two releases is a plain vrele().
		 */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp != NULL)
			vput(tvp);
		vrele(fdvp);
		vrele(fvp);
	}

	return (error);
}
5554
5555 #ifndef _SYS_SYSPROTO_H_
5556 struct vop_symlink_args {
5557 struct vnode *a_dvp;
5558 struct vnode **a_vpp;
5559 struct componentname *a_cnp;
5560 struct vattr *a_vap;
5561 char *a_target;
5562 };
5563 #endif
5564
5565 static int
zfs_freebsd_symlink(struct vop_symlink_args * ap)5566 zfs_freebsd_symlink(struct vop_symlink_args *ap)
5567 {
5568 struct componentname *cnp = ap->a_cnp;
5569 vattr_t *vap = ap->a_vap;
5570 znode_t *zp = NULL;
5571 char *symlink;
5572 size_t symlink_len;
5573 int rc;
5574
5575 #if __FreeBSD_version < 1400068
5576 ASSERT(cnp->cn_flags & SAVENAME);
5577 #endif
5578
5579 vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
5580 vattr_init_mask(vap);
5581 *ap->a_vpp = NULL;
5582
5583 rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
5584 ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL);
5585 if (rc == 0) {
5586 *ap->a_vpp = ZTOV(zp);
5587 ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
5588 MPASS(zp->z_cached_symlink == NULL);
5589 symlink_len = strlen(ap->a_target);
5590 symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
5591 if (symlink != NULL) {
5592 memcpy(symlink, ap->a_target, symlink_len);
5593 symlink[symlink_len] = '\0';
5594 atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
5595 (uintptr_t)symlink);
5596 }
5597 }
5598 return (rc);
5599 }
5600
5601 #ifndef _SYS_SYSPROTO_H_
5602 struct vop_readlink_args {
5603 struct vnode *a_vp;
5604 struct uio *a_uio;
5605 struct ucred *a_cred;
5606 };
5607 #endif
5608
5609 static int
zfs_freebsd_readlink(struct vop_readlink_args * ap)5610 zfs_freebsd_readlink(struct vop_readlink_args *ap)
5611 {
5612 zfs_uio_t uio;
5613 int error;
5614 znode_t *zp = VTOZ(ap->a_vp);
5615 char *symlink, *base;
5616 size_t symlink_len;
5617 bool trycache;
5618
5619 zfs_uio_init(&uio, ap->a_uio);
5620 trycache = false;
5621 if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
5622 zfs_uio_iovcnt(&uio) == 1) {
5623 base = zfs_uio_iovbase(&uio, 0);
5624 symlink_len = zfs_uio_iovlen(&uio, 0);
5625 trycache = true;
5626 }
5627 error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
5628 if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
5629 error != 0 || !trycache) {
5630 return (error);
5631 }
5632 symlink_len -= zfs_uio_resid(&uio);
5633 symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
5634 if (symlink != NULL) {
5635 memcpy(symlink, base, symlink_len);
5636 symlink[symlink_len] = '\0';
5637 if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
5638 (uintptr_t)NULL, (uintptr_t)symlink)) {
5639 cache_symlink_free(symlink, symlink_len + 1);
5640 }
5641 }
5642 return (error);
5643 }
5644
5645 #ifndef _SYS_SYSPROTO_H_
5646 struct vop_link_args {
5647 struct vnode *a_tdvp;
5648 struct vnode *a_vp;
5649 struct componentname *a_cnp;
5650 };
5651 #endif
5652
5653 static int
zfs_freebsd_link(struct vop_link_args * ap)5654 zfs_freebsd_link(struct vop_link_args *ap)
5655 {
5656 struct componentname *cnp = ap->a_cnp;
5657 vnode_t *vp = ap->a_vp;
5658 vnode_t *tdvp = ap->a_tdvp;
5659
5660 if (tdvp->v_mount != vp->v_mount)
5661 return (EXDEV);
5662
5663 #if __FreeBSD_version < 1400068
5664 ASSERT(cnp->cn_flags & SAVENAME);
5665 #endif
5666
5667 return (zfs_link(VTOZ(tdvp), VTOZ(vp),
5668 cnp->cn_nameptr, cnp->cn_cred, 0));
5669 }
5670
5671 #ifndef _SYS_SYSPROTO_H_
5672 struct vop_inactive_args {
5673 struct vnode *a_vp;
5674 struct thread *a_td;
5675 };
5676 #endif
5677
5678 static int
zfs_freebsd_inactive(struct vop_inactive_args * ap)5679 zfs_freebsd_inactive(struct vop_inactive_args *ap)
5680 {
5681 vnode_t *vp = ap->a_vp;
5682
5683 zfs_inactive(vp, curthread->td_ucred, NULL);
5684 return (0);
5685 }
5686
5687 #ifndef _SYS_SYSPROTO_H_
5688 struct vop_need_inactive_args {
5689 struct vnode *a_vp;
5690 struct thread *a_td;
5691 };
5692 #endif
5693
5694 static int
zfs_freebsd_need_inactive(struct vop_need_inactive_args * ap)5695 zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
5696 {
5697 vnode_t *vp = ap->a_vp;
5698 znode_t *zp = VTOZ(vp);
5699 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5700 int need;
5701
5702 if (vn_need_pageq_flush(vp))
5703 return (1);
5704
5705 if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
5706 return (1);
5707 need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
5708 ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
5709
5710 return (need);
5711 }
5712
#ifndef _SYS_SYSPROTO_H_
struct vop_reclaim_args {
	struct vnode *a_vp;
	struct thread *a_td;
};
#endif

/*
 * VOP_RECLAIM entry point: dispose of the znode behind a vnode that is
 * being reclaimed.
 *
 * A znode without an SA handle is released with zfs_znode_free(); any other
 * znode goes through zfs_zinactive().  In both cases the vnode's v_data is
 * cleared afterwards.  Always returns 0.
 */
static int
zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	/*
	 * Captured up front so that the lock exit below does not have to go
	 * through zp, which has been handed to zfs_znode_free() or
	 * zfs_zinactive() by then.
	 */
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/* NOTE(review): zp was already dereferenced just above this check. */
	ASSERT3P(zp, !=, NULL);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);

	/* The vnode no longer refers to a znode. */
	vp->v_data = NULL;
	return (0);
}
5744
5745 #ifndef _SYS_SYSPROTO_H_
5746 struct vop_fid_args {
5747 struct vnode *a_vp;
5748 struct fid *a_fid;
5749 };
5750 #endif
5751
5752 static int
zfs_freebsd_fid(struct vop_fid_args * ap)5753 zfs_freebsd_fid(struct vop_fid_args *ap)
5754 {
5755
5756 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5757 }
5758
5759
5760 #ifndef _SYS_SYSPROTO_H_
5761 struct vop_pathconf_args {
5762 struct vnode *a_vp;
5763 int a_name;
5764 register_t *a_retval;
5765 } *ap;
5766 #endif
5767
5768 static int
zfs_freebsd_pathconf(struct vop_pathconf_args * ap)5769 zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
5770 {
5771 ulong_t val;
5772 int error;
5773 #if defined(_PC_CLONE_BLKSIZE) || defined(_PC_CASE_INSENSITIVE)
5774 zfsvfs_t *zfsvfs;
5775 #endif
5776
5777 error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
5778 curthread->td_ucred, NULL);
5779 if (error == 0) {
5780 *ap->a_retval = val;
5781 return (error);
5782 }
5783 if (error != EOPNOTSUPP)
5784 return (error);
5785
5786 switch (ap->a_name) {
5787 case _PC_NAME_MAX:
5788 *ap->a_retval = NAME_MAX;
5789 return (0);
5790 #if __FreeBSD_version >= 1400032
5791 case _PC_DEALLOC_PRESENT:
5792 *ap->a_retval = 1;
5793 return (0);
5794 #endif
5795 case _PC_PIPE_BUF:
5796 if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5797 *ap->a_retval = PIPE_BUF;
5798 return (0);
5799 }
5800 return (EINVAL);
5801 #if __FreeBSD_version >= 1500040
5802 case _PC_NAMEDATTR_ENABLED:
5803 MNT_ILOCK(ap->a_vp->v_mount);
5804 if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0)
5805 *ap->a_retval = 1;
5806 else
5807 *ap->a_retval = 0;
5808 MNT_IUNLOCK(ap->a_vp->v_mount);
5809 return (0);
5810 case _PC_HAS_NAMEDATTR:
5811 if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred))
5812 *ap->a_retval = 1;
5813 else
5814 *ap->a_retval = 0;
5815 return (0);
5816 #endif
5817 #ifdef _PC_HAS_HIDDENSYSTEM
5818 case _PC_HAS_HIDDENSYSTEM:
5819 *ap->a_retval = 1;
5820 return (0);
5821 #endif
5822 #ifdef _PC_CLONE_BLKSIZE
5823 case _PC_CLONE_BLKSIZE:
5824 zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data;
5825 if (zfs_bclone_enabled &&
5826 spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
5827 SPA_FEATURE_BLOCK_CLONING))
5828 *ap->a_retval = dsl_dataset_feature_is_active(
5829 zfsvfs->z_os->os_dsl_dataset,
5830 SPA_FEATURE_LARGE_BLOCKS) ?
5831 SPA_MAXBLOCKSIZE :
5832 SPA_OLD_MAXBLOCKSIZE;
5833 else
5834 *ap->a_retval = 0;
5835 return (0);
5836 #endif
5837 #ifdef _PC_CASE_INSENSITIVE
5838 case _PC_CASE_INSENSITIVE:
5839 zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data;
5840 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
5841 *ap->a_retval = 1;
5842 else
5843 *ap->a_retval = 0;
5844 return (0);
5845 #endif
5846 default:
5847 return (vop_stdpathconf(ap));
5848 }
5849 }
5850
/*
 * Selects which name encoding is tried first for user-namespace extended
 * attributes: it is passed as the "compat" argument to the
 * zfs_*extattr_impl() helpers, and its negation is used for the alternate
 * format.  Nonzero picks the unprefixed form, zero the "user." form (see
 * zfs_create_attrname()).
 */
int zfs_xattr_compat = 1;
5852
/*
 * Validate a caller-supplied attribute name.
 *
 * Returns EINVAL if the name contains a '/' or begins with one of the
 * namespace prefixes rejected by ZFS_XA_NS_PREFIX_FORBIDDEN(); 0 otherwise.
 */
static int
zfs_check_attrname(const char *name)
{
	const int has_slash = (strchr(name, '/') != NULL);

	if (has_slash || ZFS_XA_NS_PREFIX_FORBIDDEN(name))
		return (SET_ERROR(EINVAL));

	return (0);
}
5864
5865 /*
5866 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5867 * extended attribute name:
5868 *
5869 * NAMESPACE XATTR_COMPAT PREFIX
5870 * system * freebsd:system:
5871 * user 1 (none, can be used to access ZFS
5872 * fsattr(5) attributes created on Solaris)
5873 * user 0 user.
5874 */
5875 static int
zfs_create_attrname(int attrnamespace,const char * name,char * attrname,size_t size,boolean_t compat)5876 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5877 size_t size, boolean_t compat)
5878 {
5879 const char *namespace, *prefix, *suffix;
5880
5881 memset(attrname, 0, size);
5882
5883 switch (attrnamespace) {
5884 case EXTATTR_NAMESPACE_USER:
5885 if (compat) {
5886 /*
5887 * This is the default namespace by which we can access
5888 * all attributes created on Solaris.
5889 */
5890 prefix = namespace = suffix = "";
5891 } else {
5892 /*
5893 * This is compatible with the user namespace encoding
5894 * on Linux prior to xattr_compat, but nothing
5895 * else.
5896 */
5897 prefix = "";
5898 namespace = "user";
5899 suffix = ".";
5900 }
5901 break;
5902 case EXTATTR_NAMESPACE_SYSTEM:
5903 prefix = "freebsd:";
5904 namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5905 suffix = ":";
5906 break;
5907 case EXTATTR_NAMESPACE_EMPTY:
5908 default:
5909 return (SET_ERROR(EINVAL));
5910 }
5911 if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5912 name) >= size) {
5913 return (SET_ERROR(ENAMETOOLONG));
5914 }
5915 return (0);
5916 }
5917
/*
 * Make sure zp->z_xattr_cached is populated, loading it through
 * zfs_sa_get_xattr() when it is still NULL.
 *
 * The caller must hold zp->z_xattr_lock, as reader or writer.  A writer
 * keeps the write lock throughout.  A reader gets the lock back as reader,
 * but when the cache had to be loaded the lock is upgraded to writer for the
 * load, and if the in-place upgrade fails it is dropped and re-acquired.
 *
 * Returns 0 if the cache is present, otherwise the result of
 * zfs_sa_get_xattr().
 */
static int
zfs_ensure_xattr_cached(znode_t *zp)
{
	int error = 0;

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));

	/* Fast path: nothing to load, lock state untouched. */
	if (zp->z_xattr_cached != NULL)
		return (0);

	/* Already a writer: load directly and stay a writer. */
	if (rw_write_held(&zp->z_xattr_lock))
		return (zfs_sa_get_xattr(zp));

	/* Reader: become a writer, releasing the lock if upgrade fails. */
	if (!rw_tryupgrade(&zp->z_xattr_lock)) {
		rw_exit(&zp->z_xattr_lock);
		rw_enter(&zp->z_xattr_lock, RW_WRITER);
	}
	/*
	 * Re-check: the lock may have been released above, so the cache could
	 * have been filled in the meantime.
	 */
	if (zp->z_xattr_cached == NULL)
		error = zfs_sa_get_xattr(zp);
	/* Hand the lock back in the mode the caller passed in. */
	rw_downgrade(&zp->z_xattr_lock);
	return (error);
}
5940
5941 #ifndef _SYS_SYSPROTO_H_
5942 struct vop_getextattr {
5943 IN struct vnode *a_vp;
5944 IN int a_attrnamespace;
5945 IN const char *a_name;
5946 INOUT struct uio *a_uio;
5947 OUT size_t *a_size;
5948 IN struct ucred *a_cred;
5949 IN struct thread *a_td;
5950 };
5951 #endif
5952
5953 static int
zfs_getextattr_dir(struct vop_getextattr_args * ap,const char * attrname)5954 zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
5955 {
5956 struct thread *td = ap->a_td;
5957 struct nameidata nd;
5958 struct vattr va;
5959 vnode_t *xvp = NULL, *vp;
5960 int error, flags;
5961
5962 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
5963 LOOKUP_XATTR, B_FALSE);
5964 if (error != 0)
5965 return (error);
5966
5967 flags = FREAD;
5968 #if __FreeBSD_version < 1400043
5969 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5970 xvp, td);
5971 #else
5972 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
5973 #endif
5974 error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
5975 if (error != 0)
5976 return (SET_ERROR(error));
5977 vp = nd.ni_vp;
5978 NDFREE_PNBUF(&nd);
5979
5980 if (ap->a_size != NULL) {
5981 error = VOP_GETATTR(vp, &va, ap->a_cred);
5982 if (error == 0)
5983 *ap->a_size = (size_t)va.va_size;
5984 } else if (ap->a_uio != NULL)
5985 error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5986
5987 VOP_UNLOCK(vp);
5988 vn_close(vp, flags, ap->a_cred, td);
5989 return (error);
5990 }
5991
5992 static int
zfs_getextattr_sa(struct vop_getextattr_args * ap,const char * attrname)5993 zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
5994 {
5995 znode_t *zp = VTOZ(ap->a_vp);
5996 uchar_t *nv_value;
5997 uint_t nv_size;
5998 int error;
5999
6000 error = zfs_ensure_xattr_cached(zp);
6001 if (error != 0)
6002 return (error);
6003
6004 ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
6005 ASSERT3P(zp->z_xattr_cached, !=, NULL);
6006
6007 error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
6008 &nv_value, &nv_size);
6009 if (error != 0)
6010 return (SET_ERROR(error));
6011
6012 if (ap->a_size != NULL)
6013 *ap->a_size = nv_size;
6014 else if (ap->a_uio != NULL)
6015 error = uiomove(nv_value, nv_size, ap->a_uio);
6016 if (error != 0)
6017 return (SET_ERROR(error));
6018
6019 return (0);
6020 }
6021
6022 static int
zfs_getextattr_impl(struct vop_getextattr_args * ap,boolean_t compat)6023 zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat)
6024 {
6025 znode_t *zp = VTOZ(ap->a_vp);
6026 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6027 char attrname[EXTATTR_MAXNAMELEN+1];
6028 int error;
6029
6030 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6031 sizeof (attrname), compat);
6032 if (error != 0)
6033 return (error);
6034
6035 error = ENOENT;
6036 if (zfsvfs->z_use_sa && zp->z_is_sa)
6037 error = zfs_getextattr_sa(ap, attrname);
6038 if (error == ENOENT)
6039 error = zfs_getextattr_dir(ap, attrname);
6040 return (error);
6041 }
6042
6043 /*
6044 * Vnode operation to retrieve a named extended attribute.
6045 */
6046 static int
zfs_getextattr(struct vop_getextattr_args * ap)6047 zfs_getextattr(struct vop_getextattr_args *ap)
6048 {
6049 znode_t *zp = VTOZ(ap->a_vp);
6050 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6051 int error;
6052
6053 /*
6054 * If the xattr property is off, refuse the request.
6055 */
6056 if (!(zfsvfs->z_flags & ZSB_XATTR))
6057 return (SET_ERROR(EOPNOTSUPP));
6058
6059 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6060 ap->a_cred, ap->a_td, VREAD);
6061 if (error != 0)
6062 return (SET_ERROR(error));
6063
6064 error = zfs_check_attrname(ap->a_name);
6065 if (error != 0)
6066 return (error);
6067
6068 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6069 return (error);
6070 error = ENOENT;
6071 rw_enter(&zp->z_xattr_lock, RW_READER);
6072
6073 error = zfs_getextattr_impl(ap, zfs_xattr_compat);
6074 if ((error == ENOENT || error == ENOATTR) &&
6075 ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
6076 /*
6077 * Fall back to the alternate namespace format if we failed to
6078 * find a user xattr.
6079 */
6080 error = zfs_getextattr_impl(ap, !zfs_xattr_compat);
6081 }
6082
6083 rw_exit(&zp->z_xattr_lock);
6084 zfs_exit(zfsvfs, FTAG);
6085 if (error == ENOENT)
6086 error = SET_ERROR(ENOATTR);
6087 return (error);
6088 }
6089
6090 #ifndef _SYS_SYSPROTO_H_
6091 struct vop_deleteextattr {
6092 IN struct vnode *a_vp;
6093 IN int a_attrnamespace;
6094 IN const char *a_name;
6095 IN struct ucred *a_cred;
6096 IN struct thread *a_td;
6097 };
6098 #endif
6099
6100 static int
zfs_deleteextattr_dir(struct vop_deleteextattr_args * ap,const char * attrname)6101 zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
6102 {
6103 struct nameidata nd;
6104 vnode_t *xvp = NULL, *vp;
6105 int error;
6106
6107 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
6108 LOOKUP_XATTR, B_FALSE);
6109 if (error != 0)
6110 return (error);
6111
6112 #if __FreeBSD_version < 1400043
6113 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6114 UIO_SYSSPACE, attrname, xvp, ap->a_td);
6115 #else
6116 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6117 UIO_SYSSPACE, attrname, xvp);
6118 #endif
6119 error = namei(&nd);
6120 if (error != 0)
6121 return (SET_ERROR(error));
6122
6123 vp = nd.ni_vp;
6124 error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6125 NDFREE_PNBUF(&nd);
6126
6127 vput(nd.ni_dvp);
6128 if (vp == nd.ni_dvp)
6129 vrele(vp);
6130 else
6131 vput(vp);
6132
6133 return (error);
6134 }
6135
6136 static int
zfs_deleteextattr_sa(struct vop_deleteextattr_args * ap,const char * attrname)6137 zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
6138 {
6139 znode_t *zp = VTOZ(ap->a_vp);
6140 nvlist_t *nvl;
6141 int error;
6142
6143 error = zfs_ensure_xattr_cached(zp);
6144 if (error != 0)
6145 return (error);
6146
6147 ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
6148 ASSERT3P(zp->z_xattr_cached, !=, NULL);
6149
6150 nvl = zp->z_xattr_cached;
6151 error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
6152 if (error != 0)
6153 error = SET_ERROR(error);
6154 else
6155 error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
6156 if (error != 0) {
6157 zp->z_xattr_cached = NULL;
6158 nvlist_free(nvl);
6159 }
6160 return (error);
6161 }
6162
6163 static int
zfs_deleteextattr_impl(struct vop_deleteextattr_args * ap,boolean_t compat)6164 zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat)
6165 {
6166 znode_t *zp = VTOZ(ap->a_vp);
6167 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6168 char attrname[EXTATTR_MAXNAMELEN+1];
6169 int error;
6170
6171 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6172 sizeof (attrname), compat);
6173 if (error != 0)
6174 return (error);
6175
6176 error = ENOENT;
6177 if (zfsvfs->z_use_sa && zp->z_is_sa)
6178 error = zfs_deleteextattr_sa(ap, attrname);
6179 if (error == ENOENT)
6180 error = zfs_deleteextattr_dir(ap, attrname);
6181 return (error);
6182 }
6183
6184 /*
6185 * Vnode operation to remove a named attribute.
6186 */
6187 static int
zfs_deleteextattr(struct vop_deleteextattr_args * ap)6188 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
6189 {
6190 znode_t *zp = VTOZ(ap->a_vp);
6191 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6192 int error;
6193
6194 /*
6195 * If the xattr property is off, refuse the request.
6196 */
6197 if (!(zfsvfs->z_flags & ZSB_XATTR))
6198 return (SET_ERROR(EOPNOTSUPP));
6199
6200 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6201 ap->a_cred, ap->a_td, VWRITE);
6202 if (error != 0)
6203 return (SET_ERROR(error));
6204
6205 error = zfs_check_attrname(ap->a_name);
6206 if (error != 0)
6207 return (error);
6208
6209 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6210 return (error);
6211 rw_enter(&zp->z_xattr_lock, RW_WRITER);
6212
6213 error = zfs_deleteextattr_impl(ap, zfs_xattr_compat);
6214 if ((error == ENOENT || error == ENOATTR) &&
6215 ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
6216 /*
6217 * Fall back to the alternate namespace format if we failed to
6218 * find a user xattr.
6219 */
6220 error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat);
6221 }
6222
6223 rw_exit(&zp->z_xattr_lock);
6224 zfs_exit(zfsvfs, FTAG);
6225 if (error == ENOENT)
6226 error = SET_ERROR(ENOATTR);
6227 return (error);
6228 }
6229
6230 #ifndef _SYS_SYSPROTO_H_
6231 struct vop_setextattr {
6232 IN struct vnode *a_vp;
6233 IN int a_attrnamespace;
6234 IN const char *a_name;
6235 INOUT struct uio *a_uio;
6236 IN struct ucred *a_cred;
6237 IN struct thread *a_td;
6238 };
6239 #endif
6240
6241 static int
zfs_setextattr_dir(struct vop_setextattr_args * ap,const char * attrname)6242 zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
6243 {
6244 struct thread *td = ap->a_td;
6245 struct nameidata nd;
6246 struct vattr va;
6247 vnode_t *xvp = NULL, *vp;
6248 int error, flags;
6249
6250 error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
6251 LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
6252 if (error != 0)
6253 return (error);
6254
6255 flags = FFLAGS(O_WRONLY | O_CREAT);
6256 #if __FreeBSD_version < 1400043
6257 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
6258 #else
6259 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
6260 #endif
6261 error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
6262 NULL);
6263 if (error != 0)
6264 return (SET_ERROR(error));
6265 vp = nd.ni_vp;
6266 NDFREE_PNBUF(&nd);
6267
6268 VATTR_NULL(&va);
6269 va.va_size = 0;
6270 error = VOP_SETATTR(vp, &va, ap->a_cred);
6271 if (error == 0)
6272 VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6273
6274 VOP_UNLOCK(vp);
6275 vn_close(vp, flags, ap->a_cred, td);
6276 return (error);
6277 }
6278
6279 static int
zfs_setextattr_sa(struct vop_setextattr_args * ap,const char * attrname)6280 zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
6281 {
6282 znode_t *zp = VTOZ(ap->a_vp);
6283 nvlist_t *nvl;
6284 size_t sa_size;
6285 int error;
6286
6287 error = zfs_ensure_xattr_cached(zp);
6288 if (error != 0)
6289 return (error);
6290
6291 ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
6292 ASSERT3P(zp->z_xattr_cached, !=, NULL);
6293
6294 nvl = zp->z_xattr_cached;
6295 size_t entry_size = ap->a_uio->uio_resid;
6296 if (entry_size > DXATTR_MAX_ENTRY_SIZE)
6297 return (SET_ERROR(EFBIG));
6298 error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
6299 if (error != 0)
6300 return (SET_ERROR(error));
6301 if (sa_size > DXATTR_MAX_SA_SIZE)
6302 return (SET_ERROR(EFBIG));
6303 uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
6304 error = uiomove(buf, entry_size, ap->a_uio);
6305 if (error != 0) {
6306 error = SET_ERROR(error);
6307 } else {
6308 error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
6309 if (error != 0)
6310 error = SET_ERROR(error);
6311 }
6312 if (error == 0)
6313 error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
6314 kmem_free(buf, entry_size);
6315 if (error != 0) {
6316 zp->z_xattr_cached = NULL;
6317 nvlist_free(nvl);
6318 }
6319 return (error);
6320 }
6321
6322 static int
zfs_setextattr_impl(struct vop_setextattr_args * ap,boolean_t compat)6323 zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat)
6324 {
6325 znode_t *zp = VTOZ(ap->a_vp);
6326 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6327 char attrname[EXTATTR_MAXNAMELEN+1];
6328 int error;
6329
6330 error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6331 sizeof (attrname), compat);
6332 if (error != 0)
6333 return (error);
6334
6335 struct vop_deleteextattr_args vda = {
6336 .a_vp = ap->a_vp,
6337 .a_attrnamespace = ap->a_attrnamespace,
6338 .a_name = ap->a_name,
6339 .a_cred = ap->a_cred,
6340 .a_td = ap->a_td,
6341 };
6342 error = ENOENT;
6343 if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) {
6344 error = zfs_setextattr_sa(ap, attrname);
6345 if (error == 0) {
6346 /*
6347 * Successfully put into SA, we need to clear the one
6348 * in dir if present.
6349 */
6350 zfs_deleteextattr_dir(&vda, attrname);
6351 }
6352 }
6353 if (error != 0) {
6354 error = zfs_setextattr_dir(ap, attrname);
6355 if (error == 0 && zp->z_is_sa) {
6356 /*
6357 * Successfully put into dir, we need to clear the one
6358 * in SA if present.
6359 */
6360 zfs_deleteextattr_sa(&vda, attrname);
6361 }
6362 }
6363 if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
6364 /*
6365 * Also clear all versions of the alternate compat name.
6366 */
6367 zfs_deleteextattr_impl(&vda, !compat);
6368 }
6369 return (error);
6370 }
6371
6372 /*
6373 * Vnode operation to set a named attribute.
6374 */
6375 static int
zfs_setextattr(struct vop_setextattr_args * ap)6376 zfs_setextattr(struct vop_setextattr_args *ap)
6377 {
6378 znode_t *zp = VTOZ(ap->a_vp);
6379 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6380 int error;
6381
6382 /*
6383 * If the xattr property is off, refuse the request.
6384 */
6385 if (!(zfsvfs->z_flags & ZSB_XATTR))
6386 return (SET_ERROR(EOPNOTSUPP));
6387
6388 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6389 ap->a_cred, ap->a_td, VWRITE);
6390 if (error != 0)
6391 return (SET_ERROR(error));
6392
6393 error = zfs_check_attrname(ap->a_name);
6394 if (error != 0)
6395 return (error);
6396
6397 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6398 return (error);
6399 rw_enter(&zp->z_xattr_lock, RW_WRITER);
6400
6401 error = zfs_setextattr_impl(ap, zfs_xattr_compat);
6402
6403 rw_exit(&zp->z_xattr_lock);
6404 zfs_exit(zfsvfs, FTAG);
6405 return (error);
6406 }
6407
#ifndef _SYS_SYSPROTO_H_
struct vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
#endif

/*
 * List the extended attributes stored as files in the vnode's xattr
 * directory whose file names start with attrprefix.
 *
 * When a_size is set, the space each matching entry would occupy (one length
 * byte plus the name with the prefix stripped) is added to *a_size.
 * Otherwise, when a_uio is set, each entry is copied out through it in that
 * same format.
 *
 * Returns 0 when the xattr directory does not exist (lookup reports
 * ENOATTR) or when the listing completes; otherwise an errno from the
 * lookup, namei(), VOP_READDIR() or uiomove().
 */
static int
zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
{
	struct thread *td = ap->a_td;
	struct nameidata nd;
	/* Scratch buffer for readdir output, sized to one struct dirent. */
	uint8_t dirbuf[sizeof (struct dirent)];
	struct iovec aiov;
	struct uio auio;
	vnode_t *xvp = NULL, *vp;
	int error, eof;

	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
	    LOOKUP_XATTR, B_FALSE);
	if (error != 0) {
		/*
		 * ENOATTR means that the EA directory does not yet exist,
		 * i.e. there are no extended attributes there.
		 */
		if (error == ENOATTR)
			error = 0;
		return (error);
	}

	/* Resolve "." relative to xvp to get the directory share-locked. */
#if __FreeBSD_version < 1400043
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
	    UIO_SYSSPACE, ".", xvp, td);
#else
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
	    UIO_SYSSPACE, ".", xvp);
#endif
	error = namei(&nd);
	if (error != 0)
		return (SET_ERROR(error));
	vp = nd.ni_vp;
	NDFREE_PNBUF(&nd);

	/* Kernel-space uio over dirbuf; the offset carries across reads. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_rw = UIO_READ;
	auio.uio_offset = 0;

	size_t plen = strlen(attrprefix);

	do {
		/* Re-arm the iovec for another buffer-full of entries. */
		aiov.iov_base = (void *)dirbuf;
		aiov.iov_len = sizeof (dirbuf);
		auio.uio_resid = sizeof (dirbuf);
		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
		if (error != 0)
			break;
		/* Number of bytes VOP_READDIR() placed in dirbuf. */
		int done = sizeof (dirbuf) - auio.uio_resid;
		for (int pos = 0; pos < done; ) {
			struct dirent *dp = (struct dirent *)(dirbuf + pos);
			/* Advance first so every "continue" moves on. */
			pos += dp->d_reclen;
			/*
			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
			 * is what we get when attribute was created on Solaris.
			 */
			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
				continue;
			/*
			 * With an empty prefix every name would match, so
			 * names carrying a forbidden namespace prefix are
			 * filtered out explicitly.
			 */
			else if (plen == 0 &&
			    ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name))
				continue;
			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
				continue;
			/* Length of the name once the prefix is stripped. */
			uint8_t nlen = dp->d_namlen - plen;
			if (ap->a_size != NULL) {
				*ap->a_size += 1 + nlen;
			} else if (ap->a_uio != NULL) {
				/*
				 * Format of extattr name entry is one byte for
				 * length and the rest for name.
				 */
				error = uiomove(&nlen, 1, ap->a_uio);
				if (error == 0) {
					char *namep = dp->d_name + plen;
					error = uiomove(namep, nlen, ap->a_uio);
				}
				if (error != 0) {
					/*
					 * Leaves only the inner loop; the
					 * outer loop ends on its error test.
					 */
					error = SET_ERROR(error);
					break;
				}
			}
		}
	} while (!eof && error == 0);

	vput(vp);
	return (error);
}
6510
6511 static int
zfs_listextattr_sa(struct vop_listextattr_args * ap,const char * attrprefix)6512 zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
6513 {
6514 znode_t *zp = VTOZ(ap->a_vp);
6515 int error;
6516
6517 error = zfs_ensure_xattr_cached(zp);
6518 if (error != 0)
6519 return (error);
6520
6521 ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
6522 ASSERT3P(zp->z_xattr_cached, !=, NULL);
6523
6524 size_t plen = strlen(attrprefix);
6525 nvpair_t *nvp = NULL;
6526 while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
6527 ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
6528
6529 const char *name = nvpair_name(nvp);
6530 if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name))
6531 continue;
6532 else if (strncmp(name, attrprefix, plen) != 0)
6533 continue;
6534 uint8_t nlen = strlen(name) - plen;
6535 if (ap->a_size != NULL) {
6536 *ap->a_size += 1 + nlen;
6537 } else if (ap->a_uio != NULL) {
6538 /*
6539 * Format of extattr name entry is one byte for
6540 * length and the rest for name.
6541 */
6542 error = uiomove(&nlen, 1, ap->a_uio);
6543 if (error == 0) {
6544 char *namep = __DECONST(char *, name) + plen;
6545 error = uiomove(namep, nlen, ap->a_uio);
6546 }
6547 if (error != 0) {
6548 error = SET_ERROR(error);
6549 break;
6550 }
6551 }
6552 }
6553
6554 return (error);
6555 }
6556
6557 static int
zfs_listextattr_impl(struct vop_listextattr_args * ap,boolean_t compat)6558 zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat)
6559 {
6560 znode_t *zp = VTOZ(ap->a_vp);
6561 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6562 char attrprefix[16];
6563 int error;
6564
6565 error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6566 sizeof (attrprefix), compat);
6567 if (error != 0)
6568 return (error);
6569
6570 if (zfsvfs->z_use_sa && zp->z_is_sa)
6571 error = zfs_listextattr_sa(ap, attrprefix);
6572 if (error == 0)
6573 error = zfs_listextattr_dir(ap, attrprefix);
6574 return (error);
6575 }
6576
6577 /*
6578 * Vnode operation to retrieve extended attributes on a vnode.
6579 */
6580 static int
zfs_listextattr(struct vop_listextattr_args * ap)6581 zfs_listextattr(struct vop_listextattr_args *ap)
6582 {
6583 znode_t *zp = VTOZ(ap->a_vp);
6584 zfsvfs_t *zfsvfs = ZTOZSB(zp);
6585 int error;
6586
6587 if (ap->a_size != NULL)
6588 *ap->a_size = 0;
6589
6590 /*
6591 * If the xattr property is off, refuse the request.
6592 */
6593 if (!(zfsvfs->z_flags & ZSB_XATTR))
6594 return (SET_ERROR(EOPNOTSUPP));
6595
6596 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6597 ap->a_cred, ap->a_td, VREAD);
6598 if (error != 0)
6599 return (SET_ERROR(error));
6600
6601 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6602 return (error);
6603 rw_enter(&zp->z_xattr_lock, RW_READER);
6604
6605 error = zfs_listextattr_impl(ap, zfs_xattr_compat);
6606 if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
6607 /* Also list user xattrs with the alternate format. */
6608 error = zfs_listextattr_impl(ap, !zfs_xattr_compat);
6609 }
6610
6611 rw_exit(&zp->z_xattr_lock);
6612 zfs_exit(zfsvfs, FTAG);
6613 return (error);
6614 }
6615
6616 #ifndef _SYS_SYSPROTO_H_
6617 struct vop_getacl_args {
6618 struct vnode *vp;
6619 acl_type_t type;
6620 struct acl *aclp;
6621 struct ucred *cred;
6622 struct thread *td;
6623 };
6624 #endif
6625
6626 static int
zfs_freebsd_getacl(struct vop_getacl_args * ap)6627 zfs_freebsd_getacl(struct vop_getacl_args *ap)
6628 {
6629 int error;
6630 vsecattr_t vsecattr;
6631
6632 if (ap->a_type != ACL_TYPE_NFS4)
6633 return (EINVAL);
6634
6635 vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
6636 if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
6637 &vsecattr, 0, ap->a_cred)))
6638 return (error);
6639
6640 error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
6641 vsecattr.vsa_aclcnt);
6642 if (vsecattr.vsa_aclentp != NULL)
6643 kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
6644
6645 return (error);
6646 }
6647
6648 #ifndef _SYS_SYSPROTO_H_
6649 struct vop_setacl_args {
6650 struct vnode *vp;
6651 acl_type_t type;
6652 struct acl *aclp;
6653 struct ucred *cred;
6654 struct thread *td;
6655 };
6656 #endif
6657
6658 static int
zfs_freebsd_setacl(struct vop_setacl_args * ap)6659 zfs_freebsd_setacl(struct vop_setacl_args *ap)
6660 {
6661 int error;
6662 vsecattr_t vsecattr;
6663 int aclbsize; /* size of acl list in bytes */
6664 aclent_t *aaclp;
6665
6666 if (ap->a_type != ACL_TYPE_NFS4)
6667 return (EINVAL);
6668
6669 if (ap->a_aclp == NULL)
6670 return (EINVAL);
6671
6672 if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6673 return (EINVAL);
6674
6675 /*
6676 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6677 * splitting every entry into two and appending "canonical six"
6678 * entries at the end. Don't allow for setting an ACL that would
6679 * cause chmod(2) to run out of ACL entries.
6680 */
6681 if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6682 return (ENOSPC);
6683
6684 error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6685 if (error != 0)
6686 return (error);
6687
6688 vsecattr.vsa_mask = VSA_ACE;
6689 aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
6690 vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6691 aaclp = vsecattr.vsa_aclentp;
6692 vsecattr.vsa_aclentsz = aclbsize;
6693
6694 aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6695 error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
6696 kmem_free(aaclp, aclbsize);
6697
6698 return (error);
6699 }
6700
6701 #ifndef _SYS_SYSPROTO_H_
6702 struct vop_aclcheck_args {
6703 struct vnode *vp;
6704 acl_type_t type;
6705 struct acl *aclp;
6706 struct ucred *cred;
6707 struct thread *td;
6708 };
6709 #endif
6710
/*
 * VOP_ACLCHECK handler.  The arguments are not examined; every request is
 * answered with EOPNOTSUPP.
 */
static int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{
	(void) ap;

	return (EOPNOTSUPP);
}
6717
/*
 * Reference layout of the VOP_ADVISE argument structure.  It is compiled
 * only when _SYS_SYSPROTO_H_ is undefined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_advise_args {
	struct vnode	*a_vp;		/* vnode the advice applies to */
	off_t		a_start;	/* first byte of the range */
	off_t		a_end;		/* last byte of the range */
	int		a_advice;	/* POSIX_FADV_* value */
};
#endif
6726
6727 static int
zfs_freebsd_advise(struct vop_advise_args * ap)6728 zfs_freebsd_advise(struct vop_advise_args *ap)
6729 {
6730 vnode_t *vp = ap->a_vp;
6731 off_t start = ap->a_start;
6732 off_t end = ap->a_end;
6733 int advice = ap->a_advice;
6734 off_t len;
6735 znode_t *zp;
6736 zfsvfs_t *zfsvfs;
6737 objset_t *os;
6738 int error = 0;
6739
6740 if (end < start)
6741 return (EINVAL);
6742
6743 error = vn_lock(vp, LK_SHARED);
6744 if (error)
6745 return (error);
6746
6747 zp = VTOZ(vp);
6748 zfsvfs = zp->z_zfsvfs;
6749 os = zp->z_zfsvfs->z_os;
6750
6751 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6752 goto out_unlock;
6753
6754 /* kern_posix_fadvise points to the last byte, we want one past */
6755 if (end != OFF_MAX)
6756 end += 1;
6757 len = end - start;
6758
6759 switch (advice) {
6760 case POSIX_FADV_WILLNEED:
6761 /*
6762 * Pass on the caller's size directly, but note that
6763 * dmu_prefetch_max will effectively cap it. If there really
6764 * is a larger sequential access pattern, perhaps dmu_zfetch
6765 * will detect it.
6766 */
6767 dmu_prefetch(os, zp->z_id, 0, start, len,
6768 ZIO_PRIORITY_ASYNC_READ);
6769 break;
6770 case POSIX_FADV_NORMAL:
6771 case POSIX_FADV_RANDOM:
6772 case POSIX_FADV_SEQUENTIAL:
6773 case POSIX_FADV_DONTNEED:
6774 case POSIX_FADV_NOREUSE:
6775 /* ignored for now */
6776 break;
6777 default:
6778 error = EINVAL;
6779 break;
6780 }
6781
6782 zfs_exit(zfsvfs, FTAG);
6783
6784 out_unlock:
6785 VOP_UNLOCK(vp);
6786
6787 return (error);
6788 }
6789
6790 static int
zfs_vptocnp(struct vop_vptocnp_args * ap)6791 zfs_vptocnp(struct vop_vptocnp_args *ap)
6792 {
6793 vnode_t *covered_vp;
6794 vnode_t *vp = ap->a_vp;
6795 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
6796 znode_t *zp = VTOZ(vp);
6797 int ltype;
6798 int error;
6799
6800 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
6801 return (error);
6802
6803 /*
6804 * If we are a snapshot mounted under .zfs, run the operation
6805 * on the covered vnode.
6806 */
6807 if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
6808 char name[MAXNAMLEN + 1];
6809 znode_t *dzp;
6810 size_t len;
6811
6812 error = zfs_znode_parent_and_name(zp, &dzp, name,
6813 sizeof (name));
6814 if (error == 0) {
6815 len = strlen(name);
6816 if (*ap->a_buflen < len)
6817 error = SET_ERROR(ENOMEM);
6818 }
6819 if (error == 0) {
6820 *ap->a_buflen -= len;
6821 memcpy(ap->a_buf + *ap->a_buflen, name, len);
6822 *ap->a_vpp = ZTOV(dzp);
6823 }
6824 zfs_exit(zfsvfs, FTAG);
6825 return (error);
6826 }
6827 zfs_exit(zfsvfs, FTAG);
6828
6829 covered_vp = vp->v_mount->mnt_vnodecovered;
6830 enum vgetstate vs = vget_prep(covered_vp);
6831 ltype = VOP_ISLOCKED(vp);
6832 VOP_UNLOCK(vp);
6833 error = vget_finish(covered_vp, LK_SHARED, vs);
6834 if (error == 0) {
6835 error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
6836 ap->a_buflen);
6837 vput(covered_vp);
6838 }
6839 vn_lock(vp, ltype | LK_RETRY);
6840 if (VN_IS_DOOMED(vp))
6841 error = SET_ERROR(ENOENT);
6842 return (error);
6843 }
6844
#if __FreeBSD_version >= 1400032
/*
 * VOP_DEALLOCATE handler: free the byte range [*a_offset, *a_offset +
 * *a_len) of the file, clipped to the current file size.
 *
 * On success *a_offset is advanced past the freed range and *a_len is set
 * to 0.  A range that lies entirely outside the file only zeroes *a_len.
 * The ZIL is committed when the dataset is sync=always or the caller
 * passed IO_SYNC.
 *
 * Returns EROFS on a read-only file system, otherwise 0 or the error from
 * zfs_enter_verify_zp(), zfs_freesp() or zil_commit().
 */
static int
zfs_deallocate(struct vop_deallocate_args *ap)
{
	znode_t *zp = VTOZ(ap->a_vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog;
	off_t start, nbytes, eof;
	int rc;

	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
	if (rc != 0)
		return (rc);

	/*
	 * Callers might not be able to detect properly that we are
	 * read-only, so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	zilog = zfsvfs->z_log;
	start = *ap->a_offset;
	nbytes = *ap->a_len;
	eof = zp->z_size;

	/* Clip the request to the end of the file. */
	if (start + nbytes > eof)
		nbytes = eof - start;

	/* Fast path for out-of-range request. */
	if (nbytes <= 0) {
		*ap->a_len = 0;
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

	rc = zfs_freesp(zp, start, nbytes, O_RDWR, TRUE);
	if (rc != 0)
		goto done;

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
	    (ap->a_ioflag & IO_SYNC) != 0)
		rc = zil_commit(zilog, zp->z_id);

	/* Report progress only once everything, commit included, worked. */
	if (rc == 0) {
		*ap->a_offset = start + nbytes;
		*ap->a_len = 0;
	}

done:
	zfs_exit(zfsvfs, FTAG);
	return (rc);
}
#endif
6895
/*
 * Reference layout of the VOP_COPY_FILE_RANGE argument structure.  It is
 * compiled only when _SYS_SYSPROTO_H_ is undefined.
 */
#ifndef _SYS_SYSPROTO_H_
struct vop_copy_file_range_args {
	struct vnode *a_invp;
	off_t *a_inoffp;
	struct vnode *a_outvp;
	off_t *a_outoffp;
	size_t *a_lenp;
	unsigned int a_flags;
	struct ucred *a_incred;
	struct ucred *a_outcred;
	struct thread *a_fsizetd;
};
#endif
/*
 * TODO: FreeBSD will only call the file system-specific copy_file_range() if
 * both files reside under the same mountpoint. In the case of ZFS we want to
 * be called even if the files are in different datasets (as long as they are
 * in the same pool, which we need to check ourselves).
 */
6915 static int
zfs_freebsd_copy_file_range(struct vop_copy_file_range_args * ap)6916 zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
6917 {
6918 zfsvfs_t *outzfsvfs;
6919 struct vnode *invp = ap->a_invp;
6920 struct vnode *outvp = ap->a_outvp;
6921 struct mount *mp;
6922 int error;
6923 uint64_t len = *ap->a_lenp;
6924
6925 if (!zfs_bclone_enabled) {
6926 mp = NULL;
6927 goto bad_write_fallback;
6928 }
6929
6930 /*
6931 * TODO: If offset/length is not aligned to recordsize, use
6932 * vn_generic_copy_file_range() on this fragment.
6933 * It would be better to do this after we lock the vnodes, but then we
6934 * need something else than vn_generic_copy_file_range().
6935 */
6936
6937 vn_start_write(outvp, &mp, V_WAIT);
6938 if (__predict_true(mp == outvp->v_mount)) {
6939 outzfsvfs = (zfsvfs_t *)mp->mnt_data;
6940 if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os),
6941 SPA_FEATURE_BLOCK_CLONING)) {
6942 goto bad_write_fallback;
6943 }
6944 }
6945 if (invp == outvp) {
6946 if (vn_lock(outvp, LK_EXCLUSIVE) != 0) {
6947 goto bad_write_fallback;
6948 }
6949 } else {
6950 #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
6951 __FreeBSD_version >= 1400086
6952 vn_lock_pair(invp, false, LK_SHARED, outvp, false,
6953 LK_EXCLUSIVE);
6954 #else
6955 vn_lock_pair(invp, false, outvp, false);
6956 #endif
6957 if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) {
6958 goto bad_locked_fallback;
6959 }
6960 }
6961
6962 #ifdef MAC
6963 error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
6964 outvp);
6965 if (error != 0)
6966 goto out_locked;
6967 #endif
6968
6969 error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
6970 ap->a_outoffp, &len, ap->a_outcred);
6971 if (error == EXDEV || error == EAGAIN || error == EINVAL ||
6972 error == EOPNOTSUPP)
6973 goto bad_locked_fallback;
6974 *ap->a_lenp = (size_t)len;
6975 #ifdef MAC
6976 out_locked:
6977 #endif
6978 if (invp != outvp)
6979 VOP_UNLOCK(invp);
6980 VOP_UNLOCK(outvp);
6981 if (mp != NULL)
6982 vn_finished_write(mp);
6983 return (error);
6984
6985 bad_locked_fallback:
6986 if (invp != outvp)
6987 VOP_UNLOCK(invp);
6988 VOP_UNLOCK(outvp);
6989 bad_write_fallback:
6990 if (mp != NULL)
6991 vn_finished_write(mp);
6992 error = ENOSYS;
6993 return (error);
6994 }
6995
6996 struct vop_vector zfs_vnodeops;
6997 struct vop_vector zfs_fifoops;
6998 struct vop_vector zfs_shareops;
6999
7000 struct vop_vector zfs_vnodeops = {
7001 .vop_default = &default_vnodeops,
7002 .vop_inactive = zfs_freebsd_inactive,
7003 .vop_need_inactive = zfs_freebsd_need_inactive,
7004 .vop_reclaim = zfs_freebsd_reclaim,
7005 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
7006 .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
7007 .vop_access = zfs_freebsd_access,
7008 .vop_allocate = VOP_EOPNOTSUPP,
7009 #if __FreeBSD_version >= 1400032
7010 .vop_deallocate = zfs_deallocate,
7011 #endif
7012 .vop_lookup = zfs_cache_lookup,
7013 .vop_cachedlookup = zfs_freebsd_cachedlookup,
7014 .vop_getattr = zfs_freebsd_getattr,
7015 .vop_setattr = zfs_freebsd_setattr,
7016 .vop_create = zfs_freebsd_create,
7017 .vop_mknod = (vop_mknod_t *)zfs_freebsd_create,
7018 .vop_mkdir = zfs_freebsd_mkdir,
7019 .vop_readdir = zfs_freebsd_readdir,
7020 .vop_fsync = zfs_freebsd_fsync,
7021 .vop_open = zfs_freebsd_open,
7022 .vop_close = zfs_freebsd_close,
7023 .vop_rmdir = zfs_freebsd_rmdir,
7024 .vop_ioctl = zfs_freebsd_ioctl,
7025 .vop_link = zfs_freebsd_link,
7026 .vop_symlink = zfs_freebsd_symlink,
7027 .vop_readlink = zfs_freebsd_readlink,
7028 .vop_advise = zfs_freebsd_advise,
7029 .vop_read = zfs_freebsd_read,
7030 .vop_write = zfs_freebsd_write,
7031 .vop_remove = zfs_freebsd_remove,
7032 .vop_rename = zfs_freebsd_rename,
7033 .vop_pathconf = zfs_freebsd_pathconf,
7034 .vop_bmap = zfs_freebsd_bmap,
7035 .vop_fid = zfs_freebsd_fid,
7036 .vop_getextattr = zfs_getextattr,
7037 .vop_deleteextattr = zfs_deleteextattr,
7038 .vop_setextattr = zfs_setextattr,
7039 .vop_listextattr = zfs_listextattr,
7040 .vop_getacl = zfs_freebsd_getacl,
7041 .vop_setacl = zfs_freebsd_setacl,
7042 .vop_aclcheck = zfs_freebsd_aclcheck,
7043 .vop_getpages = zfs_freebsd_getpages,
7044 .vop_putpages = zfs_freebsd_putpages,
7045 .vop_vptocnp = zfs_vptocnp,
7046 .vop_lock1 = vop_lock,
7047 .vop_unlock = vop_unlock,
7048 .vop_islocked = vop_islocked,
7049 #if __FreeBSD_version >= 1400043
7050 .vop_add_writecount = vop_stdadd_writecount_nomsync,
7051 #endif
7052 .vop_copy_file_range = zfs_freebsd_copy_file_range,
7053 };
7054 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
7055
7056 struct vop_vector zfs_fifoops = {
7057 .vop_default = &fifo_specops,
7058 .vop_fsync = zfs_freebsd_fsync,
7059 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
7060 .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
7061 .vop_access = zfs_freebsd_access,
7062 .vop_getattr = zfs_freebsd_getattr,
7063 .vop_inactive = zfs_freebsd_inactive,
7064 .vop_read = VOP_PANIC,
7065 .vop_reclaim = zfs_freebsd_reclaim,
7066 .vop_setattr = zfs_freebsd_setattr,
7067 .vop_write = VOP_PANIC,
7068 .vop_pathconf = zfs_freebsd_pathconf,
7069 .vop_fid = zfs_freebsd_fid,
7070 .vop_getacl = zfs_freebsd_getacl,
7071 .vop_setacl = zfs_freebsd_setacl,
7072 .vop_aclcheck = zfs_freebsd_aclcheck,
7073 #if __FreeBSD_version >= 1400043
7074 .vop_add_writecount = vop_stdadd_writecount_nomsync,
7075 #endif
7076 };
7077 VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
7078
7079 /*
7080 * special share hidden files vnode operations template
7081 */
7082 struct vop_vector zfs_shareops = {
7083 .vop_default = &default_vnodeops,
7084 .vop_fplookup_vexec = VOP_EAGAIN,
7085 .vop_fplookup_symlink = VOP_EAGAIN,
7086 .vop_access = zfs_freebsd_access,
7087 .vop_inactive = zfs_freebsd_inactive,
7088 .vop_reclaim = zfs_freebsd_reclaim,
7089 .vop_fid = zfs_freebsd_fid,
7090 .vop_pathconf = zfs_freebsd_pathconf,
7091 #if __FreeBSD_version >= 1400043
7092 .vop_add_writecount = vop_stdadd_writecount_nomsync,
7093 #endif
7094 };
7095 VFS_VOP_VECTOR_REGISTER(zfs_shareops);
7096
7097 ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW,
7098 "Use legacy ZFS xattr naming for writing new user namespace xattrs");
7099