1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * 25 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org> 26 * All rights reserved. 27 * 28 * Portions Copyright 2010 Robert Milkowski 29 * 30 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 31 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 32 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 33 * Copyright (c) 2014 Integros [integros.com] 34 * Copyright (c) 2024, 2025, Klara, Inc. 35 */ 36 37 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ 38 39 /* 40 * ZFS volume emulation driver. 41 * 42 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. 43 * Volumes are accessed through the symbolic links named: 44 * 45 * /dev/zvol/<pool_name>/<dataset_name> 46 * 47 * Volumes are persistent through reboot. No user command needs to be 48 * run before opening and using a device. 49 * 50 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device 51 * in the system. 
 * Except when they're simply character devices (volmode=dev).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/zil_impl.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE	"dumpsize"

/*
 * With ZVOL_LOCK_DEBUG defined, every reader acquisition of zv_suspend_lock
 * is promoted to a writer acquisition so lock-ordering problems surface as
 * hard deadlocks instead of rare races.
 */
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

/*
 * FreeBSD-private part of zvol_state_t.  Exactly one arm of the union is
 * in use, selected by the volume's volmode property (dev vs. geom).
 */
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
		} _zso_geom;
	} _zso_state;
	/* Nonzero once teardown has begun; new opens fail with ENXIO. */
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");

static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

/* Character-device entry points (volmode=dev). */
static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_cdev_bio_strategy;
static d_kqfilter_t	zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_cdev_bio_strategy,
	.d_kqfilter =	zvol_cdev_kqfilter,
};

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

/* EVFILT_VNODE filter used by zvol_cdev_kqfilter(). */
static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync);

/*
 * GEOM mode implementation
 */

/*
 * Open the GEOM provider for this zvol, adding 'count' references.
 * The first open owns zvol_first_open() and publishes the media geometry
 * on the provider.  Lock order is zv_suspend_lock before zv_state_lock;
 * spa_namespace_lock is trylocked to avoid inversion (see below).
 */
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	zv = atomic_load_ptr(&pp->private);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);

			/*
			 * Removal may happen while the locks are down, so
			 * we can't trust zv any longer; we have to start over.
			 */
			zv = atomic_load_ptr(&pp->private);
			if (zv == NULL)
				return (SET_ERROR(ENXIO));

			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			if (zv->zv_zso->zso_dying ||
			    zv->zv_flags & ZVOL_REMOVING) {
				err = SET_ERROR(ENXIO);
				goto out_locked;
			}

			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				/*
				 * Couldn't get the namespace lock without
				 * blocking; drop everything, yield, and
				 * restart from scratch to preserve ordering.
				 */
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_locked;
		/* Publish geometry on the provider for consumers. */
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	/* If we failed before taking any references, undo the first open. */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*
 * Drop 'count' references on the provider; the final close performs
 * zvol_last_close() under zv_suspend_lock.
 */
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	zv = atomic_load_ptr(&pp->private);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			/*
			 * Unlike in zvol_geom_open(), we don't check if
			 * removal started here, because we might be one of the
			 * openers that needs to be thrown out! If we're the
			 * last, we need to call zvol_last_close() below to
			 * finish cleanup. So, no special treatment for us.
			 */

			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake anyone sleeping in zvol_wait_close(). */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

/*
 * Mark a GEOM-mode zvol dying and wait (bounded, 10s) for the last
 * opener to close it.  No-op for volmode=dev.
 */
void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}


/*
 * GEOM ->access() method: translate reference-count deltas into
 * zvol_geom_open()/zvol_geom_close() calls.  Called with the GEOM
 * topology lock held; we drop it around the open/close.
 */
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (atomic_load_ptr(&pp->private) == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}
/*
 * GEOM ->start() method: answer BIO_GETATTR inline, route everything
 * else to the strategy path.  May run synchronously only when not on a
 * GEOM thread and sleeping is allowed.
 */
static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) &&
	    THREAD_CAN_SLEEP());
}

/*
 * Handle a BIO_GETATTR request.  Returns 0 if the attribute was
 * recognized and delivered, 1 if unknown (caller answers EOPNOTSUPP).
 */
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

/* kqueue detach: unhook the knote from the per-device note list. */
static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

/* kqueue event: latch the hinted flags the caller subscribed to. */
static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

/*
 * Attach an EVFILT_VNODE knote to a volmode=dev zvol.
 * Only NOTE_ATTRIB is currently supported.
 */
static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}

/*
 * Core bio processing shared by GEOM and cdev paths: performs the
 * read/write/delete/flush against the DMU under the range lock, updates
 * kstats, and delivers the bio.
 */
static void
zvol_strategy_impl(zv_request_t *zvr)
{
	zvol_state_t *zv;
	struct bio *bp;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t commit;

	bp = zvr->bio;
	zv = zvr->zv;
	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	if (zv->zv_flags & ZVOL_REMOVING) {
		error = SET_ERROR(ENXIO);
		goto resume;
	}

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto commit;	/* jump straight to zil_commit() */
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	commit = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		/* Split into chunks of at most zvol_maxphys bytes. */
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read_by_dnode(zv->zv_dn, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, DMU_TX_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write_by_dnode(zv->zv_dn, off, size, addr,
				    tx, DMU_READ_PREFETCH);
				zvol_log_write(zv, tx, off, size, commit);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (error == 0 && commit) {
commit:
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	/* bio_to is set for GEOM bios; cdev bios finish via biofinish(). */
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/* Taskq wrapper around zvol_strategy_impl() for async dispatch. */
static void
zvol_strategy_task(void *arg)
{
	zv_request_task_t *task = arg;

	zvol_strategy_impl(&task->zvr);
	zv_request_task_free(task);
}

/*
 * Entry point for bios from both GEOM and cdev paths.  Runs the request
 * inline when 'sync' (or zvol_request_sync) is set, otherwise dispatches
 * to one of the zvol taskqs chosen by hashing (zv, cpu, offset).
 */
static void
zvol_geom_bio_strategy(struct bio *bp, boolean_t sync)
{
	zv_taskq_t *ztqs = &zvol_taskqs;
	zv_request_task_t *task;
	zvol_state_t *zv;
	uint_t tq_idx;
	uint_t taskq_hash;
	int error;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		if (bp->bio_to)
			g_io_deliver(bp, error);
		else
			biofinish(bp, NULL, error);
		return;
	}

	zv_request_t zvr = {
		.zv = zv,
		.bio = bp,
	};

	if (sync || zvol_request_sync) {
		zvol_strategy_impl(&zvr);
		return;
	}

	taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >>
	    ZVOL_TASKQ_OFFSET_SHIFT);
	tq_idx = taskq_hash % ztqs->tqs_cnt;
	task = zv_request_task_create(zvr);
	taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task,
	    0, &task->ent);
}
/* d_strategy entry point for volmode=dev; always async-capable. */
static void
zvol_cdev_bio_strategy(struct bio *bp)
{
	zvol_geom_bio_strategy(bp, B_FALSE);
}

/*
 * Character device mode implementation
 */

/*
 * d_read: copy volume data to the caller's uio under the range lock.
 * Reads stop at the end of the volume; uio_loffset == volsize is EOF,
 * not an error.
 */
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
		    DMU_READ_PREFETCH);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

/*
 * d_write: copy data from the caller's uio into the volume under the
 * range lock, logging to the ZIL; commits synchronously for IO_SYNC or
 * sync=always datasets.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t commit;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	commit = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
		    DMU_READ_PREFETCH);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, commit);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (error == 0 && commit)
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);

	return (error);
}

/*
 * d_open for volmode=dev; single-reference analogue of zvol_geom_open()
 * with the same zv_suspend_lock / zv_state_lock / spa_namespace_lock
 * ordering discipline.
 */
static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	zv = atomic_load_ptr(&dev->si_drv2);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);

			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
				/* Removal started while locks were down. */
				err = SET_ERROR(ENXIO);
				goto out_locked;
			}

			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
out_opened:
	/* If we failed before taking a reference, undo the first open. */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_locked:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
*/ 947 if (zv->zv_open_count != 0) { 948 rw_exit(&zv->zv_suspend_lock); 949 drop_suspend = B_FALSE; 950 } 951 } 952 } 953 954 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 955 956 if (zv->zv_open_count == 0) { 957 boolean_t drop_namespace = B_FALSE; 958 959 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 960 961 /* 962 * Take spa_namespace_lock to prevent lock inversion when 963 * zvols from one pool are opened as vdevs in another. 964 */ 965 if (!mutex_owned(&spa_namespace_lock)) { 966 if (!mutex_tryenter(&spa_namespace_lock)) { 967 mutex_exit(&zv->zv_state_lock); 968 rw_exit(&zv->zv_suspend_lock); 969 drop_suspend = B_FALSE; 970 kern_yield(PRI_USER); 971 goto retry; 972 } else { 973 drop_namespace = B_TRUE; 974 } 975 } 976 err = zvol_first_open(zv, !(flags & FWRITE)); 977 if (drop_namespace) 978 mutex_exit(&spa_namespace_lock); 979 if (err) 980 goto out_locked; 981 } 982 983 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 984 985 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { 986 err = SET_ERROR(EROFS); 987 goto out_opened; 988 } 989 if (zv->zv_flags & ZVOL_EXCL) { 990 err = SET_ERROR(EBUSY); 991 goto out_opened; 992 } 993 if (flags & O_EXCL) { 994 if (zv->zv_open_count != 0) { 995 err = SET_ERROR(EBUSY); 996 goto out_opened; 997 } 998 zv->zv_flags |= ZVOL_EXCL; 999 } 1000 1001 zv->zv_open_count++; 1002 out_opened: 1003 if (zv->zv_open_count == 0) { 1004 zvol_last_close(zv); 1005 wakeup(zv); 1006 } 1007 out_locked: 1008 mutex_exit(&zv->zv_state_lock); 1009 if (drop_suspend) 1010 rw_exit(&zv->zv_suspend_lock); 1011 return (err); 1012 } 1013 1014 static int 1015 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) 1016 { 1017 zvol_state_t *zv; 1018 boolean_t drop_suspend = B_TRUE; 1019 1020 zv = atomic_load_ptr(&dev->si_drv2); 1021 if (zv == NULL) 1022 return (SET_ERROR(ENXIO)); 1023 1024 mutex_enter(&zv->zv_state_lock); 1025 if (zv->zv_flags & ZVOL_EXCL) { 1026 ASSERT3U(zv->zv_open_count, ==, 1); 1027 zv->zv_flags &= ~ZVOL_EXCL; 1028 } 1029 1030 
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); 1031 1032 /* 1033 * If the open count is zero, this is a spurious close. 1034 * That indicates a bug in the kernel / DDI framework. 1035 */ 1036 ASSERT3U(zv->zv_open_count, >, 0); 1037 /* 1038 * Make sure zvol is not suspended during last close 1039 * (hold zv_suspend_lock) and respect proper lock acquisition 1040 * ordering - zv_suspend_lock before zv_state_lock. 1041 */ 1042 if (zv->zv_open_count == 1) { 1043 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { 1044 mutex_exit(&zv->zv_state_lock); 1045 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1046 mutex_enter(&zv->zv_state_lock); 1047 1048 /* 1049 * Unlike in zvol_cdev_open(), we don't check if 1050 * removal started here, because we might be one of the 1051 * openers that needs to be thrown out! If we're the 1052 * last, we need to call zvol_last_close() below to 1053 * finish cleanup. So, no special treatment for us. 1054 */ 1055 1056 /* Check to see if zv_suspend_lock is needed. */ 1057 if (zv->zv_open_count != 1) { 1058 rw_exit(&zv->zv_suspend_lock); 1059 drop_suspend = B_FALSE; 1060 } 1061 } 1062 } else { 1063 drop_suspend = B_FALSE; 1064 } 1065 1066 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1067 1068 /* 1069 * You may get multiple opens, but only one close. 
1070 */ 1071 zv->zv_open_count--; 1072 1073 if (zv->zv_open_count == 0) { 1074 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 1075 zvol_last_close(zv); 1076 wakeup(zv); 1077 } 1078 1079 mutex_exit(&zv->zv_state_lock); 1080 1081 if (drop_suspend) 1082 rw_exit(&zv->zv_suspend_lock); 1083 return (0); 1084 } 1085 1086 static int 1087 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, 1088 int fflag, struct thread *td) 1089 { 1090 zvol_state_t *zv; 1091 zfs_locked_range_t *lr; 1092 off_t offset, length; 1093 int error; 1094 boolean_t sync; 1095 1096 zv = atomic_load_ptr(&dev->si_drv2); 1097 ASSERT3P(zv, !=, NULL); 1098 1099 error = 0; 1100 KASSERT(zv->zv_open_count > 0, 1101 ("Device with zero access count in %s", __func__)); 1102 1103 switch (cmd) { 1104 case DIOCGSECTORSIZE: 1105 *(uint32_t *)data = DEV_BSIZE; 1106 break; 1107 case DIOCGMEDIASIZE: 1108 *(off_t *)data = zv->zv_volsize; 1109 break; 1110 case DIOCGFLUSH: 1111 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1112 if (zv->zv_zilog != NULL) 1113 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 1114 rw_exit(&zv->zv_suspend_lock); 1115 break; 1116 case DIOCGDELETE: 1117 if (!zvol_unmap_enabled) 1118 break; 1119 1120 offset = ((off_t *)data)[0]; 1121 length = ((off_t *)data)[1]; 1122 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || 1123 offset < 0 || offset >= zv->zv_volsize || 1124 length <= 0) { 1125 printf("%s: offset=%jd length=%jd\n", __func__, offset, 1126 length); 1127 error = SET_ERROR(EINVAL); 1128 break; 1129 } 1130 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1131 zvol_ensure_zilog(zv); 1132 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, 1133 RL_WRITER); 1134 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 1135 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1136 if (error != 0) { 1137 sync = FALSE; 1138 dmu_tx_abort(tx); 1139 } else { 1140 sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); 1141 zvol_log_truncate(zv, tx, offset, length); 1142 dmu_tx_commit(tx); 1143 error = 
dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 1144 offset, length); 1145 } 1146 zfs_rangelock_exit(lr); 1147 if (sync) 1148 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 1149 rw_exit(&zv->zv_suspend_lock); 1150 break; 1151 case DIOCGSTRIPESIZE: 1152 *(off_t *)data = zv->zv_volblocksize; 1153 break; 1154 case DIOCGSTRIPEOFFSET: 1155 *(off_t *)data = 0; 1156 break; 1157 case DIOCGATTR: { 1158 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1159 spa_t *spa = dmu_objset_spa(zv->zv_objset); 1160 struct diocgattr_arg *arg = (struct diocgattr_arg *)data; 1161 uint64_t refd, avail, usedobjs, availobjs; 1162 1163 if (strcmp(arg->name, "GEOM::candelete") == 0) 1164 arg->value.i = 1; 1165 else if (strcmp(arg->name, "blocksavail") == 0) { 1166 dmu_objset_space(zv->zv_objset, &refd, &avail, 1167 &usedobjs, &availobjs); 1168 arg->value.off = avail / DEV_BSIZE; 1169 } else if (strcmp(arg->name, "blocksused") == 0) { 1170 dmu_objset_space(zv->zv_objset, &refd, &avail, 1171 &usedobjs, &availobjs); 1172 arg->value.off = refd / DEV_BSIZE; 1173 } else if (strcmp(arg->name, "poolblocksavail") == 0) { 1174 avail = metaslab_class_get_space(spa_normal_class(spa)); 1175 avail -= metaslab_class_get_alloc( 1176 spa_normal_class(spa)); 1177 arg->value.off = avail / DEV_BSIZE; 1178 } else if (strcmp(arg->name, "poolblocksused") == 0) { 1179 refd = metaslab_class_get_alloc(spa_normal_class(spa)); 1180 arg->value.off = refd / DEV_BSIZE; 1181 } else 1182 error = SET_ERROR(ENOIOCTL); 1183 rw_exit(&zv->zv_suspend_lock); 1184 break; 1185 } 1186 case FIOSEEKHOLE: 1187 case FIOSEEKDATA: { 1188 off_t *off = (off_t *)data; 1189 uint64_t noff; 1190 boolean_t hole; 1191 1192 hole = (cmd == FIOSEEKHOLE); 1193 noff = *off; 1194 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); 1195 lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, 1196 RL_READER); 1197 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); 1198 zfs_rangelock_exit(lr); 1199 rw_exit(&zv->zv_suspend_lock); 1200 *off = noff; 1201 
break; 1202 } 1203 default: 1204 error = SET_ERROR(ENOIOCTL); 1205 } 1206 1207 return (error); 1208 } 1209 1210 /* 1211 * Misc. helpers 1212 */ 1213 1214 static void 1215 zvol_ensure_zilog(zvol_state_t *zv) 1216 { 1217 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); 1218 1219 /* 1220 * Open a ZIL if this is the first time we have written to this 1221 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 1222 * than zv_state_lock so that we don't need to acquire an 1223 * additional lock in this path. 1224 */ 1225 if (zv->zv_zilog == NULL) { 1226 if (!rw_tryupgrade(&zv->zv_suspend_lock)) { 1227 rw_exit(&zv->zv_suspend_lock); 1228 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 1229 } 1230 if (zv->zv_zilog == NULL) { 1231 zv->zv_zilog = zil_open(zv->zv_objset, 1232 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1233 zv->zv_flags |= ZVOL_WRITTEN_TO; 1234 /* replay / destroy done in zvol_os_create_minor() */ 1235 VERIFY0(zv->zv_zilog->zl_header->zh_flags & 1236 ZIL_REPLAY_NEEDED); 1237 } 1238 rw_downgrade(&zv->zv_suspend_lock); 1239 } 1240 } 1241 1242 boolean_t 1243 zvol_os_is_zvol(const char *device) 1244 { 1245 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); 1246 } 1247 1248 int 1249 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1250 { 1251 int error = 0; 1252 1253 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1254 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1255 1256 /* Move to a new hashtable entry. 
*/ 1257 zv->zv_hash = zvol_name_hash(newname); 1258 hlist_del(&zv->zv_hlink); 1259 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1260 1261 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1262 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1263 struct g_provider *pp = zsg->zsg_provider; 1264 struct g_geom *gp; 1265 1266 g_topology_lock(); 1267 gp = pp->geom; 1268 ASSERT3P(gp, !=, NULL); 1269 1270 zsg->zsg_provider = NULL; 1271 g_wither_provider(pp, ENXIO); 1272 1273 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); 1274 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; 1275 pp->sectorsize = DEV_BSIZE; 1276 pp->mediasize = zv->zv_volsize; 1277 pp->private = zv; 1278 zsg->zsg_provider = pp; 1279 g_error_provider(pp, 0); 1280 g_topology_unlock(); 1281 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1282 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1283 struct cdev *dev; 1284 struct make_dev_args args; 1285 1286 dev = zsd->zsd_cdev; 1287 if (dev != NULL) { 1288 destroy_dev(dev); 1289 dev = zsd->zsd_cdev = NULL; 1290 if (zv->zv_open_count > 0) { 1291 zv->zv_flags &= ~ZVOL_EXCL; 1292 zv->zv_open_count = 0; 1293 /* XXX need suspend lock but lock order */ 1294 zvol_last_close(zv); 1295 } 1296 } 1297 1298 make_dev_args_init(&args); 1299 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; 1300 args.mda_devsw = &zvol_cdevsw; 1301 args.mda_cr = NULL; 1302 args.mda_uid = UID_ROOT; 1303 args.mda_gid = GID_OPERATOR; 1304 args.mda_mode = 0640; 1305 args.mda_si_drv2 = zv; 1306 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname); 1307 if (error == 0) { 1308 dev->si_iosize_max = maxphys; 1309 zsd->zsd_cdev = dev; 1310 } 1311 } 1312 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1313 dataset_kstats_rename(&zv->zv_kstat, newname); 1314 1315 return (error); 1316 } 1317 1318 /* 1319 * Allocate memory for a new zvol_state_t and setup the required 1320 * request queue and generic disk structures for the block device. 
1321 */ 1322 static int 1323 zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, 1324 zvol_state_t **zvp) 1325 { 1326 zvol_state_t *zv; 1327 uint64_t volmode; 1328 int error; 1329 1330 error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), 1331 &volmode, NULL); 1332 if (error) 1333 return (error); 1334 1335 if (volmode == ZFS_VOLMODE_DEFAULT) 1336 volmode = zvol_volmode; 1337 1338 if (volmode == ZFS_VOLMODE_NONE) 1339 return (0); 1340 1341 zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); 1342 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1343 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1344 zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1345 zv->zv_volmode = volmode; 1346 zv->zv_volsize = volsize; 1347 zv->zv_volblocksize = volblocksize; 1348 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1349 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1350 struct g_provider *pp; 1351 struct g_geom *gp; 1352 1353 g_topology_lock(); 1354 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); 1355 gp->start = zvol_geom_bio_start; 1356 gp->access = zvol_geom_access; 1357 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); 1358 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; 1359 pp->sectorsize = DEV_BSIZE; 1360 pp->mediasize = 0; 1361 pp->private = zv; 1362 1363 zsg->zsg_provider = pp; 1364 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1365 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1366 struct cdev *dev; 1367 struct make_dev_args args; 1368 1369 make_dev_args_init(&args); 1370 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; 1371 args.mda_devsw = &zvol_cdevsw; 1372 args.mda_cr = NULL; 1373 args.mda_uid = UID_ROOT; 1374 args.mda_gid = GID_OPERATOR; 1375 args.mda_mode = 0640; 1376 args.mda_si_drv2 = zv; 1377 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); 1378 if (error) { 1379 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1380 kmem_free(zv, sizeof (zvol_state_t)); 
1381 return (error); 1382 } 1383 1384 dev->si_iosize_max = maxphys; 1385 zsd->zsd_cdev = dev; 1386 knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock); 1387 } 1388 (void) strlcpy(zv->zv_name, name, MAXPATHLEN); 1389 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1390 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1391 1392 *zvp = zv; 1393 return (error); 1394 } 1395 1396 /* 1397 * Remove minor node for the specified volume. 1398 */ 1399 void 1400 zvol_os_remove_minor(zvol_state_t *zv) 1401 { 1402 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1403 ASSERT0(zv->zv_open_count); 1404 ASSERT0(atomic_read(&zv->zv_suspend_ref)); 1405 ASSERT(zv->zv_flags & ZVOL_REMOVING); 1406 1407 struct zvol_state_os *zso = zv->zv_zso; 1408 zv->zv_zso = NULL; 1409 1410 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1411 struct zvol_state_geom *zsg = &zso->zso_geom; 1412 struct g_provider *pp = zsg->zsg_provider; 1413 atomic_store_ptr(&pp->private, NULL); 1414 mutex_exit(&zv->zv_state_lock); 1415 1416 g_topology_lock(); 1417 g_wither_geom(pp->geom, ENXIO); 1418 g_topology_unlock(); 1419 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1420 struct zvol_state_dev *zsd = &zso->zso_dev; 1421 struct cdev *dev = zsd->zsd_cdev; 1422 1423 if (dev != NULL) 1424 atomic_store_ptr(&dev->si_drv2, NULL); 1425 mutex_exit(&zv->zv_state_lock); 1426 1427 if (dev != NULL) { 1428 destroy_dev(dev); 1429 knlist_clear(&zsd->zsd_selinfo.si_note, 0); 1430 knlist_destroy(&zsd->zsd_selinfo.si_note); 1431 } 1432 } 1433 1434 kmem_free(zso, sizeof (struct zvol_state_os)); 1435 1436 mutex_enter(&zv->zv_state_lock); 1437 } 1438 1439 void 1440 zvol_os_free(zvol_state_t *zv) 1441 { 1442 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1443 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1444 ASSERT0(zv->zv_open_count); 1445 ASSERT0P(zv->zv_zso); 1446 1447 ASSERT0P(zv->zv_objset); 1448 ASSERT0P(zv->zv_zilog); 1449 ASSERT0P(zv->zv_dn); 1450 1451 ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); 1452 1453 
rw_destroy(&zv->zv_suspend_lock); 1454 zfs_rangelock_fini(&zv->zv_rangelock); 1455 1456 mutex_destroy(&zv->zv_state_lock); 1457 cv_destroy(&zv->zv_removing_cv); 1458 dataset_kstats_destroy(&zv->zv_kstat); 1459 kmem_free(zv, sizeof (zvol_state_t)); 1460 zvol_minors--; 1461 } 1462 1463 /* 1464 * Create a minor node (plus a whole lot more) for the specified volume. 1465 */ 1466 int 1467 zvol_os_create_minor(const char *name) 1468 { 1469 zvol_state_t *zv = NULL; 1470 objset_t *os; 1471 dmu_object_info_t *doi; 1472 uint64_t volsize; 1473 uint64_t hash, len; 1474 int error; 1475 bool replayed_zil = B_FALSE; 1476 1477 if (zvol_inhibit_dev) 1478 return (0); 1479 1480 ZFS_LOG(1, "Creating ZVOL %s...", name); 1481 hash = zvol_name_hash(name); 1482 if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { 1483 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1484 mutex_exit(&zv->zv_state_lock); 1485 return (SET_ERROR(EEXIST)); 1486 } 1487 1488 DROP_GIANT(); 1489 1490 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1491 1492 /* Lie and say we're read-only. 
*/ 1493 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1494 if (error) 1495 goto out_doi; 1496 1497 error = dmu_object_info(os, ZVOL_OBJ, doi); 1498 if (error) 1499 goto out_dmu_objset_disown; 1500 1501 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1502 if (error) 1503 goto out_dmu_objset_disown; 1504 1505 error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv); 1506 if (error || zv == NULL) 1507 goto out_dmu_objset_disown; 1508 1509 zv->zv_hash = hash; 1510 1511 if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) 1512 zv->zv_flags |= ZVOL_RDONLY; 1513 1514 zv->zv_objset = os; 1515 1516 ASSERT0P(zv->zv_kstat.dk_kstats); 1517 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1518 if (error) 1519 goto out_dmu_objset_disown; 1520 ASSERT0P(zv->zv_zilog); 1521 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1522 if (spa_writeable(dmu_objset_spa(os))) { 1523 if (zil_replay_disable) 1524 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1525 else 1526 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1527 } 1528 if (replayed_zil) 1529 zil_close(zv->zv_zilog); 1530 zv->zv_zilog = NULL; 1531 1532 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1533 if (len > 0) { 1534 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ); 1535 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1536 ZIO_PRIORITY_ASYNC_READ); 1537 } 1538 1539 zv->zv_objset = NULL; 1540 out_dmu_objset_disown: 1541 dmu_objset_disown(os, B_TRUE, FTAG); 1542 1543 if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1544 g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); 1545 /* geom was locked inside zvol_alloc() function */ 1546 g_topology_unlock(); 1547 } 1548 out_doi: 1549 kmem_free(doi, sizeof (dmu_object_info_t)); 1550 if (error == 0 && zv) { 1551 rw_enter(&zvol_state_lock, RW_WRITER); 1552 zvol_insert(zv); 1553 zvol_minors++; 1554 rw_exit(&zvol_state_lock); 1555 ZFS_LOG(1, "ZVOL 
%s created.", name); 1556 } 1557 PICKUP_GIANT(); 1558 return (error); 1559 } 1560 1561 int 1562 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 1563 { 1564 zv->zv_volsize = volsize; 1565 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { 1566 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; 1567 struct g_provider *pp = zsg->zsg_provider; 1568 1569 g_topology_lock(); 1570 1571 if (pp->private == NULL) { 1572 g_topology_unlock(); 1573 return (SET_ERROR(ENXIO)); 1574 } 1575 1576 /* 1577 * Do not invoke resize event when initial size was zero. 1578 * ZVOL initializes the size on first open, this is not 1579 * real resizing. 1580 */ 1581 if (pp->mediasize == 0) 1582 pp->mediasize = zv->zv_volsize; 1583 else 1584 g_resize_provider(pp, zv->zv_volsize); 1585 1586 g_topology_unlock(); 1587 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { 1588 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; 1589 1590 KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB); 1591 } 1592 return (0); 1593 } 1594 1595 void 1596 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1597 { 1598 /* 1599 * The ro/rw ZVOL mode is switched using zvol_set_ro() function by 1600 * enabling/disabling ZVOL_RDONLY flag. No additional FreeBSD-specific 1601 * actions are required for readonly zfs property switching. 1602 */ 1603 } 1604 1605 void 1606 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1607 { 1608 /* 1609 * The ZVOL size/capacity is changed by zvol_set_volsize() function. 1610 * Leave this method empty, because all required job is doing by 1611 * zvol_os_update_volsize() platform-specific function. 1612 */ 1613 } 1614 1615 /* 1616 * Public interfaces 1617 */ 1618 1619 int 1620 zvol_busy(void) 1621 { 1622 return (zvol_minors != 0); 1623 } 1624 1625 int 1626 zvol_init(void) 1627 { 1628 return (zvol_init_impl()); 1629 } 1630 1631 void 1632 zvol_fini(void) 1633 { 1634 zvol_fini_impl(); 1635 } 1636