/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/dsk/<pool_name>/<dataset_name>
 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the /dev filesystem (sdev_zvolops.c).
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/modctl.h>
#include <sys/open.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/byteorder.h>
#include <sys/pathname.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/crc32.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/mkdev.h>
#include <sys/zil.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/dkioc_free_util.h>

#include "zfs_namecheck.h"

void *zfsdev_state;
static char *zvol_tag = "zvol_tag";

#define ZVOL_DUMPSIZE           "dumpsize"

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
kmutex_t zfsdev_state_lock;
static uint32_t zvol_minors;

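/*
 * Extents map a dump volume's blocks to physically contiguous runs on the
 * underlying vdevs.  A dumpified zvol must remain usable while the system
 * is panicking, when the DMU cannot be called, so dump I/O goes straight
 * to these preallocated extents (see zvol_get_lbas() and zvol_dumpio()).
 */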
typedef struct zvol_extent {
        list_node_t     ze_node;
        dva_t           ze_dva;         /* dva associated with this extent */
        uint64_t        ze_nblks;       /* number of blocks in extent */
} zvol_extent_t;

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
        char            zv_name[MAXPATHLEN]; /* pool/dd name */
        uint64_t        zv_volsize;     /* amount of space we advertise */
        uint64_t        zv_volblocksize; /* volume block size */
        minor_t         zv_minor;       /* minor number */
        uint8_t         zv_min_bs;      /* minimum addressable block shift */
        uint8_t         zv_flags;       /* readonly, dumpified, etc. */
        objset_t        *zv_objset;     /* objset handle */
        uint32_t        zv_open_count[OTYPCNT]; /* open counts */
        uint32_t        zv_total_opens; /* total open count */
        zilog_t         *zv_zilog;      /* ZIL handle */
        list_t          zv_extents;     /* List of extents for dump */
        znode_t         zv_znode;       /* for range locking */
        dmu_buf_t       *zv_dbuf;       /* bonus handle */
} zvol_state_t;

/*
 * zvol specific flags
 */
#define ZVOL_RDONLY     0x1
#define ZVOL_DUMPIFIED  0x2
#define ZVOL_EXCL       0x4
#define ZVOL_WCE        0x8

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

/*
 * If true, unmaps requested as synchronous are executed synchronously,
 * otherwise all unmaps are asynchronous.
 */
boolean_t zvol_unmap_sync_enabled = B_FALSE;

extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);

static void
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
        dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);

        zv->zv_volsize = volsize;
        VERIFY(ddi_prop_update_int64(dev, zfs_dip,
            "Size", volsize) == DDI_SUCCESS);
        VERIFY(ddi_prop_update_int64(dev, zfs_dip,
            "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

        /* Notify specfs to invalidate the cached size */
        spec_size_invalidate(dev, VBLK);
        spec_size_invalidate(dev, VCHR);
}

int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
        if (volsize == 0)
                return (SET_ERROR(EINVAL));

        if (volsize % blocksize != 0)
                return (SET_ERROR(EINVAL));

#ifdef _ILP32
        if (volsize - 1 > SPEC_MAXOFFSET_T)
                return (SET_ERROR(EOVERFLOW));
#endif
        return (0);
}

int
zvol_check_volblocksize(uint64_t volblocksize)
{
        if (volblocksize < SPA_MINBLOCKSIZE ||
            volblocksize > SPA_OLD_MAXBLOCKSIZE ||
            !ISP2(volblocksize))
                return (SET_ERROR(EDOM));

        return (0);
}

int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
        int error;
        dmu_object_info_t doi;
        uint64_t val;

        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
        if (error)
                return (error);

        dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

        error = dmu_object_info(os, ZVOL_OBJ, &doi);

        if (error == 0) {
                dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
                    doi.doi_data_block_size);
        }

        return (error);
}

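/*
 * Find the in-core state for a volume by name by scanning the allocated
 * minor numbers.  The caller must hold zfsdev_state_lock.
 */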
static zvol_state_t *
zvol_minor_lookup(const char *name)
{
        minor_t minor;
        zvol_state_t *zv;

        ASSERT(MUTEX_HELD(&zfsdev_state_lock));

        for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
                zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
                if (zv == NULL)
                        continue;
                if (strcmp(zv->zv_name, name) == 0)
                        return (zv);
        }

        return (NULL);
}

/* extent mapping arg */
struct maparg {
        zvol_state_t    *ma_zv;
        uint64_t        ma_blks;
};

/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
        struct maparg *ma = arg;
        zvol_extent_t *ze;
        int bs = ma->ma_zv->zv_volblocksize;

        if (bp == NULL || BP_IS_HOLE(bp) ||
            zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
                return (0);

        VERIFY(!BP_IS_EMBEDDED(bp));

        VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
        ma->ma_blks++;

        /* Abort immediately if we have encountered gang blocks */
        if (BP_IS_GANG(bp))
                return (SET_ERROR(EFRAGS));

        /*
         * See if the block is at the end of the previous extent.
         */
        ze = list_tail(&ma->ma_zv->zv_extents);
        if (ze &&
            DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
            DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
            DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
                ze->ze_nblks++;
                return (0);
        }

        dprintf_bp(bp, "%s", "next blkptr:");

        /* start a new extent */
        ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
        ze->ze_dva = bp->blk_dva[0];      /* structure assignment */
        ze->ze_nblks = 1;
        list_insert_tail(&ma->ma_zv->zv_extents, ze);
        return (0);
}

static void
zvol_free_extents(zvol_state_t *zv)
{
        zvol_extent_t *ze;

        while ((ze = list_head(&zv->zv_extents)) != NULL) {
                list_remove(&zv->zv_extents, ze);
                kmem_free(ze, sizeof (zvol_extent_t));
        }
}

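/*
 * Build the extent list for a dump volume.  Every logical block of the
 * volume must be covered; if the traversal fails, or maps fewer blocks
 * than the volume advertises, the partial list is discarded.
 */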
static int
zvol_get_lbas(zvol_state_t *zv)
{
        objset_t *os = zv->zv_objset;
        struct maparg   ma;
        int             err;

        ma.ma_zv = zv;
        ma.ma_blks = 0;
        zvol_free_extents(zv);

        /* commit any in-flight changes before traversing the dataset */
        txg_wait_synced(dmu_objset_pool(os), 0);
        err = traverse_dataset(dmu_objset_ds(os), 0,
            TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
        if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
                zvol_free_extents(zv);
                return (err ? err : EIO);
        }

        return (0);
}

/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
        zfs_creat_t *zct = arg;
        nvlist_t *nvprops = zct->zct_props;
        int error;
        uint64_t volblocksize, volsize;

        VERIFY(nvlist_lookup_uint64(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
        if (nvlist_lookup_uint64(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
                volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

        /*
         * These properties must be removed from the list so the generic
         * property setting step won't apply to them.
         */
        VERIFY(nvlist_remove_all(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
        (void) nvlist_remove_all(nvprops,
            zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

        error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
        ASSERT(error == 0);
}

/*
 * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 * implement DKIOCFREE/free-long-range.
 */
static int
zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
        zvol_state_t *zv = arg1;
        lr_truncate_t *lr = arg2;
        uint64_t offset, length;

        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));

        offset = lr->lr_offset;
        length = lr->lr_length;

        return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
        zvol_state_t *zv = arg1;
        lr_write_t *lr = arg2;
        objset_t *os = zv->zv_objset;
        char *data = (char *)(lr + 1);  /* data follows lr_write_t */
        uint64_t offset, length;
        dmu_tx_t *tx;
        int error;

        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));

        offset = lr->lr_offset;
        length = lr->lr_length;

        /* If it's a dmu_sync() block, write the whole block */
        if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
                uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
                if (length < blocksize) {
                        offset -= offset % blocksize;
                        length = blocksize;
                }
        }

        tx = dmu_tx_create(os);
        dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
        } else {
                dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
                dmu_tx_commit(tx);
        }

        return (error);
}

/* ARGSUSED */
static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
        return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
        zvol_replay_err,        /* 0 no such transaction type */
        zvol_replay_err,        /* TX_CREATE */
        zvol_replay_err,        /* TX_MKDIR */
        zvol_replay_err,        /* TX_MKXATTR */
        zvol_replay_err,        /* TX_SYMLINK */
        zvol_replay_err,        /* TX_REMOVE */
        zvol_replay_err,        /* TX_RMDIR */
        zvol_replay_err,        /* TX_LINK */
        zvol_replay_err,        /* TX_RENAME */
        zvol_replay_write,      /* TX_WRITE */
        zvol_replay_truncate,   /* TX_TRUNCATE */
        zvol_replay_err,        /* TX_SETATTR */
        zvol_replay_err,        /* TX_ACL */
        zvol_replay_err,        /* TX_CREATE_ACL */
        zvol_replay_err,        /* TX_CREATE_ATTR */
        zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL */
        zvol_replay_err,        /* TX_MKDIR_ATTR */
        zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
        zvol_replay_err,        /* TX_WRITE2 */
};

int
zvol_name2minor(const char *name, minor_t *minor)
{
        zvol_state_t *zv;

        mutex_enter(&zfsdev_state_lock);
        zv = zvol_minor_lookup(name);
        if (minor && zv)
                *minor = zv->zv_minor;
        mutex_exit(&zfsdev_state_lock);
        return (zv ? 0 : -1);
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_create_minor(const char *name)
{
        zfs_soft_state_t *zs;
        zvol_state_t *zv;
        objset_t *os;
        dmu_object_info_t doi;
        minor_t minor = 0;
        char chrbuf[30], blkbuf[30];
        int error;

        mutex_enter(&zfsdev_state_lock);

        if (zvol_minor_lookup(name) != NULL) {
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(EEXIST));
        }

        /* lie and say we're read-only */
        error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);

        if (error) {
                mutex_exit(&zfsdev_state_lock);
                return (error);
        }

        if ((minor = zfsdev_minor_alloc()) == 0) {
                dmu_objset_disown(os, FTAG);
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(ENXIO));
        }

        if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
                dmu_objset_disown(os, FTAG);
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(EAGAIN));
        }
        (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
            (char *)name);

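        /*
         * Create the "<minor>,raw" (character) and "<minor>" (block) minor
         * nodes; sdev (sdev_zvolops.c) presents these to userland as the
         * /dev/zvol/rdsk and /dev/zvol/dsk links described at the top of
         * this file.
         */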
        (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

        if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
            minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
                ddi_soft_state_free(zfsdev_state, minor);
                dmu_objset_disown(os, FTAG);
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(EAGAIN));
        }

        (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

        if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
            minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
                ddi_remove_minor_node(zfs_dip, chrbuf);
                ddi_soft_state_free(zfsdev_state, minor);
                dmu_objset_disown(os, FTAG);
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(EAGAIN));
        }

        zs = ddi_get_soft_state(zfsdev_state, minor);
        zs->zss_type = ZSST_ZVOL;
        zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
        (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
        zv->zv_min_bs = DEV_BSHIFT;
        zv->zv_minor = minor;
        zv->zv_objset = os;
        if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
                zv->zv_flags |= ZVOL_RDONLY;
        mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
        avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
            sizeof (rl_t), offsetof(rl_t, r_node));
        list_create(&zv->zv_extents, sizeof (zvol_extent_t),
            offsetof(zvol_extent_t, ze_node));
        /* get and cache the blocksize */
        error = dmu_object_info(os, ZVOL_OBJ, &doi);
        ASSERT(error == 0);
        zv->zv_volblocksize = doi.doi_data_block_size;

        if (spa_writeable(dmu_objset_spa(os))) {
                if (zil_replay_disable)
                        zil_destroy(dmu_objset_zil(os), B_FALSE);
                else
                        zil_replay(os, zv, zvol_replay_vector);
        }
        dmu_objset_disown(os, FTAG);
        zv->zv_objset = NULL;

        zvol_minors++;

        mutex_exit(&zfsdev_state_lock);

        return (0);
}

/*
 * Remove minor node for the specified volume.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
        char nmbuf[20];
        minor_t minor = zv->zv_minor;

        ASSERT(MUTEX_HELD(&zfsdev_state_lock));
        if (zv->zv_total_opens != 0)
                return (SET_ERROR(EBUSY));

        (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
        ddi_remove_minor_node(zfs_dip, nmbuf);

        (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
        ddi_remove_minor_node(zfs_dip, nmbuf);

        avl_destroy(&zv->zv_znode.z_range_avl);
        mutex_destroy(&zv->zv_znode.z_range_lock);

        kmem_free(zv, sizeof (zvol_state_t));

        ddi_soft_state_free(zfsdev_state, minor);

        zvol_minors--;
        return (0);
}

int
zvol_remove_minor(const char *name)
{
        zvol_state_t *zv;
        int rc;

        mutex_enter(&zfsdev_state_lock);
        if ((zv = zvol_minor_lookup(name)) == NULL) {
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(ENXIO));
        }
        rc = zvol_remove_zv(zv);
        mutex_exit(&zfsdev_state_lock);
        return (rc);
}

int
zvol_first_open(zvol_state_t *zv)
{
        objset_t *os;
        uint64_t volsize;
        int error;
        uint64_t readonly;

        /* lie and say we're read-only */
        error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
            zvol_tag, &os);
        if (error)
                return (error);

        zv->zv_objset = os;
        error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
        if (error) {
                ASSERT(error == 0);
                dmu_objset_disown(os, zvol_tag);
                return (error);
        }

        error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
        if (error) {
                dmu_objset_disown(os, zvol_tag);
                return (error);
        }

        zvol_size_changed(zv, volsize);
        zv->zv_zilog = zil_open(os, zvol_get_data);

        VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
            NULL) == 0);
        if (readonly || dmu_objset_is_snapshot(os) ||
            !spa_writeable(dmu_objset_spa(os)))
                zv->zv_flags |= ZVOL_RDONLY;
        else
                zv->zv_flags &= ~ZVOL_RDONLY;
        return (error);
}

void
zvol_last_close(zvol_state_t *zv)
{
        zil_close(zv->zv_zilog);
        zv->zv_zilog = NULL;

        dmu_buf_rele(zv->zv_dbuf, zvol_tag);
        zv->zv_dbuf = NULL;

        /*
         * Evict cached data
         */
        if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
            !(zv->zv_flags & ZVOL_RDONLY))
                txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
        dmu_objset_evict_dbufs(zv->zv_objset);

        dmu_objset_disown(zv->zv_objset, zvol_tag);
        zv->zv_objset = NULL;
}

int
zvol_prealloc(zvol_state_t *zv)
{
        objset_t *os = zv->zv_objset;
        dmu_tx_t *tx;
        uint64_t refd, avail, usedobjs, availobjs;
        uint64_t resid = zv->zv_volsize;
        uint64_t off = 0;

        /* Check the space usage before attempting to allocate the space */
        dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
        if (avail < zv->zv_volsize)
                return (SET_ERROR(ENOSPC));

        /* Free old extents if they exist */
        zvol_free_extents(zv);

        while (resid != 0) {
                int error;
                uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);

                tx = dmu_tx_create(os);
                dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
                error = dmu_tx_assign(tx, TXG_WAIT);
                if (error) {
                        dmu_tx_abort(tx);
                        (void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
                        return (error);
                }
                dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
                dmu_tx_commit(tx);
                off += bytes;
                resid -= bytes;
        }
        txg_wait_synced(dmu_objset_pool(os), 0);

        return (0);
}

static int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
        dmu_tx_t *tx;
        int error;

        ASSERT(MUTEX_HELD(&zfsdev_state_lock));

        tx = dmu_tx_create(os);
        dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
        dmu_tx_mark_netfree(tx);
        error = dmu_tx_assign(tx, TXG_WAIT);
        if (error) {
                dmu_tx_abort(tx);
                return (error);
        }

        error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
            &volsize, tx);
        dmu_tx_commit(tx);

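        /* If the volume shrank, free everything beyond the new end. */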
        if (error == 0)
                error = dmu_free_long_range(os,
                    ZVOL_OBJ, volsize, DMU_OBJECT_END);
        return (error);
}

void
zvol_remove_minors(const char *name)
{
        zvol_state_t *zv;
        char *namebuf;
        minor_t minor;

        namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
        (void) strncpy(namebuf, name, strlen(name));
        (void) strcat(namebuf, "/");
        mutex_enter(&zfsdev_state_lock);
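        /* Remove minors for all descendants of "name" ("name/..."). */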
        for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {

                zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
                if (zv == NULL)
                        continue;
                if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
                        (void) zvol_remove_zv(zv);
        }
        kmem_free(namebuf, strlen(name) + 2);

        mutex_exit(&zfsdev_state_lock);
}

static int
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
        uint64_t old_volsize = 0ULL;
        int error = 0;

        ASSERT(MUTEX_HELD(&zfsdev_state_lock));

        /*
         * Reinitialize the dump area to the new size.  If we fail to
         * resize the dump area, restore it to its original size.  We must
         * set the new volsize before calling dumpvp_resize() so that the
         * device's new size(9P) is visible to the dump subsystem.
         */
        old_volsize = zv->zv_volsize;
        zvol_size_changed(zv, volsize);

        if (zv->zv_flags & ZVOL_DUMPIFIED) {
                if ((error = zvol_dumpify(zv)) != 0 ||
                    (error = dumpvp_resize()) != 0) {
                        int dumpify_error;

                        (void) zvol_update_volsize(zv->zv_objset, old_volsize);
                        zvol_size_changed(zv, old_volsize);
                        dumpify_error = zvol_dumpify(zv);
                        error = dumpify_error ? dumpify_error : error;
                }
        }

        /*
         * Generate a LUN expansion event.
         */
        if (error == 0) {
                sysevent_id_t eid;
                nvlist_t *attr;
                char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

                (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
                    zv->zv_minor);

                VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
                VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

                (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
                    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

                nvlist_free(attr);
                kmem_free(physpath, MAXPATHLEN);
        }
        return (error);
}

int
zvol_set_volsize(const char *name, uint64_t volsize)
{
        zvol_state_t *zv = NULL;
        objset_t *os;
        int error;
        dmu_object_info_t doi;
        uint64_t readonly;
        boolean_t owned = B_FALSE;

        error = dsl_prop_get_integer(name,
            zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
        if (error != 0)
                return (error);
        if (readonly)
                return (SET_ERROR(EROFS));

        mutex_enter(&zfsdev_state_lock);
        zv = zvol_minor_lookup(name);

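        /*
         * If there is no minor (or it has no open objset), temporarily own
         * the objset ourselves so the on-disk size can still be updated.
         */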
        if (zv == NULL || zv->zv_objset == NULL) {
                if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
                    FTAG, &os)) != 0) {
                        mutex_exit(&zfsdev_state_lock);
                        return (error);
                }
                owned = B_TRUE;
                if (zv != NULL)
                        zv->zv_objset = os;
        } else {
                os = zv->zv_objset;
        }

        if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
            (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
                goto out;

        error = zvol_update_volsize(os, volsize);

        if (error == 0 && zv != NULL)
                error = zvol_update_live_volsize(zv, volsize);
out:
        if (owned) {
                dmu_objset_disown(os, FTAG);
                if (zv != NULL)
                        zv->zv_objset = NULL;
        }
        mutex_exit(&zfsdev_state_lock);
        return (error);
}

/*ARGSUSED*/
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
        zvol_state_t *zv;
        int err = 0;

        mutex_enter(&zfsdev_state_lock);

        zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
        if (zv == NULL) {
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(ENXIO));
        }

        if (zv->zv_total_opens == 0)
                err = zvol_first_open(zv);
        if (err) {
                mutex_exit(&zfsdev_state_lock);
                return (err);
        }
        if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
                err = SET_ERROR(EROFS);
                goto out;
        }
        if (zv->zv_flags & ZVOL_EXCL) {
                err = SET_ERROR(EBUSY);
                goto out;
        }
        if (flag & FEXCL) {
                if (zv->zv_total_opens != 0) {
                        err = SET_ERROR(EBUSY);
                        goto out;
                }
                zv->zv_flags |= ZVOL_EXCL;
        }

        if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
                zv->zv_open_count[otyp]++;
                zv->zv_total_opens++;
        }
        mutex_exit(&zfsdev_state_lock);

        return (err);
out:
        if (zv->zv_total_opens == 0)
                zvol_last_close(zv);
        mutex_exit(&zfsdev_state_lock);
        return (err);
}

/*ARGSUSED*/
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
        minor_t minor = getminor(dev);
        zvol_state_t *zv;
        int error = 0;

        mutex_enter(&zfsdev_state_lock);

        zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
        if (zv == NULL) {
                mutex_exit(&zfsdev_state_lock);
                return (SET_ERROR(ENXIO));
        }

        if (zv->zv_flags & ZVOL_EXCL) {
                ASSERT(zv->zv_total_opens == 1);
                zv->zv_flags &= ~ZVOL_EXCL;
        }

        /*
         * If the open count is zero, this is a spurious close.
         * That indicates a bug in the kernel / DDI framework.
         */
        ASSERT(zv->zv_open_count[otyp] != 0);
        ASSERT(zv->zv_total_opens != 0);

        /*
         * You may get multiple opens, but only one close.
         */
        zv->zv_open_count[otyp]--;
        zv->zv_total_opens--;

        if (zv->zv_total_opens == 0)
                zvol_last_close(zv);

        mutex_exit(&zfsdev_state_lock);
        return (error);
}

/* ARGSUSED */
static void
zvol_get_done(zgd_t *zgd, int error)
{
        if (zgd->zgd_db)
                dmu_buf_rele(zgd->zgd_db, zgd);

        zfs_range_unlock(zgd->zgd_rl);

        kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
        zvol_state_t *zv = arg;
        objset_t *os = zv->zv_objset;
        uint64_t object = ZVOL_OBJ;
        uint64_t offset = lr->lr_offset;
        uint64_t size = lr->lr_length;       /* length of user data */
        dmu_buf_t *db;
        zgd_t *zgd;
        int error;

        ASSERT3P(lwb, !=, NULL);
        ASSERT3P(zio, !=, NULL);
        ASSERT3U(size, !=, 0);

        zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
        zgd->zgd_lwb = lwb;

        /*
         * Write records come in two flavors: immediate and indirect.
         * For small writes it's cheaper to store the data with the
         * log record (immediate); for large writes it's cheaper to
         * sync the data and get a pointer to it (indirect) so that
         * we don't have to write the data twice.
         */
        if (buf != NULL) { /* immediate write */
                zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
                    RL_READER);
                error = dmu_read(os, object, offset, size, buf,
                    DMU_READ_NO_PREFETCH);
        } else { /* indirect write */
                /*
                 * Have to lock the whole block to ensure when it's written out
                 * and its checksum is being calculated that no one can change
                 * the data.  Unlike zfs_get_data(), we need not re-check the
                 * blocksize after we get the lock because it cannot change
                 * for a zvol.
                 */
                size = zv->zv_volblocksize;
                offset = P2ALIGN(offset, size);
                zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
                    RL_READER);
                error = dmu_buf_hold(os, object, offset, zgd, &db,
                    DMU_READ_NO_PREFETCH);
                if (error == 0) {
                        blkptr_t *bp = &lr->lr_blkptr;

                        zgd->zgd_db = db;
                        zgd->zgd_bp = bp;

                        ASSERT(db->db_offset == offset);
                        ASSERT(db->db_size == size);

                        error = dmu_sync(zio, lr->lr_common.lrc_txg,
                            zvol_get_done, zgd);

                        if (error == 0)
                                return (0);
                }
        }

        zvol_get_done(zgd, error);

        return (error);
}

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
        uint32_t blocksize = zv->zv_volblocksize;
        zilog_t *zilog = zv->zv_zilog;
        spa_t *spa = zilog->zl_spa;
        spa_meta_placement_t *mp = &spa->spa_meta_policy;
        boolean_t slogging, zil_to_special, write_to_special;
        ssize_t immediate_write_sz;
        itx_wr_state_t write_state;

        if (zil_replaying(zilog, tx))
                return;

        /*
         * See comments in zfs_log_write()
         */

        immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
            ? 0 : zvol_immediate_write_sz;

        zil_to_special = !spa_has_slogs(spa) &&
            spa_can_special_be_used(spa) &&
            mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;

        write_to_special = !spa_has_slogs(spa) &&
            spa_write_data_to_special(spa, zilog->zl_os) &&
            (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
            (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
            spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));

        slogging = (spa_has_slogs(spa) || zil_to_special) &&
            (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

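        /*
         * Pick how each record's data will reach the log: WR_INDIRECT
         * syncs the data in place and logs a block pointer, WR_COPIED
         * embeds the data in the itx now, and WR_NEED_COPY defers the
         * copy until the itx is committed.
         */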
        if (blocksize > immediate_write_sz && !slogging &&
            resid >= blocksize && off % blocksize == 0)
                write_state = WR_INDIRECT;
        else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
                write_state = WR_INDIRECT;
        else if (!spa_has_slogs(zilog->zl_spa) &&
            resid >= blocksize && blocksize > zvol_immediate_write_sz)
                write_state = WR_INDIRECT;
        else if (write_to_special)
                write_state = WR_INDIRECT;
        else if (sync)
                write_state = WR_COPIED;
        else
                write_state = WR_NEED_COPY;

        while (resid) {
                itx_t *itx;
                lr_write_t *lr;
                itx_wr_state_t wr_state = write_state;
                ssize_t len = resid;

                if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
                        wr_state = WR_NEED_COPY;
                else if (wr_state == WR_INDIRECT)
                        len = MIN(blocksize - P2PHASE(off, blocksize), resid);

                itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
                    (wr_state == WR_COPIED ? len : 0));
                lr = (lr_write_t *)&itx->itx_lr;
                if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
                    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
                        zil_itx_destroy(itx);
                        itx = zil_itx_create(TX_WRITE, sizeof (*lr));
                        lr = (lr_write_t *)&itx->itx_lr;
                        wr_state = WR_NEED_COPY;
                }

                itx->itx_wr_state = wr_state;
                lr->lr_foid = ZVOL_OBJ;
                lr->lr_offset = off;
                lr->lr_length = len;
                lr->lr_blkoff = 0;
                BP_ZERO(&lr->lr_blkptr);

                itx->itx_private = zv;
                itx->itx_sync = sync;

                zil_itx_assign(zilog, itx, tx);

                off += len;
                resid -= len;
        }
}

static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
        vdev_disk_t *dvd;
        int c, rc;
        int numerrors = 0;

        if (vd->vdev_ops == &vdev_mirror_ops ||
            vd->vdev_ops == &vdev_replacing_ops ||
            vd->vdev_ops == &vdev_spare_ops) {
                for (c = 0; c < vd->vdev_children; c++) {
                        int err = zvol_dumpio_vdev(vd->vdev_child[c],
                            addr, offset, origoffset, size, doread, isdump);
                        if (err != 0) {
                                numerrors++;
                        } else if (doread) {
                                break;
                        }
                }
        }

        if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
                return (numerrors < vd->vdev_children ? 0 : EIO);

        if (doread && !vdev_readable(vd))
                return (SET_ERROR(EIO));
        else if (!doread && !vdev_writeable(vd))
                return (SET_ERROR(EIO));

        if (vd->vdev_ops == &vdev_raidz_ops) {
                return (vdev_raidz_physio(vd,
                    addr, size, offset, origoffset, doread, isdump));
        }

        offset += VDEV_LABEL_START_SIZE;

        rw_enter(&vd->vdev_tsd_lock, RW_READER);
        dvd = vd->vdev_tsd;
        if (ddi_in_panic() || isdump) {
                ASSERT(!doread);
                if (doread) {
                        rw_exit(&vd->vdev_tsd_lock);
                        return (SET_ERROR(EIO));
                }
                /* We assume here dvd is not NULL */
                ASSERT3P(dvd, !=, NULL);

                /* If our assumption is wrong, we do not want to crash */
                if (dvd != NULL && dvd->vd_lh != NULL) {
                        rc = ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
                            lbtodb(size));
                } else {
                        rc = SET_ERROR(ENXIO);
                }
        } else {
                /* We assume here dvd is not NULL */
                ASSERT3P(dvd, !=, NULL);

                /* If our assumption is wrong, we do not want to crash */
                if (dvd != NULL && dvd->vd_lh != NULL) {
                        rc = vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
                            offset, doread ? B_READ : B_WRITE);
                } else {
                        rc = SET_ERROR(ENXIO);
                }
        }
        rw_exit(&vd->vdev_tsd_lock);
        return (rc);
}

static int
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
        vdev_t *vd;
        int error;
        zvol_extent_t *ze;
        spa_t *spa = dmu_objset_spa(zv->zv_objset);

        /* Must be sector aligned, and not straddle a block boundary. */
        if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
            P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
                return (SET_ERROR(EINVAL));
        }
        ASSERT(size <= zv->zv_volblocksize);

        /* Locate the extent this belongs to */
        ze = list_head(&zv->zv_extents);
        while (ze != NULL && offset >= ze->ze_nblks * zv->zv_volblocksize) {
                offset -= ze->ze_nblks * zv->zv_volblocksize;
                ze = list_next(&zv->zv_extents, ze);
        }

        if (ze == NULL)
                return (SET_ERROR(EINVAL));

        if (!ddi_in_panic())
                spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

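        /*
         * Translate the remaining logical offset into a physical offset
         * on the top-level vdev named by the extent's DVA.
         */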
        vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
        offset += DVA_GET_OFFSET(&ze->ze_dva);
        error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
            size, doread, isdump);

        if (!ddi_in_panic())
                spa_config_exit(spa, SCL_STATE, FTAG);

        return (error);
}

int
zvol_strategy(buf_t *bp)
{
        zfs_soft_state_t *zs = NULL;
        zvol_state_t *zv;
        uint64_t off, volsize;
        size_t resid;
        char *addr;
        objset_t *os;
        rl_t *rl;
        int error = 0;
        boolean_t doread = bp->b_flags & B_READ;
        boolean_t is_dumpified;
        boolean_t sync;

        if (getminor(bp->b_edev) == 0) {
                error = SET_ERROR(EINVAL);
        } else {
                zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
                if (zs == NULL)
                        error = SET_ERROR(ENXIO);
                else if (zs->zss_type != ZSST_ZVOL)
                        error = SET_ERROR(EINVAL);
        }

        if (error) {
                bioerror(bp, error);
                biodone(bp);
                return (0);
        }

        zv = zs->zss_data;

        if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
                bioerror(bp, EROFS);
                biodone(bp);
                return (0);
        }

        off = ldbtob(bp->b_blkno);
        volsize = zv->zv_volsize;

        os = zv->zv_objset;
        ASSERT(os != NULL);

        bp_mapin(bp);
        addr = bp->b_un.b_addr;
        resid = bp->b_bcount;

        if (resid > 0 && (off < 0 || off >= volsize)) {
                bioerror(bp, EIO);
                biodone(bp);
                return (0);
        }

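        /*
         * The write must reach stable storage before biodone() only when
         * the buffer is not B_ASYNC and the write cache is disabled, or
         * when the dataset is sync=always; reads and dump I/O never wait.
         */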
        is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
        sync = ((!(bp->b_flags & B_ASYNC) &&
            !(zv->zv_flags & ZVOL_WCE)) ||
            (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
            !doread && !is_dumpified;

        /*
         * There must be no buffer changes when doing a dmu_sync() because
         * we can't change the data whilst calculating the checksum.
         */
        rl = zfs_range_lock(&zv->zv_znode, off, resid,
            doread ? RL_READER : RL_WRITER);

        while (resid != 0 && off < volsize) {
                size_t size = MIN(resid, zvol_maxphys);
                if (is_dumpified) {
                        size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
                        error = zvol_dumpio(zv, addr, off, size,
                            doread, B_FALSE);
                } else if (doread) {
                        error = dmu_read(os, ZVOL_OBJ, off, size, addr,
                            DMU_READ_PREFETCH);
                } else {
                        dmu_tx_t *tx = dmu_tx_create(os);
                        dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
                        error = dmu_tx_assign(tx, TXG_WAIT);
                        if (error) {
                                dmu_tx_abort(tx);
                        } else {
                                dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
                                zvol_log_write(zv, tx, off, size, sync);
                                dmu_tx_commit(tx);
                        }
                }
                if (error) {
                        /* convert checksum errors into IO errors */
                        if (error == ECKSUM)
                                error = SET_ERROR(EIO);
                        break;
                }
                off += size;
                addr += size;
                resid -= size;
        }
        zfs_range_unlock(rl);

        if ((bp->b_resid = resid) == bp->b_bcount)
                bioerror(bp, off > volsize ? EINVAL : error);

        if (sync)
                zil_commit(zv->zv_zilog, ZVOL_OBJ);
        biodone(bp);

        return (0);
}

/*
 * Set the buffer count to the zvol maximum transfer.
 * Using our own routine instead of the default minphys()
 * means that for larger writes we write bigger buffers on X86
 * (128K instead of 56K) and flush the disk write cache less often
 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
 * 56K on X86 and 128K on sparc).
 */
void
zvol_minphys(struct buf *bp)
{
        if (bp->b_bcount > zvol_maxphys)
                bp->b_bcount = zvol_maxphys;
}

int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
        minor_t minor = getminor(dev);
        zvol_state_t *zv;
        int error = 0;
        uint64_t size;
        uint64_t boff;
        uint64_t resid;

        zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
        if (zv == NULL)
                return (SET_ERROR(ENXIO));

        if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
                return (SET_ERROR(EINVAL));

        boff = ldbtob(blkno);
        resid = ldbtob(nblocks);

        VERIFY3U(boff + resid, <=, zv->zv_volsize);

        while (resid) {
                size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
                error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
                if (error)
                        break;
                boff += size;
                addr += size;
                resid -= size;
        }

        return (error);
}

/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
        minor_t minor = getminor(dev);
        zvol_state_t *zv;
        uint64_t volsize;
        rl_t *rl;
        int error = 0;

        zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
        if (zv == NULL)
                return (SET_ERROR(ENXIO));

        volsize = zv->zv_volsize;
        if (uio->uio_resid > 0 &&
            (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
                return (SET_ERROR(EIO));

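        /*
         * Dumpified volumes bypass the DMU: physio() drives zvol_strategy(),
         * which routes the I/O through the preallocated extents.
         */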
1446         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1447                 error = physio(zvol_strategy, NULL, dev, B_READ,
1448                     zvol_minphys, uio);
1449                 return (error);
1450         }
1451 
1452         rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1453             RL_READER);
1454         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1455                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1456 
1457                 /* don't read past the end */
1458                 if (bytes > volsize - uio->uio_loffset)
1459                         bytes = volsize - uio->uio_loffset;
1460 
1461                 error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1462                 if (error) {
1463                         /* convert checksum errors into IO errors */
1464                         if (error == ECKSUM)
1465                                 error = SET_ERROR(EIO);
1466                         break;
1467                 }
1468         }
1469         zfs_range_unlock(rl);
1470         return (error);
1471 }
1472 
1473 /*ARGSUSED*/
1474 int
1475 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1476 {
1477         minor_t minor = getminor(dev);
1478         zvol_state_t *zv;
1479         uint64_t volsize;
1480         rl_t *rl;
1481         int error = 0;
1482         boolean_t sync;
1483 
1484         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1485         if (zv == NULL)
1486                 return (SET_ERROR(ENXIO));
1487 
1488         volsize = zv->zv_volsize;
1489         if (uio->uio_resid > 0 &&
1490             (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1491                 return (SET_ERROR(EIO));
1492 
1493         if (zv->zv_flags & ZVOL_DUMPIFIED) {
1494                 error = physio(zvol_strategy, NULL, dev, B_WRITE,
1495                     zvol_minphys, uio);
1496                 return (error);
1497         }
1498 
1499         sync = !(zv->zv_flags & ZVOL_WCE) ||
1500             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1501 
1502         rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1503             RL_WRITER);
1504         while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1505                 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1506                 uint64_t off = uio->uio_loffset;
1507                 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1508 
1509                 if (bytes > volsize - off)   /* don't write past the end */
1510                         bytes = volsize - off;
1511 
1512                 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1513                 error = dmu_tx_assign(tx, TXG_WAIT);
1514                 if (error) {
1515                         dmu_tx_abort(tx);
1516                         break;
1517                 }
1518                 error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1519                 if (error == 0)
1520                         zvol_log_write(zv, tx, off, bytes, sync);
1521                 dmu_tx_commit(tx);
1522 
1523                 if (error)
1524                         break;
1525         }
1526         zfs_range_unlock(rl);
1527         if (sync)
1528                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1529         return (error);
1530 }
1531 
1532 int
1533 zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1534 {
1535         struct uuid uuid = EFI_RESERVED;
1536         efi_gpe_t gpe = { 0 };
1537         uint32_t crc;
1538         dk_efi_t efi;
1539         int length;
1540         char *ptr;
1541 
1542         if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1543                 return (SET_ERROR(EFAULT));
1544         ptr = (char *)(uintptr_t)efi.dki_data_64;
1545         length = efi.dki_length;
1546         /*
1547          * Some clients may attempt to request a PMBR for the
1548          * zvol.  Currently this interface will return EINVAL to
1549          * such requests.  These requests could be supported by
1550          * adding a check for lba == 0 and consing up an appropriate
1551          * PMBR.
1552          */
1553         if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1554                 return (SET_ERROR(EINVAL));
1555 
1556         gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1557         gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1558         UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1559 
1560         if (efi.dki_lba == 1) {
1561                 efi_gpt_t gpt = { 0 };
1562 
1563                 gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1564                 gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1565                 gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1566                 gpt.efi_gpt_MyLBA = LE_64(1ULL);
1567                 gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1568                 gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1569                 gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1570                 gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1571                 gpt.efi_gpt_SizeOfPartitionEntry =
1572                     LE_32(sizeof (efi_gpe_t));
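                /*
                 * CRC32() accumulates a running CRC seeded with -1U and
                 * applies no final inversion, so the complemented value
                 * (~crc) is what yields the standard CRC-32 the EFI
                 * label format expects.
                 */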
1573                 CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1574                 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1575                 CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1576                 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1577                 if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1578                     flag))
1579                         return (SET_ERROR(EFAULT));
1580                 ptr += sizeof (gpt);
1581                 length -= sizeof (gpt);
1582         }
1583         if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1584             length), flag))
1585                 return (SET_ERROR(EFAULT));
1586         return (0);
1587 }
1588 
1589 /*
1590  * BEGIN entry points to allow external callers access to the volume.
1591  */
1592 /*
1593  * Return the volume parameters needed for access from an external caller.
1594  * These values are invariant as long as the volume is held open.
1595  */
1596 int
1597 zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1598     uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1599     void **rl_hdl, void **bonus_hdl)
1600 {
1601         zvol_state_t *zv;
1602 
1603         zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1604         if (zv == NULL)
1605                 return (SET_ERROR(ENXIO));
1606         if (zv->zv_flags & ZVOL_DUMPIFIED)
1607                 return (SET_ERROR(ENXIO));
1608 
1609         ASSERT(blksize && max_xfer_len && minor_hdl &&
1610             objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1611 
1612         *blksize = zv->zv_volblocksize;
1613         *max_xfer_len = (uint64_t)zvol_maxphys;
1614         *minor_hdl = zv;
1615         *objset_hdl = zv->zv_objset;
1616         *zil_hdl = zv->zv_zilog;
1617         *rl_hdl = &zv->zv_znode;
1618         *bonus_hdl = zv->zv_dbuf;
1619         return (0);
1620 }
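
/*
 * A hypothetical in-kernel consumer might bind to a zvol roughly like
 * this (sketch only; error handling omitted):
 *
 *      uint64_t blksize, maxxfer;
 *      void *mhdl, *oshdl, *zilhdl, *rlhdl, *bonushdl;
 *
 *      if (zvol_get_volume_params(minor, &blksize, &maxxfer,
 *          &mhdl, &oshdl, &zilhdl, &rlhdl, &bonushdl) == 0)
 *              volsize = zvol_get_volume_size(mhdl);
 */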
1621 
1622 /*
1623  * Return the current volume size to an external caller.
1624  * The size can change while the volume is open.
1625  */
1626 uint64_t
1627 zvol_get_volume_size(void *minor_hdl)
1628 {
1629         zvol_state_t *zv = minor_hdl;
1630 
1631         return (zv->zv_volsize);
1632 }
1633 
1634 /*
1635  * Return the current WCE setting to an external caller.
1636  * The WCE setting can change while the volume is open.
1637  */
1638 int
1639 zvol_get_volume_wce(void *minor_hdl)
1640 {
1641         zvol_state_t *zv = minor_hdl;
1642 
1643         return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1644 }
1645 
1646 /*
1647  * Entry point for external callers to zvol_log_write
1648  */
1649 void
1650 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1651     boolean_t sync)
1652 {
1653         zvol_state_t *zv = minor_hdl;
1654 
1655         zvol_log_write(zv, tx, off, resid, sync);
1656 }
1657 /*
1658  * END entry points to allow external callers access to the volume.
1659  */
1660 
1661 /*
1662  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1663  */
1664 static void
1665 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1666     boolean_t sync)
1667 {
1668         itx_t *itx;
1669         lr_truncate_t *lr;
1670         zilog_t *zilog = zv->zv_zilog;
1671 
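        /*
         * If the ZIL is replaying, this truncate originated from a log
         * record; don't log it a second time.
         */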
1672         if (zil_replaying(zilog, tx))
1673                 return;
1674 
1675         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1676         lr = (lr_truncate_t *)&itx->itx_lr;
1677         lr->lr_foid = ZVOL_OBJ;
1678         lr->lr_offset = off;
1679         lr->lr_length = len;
1680 
1681         itx->itx_sync = sync;
1682         zil_itx_assign(zilog, itx, tx);
1683 }
1684 
1685 /*
1686  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1687  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1688  */
1689 /*ARGSUSED*/
1690 int
1691 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1692 {
1693         zvol_state_t *zv;
1694         struct dk_callback *dkc;
1695         int error = 0;
1696         rl_t *rl;
1697 
1698         mutex_enter(&zfsdev_state_lock);
1699 
1700         zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1701 
1702         if (zv == NULL) {
1703                 mutex_exit(&zfsdev_state_lock);
1704                 return (SET_ERROR(ENXIO));
1705         }
1706         ASSERT(zv->zv_total_opens > 0);
1707 
1708         switch (cmd) {
1709 
1710         case DKIOCINFO:
1711         {
1712                 struct dk_cinfo dki;
1713 
1714                 bzero(&dki, sizeof (dki));
1715                 (void) strcpy(dki.dki_cname, "zvol");
1716                 (void) strcpy(dki.dki_dname, "zvol");
1717                 dki.dki_ctype = DKC_UNKNOWN;
1718                 dki.dki_unit = getminor(dev);
1719                 dki.dki_maxtransfer =
1720                     1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
1721                 mutex_exit(&zfsdev_state_lock);
1722                 if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1723                         error = SET_ERROR(EFAULT);
1724                 return (error);
1725         }
1726 
1727         case DKIOCGMEDIAINFO:
1728         {
1729                 struct dk_minfo dkm;
1730 
1731                 bzero(&dkm, sizeof (dkm));
1732                 dkm.dki_lbsize = 1U << zv->zv_min_bs;
1733                 dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1734                 dkm.dki_media_type = DK_UNKNOWN;
1735                 mutex_exit(&zfsdev_state_lock);
1736                 if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1737                         error = SET_ERROR(EFAULT);
1738                 return (error);
1739         }
1740 
1741         case DKIOCGMEDIAINFOEXT:
1742         {
1743                 struct dk_minfo_ext dkmext;
1744 
1745                 bzero(&dkmext, sizeof (dkmext));
1746                 dkmext.dki_lbsize = 1U << zv->zv_min_bs;
1747                 dkmext.dki_pbsize = zv->zv_volblocksize;
1748                 dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1749                 dkmext.dki_media_type = DK_UNKNOWN;
1750                 mutex_exit(&zfsdev_state_lock);
1751                 if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1752                         error = SET_ERROR(EFAULT);
1753                 return (error);
1754         }
1755 
1756         case DKIOCGETEFI:
1757         {
1758                 uint64_t vs = zv->zv_volsize;
1759                 uint8_t bs = zv->zv_min_bs;
1760 
1761                 mutex_exit(&zfsdev_state_lock);
1762                 error = zvol_getefi((void *)arg, flag, vs, bs);
1763                 return (error);
1764         }
1765 
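        /*
         * Make completed writes stable by committing the zvol's ZIL.
         * In-kernel (FKIOCTL) callers may supply a dk_callback, which
         * is invoked once the commit has completed.
         */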
1766         case DKIOCFLUSHWRITECACHE:
1767                 dkc = (struct dk_callback *)arg;
1768                 mutex_exit(&zfsdev_state_lock);
1769                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1770                 if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1771                         (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1772                         error = 0;
1773                 }
1774                 return (error);
1775 
1776         case DKIOCGETWCE:
1777         {
1778                 int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1779                 if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1780                     flag))
1781                         error = SET_ERROR(EFAULT);
1782                 break;
1783         }
1784         case DKIOCSETWCE:
1785         {
1786                 int wce;
1787                 if (ddi_copyin((void *)arg, &wce, sizeof (int),
1788                     flag)) {
1789                         error = SET_ERROR(EFAULT);
1790                         break;
1791                 }
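                /*
                 * Enabling the cache only requires setting the flag;
                 * disabling it must also commit the ZIL so that writes
                 * completed while the cache was enabled become stable
                 * before the cache is reported as off.
                 */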
1792                 if (wce) {
1793                         zv->zv_flags |= ZVOL_WCE;
1794                         mutex_exit(&zfsdev_state_lock);
1795                 } else {
1796                         zv->zv_flags &= ~ZVOL_WCE;
1797                         mutex_exit(&zfsdev_state_lock);
1798                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1799                 }
1800                 return (0);
1801         }
1802 
1803         case DKIOCGGEOM:
1804         case DKIOCGVTOC:
                /*
                 * Commands using these (like prtvtoc) expect ENOTSUP,
                 * since we're emulating an EFI label.
                 */
1809                 error = SET_ERROR(ENOTSUP);
1810                 break;
1811 
1812         case DKIOCDUMPINIT:
1813                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1814                     RL_WRITER);
1815                 error = zvol_dumpify(zv);
1816                 zfs_range_unlock(rl);
1817                 break;
1818 
1819         case DKIOCDUMPFINI:
1820                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1821                         break;
1822                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1823                     RL_WRITER);
1824                 error = zvol_dump_fini(zv);
1825                 zfs_range_unlock(rl);
1826                 break;
1827 
        case DKIOCFREE:
        {
                dkioc_free_list_t *dfl;
                dmu_tx_t *tx;

                /*
                 * Do the early-out checks and the copyin while still
                 * holding zfsdev_state_lock; breaking out of the switch
                 * relies on the common mutex_exit() below, so dropping
                 * the lock first would lead to a double unlock.
                 */
                if (!zvol_unmap_enabled)
                        break;

                if (!(flag & FKIOCTL)) {
                        dfl = dfl_copyin((void *)arg, flag, KM_SLEEP);
                        if (dfl == NULL) {
                                error = SET_ERROR(EFAULT);
                                break;
                        }
                } else {
                        dfl = (dkioc_free_list_t *)arg;
                }

                mutex_exit(&zfsdev_state_lock);
1847 
1848                 for (int i = 0; i < dfl->dfl_num_exts; i++) {
1849                         uint64_t start = dfl->dfl_exts[i].dfle_start,
1850                             length = dfl->dfl_exts[i].dfle_length,
1851                             end = start + length;
1852 
1853                         /*
1854                          * Apply Postel's Law to length-checking.  If they
1855                          * overshoot, just blank out until the end, if there's
1856                          * a need to blank out anything.
1857                          */
1858                         if (start >= zv->zv_volsize)
1859                                 continue;       /* No need to do anything... */
1860                         if (end > zv->zv_volsize) {
1861                                 end = DMU_OBJECT_END;
1862                                 length = end - start;
1863                         }
1864 
1865                         rl = zfs_range_lock(&zv->zv_znode, start, length,
1866                             RL_WRITER);
1867                         tx = dmu_tx_create(zv->zv_objset);
1868                         error = dmu_tx_assign(tx, TXG_WAIT);
1869                         if (error != 0) {
1870                                 dmu_tx_abort(tx);
1871                         } else {
1872                                 zvol_log_truncate(zv, tx, start, length,
1873                                     B_TRUE);
1874                                 dmu_tx_commit(tx);
1875                                 error = dmu_free_long_range(zv->zv_objset,
1876                                     ZVOL_OBJ, start, length);
1877                         }
1878 
1879                         zfs_range_unlock(rl);
1880 
1881                         if (error != 0)
1882                                 break;
1883                 }
1884 
                /*
                 * If the write cache is disabled, the 'sync' property is
                 * set to 'always', or the caller asked for a synchronous
                 * free, commit this operation to the ZIL.  This will sync
                 * any previous uncommitted writes to the zvol object.
                 * This behavior can be overridden by the
                 * zvol_unmap_sync_enabled tunable.
                 */
1893                 if ((error == 0) && zvol_unmap_sync_enabled &&
1894                     (!(zv->zv_flags & ZVOL_WCE) ||
1895                     (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1896                     (dfl->dfl_flags & DF_WAIT_SYNC))) {
1897                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1898                 }
1899 
1900                 if (!(flag & FKIOCTL))
1901                         dfl_free(dfl);
1902 
1903                 return (error);
1904         }
1905 
1906         default:
1907                 error = SET_ERROR(ENOTTY);
1908                 break;
1909 
1910         }
1911         mutex_exit(&zfsdev_state_lock);
1912         return (error);
1913 }
1914 
1915 int
1916 zvol_busy(void)
1917 {
1918         return (zvol_minors != 0);
1919 }
1920 
1921 void
1922 zvol_init(void)
1923 {
1924         VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1925             1) == 0);
1926         mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
1927 }
1928 
1929 void
1930 zvol_fini(void)
1931 {
1932         mutex_destroy(&zfsdev_state_lock);
1933         ddi_soft_state_fini(&zfsdev_state);
1934 }
1935 
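/*
 * dsl_sync_task() check/sync pair used to activate the
 * MULTI_VDEV_CRASH_DUMP feature: the check returns nonzero once the
 * feature is already active, so the feature's reference count is
 * bumped at most once.
 */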
1936 /*ARGSUSED*/
1937 static int
1938 zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1939 {
1940         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1941 
1942         if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1943                 return (1);
1944         return (0);
1945 }
1946 
1947 /*ARGSUSED*/
1948 static void
1949 zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1950 {
1951         spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1952 
1953         spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1954 }
1955 
1956 static int
1957 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1958 {
1959         dmu_tx_t *tx;
1960         int error;
1961         objset_t *os = zv->zv_objset;
1962         spa_t *spa = dmu_objset_spa(os);
1963         vdev_t *vd = spa->spa_root_vdev;
1964         nvlist_t *nv = NULL;
1965         uint64_t version = spa_version(spa);
1966         uint64_t checksum, compress, refresrv, vbs, dedup;
1967 
1968         ASSERT(MUTEX_HELD(&zfsdev_state_lock));
1969         ASSERT(vd->vdev_ops == &vdev_root_ops);
1970 
1971         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1972             DMU_OBJECT_END);
1973         if (error != 0)
1974                 return (error);
1975         /* wait for dmu_free_long_range to actually free the blocks */
1976         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1977 
1978         /*
1979          * If the pool on which the dump device is being initialized has more
1980          * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1981          * enabled.  If so, bump that feature's counter to indicate that the
1982          * feature is active. We also check the vdev type to handle the
1983          * following case:
1984          *   # zpool create test raidz disk1 disk2 disk3
1985          *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
1986          *   the raidz vdev itself has 3 children.
1987          */
1988         if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
1989                 if (!spa_feature_is_enabled(spa,
1990                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1991                         return (SET_ERROR(ENOTSUP));
1992                 (void) dsl_sync_task(spa_name(spa),
1993                     zfs_mvdev_dump_feature_check,
1994                     zfs_mvdev_dump_activate_feature_sync, NULL,
1995                     2, ZFS_SPACE_CHECK_RESERVED);
1996         }
1997 
1998         if (!resize) {
1999                 error = dsl_prop_get_integer(zv->zv_name,
2000                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
2001                 if (error == 0) {
2002                         error = dsl_prop_get_integer(zv->zv_name,
2003                             zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
2004                             NULL);
2005                 }
2006                 if (error == 0) {
2007                         error = dsl_prop_get_integer(zv->zv_name,
2008                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2009                             &refresrv, NULL);
2010                 }
2011                 if (error == 0) {
2012                         error = dsl_prop_get_integer(zv->zv_name,
2013                             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
2014                             NULL);
2015                 }
2016                 if (version >= SPA_VERSION_DEDUP && error == 0) {
2017                         error = dsl_prop_get_integer(zv->zv_name,
2018                             zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
2019                 }
2020         }
2021         if (error != 0)
2022                 return (error);
2023 
2024         tx = dmu_tx_create(os);
2025         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2026         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2027         error = dmu_tx_assign(tx, TXG_WAIT);
2028         if (error != 0) {
2029                 dmu_tx_abort(tx);
2030                 return (error);
2031         }
2032 
        /*
         * If we are resizing the dump device, then we only need to
         * update the refreservation to match the newly updated
         * zvol size.  Otherwise, we save off the zvol's original
         * properties so that we can restore them if the zvol is
         * ever undumpified.
         */
2039         if (resize) {
2040                 error = zap_update(os, ZVOL_ZAP_OBJ,
2041                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2042                     &zv->zv_volsize, tx);
2043         } else {
2044                 error = zap_update(os, ZVOL_ZAP_OBJ,
2045                     zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
2046                     &compress, tx);
2047                 if (error == 0) {
2048                         error = zap_update(os, ZVOL_ZAP_OBJ,
2049                             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
2050                             &checksum, tx);
2051                 }
2052                 if (error == 0) {
2053                         error = zap_update(os, ZVOL_ZAP_OBJ,
2054                             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2055                             &refresrv, tx);
2056                 }
2057                 if (error == 0) {
2058                         error = zap_update(os, ZVOL_ZAP_OBJ,
2059                             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
2060                             &vbs, tx);
2061                 }
2062                 if (error == 0) {
2063                         error = dmu_object_set_blocksize(
2064                             os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
2065                 }
2066                 if (version >= SPA_VERSION_DEDUP && error == 0) {
2067                         error = zap_update(os, ZVOL_ZAP_OBJ,
2068                             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2069                             &dedup, tx);
2070                 }
2071                 if (error == 0)
2072                         zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
2073         }
2074         dmu_tx_commit(tx);
2075 
        /*
         * We only need to update the zvol's properties if we are
         * initializing the dump area for the first time.
         */
2080         if (error == 0 && !resize) {
2081                 /*
2082                  * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2083                  * function.  Otherwise, use the old default -- OFF.
2084                  */
2085                 checksum = spa_feature_is_active(spa,
2086                     SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2087                     ZIO_CHECKSUM_OFF;
2088 
2089                 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2090                 VERIFY(nvlist_add_uint64(nv,
2091                     zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2092                 VERIFY(nvlist_add_uint64(nv,
2093                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2094                     ZIO_COMPRESS_OFF) == 0);
2095                 VERIFY(nvlist_add_uint64(nv,
2096                     zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2097                     checksum) == 0);
2098                 if (version >= SPA_VERSION_DEDUP) {
2099                         VERIFY(nvlist_add_uint64(nv,
2100                             zfs_prop_to_name(ZFS_PROP_DEDUP),
2101                             ZIO_CHECKSUM_OFF) == 0);
2102                 }
2103 
2104                 error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2105                     nv, NULL);
2106                 nvlist_free(nv);
2107         }
2108 
2109         /* Allocate the space for the dump */
2110         if (error == 0)
2111                 error = zvol_prealloc(zv);
2112         return (error);
2113 }
2114 
2115 static int
2116 zvol_dumpify(zvol_state_t *zv)
2117 {
2118         int error = 0;
2119         uint64_t dumpsize = 0;
2120         dmu_tx_t *tx;
2121         objset_t *os = zv->zv_objset;
2122 
2123         if (zv->zv_flags & ZVOL_RDONLY)
2124                 return (SET_ERROR(EROFS));
2125 
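        /*
         * (Re)initialize the dump layout if the zvol has never been
         * dumpified, or if its size has changed since it was.
         */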
2126         if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2127             8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2128                 boolean_t resize = (dumpsize > 0);
2129 
2130                 if ((error = zvol_dump_init(zv, resize)) != 0) {
2131                         (void) zvol_dump_fini(zv);
2132                         return (error);
2133                 }
2134         }
2135 
        /*
         * Build up the LBA mapping: record the zvol's block extents so
         * that dump I/O can be routed directly to physical offsets.
         */
2139         error = zvol_get_lbas(zv);
2140         if (error) {
2141                 (void) zvol_dump_fini(zv);
2142                 return (error);
2143         }
2144 
2145         tx = dmu_tx_create(os);
2146         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2147         error = dmu_tx_assign(tx, TXG_WAIT);
2148         if (error) {
2149                 dmu_tx_abort(tx);
2150                 (void) zvol_dump_fini(zv);
2151                 return (error);
2152         }
2153 
2154         zv->zv_flags |= ZVOL_DUMPIFIED;
2155         error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2156             &zv->zv_volsize, tx);
2157         dmu_tx_commit(tx);
2158 
2159         if (error) {
2160                 (void) zvol_dump_fini(zv);
2161                 return (error);
2162         }
2163 
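        /*
         * Wait for the dumpify state to reach stable storage before
         * reporting the zvol as ready to receive a crash dump.
         */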
2164         txg_wait_synced(dmu_objset_pool(os), 0);
2165         return (0);
2166 }
2167 
2168 static int
2169 zvol_dump_fini(zvol_state_t *zv)
2170 {
2171         dmu_tx_t *tx;
2172         objset_t *os = zv->zv_objset;
2173         nvlist_t *nv;
2174         int error = 0;
2175         uint64_t checksum, compress, refresrv, vbs, dedup;
2176         uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2177 
        /*
         * Attempt to restore the zvol back to its pre-dumpified state.
         * This is a best-effort attempt, as it's possible that not all
         * of these properties were initialized during the dumpify process
         * (e.g., an error during zvol_dump_init).
         */
2184 
2185         tx = dmu_tx_create(os);
2186         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2187         error = dmu_tx_assign(tx, TXG_WAIT);
2188         if (error) {
2189                 dmu_tx_abort(tx);
2190                 return (error);
2191         }
2192         (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2193         dmu_tx_commit(tx);
2194 
2195         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2196             zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2197         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2198             zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2199         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2200             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2201         (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2202             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2203 
2204         VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2205         (void) nvlist_add_uint64(nv,
2206             zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2207         (void) nvlist_add_uint64(nv,
2208             zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2209         (void) nvlist_add_uint64(nv,
2210             zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2211         if (version >= SPA_VERSION_DEDUP &&
2212             zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2213             zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2214                 (void) nvlist_add_uint64(nv,
2215                     zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2216         }
2217         (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2218             nv, NULL);
2219         nvlist_free(nv);
2220 
2221         zvol_free_extents(zv);
2222         zv->zv_flags &= ~ZVOL_DUMPIFIED;
2223         (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2224         /* wait for dmu_free_long_range to actually free the blocks */
2225         txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2226         tx = dmu_tx_create(os);
2227         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2228         error = dmu_tx_assign(tx, TXG_WAIT);
2229         if (error) {
2230                 dmu_tx_abort(tx);
2231                 return (error);
2232         }
2233         if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2234                 zv->zv_volblocksize = vbs;
2235         dmu_tx_commit(tx);
2236 
2237         return (0);
2238 }