Print this page
    
NEX-19083 backport OS-7314 zil_commit should omit cache thrash
9962 zil_commit should omit cache thrash
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6250 zvol_dump_init() can hold txg open
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Albert Lee <trisk@omniti.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-4582 update wrc test cases to allow use of write-back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
NEX-1142 move rwlock to vdev to protect vdev_tsd
not just ldi handle.
This way we serialize open/close, yet allow parallel I/O.
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-1065 Added serialization to avoid race
between ldi notification and I/O path.
Also fixes OS-124, NEX-1051, NEX-1062.
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/zvol.c
          +++ new/usr/src/uts/common/fs/zfs/zvol.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  
    | 
      ↓ open down ↓ | 
    18 lines elided | 
    
      ↑ open up ↑ | 
  
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   *
  24   24   * Portions Copyright 2010 Robert Milkowski
  25   25   *
  26   26   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27   27   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28   28   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       29 + * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  29   30   * Copyright (c) 2014 Integros [integros.com]
  30   31   */
  31   32  
  32   33  /*
  33   34   * ZFS volume emulation driver.
  34   35   *
  35   36   * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  36   37   * Volumes are accessed through the symbolic links named:
  37   38   *
  38   39   * /dev/zvol/dsk/<pool_name>/<dataset_name>
  39   40   * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  40   41   *
  41   42   * These links are created by the /dev filesystem (sdev_zvolops.c).
  42   43   * Volumes are persistent through reboot.  No user command needs to be
  43   44   * run before opening and using a device.
  44   45   */
  45   46  
  46   47  #include <sys/types.h>
  47   48  #include <sys/param.h>
  48   49  #include <sys/errno.h>
  49   50  #include <sys/uio.h>
  50   51  #include <sys/buf.h>
  51   52  #include <sys/modctl.h>
  52   53  #include <sys/open.h>
  53   54  #include <sys/kmem.h>
  54   55  #include <sys/conf.h>
  55   56  #include <sys/cmn_err.h>
  56   57  #include <sys/stat.h>
  57   58  #include <sys/zap.h>
  58   59  #include <sys/spa.h>
  59   60  #include <sys/spa_impl.h>
  60   61  #include <sys/zio.h>
  61   62  #include <sys/dmu_traverse.h>
  62   63  #include <sys/dnode.h>
  63   64  #include <sys/dsl_dataset.h>
  64   65  #include <sys/dsl_prop.h>
  65   66  #include <sys/dkio.h>
  66   67  #include <sys/efi_partition.h>
  67   68  #include <sys/byteorder.h>
  68   69  #include <sys/pathname.h>
  69   70  #include <sys/ddi.h>
  70   71  #include <sys/sunddi.h>
  71   72  #include <sys/crc32.h>
  72   73  #include <sys/dirent.h>
  73   74  #include <sys/policy.h>
  74   75  #include <sys/fs/zfs.h>
  75   76  #include <sys/zfs_ioctl.h>
  76   77  #include <sys/mkdev.h>
  77   78  #include <sys/zil.h>
  78   79  #include <sys/refcount.h>
  79   80  #include <sys/zfs_znode.h>
  80   81  #include <sys/zfs_rlock.h>
  
    | 
      ↓ open down ↓ | 
    42 lines elided | 
    
      ↑ open up ↑ | 
  
  81   82  #include <sys/vdev_disk.h>
  82   83  #include <sys/vdev_impl.h>
  83   84  #include <sys/vdev_raidz.h>
  84   85  #include <sys/zvol.h>
  85   86  #include <sys/dumphdr.h>
  86   87  #include <sys/zil_impl.h>
  87   88  #include <sys/dbuf.h>
  88   89  #include <sys/dmu_tx.h>
  89   90  #include <sys/zfeature.h>
  90   91  #include <sys/zio_checksum.h>
       92 +#include <sys/dkioc_free_util.h>
  91   93  #include <sys/zil_impl.h>
  92   94  
  93   95  #include "zfs_namecheck.h"
  94   96  
  95   97  void *zfsdev_state;
  96   98  static char *zvol_tag = "zvol_tag";
  97   99  
  98  100  #define ZVOL_DUMPSIZE           "dumpsize"
  99  101  
 100  102  /*
 101  103   * This lock protects the zfsdev_state structure from being modified
 102  104   * while it's being used, e.g. an open that comes in before a create
 103  105   * finishes.  It also protects temporary opens of the dataset so that,
 104  106   * e.g., an open doesn't get a spurious EBUSY.
 105  107   */
 106  108  kmutex_t zfsdev_state_lock;
 107  109  static uint32_t zvol_minors;
 108  110  
 109  111  typedef struct zvol_extent {
 110  112          list_node_t     ze_node;
 111  113          dva_t           ze_dva;         /* dva associated with this extent */
 112  114          uint64_t        ze_nblks;       /* number of blocks in extent */
 113  115  } zvol_extent_t;
 114  116  
 115  117  /*
 116  118   * The in-core state of each volume.
 117  119   */
 118  120  typedef struct zvol_state {
 119  121          char            zv_name[MAXPATHLEN]; /* pool/dd name */
 120  122          uint64_t        zv_volsize;     /* amount of space we advertise */
 121  123          uint64_t        zv_volblocksize; /* volume block size */
 122  124          minor_t         zv_minor;       /* minor number */
 123  125          uint8_t         zv_min_bs;      /* minimum addressable block shift */
 124  126          uint8_t         zv_flags;       /* readonly, dumpified, etc. */
 125  127          objset_t        *zv_objset;     /* objset handle */
 126  128          uint32_t        zv_open_count[OTYPCNT]; /* open counts */
 127  129          uint32_t        zv_total_opens; /* total open count */
 128  130          zilog_t         *zv_zilog;      /* ZIL handle */
 129  131          list_t          zv_extents;     /* List of extents for dump */
 130  132          znode_t         zv_znode;       /* for range locking */
 131  133          dmu_buf_t       *zv_dbuf;       /* bonus handle */
 132  134  } zvol_state_t;
 133  135  
 134  136  /*
 135  137   * zvol specific flags
 136  138   */
 137  139  #define ZVOL_RDONLY     0x1
 138  140  #define ZVOL_DUMPIFIED  0x2
 139  141  #define ZVOL_EXCL       0x4
 140  142  #define ZVOL_WCE        0x8
 141  143  
 142  144  /*
 143  145   * zvol maximum transfer in one DMU tx.
 144  146   */
 145  147  int zvol_maxphys = DMU_MAX_ACCESS/2;
 146  148  
 147  149  /*
 148  150   * Toggle unmap functionality.
 149  151   */
 150  152  boolean_t zvol_unmap_enabled = B_TRUE;
 151  153  
 152  154  /*
 153  155   * If true, unmaps requested as synchronous are executed synchronously,
 154  156   * otherwise all unmaps are asynchronous.
 155  157   */
 156  158  boolean_t zvol_unmap_sync_enabled = B_FALSE;
 157  159  
 158  160  extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
 159  161      nvlist_t *, nvlist_t *);
 160  162  static int zvol_remove_zv(zvol_state_t *);
 161  163  static int zvol_get_data(void *arg, lr_write_t *lr, char *buf,
 162  164      struct lwb *lwb, zio_t *zio);
 163  165  static int zvol_dumpify(zvol_state_t *zv);
 164  166  static int zvol_dump_fini(zvol_state_t *zv);
 165  167  static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 166  168  
/*
 * Update the in-core and DDI-advertised size of the volume and notify
 * specfs so a stale cached size is not used on the next open.
 */
static void
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);

	zv->zv_volsize = volsize;
	/* Export "Size" (bytes) and "Nblocks" (512-byte blocks) as props. */
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
}
 182  184  
 183  185  int
 184  186  zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 185  187  {
 186  188          if (volsize == 0)
 187  189                  return (SET_ERROR(EINVAL));
 188  190  
 189  191          if (volsize % blocksize != 0)
 190  192                  return (SET_ERROR(EINVAL));
 191  193  
 192  194  #ifdef _ILP32
 193  195          if (volsize - 1 > SPEC_MAXOFFSET_T)
 194  196                  return (SET_ERROR(EOVERFLOW));
 195  197  #endif
 196  198          return (0);
 197  199  }
 198  200  
 199  201  int
 200  202  zvol_check_volblocksize(uint64_t volblocksize)
 201  203  {
 202  204          if (volblocksize < SPA_MINBLOCKSIZE ||
 203  205              volblocksize > SPA_OLD_MAXBLOCKSIZE ||
 204  206              !ISP2(volblocksize))
 205  207                  return (SET_ERROR(EDOM));
 206  208  
 207  209          return (0);
 208  210  }
 209  211  
/*
 * Fill in the nvlist with the volume's size and block size properties,
 * read from the property ZAP and the DMU object info respectively.
 * Returns 0 on success or an errno from the failed lookup.
 */
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t doi;
	uint64_t val;

	/* The advertised size lives in the "size" entry of the prop ZAP. */
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (error);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

	/* The block size is the data block size of the backing object. */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi.doi_data_block_size);
	}

	return (error);
}
 232  234  
 233  235  static zvol_state_t *
 234  236  zvol_minor_lookup(const char *name)
 235  237  {
 236  238          minor_t minor;
 237  239          zvol_state_t *zv;
 238  240  
 239  241          ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 240  242  
 241  243          for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 242  244                  zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 243  245                  if (zv == NULL)
 244  246                          continue;
 245  247                  if (strcmp(zv->zv_name, name) == 0)
 246  248                          return (zv);
 247  249          }
 248  250  
 249  251          return (NULL);
 250  252  }
 251  253  
/* extent mapping arg: state carried through the zvol_map_block() walk */
struct maparg {
	zvol_state_t	*ma_zv;		/* volume whose extents are mapped */
	uint64_t	ma_blks;	/* next expected logical block id */
};
 257  259  
/*ARGSUSED*/
/*
 * traverse_dataset() callback: record the physical extent backing each
 * logical block of the volume, merging physically contiguous blocks
 * into a single zvol_extent_t on the volume's extent list.
 */
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	/* Only level-0 data blocks of the zvol object are of interest. */
	if (bp == NULL || BP_IS_HOLE(bp) ||
	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY(!BP_IS_EMBEDDED(bp));

	/* Blocks must arrive in logical order with no gaps. */
	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (SET_ERROR(EFRAGS));

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}
 301  303  
 302  304  static void
 303  305  zvol_free_extents(zvol_state_t *zv)
 304  306  {
 305  307          zvol_extent_t *ze;
 306  308  
 307  309          while (ze = list_head(&zv->zv_extents)) {
 308  310                  list_remove(&zv->zv_extents, ze);
 309  311                  kmem_free(ze, sizeof (zvol_extent_t));
 310  312          }
 311  313  }
 312  314  
/*
 * Build the complete extent list for the volume by walking the dataset.
 * The volume must be fully mapped (no holes, no gang blocks); otherwise
 * the partial list is discarded and an error is returned.
 */
static int
zvol_get_lbas(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	struct maparg	ma;
	int		err;

	ma.ma_zv = zv;
	ma.ma_blks = 0;
	zvol_free_extents(zv);

	/* commit any in-flight changes before traversing the dataset */
	txg_wait_synced(dmu_objset_pool(os), 0);
	err = traverse_dataset(dmu_objset_ds(os), 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
	/* Every logical block must have been visited exactly once. */
	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
		zvol_free_extents(zv);
		return (err ? err : EIO);
	}

	return (0);
}
 335  337  
/* ARGSUSED */
/*
 * Dataset-creation callback: initialize the on-disk objects that back a
 * new zvol -- the DMU data object (ZVOL_OBJ) and the property ZAP
 * (ZVOL_ZAP_OBJ) holding the volume size.
 */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	/* volsize is mandatory; volblocksize falls back to the default. */
	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/* Persist the advertised volume size in the property ZAP. */
	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}
 371  373  
 372  374  /*
 373  375   * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 374  376   * implement DKIOCFREE/free-long-range.
 375  377   */
 376  378  static int
 377  379  zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 378  380  {
 379  381          zvol_state_t *zv = arg1;
 380  382          lr_truncate_t *lr = arg2;
 381  383          uint64_t offset, length;
 382  384  
 383  385          if (byteswap)
 384  386                  byteswap_uint64_array(lr, sizeof (*lr));
 385  387  
 386  388          offset = lr->lr_offset;
 387  389          length = lr->lr_length;
 388  390  
 389  391          return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 390  392  }
 391  393  
/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_write_t *lr = arg2;
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			/* Round down to the start of the containing block. */
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		/* Reissue the logged write inside the new transaction. */
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}
 434  436  
/* ARGSUSED */
/*
 * Replay handler for ZIL record types that a zvol never generates;
 * always fails with ENOTSUP.
 */
static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
	return (SET_ERROR(ENOTSUP));
}
 441  443  
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 * The table is indexed by ZIL transaction type; all record types a
 * zvol never logs are routed to zvol_replay_err().
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};
 468  470  
 469  471  int
 470  472  zvol_name2minor(const char *name, minor_t *minor)
 471  473  {
 472  474          zvol_state_t *zv;
 473  475  
 474  476          mutex_enter(&zfsdev_state_lock);
 475  477          zv = zvol_minor_lookup(name);
 476  478          if (minor && zv)
 477  479                  *minor = zv->zv_minor;
 478  480          mutex_exit(&zfsdev_state_lock);
 479  481          return (zv ? 0 : -1);
 480  482  }
 481  483  
/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 * Allocates soft state, creates the char and block device nodes, sets
 * up range locking, and replays any outstanding ZIL records.  Each
 * failure path manually unwinds everything acquired before it.
 */
int
zvol_create_minor(const char *name)
{
	zfs_soft_state_t *zs;
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t doi;
	minor_t minor = 0;
	char chrbuf[30], blkbuf[30];
	int error;

	mutex_enter(&zfsdev_state_lock);

	if (zvol_minor_lookup(name) != NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EEXIST));
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);

	if (error) {
		mutex_exit(&zfsdev_state_lock);
		return (error);
	}

	if ((minor = zfsdev_minor_alloc()) == 0) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EAGAIN));
	}
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	/* Character (raw) device node, named "<minor>,raw". */
	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EAGAIN));
	}

	/* Block device node, named "<minor>". */
	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		/* Unwind the char node created just above. */
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EAGAIN));
	}

	zs = ddi_get_soft_state(zfsdev_state, minor);
	zs->zss_type = ZSST_ZVOL;
	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_minor = minor;
	zv->zv_objset = os;
	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
	    offsetof(zvol_extent_t, ze_node));
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;

	/* Replay (or discard) any outstanding ZIL records for the volume. */
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	/* The objset is reacquired on first open; drop our hold for now. */
	dmu_objset_disown(os, FTAG);
	zv->zv_objset = NULL;

	zvol_minors++;

	mutex_exit(&zfsdev_state_lock);

	return (0);
}
 580  582  
/*
 * Remove minor node for the specified volume.
 * Caller must hold zfsdev_state_lock.  Fails with EBUSY while the
 * volume is still open.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
	char nmbuf[20];
	minor_t minor = zv->zv_minor;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
	if (zv->zv_total_opens != 0)
		return (SET_ERROR(EBUSY));

	/* Remove both the raw (char) and block device nodes. */
	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	kmem_free(zv, sizeof (zvol_state_t));

	ddi_soft_state_free(zfsdev_state, minor);

	zvol_minors--;
	return (0);
}
 610  612  
/*
 * Look up the named volume and remove its minor node(s).
 * Returns ENXIO when no such minor exists, EBUSY while it is open.
 */
int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	int rc;

	mutex_enter(&zfsdev_state_lock);
	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}
	rc = zvol_remove_zv(zv);
	mutex_exit(&zfsdev_state_lock);
	return (rc);
}
 626  628  
/*
 * Called on the first open of the volume: acquire the objset, cache the
 * volume size, hold the bonus buffer, open the ZIL, and latch the
 * effective read-only state into zv_flags.
 */
int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	zv->zv_objset = os;
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		/*
		 * NOTE(review): a missing "size" entry trips this ASSERT on
		 * DEBUG kernels; non-DEBUG builds unwind and return the
		 * error instead -- confirm this is the intended policy.
		 */
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}

	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}

	zvol_size_changed(zv, volsize);
	zv->zv_zilog = zil_open(os, zvol_get_data);

	/* Recompute read-only status from the property and pool state. */
	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}
 667  669  
/*
 * Called when the last open of the volume is closed: tear down the ZIL,
 * release the bonus buffer, evict cached dbufs, and drop the objset.
 */
void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
	    !(zv->zv_flags & ZVOL_RDONLY))
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	dmu_objset_evict_dbufs(zv->zv_objset);

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}
 688  690  
/*
 * Preallocate (thick-provision) the entire volume, one chunk per tx.
 * On allocation failure any space already preallocated is freed again.
 */
int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t resid = zv->zv_volsize;
	uint64_t off = 0;

	/* Check the space usage before attempting to allocate the space */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (SET_ERROR(ENOSPC));

	/* Free old extents if they exist */
	zvol_free_extents(zv);

	while (resid != 0) {
		int error;
		uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			/* Roll back whatever was preallocated so far. */
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
			return (error);
		}
		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
		dmu_tx_commit(tx);
		off += bytes;
		resid -= bytes;
	}
	/* Wait for the preallocated space to actually reach disk. */
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
 727  729  
/*
 * Persist a new volume size in the "size" entry of ZVOL_ZAP_OBJ and
 * free any data past the new end of the volume.  Caller must hold
 * zfsdev_state_lock.  Returns 0 or the first error encountered.
 */
static int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	/*
	 * Mark the tx as net-freeing space so a shrink can proceed even
	 * on a nearly full pool (see illumos 6250).
	 */
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	/* On a shrink, release the blocks beyond the new size. */
	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
	return (error);
}
 754  756  
/*
 * Remove the minors of all children of dataset 'name' (every zvol whose
 * zv_name begins with "<name>/").  Scans the whole minor table under
 * zfsdev_state_lock.
 */
void
zvol_remove_minors(const char *name)
{
	zvol_state_t *zv;
	char *namebuf;
	minor_t minor;

	/*
	 * Build "<name>/" as the match prefix.  kmem_zalloc() guarantees
	 * NUL termination after the length-limited strncpy().
	 */
	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
	(void) strncpy(namebuf, name, strlen(name));
	(void) strcat(namebuf, "/");
	mutex_enter(&zfsdev_state_lock);
	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {

		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
			(void) zvol_remove_zv(zv);
	}
	kmem_free(namebuf, strlen(name) + 2);

	mutex_exit(&zfsdev_state_lock);
}
 778  780  
/*
 * Apply a new size to a live (minor-attached) zvol: update the in-core
 * size, re-dumpify and resize the dump device if needed (rolling back
 * to the old size on failure), and post a LUN-expansion sysevent on
 * success.  Caller must hold zfsdev_state_lock.
 */
static int
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
	uint64_t old_volsize = 0ULL;
	int error = 0;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	/*
	 * Reinitialize the dump area to the new size. If we
	 * failed to resize the dump area then restore it back to
	 * its original size.  We must set the new volsize prior
	 * to calling dumpvp_resize() to ensure that the devices'
	 * size(9P) is not visible by the dump subsystem.
	 */
	old_volsize = zv->zv_volsize;
	zvol_size_changed(zv, volsize);

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		if ((error = zvol_dumpify(zv)) != 0 ||
		    (error = dumpvp_resize()) != 0) {
			int dumpify_error;

			/* Roll back: on-disk size, in-core size, dump area. */
			(void) zvol_update_volsize(zv->zv_objset, old_volsize);
			zvol_size_changed(zv, old_volsize);
			dumpify_error = zvol_dumpify(zv);
			/* Report the rollback failure over the original one. */
			error = dumpify_error ? dumpify_error : error;
		}
	}

	/*
	 * Generate a LUN expansion event.
	 */
	if (error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}
	return (error);
}
 831  833  
/*
 * Set the size of zvol 'name'.  Fails with EROFS if the dataset is
 * read-only.  If no minor exists (or the minor has no objset attached),
 * the objset is temporarily owned for the duration of the update.  The
 * on-disk size is always updated; the live in-core size is updated only
 * when a minor exists.
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t readonly;
	boolean_t owned = B_FALSE;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
	if (error != 0)
		return (error);
	if (readonly)
		return (SET_ERROR(EROFS));

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);

	if (zv == NULL || zv->zv_objset == NULL) {
		/* No attached objset: own it ourselves for this call. */
		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
		    FTAG, &os)) != 0) {
			mutex_exit(&zfsdev_state_lock);
			return (error);
		}
		owned = B_TRUE;
		if (zv != NULL)
			zv->zv_objset = os;
	} else {
		os = zv->zv_objset;
	}

	/* Validate the new size against the volume's block size. */
	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
		goto out;

	error = zvol_update_volsize(os, volsize);

	if (error == 0 && zv != NULL)
		error = zvol_update_live_volsize(zv, volsize);
out:
	if (owned) {
		dmu_objset_disown(os, FTAG);
		if (zv != NULL)
			zv->zv_objset = NULL;
	}
	mutex_exit(&zfsdev_state_lock);
	return (error);
}
 882  884  
/*
 * open(9E) entry point for zvol minors.  Enforces read-only and
 * exclusive-open (FEXCL) semantics and maintains the per-otyp open
 * counts under zfsdev_state_lock.
 */
/*ARGSUSED*/
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
	zvol_state_t *zv;
	int err = 0;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	/* First open sets up zilog/objset state via zvol_first_open(). */
	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		mutex_exit(&zfsdev_state_lock);
		return (err);
	}
	/* A writable open of a read-only zvol is refused. */
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out;
	}
	/* An existing exclusive opener blocks everyone else. */
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out;
	}
	if (flag & FEXCL) {
		/* Exclusive open requires no other opens at all. */
		if (zv->zv_total_opens != 0) {
			err = SET_ERROR(EBUSY);
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	/*
	 * Count once per otyp, except OTYP_LYR which is counted on
	 * every open (layered opens are paired with layered closes).
	 */
	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
		zv->zv_open_count[otyp]++;
		zv->zv_total_opens++;
	}
	mutex_exit(&zfsdev_state_lock);

	return (err);
out:
	/* Failed open: tear down if we were the only (first) opener. */
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
	mutex_exit(&zfsdev_state_lock);
	return (err);
}
 933  935  
/*
 * close(9E) entry point for zvol minors.  Drops the per-otyp open
 * count and tears down the zvol's in-core state on the last close.
 */
/*ARGSUSED*/
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	/* Closing an exclusive opener releases the exclusivity. */
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count[otyp] != 0);
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count[otyp]--;
	zv->zv_total_opens--;

	/* Last close: release zilog, dbuf hold, and objset. */
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

	mutex_exit(&zfsdev_state_lock);
	return (error);
}
 974  976  
/*
 * Completion callback used by zvol_get_data()/dmu_sync(): release the
 * dbuf hold taken for an indirect write (if any), drop the range lock,
 * and free the zgd.  'error' is unused (hence ARGSUSED); per illumos
 * 9962 the lwb block accounting that used to live here was removed.
 */
/* ARGSUSED */
static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	kmem_free(zgd, sizeof (zgd_t));
}
 988  988  
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 * Called during zil_commit() for writes whose data was not copied into
 * the itx (WR_NEED_COPY/WR_INDIRECT).  Returns 0 on success or an error
 * from dmu_read()/dmu_buf_hold()/dmu_sync(); zvol_get_done() always
 * runs exactly once to release the resources taken here.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
		    RL_READER);
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's written out
		 * and its checksum is being calculated that no one can change
		 * the data. Contrarily to zfs_get_data we need not re-check
		 * blocksize after we get the lock because it cannot be changed.
		 */
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
		    RL_READER);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			/*
			 * On success dmu_sync() takes ownership of zgd and
			 * invokes zvol_get_done() itself when the write to
			 * the log block completes.
			 */
			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}
1057 1057  
1058 1058  /*
1059 1059   * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1060 1060   *
1061 1061   * We store data in the log buffers if it's small enough.
  
    | 
      ↓ open down ↓ | 
    66 lines elided | 
    
      ↑ open up ↑ | 
  
1062 1062   * Otherwise we will later flush the data out via dmu_sync().
1063 1063   */
/* Largest write stored with the log record; global — presumably a tunable. */
ssize_t zvol_immediate_write_sz = 32768;
1065 1065  
/*
 * Build and assign TX_WRITE itxs covering [off, off+resid).  The write
 * state chosen up front decides where the data goes:
 *   WR_INDIRECT  - data synced via dmu_sync(), record holds a blkptr
 *   WR_COPIED    - data copied into the itx now
 *   WR_NEED_COPY - data fetched later by zvol_get_data()
 * The NEX-5367 special-vdev policy (spa_meta_policy) can force
 * WR_INDIRECT when sync traffic is directed at the special vdev.
 */
static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	spa_t *spa = zilog->zl_spa;
	spa_meta_placement_t *mp = &spa->spa_meta_policy;
	boolean_t slogging, zil_to_special, write_to_special;
	ssize_t immediate_write_sz;
	itx_wr_state_t write_state;

	/* Nothing to log while replaying the ZIL. */
	if (zil_replaying(zilog, tx))
		return;

	/*
	 * See comments in zfs_log_write()
	 */

	/* Throughput bias disables immediate writes entirely. */
	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;

	/* ZIL blocks may be allocated from the special vdev class. */
	zil_to_special = !spa_has_slogs(spa) &&
	    spa_can_special_be_used(spa) &&
	    mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;

	/* Sync data itself may be routed to the special vdev. */
	write_to_special = !spa_has_slogs(spa) &&
	    spa_write_data_to_special(spa, zilog->zl_os) &&
	    (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
	    (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
	    spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));

	/* "slogging" == a fast log device (slog or special) is in play. */
	slogging = (spa_has_slogs(spa) || zil_to_special) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

	/*
	 * NOTE(review): the second and third conditions below look like
	 * leftovers from the merge with the stock illumos logic — cases
	 * they cover appear largely subsumed by the first condition once
	 * immediate_write_sz/slogging are computed above.  Confirm before
	 * simplifying; order is preserved here.
	 */
	if (blocksize > immediate_write_sz && !slogging &&
	    resid >= blocksize && off % blocksize == 0)
		write_state = WR_INDIRECT;
	else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		write_state = WR_INDIRECT;
	else if (!spa_has_slogs(zilog->zl_spa) &&
	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
		write_state = WR_INDIRECT;
	else if (write_to_special)
		 write_state = WR_INDIRECT;
	else if (sync)
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = resid;

		/* Oversized copies fall back to deferred copy. */
		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		/* If the immediate copy fails, retry as WR_NEED_COPY. */
		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = wr_state;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
1125 1154  
/*
 * Perform raw (bypass-DMU) I/O against a vdev for dump/dumpified zvols.
 * Mirror-like vdevs recurse into children; raidz goes through
 * vdev_raidz_physio(); leaves use LDI directly.  vdev_tsd is read under
 * vdev_tsd_lock (RW_READER) so it cannot be torn down by a concurrent
 * LDI notification (NEX-1142/NEX-1065).
 */
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c, rc;
	int numerrors = 0;

	if (vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops) {
		/* Reads stop at the first good child; writes hit them all. */
		for (c = 0; c < vd->vdev_children; c++) {
			int err = zvol_dumpio_vdev(vd->vdev_child[c],
			    addr, offset, origoffset, size, doread, isdump);
			if (err != 0) {
				numerrors++;
			} else if (doread) {
				break;
			}
		}
	}

	/* Non-leaf (non-raidz) vdevs succeed if any child succeeded. */
	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (SET_ERROR(EIO));
	else if (!doread && !vdev_writeable(vd))
		return (SET_ERROR(EIO));

	if (vd->vdev_ops == &vdev_raidz_ops) {
		return (vdev_raidz_physio(vd,
		    addr, size, offset, origoffset, doread, isdump));
	}

	/* Skip the front label region to reach the data area. */
	offset += VDEV_LABEL_START_SIZE;

	rw_enter(&vd->vdev_tsd_lock, RW_READER);
	dvd = vd->vdev_tsd;
	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread) {
			rw_exit(&vd->vdev_tsd_lock);
			return (SET_ERROR(EIO));
		}
		/* We assume here dvd is not NULL */
		ASSERT3P(dvd, !=, NULL);

		/* If our assumption is wrong, we do not want to crash */
		if (dvd != NULL && dvd->vd_lh != NULL) {
			rc = ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
			    lbtodb(size));
		} else {
			rc = SET_ERROR(ENXIO);
		}
	} else {
		/* We assume here dvd is not NULL */
		ASSERT3P(dvd, !=, NULL);

		/* If our assumption is wrong, we do not want to crash */
		if (dvd != NULL && dvd->vd_lh != NULL) {
			rc = vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
			    offset, doread ? B_READ : B_WRITE);
		} else {
			rc = SET_ERROR(ENXIO);
		}
	}
	rw_exit(&vd->vdev_tsd_lock);
	return (rc);
}
1178 1225  
1179 1226  static int
1180 1227  zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1181 1228      boolean_t doread, boolean_t isdump)
1182 1229  {
1183 1230          vdev_t *vd;
1184 1231          int error;
1185 1232          zvol_extent_t *ze;
1186 1233          spa_t *spa = dmu_objset_spa(zv->zv_objset);
1187 1234  
1188 1235          /* Must be sector aligned, and not stradle a block boundary. */
1189 1236          if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1190 1237              P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1191 1238                  return (SET_ERROR(EINVAL));
1192 1239          }
1193 1240          ASSERT(size <= zv->zv_volblocksize);
1194 1241  
1195 1242          /* Locate the extent this belongs to */
1196 1243          ze = list_head(&zv->zv_extents);
1197 1244          while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1198 1245                  offset -= ze->ze_nblks * zv->zv_volblocksize;
1199 1246                  ze = list_next(&zv->zv_extents, ze);
1200 1247          }
1201 1248  
1202 1249          if (ze == NULL)
1203 1250                  return (SET_ERROR(EINVAL));
1204 1251  
1205 1252          if (!ddi_in_panic())
1206 1253                  spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1207 1254  
1208 1255          vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1209 1256          offset += DVA_GET_OFFSET(&ze->ze_dva);
1210 1257          error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1211 1258              size, doread, isdump);
1212 1259  
1213 1260          if (!ddi_in_panic())
1214 1261                  spa_config_exit(spa, SCL_STATE, FTAG);
1215 1262  
1216 1263          return (error);
1217 1264  }
1218 1265  
/*
 * strategy(9E) entry point for zvol block I/O.  Validates the minor and
 * the request, then transfers data in zvol_maxphys-sized pieces: via
 * zvol_dumpio() for dumpified zvols, dmu_read() for reads, and a
 * dmu tx + zvol_log_write() for writes.  Synchronous writes commit the
 * ZIL before biodone().  Always returns 0; errors go out via bioerror().
 */
int
zvol_strategy(buf_t *bp)
{
	zfs_soft_state_t *zs = NULL;
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
	boolean_t doread = bp->b_flags & B_READ;
	boolean_t is_dumpified;
	boolean_t sync;

	/* Minor 0 is the control node, not a zvol. */
	if (getminor(bp->b_edev) == 0) {
		error = SET_ERROR(EINVAL);
	} else {
		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
		if (zs == NULL)
			error = SET_ERROR(ENXIO);
		else if (zs->zss_type != ZSST_ZVOL)
			error = SET_ERROR(EINVAL);
	}

	if (error) {
		bioerror(bp, error);
		biodone(bp);
		return (0);
	}

	zv = zs->zss_data;

	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	off = ldbtob(bp->b_blkno);
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}

	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
	/*
	 * A write is synchronous when the buf is not B_ASYNC and the
	 * write cache is disabled, or when sync=always — never for
	 * reads or dumpified zvols.
	 */
	sync = ((!(bp->b_flags & B_ASYNC) &&
	    !(zv->zv_flags & ZVOL_WCE)) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
	    !doread && !is_dumpified;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (is_dumpified) {
			/* Dump I/O must not cross a volblock boundary. */
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
	zfs_range_unlock(rl);

	/* Nothing transferred at all: report the error on the buf. */
	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	biodone(bp);

	return (0);
}
1329 1376  
1330 1377  /*
1331 1378   * Set the buffer count to the zvol maximum transfer.
1332 1379   * Using our own routine instead of the default minphys()
1333 1380   * means that for larger writes we write bigger buffers on X86
1334 1381   * (128K instead of 56K) and flush the disk write cache less often
1335 1382   * (every zvol_maxphys - currently 1MB) instead of minphys (currently
1336 1383   * 56K on X86 and 128K on sparc).
1337 1384   */
1338 1385  void
1339 1386  zvol_minphys(struct buf *bp)
1340 1387  {
1341 1388          if (bp->b_bcount > zvol_maxphys)
1342 1389                  bp->b_bcount = zvol_maxphys;
1343 1390  }
1344 1391  
/*
 * dump(9E) entry point: write 'nblocks' disk blocks at 'blkno' of the
 * dump zvol during a system dump.  Only valid for dumpified zvols;
 * transfers go through zvol_dumpio() one volblock-bounded chunk at a
 * time.  Returns 0 or the first I/O error.
 */
int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
		return (SET_ERROR(EINVAL));

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		/* Never cross a volblock boundary in a single transfer. */
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}
1379 1426  
/*
 * read(9E) entry point.  Dumpified zvols go through physio()/strategy;
 * otherwise data is read via dmu_read_uio() under a range lock, in
 * chunks of at most DMU_MAX_ACCESS/2.  Returns 0 or the first error
 * (checksum errors are reported as EIO).
 */
/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}
1425 1472  
/*
 * Character-device write entry point.  Writes go through the DMU under a
 * writer range lock, one transaction per chunk, and each successful chunk
 * is logged to the ZIL.  A dumpified zvol is instead written through
 * physio()/zvol_strategy().  If the write-cache is disabled or the
 * dataset's sync policy is 'always', the ZIL is committed before return.
 */
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	/* A write starting outside the volume is an I/O error. */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	/* Dumpified zvols go through the strategy path. */
	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	/* Synchronous if the write cache is off or sync=always. */
	sync = !(zv->zv_flags & ZVOL_WCE) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		/* Cap each transaction at half of DMU_MAX_ACCESS. */
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
		/* Log the write before committing the tx. */
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	/* Commit the log records generated above, if synchronous. */
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (error);
}
1484 1531  
/*
 * Fabricate an EFI label for the zvol and copy the requested piece of it
 * out to the caller's dk_efi_t buffer: LBA 1 yields the GPT header (and,
 * space permitting, the single partition entry that follows it), LBA 2
 * yields just the partition entry.  The synthesized layout is a single
 * reserved partition covering LBA 34 through the end of the volume
 * (vs >> bs is the volume size in logical blocks of 2^bs bytes).
 */
int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
	struct uuid uuid = EFI_RESERVED;
	efi_gpe_t gpe = { 0 };
	uint32_t crc;
	dk_efi_t efi;
	int length;
	char *ptr;

	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
		return (SET_ERROR(EFAULT));
	ptr = (char *)(uintptr_t)efi.dki_data_64;
	length = efi.dki_length;
	/*
	 * Some clients may attempt to request a PMBR for the
	 * zvol.  Currently this interface will return EINVAL to
	 * such requests.  These requests could be supported by
	 * adding a check for lba == 0 and consing up an appropriate
	 * PMBR.
	 */
	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
		return (SET_ERROR(EINVAL));

	/* One reserved partition spanning LBA 34 .. last usable LBA. */
	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);

	if (efi.dki_lba == 1) {
		efi_gpt_t gpt = { 0 };

		/* All GPT fields are stored little-endian on disk. */
		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
		gpt.efi_gpt_MyLBA = LE_64(1ULL);
		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
		gpt.efi_gpt_SizeOfPartitionEntry =
		    LE_32(sizeof (efi_gpe_t));
		/*
		 * The entry-array CRC goes into the header, which is then
		 * CRC'd itself; CRC32() produces the one's complement, so
		 * the stored values are ~crc.
		 */
		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
		    flag))
			return (SET_ERROR(EFAULT));
		/* Advance past the header; the entry may follow. */
		ptr += sizeof (gpt);
		length -= sizeof (gpt);
	}
	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
	    length), flag))
		return (SET_ERROR(EFAULT));
	return (0);
}
1541 1588  
1542 1589  /*
1543 1590   * BEGIN entry points to allow external callers access to the volume.
1544 1591   */
1545 1592  /*
1546 1593   * Return the volume parameters needed for access from an external caller.
1547 1594   * These values are invariant as long as the volume is held open.
1548 1595   */
1549 1596  int
1550 1597  zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1551 1598      uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1552 1599      void **rl_hdl, void **bonus_hdl)
1553 1600  {
1554 1601          zvol_state_t *zv;
1555 1602  
1556 1603          zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1557 1604          if (zv == NULL)
1558 1605                  return (SET_ERROR(ENXIO));
1559 1606          if (zv->zv_flags & ZVOL_DUMPIFIED)
1560 1607                  return (SET_ERROR(ENXIO));
1561 1608  
1562 1609          ASSERT(blksize && max_xfer_len && minor_hdl &&
1563 1610              objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1564 1611  
1565 1612          *blksize = zv->zv_volblocksize;
1566 1613          *max_xfer_len = (uint64_t)zvol_maxphys;
1567 1614          *minor_hdl = zv;
1568 1615          *objset_hdl = zv->zv_objset;
1569 1616          *zil_hdl = zv->zv_zilog;
1570 1617          *rl_hdl = &zv->zv_znode;
1571 1618          *bonus_hdl = zv->zv_dbuf;
1572 1619          return (0);
1573 1620  }
1574 1621  
1575 1622  /*
1576 1623   * Return the current volume size to an external caller.
1577 1624   * The size can change while the volume is open.
1578 1625   */
1579 1626  uint64_t
1580 1627  zvol_get_volume_size(void *minor_hdl)
1581 1628  {
1582 1629          zvol_state_t *zv = minor_hdl;
1583 1630  
1584 1631          return (zv->zv_volsize);
1585 1632  }
1586 1633  
1587 1634  /*
1588 1635   * Return the current WCE setting to an external caller.
1589 1636   * The WCE setting can change while the volume is open.
1590 1637   */
1591 1638  int
1592 1639  zvol_get_volume_wce(void *minor_hdl)
1593 1640  {
1594 1641          zvol_state_t *zv = minor_hdl;
1595 1642  
1596 1643          return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1597 1644  }
1598 1645  
1599 1646  /*
1600 1647   * Entry point for external callers to zvol_log_write
1601 1648   */
1602 1649  void
1603 1650  zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1604 1651      boolean_t sync)
1605 1652  {
1606 1653          zvol_state_t *zv = minor_hdl;
1607 1654  
1608 1655          zvol_log_write(zv, tx, off, resid, sync);
1609 1656  }
1610 1657  /*
1611 1658   * END entry points to allow external callers access to the volume.
1612 1659   */
1613 1660  
/*
 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
 *
 * Builds a TX_TRUNCATE intent-log record covering [off, off + len) of
 * ZVOL_OBJ and assigns it to the zvol's zilog within tx.  When sync is
 * B_TRUE the itx is marked synchronous, so a later zil_commit() will
 * wait for it.
 */
static void
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
    boolean_t sync)
{
	itx_t *itx;
	lr_truncate_t *lr;
	zilog_t *zilog = zv->zv_zilog;

	/* Don't generate new log records while replaying the log. */
	if (zil_replaying(zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	lr = (lr_truncate_t *)&itx->itx_lr;
	lr->lr_foid = ZVOL_OBJ;
	lr->lr_offset = off;
	lr->lr_length = len;

	itx->itx_sync = sync;
	zil_itx_assign(zilog, itx, tx);
}
1637 1684  
1638 1685  /*
1639 1686   * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1640 1687   * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1641 1688   */
1642 1689  /*ARGSUSED*/
1643 1690  int
1644 1691  zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1645 1692  {
1646 1693          zvol_state_t *zv;
1647 1694          struct dk_callback *dkc;
1648 1695          int error = 0;
1649 1696          rl_t *rl;
1650 1697  
1651 1698          mutex_enter(&zfsdev_state_lock);
1652 1699  
1653 1700          zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1654 1701  
1655 1702          if (zv == NULL) {
1656 1703                  mutex_exit(&zfsdev_state_lock);
1657 1704                  return (SET_ERROR(ENXIO));
1658 1705          }
1659 1706          ASSERT(zv->zv_total_opens > 0);
1660 1707  
1661 1708          switch (cmd) {
1662 1709  
1663 1710          case DKIOCINFO:
1664 1711          {
1665 1712                  struct dk_cinfo dki;
1666 1713  
1667 1714                  bzero(&dki, sizeof (dki));
1668 1715                  (void) strcpy(dki.dki_cname, "zvol");
1669 1716                  (void) strcpy(dki.dki_dname, "zvol");
1670 1717                  dki.dki_ctype = DKC_UNKNOWN;
1671 1718                  dki.dki_unit = getminor(dev);
1672 1719                  dki.dki_maxtransfer =
1673 1720                      1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
1674 1721                  mutex_exit(&zfsdev_state_lock);
1675 1722                  if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1676 1723                          error = SET_ERROR(EFAULT);
1677 1724                  return (error);
1678 1725          }
1679 1726  
1680 1727          case DKIOCGMEDIAINFO:
1681 1728          {
1682 1729                  struct dk_minfo dkm;
1683 1730  
1684 1731                  bzero(&dkm, sizeof (dkm));
1685 1732                  dkm.dki_lbsize = 1U << zv->zv_min_bs;
1686 1733                  dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1687 1734                  dkm.dki_media_type = DK_UNKNOWN;
1688 1735                  mutex_exit(&zfsdev_state_lock);
1689 1736                  if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1690 1737                          error = SET_ERROR(EFAULT);
1691 1738                  return (error);
1692 1739          }
1693 1740  
1694 1741          case DKIOCGMEDIAINFOEXT:
1695 1742          {
1696 1743                  struct dk_minfo_ext dkmext;
1697 1744  
1698 1745                  bzero(&dkmext, sizeof (dkmext));
1699 1746                  dkmext.dki_lbsize = 1U << zv->zv_min_bs;
1700 1747                  dkmext.dki_pbsize = zv->zv_volblocksize;
1701 1748                  dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1702 1749                  dkmext.dki_media_type = DK_UNKNOWN;
1703 1750                  mutex_exit(&zfsdev_state_lock);
1704 1751                  if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1705 1752                          error = SET_ERROR(EFAULT);
1706 1753                  return (error);
1707 1754          }
1708 1755  
1709 1756          case DKIOCGETEFI:
1710 1757          {
1711 1758                  uint64_t vs = zv->zv_volsize;
1712 1759                  uint8_t bs = zv->zv_min_bs;
1713 1760  
1714 1761                  mutex_exit(&zfsdev_state_lock);
1715 1762                  error = zvol_getefi((void *)arg, flag, vs, bs);
1716 1763                  return (error);
1717 1764          }
1718 1765  
1719 1766          case DKIOCFLUSHWRITECACHE:
1720 1767                  dkc = (struct dk_callback *)arg;
1721 1768                  mutex_exit(&zfsdev_state_lock);
1722 1769                  zil_commit(zv->zv_zilog, ZVOL_OBJ);
1723 1770                  if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1724 1771                          (*dkc->dkc_callback)(dkc->dkc_cookie, error);
1725 1772                          error = 0;
1726 1773                  }
1727 1774                  return (error);
1728 1775  
1729 1776          case DKIOCGETWCE:
1730 1777          {
1731 1778                  int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1732 1779                  if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1733 1780                      flag))
1734 1781                          error = SET_ERROR(EFAULT);
1735 1782                  break;
1736 1783          }
1737 1784          case DKIOCSETWCE:
1738 1785          {
1739 1786                  int wce;
1740 1787                  if (ddi_copyin((void *)arg, &wce, sizeof (int),
1741 1788                      flag)) {
1742 1789                          error = SET_ERROR(EFAULT);
1743 1790                          break;
1744 1791                  }
1745 1792                  if (wce) {
1746 1793                          zv->zv_flags |= ZVOL_WCE;
1747 1794                          mutex_exit(&zfsdev_state_lock);
1748 1795                  } else {
1749 1796                          zv->zv_flags &= ~ZVOL_WCE;
1750 1797                          mutex_exit(&zfsdev_state_lock);
1751 1798                          zil_commit(zv->zv_zilog, ZVOL_OBJ);
1752 1799                  }
1753 1800                  return (0);
1754 1801          }
1755 1802  
1756 1803          case DKIOCGGEOM:
1757 1804          case DKIOCGVTOC:
1758 1805                  /*
1759 1806                   * commands using these (like prtvtoc) expect ENOTSUP
1760 1807                   * since we're emulating an EFI label
1761 1808                   */
1762 1809                  error = SET_ERROR(ENOTSUP);
1763 1810                  break;
1764 1811  
1765 1812          case DKIOCDUMPINIT:
1766 1813                  rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1767 1814                      RL_WRITER);
1768 1815                  error = zvol_dumpify(zv);
1769 1816                  zfs_range_unlock(rl);
1770 1817                  break;
1771 1818  
1772 1819          case DKIOCDUMPFINI:
  
    | 
      ↓ open down ↓ | 
    586 lines elided | 
    
      ↑ open up ↑ | 
  
1773 1820                  if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1774 1821                          break;
1775 1822                  rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1776 1823                      RL_WRITER);
1777 1824                  error = zvol_dump_fini(zv);
1778 1825                  zfs_range_unlock(rl);
1779 1826                  break;
1780 1827  
1781 1828          case DKIOCFREE:
1782 1829          {
1783      -                dkioc_free_t df;
     1830 +                dkioc_free_list_t *dfl;
1784 1831                  dmu_tx_t *tx;
1785 1832  
     1833 +                mutex_exit(&zfsdev_state_lock);
     1834 +
1786 1835                  if (!zvol_unmap_enabled)
1787 1836                          break;
1788 1837  
1789      -                if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1790      -                        error = SET_ERROR(EFAULT);
1791      -                        break;
     1838 +                if (!(flag & FKIOCTL)) {
     1839 +                        dfl = dfl_copyin((void *)arg, flag, KM_SLEEP);
     1840 +                        if (dfl == NULL) {
     1841 +                                error = SET_ERROR(EFAULT);
     1842 +                                break;
     1843 +                        }
     1844 +                } else {
     1845 +                        dfl = (dkioc_free_list_t *)arg;
1792 1846                  }
1793 1847  
1794      -                /*
1795      -                 * Apply Postel's Law to length-checking.  If they overshoot,
1796      -                 * just blank out until the end, if there's a need to blank
1797      -                 * out anything.
1798      -                 */
1799      -                if (df.df_start >= zv->zv_volsize)
1800      -                        break;  /* No need to do anything... */
     1848 +                for (int i = 0; i < dfl->dfl_num_exts; i++) {
     1849 +                        uint64_t start = dfl->dfl_exts[i].dfle_start,
     1850 +                            length = dfl->dfl_exts[i].dfle_length,
     1851 +                            end = start + length;
1801 1852  
1802      -                mutex_exit(&zfsdev_state_lock);
     1853 +                        /*
     1854 +                         * Apply Postel's Law to length-checking.  If they
     1855 +                         * overshoot, just blank out until the end, if there's
     1856 +                         * a need to blank out anything.
     1857 +                         */
     1858 +                        if (start >= zv->zv_volsize)
     1859 +                                continue;       /* No need to do anything... */
     1860 +                        if (end > zv->zv_volsize) {
     1861 +                                end = DMU_OBJECT_END;
     1862 +                                length = end - start;
     1863 +                        }
1803 1864  
1804      -                rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1805      -                    RL_WRITER);
1806      -                tx = dmu_tx_create(zv->zv_objset);
1807      -                dmu_tx_mark_netfree(tx);
1808      -                error = dmu_tx_assign(tx, TXG_WAIT);
1809      -                if (error != 0) {
1810      -                        dmu_tx_abort(tx);
1811      -                } else {
1812      -                        zvol_log_truncate(zv, tx, df.df_start,
1813      -                            df.df_length, B_TRUE);
1814      -                        dmu_tx_commit(tx);
1815      -                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1816      -                            df.df_start, df.df_length);
1817      -                }
     1865 +                        rl = zfs_range_lock(&zv->zv_znode, start, length,
     1866 +                            RL_WRITER);
     1867 +                        tx = dmu_tx_create(zv->zv_objset);
     1868 +                        error = dmu_tx_assign(tx, TXG_WAIT);
     1869 +                        if (error != 0) {
     1870 +                                dmu_tx_abort(tx);
     1871 +                        } else {
     1872 +                                zvol_log_truncate(zv, tx, start, length,
     1873 +                                    B_TRUE);
     1874 +                                dmu_tx_commit(tx);
     1875 +                                error = dmu_free_long_range(zv->zv_objset,
     1876 +                                    ZVOL_OBJ, start, length);
     1877 +                        }
1818 1878  
1819      -                zfs_range_unlock(rl);
     1879 +                        zfs_range_unlock(rl);
1820 1880  
     1881 +                        if (error != 0)
     1882 +                                break;
     1883 +                }
     1884 +
1821 1885                  /*
1822 1886                   * If the write-cache is disabled, 'sync' property
1823 1887                   * is set to 'always', or if the caller is asking for
1824 1888                   * a synchronous free, commit this operation to the zil.
1825 1889                   * This will sync any previous uncommitted writes to the
1826 1890                   * zvol object.
1827 1891                   * Can be overridden by the zvol_unmap_sync_enabled tunable.
1828 1892                   */
1829 1893                  if ((error == 0) && zvol_unmap_sync_enabled &&
1830 1894                      (!(zv->zv_flags & ZVOL_WCE) ||
1831 1895                      (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1832      -                    (df.df_flags & DF_WAIT_SYNC))) {
     1896 +                    (dfl->dfl_flags & DF_WAIT_SYNC))) {
1833 1897                          zil_commit(zv->zv_zilog, ZVOL_OBJ);
1834 1898                  }
1835 1899  
     1900 +                if (!(flag & FKIOCTL))
     1901 +                        dfl_free(dfl);
     1902 +
1836 1903                  return (error);
1837 1904          }
1838 1905  
1839 1906          default:
1840 1907                  error = SET_ERROR(ENOTTY);
1841 1908                  break;
1842 1909  
1843 1910          }
1844 1911          mutex_exit(&zfsdev_state_lock);
1845 1912          return (error);
1846 1913  }
1847 1914  
1848 1915  int
1849 1916  zvol_busy(void)
1850 1917  {
1851 1918          return (zvol_minors != 0);
1852 1919  }
1853 1920  
1854 1921  void
1855 1922  zvol_init(void)
1856 1923  {
1857 1924          VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1858 1925              1) == 0);
1859 1926          mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
1860 1927  }
1861 1928  
/*
 * Module teardown: destroy the state lock and the soft-state table
 * created by zvol_init().
 */
void
zvol_fini(void)
{
	mutex_destroy(&zfsdev_state_lock);
	ddi_soft_state_fini(&zfsdev_state);
}
1868 1935  
1869 1936  /*ARGSUSED*/
1870 1937  static int
1871 1938  zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1872 1939  {
1873 1940          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1874 1941  
1875 1942          if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1876 1943                  return (1);
1877 1944          return (0);
1878 1945  }
1879 1946  
/*
 * dsl_sync_task sync function: bump the MULTI_VDEV_CRASH_DUMP feature
 * refcount, marking the feature active on the pool.
 */
/*ARGSUSED*/
static void
zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
}
1888 1955  
/*
 * Prepare a zvol for use as a dump device (or resize an existing dump
 * setup when resize is B_TRUE): free all existing blocks, activate the
 * MULTI_VDEV_CRASH_DUMP feature if required, stash the zvol's original
 * property values in its ZAP object so zvol_dump_fini() can restore
 * them, force dump-friendly properties (no compression, no
 * refreservation, dump checksum), and preallocate the volume.
 *
 * Caller must hold zfsdev_state_lock.  Returns 0 or an errno.
 */
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
	dmu_tx_t *tx;
	int error;
	objset_t *os = zv->zv_objset;
	spa_t *spa = dmu_objset_spa(os);
	vdev_t *vd = spa->spa_root_vdev;
	nvlist_t *nv = NULL;
	uint64_t version = spa_version(spa);
	uint64_t checksum, compress, refresrv, vbs, dedup;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
	ASSERT(vd->vdev_ops == &vdev_root_ops);

	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
	    DMU_OBJECT_END);
	if (error != 0)
		return (error);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

	/*
	 * If the pool on which the dump device is being initialized has more
	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
	 * enabled.  If so, bump that feature's counter to indicate that the
	 * feature is active. We also check the vdev type to handle the
	 * following case:
	 *   # zpool create test raidz disk1 disk2 disk3
	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
	 *   the raidz vdev itself has 3 children.
	 */
	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
		if (!spa_feature_is_enabled(spa,
		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
			return (SET_ERROR(ENOTSUP));
		(void) dsl_sync_task(spa_name(spa),
		    zfs_mvdev_dump_feature_check,
		    zfs_mvdev_dump_activate_feature_sync, NULL,
		    2, ZFS_SPACE_CHECK_RESERVED);
	}

	/*
	 * On first-time setup, read the current property values so they
	 * can be saved in the ZAP below and restored on undumpify.
	 */
	if (!resize) {
		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		if (error == 0) {
			error = dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
			    NULL);
		}
		if (error == 0) {
			error = dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
			    &refresrv, NULL);
		}
		if (error == 0) {
			error = dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
			    NULL);
		}
		if (version >= SPA_VERSION_DEDUP && error == 0) {
			error = dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
		}
	}
	if (error != 0)
		return (error);

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * If we are resizing the dump device then we only need to
	 * update the refreservation to match the newly updated
	 * zvolsize. Otherwise, we save off the original state of the
	 * zvol so that we can restore them if the zvol is ever undumpified.
	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		if (error == 0) {
			error = zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
			    &checksum, tx);
		}
		if (error == 0) {
			error = zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
			    &refresrv, tx);
		}
		if (error == 0) {
			error = zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
			    &vbs, tx);
		}
		/* The dump object itself uses the old max blocksize. */
		if (error == 0) {
			error = dmu_object_set_blocksize(
			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
		}
		if (version >= SPA_VERSION_DEDUP && error == 0) {
			error = zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
			    &dedup, tx);
		}
		if (error == 0)
			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need update the zvol's property if we are initializing
	 * the dump area for the first time.
	 */
	if (error == 0 && !resize) {
		/*
		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
		 * function.  Otherwise, use the old default -- OFF.
		 */
		checksum = spa_feature_is_active(spa,
		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
		    ZIO_CHECKSUM_OFF;

		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
		    ZIO_COMPRESS_OFF) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
		    checksum) == 0);
		if (version >= SPA_VERSION_DEDUP) {
			/*
			 * NOTE(review): ZFS_PROP_DEDUP is set from
			 * ZIO_CHECKSUM_OFF here rather than a dedup
			 * constant -- confirm the enum values coincide
			 * with the intended "dedup off" setting.
			 */
			VERIFY(nvlist_add_uint64(nv,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    ZIO_CHECKSUM_OFF) == 0);
		}

		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
		    nv, NULL);
		nvlist_free(nv);
	}

	/* Allocate the space for the dump */
	if (error == 0)
		error = zvol_prealloc(zv);
	return (error);
}
2047 2114  
2048 2115  static int
2049 2116  zvol_dumpify(zvol_state_t *zv)
2050 2117  {
2051 2118          int error = 0;
2052 2119          uint64_t dumpsize = 0;
2053 2120          dmu_tx_t *tx;
2054 2121          objset_t *os = zv->zv_objset;
2055 2122  
2056 2123          if (zv->zv_flags & ZVOL_RDONLY)
2057 2124                  return (SET_ERROR(EROFS));
2058 2125  
2059 2126          if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2060 2127              8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2061 2128                  boolean_t resize = (dumpsize > 0);
2062 2129  
2063 2130                  if ((error = zvol_dump_init(zv, resize)) != 0) {
2064 2131                          (void) zvol_dump_fini(zv);
2065 2132                          return (error);
2066 2133                  }
2067 2134          }
2068 2135  
2069 2136          /*
2070 2137           * Build up our lba mapping.
2071 2138           */
2072 2139          error = zvol_get_lbas(zv);
2073 2140          if (error) {
2074 2141                  (void) zvol_dump_fini(zv);
2075 2142                  return (error);
2076 2143          }
2077 2144  
2078 2145          tx = dmu_tx_create(os);
2079 2146          dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2080 2147          error = dmu_tx_assign(tx, TXG_WAIT);
2081 2148          if (error) {
2082 2149                  dmu_tx_abort(tx);
2083 2150                  (void) zvol_dump_fini(zv);
2084 2151                  return (error);
2085 2152          }
2086 2153  
2087 2154          zv->zv_flags |= ZVOL_DUMPIFIED;
2088 2155          error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2089 2156              &zv->zv_volsize, tx);
2090 2157          dmu_tx_commit(tx);
2091 2158  
2092 2159          if (error) {
2093 2160                  (void) zvol_dump_fini(zv);
2094 2161                  return (error);
2095 2162          }
2096 2163  
2097 2164          txg_wait_synced(dmu_objset_pool(os), 0);
2098 2165          return (0);
2099 2166  }
2100 2167  
2101 2168  static int
2102 2169  zvol_dump_fini(zvol_state_t *zv)
2103 2170  {
2104 2171          dmu_tx_t *tx;
2105 2172          objset_t *os = zv->zv_objset;
2106 2173          nvlist_t *nv;
2107 2174          int error = 0;
2108 2175          uint64_t checksum, compress, refresrv, vbs, dedup;
2109 2176          uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2110 2177  
2111 2178          /*
2112 2179           * Attempt to restore the zvol back to its pre-dumpified state.
2113 2180           * This is a best-effort attempt as it's possible that not all
2114 2181           * of these properties were initialized during the dumpify process
2115 2182           * (i.e. error during zvol_dump_init).
2116 2183           */
2117 2184  
2118 2185          tx = dmu_tx_create(os);
2119 2186          dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2120 2187          error = dmu_tx_assign(tx, TXG_WAIT);
2121 2188          if (error) {
2122 2189                  dmu_tx_abort(tx);
2123 2190                  return (error);
2124 2191          }
2125 2192          (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2126 2193          dmu_tx_commit(tx);
2127 2194  
2128 2195          (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2129 2196              zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2130 2197          (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2131 2198              zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2132 2199          (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2133 2200              zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2134 2201          (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2135 2202              zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2136 2203  
2137 2204          VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2138 2205          (void) nvlist_add_uint64(nv,
2139 2206              zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2140 2207          (void) nvlist_add_uint64(nv,
2141 2208              zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2142 2209          (void) nvlist_add_uint64(nv,
2143 2210              zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2144 2211          if (version >= SPA_VERSION_DEDUP &&
2145 2212              zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2146 2213              zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2147 2214                  (void) nvlist_add_uint64(nv,
2148 2215                      zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2149 2216          }
2150 2217          (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2151 2218              nv, NULL);
2152 2219          nvlist_free(nv);
2153 2220  
2154 2221          zvol_free_extents(zv);
2155 2222          zv->zv_flags &= ~ZVOL_DUMPIFIED;
2156 2223          (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2157 2224          /* wait for dmu_free_long_range to actually free the blocks */
2158 2225          txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2159 2226          tx = dmu_tx_create(os);
2160 2227          dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2161 2228          error = dmu_tx_assign(tx, TXG_WAIT);
2162 2229          if (error) {
2163 2230                  dmu_tx_abort(tx);
2164 2231                  return (error);
2165 2232          }
2166 2233          if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2167 2234                  zv->zv_volblocksize = vbs;
2168 2235          dmu_tx_commit(tx);
2169 2236  
2170 2237          return (0);
2171 2238  }
  
    | 
      ↓ open down ↓ | 
    326 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX