Print this page
701 UNMAP support for COMSTAR
Contributed by: Sumit Gupta <sumit.gupta@nexenta.com>
Reviewed by: Garrett D'Amore <garrett@nexenta.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.




  23  */
  24 
  25 /* Portions Copyright 2010 Robert Milkowski */
  26 
  27 /*
  28  * ZFS volume emulation driver.
  29  *
  30  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  31  * Volumes are accessed through the symbolic links named:
  32  *
  33  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  34  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  35  *
  36  * These links are created by the /dev filesystem (sdev_zvolops.c).
  37  * Volumes are persistent through reboot.  No user command needs to be
  38  * run before opening and using a device.
  39  */
  40 
  41 #include <sys/types.h>
  42 #include <sys/param.h>
  43 #include <sys/errno.h>
  44 #include <sys/uio.h>
  45 #include <sys/buf.h>
  46 #include <sys/modctl.h>


 325          * property setting step won't apply to them.
 326          */
 327         VERIFY(nvlist_remove_all(nvprops,
 328             zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 329         (void) nvlist_remove_all(nvprops,
 330             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 331 
 332         error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 333             DMU_OT_NONE, 0, tx);
 334         ASSERT(error == 0);
 335 
 336         error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 337             DMU_OT_NONE, 0, tx);
 338         ASSERT(error == 0);
 339 
 340         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 341         ASSERT(error == 0);
 342 }
 343 
 344 /*


















 345  * Replay a TX_WRITE ZIL transaction that didn't get committed
 346  * after a system failure
 347  */
 348 static int
 349 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 350 {
 351         objset_t *os = zv->zv_objset;
 352         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 353         uint64_t offset, length;
 354         dmu_tx_t *tx;
 355         int error;
 356 
 357         if (byteswap)
 358                 byteswap_uint64_array(lr, sizeof (*lr));
 359 
 360         offset = lr->lr_offset;
 361         length = lr->lr_length;
 362 
 363         /* If it's a dmu_sync() block, write the whole block */
 364         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {


 374         error = dmu_tx_assign(tx, TXG_WAIT);
 375         if (error) {
 376                 dmu_tx_abort(tx);
 377         } else {
 378                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 379                 dmu_tx_commit(tx);
 380         }
 381 
 382         return (error);
 383 }
 384 
 385 /* ARGSUSED */
 386 static int
 387 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
 388 {
 389         return (ENOTSUP);
 390 }
 391 
 392 /*
 393  * Callback vectors for replaying records.
 394  * Only TX_WRITE is needed for zvol.
 395  */
 396 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 397         zvol_replay_err,        /* 0 no such transaction type */
 398         zvol_replay_err,        /* TX_CREATE */
 399         zvol_replay_err,        /* TX_MKDIR */
 400         zvol_replay_err,        /* TX_MKXATTR */
 401         zvol_replay_err,        /* TX_SYMLINK */
 402         zvol_replay_err,        /* TX_REMOVE */
 403         zvol_replay_err,        /* TX_RMDIR */
 404         zvol_replay_err,        /* TX_LINK */
 405         zvol_replay_err,        /* TX_RENAME */
 406         zvol_replay_write,      /* TX_WRITE */
 407         zvol_replay_err,        /* TX_TRUNCATE */
 408         zvol_replay_err,        /* TX_SETATTR */
 409         zvol_replay_err,        /* TX_ACL */
 410         zvol_replay_err,        /* TX_CREATE_ACL */
 411         zvol_replay_err,        /* TX_CREATE_ATTR */
 412         zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
 413         zvol_replay_err,        /* TX_MKDIR_ACL */
 414         zvol_replay_err,        /* TX_MKDIR_ATTR */
 415         zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
 416         zvol_replay_err,        /* TX_WRITE2 */
 417 };
 418 
 419 int
 420 zvol_name2minor(const char *name, minor_t *minor)
 421 {
 422         zvol_state_t *zv;
 423 
 424         mutex_enter(&zfsdev_state_lock);
 425         zv = zvol_minor_lookup(name);
 426         if (minor && zv)
 427                 *minor = zv->zv_minor;


1495 
1496         return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1497 }
1498 
1499 /*
1500  * Entry point for external callers to zvol_log_write
1501  */
1502 void
1503 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1504     boolean_t sync)
1505 {
1506         zvol_state_t *zv = minor_hdl;
1507 
1508         zvol_log_write(zv, tx, off, resid, sync);
1509 }
1510 /*
1511  * END entry points to allow external callers access to the volume.
1512  */
1513 
1514 /*
























1515  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).

1516  */
1517 /*ARGSUSED*/
1518 int
1519 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1520 {
1521         zvol_state_t *zv;
1522         struct dk_cinfo dki;
1523         struct dk_minfo dkm;
1524         struct dk_callback *dkc;
1525         int error = 0;
1526         rl_t *rl;
1527 
1528         mutex_enter(&zfsdev_state_lock);
1529 
1530         zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1531 
1532         if (zv == NULL) {
1533                 mutex_exit(&zfsdev_state_lock);
1534                 return (ENXIO);
1535         }


1614                  */
1615                 error = ENOTSUP;
1616                 break;
1617 
1618         case DKIOCDUMPINIT:
1619                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1620                     RL_WRITER);
1621                 error = zvol_dumpify(zv);
1622                 zfs_range_unlock(rl);
1623                 break;
1624 
1625         case DKIOCDUMPFINI:
1626                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1627                         break;
1628                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1629                     RL_WRITER);
1630                 error = zvol_dump_fini(zv);
1631                 zfs_range_unlock(rl);
1632                 break;
1633 



























































1634         default:
1635                 error = ENOTTY;
1636                 break;
1637 
1638         }
1639         mutex_exit(&zfsdev_state_lock);
1640         return (error);
1641 }
1642 
1643 int
1644 zvol_busy(void)
1645 {
1646         return (zvol_minors != 0);
1647 }
1648 
1649 void
1650 zvol_init(void)
1651 {
1652         VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1653             1) == 0);




   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 


  29 /*
  30  * ZFS volume emulation driver.
  31  *
  32  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  33  * Volumes are accessed through the symbolic links named:
  34  *
  35  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  36  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  37  *
  38  * These links are created by the /dev filesystem (sdev_zvolops.c).
  39  * Volumes are persistent through reboot.  No user command needs to be
  40  * run before opening and using a device.
  41  */
  42 
  43 #include <sys/types.h>
  44 #include <sys/param.h>
  45 #include <sys/errno.h>
  46 #include <sys/uio.h>
  47 #include <sys/buf.h>
  48 #include <sys/modctl.h>


 327          * property setting step won't apply to them.
 328          */
 329         VERIFY(nvlist_remove_all(nvprops,
 330             zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 331         (void) nvlist_remove_all(nvprops,
 332             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 333 
 334         error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 335             DMU_OT_NONE, 0, tx);
 336         ASSERT(error == 0);
 337 
 338         error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 339             DMU_OT_NONE, 0, tx);
 340         ASSERT(error == 0);
 341 
 342         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 343         ASSERT(error == 0);
 344 }
 345 
 346 /*
 347  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 348  * implement DKIOCFREE/free-long-range.
 349  */
 350 static int
 351 zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
 352 {
 353         uint64_t offset, length;
 354 
 355         if (byteswap)
 356                 byteswap_uint64_array(lr, sizeof (*lr));
 357 
 358         offset = lr->lr_offset;
 359         length = lr->lr_length;
 360 
 361         return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
 362 }
 363 
 364 /*
 365  * Replay a TX_WRITE ZIL transaction that didn't get committed
 366  * after a system failure
 367  */
 368 static int
 369 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
 370 {
 371         objset_t *os = zv->zv_objset;
 372         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 373         uint64_t offset, length;
 374         dmu_tx_t *tx;
 375         int error;
 376 
 377         if (byteswap)
 378                 byteswap_uint64_array(lr, sizeof (*lr));
 379 
 380         offset = lr->lr_offset;
 381         length = lr->lr_length;
 382 
 383         /* If it's a dmu_sync() block, write the whole block */
 384         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {


 394         error = dmu_tx_assign(tx, TXG_WAIT);
 395         if (error) {
 396                 dmu_tx_abort(tx);
 397         } else {
 398                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 399                 dmu_tx_commit(tx);
 400         }
 401 
 402         return (error);
 403 }
 404 
 405 /* ARGSUSED */
 406 static int
 407 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
 408 {
 409         return (ENOTSUP);
 410 }
 411 
 412 /*
 413  * Callback vectors for replaying records.
 414  * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 415  */
 416 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 417         zvol_replay_err,        /* 0 no such transaction type */
 418         zvol_replay_err,        /* TX_CREATE */
 419         zvol_replay_err,        /* TX_MKDIR */
 420         zvol_replay_err,        /* TX_MKXATTR */
 421         zvol_replay_err,        /* TX_SYMLINK */
 422         zvol_replay_err,        /* TX_REMOVE */
 423         zvol_replay_err,        /* TX_RMDIR */
 424         zvol_replay_err,        /* TX_LINK */
 425         zvol_replay_err,        /* TX_RENAME */
 426         zvol_replay_write,      /* TX_WRITE */
 427         zvol_replay_truncate,   /* TX_TRUNCATE */
 428         zvol_replay_err,        /* TX_SETATTR */
 429         zvol_replay_err,        /* TX_ACL */
 430         zvol_replay_err,        /* TX_CREATE_ACL */
 431         zvol_replay_err,        /* TX_CREATE_ATTR */
 432         zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
 433         zvol_replay_err,        /* TX_MKDIR_ACL */
 434         zvol_replay_err,        /* TX_MKDIR_ATTR */
 435         zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
 436         zvol_replay_err,        /* TX_WRITE2 */
 437 };
 438 
 439 int
 440 zvol_name2minor(const char *name, minor_t *minor)
 441 {
 442         zvol_state_t *zv;
 443 
 444         mutex_enter(&zfsdev_state_lock);
 445         zv = zvol_minor_lookup(name);
 446         if (minor && zv)
 447                 *minor = zv->zv_minor;


1515 
1516         return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1517 }
1518 
1519 /*
1520  * Entry point for external callers to zvol_log_write
1521  */
1522 void
1523 zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1524     boolean_t sync)
1525 {
1526         zvol_state_t *zv = minor_hdl;
1527 
1528         zvol_log_write(zv, tx, off, resid, sync);
1529 }
1530 /*
1531  * END entry points to allow external callers access to the volume.
1532  */
1533 
1534 /*
1535  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1536  */
1537 static void
1538 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1539     boolean_t sync)
1540 {
1541         itx_t *itx;
1542         lr_truncate_t *lr;
1543         zilog_t *zilog = zv->zv_zilog;
1544 
1545         if (zil_replaying(zilog, tx))
1546                 return;
1547 
1548         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1549         lr = (lr_truncate_t *)&itx->itx_lr;
1550         lr->lr_foid = ZVOL_OBJ;
1551         lr->lr_offset = off;
1552         lr->lr_length = len;
1553 
1554         itx->itx_sync = sync;
1555         zil_itx_assign(zilog, itx, tx);
1556 }
1557 
1558 /*
1559  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1560  * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1561  */
1562 /*ARGSUSED*/
1563 int
1564 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1565 {
1566         zvol_state_t *zv;
1567         struct dk_cinfo dki;
1568         struct dk_minfo dkm;
1569         struct dk_callback *dkc;
1570         int error = 0;
1571         rl_t *rl;
1572 
1573         mutex_enter(&zfsdev_state_lock);
1574 
1575         zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1576 
1577         if (zv == NULL) {
1578                 mutex_exit(&zfsdev_state_lock);
1579                 return (ENXIO);
1580         }


1659                  */
1660                 error = ENOTSUP;
1661                 break;
1662 
1663         case DKIOCDUMPINIT:
1664                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1665                     RL_WRITER);
1666                 error = zvol_dumpify(zv);
1667                 zfs_range_unlock(rl);
1668                 break;
1669 
1670         case DKIOCDUMPFINI:
1671                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1672                         break;
1673                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1674                     RL_WRITER);
1675                 error = zvol_dump_fini(zv);
1676                 zfs_range_unlock(rl);
1677                 break;
1678 
1679         case DKIOCFREE:
1680         {
1681                 dkioc_free_t df;
1682                 dmu_tx_t *tx;
1683 
1684                 if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1685                         error = EFAULT;
1686                         break;
1687                 }
1688 
1689                 /*
1690                  * Apply Postel's Law to length-checking.  If they overshoot,
1691                  * just blank out until the end, if there's a need to blank
1692                  * out anything.
1693                  */
1694                 if (df.df_start >= zv->zv_volsize)
1695                         break;  /* No need to do anything... */
1696                 if (df.df_start + df.df_length > zv->zv_volsize)
1697                         df.df_length = DMU_OBJECT_END;
1698 
1699                 rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1700                     RL_WRITER);
1701                 tx = dmu_tx_create(zv->zv_objset);
1702                 error = dmu_tx_assign(tx, TXG_WAIT);
1703                 if (error != 0) {
1704                         dmu_tx_abort(tx);
1705                 } else {
1706                         zvol_log_truncate(zv, tx, df.df_start,
1707                             df.df_length, B_TRUE);
1708                         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1709                             df.df_start, df.df_length);
1710                         dmu_tx_commit(tx);
1711                 }
1712 
1713                 zfs_range_unlock(rl);
1714 
1715                 if (error == 0) {
1716                         /*
1717                          * If the write-cache is disabled or 'sync' property
1718                          * is set to 'always' then treat this as a synchronous
1719                          * operation (i.e. commit to zil).
1720                          */
1721                         if (!(zv->zv_flags & ZVOL_WCE) ||
1722                             (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1723                                 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1724 
1725                         /*
1726                          * If the caller really wants synchronous writes, and
1727                          * can't wait for them, don't return until the write
1728                          * is done.
1729                          */
1730                         if (df.df_flags & DF_WAIT_SYNC) {
1731                                 txg_wait_synced(
1732                                     dmu_objset_pool(zv->zv_objset), 0);
1733                         }
1734                 }
1735                 break;
1736         }
1737 
1738         default:
1739                 error = ENOTTY;
1740                 break;
1741 
1742         }
1743         mutex_exit(&zfsdev_state_lock);
1744         return (error);
1745 }
1746 
1747 int
1748 zvol_busy(void)
1749 {
1750         return (zvol_minors != 0);
1751 }
1752 
1753 void
1754 zvol_init(void)
1755 {
1756         VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1757             1) == 0);