NEX-19083 backport OS-7314 zil_commit should omit cache thrash
9962 zil_commit should omit cache thrash
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6250 zvol_dump_init() can hold txg open
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Albert Lee <trisk@omniti.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
NEX-1142 move rwlock to vdev to protect vdev_tsd
not just ldi handle.
This way we serialize open/close, yet allow parallel I/O.
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-1065 Added serialization to avoid race
between ldi notification and I/O path.
Also fixes OS-124, NEX-1051, NEX-1062.


   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.

  29  * Copyright (c) 2014 Integros [integros.com]
  30  */
  31 
  32 /*
  33  * ZFS volume emulation driver.
  34  *
  35  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  36  * Volumes are accessed through the symbolic links named:
  37  *
  38  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  39  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  40  *
  41  * These links are created by the /dev filesystem (sdev_zvolops.c).
  42  * Volumes are persistent through reboot.  No user command needs to be
  43  * run before opening and using a device.
  44  */
  45 
  46 #include <sys/types.h>
  47 #include <sys/param.h>
  48 #include <sys/errno.h>


  71 #include <sys/crc32.h>
  72 #include <sys/dirent.h>
  73 #include <sys/policy.h>
  74 #include <sys/fs/zfs.h>
  75 #include <sys/zfs_ioctl.h>
  76 #include <sys/mkdev.h>
  77 #include <sys/zil.h>
  78 #include <sys/refcount.h>
  79 #include <sys/zfs_znode.h>
  80 #include <sys/zfs_rlock.h>
  81 #include <sys/vdev_disk.h>
  82 #include <sys/vdev_impl.h>
  83 #include <sys/vdev_raidz.h>
  84 #include <sys/zvol.h>
  85 #include <sys/dumphdr.h>
  86 #include <sys/zil_impl.h>
  87 #include <sys/dbuf.h>
  88 #include <sys/dmu_tx.h>
  89 #include <sys/zfeature.h>
  90 #include <sys/zio_checksum.h>

  91 #include <sys/zil_impl.h>
  92 
  93 #include "zfs_namecheck.h"
  94 
  95 void *zfsdev_state;
  96 static char *zvol_tag = "zvol_tag";
  97 
  98 #define ZVOL_DUMPSIZE           "dumpsize"
  99 
 100 /*
 101  * This lock protects the zfsdev_state structure from being modified
 102  * while it's being used, e.g. an open that comes in before a create
 103  * finishes.  It also protects temporary opens of the dataset so that,
 104  * e.g., an open doesn't get a spurious EBUSY.
 105  */
 106 kmutex_t zfsdev_state_lock;
 107 static uint32_t zvol_minors;
 108 
 109 typedef struct zvol_extent {
 110         list_node_t     ze_node;


 955         /*
 956          * If the open count is zero, this is a spurious close.
 957          * That indicates a bug in the kernel / DDI framework.
 958          */
 959         ASSERT(zv->zv_open_count[otyp] != 0);
 960         ASSERT(zv->zv_total_opens != 0);
 961 
 962         /*
 963          * You may get multiple opens, but only one close.
 964          */
 965         zv->zv_open_count[otyp]--;
 966         zv->zv_total_opens--;
 967 
 968         if (zv->zv_total_opens == 0)
 969                 zvol_last_close(zv);
 970 
 971         mutex_exit(&zfsdev_state_lock);
 972         return (error);
 973 }
 974 

 975 static void
 976 zvol_get_done(zgd_t *zgd, int error)
 977 {
 978         if (zgd->zgd_db)
 979                 dmu_buf_rele(zgd->zgd_db, zgd);
 980 
 981         zfs_range_unlock(zgd->zgd_rl);
 982 
 983         if (error == 0 && zgd->zgd_bp)
 984                 zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 985 
 986         kmem_free(zgd, sizeof (zgd_t));
 987 }
 988 
 989 /*
 990  * Get data to generate a TX_WRITE intent log record.
 991  */
 992 static int
 993 zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 994 {
 995         zvol_state_t *zv = arg;
 996         objset_t *os = zv->zv_objset;
 997         uint64_t object = ZVOL_OBJ;
 998         uint64_t offset = lr->lr_offset;
 999         uint64_t size = lr->lr_length;       /* length of user data */
1000         dmu_buf_t *db;
1001         zgd_t *zgd;
1002         int error;
1003 
1004         ASSERT3P(lwb, !=, NULL);
1005         ASSERT3P(zio, !=, NULL);


1052 
1053         zvol_get_done(zgd, error);
1054 
1055         return (error);
1056 }
1057 
1058 /*
1059  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1060  *
1061  * We store data in the log buffers if it's small enough.
1062  * Otherwise we will later flush the data out via dmu_sync().
1063  */
1064 ssize_t zvol_immediate_write_sz = 32768;
1065 
1066 static void
1067 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1068     boolean_t sync)
1069 {
1070         uint32_t blocksize = zv->zv_volblocksize;
1071         zilog_t *zilog = zv->zv_zilog;




1072         itx_wr_state_t write_state;
1073 
1074         if (zil_replaying(zilog, tx))
1075                 return;
1076 
1077         if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)





















1078                 write_state = WR_INDIRECT;


1079         else if (!spa_has_slogs(zilog->zl_spa) &&
1080             resid >= blocksize && blocksize > zvol_immediate_write_sz)
1081                 write_state = WR_INDIRECT;


1082         else if (sync)
1083                 write_state = WR_COPIED;
1084         else
1085                 write_state = WR_NEED_COPY;
1086 
1087         while (resid) {
1088                 itx_t *itx;
1089                 lr_write_t *lr;
1090                 itx_wr_state_t wr_state = write_state;
1091                 ssize_t len = resid;
1092 
1093                 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
1094                         wr_state = WR_NEED_COPY;
1095                 else if (wr_state == WR_INDIRECT)
1096                         len = MIN(blocksize - P2PHASE(off, blocksize), resid);
1097 
1098                 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1099                     (wr_state == WR_COPIED ? len : 0));
1100                 lr = (lr_write_t *)&itx->itx_lr;
1101                 if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,


1111                 lr->lr_offset = off;
1112                 lr->lr_length = len;
1113                 lr->lr_blkoff = 0;
1114                 BP_ZERO(&lr->lr_blkptr);
1115 
1116                 itx->itx_private = zv;
1117                 itx->itx_sync = sync;
1118 
1119                 zil_itx_assign(zilog, itx, tx);
1120 
1121                 off += len;
1122                 resid -= len;
1123         }
1124 }
1125 
1126 static int
1127 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1128     uint64_t size, boolean_t doread, boolean_t isdump)
1129 {
1130         vdev_disk_t *dvd;
1131         int c;
1132         int numerrors = 0;
1133 
1134         if (vd->vdev_ops == &vdev_mirror_ops ||
1135             vd->vdev_ops == &vdev_replacing_ops ||
1136             vd->vdev_ops == &vdev_spare_ops) {
1137                 for (c = 0; c < vd->vdev_children; c++) {
1138                         int err = zvol_dumpio_vdev(vd->vdev_child[c],
1139                             addr, offset, origoffset, size, doread, isdump);
1140                         if (err != 0) {
1141                                 numerrors++;
1142                         } else if (doread) {
1143                                 break;
1144                         }
1145                 }
1146         }
1147 
1148         if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1149                 return (numerrors < vd->vdev_children ? 0 : EIO);
1150 
1151         if (doread && !vdev_readable(vd))
1152                 return (SET_ERROR(EIO));
1153         else if (!doread && !vdev_writeable(vd))
1154                 return (SET_ERROR(EIO));
1155 
1156         if (vd->vdev_ops == &vdev_raidz_ops) {
1157                 return (vdev_raidz_physio(vd,
1158                     addr, size, offset, origoffset, doread, isdump));
1159         }
1160 
1161         offset += VDEV_LABEL_START_SIZE;
1162 


1163         if (ddi_in_panic() || isdump) {
1164                 ASSERT(!doread);
1165                 if (doread)

1166                         return (SET_ERROR(EIO));
1167                 dvd = vd->vdev_tsd;

1168                 ASSERT3P(dvd, !=, NULL);
1169                 return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1170                     lbtodb(size)));



1171         } else {
1172                 dvd = vd->vdev_tsd;



1173                 ASSERT3P(dvd, !=, NULL);
1174                 return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1175                     offset, doread ? B_READ : B_WRITE));





1176         }



1177 }
1178 
1179 static int
1180 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1181     boolean_t doread, boolean_t isdump)
1182 {
1183         vdev_t *vd;
1184         int error;
1185         zvol_extent_t *ze;
1186         spa_t *spa = dmu_objset_spa(zv->zv_objset);
1187 
1188         /* Must be sector aligned, and not straddle a block boundary. */
1189         if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1190             P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1191                 return (SET_ERROR(EINVAL));
1192         }
1193         ASSERT(size <= zv->zv_volblocksize);
1194 
1195         /* Locate the extent this belongs to */
1196         ze = list_head(&zv->zv_extents);


1763                 break;
1764 
1765         case DKIOCDUMPINIT:
1766                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1767                     RL_WRITER);
1768                 error = zvol_dumpify(zv);
1769                 zfs_range_unlock(rl);
1770                 break;
1771 
1772         case DKIOCDUMPFINI:
1773                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1774                         break;
1775                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1776                     RL_WRITER);
1777                 error = zvol_dump_fini(zv);
1778                 zfs_range_unlock(rl);
1779                 break;
1780 
1781         case DKIOCFREE:
1782         {
1783                 dkioc_free_t df;
1784                 dmu_tx_t *tx;
1785 


1786                 if (!zvol_unmap_enabled)
1787                         break;
1788 
1789                 if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {


1790                         error = SET_ERROR(EFAULT);
1791                         break;
1792                 }



1793 





1794                 /*
1795                  * Apply Postel's Law to length-checking.  If they overshoot,
1796                  * just blank out until the end, if there's a need to blank
1797                  * out anything.
1798                  */
1799                 if (df.df_start >= zv->zv_volsize)
1800                         break;  /* No need to do anything... */




1801 
1802                 mutex_exit(&zfsdev_state_lock);
1803 
1804                 rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1805                     RL_WRITER);
1806                 tx = dmu_tx_create(zv->zv_objset);
1807                 dmu_tx_mark_netfree(tx);
1808                 error = dmu_tx_assign(tx, TXG_WAIT);
1809                 if (error != 0) {
1810                         dmu_tx_abort(tx);
1811                 } else {
1812                         zvol_log_truncate(zv, tx, df.df_start,
1813                             df.df_length, B_TRUE);
1814                         dmu_tx_commit(tx);
1815                         error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1816                             df.df_start, df.df_length);
1817                 }
1818 
1819                 zfs_range_unlock(rl);
1820 




1821                 /*
1822                  * If the write-cache is disabled, 'sync' property
1823                  * is set to 'always', or if the caller is asking for
1824                  * a synchronous free, commit this operation to the zil.
1825                  * This will sync any previous uncommitted writes to the
1826                  * zvol object.
1827                  * Can be overridden by the zvol_unmap_sync_enabled tunable.
1828                  */
1829                 if ((error == 0) && zvol_unmap_sync_enabled &&
1830                     (!(zv->zv_flags & ZVOL_WCE) ||
1831                     (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1832                     (df.df_flags & DF_WAIT_SYNC))) {
1833                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1834                 }
1835 



1836                 return (error);
1837         }
1838 
1839         default:
1840                 error = SET_ERROR(ENOTTY);
1841                 break;
1842 
1843         }
1844         mutex_exit(&zfsdev_state_lock);
1845         return (error);
1846 }
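
The DKIOCFREE case above accepts a single extent described by a dkioc_free_t (df_start, df_length, df_flags) and commits the free to the ZIL when the caller sets DF_WAIT_SYNC or the volume is effectively synchronous. A minimal userland sketch of driving that path follows; it assumes the dkioc_free_t layout and DF_WAIT_SYNC flag from <sys/dkio.h>, and the zvol path is illustrative, not part of this file.

#include <sys/types.h>
#include <sys/dkio.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	dkioc_free_t df;
	int fd;

	/* Any zvol character device will do; this path is made up. */
	fd = open("/dev/zvol/rdsk/tank/vol1", O_RDWR);
	if (fd == -1) {
		perror("open");
		return (1);
	}

	(void) memset(&df, 0, sizeof (df));
	df.df_flags = DF_WAIT_SYNC;	/* ask for a synchronous free */
	df.df_start = 0;		/* byte offset into the volume */
	df.df_length = 1ULL << 20;	/* free the first 1 MiB */

	if (ioctl(fd, DKIOCFREE, &df) == -1)
		perror("DKIOCFREE");

	(void) close(fd);
	return (0);
}
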
1847 
1848 int
1849 zvol_busy(void)
1850 {
1851         return (zvol_minors != 0);
1852 }
1853 
1854 void
1855 zvol_init(void)




   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Portions Copyright 2010 Robert Milkowski
  25  *
  26  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  29  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright (c) 2014 Integros [integros.com]
  31  */
  32 
  33 /*
  34  * ZFS volume emulation driver.
  35  *
  36  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  37  * Volumes are accessed through the symbolic links named:
  38  *
  39  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  40  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  41  *
  42  * These links are created by the /dev filesystem (sdev_zvolops.c).
  43  * Volumes are persistent through reboot.  No user command needs to be
  44  * run before opening and using a device.
  45  */
  46 
  47 #include <sys/types.h>
  48 #include <sys/param.h>
  49 #include <sys/errno.h>


  72 #include <sys/crc32.h>
  73 #include <sys/dirent.h>
  74 #include <sys/policy.h>
  75 #include <sys/fs/zfs.h>
  76 #include <sys/zfs_ioctl.h>
  77 #include <sys/mkdev.h>
  78 #include <sys/zil.h>
  79 #include <sys/refcount.h>
  80 #include <sys/zfs_znode.h>
  81 #include <sys/zfs_rlock.h>
  82 #include <sys/vdev_disk.h>
  83 #include <sys/vdev_impl.h>
  84 #include <sys/vdev_raidz.h>
  85 #include <sys/zvol.h>
  86 #include <sys/dumphdr.h>
  87 #include <sys/zil_impl.h>
  88 #include <sys/dbuf.h>
  89 #include <sys/dmu_tx.h>
  90 #include <sys/zfeature.h>
  91 #include <sys/zio_checksum.h>
  92 #include <sys/dkioc_free_util.h>
  93 #include <sys/zil_impl.h>
  94 
  95 #include "zfs_namecheck.h"
  96 
  97 void *zfsdev_state;
  98 static char *zvol_tag = "zvol_tag";
  99 
 100 #define ZVOL_DUMPSIZE           "dumpsize"
 101 
 102 /*
 103  * This lock protects the zfsdev_state structure from being modified
 104  * while it's being used, e.g. an open that comes in before a create
 105  * finishes.  It also protects temporary opens of the dataset so that,
 106  * e.g., an open doesn't get a spurious EBUSY.
 107  */
 108 kmutex_t zfsdev_state_lock;
 109 static uint32_t zvol_minors;
 110 
 111 typedef struct zvol_extent {
 112         list_node_t     ze_node;


 957         /*
 958          * If the open count is zero, this is a spurious close.
 959          * That indicates a bug in the kernel / DDI framework.
 960          */
 961         ASSERT(zv->zv_open_count[otyp] != 0);
 962         ASSERT(zv->zv_total_opens != 0);
 963 
 964         /*
 965          * You may get multiple opens, but only one close.
 966          */
 967         zv->zv_open_count[otyp]--;
 968         zv->zv_total_opens--;
 969 
 970         if (zv->zv_total_opens == 0)
 971                 zvol_last_close(zv);
 972 
 973         mutex_exit(&zfsdev_state_lock);
 974         return (error);
 975 }
 976 
 977 /* ARGSUSED */
 978 static void
 979 zvol_get_done(zgd_t *zgd, int error)
 980 {
 981         if (zgd->zgd_db)
 982                 dmu_buf_rele(zgd->zgd_db, zgd);
 983 
 984         zfs_range_unlock(zgd->zgd_rl);
 985 



 986         kmem_free(zgd, sizeof (zgd_t));
 987 }
 988 
 989 /*
 990  * Get data to generate a TX_WRITE intent log record.
 991  */
 992 static int
 993 zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 994 {
 995         zvol_state_t *zv = arg;
 996         objset_t *os = zv->zv_objset;
 997         uint64_t object = ZVOL_OBJ;
 998         uint64_t offset = lr->lr_offset;
 999         uint64_t size = lr->lr_length;       /* length of user data */
1000         dmu_buf_t *db;
1001         zgd_t *zgd;
1002         int error;
1003 
1004         ASSERT3P(lwb, !=, NULL);
1005         ASSERT3P(zio, !=, NULL);


1052 
1053         zvol_get_done(zgd, error);
1054 
1055         return (error);
1056 }
1057 
1058 /*
1059  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1060  *
1061  * We store data in the log buffers if it's small enough.
1062  * Otherwise we will later flush the data out via dmu_sync().
1063  */
1064 ssize_t zvol_immediate_write_sz = 32768;
1065 
1066 static void
1067 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1068     boolean_t sync)
1069 {
1070         uint32_t blocksize = zv->zv_volblocksize;
1071         zilog_t *zilog = zv->zv_zilog;
1072         spa_t *spa = zilog->zl_spa;
1073         spa_meta_placement_t *mp = &spa->spa_meta_policy;
1074         boolean_t slogging, zil_to_special, write_to_special;
1075         ssize_t immediate_write_sz;
1076         itx_wr_state_t write_state;
1077 
1078         if (zil_replaying(zilog, tx))
1079                 return;
1080 
1081         /*
1082          * See comments in zfs_log_write()
1083          */
1084 
1085         immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1086             ? 0 : zvol_immediate_write_sz;
1087 
1088         zil_to_special = !spa_has_slogs(spa) &&
1089             spa_can_special_be_used(spa) &&
1090             mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
1091 
1092         write_to_special = !spa_has_slogs(spa) &&
1093             spa_write_data_to_special(spa, zilog->zl_os) &&
1094             (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
1095             (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
1096             spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
1097 
1098         slogging = (spa_has_slogs(spa) || zil_to_special) &&
1099             (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1100 
1101         if (blocksize > immediate_write_sz && !slogging &&
1102             resid >= blocksize && off % blocksize == 0)
1103                 write_state = WR_INDIRECT;
1104         else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1105                 write_state = WR_INDIRECT;
1106         else if (!spa_has_slogs(zilog->zl_spa) &&
1107             resid >= blocksize && blocksize > zvol_immediate_write_sz)
1108                 write_state = WR_INDIRECT;
1109         else if (write_to_special)
1110                 write_state = WR_INDIRECT;
1111         else if (sync)
1112                 write_state = WR_COPIED;
1113         else
1114                 write_state = WR_NEED_COPY;
1115 
1116         while (resid) {
1117                 itx_t *itx;
1118                 lr_write_t *lr;
1119                 itx_wr_state_t wr_state = write_state;
1120                 ssize_t len = resid;
1121 
1122                 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
1123                         wr_state = WR_NEED_COPY;
1124                 else if (wr_state == WR_INDIRECT)
1125                         len = MIN(blocksize - P2PHASE(off, blocksize), resid);
1126 
1127                 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1128                     (wr_state == WR_COPIED ? len : 0));
1129                 lr = (lr_write_t *)&itx->itx_lr;
1130                 if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,


1140                 lr->lr_offset = off;
1141                 lr->lr_length = len;
1142                 lr->lr_blkoff = 0;
1143                 BP_ZERO(&lr->lr_blkptr);
1144 
1145                 itx->itx_private = zv;
1146                 itx->itx_sync = sync;
1147 
1148                 zil_itx_assign(zilog, itx, tx);
1149 
1150                 off += len;
1151                 resid -= len;
1152         }
1153 }
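
The dispatch at the top of zvol_log_write() above is easier to follow for the common configuration of no slog device, no usable special vdev, and logbias=latency. The sketch below restates only that case; it is illustrative, the helper name is made up, and the per-itx loop above additionally demotes WR_COPIED to WR_NEED_COPY when resid exceeds ZIL_MAX_COPIED_DATA.

/*
 * Illustrative restatement (not driver code): with no slog, no special
 * vdev, and logbias=latency, immediate_write_sz is zvol_immediate_write_sz
 * (32K by default) and the choice collapses to the three cases below.
 */
static itx_wr_state_t
zvol_wr_state_sketch(uint32_t blocksize, ssize_t resid, boolean_t sync)
{
	/*
	 * Whole large blocks go indirect: dmu_sync() writes the data in
	 * place and the TX_WRITE record carries only the block pointer.
	 */
	if (blocksize > zvol_immediate_write_sz && resid >= blocksize)
		return (WR_INDIRECT);

	/* Small synchronous writes are copied into the log record itself. */
	if (sync)
		return (WR_COPIED);

	/* Async writes are copied later, only if a zil_commit needs them. */
	return (WR_NEED_COPY);
}
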
1154 
1155 static int
1156 zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1157     uint64_t size, boolean_t doread, boolean_t isdump)
1158 {
1159         vdev_disk_t *dvd;
1160         int c, rc;
1161         int numerrors = 0;
1162 
1163         if (vd->vdev_ops == &vdev_mirror_ops ||
1164             vd->vdev_ops == &vdev_replacing_ops ||
1165             vd->vdev_ops == &vdev_spare_ops) {
1166                 for (c = 0; c < vd->vdev_children; c++) {
1167                         int err = zvol_dumpio_vdev(vd->vdev_child[c],
1168                             addr, offset, origoffset, size, doread, isdump);
1169                         if (err != 0) {
1170                                 numerrors++;
1171                         } else if (doread) {
1172                                 break;
1173                         }
1174                 }
1175         }
1176 
1177         if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1178                 return (numerrors < vd->vdev_children ? 0 : EIO);
1179 
1180         if (doread && !vdev_readable(vd))
1181                 return (SET_ERROR(EIO));
1182         else if (!doread && !vdev_writeable(vd))
1183                 return (SET_ERROR(EIO));
1184 
1185         if (vd->vdev_ops == &vdev_raidz_ops) {
1186                 return (vdev_raidz_physio(vd,
1187                     addr, size, offset, origoffset, doread, isdump));
1188         }
1189 
1190         offset += VDEV_LABEL_START_SIZE;
1191 
1192         rw_enter(&vd->vdev_tsd_lock, RW_READER);
1193         dvd = vd->vdev_tsd;
1194         if (ddi_in_panic() || isdump) {
1195                 ASSERT(!doread);
1196                 if (doread) {
1197                         rw_exit(&vd->vdev_tsd_lock);
1198                         return (SET_ERROR(EIO));
1199                 }
1200                 /* We assume here dvd is not NULL */
1201                 ASSERT3P(dvd, !=, NULL);
1202 
1203                 /* If our assumption is wrong, we do not want to crash */
1204                 if (dvd != NULL && dvd->vd_lh != NULL) {
1205                         rc = ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1206                             lbtodb(size));
1207                 } else {
1208                         rc = SET_ERROR(ENXIO);
1209                 }
1210         } else {
1211                 /* We assume here dvd is not NULL */
1212                 ASSERT3P(dvd, !=, NULL);
1213 
1214                 /* If our assumption is wrong, we do not want to crash */
1215                 if (dvd != NULL && dvd->vd_lh != NULL) {
1216                         rc = vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1217                             offset, doread ? B_READ : B_WRITE);
1218                 } else {
1219                         rc = SET_ERROR(ENXIO);
1220                 }
1221         }
1222         rw_exit(&vd->vdev_tsd_lock);
1223         return (rc);
1224 }
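
The RW_READER hold on vd->vdev_tsd_lock above keeps vdev_tsd, and the LDI handle inside it, stable for the duration of the dump or physio; this is the serialization described by the NEX-1142 and NEX-1065 entries in the log at the top of this page. The writer side lives in the LDI notification/close path, which is not part of this hunk; the sketch below only illustrates the pairing, and the function name is hypothetical.

/*
 * Hypothetical sketch (not from this file): the path that tears down or
 * replaces the LDI handle takes vdev_tsd_lock as RW_WRITER, so readers in
 * zvol_dumpio_vdev() never dereference a half-closed vdev_disk_t.
 */
static void
vdev_disk_invalidate_sketch(vdev_t *vd)
{
	vdev_disk_t *dvd;

	rw_enter(&vd->vdev_tsd_lock, RW_WRITER);
	dvd = vd->vdev_tsd;
	if (dvd != NULL && dvd->vd_lh != NULL) {
		(void) ldi_close(dvd->vd_lh, FREAD | FWRITE, kcred);
		dvd->vd_lh = NULL;
	}
	rw_exit(&vd->vdev_tsd_lock);
}
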
1225 
1226 static int
1227 zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1228     boolean_t doread, boolean_t isdump)
1229 {
1230         vdev_t *vd;
1231         int error;
1232         zvol_extent_t *ze;
1233         spa_t *spa = dmu_objset_spa(zv->zv_objset);
1234 
1235         /* Must be sector aligned, and not straddle a block boundary. */
1236         if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1237             P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1238                 return (SET_ERROR(EINVAL));
1239         }
1240         ASSERT(size <= zv->zv_volblocksize);
1241 
1242         /* Locate the extent this belongs to */
1243         ze = list_head(&zv->zv_extents);


1810                 break;
1811 
1812         case DKIOCDUMPINIT:
1813                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1814                     RL_WRITER);
1815                 error = zvol_dumpify(zv);
1816                 zfs_range_unlock(rl);
1817                 break;
1818 
1819         case DKIOCDUMPFINI:
1820                 if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1821                         break;
1822                 rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1823                     RL_WRITER);
1824                 error = zvol_dump_fini(zv);
1825                 zfs_range_unlock(rl);
1826                 break;
1827 
1828         case DKIOCFREE:
1829         {
1830                 dkioc_free_list_t *dfl;
1831                 dmu_tx_t *tx;
1832 
1833                 mutex_exit(&zfsdev_state_lock);
1834 
1835                 if (!zvol_unmap_enabled)
1836                         return (error); /* lock dropped above */
1837 
1838                 if (!(flag & FKIOCTL)) {
1839                         dfl = dfl_copyin((void *)arg, flag, KM_SLEEP);
1840                         if (dfl == NULL) {
1841                                 error = SET_ERROR(EFAULT);
1842                                 return (error); /* lock dropped above */
1843                         }
1844                 } else {
1845                         dfl = (dkioc_free_list_t *)arg;
1846                 }
1847 
1848                 for (int i = 0; i < dfl->dfl_num_exts; i++) {
1849                         uint64_t start = dfl->dfl_exts[i].dfle_start,
1850                             length = dfl->dfl_exts[i].dfle_length,
1851                             end = start + length;
1852 
1853                         /*
1854                          * Apply Postel's Law to length-checking.  If they
1855                          * overshoot, just blank out until the end, if there's
1856                          * a need to blank out anything.
1857                          */
1858                         if (start >= zv->zv_volsize)
1859                                 continue;       /* No need to do anything... */
1860                         if (end > zv->zv_volsize) {
1861                                 end = DMU_OBJECT_END;
1862                                 length = end - start;
1863                         }
1864 
1865                         rl = zfs_range_lock(&zv->zv_znode, start, length,


1866                             RL_WRITER);
1867                         tx = dmu_tx_create(zv->zv_objset);

1868                         error = dmu_tx_assign(tx, TXG_WAIT);
1869                         if (error != 0) {
1870                                 dmu_tx_abort(tx);
1871                         } else {
1872                                 zvol_log_truncate(zv, tx, start, length,
1873                                     B_TRUE);
1874                                 dmu_tx_commit(tx);
1875                                 error = dmu_free_long_range(zv->zv_objset,
1876                                     ZVOL_OBJ, start, length);
1877                         }
1878 
1879                         zfs_range_unlock(rl);
1880 
1881                         if (error != 0)
1882                                 break;
1883                 }
1884 
1885                 /*
1886                  * If the write-cache is disabled, 'sync' property
1887                  * is set to 'always', or if the caller is asking for
1888                  * a synchronous free, commit this operation to the zil.
1889                  * This will sync any previous uncommitted writes to the
1890                  * zvol object.
1891                  * Can be overridden by the zvol_unmap_sync_enabled tunable.
1892                  */
1893                 if ((error == 0) && zvol_unmap_sync_enabled &&
1894                     (!(zv->zv_flags & ZVOL_WCE) ||
1895                     (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1896                     (dfl->dfl_flags & DF_WAIT_SYNC))) {
1897                         zil_commit(zv->zv_zilog, ZVOL_OBJ);
1898                 }
1899 
1900                 if (!(flag & FKIOCTL))
1901                         dfl_free(dfl);
1902 
1903                 return (error);
1904         }
1905 
1906         default:
1907                 error = SET_ERROR(ENOTTY);
1908                 break;
1909 
1910         }
1911         mutex_exit(&zfsdev_state_lock);
1912         return (error);
1913 }
1914 
1915 int
1916 zvol_busy(void)
1917 {
1918         return (zvol_minors != 0);
1919 }
1920 
1921 void
1922 zvol_init(void)