Print this page
701 UNMAP support for COMSTAR
Contributed by: Sumit Gupta <sumit.gupta@nexenta.com>
Reviewed by: Garrett D'Amore <garrett@nexenta.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>

@@ -18,14 +18,16 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
-/* Portions Copyright 2010 Robert Milkowski */
-
 /*
  * ZFS volume emulation driver.
  *
  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  * Volumes are accessed through the symbolic links named:

@@ -340,10 +342,28 @@
         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
         ASSERT(error == 0);
 }
 
 /*
+ * Replay a TX_TRUNCATE ZIL record: free lr_offset/lr_length of ZVOL_OBJ and
+ * return dmu_free_long_range()'s result.  DKIOCFREE is logged as TX_TRUNCATE.
+ */
+static int
+zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
+{
+        uint64_t offset, length;
+
+        if (byteswap)
+                byteswap_uint64_array(lr, sizeof (*lr)); /* whole record */
+
+        offset = lr->lr_offset;
+        length = lr->lr_length;
+
+        return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
  * Replay a TX_WRITE ZIL transaction that didn't get committed
  * after a system failure
  */
 static int
 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)

@@ -389,11 +409,11 @@
         return (ENOTSUP);
 }
 
 /*
  * Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
  */
 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
         zvol_replay_err,        /* 0 no such transaction type */
         zvol_replay_err,        /* TX_CREATE */
         zvol_replay_err,        /* TX_MKDIR */

@@ -402,11 +422,11 @@
         zvol_replay_err,        /* TX_REMOVE */
         zvol_replay_err,        /* TX_RMDIR */
         zvol_replay_err,        /* TX_LINK */
         zvol_replay_err,        /* TX_RENAME */
         zvol_replay_write,      /* TX_WRITE */
-        zvol_replay_err,        /* TX_TRUNCATE */
+        zvol_replay_truncate,   /* TX_TRUNCATE */
         zvol_replay_err,        /* TX_SETATTR */
         zvol_replay_err,        /* TX_ACL */
         zvol_replay_err,        /* TX_CREATE_ACL */
         zvol_replay_err,        /* TX_CREATE_ATTR */
         zvol_replay_err,        /* TX_CREATE_ACL_ATTR */

@@ -1510,11 +1530,36 @@
 /*
  * END entry points to allow external callers access to the volume.
  */
 
 /*
+ * Log a DKIOCFREE/free-long-range to the ZIL as a TX_TRUNCATE itx (off, len).
+ */
+static void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+    boolean_t sync)
+{
+        itx_t *itx;
+        lr_truncate_t *lr;
+        zilog_t *zilog = zv->zv_zilog;
+
+        if (zil_replaying(zilog, tx))
+                return;         /* no itx is created in this case */
+
+        itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+        lr = (lr_truncate_t *)&itx->itx_lr; /* record inside the itx */
+        lr->lr_foid = ZVOL_OBJ;
+        lr->lr_offset = off;
+        lr->lr_length = len;
+
+        itx->itx_sync = sync;   /* as requested by the caller */
+        zil_itx_assign(zilog, itx, tx);
+}
+
+/*
  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
+ * Also a dirtbag dkio ioctl for unmap/free-block functionality.
  */
 /*ARGSUSED*/
 int
 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
 {

@@ -1629,10 +1674,69 @@
                     RL_WRITER);
                 error = zvol_dump_fini(zv);
                 zfs_range_unlock(rl);
                 break;
 
+        case DKIOCFREE:
+        {
+                dkioc_free_t df;
+                dmu_tx_t *tx;
+
+                if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
+                        error = EFAULT;
+                        break;
+                }
+
+                /*
+                 * Apply Postel's Law to length-checking.  A request starting
+                 * at or past the end of the volume is a no-op; one that runs
+                 * past the end has its length set to DMU_OBJECT_END.
+                 */
+                if (df.df_start >= zv->zv_volsize)
+                        break;  /* No need to do anything... */
+                if (df.df_start + df.df_length > zv->zv_volsize)
+                        df.df_length = DMU_OBJECT_END;
+
+                rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
+                    RL_WRITER);
+                tx = dmu_tx_create(zv->zv_objset);
+                error = dmu_tx_assign(tx, TXG_WAIT);
+                if (error != 0) {
+                        dmu_tx_abort(tx);
+                } else {
+                        zvol_log_truncate(zv, tx, df.df_start,
+                            df.df_length, B_TRUE);
+                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+                            df.df_start, df.df_length);
+                        dmu_tx_commit(tx);
+                }
+
+                zfs_range_unlock(rl);
+
+                if (error == 0) {
+                        /*
+                         * If the write-cache is disabled or 'sync' property
+                         * is set to 'always' then treat this as a synchronous
+                         * operation (i.e. commit to zil).
+                         */
+                        if (!(zv->zv_flags & ZVOL_WCE) ||
+                            (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
+                                zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+                        /*
+                         * If the caller set DF_WAIT_SYNC, it wants the free
+                         * to be synchronous: don't return until
+                         * txg_wait_synced() on the volume's pool completes.
+                         */
+                        if (df.df_flags & DF_WAIT_SYNC) {
+                                txg_wait_synced(
+                                    dmu_objset_pool(zv->zv_objset), 0);
+                        }
+                }
+                break;
+        }
+
         default:
                 error = ENOTTY;
                 break;
 
         }