1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/spa.h>
  29 #include <sys/spa_impl.h>
  30 #include <sys/vdev_file.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/zio.h>
  33 #include <sys/fs/zfs.h>
  34 #include <sys/fm/fs/zfs.h>
  35 #include <sys/fcntl.h>
  36 #include <sys/vnode.h>
  37 #include <sys/dkioc_free_util.h>
  38 #include <sys/abd.h>
  39 
  40 /*
  41  * Virtual device vector for files.
  42  */
  43 
/*
 * Hold operation for file vdevs (installed in the ops vectors below).
 * It does no work of its own; it only asserts that the vdev has a path.
 */
static void
vdev_file_hold(vdev_t *vd)
{
        ASSERT(vd->vdev_path != NULL);
}
  49 
/*
 * Release operation for file vdevs (installed in the ops vectors below).
 * It does no work of its own; it only asserts that the vdev has a path.
 */
static void
vdev_file_rele(vdev_t *vd)
{
        ASSERT(vd->vdev_path != NULL);
}
  55 
  56 static int
  57 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
  58     uint64_t *ashift)
  59 {
  60         vdev_file_t *vf;
  61         vnode_t *vp;
  62         vattr_t vattr;
  63         int error;
  64 
  65         /*
  66          * We must have a pathname, and it must be absolute.
  67          */
  68         if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
  69                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
  70                 return (SET_ERROR(EINVAL));
  71         }
  72 
  73         /*
  74          * Reopen the device if it's not currently open.  Otherwise,
  75          * just update the physical size of the device.
  76          */
  77         if (vd->vdev_tsd != NULL) {
  78                 ASSERT(vd->vdev_reopening);
  79                 vf = vd->vdev_tsd;
  80                 goto skip_open;
  81         }
  82 
  83         vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
  84 
  85         /*
  86          * We always open the files from the root of the global zone, even if
  87          * we're in a local zone.  If the user has gotten to this point, the
  88          * administrator has already decided that the pool should be available
  89          * to local zone users, so the underlying devices should be as well.
  90          */
  91         ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
  92         error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
  93             spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
  94 
  95         if (error) {
  96                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
  97                 return (error);
  98         }
  99 
 100         vf->vf_vnode = vp;
 101 
 102 #ifdef _KERNEL
 103         /*
 104          * Make sure it's a regular file.
 105          */
 106         if (vp->v_type != VREG) {
 107                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 108                 return (SET_ERROR(ENODEV));
 109         }
 110 #endif
 111 
 112 skip_open:
 113         /*
 114          * Determine the physical size of the file.
 115          */
 116         vattr.va_mask = AT_SIZE;
 117         error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
 118         if (error) {
 119                 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 120                 return (error);
 121         }
 122 
 123         *max_psize = *psize = vattr.va_size;
 124         *ashift = SPA_MINBLOCKSHIFT;
 125 
 126         return (0);
 127 }
 128 
 129 static void
 130 vdev_file_close(vdev_t *vd)
 131 {
 132         vdev_file_t *vf = vd->vdev_tsd;
 133         caller_context_t ct = {0};
 134 
 135         if (vd->vdev_reopening || vf == NULL)
 136                 return;
 137 
 138         if (vf->vf_vnode != NULL) {
 139                 (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 140                 /*
 141                  * We need to supply caller context with PID zero to fop_close
 142                  * in order to clean properly a share reservation which might
 143                  * have been created by vdev_file_open if nbmand was "on" for
 144                  * underlaying filesystem. Mismatched PIDs in create and delete
 145                  * reservation calls would lead to orphaned reservation.
 146                  */
 147                 (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
 148                     kcred, &ct);
 149                 VN_RELE(vf->vf_vnode);
 150         }
 151 
 152         vd->vdev_delayed_close = B_FALSE;
 153         kmem_free(vf, sizeof (vdev_file_t));
 154         vd->vdev_tsd = NULL;
 155 }
 156 
 157 /*
 158  * Implements the interrupt side for file vdev types. This routine will be
 159  * called when the I/O completes allowing us to transfer the I/O to the
 160  * interrupt taskqs. For consistency, the code structure mimics disk vdev
 161  * types.
 162  */
 163 static void
 164 vdev_file_io_intr(buf_t *bp)
 165 {
 166         vdev_buf_t *vb = (vdev_buf_t *)bp;
 167         zio_t *zio = vb->vb_io;
 168 
 169         zio->io_error = (geterror(bp) != 0 ? EIO : 0);
 170         if (zio->io_error == 0 && bp->b_resid != 0)
 171                 zio->io_error = SET_ERROR(ENOSPC);
 172 
 173         if (zio->io_type == ZIO_TYPE_READ) {
 174                 abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
 175         } else {
 176                 abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
 177         }
 178 
 179         kmem_free(vb, sizeof (vdev_buf_t));
 180         zio_delay_interrupt(zio);
 181 }
 182 
 183 static void
 184 vdev_file_io_strategy(void *arg)
 185 {
 186         buf_t *bp = arg;
 187         vnode_t *vp = bp->b_private;
 188         ssize_t resid;
 189         int error;
 190 
 191         error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE,
 192             vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno),
 193             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 194 
 195         if (error == 0) {
 196                 bp->b_resid = resid;
 197                 biodone(bp);
 198         } else {
 199                 bioerror(bp, error);
 200                 biodone(bp);
 201         }
 202 }
 203 
 204 static void
 205 vdev_file_io_start(zio_t *zio)
 206 {
 207         vdev_t *vd = zio->io_vd;
 208         vdev_file_t *vf = vd->vdev_tsd;
 209         vdev_buf_t *vb;
 210         buf_t *bp;
 211 
 212         if (zio->io_type == ZIO_TYPE_IOCTL) {
 213                 /* XXPOLICY */
 214                 if (!vdev_readable(vd)) {
 215                         zio->io_error = SET_ERROR(ENXIO);
 216                         zio_interrupt(zio);
 217                         return;
 218                 }
 219 
 220                 switch (zio->io_cmd) {
 221                 case DKIOCFLUSHWRITECACHE:
 222                         zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 223                             kcred, NULL);
 224                         break;
 225                 case DKIOCFREE:
 226                 {
 227                         dkioc_free_list_t *dfl = zio->io_private;
 228 
 229                         ASSERT(dfl != NULL);
 230                         for (int i = 0; i < dfl->dfl_num_exts; i++) {
 231                                 struct flock64 flck;
 232                                 int error;
 233 
 234                                 if (dfl->dfl_exts[i].dfle_length == 0)
 235                                         continue;
 236 
 237                                 bzero(&flck, sizeof (flck));
 238                                 flck.l_type = F_FREESP;
 239                                 flck.l_start = dfl->dfl_exts[i].dfle_start +
 240                                     dfl->dfl_offset;
 241                                 flck.l_len = dfl->dfl_exts[i].dfle_length;
 242 
 243                                 error = VOP_SPACE(vf->vf_vnode,
 244                                     F_FREESP, &flck, 0, 0, kcred, NULL);
 245                                 if (error != 0) {
 246                                         zio->io_error = SET_ERROR(error);
 247                                         break;
 248                                 }
 249                         }
 250                         break;
 251                 }
 252                 default:
 253                         zio->io_error = SET_ERROR(ENOTSUP);
 254                 }
 255 
 256                 zio_execute(zio);
 257                 return;
 258         }
 259 
 260         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 261         zio->io_target_timestamp = zio_handle_io_delay(zio);
 262 
 263         vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 264 
 265         vb->vb_io = zio;
 266         bp = &vb->vb_buf;
 267 
 268         bioinit(bp);
 269         bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 270         bp->b_bcount = zio->io_size;
 271 
 272         if (zio->io_type == ZIO_TYPE_READ) {
 273                 bp->b_un.b_addr =
 274                     abd_borrow_buf(zio->io_abd, zio->io_size);
 275         } else {
 276                 bp->b_un.b_addr =
 277                     abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 278         }
 279 
 280         bp->b_lblkno = lbtodb(zio->io_offset);
 281         bp->b_bufsize = zio->io_size;
 282         bp->b_private = vf->vf_vnode;
 283         bp->b_iodone = (int (*)())vdev_file_io_intr;
 284 
 285         VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
 286             TQ_SLEEP), !=, 0);
 287 }
 288 
/*
 * I/O-done operation for file vdevs.  The body is empty: this function
 * performs no work and ignores its argument.
 */
/* ARGSUSED */
static void
vdev_file_io_done(zio_t *zio)
{
}
 294 
/*
 * Operations vector for file-backed vdevs.  The initializers are
 * positional, so their order must match the member order of vdev_ops_t
 * (declared outside this file); the two NULL entries leave their slots
 * without a handler.
 */
vdev_ops_t vdev_file_ops = {
        vdev_file_open,
        vdev_file_close,
        vdev_default_asize,
        vdev_file_io_start,
        vdev_file_io_done,
        NULL,
        vdev_file_hold,
        vdev_file_rele,
        NULL,
        VDEV_TYPE_FILE,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};
 308 
 309 /*
 310  * From userland we access disks just like files.
 311  */
 312 #ifndef _KERNEL
 313 
/*
 * Operations vector for disk vdevs in non-kernel builds (this definition
 * is compiled only when _KERNEL is not defined).  It installs the same
 * vdev_file_* routines as vdev_file_ops and differs only in the type
 * name, VDEV_TYPE_DISK.  The initializers are positional, so their
 * order must match the member order of vdev_ops_t.
 */
vdev_ops_t vdev_disk_ops = {
        vdev_file_open,
        vdev_file_close,
        vdev_default_asize,
        vdev_file_io_start,
        vdev_file_io_done,
        NULL,
        vdev_file_hold,
        vdev_file_rele,
        NULL,
        VDEV_TYPE_DISK,         /* name of this vdev type */
        B_TRUE                  /* leaf vdev */
};
 327 
 328 #endif