1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2016 Andrey Sokolov
  26  * Copyright 2016 Toomas Soome <tsoome@me.com>
  27  */
  28 
  29 /*
  30  * lofi (loopback file) driver - allows you to attach a file to a device,
  31  * which can then be accessed through that device. The simple model is that
  32  * you tell lofi to open a file, and then use the block device you get as
  33  * you would any block device. lofi translates access to the block device
  34  * into I/O on the underlying file. This is mostly useful for
  35  * mounting images of filesystems.
  36  *
  37  * lofi is controlled through /dev/lofictl - this is the only device exported
  38  * during attach, and is instance number 0. lofiadm communicates with lofi
  39  * through ioctls on this device. When a file is attached to lofi, block and
  40  * character devices are exported in /dev/lofi and /dev/rlofi. These devices
  41  * are identified by lofi instance number, and the instance number is also used
  42  * as the name in /dev/lofi.
  43  *
 * Virtual disks, or labeled lofi, provide virtual disk support so that
 * partition tables and the related tools can be used. Such mappings cause
 * block and character devices to be exported in the /dev/dsk and /dev/rdsk
 * directories.
  48  *
 * To support virtual disks, the minor number space is divided into two
 * parts: the upper part holds the lofi instance number and the lower part
 * is used to identify partitions and slices. The virtual disk support is
 * implemented by stacking the cmlb module. For virtual disks, the partition
 * related ioctl calls are routed to the cmlb module. Compression and
 * encryption are not supported for virtual disks.
  55  *
  56  * Mapped devices are tracked with state structures handled with
  57  * ddi_soft_state(9F) for simplicity.
  58  *
  59  * A file attached to lofi is opened when attached and not closed until
  60  * explicitly detached from lofi. This seems more sensible than deferring
  61  * the open until the /dev/lofi device is opened, for a number of reasons.
  62  * One is that any failure is likely to be noticed by the person (or script)
  63  * running lofiadm. Another is that it would be a security problem if the
  64  * file was replaced by another one after being added but before being opened.
  65  *
  66  * The only hard part about lofi is the ioctls. In order to support things
  67  * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
  68  * So it has to fake disk geometry and partition information. More may need
  69  * to be faked if your favorite utility doesn't work and you think it should
  70  * (fdformat doesn't work because it really wants to know the type of floppy
  71  * controller to talk to, and that didn't seem easy to fake. Or possibly even
  72  * necessary, since we have mkfs_pcfs now).
  73  *
  74  * Normally, a lofi device cannot be detached if it is open (i.e. busy).  To
  75  * support simulation of hotplug events, an optional force flag is provided.
  76  * If a lofi device is open when a force detach is requested, then the
  77  * underlying file is closed and any subsequent operations return EIO.  When the
  78  * device is closed for the last time, it will be cleaned up at that time.  In
  79  * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
  80  * detached but not removed.
  81  *
  82  * If detach was requested and lofi device is not open, we will perform
  83  * unmap and remove the lofi instance.
  84  *
  85  * If the lofi device is open and the li_cleanup is set on ioctl request,
  86  * we set ls_cleanup flag to notify the cleanup is requested, and the
  87  * last lofi_close will perform the unmapping and this lofi instance will be
  88  * removed.
  89  *
 * If the lofi device is open and li_force is set on the ioctl request,
 * we set the ls_cleanup flag to indicate that cleanup is requested. We
 * also set ls_vp_closereq to tell IO tasks to return EIO on new IO
 * requests, and wait for the in-progress IO count to become 0, indicating
 * there are no more IO requests. Since ls_cleanup is set, the last
 * lofi_close will perform the unmap and this lofi instance will be removed.
 * See also lofi_unmap_file() for details.
  97  *
  98  * Once ls_cleanup is set for the instance, we do not allow lofi_open()
  99  * calls to succeed and can have last lofi_close() to remove the instance.
 100  *
 101  * Known problems:
 102  *
 103  *      UFS logging. Mounting a UFS filesystem image "logging"
 104  *      works for basic copy testing but wedges during a build of ON through
 105  *      that image. Some deadlock in lufs holding the log mutex and then
 106  *      getting stuck on a buf. So for now, don't do that.
 107  *
 108  *      Direct I/O. Since the filesystem data is being cached in the buffer
 109  *      cache, _and_ again in the underlying filesystem, it's tempting to
 110  *      enable direct I/O on the underlying file. Don't, because that deadlocks.
 111  *      I think to fix the cache-twice problem we might need filesystem support.
 112  *
 113  * Interesting things to do:
 114  *
 115  *      Allow multiple files for each device. A poor-man's metadisk, basically.
 116  *
 117  *      Pass-through ioctls on block devices. You can (though it's not
 118  *      documented), give lofi a block device as a file name. Then we shouldn't
 119  *      need to fake a geometry, however, it may be relevant if you're replacing
 120  *      metadisk, or using lofi to get crypto.
 121  *      It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
 122  *      and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
 123  *      In fact this even makes sense if you have lofi "above" metadisk.
 124  *
 125  * Encryption:
 126  *      Each lofi device can have its own symmetric key and cipher.
 127  *      They are passed to us by lofiadm(1m) in the correct format for use
 128  *      with the misc/kcf crypto_* routines.
 129  *
 130  *      Each block has its own IV, that is calculated in lofi_blk_mech(), based
 131  *      on the "master" key held in the lsp and the block number of the buffer.
 132  */
 133 
 134 #include <sys/types.h>
 135 #include <netinet/in.h>
 136 #include <sys/sysmacros.h>
 137 #include <sys/uio.h>
 138 #include <sys/kmem.h>
 139 #include <sys/cred.h>
 140 #include <sys/mman.h>
 141 #include <sys/errno.h>
 142 #include <sys/aio_req.h>
 143 #include <sys/stat.h>
 144 #include <sys/file.h>
 145 #include <sys/modctl.h>
 146 #include <sys/conf.h>
 147 #include <sys/debug.h>
 148 #include <sys/vnode.h>
 149 #include <sys/lofi.h>
 150 #include <sys/lofi_impl.h>        /* for cache structure */
 151 #include <sys/fcntl.h>
 152 #include <sys/pathname.h>
 153 #include <sys/filio.h>
 154 #include <sys/fdio.h>
 155 #include <sys/open.h>
 156 #include <sys/disp.h>
 157 #include <vm/seg_map.h>
 158 #include <sys/ddi.h>
 159 #include <sys/sunddi.h>
 160 #include <sys/zmod.h>
 161 #include <sys/id_space.h>
 162 #include <sys/mkdev.h>
 163 #include <sys/crypto/common.h>
 164 #include <sys/crypto/api.h>
 165 #include <sys/rctl.h>
 166 #include <sys/vtoc.h>
 167 #include <sys/scsi/scsi.h>        /* for DTYPE_DIRECT */
 168 #include <sys/scsi/impl/uscsi.h>
 169 #include <sys/sysevent/dev.h>
 170 #include <LzmaDec.h>
 171 
/*
 * Property name strings.  Their consumers are outside this part of the
 * file; judging by the names they carry the mapping size in blocks, the
 * size in bytes and the owning zone -- TODO confirm at the use sites.
 */
#define NBLOCKS_PROP_NAME       "Nblocks"
#define SIZE_PROP_NAME          "Size"
#define ZONE_PROP_NAME          "zone"
 175 
 176 #define SETUP_C_DATA(cd, buf, len)              \
 177         (cd).cd_format = CRYPTO_DATA_RAW;       \
 178         (cd).cd_offset = 0;                     \
 179         (cd).cd_miscdata = NULL;                \
 180         (cd).cd_length = (len);                 \
 181         (cd).cd_raw.iov_base = (buf);           \
 182         (cd).cd_raw.iov_len = (len);
 183 
 184 #define UIO_CHECK(uio)  \
 185         if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
 186             ((uio)->uio_resid % DEV_BSIZE) != 0) { \
 187                 return (EINVAL); \
 188         }
 189 
/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * timeout expressed in seconds -- confirm at the use site.
 */
#define LOFI_TIMEOUT    30

/* soft state anchor; per-mapping state is looked up by id */
static void *lofi_statep;
static kmutex_t lofi_lock;              /* state lock */
static id_space_t *lofi_id;             /* lofi ID values */
/* all lofi_state structures; walked by lofi_zone_shutdown() */
static list_t lofi_list;
/* NOTE(review): zone key, set up outside this part of the file */
static zone_key_t lofi_zone_key;
 197 
 198 /*
 199  * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 200  * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 201  * high.  If we want to be assured that the underlying device is always busy,
 202  * we must be sure that the number of bytes enqueued when the number of
 203  * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 204  * the duration of the sleep time in taskq_ent_alloc().  That is, lofi should
 205  * set maxalloc to be the maximum throughput (in bytes per second) of the
 206  * underlying device divided by the minimum I/O size.  We assume a realistic
 207  * maximum throughput of one hundred megabytes per second; we set maxalloc on
 208  * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 209  */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
/*
 * Besides sizing the task queue, this value is also the number of entries
 * in each mapping's ls_comp_bufs array (see lofi_destroy()).
 */
static int lofi_taskq_nthreads = 4;     /* # of taskq threads per device */

/*
 * Magic bytes for encrypted images.  NOTE(review): presumably compared
 * against the image header when a file is mapped; the check is outside
 * this part of the file.
 */
const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;
 214 
 215 /*
 216  * To avoid decompressing data in a compressed segment multiple times
 217  * when accessing small parts of a segment's data, we cache and reuse
 218  * the uncompressed segment's data.
 219  *
 220  * A single cached segment is sufficient to avoid lots of duplicate
 221  * segment decompress operations. A small cache size also reduces the
 222  * memory footprint.
 223  *
 224  * lofi_max_comp_cache is the maximum number of decompressed data segments
 225  * cached for each compressed lofi image. It can be set to 0 to disable
 226  * caching.
 227  */
 228 
uint32_t lofi_max_comp_cache = 1;

/* Decompression back ends referenced from lofi_compress_table. */
static int gzip_decompress(void *src, size_t srclen, void *dst,
        size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
        size_t *dstlen, int level);

/*
 * Table of supported compression algorithms.  Each row lists the
 * decompression routine, a second routine slot that is NULL for every
 * entry here (presumably the compression routine -- confirm against
 * lofi_compress_info_t), the level, and the algorithm name.
 */
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
        {gzip_decompress,       NULL,   6,      "gzip"}, /* default */
        {gzip_decompress,       NULL,   6,      "gzip-6"},
        {gzip_decompress,       NULL,   9,      "gzip-9"},
        {lzma_decompress,       NULL,   0,      "lzma"}
};
 243 
/*
 * Forward declarations: the task queue worker that services a buf, and
 * the two cmlb target callbacks defined below.
 */
static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
    size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);

/*
 * cmlb target operations vector: ops version, read/write callback and
 * getinfo callback.  NOTE(review): presumably handed to cmlb when the
 * label handle is attached; that call is outside this part of the file.
 */
struct cmlb_tg_ops lofi_tg_ops = {
        TG_DK_OPS_VERSION_1,
        lofi_tg_rdwr,
        lofi_tg_getinfo
};
 254 
 255 /*ARGSUSED*/
 256 static void
 257 *SzAlloc(void *p, size_t size)
 258 {
 259         return (kmem_alloc(size, KM_SLEEP));
 260 }
 261 
 262 /*ARGSUSED*/
 263 static void
 264 SzFree(void *p, void *address, size_t size)
 265 {
 266         kmem_free(address, size);
 267 }
 268 
/*
 * Allocator callbacks (allocate, free) in ISzAlloc form.
 * NOTE(review): presumably passed to the LZMA decoder from
 * lzma_decompress() -- confirm at the use site.
 */
static ISzAlloc g_Alloc = { SzAlloc, SzFree };
 270 
 271 /*
 272  * Free data referenced by the linked list of cached uncompressed
 273  * segments.
 274  */
 275 static void
 276 lofi_free_comp_cache(struct lofi_state *lsp)
 277 {
 278         struct lofi_comp_cache *lc;
 279 
 280         while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
 281                 kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
 282                 kmem_free(lc, sizeof (struct lofi_comp_cache));
 283                 lsp->ls_comp_cache_count--;
 284         }
 285         ASSERT(lsp->ls_comp_cache_count == 0);
 286 }
 287 
 288 static int
 289 is_opened(struct lofi_state *lsp)
 290 {
 291         int i;
 292         boolean_t last = B_TRUE;
 293 
 294         ASSERT(MUTEX_HELD(&lofi_lock));
 295         for (i = 0; i < LOFI_PART_MAX; i++) {
 296                 if (lsp->ls_open_lyr[i]) {
 297                         last = B_FALSE;
 298                         break;
 299                 }
 300         }
 301 
 302         for (i = 0; last && (i < OTYP_LYR); i++) {
 303                 if (lsp->ls_open_reg[i]) {
 304                         last = B_FALSE;
 305                 }
 306         }
 307 
 308         return (!last);
 309 }
 310 
 311 static void
 312 lofi_set_cleanup(struct lofi_state *lsp)
 313 {
 314         ASSERT(MUTEX_HELD(&lofi_lock));
 315 
 316         lsp->ls_cleanup = B_TRUE;
 317 
 318         /* wake up any threads waiting on dkiocstate */
 319         cv_broadcast(&lsp->ls_vp_cv);
 320 }
 321 
 322 static void
 323 lofi_free_crypto(struct lofi_state *lsp)
 324 {
 325         ASSERT(MUTEX_HELD(&lofi_lock));
 326 
 327         if (lsp->ls_crypto_enabled) {
 328                 /*
 329                  * Clean up the crypto state so that it doesn't hang around
 330                  * in memory after we are done with it.
 331                  */
 332                 if (lsp->ls_key.ck_data != NULL) {
 333                         bzero(lsp->ls_key.ck_data,
 334                             CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
 335                         kmem_free(lsp->ls_key.ck_data,
 336                             CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
 337                         lsp->ls_key.ck_data = NULL;
 338                         lsp->ls_key.ck_length = 0;
 339                 }
 340 
 341                 if (lsp->ls_mech.cm_param != NULL) {
 342                         kmem_free(lsp->ls_mech.cm_param,
 343                             lsp->ls_mech.cm_param_len);
 344                         lsp->ls_mech.cm_param = NULL;
 345                         lsp->ls_mech.cm_param_len = 0;
 346                 }
 347 
 348                 if (lsp->ls_iv_mech.cm_param != NULL) {
 349                         kmem_free(lsp->ls_iv_mech.cm_param,
 350                             lsp->ls_iv_mech.cm_param_len);
 351                         lsp->ls_iv_mech.cm_param = NULL;
 352                         lsp->ls_iv_mech.cm_param_len = 0;
 353                 }
 354 
 355                 mutex_destroy(&lsp->ls_crypto_lock);
 356         }
 357 }
 358 
 359 /* ARGSUSED */
 360 static int
 361 lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
 362     size_t length, void *tg_cookie)
 363 {
 364         struct lofi_state *lsp;
 365         buf_t   *bp;
 366         int     instance;
 367         int     rv = 0;
 368 
 369         instance = ddi_get_instance(dip);
 370         if (instance == 0)      /* control node does not have disk */
 371                 return (ENXIO);
 372 
 373         lsp = ddi_get_soft_state(lofi_statep, instance);
 374 
 375         if (lsp == NULL)
 376                 return (ENXIO);
 377 
 378         if (cmd != TG_READ && cmd != TG_WRITE)
 379                 return (EINVAL);
 380 
 381         /*
 382          * Make sure the mapping is set up by checking lsp->ls_vp_ready.
 383          */
 384         mutex_enter(&lsp->ls_vp_lock);
 385         while (lsp->ls_vp_ready == B_FALSE)
 386                 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
 387         mutex_exit(&lsp->ls_vp_lock);
 388 
 389         if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
 390                 /* We can only transfer whole blocks at a time! */
 391                 return (EINVAL);
 392         }
 393 
 394         bp = getrbuf(KM_SLEEP);
 395 
 396         if (cmd == TG_READ) {
 397                 bp->b_flags = B_READ;
 398         } else {
 399                 if (lsp->ls_readonly == B_TRUE) {
 400                         freerbuf(bp);
 401                         return (EROFS);
 402                 }
 403                 bp->b_flags = B_WRITE;
 404         }
 405 
 406         bp->b_un.b_addr = bufaddr;
 407         bp->b_bcount = length;
 408         bp->b_lblkno = start;
 409         bp->b_private = NULL;
 410         bp->b_edev = lsp->ls_dev;
 411 
 412         if (lsp->ls_kstat) {
 413                 mutex_enter(lsp->ls_kstat->ks_lock);
 414                 kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
 415                 mutex_exit(lsp->ls_kstat->ks_lock);
 416         }
 417         (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
 418         (void) biowait(bp);
 419 
 420         rv = geterror(bp);
 421         freerbuf(bp);
 422         return (rv);
 423 }
 424 
 425 /*
 426  * Get device geometry info for cmlb.
 427  *
 428  * We have mapped disk image as virtual block device and have to report
 429  * physical/virtual geometry to cmlb.
 430  *
 431  * So we have two principal cases:
 432  * 1. Uninitialised image without any existing labels,
 433  *    for this case we fabricate the data based on mapped image.
 434  * 2. Image with existing label information.
 435  *    Since we have no information how the image was created (it may be
 436  *    dump from some physical device), we need to rely on label information
 437  *    from image, or we get "corrupted label" errors.
 438  *    NOTE: label can be MBR, MBR+SMI, GPT
 439  */
 440 static int
 441 lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
 442 {
 443         struct lofi_state *lsp;
 444         int instance;
 445         int ashift;
 446 
 447         _NOTE(ARGUNUSED(tg_cookie));
 448         instance = ddi_get_instance(dip);
 449         if (instance == 0)              /* control device has no storage */
 450                 return (ENXIO);
 451 
 452         lsp = ddi_get_soft_state(lofi_statep, instance);
 453 
 454         if (lsp == NULL)
 455                 return (ENXIO);
 456 
 457         /*
 458          * Make sure the mapping is set up by checking lsp->ls_vp_ready.
 459          *
 460          * When mapping is created, new lofi instance is created and
 461          * lofi_attach() will call cmlb_attach() as part of the procedure
 462          * to set the mapping up. This chain of events will happen in
 463          * the same thread.
 464          * Since cmlb_attach() will call lofi_tg_getinfo to get
 465          * capacity, we return error on that call if cookie is set,
 466          * otherwise lofi_attach will be stuck as the mapping is not yet
 467          * finalized and lofi is not yet ready.
 468          * Note, such error is not fatal for cmlb, as the label setup
 469          * will be finalized when cmlb_validate() is called.
 470          */
 471         mutex_enter(&lsp->ls_vp_lock);
 472         if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
 473                 mutex_exit(&lsp->ls_vp_lock);
 474                 return (ENXIO);
 475         }
 476         while (lsp->ls_vp_ready == B_FALSE)
 477                 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
 478         mutex_exit(&lsp->ls_vp_lock);
 479 
 480         ashift = lsp->ls_lbshift;
 481 
 482         switch (cmd) {
 483         case TG_GETPHYGEOM: {
 484                 cmlb_geom_t *geomp = arg;
 485 
 486                 geomp->g_capacity    =
 487                     (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
 488                 geomp->g_nsect               = lsp->ls_dkg.dkg_nsect;
 489                 geomp->g_nhead               = lsp->ls_dkg.dkg_nhead;
 490                 geomp->g_acyl                = lsp->ls_dkg.dkg_acyl;
 491                 geomp->g_ncyl                = lsp->ls_dkg.dkg_ncyl;
 492                 geomp->g_secsize     = (1U << ashift);
 493                 geomp->g_intrlv              = lsp->ls_dkg.dkg_intrlv;
 494                 geomp->g_rpm         = lsp->ls_dkg.dkg_rpm;
 495                 return (0);
 496         }
 497 
 498         case TG_GETCAPACITY:
 499                 *(diskaddr_t *)arg =
 500                     (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
 501                 return (0);
 502 
 503         case TG_GETBLOCKSIZE:
 504                 *(uint32_t *)arg = (1U << ashift);
 505                 return (0);
 506 
 507         case TG_GETATTR: {
 508                 tg_attribute_t *tgattr = arg;
 509 
 510                 tgattr->media_is_writable = !lsp->ls_readonly;
 511                 tgattr->media_is_solid_state = B_FALSE;
 512                 tgattr->media_is_rotational = B_FALSE;
 513                 return (0);
 514         }
 515 
 516         default:
 517                 return (EINVAL);
 518         }
 519 }
 520 
/*
 * Tear down a mapping and release everything it owns: the task queue,
 * crypto state, compression buffers and cache, the underlying vnode(s),
 * the kstat, the zone accounting/reference, the locks, the device node
 * and finally the lofi id.  The order matters; see the comments below.
 *
 * `credp' is the credential used to flush and close the backing vnode.
 * Caller must hold lofi_lock.
 *
 * NOTE(review): lsp must be treated as gone once this returns; the
 * soft state itself is presumably released as a consequence of
 * ndi_devi_offline() -- confirm in the detach path.
 */
static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
        /* capture the id up front; it is needed after the teardown */
        int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
        int i;

        ASSERT(MUTEX_HELD(&lofi_lock));

        /*
         * Before we can start to release the other resources,
         * make sure we have all tasks completed and taskq removed.
         */
        if (lsp->ls_taskq != NULL) {
                taskq_destroy(lsp->ls_taskq);
                lsp->ls_taskq = NULL;
        }

        list_remove(&lofi_list, lsp);

        lofi_free_crypto(lsp);

        /*
         * Free pre-allocated compressed buffers
         */
        if (lsp->ls_comp_bufs != NULL) {
                /* one entry per taskq thread; unused entries have size 0 */
                for (i = 0; i < lofi_taskq_nthreads; i++) {
                        if (lsp->ls_comp_bufs[i].bufsize > 0)
                                kmem_free(lsp->ls_comp_bufs[i].buf,
                                    lsp->ls_comp_bufs[i].bufsize);
                }
                kmem_free(lsp->ls_comp_bufs,
                    sizeof (struct compbuf) * lofi_taskq_nthreads);
        }

        /* invalidate cached pages, then close and release the vnode */
        if (lsp->ls_vp != NULL) {
                (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL);
                (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
                    1, 0, credp, NULL);
                VN_RELE(lsp->ls_vp);
        }
        /* the stacked vnode holds its own reference only when it differs */
        if (lsp->ls_stacked_vp != lsp->ls_vp)
                VN_RELE(lsp->ls_stacked_vp);
        lsp->ls_vp = lsp->ls_stacked_vp = NULL;

        if (lsp->ls_kstat != NULL) {
                kstat_delete(lsp->ls_kstat);
                lsp->ls_kstat = NULL;
        }

        /*
         * Free cached decompressed segment data
         */
        lofi_free_comp_cache(lsp);
        list_destroy(&lsp->ls_comp_cache);

        /* a non-zero segment size marks an allocated compression index */
        if (lsp->ls_uncomp_seg_sz > 0) {
                kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
                lsp->ls_uncomp_seg_sz = 0;
        }

        /* give the mapping back to the zone's lofi accounting */
        rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
        zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

        mutex_destroy(&lsp->ls_comp_cache_lock);
        mutex_destroy(&lsp->ls_comp_bufs_lock);
        mutex_destroy(&lsp->ls_kstat_lock);
        mutex_destroy(&lsp->ls_vp_lock);
        cv_destroy(&lsp->ls_vp_cv);
        lsp->ls_vp_ready = B_FALSE;
        lsp->ls_vp_closereq = B_FALSE;

        ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
        /* remove the device node, then make the id reusable */
        (void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE);
        id_free(lofi_id, id);
}
 596 
 597 static void
 598 lofi_free_dev(struct lofi_state *lsp)
 599 {
 600         ASSERT(MUTEX_HELD(&lofi_lock));
 601 
 602         if (lsp->ls_cmlbhandle != NULL) {
 603                 cmlb_invalidate(lsp->ls_cmlbhandle, 0);
 604                 cmlb_detach(lsp->ls_cmlbhandle, 0);
 605                 cmlb_free_handle(&lsp->ls_cmlbhandle);
 606                 lsp->ls_cmlbhandle = NULL;
 607         }
 608         (void) ddi_prop_remove_all(lsp->ls_dip);
 609         ddi_remove_minor_node(lsp->ls_dip, NULL);
 610 }
 611 
 612 /*ARGSUSED*/
 613 static void
 614 lofi_zone_shutdown(zoneid_t zoneid, void *arg)
 615 {
 616         struct lofi_state *lsp;
 617         struct lofi_state *next;
 618 
 619         mutex_enter(&lofi_lock);
 620 
 621         for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {
 622 
 623                 /* lofi_destroy() frees lsp */
 624                 next = list_next(&lofi_list, lsp);
 625 
 626                 if (lsp->ls_zone.zref_zone->zone_id != zoneid)
 627                         continue;
 628 
 629                 /*
 630                  * No in-zone processes are running, but something has this
 631                  * open.  It's either a global zone process, or a lofi
 632                  * mount.  In either case we set ls_cleanup so the last
 633                  * user destroys the device.
 634                  */
 635                 if (is_opened(lsp)) {
 636                         lofi_set_cleanup(lsp);
 637                 } else {
 638                         lofi_free_dev(lsp);
 639                         lofi_destroy(lsp, kcred);
 640                 }
 641         }
 642 
 643         mutex_exit(&lofi_lock);
 644 }
 645 
 646 /*ARGSUSED*/
 647 static int
 648 lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
 649 {
 650         int id;
 651         minor_t part;
 652         uint64_t mask;
 653         diskaddr_t nblks;
 654         diskaddr_t lba;
 655         boolean_t ndelay;
 656 
 657         struct lofi_state *lsp;
 658 
 659         if (otyp >= OTYPCNT)
 660                 return (EINVAL);
 661 
 662         ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
 663 
 664         /*
 665          * lofiadm -a /dev/lofi/1 gets us here.
 666          */
 667         if (mutex_owner(&lofi_lock) == curthread)
 668                 return (EINVAL);
 669 
 670         mutex_enter(&lofi_lock);
 671 
 672         id = LOFI_MINOR2ID(getminor(*devp));
 673         part = LOFI_PART(getminor(*devp));
 674         mask = (1U << part);
 675 
 676         /* master control device */
 677         if (id == 0) {
 678                 mutex_exit(&lofi_lock);
 679                 return (0);
 680         }
 681 
 682         /* otherwise, the mapping should already exist */
 683         lsp = ddi_get_soft_state(lofi_statep, id);
 684         if (lsp == NULL) {
 685                 mutex_exit(&lofi_lock);
 686                 return (EINVAL);
 687         }
 688 
 689         if (lsp->ls_cleanup == B_TRUE) {
 690                 mutex_exit(&lofi_lock);
 691                 return (ENXIO);
 692         }
 693 
 694         if (lsp->ls_vp == NULL) {
 695                 mutex_exit(&lofi_lock);
 696                 return (ENXIO);
 697         }
 698 
 699         if (lsp->ls_readonly && (flag & FWRITE)) {
 700                 mutex_exit(&lofi_lock);
 701                 return (EROFS);
 702         }
 703 
 704         if ((lsp->ls_open_excl) & (mask)) {
 705                 mutex_exit(&lofi_lock);
 706                 return (EBUSY);
 707         }
 708 
 709         if (flag & FEXCL) {
 710                 if (lsp->ls_open_lyr[part]) {
 711                         mutex_exit(&lofi_lock);
 712                         return (EBUSY);
 713                 }
 714                 for (int i = 0; i < OTYP_LYR; i++) {
 715                         if (lsp->ls_open_reg[i] & mask) {
 716                                 mutex_exit(&lofi_lock);
 717                                 return (EBUSY);
 718                         }
 719                 }
 720         }
 721 
 722         if (lsp->ls_cmlbhandle != NULL) {
 723                 if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
 724                         /*
 725                          * non-blocking opens are allowed to succeed to
 726                          * support format and fdisk to create partitioning.
 727                          */
 728                         if (!ndelay) {
 729                                 mutex_exit(&lofi_lock);
 730                                 return (ENXIO);
 731                         }
 732                 } else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
 733                     NULL, NULL, 0) == 0) {
 734                         if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
 735                                 mutex_exit(&lofi_lock);
 736                                 return (ENXIO);
 737                         }
 738                 } else if (!ndelay) {
 739                         mutex_exit(&lofi_lock);
 740                         return (ENXIO);
 741                 }
 742         }
 743 
 744         if (otyp == OTYP_LYR) {
 745                 lsp->ls_open_lyr[part]++;
 746         } else {
 747                 lsp->ls_open_reg[otyp] |= mask;
 748         }
 749         if (flag & FEXCL) {
 750                 lsp->ls_open_excl |= mask;
 751         }
 752 
 753         mutex_exit(&lofi_lock);
 754         return (0);
 755 }
 756 
 757 /*ARGSUSED*/
 758 static int
 759 lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
 760 {
 761         minor_t part;
 762         int id;
 763         uint64_t mask;
 764         struct lofi_state *lsp;
 765 
 766         id = LOFI_MINOR2ID(getminor(dev));
 767         part = LOFI_PART(getminor(dev));
 768         mask = (1U << part);
 769 
 770         mutex_enter(&lofi_lock);
 771         lsp = ddi_get_soft_state(lofi_statep, id);
 772         if (lsp == NULL) {
 773                 mutex_exit(&lofi_lock);
 774                 return (EINVAL);
 775         }
 776 
 777         if (id == 0) {
 778                 mutex_exit(&lofi_lock);
 779                 return (0);
 780         }
 781 
 782         if (lsp->ls_open_excl & mask)
 783                 lsp->ls_open_excl &= ~mask;
 784 
 785         if (otyp == OTYP_LYR) {
 786                 lsp->ls_open_lyr[part]--;
 787         } else {
 788                 lsp->ls_open_reg[otyp] &= ~mask;
 789         }
 790 
 791         /*
 792          * If we forcibly closed the underlying device (li_force), or
 793          * asked for cleanup (li_cleanup), finish up if we're the last
 794          * out of the door.
 795          */
 796         if (!is_opened(lsp) &&
 797             (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) {
 798                 lofi_free_dev(lsp);
 799                 lofi_destroy(lsp, credp);
 800         }
 801 
 802         mutex_exit(&lofi_lock);
 803         return (0);
 804 }
 805 
 806 /*
 807  * Sets the mechanism's initialization vector (IV) if one is needed.
 808  * The IV is computed from the data block number.  lsp->ls_mech is
 809  * altered so that:
 810  *      lsp->ls_mech.cm_param_len is set to the IV len.
 811  *      lsp->ls_mech.cm_param is set to the IV.
 812  */
 813 static int
 814 lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
 815 {
 816         int     ret;
 817         crypto_data_t cdata;
 818         char    *iv;
 819         size_t  iv_len;
 820         size_t  min;
 821         void    *data;
 822         size_t  datasz;
 823 
 824         ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));
 825 
 826         if (lsp == NULL)
 827                 return (CRYPTO_DEVICE_ERROR);
 828 
 829         /* lsp->ls_mech.cm_param{_len} has already been set for static iv */
 830         if (lsp->ls_iv_type == IVM_NONE) {
 831                 return (CRYPTO_SUCCESS);
 832         }
 833 
 834         /*
 835          * if kmem already alloced from previous call and it's the same size
 836          * we need now, just recycle it; allocate new kmem only if we have to
 837          */
 838         if (lsp->ls_mech.cm_param == NULL ||
 839             lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
 840                 iv_len = lsp->ls_iv_len;
 841                 iv = kmem_zalloc(iv_len, KM_SLEEP);
 842         } else {
 843                 iv_len = lsp->ls_mech.cm_param_len;
 844                 iv = lsp->ls_mech.cm_param;
 845                 bzero(iv, iv_len);
 846         }
 847 
 848         switch (lsp->ls_iv_type) {
 849         case IVM_ENC_BLKNO:
 850                 /* iv is not static, lblkno changes each time */
 851                 data = &lblkno;
 852                 datasz = sizeof (lblkno);
 853                 break;
 854         default:
 855                 data = 0;
 856                 datasz = 0;
 857                 break;
 858         }
 859 
 860         /*
 861          * write blkno into the iv buffer padded on the left in case
 862          * blkno ever grows bigger than its current longlong_t size
 863          * or a variation other than blkno is used for the iv data
 864          */
 865         min = MIN(datasz, iv_len);
 866         bcopy(data, iv + (iv_len - min), min);
 867 
 868         /* encrypt the data in-place to get the IV */
 869         SETUP_C_DATA(cdata, iv, iv_len);
 870 
 871         ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
 872             NULL, NULL, NULL);
 873         if (ret != CRYPTO_SUCCESS) {
 874                 cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
 875                     lblkno, ret);
 876                 if (lsp->ls_mech.cm_param != iv)
 877                         kmem_free(iv, iv_len);
 878 
 879                 return (ret);
 880         }
 881 
 882         /* clean up the iv from the last computation */
 883         if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
 884                 kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);
 885 
 886         lsp->ls_mech.cm_param_len = iv_len;
 887         lsp->ls_mech.cm_param = iv;
 888 
 889         return (CRYPTO_SUCCESS);
 890 }
 891 
 892 /*
 893  * Performs encryption and decryption of a chunk of data of size "len",
 894  * one DEV_BSIZE block at a time.  "len" is assumed to be a multiple of
 895  * DEV_BSIZE.
 896  */
 897 static int
 898 lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
 899     caddr_t ciphertext, size_t len, boolean_t op_encrypt)
 900 {
 901         crypto_data_t cdata;
 902         crypto_data_t wdata;
 903         int ret;
 904         longlong_t lblkno = bp->b_lblkno;
 905 
 906         mutex_enter(&lsp->ls_crypto_lock);
 907 
 908         /*
 909          * though we could encrypt/decrypt entire "len" chunk of data, we need
 910          * to break it into DEV_BSIZE pieces to capture blkno incrementing
 911          */
 912         SETUP_C_DATA(cdata, plaintext, len);
 913         cdata.cd_length = DEV_BSIZE;
 914         if (ciphertext != NULL) {               /* not in-place crypto */
 915                 SETUP_C_DATA(wdata, ciphertext, len);
 916                 wdata.cd_length = DEV_BSIZE;
 917         }
 918 
 919         do {
 920                 ret = lofi_blk_mech(lsp, lblkno);
 921                 if (ret != CRYPTO_SUCCESS)
 922                         continue;
 923 
 924                 if (op_encrypt) {
 925                         ret = crypto_encrypt(&lsp->ls_mech, &cdata,
 926                             &lsp->ls_key, NULL,
 927                             ((ciphertext != NULL) ? &wdata : NULL), NULL);
 928                 } else {
 929                         ret = crypto_decrypt(&lsp->ls_mech, &cdata,
 930                             &lsp->ls_key, NULL,
 931                             ((ciphertext != NULL) ? &wdata : NULL), NULL);
 932                 }
 933 
 934                 cdata.cd_offset += DEV_BSIZE;
 935                 if (ciphertext != NULL)
 936                         wdata.cd_offset += DEV_BSIZE;
 937                 lblkno++;
 938         } while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);
 939 
 940         mutex_exit(&lsp->ls_crypto_lock);
 941 
 942         if (ret != CRYPTO_SUCCESS) {
 943                 cmn_err(CE_WARN, "%s failed for block %lld:  (0x%x)",
 944                     op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
 945                     lblkno, ret);
 946         }
 947 
 948         return (ret);
 949 }
 950 
 951 #define RDWR_RAW        1
 952 #define RDWR_BCOPY      2
 953 
 954 static int
 955 lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
 956     struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
 957 {
 958         ssize_t resid;
 959         int isread;
 960         int error;
 961 
 962         /*
 963          * Handles reads/writes for both plain and encrypted lofi
 964          * Note:  offset is already shifted by lsp->ls_crypto_offset
 965          * when it gets here.
 966          */
 967 
 968         isread = bp->b_flags & B_READ;
 969         if (isread) {
 970                 if (method == RDWR_BCOPY) {
 971                         /* DO NOT update bp->b_resid for bcopy */
 972                         bcopy(bcopy_locn, bufaddr, len);
 973                         error = 0;
 974                 } else {                /* RDWR_RAW */
 975                         error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
 976                             offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
 977                             &resid);
 978                         bp->b_resid = resid;
 979                 }
 980                 if (lsp->ls_crypto_enabled && error == 0) {
 981                         if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
 982                             B_FALSE) != CRYPTO_SUCCESS) {
 983                                 /*
 984                                  * XXX: original code didn't set residual
 985                                  * back to len because no error was expected
 986                                  * from bcopy() if encryption is not enabled
 987                                  */
 988                                 if (method != RDWR_BCOPY)
 989                                         bp->b_resid = len;
 990                                 error = EIO;
 991                         }
 992                 }
 993                 return (error);
 994         } else {
 995                 void *iobuf = bufaddr;
 996 
 997                 if (lsp->ls_crypto_enabled) {
 998                         /* don't do in-place crypto to keep bufaddr intact */
 999                         iobuf = kmem_alloc(len, KM_SLEEP);
1000                         if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
1001                             B_TRUE) != CRYPTO_SUCCESS) {
1002                                 kmem_free(iobuf, len);
1003                                 if (method != RDWR_BCOPY)
1004                                         bp->b_resid = len;
1005                                 return (EIO);
1006                         }
1007                 }
1008                 if (method == RDWR_BCOPY) {
1009                         /* DO NOT update bp->b_resid for bcopy */
1010                         bcopy(iobuf, bcopy_locn, len);
1011                         error = 0;
1012                 } else {                /* RDWR_RAW */
1013                         error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
1014                             offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
1015                             &resid);
1016                         bp->b_resid = resid;
1017                 }
1018                 if (lsp->ls_crypto_enabled) {
1019                         kmem_free(iobuf, len);
1020                 }
1021                 return (error);
1022         }
1023 }
1024 
1025 static int
1026 lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
1027     struct lofi_state *lsp)
1028 {
1029         int error;
1030         offset_t alignedoffset, mapoffset;
1031         size_t  xfersize;
1032         int     isread;
1033         int     smflags;
1034         caddr_t mapaddr;
1035         size_t  len;
1036         enum seg_rw srw;
1037         int     save_error;
1038 
1039         /*
1040          * Note:  offset is already shifted by lsp->ls_crypto_offset
1041          * when it gets here.
1042          */
1043         if (lsp->ls_crypto_enabled)
1044                 ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);
1045 
1046         /*
1047          * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
1048          * an 8K boundary, but the buf transfer address may not be
1049          * aligned on more than a 512-byte boundary (we don't enforce
1050          * that even though we could). This matters since the initial
1051          * part of the transfer may not start at offset 0 within the
1052          * segmap'd chunk. So we have to compensate for that with
1053          * 'mapoffset'. Subsequent chunks always start off at the
1054          * beginning, and the last is capped by b_resid
1055          *
1056          * Visually, where "|" represents page map boundaries:
1057          *   alignedoffset (mapaddr begins at this segmap boundary)
1058          *    |   offset (from beginning of file)
1059          *    |    |       len
1060          *    v    v        v
1061          * ===|====X========|====...======|========X====|====
1062          *         /-------------...---------------/
1063          *              ^ bp->b_bcount/bp->b_resid at start
1064          *    /----/--------/----...------/--------/
1065          *      ^       ^       ^   ^           ^
1066          *      |       |       |   |           nth xfersize (<= MAXBSIZE)
1067          *      |       |       2nd thru n-1st xfersize (= MAXBSIZE)
1068          *      |       1st xfersize (<= MAXBSIZE)
1069          *    mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
1070          *
1071          * Notes: "alignedoffset" is "offset" rounded down to nearest
1072          * MAXBSIZE boundary.  "len" is next page boundary of size
1073          * PAGESIZE after "alignedoffset".
1074          */
1075         mapoffset = offset & MAXBOFFSET;
1076         alignedoffset = offset - mapoffset;
1077         bp->b_resid = bp->b_bcount;
1078         isread = bp->b_flags & B_READ;
1079         srw = isread ? S_READ : S_WRITE;
1080         do {
1081                 xfersize = MIN(lsp->ls_vp_comp_size - offset,
1082                     MIN(MAXBSIZE - mapoffset, bp->b_resid));
1083                 len = roundup(mapoffset + xfersize, PAGESIZE);
1084                 mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
1085                     alignedoffset, MAXBSIZE, 1, srw);
1086                 /*
1087                  * Now fault in the pages. This lets us check
1088                  * for errors before we reference mapaddr and
1089                  * try to resolve the fault in bcopy (which would
1090                  * panic instead). And this can easily happen,
1091                  * particularly if you've lofi'd a file over NFS
1092                  * and someone deletes the file on the server.
1093                  */
1094                 error = segmap_fault(kas.a_hat, segkmap, mapaddr,
1095                     len, F_SOFTLOCK, srw);
1096                 if (error) {
1097                         (void) segmap_release(segkmap, mapaddr, 0);
1098                         if (FC_CODE(error) == FC_OBJERR)
1099                                 error = FC_ERRNO(error);
1100                         else
1101                                 error = EIO;
1102                         break;
1103                 }
1104                 /* error may be non-zero for encrypted lofi */
1105                 error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
1106                     RDWR_BCOPY, mapaddr + mapoffset);
1107                 if (error == 0) {
1108                         bp->b_resid -= xfersize;
1109                         bufaddr += xfersize;
1110                         offset += xfersize;
1111                 }
1112                 smflags = 0;
1113                 if (isread) {
1114                         smflags |= SM_FREE;
1115                         /*
1116                          * If we're reading an entire page starting
1117                          * at a page boundary, there's a good chance
1118                          * we won't need it again. Put it on the
1119                          * head of the freelist.
1120                          */
1121                         if (mapoffset == 0 && xfersize == MAXBSIZE)
1122                                 smflags |= SM_DONTNEED;
1123                 } else {
1124                         /*
1125                          * Write back good pages, it is okay to
1126                          * always release asynchronous here as we'll
1127                          * follow with VOP_FSYNC for B_SYNC buffers.
1128                          */
1129                         if (error == 0)
1130                                 smflags |= SM_WRITE | SM_ASYNC;
1131                 }
1132                 (void) segmap_fault(kas.a_hat, segkmap, mapaddr,
1133                     len, F_SOFTUNLOCK, srw);
1134                 save_error = segmap_release(segkmap, mapaddr, smflags);
1135                 if (error == 0)
1136                         error = save_error;
1137                 /* only the first map may start partial */
1138                 mapoffset = 0;
1139                 alignedoffset += MAXBSIZE;
1140         } while ((error == 0) && (bp->b_resid > 0) &&
1141             (offset < lsp->ls_vp_comp_size));
1142 
1143         return (error);
1144 }
1145 
1146 /*
1147  * Check if segment seg_index is present in the decompressed segment
1148  * data cache.
1149  *
1150  * Returns a pointer to the decompressed segment data cache entry if
1151  * found, and NULL when decompressed data for this segment is not yet
1152  * cached.
1153  */
1154 static struct lofi_comp_cache *
1155 lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
1156 {
1157         struct lofi_comp_cache *lc;
1158 
1159         ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));
1160 
1161         for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
1162             lc = list_next(&lsp->ls_comp_cache, lc)) {
1163                 if (lc->lc_index == seg_index) {
1164                         /*
1165                          * Decompressed segment data was found in the
1166                          * cache.
1167                          *
1168                          * The cache uses an LRU replacement strategy;
1169                          * move the entry to head of list.
1170                          */
1171                         list_remove(&lsp->ls_comp_cache, lc);
1172                         list_insert_head(&lsp->ls_comp_cache, lc);
1173                         return (lc);
1174                 }
1175         }
1176         return (NULL);
1177 }
1178 
1179 /*
1180  * Add the data for a decompressed segment at segment index
1181  * seg_index to the cache of the decompressed segments.
1182  *
1183  * Returns a pointer to the cache element structure in case
1184  * the data was added to the cache; returns NULL when the data
1185  * wasn't cached.
1186  */
1187 static struct lofi_comp_cache *
1188 lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
1189     uchar_t *data)
1190 {
1191         struct lofi_comp_cache *lc;
1192 
1193         ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));
1194 
1195         while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
1196                 lc = list_remove_tail(&lsp->ls_comp_cache);
1197                 ASSERT(lc != NULL);
1198                 kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
1199                 kmem_free(lc, sizeof (struct lofi_comp_cache));
1200                 lsp->ls_comp_cache_count--;
1201         }
1202 
1203         /*
1204          * Do not cache when disabled by tunable variable
1205          */
1206         if (lofi_max_comp_cache == 0)
1207                 return (NULL);
1208 
1209         /*
1210          * When the cache has not yet reached the maximum allowed
1211          * number of segments, allocate a new cache element.
1212          * Otherwise the cache is full; reuse the last list element
1213          * (LRU) for caching the decompressed segment data.
1214          *
1215          * The cache element for the new decompressed segment data is
1216          * added to the head of the list.
1217          */
1218         if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
1219                 lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
1220                 lc->lc_data = NULL;
1221                 list_insert_head(&lsp->ls_comp_cache, lc);
1222                 lsp->ls_comp_cache_count++;
1223         } else {
1224                 lc = list_remove_tail(&lsp->ls_comp_cache);
1225                 if (lc == NULL)
1226                         return (NULL);
1227                 list_insert_head(&lsp->ls_comp_cache, lc);
1228         }
1229 
1230         /*
1231          * Free old uncompressed segment data when reusing a cache
1232          * entry.
1233          */
1234         if (lc->lc_data != NULL)
1235                 kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
1236 
1237         lc->lc_data = data;
1238         lc->lc_index = seg_index;
1239         return (lc);
1240 }
1241 
1242 
1243 /*ARGSUSED*/
1244 static int
1245 gzip_decompress(void *src, size_t srclen, void *dst,
1246     size_t *dstlen, int level)
1247 {
1248         ASSERT(*dstlen >= srclen);
1249 
1250         if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
1251                 return (-1);
1252         return (0);
1253 }
1254 
1255 #define LZMA_HEADER_SIZE        (LZMA_PROPS_SIZE + 8)
1256 /*ARGSUSED*/
1257 static int
1258 lzma_decompress(void *src, size_t srclen, void *dst,
1259     size_t *dstlen, int level)
1260 {
1261         size_t insizepure;
1262         void *actual_src;
1263         ELzmaStatus status;
1264 
1265         insizepure = srclen - LZMA_HEADER_SIZE;
1266         actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);
1267 
1268         if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
1269             (const Byte *)actual_src, &insizepure,
1270             (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
1271             &g_Alloc) != SZ_OK) {
1272                 return (-1);
1273         }
1274         return (0);
1275 }
1276 
1277 /*
1278  * This is basically what strategy used to be before we found we
1279  * needed task queues.
1280  */
1281 static void
1282 lofi_strategy_task(void *arg)
1283 {
1284         struct buf *bp = (struct buf *)arg;
1285         int error;
1286         int syncflag = 0;
1287         struct lofi_state *lsp;
1288         offset_t offset;
1289         caddr_t bufaddr;
1290         size_t  len;
1291         size_t  xfersize;
1292         boolean_t bufinited = B_FALSE;
1293 
1294         lsp = ddi_get_soft_state(lofi_statep,
1295             LOFI_MINOR2ID(getminor(bp->b_edev)));
1296 
1297         if (lsp == NULL) {
1298                 error = ENXIO;
1299                 goto errout;
1300         }
1301         if (lsp->ls_kstat) {
1302                 mutex_enter(lsp->ls_kstat->ks_lock);
1303                 kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
1304                 mutex_exit(lsp->ls_kstat->ks_lock);
1305         }
1306 
1307         mutex_enter(&lsp->ls_vp_lock);
1308         lsp->ls_vp_iocount++;
1309         mutex_exit(&lsp->ls_vp_lock);
1310 
1311         bp_mapin(bp);
1312         bufaddr = bp->b_un.b_addr;
1313         offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private)
1314             << lsp->ls_lbshift;        /* offset within file */
1315         if (lsp->ls_crypto_enabled) {
1316                 /* encrypted data really begins after crypto header */
1317                 offset += lsp->ls_crypto_offset;
1318         }
1319         len = bp->b_bcount;
1320         bufinited = B_TRUE;
1321 
1322         if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
1323                 error = EIO;
1324                 goto errout;
1325         }
1326 
1327         /*
1328          * If we're writing and the buffer was not B_ASYNC
1329          * we'll follow up with a VOP_FSYNC() to force any
1330          * asynchronous I/O to stable storage.
1331          */
1332         if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
1333                 syncflag = FSYNC;
1334 
1335         /*
1336          * We used to always use vn_rdwr here, but we cannot do that because
1337          * we might decide to read or write from the underlying
1338          * file during this call, which would be a deadlock because
1339          * we have the rw_lock. So instead we page, unless it's not
1340          * mappable or it's a character device or it's an encrypted lofi.
1341          */
1342         if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
1343             lsp->ls_crypto_enabled) {
1344                 error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
1345                     NULL);
1346         } else if (lsp->ls_uncomp_seg_sz == 0) {
1347                 error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
1348         } else {
1349                 uchar_t *compressed_seg = NULL, *cmpbuf;
1350                 uchar_t *uncompressed_seg = NULL;
1351                 lofi_compress_info_t *li;
1352                 size_t oblkcount;
1353                 ulong_t seglen;
1354                 uint64_t sblkno, eblkno, cmpbytes;
1355                 uint64_t uncompressed_seg_index;
1356                 struct lofi_comp_cache *lc;
1357                 offset_t sblkoff, eblkoff;
1358                 u_offset_t salign, ealign;
1359                 u_offset_t sdiff;
1360                 uint32_t comp_data_sz;
1361                 uint64_t i;
1362                 int j;
1363 
1364                 /*
1365                  * From here on we're dealing primarily with compressed files
1366                  */
1367                 ASSERT(!lsp->ls_crypto_enabled);
1368 
1369                 /*
1370                  * Compressed files can only be read from and
1371                  * not written to
1372                  */
1373                 if (!(bp->b_flags & B_READ)) {
1374                         bp->b_resid = bp->b_bcount;
1375                         error = EROFS;
1376                         goto done;
1377                 }
1378 
1379                 ASSERT(lsp->ls_comp_algorithm_index >= 0);
1380                 li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
1381                 /*
1382                  * Compute starting and ending compressed segment numbers
1383                  * We use only bitwise operations avoiding division and
1384                  * modulus because we enforce the compression segment size
1385                  * to a power of 2
1386                  */
1387                 sblkno = offset >> lsp->ls_comp_seg_shift;
1388                 sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
1389                 eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
1390                 eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);
1391 
1392                 /*
1393                  * Check the decompressed segment cache.
1394                  *
1395                  * The cache is used only when the requested data
1396                  * is within a segment. Requests that cross
1397                  * segment boundaries bypass the cache.
1398                  */
1399                 if (sblkno == eblkno ||
1400                     (sblkno + 1 == eblkno && eblkoff == 0)) {
1401                         /*
1402                          * Request doesn't cross a segment boundary,
1403                          * now check the cache.
1404                          */
1405                         mutex_enter(&lsp->ls_comp_cache_lock);
1406                         lc = lofi_find_comp_data(lsp, sblkno);
1407                         if (lc != NULL) {
1408                                 /*
1409                                  * We've found the decompressed segment
1410                                  * data in the cache; reuse it.
1411                                  */
1412                                 bcopy(lc->lc_data + sblkoff, bufaddr,
1413                                     bp->b_bcount);
1414                                 mutex_exit(&lsp->ls_comp_cache_lock);
1415                                 bp->b_resid = 0;
1416                                 error = 0;
1417                                 goto done;
1418                         }
1419                         mutex_exit(&lsp->ls_comp_cache_lock);
1420                 }
1421 
1422                 /*
1423                  * Align start offset to block boundary for segmap
1424                  */
1425                 salign = lsp->ls_comp_seg_index[sblkno];
1426                 sdiff = salign & (DEV_BSIZE - 1);
1427                 salign -= sdiff;
1428                 if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
1429                         /*
1430                          * We're dealing with the last segment of
1431                          * the compressed file -- the size of this
1432                          * segment *may not* be the same as the
1433                          * segment size for the file
1434                          */
1435                         eblkoff = (offset + bp->b_bcount) &
1436                             (lsp->ls_uncomp_last_seg_sz - 1);
1437                         ealign = lsp->ls_vp_comp_size;
1438                 } else {
1439                         ealign = lsp->ls_comp_seg_index[eblkno + 1];
1440                 }
1441 
1442                 /*
1443                  * Preserve original request parameters
1444                  */
1445                 oblkcount = bp->b_bcount;
1446 
1447                 /*
1448                  * Assign the calculated parameters
1449                  */
1450                 comp_data_sz = ealign - salign;
1451                 bp->b_bcount = comp_data_sz;
1452 
1453                 /*
1454                  * Buffers to hold compressed segments are pre-allocated
1455                  * on a per-thread basis. Find a pre-allocated buffer
1456                  * that is not currently in use and mark it for use.
1457                  */
1458                 mutex_enter(&lsp->ls_comp_bufs_lock);
1459                 for (j = 0; j < lofi_taskq_nthreads; j++) {
1460                         if (lsp->ls_comp_bufs[j].inuse == 0) {
1461                                 lsp->ls_comp_bufs[j].inuse = 1;
1462                                 break;
1463                         }
1464                 }
1465 
1466                 mutex_exit(&lsp->ls_comp_bufs_lock);
1467                 ASSERT(j < lofi_taskq_nthreads);
1468 
1469                 /*
1470                  * If the pre-allocated buffer size does not match
1471                  * the size of the I/O request, re-allocate it with
1472                  * the appropriate size
1473                  */
1474                 if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
1475                         if (lsp->ls_comp_bufs[j].bufsize > 0)
1476                                 kmem_free(lsp->ls_comp_bufs[j].buf,
1477                                     lsp->ls_comp_bufs[j].bufsize);
1478                         lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
1479                             KM_SLEEP);
1480                         lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
1481                 }
1482                 compressed_seg = lsp->ls_comp_bufs[j].buf;
1483 
1484                 /*
1485                  * Map in the calculated number of blocks
1486                  */
1487                 error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
1488                     bp, lsp);
1489 
1490                 bp->b_bcount = oblkcount;
1491                 bp->b_resid = oblkcount;
1492                 if (error != 0)
1493                         goto done;
1494 
1495                 /*
1496                  * decompress compressed blocks start
1497                  */
1498                 cmpbuf = compressed_seg + sdiff;
1499                 for (i = sblkno; i <= eblkno; i++) {
1500                         ASSERT(i < lsp->ls_comp_index_sz - 1);
1501                         uchar_t *useg;
1502 
1503                         /*
1504                          * The last segment is special in that it is
1505                          * most likely not going to be the same
1506                          * (uncompressed) size as the other segments.
1507                          */
1508                         if (i == (lsp->ls_comp_index_sz - 2)) {
1509                                 seglen = lsp->ls_uncomp_last_seg_sz;
1510                         } else {
1511                                 seglen = lsp->ls_uncomp_seg_sz;
1512                         }
1513 
1514                         /*
1515                          * Each of the segment index entries contains
1516                          * the starting block number for that segment.
1517                          * The number of compressed bytes in a segment
1518                          * is thus the difference between the starting
1519                          * block number of this segment and the starting
1520                          * block number of the next segment.
1521                          */
1522                         cmpbytes = lsp->ls_comp_seg_index[i + 1] -
1523                             lsp->ls_comp_seg_index[i];
1524 
1525                         /*
1526                          * The first byte in a compressed segment is a flag
1527                          * that indicates whether this segment is compressed
1528                          * at all.
1529                          *
1530                          * The variable 'useg' is used (instead of
1531                          * uncompressed_seg) in this loop to keep a
1532                          * reference to the uncompressed segment.
1533                          *
1534                          * N.B. If 'useg' is replaced with uncompressed_seg,
1535                          * it leads to memory leaks and heap corruption in
1536                          * corner cases where compressed segments lie
1537                          * adjacent to uncompressed segments.
1538                          */
1539                         if (*cmpbuf == UNCOMPRESSED) {
1540                                 useg = cmpbuf + SEGHDR;
1541                         } else {
1542                                 if (uncompressed_seg == NULL)
1543                                         uncompressed_seg =
1544                                             kmem_alloc(lsp->ls_uncomp_seg_sz,
1545                                             KM_SLEEP);
1546                                 useg = uncompressed_seg;
1547                                 uncompressed_seg_index = i;
1548 
1549                                 if (li->l_decompress((cmpbuf + SEGHDR),
1550                                     (cmpbytes - SEGHDR), uncompressed_seg,
1551                                     &seglen, li->l_level) != 0) {
1552                                         error = EIO;
1553                                         goto done;
1554                                 }
1555                         }
1556 
1557                         /*
1558                          * Determine how much uncompressed data we
1559                          * have to copy and copy it
1560                          */
1561                         xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
1562                         if (i == eblkno)
1563                                 xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);
1564 
1565                         bcopy((useg + sblkoff), bufaddr, xfersize);
1566 
1567                         cmpbuf += cmpbytes;
1568                         bufaddr += xfersize;
1569                         bp->b_resid -= xfersize;
1570                         sblkoff = 0;
1571 
1572                         if (bp->b_resid == 0)
1573                                 break;
1574                 } /* decompress compressed blocks ends */
1575 
1576                 /*
1577                  * Skip to done if there is no uncompressed data to cache
1578                  */
1579                 if (uncompressed_seg == NULL)
1580                         goto done;
1581 
1582                 /*
1583                  * Add the data for the last decompressed segment to
1584                  * the cache.
1585                  *
1586                  * In case the uncompressed segment data was added to (and
1587                  * is referenced by) the cache, make sure we don't free it
1588                  * here.
1589                  */
1590                 mutex_enter(&lsp->ls_comp_cache_lock);
1591                 if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
1592                     uncompressed_seg)) != NULL) {
1593                         uncompressed_seg = NULL;
1594                 }
1595                 mutex_exit(&lsp->ls_comp_cache_lock);
1596 
1597 done:
1598                 if (compressed_seg != NULL) {
1599                         mutex_enter(&lsp->ls_comp_bufs_lock);
1600                         lsp->ls_comp_bufs[j].inuse = 0;
1601                         mutex_exit(&lsp->ls_comp_bufs_lock);
1602                 }
1603                 if (uncompressed_seg != NULL)
1604                         kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
1605         } /* end of handling compressed files */
1606 
1607         if ((error == 0) && (syncflag != 0))
1608                 error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);
1609 
1610 errout:
1611         if (bufinited && lsp->ls_kstat) {
1612                 size_t n_done = bp->b_bcount - bp->b_resid;
1613                 kstat_io_t *kioptr;
1614 
1615                 mutex_enter(lsp->ls_kstat->ks_lock);
1616                 kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
1617                 if (bp->b_flags & B_READ) {
1618                         kioptr->nread += n_done;
1619                         kioptr->reads++;
1620                 } else {
1621                         kioptr->nwritten += n_done;
1622                         kioptr->writes++;
1623                 }
1624                 kstat_runq_exit(kioptr);
1625                 mutex_exit(lsp->ls_kstat->ks_lock);
1626         }
1627 
1628         mutex_enter(&lsp->ls_vp_lock);
1629         if (--lsp->ls_vp_iocount == 0)
1630                 cv_broadcast(&lsp->ls_vp_cv);
1631         mutex_exit(&lsp->ls_vp_lock);
1632 
1633         bioerror(bp, error);
1634         biodone(bp);
1635 }
1636 
/*
 * Strategy routine for lofi block/character I/O.
 *
 * Validates the request described by bp against the state of the lofi
 * device and the bounds of the addressed partition, then hands the buf to
 * lofi_strategy_task() via the per-device task queue. No file I/O is
 * performed in the calling thread (see the comment in the body for why).
 *
 * The function always returns 0; failures are reported to the issuer
 * through bioerror()/biodone() on bp:
 *   ENXIO - no soft state for the minor, cmlb_partinfo() failed, or the
 *           request lies outside the file/partition (writes at exact EOF
 *           included)
 *   EIO   - the backing vnode is gone or a close has been requested
 * A read that starts exactly at EOF completes without error and with
 * b_resid set to b_bcount (nothing transferred).
 */
static int
lofi_strategy(struct buf *bp)
{
        struct lofi_state *lsp;
        offset_t        offset;
        minor_t         part;
        diskaddr_t      p_lba;
        diskaddr_t      p_nblks;
        int             shift;

        /*
         * We cannot just do I/O here, because the current thread
         * _might_ end up back in here because the underlying filesystem
         * wants a buffer, which eventually gets into bio_recycle and
         * might call into lofi to write out a delayed-write buffer.
         * This is bad if the filesystem above lofi is the same as below.
         *
         * We could come up with a complex strategy using threads to
         * do the I/O asynchronously, or we could use task queues. task
         * queues were incredibly easy so they win.
         */

        /* The minor number encodes both the lofi id and the partition. */
        lsp = ddi_get_soft_state(lofi_statep,
            LOFI_MINOR2ID(getminor(bp->b_edev)));
        part = LOFI_PART(getminor(bp->b_edev));

        if (lsp == NULL) {
                bioerror(bp, ENXIO);
                biodone(bp);
                return (0);
        }

        /* Check if we are closing. */
        mutex_enter(&lsp->ls_vp_lock);
        if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
                mutex_exit(&lsp->ls_vp_lock);
                bioerror(bp, EIO);
                biodone(bp);
                return (0);
        }
        mutex_exit(&lsp->ls_vp_lock);

        /*
         * Default to a single "partition" covering the whole file, expressed
         * in logical blocks of size (1 << ls_lbshift).
         */
        shift = lsp->ls_lbshift;
        p_lba = 0;
        p_nblks = lsp->ls_vp_size >> shift;

        /*
         * When a cmlb handle is present, take the start and length of the
         * addressed partition from cmlb instead.
         */
        if (lsp->ls_cmlbhandle != NULL) {
                if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba,
                    NULL, NULL, 0)) {
                        bioerror(bp, ENXIO);
                        biodone(bp);
                        return (0);
                }
        }

        /* start block past partition end? */
        if (bp->b_lblkno > p_nblks) {
                bioerror(bp, ENXIO);
                biodone(bp);
                return (0);
        }

        offset = (bp->b_lblkno+p_lba) << shift;        /* offset within file */

        mutex_enter(&lsp->ls_vp_lock);
        if (lsp->ls_crypto_enabled) {
                /* encrypted data really begins after crypto header */
                offset += lsp->ls_crypto_offset;
        }

        /* make sure we will not pass the file or partition size */
        if (offset == lsp->ls_vp_size ||
            offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) {
                /* EOF */
                if ((bp->b_flags & B_READ) != 0) {
                        /* reads at EOF succeed but transfer nothing */
                        bp->b_resid = bp->b_bcount;
                        bioerror(bp, 0);
                } else {
                        /* writes should fail */
                        bioerror(bp, ENXIO);
                }
                biodone(bp);
                mutex_exit(&lsp->ls_vp_lock);
                return (0);
        }
        /*
         * Reject requests that start beyond the file or partition, or whose
         * end would run past the end of the partition.
         */
        if ((offset > lsp->ls_vp_size) ||
            (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) ||
            ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) {
                bioerror(bp, ENXIO);
                biodone(bp);
                mutex_exit(&lsp->ls_vp_lock);
                return (0);
        }

        mutex_exit(&lsp->ls_vp_lock);

        /* Account for the request entering the wait queue. */
        if (lsp->ls_kstat) {
                mutex_enter(lsp->ls_kstat->ks_lock);
                kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
                mutex_exit(lsp->ls_kstat->ks_lock);
        }
        /* Pass the partition start to the task through b_private. */
        bp->b_private = (void *)(uintptr_t)p_lba;    /* partition start */
        (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
        return (0);
}
1742 
1743 /*ARGSUSED2*/
1744 static int
1745 lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
1746 {
1747         if (getminor(dev) == 0)
1748                 return (EINVAL);
1749         UIO_CHECK(uio);
1750         return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
1751 }
1752 
1753 /*ARGSUSED2*/
1754 static int
1755 lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
1756 {
1757         if (getminor(dev) == 0)
1758                 return (EINVAL);
1759         UIO_CHECK(uio);
1760         return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
1761 }
1762 
1763 /*ARGSUSED2*/
1764 static int
1765 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
1766 {
1767         if (getminor(dev) == 0)
1768                 return (EINVAL);
1769         UIO_CHECK(aio->aio_uio);
1770         return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
1771 }
1772 
1773 /*ARGSUSED2*/
1774 static int
1775 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
1776 {
1777         if (getminor(dev) == 0)
1778                 return (EINVAL);
1779         UIO_CHECK(aio->aio_uio);
1780         return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
1781 }
1782 
1783 /*ARGSUSED*/
1784 static int
1785 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1786 {
1787         struct lofi_state *lsp;
1788         dev_t   dev = (dev_t)arg;
1789         int instance;
1790 
1791         instance = LOFI_MINOR2ID(getminor(dev));
1792         switch (infocmd) {
1793         case DDI_INFO_DEVT2DEVINFO:
1794                 lsp = ddi_get_soft_state(lofi_statep, instance);
1795                 if (lsp == NULL)
1796                         return (DDI_FAILURE);
1797                 *result = lsp->ls_dip;
1798                 return (DDI_SUCCESS);
1799         case DDI_INFO_DEVT2INSTANCE:
1800                 *result = (void *) (intptr_t)instance;
1801                 return (DDI_SUCCESS);
1802         }
1803         return (DDI_FAILURE);
1804 }
1805 
1806 static int
1807 lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled)
1808 {
1809         int error = 0;
1810         int instance = ddi_get_instance(lsp->ls_dip);
1811 
1812         if (labeled == B_TRUE) {
1813                 cmlb_alloc_handle(&lsp->ls_cmlbhandle);
1814                 error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT,
1815                     B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN,
1816                     CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1);
1817 
1818                 if (error != DDI_SUCCESS) {
1819                         cmlb_free_handle(&lsp->ls_cmlbhandle);
1820                         lsp->ls_cmlbhandle = NULL;
1821                         error = ENXIO;
1822                 }
1823         } else {
1824                 /* create minor nodes */
1825                 error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE,
1826                     S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0);
1827                 if (error == DDI_SUCCESS) {
1828                         error = ddi_create_minor_node(lsp->ls_dip,
1829                             LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance),
1830                             DDI_PSEUDO, 0);
1831                         if (error != DDI_SUCCESS) {
1832                                 ddi_remove_minor_node(lsp->ls_dip,
1833                                     LOFI_BLOCK_NODE);
1834                                 error = ENXIO;
1835                         }
1836                 } else
1837                         error = ENXIO;
1838         }
1839         return (error);
1840 }
1841 
1842 static int
1843 lofi_zone_bind(struct lofi_state *lsp)
1844 {
1845         int error = 0;
1846 
1847         mutex_enter(&curproc->p_lock);
1848         if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) {
1849                 mutex_exit(&curproc->p_lock);
1850                 return (error);
1851         }
1852         mutex_exit(&curproc->p_lock);
1853 
1854         if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME,
1855             (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) {
1856                 rctl_decr_lofi(curproc->p_zone, 1);
1857                 error = EINVAL;
1858         } else {
1859                 zone_init_ref(&lsp->ls_zone);
1860                 zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI);
1861         }
1862         return (error);
1863 }
1864 
/*
 * Undo lofi_zone_bind(): remove the ZONE_PROP_NAME property from the
 * device node, return one lofi device to the resource control of the
 * calling process's zone, and drop the ZONE_REF_LOFI reference held in
 * lsp->ls_zone.
 *
 * NOTE(review): the rctl is decremented on curproc's zone rather than on
 * the zone recorded in lsp->ls_zone; this presumably relies on unbind
 * running in the same zone as the bind - confirm against callers.
 */
static void
lofi_zone_unbind(struct lofi_state *lsp)
{
        /* the result of the property removal is deliberately ignored */
        (void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME);
        rctl_decr_lofi(curproc->p_zone, 1);
        zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
}
1872 
/*
 * Bring a non-control lofi instance online.
 *
 * Allocates the soft state for the instance, binds it to the caller's
 * zone, initialises its locks and condition variable, creates the minor
 * nodes (through cmlb when the node carries a "labeled" property),
 * declares kernel-ioctl support and installs an I/O kstat.
 *
 * Returns DDI_SUCCESS on success. On failure everything set up so far is
 * torn down again, including the soft state, and a non-zero value is
 * returned.
 *
 * NOTE(review): the failure values are a mix of errno codes (ENOMEM,
 * ENXIO, whatever lofi_zone_bind() returns) and DDI_FAILURE, so callers
 * must treat any value other than DDI_SUCCESS as failure.
 */
static int
lofi_online_dev(dev_info_t *dip)
{
        boolean_t labeled;
        int     error;
        int     instance = ddi_get_instance(dip);
        struct lofi_state *lsp;

        /* presence of the "labeled" property selects cmlb-managed nodes */
        labeled = B_FALSE;
        if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
                labeled = B_TRUE;

        /* lsp alloc+init, soft state is freed in lofi_detach */
        error = ddi_soft_state_zalloc(lofi_statep, instance);
        if (error == DDI_FAILURE) {
                return (ENOMEM);
        }

        lsp = ddi_get_soft_state(lofi_statep, instance);
        lsp->ls_dip = dip;

        /* nothing but the soft state exists yet, so unwind from "err" */
        if ((error = lofi_zone_bind(lsp)) != 0)
                goto err;

        cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
        mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

        /*
         * No minor nodes exist on this failure path, so the zone binding is
         * undone here and the unwind starts at "lerr", skipping "merr".
         */
        if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) {
                lofi_zone_unbind(lsp);
                goto lerr;
        }

        /* driver handles kernel-issued IOCTLs */
        if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
            DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
                error = DDI_FAILURE;
                goto merr;
        }

        lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance,
            NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid());
        if (lsp->ls_kstat == NULL) {
                /* take back the property created just above */
                (void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip,
                    DDI_KERNEL_IOCTL);
                error = ENOMEM;
                goto merr;
        }

        /* the kstat uses the per-device lock and is also added to the GZ */
        lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
        kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID);
        kstat_install(lsp->ls_kstat);
        return (DDI_SUCCESS);
merr:
        /* undo minor node creation (cmlb or plain) and the zone binding */
        if (lsp->ls_cmlbhandle != NULL) {
                cmlb_detach(lsp->ls_cmlbhandle, 0);
                cmlb_free_handle(&lsp->ls_cmlbhandle);
        }
        ddi_remove_minor_node(dip, NULL);
        lofi_zone_unbind(lsp);
lerr:
        /* undo lock and condition variable initialisation */
        mutex_destroy(&lsp->ls_comp_cache_lock);
        mutex_destroy(&lsp->ls_comp_bufs_lock);
        mutex_destroy(&lsp->ls_kstat_lock);
        mutex_destroy(&lsp->ls_vp_lock);
        cv_destroy(&lsp->ls_vp_cv);
err:
        ddi_soft_state_free(lofi_statep, instance);
        return (error);
}
1945 
1946 static int
1947 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1948 {
1949         int     rv;
1950         int     instance = ddi_get_instance(dip);
1951         struct lofi_state *lsp;
1952 
1953         if (cmd != DDI_ATTACH)
1954                 return (DDI_FAILURE);
1955 
1956         /*
1957          * Instance 0 is control instance, attaching control instance
1958          * will set the lofi up and ready.
1959          */
1960         if (instance == 0) {
1961                 rv = ddi_soft_state_zalloc(lofi_statep, 0);
1962                 if (rv == DDI_FAILURE) {
1963                         return (DDI_FAILURE);
1964                 }
1965                 lsp = ddi_get_soft_state(lofi_statep, instance);
1966                 rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
1967                     DDI_PSEUDO, 0);
1968                 if (rv == DDI_FAILURE) {
1969                         ddi_soft_state_free(lofi_statep, 0);
1970                         return (DDI_FAILURE);
1971                 }
1972                 /* driver handles kernel-issued IOCTLs */
1973                 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
1974                     DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
1975                         ddi_remove_minor_node(dip, NULL);
1976                         ddi_soft_state_free(lofi_statep, 0);
1977                         return (DDI_FAILURE);
1978                 }
1979 
1980                 zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL);
1981 
1982                 lsp->ls_dip = dip;
1983         } else {
1984                 if (lofi_online_dev(dip) == DDI_FAILURE)
1985                         return (DDI_FAILURE);
1986         }
1987 
1988         ddi_report_dev(dip);
1989         return (DDI_SUCCESS);
1990 }
1991 
1992 static int
1993 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1994 {
1995         struct lofi_state *lsp;
1996         int instance = ddi_get_instance(dip);
1997 
1998         if (cmd != DDI_DETACH)
1999                 return (DDI_FAILURE);
2000 
2001         /*
2002          * If the instance is not 0, release state.
2003          * The instance 0 is control device, we can not detach it
2004          * before other instances are detached.
2005          */
2006         if (instance != 0) {
2007                 lsp = ddi_get_soft_state(lofi_statep, instance);
2008                 if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) {
2009                         ddi_soft_state_free(lofi_statep, instance);
2010                         return (DDI_SUCCESS);
2011                 } else
2012                         return (DDI_FAILURE);
2013         }
2014         mutex_enter(&lofi_lock);
2015 
2016         if (!list_is_empty(&lofi_list)) {
2017                 mutex_exit(&lofi_lock);
2018                 return (DDI_FAILURE);
2019         }
2020 
2021         ddi_remove_minor_node(dip, NULL);
2022         ddi_prop_remove_all(dip);
2023 
2024         mutex_exit(&lofi_lock);
2025 
2026         if (zone_key_delete(lofi_zone_key) != 0)
2027                 cmn_err(CE_WARN, "failed to delete zone key");
2028 
2029         ddi_soft_state_free(lofi_statep, 0);
2030 
2031         return (DDI_SUCCESS);
2032 }
2033 
2034 /*
2035  * With the addition of encryption, we must be careful that encryption key is
2036  * wiped before kernel's data structures are freed so it cannot accidentally
2037  * slip out to userland through uninitialized data elsewhere.
2038  */
2039 static void
2040 free_lofi_ioctl(struct lofi_ioctl *klip)
2041 {
2042         /* Make sure this encryption key doesn't stick around */
2043         bzero(klip->li_key, sizeof (klip->li_key));
2044         kmem_free(klip, sizeof (struct lofi_ioctl));
2045 }
2046 
2047 /*
2048  * These two functions simplify the rest of the ioctls that need to copyin/out
2049  * the lofi_ioctl structure.
2050  */
2051 int
2052 copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
2053     int flag)
2054 {
2055         struct lofi_ioctl *klip;
2056         int     error;
2057 
2058         klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
2059         error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
2060         if (error)
2061                 goto err;
2062 
2063         /* ensure NULL termination */
2064         klip->li_filename[MAXPATHLEN-1] = '\0';
2065         klip->li_devpath[MAXPATHLEN-1] = '\0';
2066         klip->li_algorithm[MAXALGLEN-1] = '\0';
2067         klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
2068         klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
2069 
2070         if (klip->li_id > L_MAXMIN32) {
2071                 error = EINVAL;
2072                 goto err;
2073         }
2074 
2075         return (0);
2076 
2077 err:
2078         free_lofi_ioctl(klip);
2079         return (error);
2080 }
2081 
2082 int
2083 copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
2084     int flag)
2085 {
2086         int     error;
2087 
2088         /*
2089          * NOTE: Do NOT copy the crypto_key_t "back" to userland.
2090          * This ensures that an attacker can't trivially find the
2091          * key for a mapping just by issuing the ioctl.
2092          *
2093          * It can still be found by poking around in kmem with mdb(1),
2094          * but there is no point in making it easy when the info isn't
2095          * of any use in this direction anyway.
2096          *
2097          * Either way we don't actually have the raw key stored in
2098          * a form that we can get it anyway, since we just used it
2099          * to create a ctx template and didn't keep "the original".
2100          */
2101         error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
2102         if (error)
2103                 return (EFAULT);
2104         return (0);
2105 }
2106 
2107 static int
2108 lofi_access(struct lofi_state *lsp)
2109 {
2110         ASSERT(MUTEX_HELD(&lofi_lock));
2111         if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
2112                 return (0);
2113         return (EPERM);
2114 }
2115 
2116 /*
2117  * Find the lofi state for the given filename. We compare by vnode to
2118  * allow the global zone visibility into NGZ lofi nodes.
2119  */
2120 static int
2121 file_to_lofi_nocheck(char *filename, boolean_t readonly,
2122     struct lofi_state **lspp)
2123 {
2124         struct lofi_state *lsp;
2125         vnode_t *vp = NULL;
2126         int err = 0;
2127         int rdfiles = 0;
2128 
2129         ASSERT(MUTEX_HELD(&lofi_lock));
2130 
2131         if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
2132             NULLVPP, &vp)) != 0)
2133                 goto out;
2134 
2135         if (vp->v_type == VREG) {
2136                 vnode_t *realvp;
2137                 if (VOP_REALVP(vp, &realvp, NULL) == 0) {
2138                         VN_HOLD(realvp);
2139                         VN_RELE(vp);
2140                         vp = realvp;
2141                 }
2142         }
2143 
2144         for (lsp = list_head(&lofi_list); lsp != NULL;
2145             lsp = list_next(&lofi_list, lsp)) {
2146                 if (lsp->ls_vp == vp) {
2147                         if (lspp != NULL)
2148                                 *lspp = lsp;
2149                         if (lsp->ls_readonly) {
2150                                 rdfiles++;
2151                                 /* Skip if '-r' is specified */
2152                                 if (readonly)
2153                                         continue;
2154                         }
2155                         goto out;
2156                 }
2157         }
2158 
2159         err = ENOENT;
2160 
2161         /*
2162          * If a filename is given as an argument for lofi_unmap, we shouldn't
2163          * allow unmap if there are multiple read-only lofi devices associated
2164          * with this file.
2165          */
2166         if (lspp != NULL) {
2167                 if (rdfiles == 1)
2168                         err = 0;
2169                 else if (rdfiles > 1)
2170                         err = EBUSY;
2171         }
2172 
2173 out:
2174         if (vp != NULL)
2175                 VN_RELE(vp);
2176         return (err);
2177 }
2178 
2179 /*
2180  * Find the minor for the given filename, checking the zone can access
2181  * it.
2182  */
2183 static int
2184 file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
2185 {
2186         int err = 0;
2187 
2188         ASSERT(MUTEX_HELD(&lofi_lock));
2189 
2190         if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
2191                 return (err);
2192 
2193         if ((err = lofi_access(*lspp)) != 0)
2194                 return (err);
2195 
2196         return (0);
2197 }
2198 
2199 /*
2200  * Fakes up a disk geometry based on the size of the file. This is needed
2201  * to support newfs on traditional lofi device, but also will provide
2202  * geometry hint for cmlb.
2203  */
2204 static void
2205 fake_disk_geometry(struct lofi_state *lsp)
2206 {
2207         u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;
2208 
2209         /* dk_geom - see dkio(7I) */
2210         /*
2211          * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
2212          * of sectors), but that breaks programs like fdisk which want to
2213          * partition a disk by cylinder. With one cylinder, you can't create
2214          * an fdisk partition and put pcfs on it for testing (hard to pick
2215          * a number between one and one).
2216          *
2217          * The cheezy floppy test is an attempt to not have too few cylinders
2218          * for a small file, or so many on a big file that you waste space
2219          * for backup superblocks or cylinder group structures.
2220          */
2221         bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg));
2222         if (dsize < (2 * 1024 * 1024)) /* floppy? */
2223                 lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
2224         else
2225                 lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
2226         /* in case file file is < 100k */
2227         if (lsp->ls_dkg.dkg_ncyl == 0)
2228                 lsp->ls_dkg.dkg_ncyl = 1;
2229 
2230         lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl;
2231         lsp->ls_dkg.dkg_nhead = 1;
2232         lsp->ls_dkg.dkg_rpm = 7200;
2233 
2234         lsp->ls_dkg.dkg_nsect = dsize /
2235             (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift);
2236 }
2237 
2238 /*
2239  * build vtoc - see dkio(7I)
2240  *
2241  * Fakes one big partition based on the size of the file. This is needed
2242  * because we allow newfs'ing the traditional lofi device and newfs will
2243  * do several disk ioctls to figure out the geometry and partition information.
2244  * It uses that information to determine the parameters to pass to mkfs.
2245  */
2246 static void
2247 fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt)
2248 {
2249         bzero(vt, sizeof (struct vtoc));
2250         vt->v_sanity = VTOC_SANE;
2251         vt->v_version = V_VERSION;
2252         (void) strncpy(vt->v_volume, LOFI_DRIVER_NAME,
2253             sizeof (vt->v_volume));
2254         vt->v_sectorsz = 1 << lsp->ls_pbshift;
2255         vt->v_nparts = 1;
2256         vt->v_part[0].p_tag = V_UNASSIGNED;
2257 
2258         /*
2259          * A compressed file is read-only, other files can
2260          * be read-write
2261          */
2262         if (lsp->ls_uncomp_seg_sz > 0) {
2263                 vt->v_part[0].p_flag = V_UNMNT | V_RONLY;
2264         } else {
2265                 vt->v_part[0].p_flag = V_UNMNT;
2266         }
2267         vt->v_part[0].p_start = (daddr_t)0;
2268         /*
2269          * The partition size cannot just be the number of sectors, because
2270          * that might not end on a cylinder boundary. And if that's the case,
2271          * newfs/mkfs will print a scary warning. So just figure the size
2272          * based on the number of cylinders and sectors/cylinder.
2273          */
2274         vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
2275             lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
2276 }
2277 
2278 /*
2279  * build dk_cinfo - see dkio(7I)
2280  */
2281 static void
2282 fake_disk_info(dev_t dev, struct dk_cinfo *ci)
2283 {
2284         bzero(ci, sizeof (struct dk_cinfo));
2285         (void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname));
2286         ci->dki_ctype = DKC_SCSI_CCS;
2287         (void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname));
2288         ci->dki_unit = LOFI_MINOR2ID(getminor(dev));
2289         ci->dki_partition = LOFI_PART(getminor(dev));
2290         /*
2291          * newfs uses this to set maxcontig. Must not be < 16, or it
2292          * will be 0 when newfs multiplies it by DEV_BSIZE and divides
2293          * it by the block size. Then tunefs doesn't work because
2294          * maxcontig is 0.
2295          */
2296         ci->dki_maxtransfer = 16;
2297 }
2298 
2299 /*
2300  * map in a compressed file
2301  *
2302  * Read in the header and the index that follows.
2303  *
2304  * The header is as follows -
2305  *
2306  * Signature (name of the compression algorithm)
2307  * Compression segment size (a multiple of 512)
2308  * Number of index entries
2309  * Size of the last block
2310  * The array containing the index entries
2311  *
2312  * The header information is always stored in
2313  * network byte order on disk.
2314  */
2315 static int
2316 lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
2317 {
2318         uint32_t index_sz, header_len, i;
2319         ssize_t resid;
2320         enum uio_rw rw;
2321         char *tbuf = buf;
2322         int error;
2323 
2324         /* The signature has already been read */
2325         tbuf += sizeof (lsp->ls_comp_algorithm);
2326         bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
2327         lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);
2328 
2329         /*
2330          * The compressed segment size must be a power of 2
2331          */
2332         if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
2333             !ISP2(lsp->ls_uncomp_seg_sz))
2334                 return (EINVAL);
2335 
2336         for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
2337                 ;
2338 
2339         lsp->ls_comp_seg_shift = i;
2340 
2341         tbuf += sizeof (lsp->ls_uncomp_seg_sz);
2342         bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
2343         lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);
2344 
2345         tbuf += sizeof (lsp->ls_comp_index_sz);
2346         bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
2347             sizeof (lsp->ls_uncomp_last_seg_sz));
2348         lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);
2349 
2350         /*
2351          * Compute the total size of the uncompressed data
2352          * for use in fake_disk_geometry and other calculations.
2353          * Disk geometry has to be faked with respect to the
2354          * actual uncompressed data size rather than the
2355          * compressed file size.
2356          */
2357         lsp->ls_vp_size =
2358             (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
2359             + lsp->ls_uncomp_last_seg_sz;
2360 
2361         /*
2362          * Index size is rounded up to DEV_BSIZE for ease
2363          * of segmapping
2364          */
2365         index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
2366         header_len = sizeof (lsp->ls_comp_algorithm) +
2367             sizeof (lsp->ls_uncomp_seg_sz) +
2368             sizeof (lsp->ls_comp_index_sz) +
2369             sizeof (lsp->ls_uncomp_last_seg_sz);
2370         lsp->ls_comp_offbase = header_len + index_sz;
2371 
2372         index_sz += header_len;
2373         index_sz = roundup(index_sz, DEV_BSIZE);
2374 
2375         lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
2376         lsp->ls_comp_index_data_sz = index_sz;
2377 
2378         /*
2379          * Read in the index -- this has a side-effect
2380          * of reading in the header as well
2381          */
2382         rw = UIO_READ;
2383         error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
2384             0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
2385 
2386         if (error != 0)
2387                 return (error);
2388 
2389         /* Skip the header, this is where the index really begins */
2390         lsp->ls_comp_seg_index =
2391             /*LINTED*/
2392             (uint64_t *)(lsp->ls_comp_index_data + header_len);
2393 
2394         /*
2395          * Now recompute offsets in the index to account for
2396          * the header length
2397          */
2398         for (i = 0; i < lsp->ls_comp_index_sz; i++) {
2399                 lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
2400                     BE_64(lsp->ls_comp_seg_index[i]);
2401         }
2402 
2403         return (error);
2404 }
2405 
2406 static int
2407 lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
2408 {
2409         struct crypto_meta chead;
2410         char buf[DEV_BSIZE];
2411         ssize_t resid;
2412         char *marker;
2413         int error;
2414         int ret;
2415         int i;
2416 
2417         if (!klip->li_crypto_enabled)
2418                 return (0);
2419 
2420         /*
2421          * All current algorithms have a max of 448 bits.
2422          */
2423         if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
2424                 return (EINVAL);
2425 
2426         if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
2427                 return (EINVAL);
2428 
2429         lsp->ls_crypto_enabled = klip->li_crypto_enabled;
2430 
2431         mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);
2432 
2433         lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
2434         if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
2435                 cmn_err(CE_WARN, "invalid cipher %s requested for %s",
2436                     klip->li_cipher, klip->li_filename);
2437                 return (EINVAL);
2438         }
2439 
2440         /* this is just initialization here */
2441         lsp->ls_mech.cm_param = NULL;
2442         lsp->ls_mech.cm_param_len = 0;
2443 
2444         lsp->ls_iv_type = klip->li_iv_type;
2445         lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
2446         if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
2447                 cmn_err(CE_WARN, "invalid iv cipher %s requested"
2448                     " for %s", klip->li_iv_cipher, klip->li_filename);
2449                 return (EINVAL);
2450         }
2451 
2452         /* iv mech must itself take a null iv */
2453         lsp->ls_iv_mech.cm_param = NULL;
2454         lsp->ls_iv_mech.cm_param_len = 0;
2455         lsp->ls_iv_len = klip->li_iv_len;
2456 
2457         /*
2458          * Create ctx using li_cipher & the raw li_key after checking
2459          * that it isn't a weak key.
2460          */
2461         lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
2462         lsp->ls_key.ck_length = klip->li_key_len;
2463         lsp->ls_key.ck_data = kmem_alloc(
2464             CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
2465         bcopy(klip->li_key, lsp->ls_key.ck_data,
2466             CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
2467 
2468         ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
2469         if (ret != CRYPTO_SUCCESS) {
2470                 cmn_err(CE_WARN, "weak key check failed for cipher "
2471                     "%s on file %s (0x%x)", klip->li_cipher,
2472                     klip->li_filename, ret);
2473                 return (EINVAL);
2474         }
2475 
2476         error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
2477             CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
2478         if (error != 0)
2479                 return (error);
2480 
2481         /*
2482          * This is the case where the header in the lofi image is already
2483          * initialized to indicate it is encrypted.
2484          */
2485         if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
2486                 /*
2487                  * The encryption header information is laid out this way:
2488                  *      6 bytes:        hex "CFLOFI"
2489                  *      2 bytes:        version = 0 ... for now
2490                  *      96 bytes:       reserved1 (not implemented yet)
2491                  *      4 bytes:        data_sector = 2 ... for now
2492                  *      more...         not implemented yet
2493                  */
2494 
2495                 marker = buf;
2496 
2497                 /* copy the magic */
2498                 bcopy(marker, lsp->ls_crypto.magic,
2499                     sizeof (lsp->ls_crypto.magic));
2500                 marker += sizeof (lsp->ls_crypto.magic);
2501 
2502                 /* read the encryption version number */
2503                 bcopy(marker, &(lsp->ls_crypto.version),
2504                     sizeof (lsp->ls_crypto.version));
2505                 lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
2506                 marker += sizeof (lsp->ls_crypto.version);
2507 
2508                 /* read a chunk of reserved data */
2509                 bcopy(marker, lsp->ls_crypto.reserved1,
2510                     sizeof (lsp->ls_crypto.reserved1));
2511                 marker += sizeof (lsp->ls_crypto.reserved1);
2512 
2513                 /* read block number where encrypted data begins */
2514                 bcopy(marker, &(lsp->ls_crypto.data_sector),
2515                     sizeof (lsp->ls_crypto.data_sector));
2516                 lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
2517                 marker += sizeof (lsp->ls_crypto.data_sector);
2518 
2519                 /* and ignore the rest until it is implemented */
2520 
2521                 lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
2522                 return (0);
2523         }
2524 
2525         /*
2526          * We've requested encryption, but no magic was found, so it must be
2527          * a new image.
2528          */
2529 
2530         for (i = 0; i < sizeof (struct crypto_meta); i++) {
2531                 if (buf[i] != '\0')
2532                         return (EINVAL);
2533         }
2534 
2535         marker = buf;
2536         bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
2537         marker += sizeof (lofi_crypto_magic);
2538         chead.version = htons(LOFI_CRYPTO_VERSION);
2539         bcopy(&(chead.version), marker, sizeof (chead.version));
2540         marker += sizeof (chead.version);
2541         marker += sizeof (chead.reserved1);
2542         chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
2543         bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));
2544 
2545         /* write the header */
2546         error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
2547             CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
2548         if (error != 0)
2549                 return (error);
2550 
2551         /* fix things up so it looks like we read this info */
2552         bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
2553             sizeof (lofi_crypto_magic));
2554         lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
2555         lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
2556         lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
2557         return (0);
2558 }
2559 
2560 /*
2561  * Check to see if the passed in signature is a valid one.  If it is
2562  * valid, return the index into lofi_compress_table.
2563  *
2564  * Return -1 if it is invalid
2565  */
2566 static int
2567 lofi_compress_select(const char *signature)
2568 {
2569         int i;
2570 
2571         for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
2572                 if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
2573                         return (i);
2574         }
2575 
2576         return (-1);
2577 }
2578 
2579 static int
2580 lofi_init_compress(struct lofi_state *lsp)
2581 {
2582         char buf[DEV_BSIZE];
2583         int compress_index;
2584         ssize_t resid;
2585         int error;
2586 
2587         error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
2588             0, RLIM64_INFINITY, kcred, &resid);
2589 
2590         if (error != 0)
2591                 return (error);
2592 
2593         if ((compress_index = lofi_compress_select(buf)) == -1)
2594                 return (0);
2595 
2596         /* compression and encryption are mutually exclusive */
2597         if (lsp->ls_crypto_enabled)
2598                 return (ENOTSUP);
2599 
2600         /* initialize compression info for compressed lofi */
2601         lsp->ls_comp_algorithm_index = compress_index;
2602         (void) strlcpy(lsp->ls_comp_algorithm,
2603             lofi_compress_table[compress_index].l_name,
2604             sizeof (lsp->ls_comp_algorithm));
2605 
2606         /* Finally setup per-thread pre-allocated buffers */
2607         lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
2608             sizeof (struct compbuf), KM_SLEEP);
2609 
2610         return (lofi_map_compressed_file(lsp, buf));
2611 }
2612 
2613 /*
2614  * Allocate new or proposed id from lofi_id.
2615  *
2616  * Special cases for proposed id:
2617  * 0: not allowed, 0 is id for control device.
2618  * -1: allocate first usable id from lofi_id.
2619  * any other value is proposed value from userland
2620  *
2621  * returns DDI_SUCCESS or errno.
2622  */
2623 static int
2624 lofi_alloc_id(int *idp)
2625 {
2626         int id, error = DDI_SUCCESS;
2627 
2628         if (*idp == -1) {
2629                 id = id_allocff_nosleep(lofi_id);
2630                 if (id == -1) {
2631                         error = EAGAIN;
2632                         goto err;
2633                 }
2634         } else if (*idp == 0) {
2635                 error = EINVAL;
2636                 goto err;
2637         } else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
2638                 error = ERANGE;
2639                 goto err;
2640         } else {
2641                 if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
2642                         error = EEXIST;
2643                         goto err;
2644                 }
2645 
2646                 id = id_alloc_specific_nosleep(lofi_id, *idp);
2647                 if (id == -1) {
2648                         error = EAGAIN;
2649                         goto err;
2650                 }
2651         }
2652         *idp = id;
2653 err:
2654         return (error);
2655 }
2656 
2657 static int
2658 lofi_create_dev(struct lofi_ioctl *klip)
2659 {
2660         dev_info_t *parent, *child;
2661         struct lofi_state *lsp = NULL;
2662         char namebuf[MAXNAMELEN];
2663         int error, circ;
2664 
2665         /* get control device */
2666         lsp = ddi_get_soft_state(lofi_statep, 0);
2667         parent = ddi_get_parent(lsp->ls_dip);
2668 
2669         if ((error = lofi_alloc_id((int *)&klip->li_id)))
2670                 return (error);
2671 
2672         (void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
2673             klip->li_id);
2674 
2675         ndi_devi_enter(parent, &circ);
2676         child = ndi_devi_findchild(parent, namebuf);
2677         ndi_devi_exit(parent, circ);
2678 
2679         if (child == NULL) {
2680                 child = ddi_add_child(parent, LOFI_DRIVER_NAME,
2681                     (pnode_t)DEVI_SID_NODEID, klip->li_id);
2682                 if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
2683                     "instance", klip->li_id)) != DDI_PROP_SUCCESS)
2684                         goto err;
2685 
2686                 if (klip->li_labeled == B_TRUE) {
2687                         if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
2688                             DDI_PROP_CANSLEEP, "labeled", 0, 0))
2689                             != DDI_PROP_SUCCESS)
2690                                 goto err;
2691                 }
2692 
2693                 if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
2694                     != NDI_SUCCESS)
2695                         goto err;
2696         } else {
2697                 id_free(lofi_id, klip->li_id);
2698                 error = EEXIST;
2699                 return (error);
2700         }
2701 
2702         goto done;
2703 
2704 err:
2705         ddi_prop_remove_all(child);
2706         (void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
2707         id_free(lofi_id, klip->li_id);
2708 done:
2709 
2710         return (error);
2711 }
2712 
2713 static void
2714 lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
2715 {
2716         char *p = NULL;
2717 
2718         (void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));
2719 
2720         mutex_enter(&lsp->ls_vp_lock);
2721         if (lsp->ls_vp != NULL)
2722                 p = strrchr(lsp->ls_vp->v_path, '/');
2723         if (p != NULL)
2724                 (void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
2725         mutex_exit(&lsp->ls_vp_lock);
2726         (void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
2727 }
2728 
2729 /*
2730  * copy devlink name from event cache
2731  */
2732 static void
2733 lofi_copy_devpath(struct lofi_ioctl *klip)
2734 {
2735         int     error;
2736         char    namebuf[MAXNAMELEN], *str;
2737         clock_t ticks;
2738         nvlist_t *nvl = NULL;
2739 
2740         if (klip->li_labeled == B_TRUE)
2741                 klip->li_devpath[0] = '\0';
2742         else {
2743                 /* no need to wait for messages */
2744                 (void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
2745                     "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
2746                 return;
2747         }
2748 
2749         (void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);
2750         ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);
2751 
2752         mutex_enter(&lofi_devlink_cache.ln_lock);
2753         error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, namebuf, &nvl);
2754         while (error != 0) {
2755                 error = cv_timedwait(&lofi_devlink_cache.ln_cv,
2756                     &lofi_devlink_cache.ln_lock, ticks);
2757                 if (error == -1)
2758                         break;
2759                 error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
2760                     namebuf, &nvl);
2761         }
2762 
2763         if (nvl != NULL) {
2764                 if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) {
2765                         (void) strlcpy(klip->li_devpath, str,
2766                             sizeof (klip->li_devpath));
2767                 }
2768         }
2769         mutex_exit(&lofi_devlink_cache.ln_lock);
2770 }
2771 
2772 /*
2773  * map a file to a minor number. Return the minor number.
2774  */
2775 static int
2776 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
2777     int *rvalp, struct cred *credp, int ioctl_flag)
2778 {
2779         int     id = -1;
2780         struct lofi_state *lsp = NULL;
2781         struct lofi_ioctl *klip;
2782         int     error;
2783         struct vnode *vp = NULL;
2784         vattr_t vattr;
2785         int     flag;
2786         char    namebuf[MAXNAMELEN];
2787 
2788         error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
2789         if (error != 0)
2790                 return (error);
2791 
2792         mutex_enter(&lofi_lock);
2793 
2794         if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
2795             NULL) == 0) {
2796                 error = EBUSY;
2797                 goto err;
2798         }
2799 
2800         flag = FREAD | FWRITE | FOFFMAX | FEXCL;
2801         error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
2802         if (error) {
2803                 /* try read-only */
2804                 flag &= ~FWRITE;
2805                 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
2806                     &vp, 0, 0);
2807                 if (error)
2808                         goto err;
2809         }
2810 
2811         if (!V_ISLOFIABLE(vp->v_type)) {
2812                 error = EINVAL;
2813                 goto err;
2814         }
2815 
2816         vattr.va_mask = AT_SIZE;
2817         error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
2818         if (error)
2819                 goto err;
2820 
2821         /* the file needs to be a multiple of the block size */
2822         if ((vattr.va_size % DEV_BSIZE) != 0) {
2823                 error = EINVAL;
2824                 goto err;
2825         }
2826 
2827         if (pickminor) {
2828                 klip->li_id = (uint32_t)-1;
2829         }
2830         if ((error = lofi_create_dev(klip)) != 0)
2831                 goto err;
2832 
2833         id = klip->li_id;
2834         lsp = ddi_get_soft_state(lofi_statep, id);
2835         if (lsp == NULL)
2836                 goto err;
2837 
2838         /*
2839          * from this point lofi_destroy() is used to clean up on error
2840          * make sure the basic data is set
2841          */
2842         list_insert_tail(&lofi_list, lsp);
2843         lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id));
2844 
2845         list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
2846             offsetof(struct lofi_comp_cache, lc_list));
2847 
2848         /*
2849          * save open mode so file can be closed properly and vnode counts
2850          * updated correctly.
2851          */
2852         lsp->ls_openflag = flag;
2853 
2854         lsp->ls_vp = vp;
2855         lsp->ls_stacked_vp = vp;
2856 
2857         lsp->ls_vp_size = vattr.va_size;
2858         lsp->ls_vp_comp_size = lsp->ls_vp_size;
2859 
2860         /*
2861          * Try to handle stacked lofs vnodes.
2862          */
2863         if (vp->v_type == VREG) {
2864                 vnode_t *realvp;
2865 
2866                 if (VOP_REALVP(vp, &realvp, NULL) == 0) {
2867                         /*
2868                          * We need to use the realvp for uniqueness
2869                          * checking, but keep the stacked vp for
2870                          * LOFI_GET_FILENAME display.
2871                          */
2872                         VN_HOLD(realvp);
2873                         lsp->ls_vp = realvp;
2874                 }
2875         }
2876 
2877         lsp->ls_lbshift = highbit(DEV_BSIZE) - 1;
2878         lsp->ls_pbshift = lsp->ls_lbshift;
2879 
2880         lsp->ls_readonly = klip->li_readonly;
2881         lsp->ls_uncomp_seg_sz = 0;
2882         lsp->ls_comp_algorithm[0] = '\0';
2883         lsp->ls_crypto_offset = 0;
2884 
2885         (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
2886             LOFI_DRIVER_NAME, id);
2887         lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads,
2888             minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0);
2889 
2890         if ((error = lofi_init_crypto(lsp, klip)) != 0)
2891                 goto err;
2892 
2893         if ((error = lofi_init_compress(lsp)) != 0)
2894                 goto err;
2895 
2896         fake_disk_geometry(lsp);
2897 
2898         /* For unlabeled lofi add Nblocks and Size */
2899         if (klip->li_labeled == B_FALSE) {
2900                 error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
2901                     SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset);
2902                 if (error != DDI_PROP_SUCCESS) {
2903                         error = EINVAL;
2904                         goto err;
2905                 }
2906                 error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
2907                     NBLOCKS_PROP_NAME,
2908                     (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE);
2909                 if (error != DDI_PROP_SUCCESS) {
2910                         error = EINVAL;
2911                         goto err;
2912                 }
2913         }
2914 
2915         /*
2916          * Notify we are ready to rock.
2917          */
2918         mutex_enter(&lsp->ls_vp_lock);
2919         lsp->ls_vp_ready = B_TRUE;
2920         cv_broadcast(&lsp->ls_vp_cv);
2921         mutex_exit(&lsp->ls_vp_lock);
2922         mutex_exit(&lofi_lock);
2923 
2924         lofi_copy_devpath(klip);
2925 
2926         if (rvalp)
2927                 *rvalp = id;
2928         (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2929         free_lofi_ioctl(klip);
2930         return (0);
2931 
2932 err:
2933         if (lsp != NULL) {
2934                 lofi_destroy(lsp, credp);
2935         } else {
2936                 if (vp != NULL) {
2937                         (void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL);
2938                         (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
2939                         VN_RELE(vp);
2940                 }
2941         }
2942 
2943         mutex_exit(&lofi_lock);
2944         free_lofi_ioctl(klip);
2945         return (error);
2946 }
2947 
2948 /*
2949  * unmap a file.
2950  */
2951 static int
2952 lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename,
2953     struct cred *credp, int ioctl_flag)
2954 {
2955         struct lofi_state *lsp;
2956         struct lofi_ioctl *klip;
2957         char namebuf[MAXNAMELEN];
2958         int err;
2959 
2960         err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
2961         if (err != 0)
2962                 return (err);
2963 
2964         mutex_enter(&lofi_lock);
2965         if (byfilename) {
2966                 if ((err = file_to_lofi(klip->li_filename, klip->li_readonly,
2967                     &lsp)) != 0) {
2968                         goto done;
2969                 }
2970         } else if (klip->li_id == 0) {
2971                 err = ENXIO;
2972                 goto done;
2973         } else {
2974                 lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
2975         }
2976 
2977         if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) {
2978                 err = ENXIO;
2979                 goto done;
2980         }
2981 
2982         klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
2983         (void) snprintf(namebuf, sizeof (namebuf), "%u", klip->li_id);
2984 
2985         /*
2986          * If it's still held open, we'll do one of three things:
2987          *
2988          * If no flag is set, just return EBUSY.
2989          *
2990          * If the 'cleanup' flag is set, unmap and remove the device when
2991          * the last user finishes.
2992          *
2993          * If the 'force' flag is set, then we forcibly close the underlying
2994          * file.  Subsequent operations will fail, and the DKIOCSTATE ioctl
2995          * will return DKIO_DEV_GONE.  When the device is last closed, the
2996          * device will be cleaned up appropriately.
2997          *
2998          * This is complicated by the fact that we may have outstanding
2999          * dispatched I/Os.  Rather than having a single mutex to serialize all
3000          * I/O, we keep a count of the number of outstanding I/O requests
3001          * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
3002          * should be dispatched (ls_vp_closereq).
3003          *
3004          * We set the flag, wait for the number of outstanding I/Os to reach 0,
3005          * and then close the underlying vnode.
3006          */
3007         if (is_opened(lsp)) {
3008                 if (klip->li_force) {
3009                         /* Mark the device for cleanup. */
3010                         lofi_set_cleanup(lsp);
3011                         mutex_enter(&lsp->ls_vp_lock);
3012                         lsp->ls_vp_closereq = B_TRUE;
3013                         /* Wake up any threads waiting on dkiocstate. */
3014                         cv_broadcast(&lsp->ls_vp_cv);
3015                         while (lsp->ls_vp_iocount > 0)
3016                                 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
3017                         mutex_exit(&lsp->ls_vp_lock);
3018                 } else if (klip->li_cleanup) {
3019                         lofi_set_cleanup(lsp);
3020                 } else {
3021                         err = EBUSY;
3022                 }
3023         } else {
3024                 lofi_free_dev(lsp);
3025                 lofi_destroy(lsp, credp);
3026         }
3027 
3028         /* Remove name from devlink cache */
3029         mutex_enter(&lofi_devlink_cache.ln_lock);
3030         (void) nvlist_remove_all(lofi_devlink_cache.ln_data, namebuf);
3031         mutex_exit(&lofi_devlink_cache.ln_lock);
3032 done:
3033         mutex_exit(&lofi_lock);
3034         if (err == 0)
3035                 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3036         free_lofi_ioctl(klip);
3037         return (err);
3038 }
3039 
3040 /*
3041  * get the filename given the minor number, or the minor number given
3042  * the name.
3043  */
3044 /*ARGSUSED*/
3045 static int
3046 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
3047     struct cred *credp, int ioctl_flag)
3048 {
3049         struct lofi_ioctl *klip;
3050         struct lofi_state *lsp;
3051         int     error;
3052 
3053         error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
3054         if (error != 0)
3055                 return (error);
3056 
3057         switch (which) {
3058         case LOFI_GET_FILENAME:
3059                 if (klip->li_id == 0) {
3060                         free_lofi_ioctl(klip);
3061                         return (EINVAL);
3062                 }
3063 
3064                 mutex_enter(&lofi_lock);
3065                 lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
3066                 if (lsp == NULL || lofi_access(lsp) != 0) {
3067                         mutex_exit(&lofi_lock);
3068                         free_lofi_ioctl(klip);
3069                         return (ENXIO);
3070                 }
3071 
3072                 /*
3073                  * This may fail if, for example, we're trying to look
3074                  * up a zoned NFS path from the global zone.
3075                  */
3076                 if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename,
3077                     sizeof (klip->li_filename), CRED()) != 0) {
3078                         (void) strlcpy(klip->li_filename, "?",
3079                             sizeof (klip->li_filename));
3080                 }
3081 
3082                 klip->li_readonly = lsp->ls_readonly;
3083                 klip->li_labeled = lsp->ls_cmlbhandle != NULL;
3084 
3085                 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
3086                     sizeof (klip->li_algorithm));
3087                 klip->li_crypto_enabled = lsp->ls_crypto_enabled;
3088                 mutex_exit(&lofi_lock);
3089 
3090                 lofi_copy_devpath(klip);
3091                 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3092                 free_lofi_ioctl(klip);
3093                 return (error);
3094         case LOFI_GET_MINOR:
3095                 mutex_enter(&lofi_lock);
3096                 error = file_to_lofi(klip->li_filename,
3097                     klip->li_readonly, &lsp);
3098                 if (error != 0) {
3099                         mutex_exit(&lofi_lock);
3100                         free_lofi_ioctl(klip);
3101                         return (error);
3102                 }
3103                 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
3104 
3105                 klip->li_readonly = lsp->ls_readonly;
3106                 klip->li_labeled = lsp->ls_cmlbhandle != NULL;
3107                 mutex_exit(&lofi_lock);
3108 
3109                 lofi_copy_devpath(klip);
3110                 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3111 
3112                 free_lofi_ioctl(klip);
3113                 return (error);
3114         case LOFI_CHECK_COMPRESSED:
3115                 mutex_enter(&lofi_lock);
3116                 error = file_to_lofi(klip->li_filename,
3117                     klip->li_readonly, &lsp);
3118                 if (error != 0) {
3119                         mutex_exit(&lofi_lock);
3120                         free_lofi_ioctl(klip);
3121                         return (error);
3122                 }
3123 
3124                 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
3125                 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
3126                     sizeof (klip->li_algorithm));
3127 
3128                 mutex_exit(&lofi_lock);
3129                 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3130                 free_lofi_ioctl(klip);
3131                 return (error);
3132         default:
3133                 free_lofi_ioctl(klip);
3134                 return (EINVAL);
3135         }
3136 }
3137 
3138 static int
3139 uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb,
3140     struct uscsi_cmd *uscmd)
3141 {
3142         int rval;
3143 
3144 #ifdef  _MULTI_DATAMODEL
3145         switch (ddi_model_convert_from(flag & FMODELS)) {
3146         case DDI_MODEL_ILP32: {
3147                 struct uscsi_cmd32 ucmd32;
3148 
3149                 if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) {
3150                         rval = EFAULT;
3151                         goto err;
3152                 }
3153                 uscsi_cmd32touscsi_cmd((&ucmd32), uscmd);
3154                 break;
3155         }
3156         case DDI_MODEL_NONE:
3157                 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
3158                         rval = EFAULT;
3159                         goto err;
3160                 }
3161                 break;
3162         default:
3163                 rval = EFAULT;
3164                 goto err;
3165         }
3166 #else
3167         if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
3168                 rval = EFAULT;
3169                 goto err;
3170         }
3171 #endif  /* _MULTI_DATAMODEL */
3172         if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) {
3173                 rval = EFAULT;
3174                 goto err;
3175         }
3176         if (cdb->scc_cmd == SCMD_INQUIRY) {
3177                 return (0);
3178         }
3179 err:
3180         return (rval);
3181 }
3182 
3183 static int
3184 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
3185     int *rvalp)
3186 {
3187         int     error;
3188         enum dkio_state dkstate;
3189         struct lofi_state *lsp;
3190         int     id;
3191 
3192         id = LOFI_MINOR2ID(getminor(dev));
3193 
3194         /* lofi ioctls only apply to the master device */
3195         if (id == 0) {
3196                 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;
3197 
3198                 /*
3199                  * the query command only need read-access - i.e., normal
3200                  * users are allowed to do those on the ctl device as
3201                  * long as they can open it read-only.
3202                  */
3203                 switch (cmd) {
3204                 case LOFI_MAP_FILE:
3205                         if ((flag & FWRITE) == 0)
3206                                 return (EPERM);
3207                         return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
3208                 case LOFI_MAP_FILE_MINOR:
3209                         if ((flag & FWRITE) == 0)
3210                                 return (EPERM);
3211                         return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
3212                 case LOFI_UNMAP_FILE:
3213                         if ((flag & FWRITE) == 0)
3214                                 return (EPERM);
3215                         return (lofi_unmap_file(lip, 1, credp, flag));
3216                 case LOFI_UNMAP_FILE_MINOR:
3217                         if ((flag & FWRITE) == 0)
3218                                 return (EPERM);
3219                         return (lofi_unmap_file(lip, 0, credp, flag));
3220                 case LOFI_GET_FILENAME:
3221                         return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
3222                             credp, flag));
3223                 case LOFI_GET_MINOR:
3224                         return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
3225                             credp, flag));
3226 
3227                 /*
3228                  * This API made limited sense when this value was fixed
3229                  * at LOFI_MAX_FILES.  However, its use to iterate
3230                  * across all possible devices in lofiadm means we don't
3231                  * want to return L_MAXMIN, but the highest
3232                  * *allocated* id.
3233                  */
3234                 case LOFI_GET_MAXMINOR:
3235                         id = 0;
3236 
3237                         mutex_enter(&lofi_lock);
3238 
3239                         for (lsp = list_head(&lofi_list); lsp != NULL;
3240                             lsp = list_next(&lofi_list, lsp)) {
3241                                 int i;
3242                                 if (lofi_access(lsp) != 0)
3243                                         continue;
3244 
3245                                 i = ddi_get_instance(lsp->ls_dip);
3246                                 if (i > id)
3247                                         id = i;
3248                         }
3249 
3250                         mutex_exit(&lofi_lock);
3251 
3252                         error = ddi_copyout(&id, &lip->li_id,
3253                             sizeof (id), flag);
3254                         if (error)
3255                                 return (EFAULT);
3256                         return (0);
3257 
3258                 case LOFI_CHECK_COMPRESSED:
3259                         return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
3260                             credp, flag));
3261                 default:
3262                         return (EINVAL);
3263                 }
3264         }
3265 
3266         mutex_enter(&lofi_lock);
3267         lsp = ddi_get_soft_state(lofi_statep, id);
3268         if (lsp == NULL || lsp->ls_cleanup) {
3269                 mutex_exit(&lofi_lock);
3270                 return (ENXIO);
3271         }
3272         mutex_exit(&lofi_lock);
3273 
3274         if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
3275             "labeled") == 1) {
3276                 error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
3277                     credp, rvalp, 0);
3278                 if (error != ENOTTY)
3279                         return (error);
3280         }
3281 
3282         /*
3283          * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
3284          * EIO as if the device was no longer present.
3285          */
3286         if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
3287                 return (EIO);
3288 
3289         /* these are for faking out utilities like newfs */
3290         switch (cmd) {
3291         case DKIOCGMEDIAINFO:
3292         case DKIOCGMEDIAINFOEXT: {
3293                 struct dk_minfo_ext media_info;
3294                 int shift = lsp->ls_lbshift;
3295                 int size;
3296 
3297                 if (cmd == DKIOCGMEDIAINFOEXT) {
3298                         media_info.dki_pbsize = 1U << lsp->ls_pbshift;
3299                         size = sizeof (struct dk_minfo_ext);
3300                 } else {
3301                         size = sizeof (struct dk_minfo);
3302                 }
3303 
3304                 media_info.dki_media_type = DK_FIXED_DISK;
3305                 media_info.dki_lbsize = 1U << shift;
3306                 media_info.dki_capacity =
3307                     (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;
3308 
3309                 if (ddi_copyout(&media_info, (void *)arg, size, flag))
3310                         return (EFAULT);
3311                 return (0);
3312         }
3313         case DKIOCREMOVABLE: {
3314                 int i = 0;
3315                 if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
3316                         return (EFAULT);
3317                 return (0);
3318         }
3319 
3320         case DKIOCGVTOC: {
3321                 struct vtoc vt;
3322                 fake_disk_vtoc(lsp, &vt);
3323 
3324                 switch (ddi_model_convert_from(flag & FMODELS)) {
3325                 case DDI_MODEL_ILP32: {
3326                         struct vtoc32 vtoc32;
3327 
3328                         vtoctovtoc32(vt, vtoc32);
3329                         if (ddi_copyout(&vtoc32, (void *)arg,
3330                             sizeof (struct vtoc32), flag))
3331                                 return (EFAULT);
3332                         break;
3333                         }
3334 
3335                 case DDI_MODEL_NONE:
3336                         if (ddi_copyout(&vt, (void *)arg,
3337                             sizeof (struct vtoc), flag))
3338                                 return (EFAULT);
3339                         break;
3340                 }
3341                 return (0);
3342         }
3343         case DKIOCINFO: {
3344                 struct dk_cinfo ci;
3345                 fake_disk_info(dev, &ci);
3346                 if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
3347                         return (EFAULT);
3348                 return (0);
3349         }
3350         case DKIOCG_VIRTGEOM:
3351         case DKIOCG_PHYGEOM:
3352         case DKIOCGGEOM:
3353                 error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
3354                     sizeof (struct dk_geom), flag);
3355                 if (error)
3356                         return (EFAULT);
3357                 return (0);
3358         case DKIOCSTATE:
3359                 /*
3360                  * Normally, lofi devices are always in the INSERTED state.  If
3361                  * a device is forcefully unmapped, then the device transitions
3362                  * to the DKIO_DEV_GONE state.
3363                  */
3364                 if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
3365                     flag) != 0)
3366                         return (EFAULT);
3367 
3368                 mutex_enter(&lsp->ls_vp_lock);
3369                 while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
3370                     (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
3371                     !lsp->ls_cleanup) {
3372                         /*
3373                          * By virtue of having the device open, we know that
3374                          * 'lsp' will remain valid when we return.
3375                          */
3376                         if (!cv_wait_sig(&lsp->ls_vp_cv, &lsp->ls_vp_lock)) {
3377                                 mutex_exit(&lsp->ls_vp_lock);
3378                                 return (EINTR);
3379                         }
3380                 }
3381 
3382                 dkstate = (!lsp->ls_cleanup && lsp->ls_vp != NULL ?
3383                     DKIO_INSERTED : DKIO_DEV_GONE);
3384                 mutex_exit(&lsp->ls_vp_lock);
3385 
3386                 if (ddi_copyout(&dkstate, (void *)arg,
3387                     sizeof (dkstate), flag) != 0)
3388                         return (EFAULT);
3389                 return (0);
3390         case USCSICMD: {
3391                 struct uscsi_cmd uscmd;
3392                 union scsi_cdb cdb;
3393 
3394                 if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
3395                         struct scsi_inquiry inq = {0};
3396 
3397                         lofi_create_inquiry(lsp, &inq);
3398                         if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
3399                             uscmd.uscsi_buflen, flag) != 0)
3400                                 return (EFAULT);
3401                         return (0);
3402                 } else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
3403                         struct scsi_capacity capacity;
3404 
3405                         capacity.capacity =
3406                             BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
3407                             lsp->ls_lbshift);
3408                         capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
3409                         if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
3410                             uscmd.uscsi_buflen, flag) != 0)
3411                                 return (EFAULT);
3412                         return (0);
3413                 }
3414 
3415                 uscmd.uscsi_rqstatus = 0xff;
3416 #ifdef  _MULTI_DATAMODEL
3417                 switch (ddi_model_convert_from(flag & FMODELS)) {
3418                 case DDI_MODEL_ILP32: {
3419                         struct uscsi_cmd32 ucmd32;
3420                         uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
3421                         if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
3422                             flag) != 0)
3423                                 return (EFAULT);
3424                         break;
3425                 }
3426                 case DDI_MODEL_NONE:
3427                         if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
3428                             flag) != 0)
3429                                 return (EFAULT);
3430                         break;
3431                 default:
3432                         return (EFAULT);
3433                 }
3434 #else
3435                 if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
3436                         return (EFAULT);
3437 #endif  /* _MULTI_DATAMODEL */
3438                 return (0);
3439         }
3440         default:
3441 #ifdef DEBUG
3442                 cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
3443 #endif  /* DEBUG */
3444                 return (ENOTTY);
3445         }
3446 }
3447 
3448 static int
3449 lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
3450     char *name, caddr_t valuep, int *lengthp)
3451 {
3452         struct lofi_state *lsp;
3453         int rc;
3454 
3455         lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
3456         if (lsp == NULL) {
3457                 return (ddi_prop_op(dev, dip, prop_op, mod_flags,
3458                     name, valuep, lengthp));
3459         }
3460 
3461         rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
3462             name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
3463         if (rc == DDI_PROP_SUCCESS)
3464                 return (rc);
3465 
3466         return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
3467             name, valuep, lengthp));
3468 }
3469 
3470 static struct cb_ops lofi_cb_ops = {
3471         lofi_open,              /* open */
3472         lofi_close,             /* close */
3473         lofi_strategy,          /* strategy */
3474         nodev,                  /* print */
3475         nodev,                  /* dump */
3476         lofi_read,              /* read */
3477         lofi_write,             /* write */
3478         lofi_ioctl,             /* ioctl */
3479         nodev,                  /* devmap */
3480         nodev,                  /* mmap */
3481         nodev,                  /* segmap */
3482         nochpoll,               /* poll */
3483         lofi_prop_op,           /* prop_op */
3484         0,                      /* streamtab  */
3485         D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */
3486         CB_REV,
3487         lofi_aread,
3488         lofi_awrite
3489 };
3490 
3491 static struct dev_ops lofi_ops = {
3492         DEVO_REV,               /* devo_rev, */
3493         0,                      /* refcnt  */
3494         lofi_info,              /* info */
3495         nulldev,                /* identify */
3496         nulldev,                /* probe */
3497         lofi_attach,            /* attach */
3498         lofi_detach,            /* detach */
3499         nodev,                  /* reset */
3500         &lofi_cb_ops,               /* driver operations */
3501         NULL,                   /* no bus operations */
3502         NULL,                   /* power */
3503         ddi_quiesce_not_needed, /* quiesce */
3504 };
3505 
3506 static struct modldrv modldrv = {
3507         &mod_driverops,
3508         "loopback file driver",
3509         &lofi_ops,
3510 };
3511 
3512 static struct modlinkage modlinkage = {
3513         MODREV_1,
3514         &modldrv,
3515         NULL
3516 };
3517 
3518 int
3519 _init(void)
3520 {
3521         int error;
3522 
3523         list_create(&lofi_list, sizeof (struct lofi_state),
3524             offsetof(struct lofi_state, ls_list));
3525 
3526         error = ddi_soft_state_init((void **)&lofi_statep,
3527             sizeof (struct lofi_state), 0);
3528         if (error) {
3529                 list_destroy(&lofi_list);
3530                 return (error);
3531         }
3532 
3533         /*
3534          * The minor number is stored as id << LOFI_CMLB_SHIFT as
3535          * we need to reserve space for cmlb minor numbers.
3536          * This will leave out 4096 id values on 32bit kernel, which should
3537          * still suffice.
3538          */
3539         lofi_id = id_space_create("lofi_id", 1,
3540             (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)));
3541 
3542         if (lofi_id == NULL) {
3543                 ddi_soft_state_fini((void **)&lofi_statep);
3544                 list_destroy(&lofi_list);
3545                 return (DDI_FAILURE);
3546         }
3547 
3548         mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);
3549 
3550         error = mod_install(&modlinkage);
3551 
3552         if (error) {
3553                 id_space_destroy(lofi_id);
3554                 mutex_destroy(&lofi_lock);
3555                 ddi_soft_state_fini((void **)&lofi_statep);
3556                 list_destroy(&lofi_list);
3557         }
3558 
3559         return (error);
3560 }
3561 
3562 int
3563 _fini(void)
3564 {
3565         int     error;
3566 
3567         mutex_enter(&lofi_lock);
3568 
3569         if (!list_is_empty(&lofi_list)) {
3570                 mutex_exit(&lofi_lock);
3571                 return (EBUSY);
3572         }
3573 
3574         mutex_exit(&lofi_lock);
3575 
3576         error = mod_remove(&modlinkage);
3577         if (error)
3578                 return (error);
3579 
3580         mutex_destroy(&lofi_lock);
3581         id_space_destroy(lofi_id);
3582         ddi_soft_state_fini((void **)&lofi_statep);
3583         list_destroy(&lofi_list);
3584 
3585         return (error);
3586 }
3587 
3588 int
3589 _info(struct modinfo *modinfop)
3590 {
3591         return (mod_info(&modlinkage, modinfop));
3592 }