Print this page
    
10262 excessive page destruction caused by 6602
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Mike Gerdts <mike.gerdts@joyent.com>
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
        Split | Close | Expand all | Collapse all
    
          --- old/usr/src/uts/common/io/lofi.c
          +++ new/usr/src/uts/common/io/lofi.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  
    [ 16 lines elided ]
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   *
  24   24   * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25   25   * Copyright (c) 2016 Andrey Sokolov
  26   26   * Copyright 2016 Toomas Soome <tsoome@me.com>
       27 + * Copyright 2019 Joyent, Inc.
  27   28   */
  28   29  
  29   30  /*
  30   31   * lofi (loopback file) driver - allows you to attach a file to a device,
  31   32   * which can then be accessed through that device. The simple model is that
  32   33   * you tell lofi to open a file, and then use the block device you get as
  33   34   * you would any block device. lofi translates access to the block device
  34   35   * into I/O on the underlying file. This is mostly useful for
  35   36   * mounting images of filesystems.
  36   37   *
  37   38   * lofi is controlled through /dev/lofictl - this is the only device exported
  38   39   * during attach, and is instance number 0. lofiadm communicates with lofi
  39   40   * through ioctls on this device. When a file is attached to lofi, block and
  40   41   * character devices are exported in /dev/lofi and /dev/rlofi. These devices
  41   42   * are identified by lofi instance number, and the instance number is also used
  42   43   * as the name in /dev/lofi.
  43   44   *
  44   45   * Virtual disks, or, labeled lofi, implements virtual disk support to
  45   46   * support partition table and related tools. Such mappings will cause
  46   47   * block and character devices to be exported in /dev/dsk and /dev/rdsk
  47   48   * directories.
  48   49   *
  49   50   * To support virtual disks, the instance number space is divided to two
  50   51   * parts, upper part for instance number and lower part for minor number
  51   52   * space to identify partitions and slices. The virtual disk support is
  52   53   * implemented by stacking cmlb module. For virtual disks, the partition
  53   54   * related ioctl calls are routed to cmlb module. Compression and encryption
  54   55   * is not supported for virtual disks.
  55   56   *
  56   57   * Mapped devices are tracked with state structures handled with
  57   58   * ddi_soft_state(9F) for simplicity.
  58   59   *
  59   60   * A file attached to lofi is opened when attached and not closed until
  60   61   * explicitly detached from lofi. This seems more sensible than deferring
  61   62   * the open until the /dev/lofi device is opened, for a number of reasons.
  62   63   * One is that any failure is likely to be noticed by the person (or script)
  63   64   * running lofiadm. Another is that it would be a security problem if the
  64   65   * file was replaced by another one after being added but before being opened.
  65   66   *
  66   67   * The only hard part about lofi is the ioctls. In order to support things
  67   68   * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
  68   69   * So it has to fake disk geometry and partition information. More may need
  69   70   * to be faked if your favorite utility doesn't work and you think it should
  70   71   * (fdformat doesn't work because it really wants to know the type of floppy
  71   72   * controller to talk to, and that didn't seem easy to fake. Or possibly even
  72   73   * necessary, since we have mkfs_pcfs now).
  73   74   *
  74   75   * Normally, a lofi device cannot be detached if it is open (i.e. busy).  To
  75   76   * support simulation of hotplug events, an optional force flag is provided.
  76   77   * If a lofi device is open when a force detach is requested, then the
  77   78   * underlying file is closed and any subsequent operations return EIO.  When the
  78   79   * device is closed for the last time, it will be cleaned up at that time.  In
  79   80   * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
  80   81   * detached but not removed.
  81   82   *
  82   83   * If detach was requested and lofi device is not open, we will perform
  83   84   * unmap and remove the lofi instance.
  84   85   *
  85   86   * If the lofi device is open and the li_cleanup is set on ioctl request,
  86   87   * we set ls_cleanup flag to notify the cleanup is requested, and the
  87   88   * last lofi_close will perform the unmapping and this lofi instance will be
  88   89   * removed.
  89   90   *
  90   91   * If the lofi device is open and the li_force is set on ioctl request,
  91   92   * we set ls_cleanup flag to notify the cleanup is requested,
  92   93   * we also set ls_vp_closereq to notify IO tasks to return EIO on new
  93   94   * IO requests and wait in process IO count to become 0, indicating there
  94   95   * are no more IO requests. Since ls_cleanup is set, the last lofi_close
  95   96   * will perform unmap and this lofi instance will be removed.
  96   97   * See also lofi_unmap_file() for details.
  97   98   *
  98   99   * Once ls_cleanup is set for the instance, we do not allow lofi_open()
  99  100   * calls to succeed and can have last lofi_close() to remove the instance.
 100  101   *
 101  102   * Known problems:
 102  103   *
 103  104   *      UFS logging. Mounting a UFS filesystem image "logging"
 104  105   *      works for basic copy testing but wedges during a build of ON through
 105  106   *      that image. Some deadlock in lufs holding the log mutex and then
 106  107   *      getting stuck on a buf. So for now, don't do that.
 107  108   *
 108  109   *      Direct I/O. Since the filesystem data is being cached in the buffer
 109  110   *      cache, _and_ again in the underlying filesystem, it's tempting to
 110  111   *      enable direct I/O on the underlying file. Don't, because that deadlocks.
 111  112   *      I think to fix the cache-twice problem we might need filesystem support.
 112  113   *
 113  114   * Interesting things to do:
 114  115   *
 115  116   *      Allow multiple files for each device. A poor-man's metadisk, basically.
 116  117   *
 117  118   *      Pass-through ioctls on block devices. You can (though it's not
 118  119   *      documented), give lofi a block device as a file name. Then we shouldn't
 119  120   *      need to fake a geometry, however, it may be relevant if you're replacing
 120  121   *      metadisk, or using lofi to get crypto.
 121  122   *      It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
 122  123   *      and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
 123  124   *      In fact this even makes sense if you have lofi "above" metadisk.
 124  125   *
 125  126   * Encryption:
 126  127   *      Each lofi device can have its own symmetric key and cipher.
 127  128   *      They are passed to us by lofiadm(1m) in the correct format for use
 128  129   *      with the misc/kcf crypto_* routines.
 129  130   *
 130  131   *      Each block has its own IV, that is calculated in lofi_blk_mech(), based
 131  132   *      on the "master" key held in the lsp and the block number of the buffer.
 132  133   */
 133  134  
 134  135  #include <sys/types.h>
 135  136  #include <netinet/in.h>
 136  137  #include <sys/sysmacros.h>
 137  138  #include <sys/uio.h>
 138  139  #include <sys/kmem.h>
 139  140  #include <sys/cred.h>
 140  141  #include <sys/mman.h>
 141  142  #include <sys/errno.h>
 142  143  #include <sys/aio_req.h>
 143  144  #include <sys/stat.h>
 144  145  #include <sys/file.h>
 145  146  #include <sys/modctl.h>
 146  147  #include <sys/conf.h>
 147  148  #include <sys/debug.h>
 148  149  #include <sys/vnode.h>
 149  150  #include <sys/lofi.h>
 150  151  #include <sys/lofi_impl.h>      /* for cache structure */
 151  152  #include <sys/fcntl.h>
 152  153  #include <sys/pathname.h>
 153  154  #include <sys/filio.h>
 154  155  #include <sys/fdio.h>
 155  156  #include <sys/open.h>
 156  157  #include <sys/disp.h>
 157  158  #include <vm/seg_map.h>
 158  159  #include <sys/ddi.h>
 159  160  #include <sys/sunddi.h>
 160  161  #include <sys/zmod.h>
 161  162  #include <sys/id_space.h>
 162  163  #include <sys/mkdev.h>
 163  164  #include <sys/crypto/common.h>
 164  165  #include <sys/crypto/api.h>
 165  166  #include <sys/rctl.h>
 166  167  #include <sys/vtoc.h>
 167  168  #include <sys/scsi/scsi.h>      /* for DTYPE_DIRECT */
 168  169  #include <sys/scsi/impl/uscsi.h>
 169  170  #include <sys/sysevent/dev.h>
 170  171  #include <LzmaDec.h>
 171  172  
 172  173  #define NBLOCKS_PROP_NAME       "Nblocks"
 173  174  #define SIZE_PROP_NAME          "Size"
 174  175  #define ZONE_PROP_NAME          "zone"
 175  176  
 176  177  #define SETUP_C_DATA(cd, buf, len)              \
 177  178          (cd).cd_format = CRYPTO_DATA_RAW;       \
 178  179          (cd).cd_offset = 0;                     \
 179  180          (cd).cd_miscdata = NULL;                \
 180  181          (cd).cd_length = (len);                 \
 181  182          (cd).cd_raw.iov_base = (buf);           \
 182  183          (cd).cd_raw.iov_len = (len);
 183  184  
 184  185  #define UIO_CHECK(uio)  \
 185  186          if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
 186  187              ((uio)->uio_resid % DEV_BSIZE) != 0) { \
 187  188                  return (EINVAL); \
 188  189          }
 189  190  
 190  191  #define LOFI_TIMEOUT    30
 191  192  
 192  193  static void *lofi_statep;
 193  194  static kmutex_t lofi_lock;              /* state lock */
 194  195  static id_space_t *lofi_id;             /* lofi ID values */
 195  196  static list_t lofi_list;
 196  197  static zone_key_t lofi_zone_key;
 197  198  
 198  199  /*
 199  200   * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 200  201   * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 201  202   * high.  If we want to be assured that the underlying device is always busy,
 202  203   * we must be sure that the number of bytes enqueued when the number of
 203  204   * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 204  205   * the duration of the sleep time in taskq_ent_alloc().  That is, lofi should
 205  206   * set maxalloc to be the maximum throughput (in bytes per second) of the
 206  207   * underlying device divided by the minimum I/O size.  We assume a realistic
 207  208   * maximum throughput of one hundred megabytes per second; we set maxalloc on
 208  209   * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 209  210   */
 210  211  static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
 211  212  static int lofi_taskq_nthreads = 4;     /* # of taskq threads per device */
 212  213  
 213  214  const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;
 214  215  
 215  216  /*
 216  217   * To avoid decompressing data in a compressed segment multiple times
 217  218   * when accessing small parts of a segment's data, we cache and reuse
 218  219   * the uncompressed segment's data.
 219  220   *
 220  221   * A single cached segment is sufficient to avoid lots of duplicate
 221  222   * segment decompress operations. A small cache size also reduces the
 222  223   * memory footprint.
 223  224   *
 224  225   * lofi_max_comp_cache is the maximum number of decompressed data segments
 225  226   * cached for each compressed lofi image. It can be set to 0 to disable
 226  227   * caching.
 227  228   */
 228  229  
 229  230  uint32_t lofi_max_comp_cache = 1;
 230  231  
 231  232  static int gzip_decompress(void *src, size_t srclen, void *dst,
 232  233          size_t *destlen, int level);
 233  234  
 234  235  static int lzma_decompress(void *src, size_t srclen, void *dst,
 235  236          size_t *dstlen, int level);
 236  237  
 237  238  lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
 238  239          {gzip_decompress,       NULL,   6,      "gzip"}, /* default */
 239  240          {gzip_decompress,       NULL,   6,      "gzip-6"},
 240  241          {gzip_decompress,       NULL,   9,      "gzip-9"},
 241  242          {lzma_decompress,       NULL,   0,      "lzma"}
 242  243  };
 243  244  
 244  245  static void lofi_strategy_task(void *);
 245  246  static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
 246  247      size_t, void *);
 247  248  static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);
 248  249  
 249  250  struct cmlb_tg_ops lofi_tg_ops = {
 250  251          TG_DK_OPS_VERSION_1,
 251  252          lofi_tg_rdwr,
 252  253          lofi_tg_getinfo
 253  254  };
 254  255  
 255  256  /*ARGSUSED*/
 256  257  static void
 257  258  *SzAlloc(void *p, size_t size)
 258  259  {
 259  260          return (kmem_alloc(size, KM_SLEEP));
 260  261  }
 261  262  
 262  263  /*ARGSUSED*/
 263  264  static void
 264  265  SzFree(void *p, void *address, size_t size)
 265  266  {
 266  267          kmem_free(address, size);
 267  268  }
 268  269  
 269  270  static ISzAlloc g_Alloc = { SzAlloc, SzFree };
 270  271  
 271  272  /*
 272  273   * Free data referenced by the linked list of cached uncompressed
 273  274   * segments.
 274  275   */
 275  276  static void
 276  277  lofi_free_comp_cache(struct lofi_state *lsp)
 277  278  {
 278  279          struct lofi_comp_cache *lc;
 279  280  
 280  281          while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
 281  282                  kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
 282  283                  kmem_free(lc, sizeof (struct lofi_comp_cache));
 283  284                  lsp->ls_comp_cache_count--;
 284  285          }
 285  286          ASSERT(lsp->ls_comp_cache_count == 0);
 286  287  }
 287  288  
 288  289  static int
 289  290  is_opened(struct lofi_state *lsp)
 290  291  {
 291  292          int i;
 292  293          boolean_t last = B_TRUE;
 293  294  
 294  295          ASSERT(MUTEX_HELD(&lofi_lock));
 295  296          for (i = 0; i < LOFI_PART_MAX; i++) {
 296  297                  if (lsp->ls_open_lyr[i]) {
 297  298                          last = B_FALSE;
 298  299                          break;
 299  300                  }
 300  301          }
 301  302  
 302  303          for (i = 0; last && (i < OTYP_LYR); i++) {
 303  304                  if (lsp->ls_open_reg[i]) {
 304  305                          last = B_FALSE;
 305  306                  }
 306  307          }
 307  308  
 308  309          return (!last);
 309  310  }
 310  311  
 311  312  static void
 312  313  lofi_set_cleanup(struct lofi_state *lsp)
 313  314  {
 314  315          ASSERT(MUTEX_HELD(&lofi_lock));
 315  316  
 316  317          lsp->ls_cleanup = B_TRUE;
 317  318  
 318  319          /* wake up any threads waiting on dkiocstate */
 319  320          cv_broadcast(&lsp->ls_vp_cv);
 320  321  }
 321  322  
 322  323  static void
 323  324  lofi_free_crypto(struct lofi_state *lsp)
 324  325  {
 325  326          ASSERT(MUTEX_HELD(&lofi_lock));
 326  327  
 327  328          if (lsp->ls_crypto_enabled) {
 328  329                  /*
 329  330                   * Clean up the crypto state so that it doesn't hang around
 330  331                   * in memory after we are done with it.
 331  332                   */
 332  333                  if (lsp->ls_key.ck_data != NULL) {
 333  334                          bzero(lsp->ls_key.ck_data,
 334  335                              CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
 335  336                          kmem_free(lsp->ls_key.ck_data,
 336  337                              CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
 337  338                          lsp->ls_key.ck_data = NULL;
 338  339                          lsp->ls_key.ck_length = 0;
 339  340                  }
 340  341  
 341  342                  if (lsp->ls_mech.cm_param != NULL) {
 342  343                          kmem_free(lsp->ls_mech.cm_param,
 343  344                              lsp->ls_mech.cm_param_len);
 344  345                          lsp->ls_mech.cm_param = NULL;
 345  346                          lsp->ls_mech.cm_param_len = 0;
 346  347                  }
 347  348  
 348  349                  if (lsp->ls_iv_mech.cm_param != NULL) {
 349  350                          kmem_free(lsp->ls_iv_mech.cm_param,
 350  351                              lsp->ls_iv_mech.cm_param_len);
 351  352                          lsp->ls_iv_mech.cm_param = NULL;
 352  353                          lsp->ls_iv_mech.cm_param_len = 0;
 353  354                  }
 354  355  
 355  356                  mutex_destroy(&lsp->ls_crypto_lock);
 356  357          }
 357  358  }
 358  359  
 359  360  /* ARGSUSED */
 360  361  static int
 361  362  lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
 362  363      size_t length, void *tg_cookie)
 363  364  {
 364  365          struct lofi_state *lsp;
 365  366          buf_t   *bp;
 366  367          int     instance;
 367  368          int     rv = 0;
 368  369  
 369  370          instance = ddi_get_instance(dip);
 370  371          if (instance == 0)      /* control node does not have disk */
 371  372                  return (ENXIO);
 372  373  
 373  374          lsp = ddi_get_soft_state(lofi_statep, instance);
 374  375  
 375  376          if (lsp == NULL)
 376  377                  return (ENXIO);
 377  378  
 378  379          if (cmd != TG_READ && cmd != TG_WRITE)
 379  380                  return (EINVAL);
 380  381  
 381  382          /*
 382  383           * Make sure the mapping is set up by checking lsp->ls_vp_ready.
 383  384           */
 384  385          mutex_enter(&lsp->ls_vp_lock);
 385  386          while (lsp->ls_vp_ready == B_FALSE)
 386  387                  cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
 387  388          mutex_exit(&lsp->ls_vp_lock);
 388  389  
 389  390          if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
 390  391                  /* We can only transfer whole blocks at a time! */
 391  392                  return (EINVAL);
 392  393          }
 393  394  
 394  395          bp = getrbuf(KM_SLEEP);
 395  396  
 396  397          if (cmd == TG_READ) {
 397  398                  bp->b_flags = B_READ;
 398  399          } else {
 399  400                  if (lsp->ls_readonly == B_TRUE) {
 400  401                          freerbuf(bp);
 401  402                          return (EROFS);
 402  403                  }
 403  404                  bp->b_flags = B_WRITE;
 404  405          }
 405  406  
 406  407          bp->b_un.b_addr = bufaddr;
 407  408          bp->b_bcount = length;
 408  409          bp->b_lblkno = start;
 409  410          bp->b_private = NULL;
 410  411          bp->b_edev = lsp->ls_dev;
 411  412  
 412  413          if (lsp->ls_kstat) {
 413  414                  mutex_enter(lsp->ls_kstat->ks_lock);
 414  415                  kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
 415  416                  mutex_exit(lsp->ls_kstat->ks_lock);
 416  417          }
 417  418          (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
 418  419          (void) biowait(bp);
 419  420  
 420  421          rv = geterror(bp);
 421  422          freerbuf(bp);
 422  423          return (rv);
 423  424  }
 424  425  
 425  426  /*
 426  427   * Get device geometry info for cmlb.
 427  428   *
 428  429   * We have mapped disk image as virtual block device and have to report
 429  430   * physical/virtual geometry to cmlb.
 430  431   *
 431  432   * So we have two principal cases:
 432  433   * 1. Uninitialised image without any existing labels,
 433  434   *    for this case we fabricate the data based on mapped image.
 434  435   * 2. Image with existing label information.
 435  436   *    Since we have no information how the image was created (it may be
 436  437   *    dump from some physical device), we need to rely on label information
 437  438   *    from image, or we get "corrupted label" errors.
 438  439   *    NOTE: label can be MBR, MBR+SMI, GPT
 439  440   */
 440  441  static int
 441  442  lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
 442  443  {
 443  444          struct lofi_state *lsp;
 444  445          int instance;
 445  446          int ashift;
 446  447  
 447  448          _NOTE(ARGUNUSED(tg_cookie));
 448  449          instance = ddi_get_instance(dip);
 449  450          if (instance == 0)              /* control device has no storage */
 450  451                  return (ENXIO);
 451  452  
 452  453          lsp = ddi_get_soft_state(lofi_statep, instance);
 453  454  
 454  455          if (lsp == NULL)
 455  456                  return (ENXIO);
 456  457  
 457  458          /*
 458  459           * Make sure the mapping is set up by checking lsp->ls_vp_ready.
 459  460           *
 460  461           * When mapping is created, new lofi instance is created and
 461  462           * lofi_attach() will call cmlb_attach() as part of the procedure
 462  463           * to set the mapping up. This chain of events will happen in
 463  464           * the same thread.
 464  465           * Since cmlb_attach() will call lofi_tg_getinfo to get
 465  466           * capacity, we return error on that call if cookie is set,
 466  467           * otherwise lofi_attach will be stuck as the mapping is not yet
 467  468           * finalized and lofi is not yet ready.
 468  469           * Note, such error is not fatal for cmlb, as the label setup
 469  470           * will be finalized when cmlb_validate() is called.
 470  471           */
 471  472          mutex_enter(&lsp->ls_vp_lock);
 472  473          if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
 473  474                  mutex_exit(&lsp->ls_vp_lock);
 474  475                  return (ENXIO);
 475  476          }
 476  477          while (lsp->ls_vp_ready == B_FALSE)
 477  478                  cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
 478  479          mutex_exit(&lsp->ls_vp_lock);
 479  480  
 480  481          ashift = lsp->ls_lbshift;
 481  482  
 482  483          switch (cmd) {
 483  484          case TG_GETPHYGEOM: {
 484  485                  cmlb_geom_t *geomp = arg;
 485  486  
 486  487                  geomp->g_capacity       =
 487  488                      (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
 488  489                  geomp->g_nsect          = lsp->ls_dkg.dkg_nsect;
 489  490                  geomp->g_nhead          = lsp->ls_dkg.dkg_nhead;
 490  491                  geomp->g_acyl           = lsp->ls_dkg.dkg_acyl;
 491  492                  geomp->g_ncyl           = lsp->ls_dkg.dkg_ncyl;
 492  493                  geomp->g_secsize        = (1U << ashift);
 493  494                  geomp->g_intrlv         = lsp->ls_dkg.dkg_intrlv;
 494  495                  geomp->g_rpm            = lsp->ls_dkg.dkg_rpm;
 495  496                  return (0);
 496  497          }
 497  498  
 498  499          case TG_GETCAPACITY:
 499  500                  *(diskaddr_t *)arg =
 500  501                      (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
 501  502                  return (0);
 502  503  
 503  504          case TG_GETBLOCKSIZE:
 504  505                  *(uint32_t *)arg = (1U << ashift);
 505  506                  return (0);
 506  507  
 507  508          case TG_GETATTR: {
 508  509                  tg_attribute_t *tgattr = arg;
 509  510  
 510  511                  tgattr->media_is_writable = !lsp->ls_readonly;
 511  512                  tgattr->media_is_solid_state = B_FALSE;
 512  513                  tgattr->media_is_rotational = B_FALSE;
 513  514                  return (0);
 514  515          }
 515  516  
 516  517          default:
 517  518                  return (EINVAL);
 518  519          }
 519  520  }
 520  521  
 521  522  static void
 522  523  lofi_destroy(struct lofi_state *lsp, cred_t *credp)
 523  524  {
 524  525          int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
 525  526          int i;
 526  527  
 527  528          ASSERT(MUTEX_HELD(&lofi_lock));
 528  529  
 529  530          /*
 530  531           * Before we can start to release the other resources,
 531  532           * make sure we have all tasks completed and taskq removed.
 532  533           */
 533  534          if (lsp->ls_taskq != NULL) {
 534  535                  taskq_destroy(lsp->ls_taskq);
 535  536                  lsp->ls_taskq = NULL;
 536  537          }
 537  538  
 538  539          list_remove(&lofi_list, lsp);
 539  540  
 540  541          lofi_free_crypto(lsp);
 541  542  
 542  543          /*
 543  544           * Free pre-allocated compressed buffers
 544  545           */
 545  546          if (lsp->ls_comp_bufs != NULL) {
  
    [ 509 lines elided ]
  
 546  547                  for (i = 0; i < lofi_taskq_nthreads; i++) {
 547  548                          if (lsp->ls_comp_bufs[i].bufsize > 0)
 548  549                                  kmem_free(lsp->ls_comp_bufs[i].buf,
 549  550                                      lsp->ls_comp_bufs[i].bufsize);
 550  551                  }
 551  552                  kmem_free(lsp->ls_comp_bufs,
 552  553                      sizeof (struct compbuf) * lofi_taskq_nthreads);
 553  554          }
 554  555  
 555  556          if (lsp->ls_vp != NULL) {
 556      -                (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL);
      557 +                (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL);
 557  558                  (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
 558  559                      1, 0, credp, NULL);
 559  560                  VN_RELE(lsp->ls_vp);
 560  561          }
 561  562          if (lsp->ls_stacked_vp != lsp->ls_vp)
 562  563                  VN_RELE(lsp->ls_stacked_vp);
 563  564          lsp->ls_vp = lsp->ls_stacked_vp = NULL;
 564  565  
 565  566          if (lsp->ls_kstat != NULL) {
 566  567                  kstat_delete(lsp->ls_kstat);
 567  568                  lsp->ls_kstat = NULL;
 568  569          }
 569  570  
 570  571          /*
 571  572           * Free cached decompressed segment data
 572  573           */
 573  574          lofi_free_comp_cache(lsp);
 574  575          list_destroy(&lsp->ls_comp_cache);
 575  576  
 576  577          if (lsp->ls_uncomp_seg_sz > 0) {
 577  578                  kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
 578  579                  lsp->ls_uncomp_seg_sz = 0;
 579  580          }
 580  581  
 581  582          rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
 582  583          zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
 583  584  
 584  585          mutex_destroy(&lsp->ls_comp_cache_lock);
 585  586          mutex_destroy(&lsp->ls_comp_bufs_lock);
 586  587          mutex_destroy(&lsp->ls_kstat_lock);
 587  588          mutex_destroy(&lsp->ls_vp_lock);
 588  589          cv_destroy(&lsp->ls_vp_cv);
 589  590          lsp->ls_vp_ready = B_FALSE;
 590  591          lsp->ls_vp_closereq = B_FALSE;
 591  592  
 592  593          ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
 593  594          (void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE);
 594  595          id_free(lofi_id, id);
 595  596  }
 596  597  
 597  598  static void
 598  599  lofi_free_dev(struct lofi_state *lsp)
 599  600  {
 600  601          ASSERT(MUTEX_HELD(&lofi_lock));
 601  602  
 602  603          if (lsp->ls_cmlbhandle != NULL) {
 603  604                  cmlb_invalidate(lsp->ls_cmlbhandle, 0);
 604  605                  cmlb_detach(lsp->ls_cmlbhandle, 0);
 605  606                  cmlb_free_handle(&lsp->ls_cmlbhandle);
 606  607                  lsp->ls_cmlbhandle = NULL;
 607  608          }
 608  609          (void) ddi_prop_remove_all(lsp->ls_dip);
 609  610          ddi_remove_minor_node(lsp->ls_dip, NULL);
 610  611  }
 611  612  
 612  613  /*ARGSUSED*/
 613  614  static void
 614  615  lofi_zone_shutdown(zoneid_t zoneid, void *arg)
 615  616  {
 616  617          struct lofi_state *lsp;
 617  618          struct lofi_state *next;
 618  619  
 619  620          mutex_enter(&lofi_lock);
 620  621  
 621  622          for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {
 622  623  
 623  624                  /* lofi_destroy() frees lsp */
 624  625                  next = list_next(&lofi_list, lsp);
 625  626  
 626  627                  if (lsp->ls_zone.zref_zone->zone_id != zoneid)
 627  628                          continue;
 628  629  
 629  630                  /*
 630  631                   * No in-zone processes are running, but something has this
 631  632                   * open.  It's either a global zone process, or a lofi
 632  633                   * mount.  In either case we set ls_cleanup so the last
 633  634                   * user destroys the device.
 634  635                   */
 635  636                  if (is_opened(lsp)) {
 636  637                          lofi_set_cleanup(lsp);
 637  638                  } else {
 638  639                          lofi_free_dev(lsp);
 639  640                          lofi_destroy(lsp, kcred);
 640  641                  }
 641  642          }
 642  643  
 643  644          mutex_exit(&lofi_lock);
 644  645  }
 645  646  
 646  647  /*ARGSUSED*/
 647  648  static int
 648  649  lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
 649  650  {
 650  651          int id;
 651  652          minor_t part;
 652  653          uint64_t mask;
 653  654          diskaddr_t nblks;
 654  655          diskaddr_t lba;
 655  656          boolean_t ndelay;
 656  657  
 657  658          struct lofi_state *lsp;
 658  659  
 659  660          if (otyp >= OTYPCNT)
 660  661                  return (EINVAL);
 661  662  
 662  663          ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;
 663  664  
 664  665          /*
 665  666           * lofiadm -a /dev/lofi/1 gets us here.
 666  667           */
 667  668          if (mutex_owner(&lofi_lock) == curthread)
 668  669                  return (EINVAL);
 669  670  
 670  671          mutex_enter(&lofi_lock);
 671  672  
 672  673          id = LOFI_MINOR2ID(getminor(*devp));
 673  674          part = LOFI_PART(getminor(*devp));
 674  675          mask = (1U << part);
 675  676  
 676  677          /* master control device */
 677  678          if (id == 0) {
 678  679                  mutex_exit(&lofi_lock);
 679  680                  return (0);
 680  681          }
 681  682  
 682  683          /* otherwise, the mapping should already exist */
 683  684          lsp = ddi_get_soft_state(lofi_statep, id);
 684  685          if (lsp == NULL) {
 685  686                  mutex_exit(&lofi_lock);
 686  687                  return (EINVAL);
 687  688          }
 688  689  
 689  690          if (lsp->ls_cleanup == B_TRUE) {
 690  691                  mutex_exit(&lofi_lock);
 691  692                  return (ENXIO);
 692  693          }
 693  694  
 694  695          if (lsp->ls_vp == NULL) {
 695  696                  mutex_exit(&lofi_lock);
 696  697                  return (ENXIO);
 697  698          }
 698  699  
 699  700          if (lsp->ls_readonly && (flag & FWRITE)) {
 700  701                  mutex_exit(&lofi_lock);
 701  702                  return (EROFS);
 702  703          }
 703  704  
 704  705          if ((lsp->ls_open_excl) & (mask)) {
 705  706                  mutex_exit(&lofi_lock);
 706  707                  return (EBUSY);
 707  708          }
 708  709  
 709  710          if (flag & FEXCL) {
 710  711                  if (lsp->ls_open_lyr[part]) {
 711  712                          mutex_exit(&lofi_lock);
 712  713                          return (EBUSY);
 713  714                  }
 714  715                  for (int i = 0; i < OTYP_LYR; i++) {
 715  716                          if (lsp->ls_open_reg[i] & mask) {
 716  717                                  mutex_exit(&lofi_lock);
 717  718                                  return (EBUSY);
 718  719                          }
 719  720                  }
 720  721          }
 721  722  
 722  723          if (lsp->ls_cmlbhandle != NULL) {
 723  724                  if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
 724  725                          /*
 725  726                           * non-blocking opens are allowed to succeed to
 726  727                           * support format and fdisk to create partitioning.
 727  728                           */
 728  729                          if (!ndelay) {
 729  730                                  mutex_exit(&lofi_lock);
 730  731                                  return (ENXIO);
 731  732                          }
 732  733                  } else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
 733  734                      NULL, NULL, 0) == 0) {
 734  735                          if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
 735  736                                  mutex_exit(&lofi_lock);
 736  737                                  return (ENXIO);
 737  738                          }
 738  739                  } else if (!ndelay) {
 739  740                          mutex_exit(&lofi_lock);
 740  741                          return (ENXIO);
 741  742                  }
 742  743          }
 743  744  
 744  745          if (otyp == OTYP_LYR) {
 745  746                  lsp->ls_open_lyr[part]++;
 746  747          } else {
 747  748                  lsp->ls_open_reg[otyp] |= mask;
 748  749          }
 749  750          if (flag & FEXCL) {
 750  751                  lsp->ls_open_excl |= mask;
 751  752          }
 752  753  
 753  754          mutex_exit(&lofi_lock);
 754  755          return (0);
 755  756  }
 756  757  
 757  758  /*ARGSUSED*/
 758  759  static int
 759  760  lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
 760  761  {
 761  762          minor_t part;
 762  763          int id;
 763  764          uint64_t mask;
 764  765          struct lofi_state *lsp;
 765  766  
 766  767          id = LOFI_MINOR2ID(getminor(dev));
 767  768          part = LOFI_PART(getminor(dev));
 768  769          mask = (1U << part);
 769  770  
 770  771          mutex_enter(&lofi_lock);
 771  772          lsp = ddi_get_soft_state(lofi_statep, id);
 772  773          if (lsp == NULL) {
 773  774                  mutex_exit(&lofi_lock);
 774  775                  return (EINVAL);
 775  776          }
 776  777  
 777  778          if (id == 0) {
 778  779                  mutex_exit(&lofi_lock);
 779  780                  return (0);
 780  781          }
 781  782  
 782  783          if (lsp->ls_open_excl & mask)
 783  784                  lsp->ls_open_excl &= ~mask;
 784  785  
 785  786          if (otyp == OTYP_LYR) {
 786  787                  lsp->ls_open_lyr[part]--;
 787  788          } else {
 788  789                  lsp->ls_open_reg[otyp] &= ~mask;
 789  790          }
 790  791  
 791  792          /*
 792  793           * If we forcibly closed the underlying device (li_force), or
 793  794           * asked for cleanup (li_cleanup), finish up if we're the last
 794  795           * out of the door.
 795  796           */
 796  797          if (!is_opened(lsp) &&
 797  798              (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) {
 798  799                  lofi_free_dev(lsp);
 799  800                  lofi_destroy(lsp, credp);
 800  801          }
 801  802  
 802  803          mutex_exit(&lofi_lock);
 803  804          return (0);
 804  805  }
 805  806  
 806  807  /*
 807  808   * Sets the mechanism's initialization vector (IV) if one is needed.
 808  809   * The IV is computed from the data block number.  lsp->ls_mech is
 809  810   * altered so that:
 810  811   *      lsp->ls_mech.cm_param_len is set to the IV len.
 811  812   *      lsp->ls_mech.cm_param is set to the IV.
 812  813   */
 813  814  static int
 814  815  lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
 815  816  {
 816  817          int     ret;
 817  818          crypto_data_t cdata;
 818  819          char    *iv;
 819  820          size_t  iv_len;
 820  821          size_t  min;
 821  822          void    *data;
 822  823          size_t  datasz;
 823  824  
 824  825          ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));
 825  826  
 826  827          if (lsp == NULL)
 827  828                  return (CRYPTO_DEVICE_ERROR);
 828  829  
 829  830          /* lsp->ls_mech.cm_param{_len} has already been set for static iv */
 830  831          if (lsp->ls_iv_type == IVM_NONE) {
 831  832                  return (CRYPTO_SUCCESS);
 832  833          }
 833  834  
 834  835          /*
 835  836           * if kmem already alloced from previous call and it's the same size
 836  837           * we need now, just recycle it; allocate new kmem only if we have to
 837  838           */
 838  839          if (lsp->ls_mech.cm_param == NULL ||
 839  840              lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
 840  841                  iv_len = lsp->ls_iv_len;
 841  842                  iv = kmem_zalloc(iv_len, KM_SLEEP);
 842  843          } else {
 843  844                  iv_len = lsp->ls_mech.cm_param_len;
 844  845                  iv = lsp->ls_mech.cm_param;
 845  846                  bzero(iv, iv_len);
 846  847          }
 847  848  
 848  849          switch (lsp->ls_iv_type) {
 849  850          case IVM_ENC_BLKNO:
 850  851                  /* iv is not static, lblkno changes each time */
 851  852                  data = &lblkno;
 852  853                  datasz = sizeof (lblkno);
 853  854                  break;
 854  855          default:
 855  856                  data = 0;
 856  857                  datasz = 0;
 857  858                  break;
 858  859          }
 859  860  
 860  861          /*
 861  862           * write blkno into the iv buffer padded on the left in case
 862  863           * blkno ever grows bigger than its current longlong_t size
 863  864           * or a variation other than blkno is used for the iv data
 864  865           */
 865  866          min = MIN(datasz, iv_len);
 866  867          bcopy(data, iv + (iv_len - min), min);
 867  868  
 868  869          /* encrypt the data in-place to get the IV */
 869  870          SETUP_C_DATA(cdata, iv, iv_len);
 870  871  
 871  872          ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
 872  873              NULL, NULL, NULL);
 873  874          if (ret != CRYPTO_SUCCESS) {
 874  875                  cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
 875  876                      lblkno, ret);
 876  877                  if (lsp->ls_mech.cm_param != iv)
 877  878                          kmem_free(iv, iv_len);
 878  879  
 879  880                  return (ret);
 880  881          }
 881  882  
 882  883          /* clean up the iv from the last computation */
 883  884          if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
 884  885                  kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);
 885  886  
 886  887          lsp->ls_mech.cm_param_len = iv_len;
 887  888          lsp->ls_mech.cm_param = iv;
 888  889  
 889  890          return (CRYPTO_SUCCESS);
 890  891  }
 891  892  
 892  893  /*
 893  894   * Performs encryption and decryption of a chunk of data of size "len",
 894  895   * one DEV_BSIZE block at a time.  "len" is assumed to be a multiple of
 895  896   * DEV_BSIZE.
 896  897   */
 897  898  static int
 898  899  lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
 899  900      caddr_t ciphertext, size_t len, boolean_t op_encrypt)
 900  901  {
 901  902          crypto_data_t cdata;
 902  903          crypto_data_t wdata;
 903  904          int ret;
 904  905          longlong_t lblkno = bp->b_lblkno;
 905  906  
 906  907          mutex_enter(&lsp->ls_crypto_lock);
 907  908  
 908  909          /*
 909  910           * though we could encrypt/decrypt entire "len" chunk of data, we need
 910  911           * to break it into DEV_BSIZE pieces to capture blkno incrementing
 911  912           */
 912  913          SETUP_C_DATA(cdata, plaintext, len);
 913  914          cdata.cd_length = DEV_BSIZE;
 914  915          if (ciphertext != NULL) {               /* not in-place crypto */
 915  916                  SETUP_C_DATA(wdata, ciphertext, len);
 916  917                  wdata.cd_length = DEV_BSIZE;
 917  918          }
 918  919  
 919  920          do {
 920  921                  ret = lofi_blk_mech(lsp, lblkno);
 921  922                  if (ret != CRYPTO_SUCCESS)
 922  923                          continue;
 923  924  
 924  925                  if (op_encrypt) {
 925  926                          ret = crypto_encrypt(&lsp->ls_mech, &cdata,
 926  927                              &lsp->ls_key, NULL,
 927  928                              ((ciphertext != NULL) ? &wdata : NULL), NULL);
 928  929                  } else {
 929  930                          ret = crypto_decrypt(&lsp->ls_mech, &cdata,
 930  931                              &lsp->ls_key, NULL,
 931  932                              ((ciphertext != NULL) ? &wdata : NULL), NULL);
 932  933                  }
 933  934  
 934  935                  cdata.cd_offset += DEV_BSIZE;
 935  936                  if (ciphertext != NULL)
 936  937                          wdata.cd_offset += DEV_BSIZE;
 937  938                  lblkno++;
 938  939          } while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);
 939  940  
 940  941          mutex_exit(&lsp->ls_crypto_lock);
 941  942  
 942  943          if (ret != CRYPTO_SUCCESS) {
 943  944                  cmn_err(CE_WARN, "%s failed for block %lld:  (0x%x)",
 944  945                      op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
 945  946                      lblkno, ret);
 946  947          }
 947  948  
 948  949          return (ret);
 949  950  }
 950  951  
#define	RDWR_RAW	1
#define	RDWR_BCOPY	2

/*
 * Common read/write back-end for both plain and encrypted lofi.
 *
 * method RDWR_RAW performs vn_rdwr() against the backing vnode at
 * "offset"; method RDWR_BCOPY copies to/from the caller-supplied
 * mapping "bcopy_locn" (used by lofi_mapped_rdwr) and deliberately
 * never touches bp->b_resid.  Returns 0 or an errno.
 */
static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t resid;
	int isread;
	int error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		/* decrypt in place once the ciphertext has been read in */
		if (lsp->ls_crypto_enabled && error == 0) {
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		/* release the bounce buffer used for the encrypted write */
		if (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}
1024 1025  
/*
 * Read/write the backing file through segmap: map MAXBSIZE-aligned
 * chunks of the file into KAS with segmap_getmapflt(), fault the pages
 * in, and bcopy through lofi_rdwr(RDWR_BCOPY).  Avoids the vn_rdwr()
 * deadlock described at the call site in lofi_strategy_task().
 * Returns 0 or an errno.
 */
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |	   len
	 *    v    v	    v
	 * ===|====X========|====...======|========X====|====
	 *	   /-------------...---------------/
	 *		^ bp->b_bcount/bp->b_resid at start
	 *    /----/--------/----...------/--------/
	 *	^	^	^   ^		^
	 *	|	|	|   |		nth xfersize (<= MAXBSIZE)
	 *	|	|	2nd thru n-1st xfersize (= MAXBSIZE)
	 *	|	1st xfersize (<= MAXBSIZE)
	 *    mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		/* cap each pass at file end, chunk end, and request end */
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		/* the release error only matters if the copy succeeded */
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}
1145 1146  
1146 1147  /*
1147 1148   * Check if segment seg_index is present in the decompressed segment
1148 1149   * data cache.
1149 1150   *
1150 1151   * Returns a pointer to the decompressed segment data cache entry if
1151 1152   * found, and NULL when decompressed data for this segment is not yet
1152 1153   * cached.
1153 1154   */
1154 1155  static struct lofi_comp_cache *
1155 1156  lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
1156 1157  {
1157 1158          struct lofi_comp_cache *lc;
1158 1159  
1159 1160          ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));
1160 1161  
1161 1162          for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
1162 1163              lc = list_next(&lsp->ls_comp_cache, lc)) {
1163 1164                  if (lc->lc_index == seg_index) {
1164 1165                          /*
1165 1166                           * Decompressed segment data was found in the
1166 1167                           * cache.
1167 1168                           *
1168 1169                           * The cache uses an LRU replacement strategy;
1169 1170                           * move the entry to head of list.
1170 1171                           */
1171 1172                          list_remove(&lsp->ls_comp_cache, lc);
1172 1173                          list_insert_head(&lsp->ls_comp_cache, lc);
1173 1174                          return (lc);
1174 1175                  }
1175 1176          }
1176 1177          return (NULL);
1177 1178  }
1178 1179  
1179 1180  /*
1180 1181   * Add the data for a decompressed segment at segment index
1181 1182   * seg_index to the cache of the decompressed segments.
1182 1183   *
1183 1184   * Returns a pointer to the cache element structure in case
1184 1185   * the data was added to the cache; returns NULL when the data
1185 1186   * wasn't cached.
1186 1187   */
1187 1188  static struct lofi_comp_cache *
1188 1189  lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
1189 1190      uchar_t *data)
1190 1191  {
1191 1192          struct lofi_comp_cache *lc;
1192 1193  
1193 1194          ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));
1194 1195  
1195 1196          while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
1196 1197                  lc = list_remove_tail(&lsp->ls_comp_cache);
1197 1198                  ASSERT(lc != NULL);
1198 1199                  kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
1199 1200                  kmem_free(lc, sizeof (struct lofi_comp_cache));
1200 1201                  lsp->ls_comp_cache_count--;
1201 1202          }
1202 1203  
1203 1204          /*
1204 1205           * Do not cache when disabled by tunable variable
1205 1206           */
1206 1207          if (lofi_max_comp_cache == 0)
1207 1208                  return (NULL);
1208 1209  
1209 1210          /*
1210 1211           * When the cache has not yet reached the maximum allowed
1211 1212           * number of segments, allocate a new cache element.
1212 1213           * Otherwise the cache is full; reuse the last list element
1213 1214           * (LRU) for caching the decompressed segment data.
1214 1215           *
1215 1216           * The cache element for the new decompressed segment data is
1216 1217           * added to the head of the list.
1217 1218           */
1218 1219          if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
1219 1220                  lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
1220 1221                  lc->lc_data = NULL;
1221 1222                  list_insert_head(&lsp->ls_comp_cache, lc);
1222 1223                  lsp->ls_comp_cache_count++;
1223 1224          } else {
1224 1225                  lc = list_remove_tail(&lsp->ls_comp_cache);
1225 1226                  if (lc == NULL)
1226 1227                          return (NULL);
1227 1228                  list_insert_head(&lsp->ls_comp_cache, lc);
1228 1229          }
1229 1230  
1230 1231          /*
1231 1232           * Free old uncompressed segment data when reusing a cache
1232 1233           * entry.
1233 1234           */
1234 1235          if (lc->lc_data != NULL)
1235 1236                  kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
1236 1237  
1237 1238          lc->lc_data = data;
1238 1239          lc->lc_index = seg_index;
1239 1240          return (lc);
1240 1241  }
1241 1242  
1242 1243  
1243 1244  /*ARGSUSED*/
1244 1245  static int
1245 1246  gzip_decompress(void *src, size_t srclen, void *dst,
1246 1247      size_t *dstlen, int level)
1247 1248  {
1248 1249          ASSERT(*dstlen >= srclen);
1249 1250  
1250 1251          if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
1251 1252                  return (-1);
1252 1253          return (0);
1253 1254  }
1254 1255  
1255 1256  #define LZMA_HEADER_SIZE        (LZMA_PROPS_SIZE + 8)
1256 1257  /*ARGSUSED*/
1257 1258  static int
1258 1259  lzma_decompress(void *src, size_t srclen, void *dst,
1259 1260      size_t *dstlen, int level)
1260 1261  {
1261 1262          size_t insizepure;
1262 1263          void *actual_src;
1263 1264          ELzmaStatus status;
1264 1265  
1265 1266          insizepure = srclen - LZMA_HEADER_SIZE;
1266 1267          actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);
1267 1268  
1268 1269          if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
1269 1270              (const Byte *)actual_src, &insizepure,
1270 1271              (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
1271 1272              &g_Alloc) != SZ_OK) {
1272 1273                  return (-1);
1273 1274          }
1274 1275          return (0);
1275 1276  }
1276 1277  
1277 1278  /*
1278 1279   * This is basically what strategy used to be before we found we
1279 1280   * needed task queues.
1280 1281   */
1281 1282  static void
1282 1283  lofi_strategy_task(void *arg)
1283 1284  {
1284 1285          struct buf *bp = (struct buf *)arg;
1285 1286          int error;
1286 1287          int syncflag = 0;
1287 1288          struct lofi_state *lsp;
1288 1289          offset_t offset;
1289 1290          caddr_t bufaddr;
1290 1291          size_t  len;
1291 1292          size_t  xfersize;
1292 1293          boolean_t bufinited = B_FALSE;
1293 1294  
1294 1295          lsp = ddi_get_soft_state(lofi_statep,
1295 1296              LOFI_MINOR2ID(getminor(bp->b_edev)));
1296 1297  
1297 1298          if (lsp == NULL) {
1298 1299                  error = ENXIO;
1299 1300                  goto errout;
1300 1301          }
1301 1302          if (lsp->ls_kstat) {
1302 1303                  mutex_enter(lsp->ls_kstat->ks_lock);
1303 1304                  kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
1304 1305                  mutex_exit(lsp->ls_kstat->ks_lock);
1305 1306          }
1306 1307  
1307 1308          mutex_enter(&lsp->ls_vp_lock);
1308 1309          lsp->ls_vp_iocount++;
1309 1310          mutex_exit(&lsp->ls_vp_lock);
1310 1311  
1311 1312          bp_mapin(bp);
1312 1313          bufaddr = bp->b_un.b_addr;
1313 1314          offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private)
1314 1315              << lsp->ls_lbshift; /* offset within file */
1315 1316          if (lsp->ls_crypto_enabled) {
1316 1317                  /* encrypted data really begins after crypto header */
1317 1318                  offset += lsp->ls_crypto_offset;
1318 1319          }
1319 1320          len = bp->b_bcount;
1320 1321          bufinited = B_TRUE;
1321 1322  
1322 1323          if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
1323 1324                  error = EIO;
1324 1325                  goto errout;
1325 1326          }
1326 1327  
1327 1328          /*
1328 1329           * If we're writing and the buffer was not B_ASYNC
1329 1330           * we'll follow up with a VOP_FSYNC() to force any
1330 1331           * asynchronous I/O to stable storage.
1331 1332           */
1332 1333          if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
1333 1334                  syncflag = FSYNC;
1334 1335  
1335 1336          /*
1336 1337           * We used to always use vn_rdwr here, but we cannot do that because
1337 1338           * we might decide to read or write from the the underlying
1338 1339           * file during this call, which would be a deadlock because
1339 1340           * we have the rw_lock. So instead we page, unless it's not
1340 1341           * mapable or it's a character device or it's an encrypted lofi.
1341 1342           */
1342 1343          if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
1343 1344              lsp->ls_crypto_enabled) {
1344 1345                  error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
1345 1346                      NULL);
1346 1347          } else if (lsp->ls_uncomp_seg_sz == 0) {
1347 1348                  error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
1348 1349          } else {
1349 1350                  uchar_t *compressed_seg = NULL, *cmpbuf;
1350 1351                  uchar_t *uncompressed_seg = NULL;
1351 1352                  lofi_compress_info_t *li;
1352 1353                  size_t oblkcount;
1353 1354                  ulong_t seglen;
1354 1355                  uint64_t sblkno, eblkno, cmpbytes;
1355 1356                  uint64_t uncompressed_seg_index;
1356 1357                  struct lofi_comp_cache *lc;
1357 1358                  offset_t sblkoff, eblkoff;
1358 1359                  u_offset_t salign, ealign;
1359 1360                  u_offset_t sdiff;
1360 1361                  uint32_t comp_data_sz;
1361 1362                  uint64_t i;
1362 1363                  int j;
1363 1364  
1364 1365                  /*
1365 1366                   * From here on we're dealing primarily with compressed files
1366 1367                   */
1367 1368                  ASSERT(!lsp->ls_crypto_enabled);
1368 1369  
1369 1370                  /*
1370 1371                   * Compressed files can only be read from and
1371 1372                   * not written to
1372 1373                   */
1373 1374                  if (!(bp->b_flags & B_READ)) {
1374 1375                          bp->b_resid = bp->b_bcount;
1375 1376                          error = EROFS;
1376 1377                          goto done;
1377 1378                  }
1378 1379  
1379 1380                  ASSERT(lsp->ls_comp_algorithm_index >= 0);
1380 1381                  li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
1381 1382                  /*
1382 1383                   * Compute starting and ending compressed segment numbers
1383 1384                   * We use only bitwise operations avoiding division and
1384 1385                   * modulus because we enforce the compression segment size
1385 1386                   * to a power of 2
1386 1387                   */
1387 1388                  sblkno = offset >> lsp->ls_comp_seg_shift;
1388 1389                  sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
1389 1390                  eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
1390 1391                  eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);
1391 1392  
1392 1393                  /*
1393 1394                   * Check the decompressed segment cache.
1394 1395                   *
1395 1396                   * The cache is used only when the requested data
1396 1397                   * is within a segment. Requests that cross
1397 1398                   * segment boundaries bypass the cache.
1398 1399                   */
1399 1400                  if (sblkno == eblkno ||
1400 1401                      (sblkno + 1 == eblkno && eblkoff == 0)) {
1401 1402                          /*
1402 1403                           * Request doesn't cross a segment boundary,
1403 1404                           * now check the cache.
1404 1405                           */
1405 1406                          mutex_enter(&lsp->ls_comp_cache_lock);
1406 1407                          lc = lofi_find_comp_data(lsp, sblkno);
1407 1408                          if (lc != NULL) {
1408 1409                                  /*
1409 1410                                   * We've found the decompressed segment
1410 1411                                   * data in the cache; reuse it.
1411 1412                                   */
1412 1413                                  bcopy(lc->lc_data + sblkoff, bufaddr,
1413 1414                                      bp->b_bcount);
1414 1415                                  mutex_exit(&lsp->ls_comp_cache_lock);
1415 1416                                  bp->b_resid = 0;
1416 1417                                  error = 0;
1417 1418                                  goto done;
1418 1419                          }
1419 1420                          mutex_exit(&lsp->ls_comp_cache_lock);
1420 1421                  }
1421 1422  
1422 1423                  /*
1423 1424                   * Align start offset to block boundary for segmap
1424 1425                   */
1425 1426                  salign = lsp->ls_comp_seg_index[sblkno];
1426 1427                  sdiff = salign & (DEV_BSIZE - 1);
1427 1428                  salign -= sdiff;
1428 1429                  if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
1429 1430                          /*
1430 1431                           * We're dealing with the last segment of
1431 1432                           * the compressed file -- the size of this
1432 1433                           * segment *may not* be the same as the
1433 1434                           * segment size for the file
1434 1435                           */
1435 1436                          eblkoff = (offset + bp->b_bcount) &
1436 1437                              (lsp->ls_uncomp_last_seg_sz - 1);
1437 1438                          ealign = lsp->ls_vp_comp_size;
1438 1439                  } else {
1439 1440                          ealign = lsp->ls_comp_seg_index[eblkno + 1];
1440 1441                  }
1441 1442  
1442 1443                  /*
1443 1444                   * Preserve original request paramaters
1444 1445                   */
1445 1446                  oblkcount = bp->b_bcount;
1446 1447  
1447 1448                  /*
1448 1449                   * Assign the calculated parameters
1449 1450                   */
1450 1451                  comp_data_sz = ealign - salign;
1451 1452                  bp->b_bcount = comp_data_sz;
1452 1453  
1453 1454                  /*
1454 1455                   * Buffers to hold compressed segments are pre-allocated
1455 1456                   * on a per-thread basis. Find a pre-allocated buffer
1456 1457                   * that is not currently in use and mark it for use.
1457 1458                   */
1458 1459                  mutex_enter(&lsp->ls_comp_bufs_lock);
1459 1460                  for (j = 0; j < lofi_taskq_nthreads; j++) {
1460 1461                          if (lsp->ls_comp_bufs[j].inuse == 0) {
1461 1462                                  lsp->ls_comp_bufs[j].inuse = 1;
1462 1463                                  break;
1463 1464                          }
1464 1465                  }
1465 1466  
1466 1467                  mutex_exit(&lsp->ls_comp_bufs_lock);
1467 1468                  ASSERT(j < lofi_taskq_nthreads);
1468 1469  
1469 1470                  /*
1470 1471                   * If the pre-allocated buffer size does not match
1471 1472                   * the size of the I/O request, re-allocate it with
1472 1473                   * the appropriate size
1473 1474                   */
1474 1475                  if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
1475 1476                          if (lsp->ls_comp_bufs[j].bufsize > 0)
1476 1477                                  kmem_free(lsp->ls_comp_bufs[j].buf,
1477 1478                                      lsp->ls_comp_bufs[j].bufsize);
1478 1479                          lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
1479 1480                              KM_SLEEP);
1480 1481                          lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
1481 1482                  }
1482 1483                  compressed_seg = lsp->ls_comp_bufs[j].buf;
1483 1484  
1484 1485                  /*
1485 1486                   * Map in the calculated number of blocks
1486 1487                   */
1487 1488                  error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
1488 1489                      bp, lsp);
1489 1490  
1490 1491                  bp->b_bcount = oblkcount;
1491 1492                  bp->b_resid = oblkcount;
1492 1493                  if (error != 0)
1493 1494                          goto done;
1494 1495  
1495 1496                  /*
1496 1497                   * decompress compressed blocks start
1497 1498                   */
1498 1499                  cmpbuf = compressed_seg + sdiff;
1499 1500                  for (i = sblkno; i <= eblkno; i++) {
1500 1501                          ASSERT(i < lsp->ls_comp_index_sz - 1);
1501 1502                          uchar_t *useg;
1502 1503  
1503 1504                          /*
1504 1505                           * The last segment is special in that it is
1505 1506                           * most likely not going to be the same
1506 1507                           * (uncompressed) size as the other segments.
1507 1508                           */
1508 1509                          if (i == (lsp->ls_comp_index_sz - 2)) {
1509 1510                                  seglen = lsp->ls_uncomp_last_seg_sz;
1510 1511                          } else {
1511 1512                                  seglen = lsp->ls_uncomp_seg_sz;
1512 1513                          }
1513 1514  
1514 1515                          /*
1515 1516                           * Each of the segment index entries contains
1516 1517                           * the starting block number for that segment.
1517 1518                           * The number of compressed bytes in a segment
1518 1519                           * is thus the difference between the starting
1519 1520                           * block number of this segment and the starting
1520 1521                           * block number of the next segment.
1521 1522                           */
1522 1523                          cmpbytes = lsp->ls_comp_seg_index[i + 1] -
1523 1524                              lsp->ls_comp_seg_index[i];
1524 1525  
1525 1526                          /*
1526 1527                           * The first byte in a compressed segment is a flag
1527 1528                           * that indicates whether this segment is compressed
1528 1529                           * at all.
1529 1530                           *
1530 1531                           * The variable 'useg' is used (instead of
1531 1532                           * uncompressed_seg) in this loop to keep a
1532 1533                           * reference to the uncompressed segment.
1533 1534                           *
1534 1535                           * N.B. If 'useg' is replaced with uncompressed_seg,
1535 1536                           * it leads to memory leaks and heap corruption in
1536 1537                           * corner cases where compressed segments lie
1537 1538                           * adjacent to uncompressed segments.
1538 1539                           */
1539 1540                          if (*cmpbuf == UNCOMPRESSED) {
1540 1541                                  useg = cmpbuf + SEGHDR;
1541 1542                          } else {
1542 1543                                  if (uncompressed_seg == NULL)
1543 1544                                          uncompressed_seg =
1544 1545                                              kmem_alloc(lsp->ls_uncomp_seg_sz,
1545 1546                                              KM_SLEEP);
1546 1547                                  useg = uncompressed_seg;
1547 1548                                  uncompressed_seg_index = i;
1548 1549  
1549 1550                                  if (li->l_decompress((cmpbuf + SEGHDR),
1550 1551                                      (cmpbytes - SEGHDR), uncompressed_seg,
1551 1552                                      &seglen, li->l_level) != 0) {
1552 1553                                          error = EIO;
1553 1554                                          goto done;
1554 1555                                  }
1555 1556                          }
1556 1557  
1557 1558                          /*
1558 1559                           * Determine how much uncompressed data we
1559 1560                           * have to copy and copy it
1560 1561                           */
1561 1562                          xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
1562 1563                          if (i == eblkno)
1563 1564                                  xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);
1564 1565  
1565 1566                          bcopy((useg + sblkoff), bufaddr, xfersize);
1566 1567  
1567 1568                          cmpbuf += cmpbytes;
1568 1569                          bufaddr += xfersize;
1569 1570                          bp->b_resid -= xfersize;
1570 1571                          sblkoff = 0;
1571 1572  
1572 1573                          if (bp->b_resid == 0)
1573 1574                                  break;
1574 1575                  } /* decompress compressed blocks ends */
1575 1576  
1576 1577                  /*
1577 1578                   * Skip to done if there is no uncompressed data to cache
1578 1579                   */
1579 1580                  if (uncompressed_seg == NULL)
1580 1581                          goto done;
1581 1582  
1582 1583                  /*
1583 1584                   * Add the data for the last decompressed segment to
1584 1585                   * the cache.
1585 1586                   *
1586 1587                   * In case the uncompressed segment data was added to (and
1587 1588                   * is referenced by) the cache, make sure we don't free it
1588 1589                   * here.
1589 1590                   */
1590 1591                  mutex_enter(&lsp->ls_comp_cache_lock);
1591 1592                  if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
1592 1593                      uncompressed_seg)) != NULL) {
1593 1594                          uncompressed_seg = NULL;
1594 1595                  }
1595 1596                  mutex_exit(&lsp->ls_comp_cache_lock);
1596 1597  
1597 1598  done:
1598 1599                  if (compressed_seg != NULL) {
1599 1600                          mutex_enter(&lsp->ls_comp_bufs_lock);
1600 1601                          lsp->ls_comp_bufs[j].inuse = 0;
1601 1602                          mutex_exit(&lsp->ls_comp_bufs_lock);
1602 1603                  }
1603 1604                  if (uncompressed_seg != NULL)
1604 1605                          kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
1605 1606          } /* end of handling compressed files */
1606 1607  
1607 1608          if ((error == 0) && (syncflag != 0))
1608 1609                  error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);
1609 1610  
1610 1611  errout:
1611 1612          if (bufinited && lsp->ls_kstat) {
1612 1613                  size_t n_done = bp->b_bcount - bp->b_resid;
1613 1614                  kstat_io_t *kioptr;
1614 1615  
1615 1616                  mutex_enter(lsp->ls_kstat->ks_lock);
1616 1617                  kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
1617 1618                  if (bp->b_flags & B_READ) {
1618 1619                          kioptr->nread += n_done;
1619 1620                          kioptr->reads++;
1620 1621                  } else {
1621 1622                          kioptr->nwritten += n_done;
1622 1623                          kioptr->writes++;
1623 1624                  }
1624 1625                  kstat_runq_exit(kioptr);
1625 1626                  mutex_exit(lsp->ls_kstat->ks_lock);
1626 1627          }
1627 1628  
1628 1629          mutex_enter(&lsp->ls_vp_lock);
1629 1630          if (--lsp->ls_vp_iocount == 0)
1630 1631                  cv_broadcast(&lsp->ls_vp_cv);
1631 1632          mutex_exit(&lsp->ls_vp_lock);
1632 1633  
1633 1634          bioerror(bp, error);
1634 1635          biodone(bp);
1635 1636  }
1636 1637  
/*
 * Block-device strategy entry point. Validates the request against the
 * device/partition geometry and, if acceptable, queues it for asynchronous
 * execution by lofi_strategy_task(). Always returns 0; errors are delivered
 * to the caller through bioerror()/biodone() on the buf itself.
 */
static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t	offset;
	minor_t		part;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	int		shift;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. task
	 * queues were incredibly easy so they win.
	 */

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));
	part = LOFI_PART(getminor(bp->b_edev));

	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	/* Check if we are closing. */
	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		mutex_exit(&lsp->ls_vp_lock);
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}
	mutex_exit(&lsp->ls_vp_lock);

	/* Default geometry: whole backing file, starting at block 0. */
	shift = lsp->ls_lbshift;
	p_lba = 0;
	p_nblks = lsp->ls_vp_size >> shift;

	/* A labeled device gets its partition start/size from cmlb instead. */
	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba,
		    NULL, NULL, 0)) {
			bioerror(bp, ENXIO);
			biodone(bp);
			return (0);
		}
	}

	/* start block past partition end? */
	if (bp->b_lblkno > p_nblks) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	offset = (bp->b_lblkno+p_lba) << shift;	/* offset within file */

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}

	/* make sure we will not pass the file or partition size */
	if (offset == lsp->ls_vp_size ||
	    offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			/* reads at EOF succeed with zero bytes transferred */
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if ((offset > lsp->ls_vp_size) ||
	    (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) ||
	    ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	mutex_exit(&lsp->ls_vp_lock);

	/*
	 * Account the request into the kstat wait queue before dispatch;
	 * lofi_strategy_task() moves it to the run queue and completes it.
	 */
	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp->b_private = (void *)(uintptr_t)p_lba;	/* partition start */
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}
1742 1743  
1743 1744  /*ARGSUSED2*/
1744 1745  static int
1745 1746  lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
1746 1747  {
1747 1748          if (getminor(dev) == 0)
1748 1749                  return (EINVAL);
1749 1750          UIO_CHECK(uio);
1750 1751          return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
1751 1752  }
1752 1753  
1753 1754  /*ARGSUSED2*/
1754 1755  static int
1755 1756  lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
1756 1757  {
1757 1758          if (getminor(dev) == 0)
1758 1759                  return (EINVAL);
1759 1760          UIO_CHECK(uio);
1760 1761          return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
1761 1762  }
1762 1763  
1763 1764  /*ARGSUSED2*/
1764 1765  static int
1765 1766  lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
1766 1767  {
1767 1768          if (getminor(dev) == 0)
1768 1769                  return (EINVAL);
1769 1770          UIO_CHECK(aio->aio_uio);
1770 1771          return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
1771 1772  }
1772 1773  
1773 1774  /*ARGSUSED2*/
1774 1775  static int
1775 1776  lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
1776 1777  {
1777 1778          if (getminor(dev) == 0)
1778 1779                  return (EINVAL);
1779 1780          UIO_CHECK(aio->aio_uio);
1780 1781          return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
1781 1782  }
1782 1783  
1783 1784  /*ARGSUSED*/
1784 1785  static int
1785 1786  lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1786 1787  {
1787 1788          struct lofi_state *lsp;
1788 1789          dev_t   dev = (dev_t)arg;
1789 1790          int instance;
1790 1791  
1791 1792          instance = LOFI_MINOR2ID(getminor(dev));
1792 1793          switch (infocmd) {
1793 1794          case DDI_INFO_DEVT2DEVINFO:
1794 1795                  lsp = ddi_get_soft_state(lofi_statep, instance);
1795 1796                  if (lsp == NULL)
1796 1797                          return (DDI_FAILURE);
1797 1798                  *result = lsp->ls_dip;
1798 1799                  return (DDI_SUCCESS);
1799 1800          case DDI_INFO_DEVT2INSTANCE:
1800 1801                  *result = (void *) (intptr_t)instance;
1801 1802                  return (DDI_SUCCESS);
1802 1803          }
1803 1804          return (DDI_FAILURE);
1804 1805  }
1805 1806  
1806 1807  static int
1807 1808  lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled)
1808 1809  {
1809 1810          int error = 0;
1810 1811          int instance = ddi_get_instance(lsp->ls_dip);
1811 1812  
1812 1813          if (labeled == B_TRUE) {
1813 1814                  cmlb_alloc_handle(&lsp->ls_cmlbhandle);
1814 1815                  error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT,
1815 1816                      B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN,
1816 1817                      CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1);
1817 1818  
1818 1819                  if (error != DDI_SUCCESS) {
1819 1820                          cmlb_free_handle(&lsp->ls_cmlbhandle);
1820 1821                          lsp->ls_cmlbhandle = NULL;
1821 1822                          error = ENXIO;
1822 1823                  }
1823 1824          } else {
1824 1825                  /* create minor nodes */
1825 1826                  error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE,
1826 1827                      S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0);
1827 1828                  if (error == DDI_SUCCESS) {
1828 1829                          error = ddi_create_minor_node(lsp->ls_dip,
1829 1830                              LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance),
1830 1831                              DDI_PSEUDO, 0);
1831 1832                          if (error != DDI_SUCCESS) {
1832 1833                                  ddi_remove_minor_node(lsp->ls_dip,
1833 1834                                      LOFI_BLOCK_NODE);
1834 1835                                  error = ENXIO;
1835 1836                          }
1836 1837                  } else
1837 1838                          error = ENXIO;
1838 1839          }
1839 1840          return (error);
1840 1841  }
1841 1842  
/*
 * Bind a lofi device to the caller's zone: charge the zone's lofi resource
 * control, record the zone name as a device property and take a zone
 * reference. Returns 0 on success or an errno value on failure; on failure
 * the rctl charge is rolled back and no reference is held.
 */
static int
lofi_zone_bind(struct lofi_state *lsp)
{
	int error = 0;

	/* rctl_incr_lofi() is called with the caller's process lock held */
	mutex_enter(&curproc->p_lock);
	if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) {
		mutex_exit(&curproc->p_lock);
		return (error);
	}
	mutex_exit(&curproc->p_lock);

	/* record the owning zone's name on the device node */
	if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME,
	    (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) {
		/* roll back the rctl charge taken above */
		rctl_decr_lofi(curproc->p_zone, 1);
		error = EINVAL;
	} else {
		/* hold a zone reference for the lifetime of this device */
		zone_init_ref(&lsp->ls_zone);
		zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI);
	}
	return (error);
}
1864 1865  
/*
 * Undo lofi_zone_bind(): remove the zone-name property, return the rctl
 * charge to the current zone and release the zone reference held for
 * this device.
 */
static void
lofi_zone_unbind(struct lofi_state *lsp)
{
	(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME);
	rctl_decr_lofi(curproc->p_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
}
1872 1873  
/*
 * Allocate and initialize the soft state for a non-control lofi instance:
 * bind it to the caller's zone, initialize its locks, create its minor
 * nodes and install its I/O kstat.
 *
 * NOTE(review): the return convention is mixed — DDI_SUCCESS (0) on
 * success, but on failure either a positive errno (ENOMEM, ENXIO,
 * zone-bind errors) or DDI_FAILURE (for the prop_create path). Callers
 * should compare against DDI_SUCCESS rather than DDI_FAILURE.
 */
static int
lofi_online_dev(dev_info_t *dip)
{
	boolean_t labeled;
	int	error;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	/* the "labeled" property requests cmlb-managed partitioning */
	labeled = B_FALSE;
	if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
		labeled = B_TRUE;

	/* lsp alloc+init, soft state is freed in lofi_detach */
	error = ddi_soft_state_zalloc(lofi_statep, instance);
	if (error == DDI_FAILURE) {
		return (ENOMEM);
	}

	lsp = ddi_get_soft_state(lofi_statep, instance);
	lsp->ls_dip = dip;

	if ((error = lofi_zone_bind(lsp)) != 0)
		goto err;

	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

	if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) {
		lofi_zone_unbind(lsp);
		goto lerr;
	}

	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		error = DDI_FAILURE;
		goto merr;
	}

	/* per-device I/O statistics, created in the caller's zone */
	lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance,
	    NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid());
	if (lsp->ls_kstat == NULL) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip,
		    DDI_KERNEL_IOCTL);
		error = ENOMEM;
		goto merr;
	}

	lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
	kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID);
	kstat_install(lsp->ls_kstat);
	return (DDI_SUCCESS);
merr:
	/* undo minor-node creation (cmlb or plain) and the zone binding */
	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
	}
	ddi_remove_minor_node(dip, NULL);
	lofi_zone_unbind(lsp);
lerr:
	/* undo the lock/cv initialization */
	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
err:
	ddi_soft_state_free(lofi_statep, instance);
	return (error);
}
1945 1946  
1946 1947  static int
1947 1948  lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1948 1949  {
1949 1950          int     rv;
1950 1951          int     instance = ddi_get_instance(dip);
1951 1952          struct lofi_state *lsp;
1952 1953  
1953 1954          if (cmd != DDI_ATTACH)
1954 1955                  return (DDI_FAILURE);
1955 1956  
1956 1957          /*
1957 1958           * Instance 0 is control instance, attaching control instance
1958 1959           * will set the lofi up and ready.
1959 1960           */
1960 1961          if (instance == 0) {
1961 1962                  rv = ddi_soft_state_zalloc(lofi_statep, 0);
1962 1963                  if (rv == DDI_FAILURE) {
1963 1964                          return (DDI_FAILURE);
1964 1965                  }
1965 1966                  lsp = ddi_get_soft_state(lofi_statep, instance);
1966 1967                  rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
1967 1968                      DDI_PSEUDO, 0);
1968 1969                  if (rv == DDI_FAILURE) {
1969 1970                          ddi_soft_state_free(lofi_statep, 0);
1970 1971                          return (DDI_FAILURE);
1971 1972                  }
1972 1973                  /* driver handles kernel-issued IOCTLs */
1973 1974                  if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
1974 1975                      DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
1975 1976                          ddi_remove_minor_node(dip, NULL);
1976 1977                          ddi_soft_state_free(lofi_statep, 0);
1977 1978                          return (DDI_FAILURE);
1978 1979                  }
1979 1980  
1980 1981                  zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL);
1981 1982  
1982 1983                  lsp->ls_dip = dip;
1983 1984          } else {
1984 1985                  if (lofi_online_dev(dip) == DDI_FAILURE)
1985 1986                          return (DDI_FAILURE);
1986 1987          }
1987 1988  
1988 1989          ddi_report_dev(dip);
1989 1990          return (DDI_SUCCESS);
1990 1991  }
1991 1992  
/*
 * detach(9E) entry point. Data instances (non-zero) may be detached only
 * while they have no backing vnode established; the control instance (0)
 * refuses to detach until every mapped device has gone away.
 */
static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct lofi_state *lsp;
	int instance = ddi_get_instance(dip);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * If the instance is not 0, release state.
	 * The instance 0 is control device, we can not detach it
	 * before other instances are detached.
	 */
	if (instance != 0) {
		lsp = ddi_get_soft_state(lofi_statep, instance);
		/* only instances without an established backing vnode */
		if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) {
			ddi_soft_state_free(lofi_statep, instance);
			return (DDI_SUCCESS);
		} else
			return (DDI_FAILURE);
	}
	mutex_enter(&lofi_lock);

	/* control instance: refuse while any mapping still exists */
	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (DDI_FAILURE);
	}

	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);

	mutex_exit(&lofi_lock);

	if (zone_key_delete(lofi_zone_key) != 0)
		cmn_err(CE_WARN, "failed to delete zone key");

	ddi_soft_state_free(lofi_statep, 0);

	return (DDI_SUCCESS);
}
2033 2034  
2034 2035  /*
2035 2036   * With the addition of encryption, we must be careful that encryption key is
2036 2037   * wiped before kernel's data structures are freed so it cannot accidentally
2037 2038   * slip out to userland through uninitialized data elsewhere.
2038 2039   */
/*
 * Release a struct lofi_ioctl allocated by copy_in_lofi_ioctl(),
 * scrubbing the key material first.
 */
static void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	/* Make sure this encryption key doesn't stick around */
	bzero(klip->li_key, sizeof (klip->li_key));
	kmem_free(klip, sizeof (struct lofi_ioctl));
}
2046 2047  
2047 2048  /*
2048 2049   * These two functions simplify the rest of the ioctls that need to copyin/out
2049 2050   * the lofi_ioctl structure.
2050 2051   */
2051 2052  int
2052 2053  copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
2053 2054      int flag)
2054 2055  {
2055 2056          struct lofi_ioctl *klip;
2056 2057          int     error;
2057 2058  
2058 2059          klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
2059 2060          error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
2060 2061          if (error)
2061 2062                  goto err;
2062 2063  
2063 2064          /* ensure NULL termination */
2064 2065          klip->li_filename[MAXPATHLEN-1] = '\0';
2065 2066          klip->li_devpath[MAXPATHLEN-1] = '\0';
2066 2067          klip->li_algorithm[MAXALGLEN-1] = '\0';
2067 2068          klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
2068 2069          klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
2069 2070  
2070 2071          if (klip->li_id > L_MAXMIN32) {
2071 2072                  error = EINVAL;
2072 2073                  goto err;
2073 2074          }
2074 2075  
2075 2076          return (0);
2076 2077  
2077 2078  err:
2078 2079          free_lofi_ioctl(klip);
2079 2080          return (error);
2080 2081  }
2081 2082  
2082 2083  int
2083 2084  copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
2084 2085      int flag)
2085 2086  {
2086 2087          int     error;
2087 2088  
2088 2089          /*
2089 2090           * NOTE: Do NOT copy the crypto_key_t "back" to userland.
2090 2091           * This ensures that an attacker can't trivially find the
2091 2092           * key for a mapping just by issuing the ioctl.
2092 2093           *
2093 2094           * It can still be found by poking around in kmem with mdb(1),
2094 2095           * but there is no point in making it easy when the info isn't
2095 2096           * of any use in this direction anyway.
2096 2097           *
2097 2098           * Either way we don't actually have the raw key stored in
2098 2099           * a form that we can get it anyway, since we just used it
2099 2100           * to create a ctx template and didn't keep "the original".
2100 2101           */
2101 2102          error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
2102 2103          if (error)
2103 2104                  return (EFAULT);
2104 2105          return (0);
2105 2106  }
2106 2107  
2107 2108  static int
2108 2109  lofi_access(struct lofi_state *lsp)
2109 2110  {
2110 2111          ASSERT(MUTEX_HELD(&lofi_lock));
2111 2112          if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
2112 2113                  return (0);
2113 2114          return (EPERM);
2114 2115  }
2115 2116  
2116 2117  /*
2117 2118   * Find the lofi state for the given filename. We compare by vnode to
2118 2119   * allow the global zone visibility into NGZ lofi nodes.
2119 2120   */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t *vp = NULL;
	int err = 0;
	int rdfiles = 0;	/* read-only mappings seen for this vnode */

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		goto out;

	/*
	 * For regular files, compare against the underlying "real" vnode
	 * (e.g. through a stacking filesystem) so the same backing file
	 * matches regardless of the path used to reach it.
	 */
	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			/* swap our hold from vp to realvp */
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	/* drop the lookup hold in all cases */
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}
2178 2179  
2179 2180  /*
2180 2181   * Find the minor for the given filename, checking the zone can access
2181 2182   * it.
2182 2183   */
2183 2184  static int
2184 2185  file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
2185 2186  {
2186 2187          int err = 0;
2187 2188  
2188 2189          ASSERT(MUTEX_HELD(&lofi_lock));
2189 2190  
2190 2191          if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
2191 2192                  return (err);
2192 2193  
2193 2194          if ((err = lofi_access(*lspp)) != 0)
2194 2195                  return (err);
2195 2196  
2196 2197          return (0);
2197 2198  }
2198 2199  
2199 2200  /*
2200 2201   * Fakes up a disk geometry based on the size of the file. This is needed
2201 2202   * to support newfs on traditional lofi device, but also will provide
2202 2203   * geometry hint for cmlb.
2203 2204   */
2204 2205  static void
2205 2206  fake_disk_geometry(struct lofi_state *lsp)
2206 2207  {
2207 2208          u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;
2208 2209  
2209 2210          /* dk_geom - see dkio(7I) */
2210 2211          /*
2211 2212           * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
2212 2213           * of sectors), but that breaks programs like fdisk which want to
2213 2214           * partition a disk by cylinder. With one cylinder, you can't create
2214 2215           * an fdisk partition and put pcfs on it for testing (hard to pick
2215 2216           * a number between one and one).
2216 2217           *
2217 2218           * The cheezy floppy test is an attempt to not have too few cylinders
2218 2219           * for a small file, or so many on a big file that you waste space
2219 2220           * for backup superblocks or cylinder group structures.
2220 2221           */
2221 2222          bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg));
2222 2223          if (dsize < (2 * 1024 * 1024)) /* floppy? */
2223 2224                  lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
2224 2225          else
2225 2226                  lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
2226 2227          /* in case file file is < 100k */
2227 2228          if (lsp->ls_dkg.dkg_ncyl == 0)
2228 2229                  lsp->ls_dkg.dkg_ncyl = 1;
2229 2230  
2230 2231          lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl;
2231 2232          lsp->ls_dkg.dkg_nhead = 1;
2232 2233          lsp->ls_dkg.dkg_rpm = 7200;
2233 2234  
2234 2235          lsp->ls_dkg.dkg_nsect = dsize /
2235 2236              (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift);
2236 2237  }
2237 2238  
2238 2239  /*
2239 2240   * build vtoc - see dkio(7I)
2240 2241   *
2241 2242   * Fakes one big partition based on the size of the file. This is needed
2242 2243   * because we allow newfs'ing the traditional lofi device and newfs will
2243 2244   * do several disk ioctls to figure out the geometry and partition information.
2244 2245   * It uses that information to determine the parameters to pass to mkfs.
2245 2246   */
2246 2247  static void
2247 2248  fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt)
2248 2249  {
2249 2250          bzero(vt, sizeof (struct vtoc));
2250 2251          vt->v_sanity = VTOC_SANE;
2251 2252          vt->v_version = V_VERSION;
2252 2253          (void) strncpy(vt->v_volume, LOFI_DRIVER_NAME,
2253 2254              sizeof (vt->v_volume));
2254 2255          vt->v_sectorsz = 1 << lsp->ls_pbshift;
2255 2256          vt->v_nparts = 1;
2256 2257          vt->v_part[0].p_tag = V_UNASSIGNED;
2257 2258  
2258 2259          /*
2259 2260           * A compressed file is read-only, other files can
2260 2261           * be read-write
2261 2262           */
2262 2263          if (lsp->ls_uncomp_seg_sz > 0) {
2263 2264                  vt->v_part[0].p_flag = V_UNMNT | V_RONLY;
2264 2265          } else {
2265 2266                  vt->v_part[0].p_flag = V_UNMNT;
2266 2267          }
2267 2268          vt->v_part[0].p_start = (daddr_t)0;
2268 2269          /*
2269 2270           * The partition size cannot just be the number of sectors, because
2270 2271           * that might not end on a cylinder boundary. And if that's the case,
2271 2272           * newfs/mkfs will print a scary warning. So just figure the size
2272 2273           * based on the number of cylinders and sectors/cylinder.
2273 2274           */
2274 2275          vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
2275 2276              lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
2276 2277  }
2277 2278  
2278 2279  /*
2279 2280   * build dk_cinfo - see dkio(7I)
2280 2281   */
2281 2282  static void
2282 2283  fake_disk_info(dev_t dev, struct dk_cinfo *ci)
2283 2284  {
2284 2285          bzero(ci, sizeof (struct dk_cinfo));
2285 2286          (void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname));
2286 2287          ci->dki_ctype = DKC_SCSI_CCS;
2287 2288          (void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname));
2288 2289          ci->dki_unit = LOFI_MINOR2ID(getminor(dev));
2289 2290          ci->dki_partition = LOFI_PART(getminor(dev));
2290 2291          /*
2291 2292           * newfs uses this to set maxcontig. Must not be < 16, or it
2292 2293           * will be 0 when newfs multiplies it by DEV_BSIZE and divides
2293 2294           * it by the block size. Then tunefs doesn't work because
2294 2295           * maxcontig is 0.
2295 2296           */
2296 2297          ci->dki_maxtransfer = 16;
2297 2298  }
2298 2299  
2299 2300  /*
2300 2301   * map in a compressed file
2301 2302   *
2302 2303   * Read in the header and the index that follows.
2303 2304   *
2304 2305   * The header is as follows -
2305 2306   *
2306 2307   * Signature (name of the compression algorithm)
2307 2308   * Compression segment size (a multiple of 512)
2308 2309   * Number of index entries
2309 2310   * Size of the last block
2310 2311   * The array containing the index entries
2311 2312   *
2312 2313   * The header information is always stored in
2313 2314   * network byte order on disk.
2314 2315   */
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t resid;
	enum uio_rw rw;
	char *tbuf = buf;	/* walks across the header fields in buf */
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	/* header fields are stored in network byte order */
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The compressed segment size must be a power of 2
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	/* find log2 of the segment size: position of its single set bit */
	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	/* file offset where the compressed segment data begins */
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	/* on error the index buffer is left for lofi teardown to free */
	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    /*LINTED*/
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}
2405 2406  
/*
 * Initialize the crypto state for a mapping that requested encryption:
 * validate the requested mechanisms and key, then either parse an
 * existing on-disk crypto header or write a fresh one to a new image.
 * Returns 0 or an errno; on failure the caller is responsible for
 * tearing down any partially-initialized state.
 */
static int
lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
{
	struct crypto_meta chead;
	char buf[DEV_BSIZE];
	ssize_t resid;
	char *marker;
	int error;
	int ret;
	int i;

	if (!klip->li_crypto_enabled)
		return (0);

	/*
	 * All current algorithms have a max of 448 bits.
	 *
	 * NOTE(review): the comment above says 448 bits but the check
	 * below allows up to 512 -- confirm which bound is intended.
	 */
	if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
		return (EINVAL);

	if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
		return (EINVAL);

	lsp->ls_crypto_enabled = klip->li_crypto_enabled;

	mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);

	lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
	if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid cipher %s requested for %s",
		    klip->li_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* this is just initialization here */
	lsp->ls_mech.cm_param = NULL;
	lsp->ls_mech.cm_param_len = 0;

	lsp->ls_iv_type = klip->li_iv_type;
	lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
	if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid iv cipher %s requested"
		    " for %s", klip->li_iv_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* iv mech must itself take a null iv */
	lsp->ls_iv_mech.cm_param = NULL;
	lsp->ls_iv_mech.cm_param_len = 0;
	lsp->ls_iv_len = klip->li_iv_len;

	/*
	 * Create ctx using li_cipher & the raw li_key after checking
	 * that it isn't a weak key.
	 */
	lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
	lsp->ls_key.ck_length = klip->li_key_len;
	lsp->ls_key.ck_data = kmem_alloc(
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
	bcopy(klip->li_key, lsp->ls_key.ck_data,
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));

	ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "weak key check failed for cipher "
		    "%s on file %s (0x%x)", klip->li_cipher,
		    klip->li_filename, ret);
		return (EINVAL);
	}

	/* read the sector that holds (or will hold) the crypto header */
	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/*
	 * This is the case where the header in the lofi image is already
	 * initialized to indicate it is encrypted.
	 */
	if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
		/*
		 * The encryption header information is laid out this way:
		 *      6 bytes:        hex "CFLOFI"
		 *      2 bytes:        version = 0 ... for now
		 *      96 bytes:       reserved1 (not implemented yet)
		 *      4 bytes:        data_sector = 2 ... for now
		 *      more...         not implemented yet
		 */

		marker = buf;

		/* copy the magic */
		bcopy(marker, lsp->ls_crypto.magic,
		    sizeof (lsp->ls_crypto.magic));
		marker += sizeof (lsp->ls_crypto.magic);

		/* read the encryption version number */
		bcopy(marker, &(lsp->ls_crypto.version),
		    sizeof (lsp->ls_crypto.version));
		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
		marker += sizeof (lsp->ls_crypto.version);

		/* read a chunk of reserved data */
		bcopy(marker, lsp->ls_crypto.reserved1,
		    sizeof (lsp->ls_crypto.reserved1));
		marker += sizeof (lsp->ls_crypto.reserved1);

		/* read block number where encrypted data begins */
		bcopy(marker, &(lsp->ls_crypto.data_sector),
		    sizeof (lsp->ls_crypto.data_sector));
		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
		marker += sizeof (lsp->ls_crypto.data_sector);

		/* and ignore the rest until it is implemented */

		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
		return (0);
	}

	/*
	 * We've requested encryption, but no magic was found, so it must be
	 * a new image.
	 */

	/* a new image must have an all-zero header area */
	for (i = 0; i < sizeof (struct crypto_meta); i++) {
		if (buf[i] != '\0')
			return (EINVAL);
	}

	/* build the on-disk header (network byte order) in buf */
	marker = buf;
	bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
	marker += sizeof (lofi_crypto_magic);
	chead.version = htons(LOFI_CRYPTO_VERSION);
	bcopy(&(chead.version), marker, sizeof (chead.version));
	marker += sizeof (chead.version);
	marker += sizeof (chead.reserved1);
	chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
	bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));

	/* write the header */
	error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* fix things up so it looks like we read this info */
	bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
	    sizeof (lofi_crypto_magic));
	lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
	lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
	lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
	return (0);
}
2559 2560  
2560 2561  /*
2561 2562   * Check to see if the passed in signature is a valid one.  If it is
2562 2563   * valid, return the index into lofi_compress_table.
2563 2564   *
2564 2565   * Return -1 if it is invalid
2565 2566   */
2566 2567  static int
2567 2568  lofi_compress_select(const char *signature)
2568 2569  {
2569 2570          int i;
2570 2571  
2571 2572          for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
2572 2573                  if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
2573 2574                          return (i);
2574 2575          }
2575 2576  
2576 2577          return (-1);
2577 2578  }
2578 2579  
2579 2580  static int
2580 2581  lofi_init_compress(struct lofi_state *lsp)
2581 2582  {
2582 2583          char buf[DEV_BSIZE];
2583 2584          int compress_index;
2584 2585          ssize_t resid;
2585 2586          int error;
2586 2587  
2587 2588          error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
2588 2589              0, RLIM64_INFINITY, kcred, &resid);
2589 2590  
2590 2591          if (error != 0)
2591 2592                  return (error);
2592 2593  
2593 2594          if ((compress_index = lofi_compress_select(buf)) == -1)
2594 2595                  return (0);
2595 2596  
2596 2597          /* compression and encryption are mutually exclusive */
2597 2598          if (lsp->ls_crypto_enabled)
2598 2599                  return (ENOTSUP);
2599 2600  
2600 2601          /* initialize compression info for compressed lofi */
2601 2602          lsp->ls_comp_algorithm_index = compress_index;
2602 2603          (void) strlcpy(lsp->ls_comp_algorithm,
2603 2604              lofi_compress_table[compress_index].l_name,
2604 2605              sizeof (lsp->ls_comp_algorithm));
2605 2606  
2606 2607          /* Finally setup per-thread pre-allocated buffers */
2607 2608          lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
2608 2609              sizeof (struct compbuf), KM_SLEEP);
2609 2610  
2610 2611          return (lofi_map_compressed_file(lsp, buf));
2611 2612  }
2612 2613  
2613 2614  /*
2614 2615   * Allocate new or proposed id from lofi_id.
2615 2616   *
2616 2617   * Special cases for proposed id:
2617 2618   * 0: not allowed, 0 is id for control device.
2618 2619   * -1: allocate first usable id from lofi_id.
2619 2620   * any other value is proposed value from userland
2620 2621   *
2621 2622   * returns DDI_SUCCESS or errno.
2622 2623   */
2623 2624  static int
2624 2625  lofi_alloc_id(int *idp)
2625 2626  {
2626 2627          int id, error = DDI_SUCCESS;
2627 2628  
2628 2629          if (*idp == -1) {
2629 2630                  id = id_allocff_nosleep(lofi_id);
2630 2631                  if (id == -1) {
2631 2632                          error = EAGAIN;
2632 2633                          goto err;
2633 2634                  }
2634 2635          } else if (*idp == 0) {
2635 2636                  error = EINVAL;
2636 2637                  goto err;
2637 2638          } else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
2638 2639                  error = ERANGE;
2639 2640                  goto err;
2640 2641          } else {
2641 2642                  if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
2642 2643                          error = EEXIST;
2643 2644                          goto err;
2644 2645                  }
2645 2646  
2646 2647                  id = id_alloc_specific_nosleep(lofi_id, *idp);
2647 2648                  if (id == -1) {
2648 2649                          error = EAGAIN;
2649 2650                          goto err;
2650 2651                  }
2651 2652          }
2652 2653          *idp = id;
2653 2654  err:
2654 2655          return (error);
2655 2656  }
2656 2657  
/*
 * Create the devinfo node for a new lofi mapping: allocate (or claim)
 * an id, add a child node under the control device's parent, attach
 * its properties and bring it online.  Returns 0 or an errno/ddi code.
 */
static int
lofi_create_dev(struct lofi_ioctl *klip)
{
	dev_info_t *parent, *child;
	struct lofi_state *lsp = NULL;
	char namebuf[MAXNAMELEN];
	int error, circ;

	/* get control device */
	lsp = ddi_get_soft_state(lofi_statep, 0);
	parent = ddi_get_parent(lsp->ls_dip);

	/* allocate or validate the requested instance id */
	if ((error = lofi_alloc_id((int *)&klip->li_id)))
		return (error);

	(void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
	    klip->li_id);

	/* check whether a node by this name already exists */
	ndi_devi_enter(parent, &circ);
	child = ndi_devi_findchild(parent, namebuf);
	ndi_devi_exit(parent, circ);

	if (child == NULL) {
		child = ddi_add_child(parent, LOFI_DRIVER_NAME,
		    (pnode_t)DEVI_SID_NODEID, klip->li_id);
		if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
		    "instance", klip->li_id)) != DDI_PROP_SUCCESS)
			goto err;

		/* "labeled" marks the node for cmlb-managed labels */
		if (klip->li_labeled == B_TRUE) {
			if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
			    DDI_PROP_CANSLEEP, "labeled", 0, 0))
			    != DDI_PROP_SUCCESS)
				goto err;
		}

		if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
		    != NDI_SUCCESS)
			goto err;
	} else {
		/* node already present: release the id and report EEXIST */
		id_free(lofi_id, klip->li_id);
		error = EEXIST;
		return (error);
	}

	goto done;

err:
	/* unwind: strip properties, remove the node, release the id */
	ddi_prop_remove_all(child);
	(void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
	id_free(lofi_id, klip->li_id);
done:

	return (error);
}
2712 2713  
2713 2714  static void
2714 2715  lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
2715 2716  {
2716 2717          char *p = NULL;
2717 2718  
2718 2719          (void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));
2719 2720  
2720 2721          mutex_enter(&lsp->ls_vp_lock);
2721 2722          if (lsp->ls_vp != NULL)
2722 2723                  p = strrchr(lsp->ls_vp->v_path, '/');
2723 2724          if (p != NULL)
2724 2725                  (void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
2725 2726          mutex_exit(&lsp->ls_vp_lock);
2726 2727          (void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
2727 2728  }
2728 2729  
2729 2730  /*
2730 2731   * copy devlink name from event cache
2731 2732   */
static void
lofi_copy_devpath(struct lofi_ioctl *klip)
{
	int	error;
	char	namebuf[MAXNAMELEN], *str;
	clock_t ticks;
	nvlist_t *nvl = NULL;

	if (klip->li_labeled == B_TRUE)
		klip->li_devpath[0] = '\0';
	else {
		/* no need to wait for messages */
		(void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
		    "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
		return;
	}

	/*
	 * Labeled device: the devlink name is published asynchronously,
	 * so wait (up to LOFI_TIMEOUT seconds) for it to appear in the
	 * devlink event cache, keyed by instance id.
	 */
	(void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);
	ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);

	mutex_enter(&lofi_devlink_cache.ln_lock);
	error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, namebuf, &nvl);
	while (error != 0) {
		/* cv_timedwait() returns -1 once the deadline passes */
		error = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (error == -1)
			break;
		error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    namebuf, &nvl);
	}

	/* on timeout nvl stays NULL and li_devpath is left empty */
	if (nvl != NULL) {
		if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) {
			(void) strlcpy(klip->li_devpath, str,
			    sizeof (klip->li_devpath));
		}
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);
}
2771 2772  
2772 2773  /*
2773 2774   * map a file to a minor number. Return the minor number.
2774 2775   */
           /*
            * NOTE(review): validates and opens the backing file, creates the
            * device node via lofi_create_dev(), and fills in the lofi_state.
            * Two distinct error-cleanup regimes apply: before the soft state
            * is attached (lsp == NULL) the vnode is released by hand; after
            * that, lofi_destroy() owns all teardown.
            */
2775 2776  static int
2776 2777  lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
2777 2778      int *rvalp, struct cred *credp, int ioctl_flag)
2778 2779  {
2779 2780          int     id = -1;
2780 2781          struct lofi_state *lsp = NULL;
2781 2782          struct lofi_ioctl *klip;
2782 2783          int     error;
2783 2784          struct vnode *vp = NULL;
2784 2785          vattr_t vattr;
2785 2786          int     flag;
2786 2787          char    namebuf[MAXNAMELEN];
2787 2788  
2788 2789          error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
2789 2790          if (error != 0)
2790 2791                  return (error);
2791 2792  
2792 2793          mutex_enter(&lofi_lock);
2793 2794  
           /* Refuse to map the same file twice. */
2794 2795          if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
2795 2796              NULL) == 0) {
2796 2797                  error = EBUSY;
2797 2798                  goto err;
2798 2799          }
2799 2800  
2800 2801          flag = FREAD | FWRITE | FOFFMAX | FEXCL;
2801 2802          error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
2802 2803          if (error) {
2803 2804                  /* try read-only */
2804 2805                  flag &= ~FWRITE;
2805 2806                  error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
2806 2807                      &vp, 0, 0);
2807 2808                  if (error)
2808 2809                          goto err;
2809 2810          }
2810 2811  
2811 2812          if (!V_ISLOFIABLE(vp->v_type)) {
2812 2813                  error = EINVAL;
2813 2814                  goto err;
2814 2815          }
2815 2816  
2816 2817          vattr.va_mask = AT_SIZE;
2817 2818          error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
2818 2819          if (error)
2819 2820                  goto err;
2820 2821  
2821 2822          /* the file needs to be a multiple of the block size */
2822 2823          if ((vattr.va_size % DEV_BSIZE) != 0) {
2823 2824                  error = EINVAL;
2824 2825                  goto err;
2825 2826          }
2826 2827  
           /* li_id == (uint32_t)-1 asks lofi_create_dev() to pick the minor. */
2827 2828          if (pickminor) {
2828 2829                  klip->li_id = (uint32_t)-1;
2829 2830          }
2830 2831          if ((error = lofi_create_dev(klip)) != 0)
2831 2832                  goto err;
2832 2833  
2833 2834          id = klip->li_id;
2834 2835          lsp = ddi_get_soft_state(lofi_statep, id);
2835 2836          if (lsp == NULL)
2836 2837                  goto err;
2837 2838  
2838 2839          /*
2839 2840           * from this point lofi_destroy() is used to clean up on error
2840 2841           * make sure the basic data is set
2841 2842           */
2842 2843          list_insert_tail(&lofi_list, lsp);
2843 2844          lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id));
2844 2845  
2845 2846          list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
2846 2847              offsetof(struct lofi_comp_cache, lc_list));
2847 2848  
2848 2849          /*
2849 2850           * save open mode so file can be closed properly and vnode counts
2850 2851           * updated correctly.
2851 2852           */
2852 2853          lsp->ls_openflag = flag;
2853 2854  
2854 2855          lsp->ls_vp = vp;
2855 2856          lsp->ls_stacked_vp = vp;
2856 2857  
2857 2858          lsp->ls_vp_size = vattr.va_size;
2858 2859          lsp->ls_vp_comp_size = lsp->ls_vp_size;
2859 2860  
2860 2861          /*
2861 2862           * Try to handle stacked lofs vnodes.
2862 2863           */
2863 2864          if (vp->v_type == VREG) {
2864 2865                  vnode_t *realvp;
2865 2866  
2866 2867                  if (VOP_REALVP(vp, &realvp, NULL) == 0) {
2867 2868                          /*
2868 2869                           * We need to use the realvp for uniqueness
2869 2870                           * checking, but keep the stacked vp for
2870 2871                           * LOFI_GET_FILENAME display.
2871 2872                           */
2872 2873                          VN_HOLD(realvp);
2873 2874                          lsp->ls_vp = realvp;
2874 2875                  }
2875 2876          }
2876 2877  
2877 2878          lsp->ls_lbshift = highbit(DEV_BSIZE) - 1;
2878 2879          lsp->ls_pbshift = lsp->ls_lbshift;
2879 2880  
2880 2881          lsp->ls_readonly = klip->li_readonly;
2881 2882          lsp->ls_uncomp_seg_sz = 0;
2882 2883          lsp->ls_comp_algorithm[0] = '\0';
2883 2884          lsp->ls_crypto_offset = 0;
2884 2885  
2885 2886          (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
2886 2887              LOFI_DRIVER_NAME, id);
2887 2888          lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads,
2888 2889              minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0);
2889 2890  
2890 2891          if ((error = lofi_init_crypto(lsp, klip)) != 0)
2891 2892                  goto err;
2892 2893  
2893 2894          if ((error = lofi_init_compress(lsp)) != 0)
2894 2895                  goto err;
2895 2896  
2896 2897          fake_disk_geometry(lsp);
2897 2898  
2898 2899          /* For unlabeled lofi add Nblocks and Size */
2899 2900          if (klip->li_labeled == B_FALSE) {
2900 2901                  error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
2901 2902                      SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset);
2902 2903                  if (error != DDI_PROP_SUCCESS) {
2903 2904                          error = EINVAL;
2904 2905                          goto err;
2905 2906                  }
2906 2907                  error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
2907 2908                      NBLOCKS_PROP_NAME,
2908 2909                      (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE);
2909 2910                  if (error != DDI_PROP_SUCCESS) {
2910 2911                          error = EINVAL;
2911 2912                          goto err;
2912 2913                  }
2913 2914          }
2914 2915  
2915 2916          /*
2916 2917           * Notify we are ready to rock.
2917 2918           */
2918 2919          mutex_enter(&lsp->ls_vp_lock);
2919 2920          lsp->ls_vp_ready = B_TRUE;
2920 2921          cv_broadcast(&lsp->ls_vp_cv);
2921 2922          mutex_exit(&lsp->ls_vp_lock);
2922 2923          mutex_exit(&lofi_lock);
2923 2924  
2924 2925          lofi_copy_devpath(klip);
2925 2926  
2926 2927          if (rvalp)
  
    | 
      ↓ open down ↓ | 
    2360 lines elided | 
    
      ↑ open up ↑ | 
  
2927 2928                  *rvalp = id;
2928 2929          (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2929 2930          free_lofi_ioctl(klip);
2930 2931          return (0);
2931 2932  
2932 2933  err:
2933 2934          if (lsp != NULL) {
2934 2935                  lofi_destroy(lsp, credp);
2935 2936          } else {
2936 2937                  if (vp != NULL) {
                           /*
                            * Error before the soft state existed: release the
                            * vnode ourselves.  B_FREE (not B_INVAL) is used so
                            * cached pages are freed normally rather than being
                            * forcibly destroyed (the illumos 10262 fix under
                            * review here) — presumably B_INVAL caused excessive
                            * page destruction; confirm against the bug report.
                            */
2937      -                        (void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL);
     2938 +                        (void) VOP_PUTPAGE(vp, 0, 0, B_FREE, credp, NULL);
2938 2939                          (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
2939 2940                          VN_RELE(vp);
2940 2941                  }
2941 2942          }
2942 2943  
2943 2944          mutex_exit(&lofi_lock);
2944 2945          free_lofi_ioctl(klip);
2945 2946          return (error);
2946 2947  }
2947 2948  
2948 2949  /*
2949 2950   * unmap a file.
2950 2951   */
           /*
            * NOTE(review): byfilename selects lookup by path vs. by minor id;
            * id 0 is the lofi control device and can never be unmapped.  The
            * long comment below documents the force/cleanup semantics.
            */
2951 2952  static int
2952 2953  lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename,
2953 2954      struct cred *credp, int ioctl_flag)
2954 2955  {
2955 2956          struct lofi_state *lsp;
2956 2957          struct lofi_ioctl *klip;
2957 2958          char namebuf[MAXNAMELEN];
2958 2959          int err;
2959 2960  
2960 2961          err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
2961 2962          if (err != 0)
2962 2963                  return (err);
2963 2964  
2964 2965          mutex_enter(&lofi_lock);
2965 2966          if (byfilename) {
2966 2967                  if ((err = file_to_lofi(klip->li_filename, klip->li_readonly,
2967 2968                      &lsp)) != 0) {
2968 2969                          goto done;
2969 2970                  }
2970 2971          } else if (klip->li_id == 0) {
2971 2972                  err = ENXIO;
2972 2973                  goto done;
2973 2974          } else {
2974 2975                  lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
2975 2976          }
2976 2977  
           /* ls_vp == NULL means the device was already force-unmapped. */
2977 2978          if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) {
2978 2979                  err = ENXIO;
2979 2980                  goto done;
2980 2981          }
2981 2982  
2982 2983          klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
2983 2984          (void) snprintf(namebuf, sizeof (namebuf), "%u", klip->li_id);
2984 2985  
2985 2986          /*
2986 2987           * If it's still held open, we'll do one of three things:
2987 2988           *
2988 2989           * If no flag is set, just return EBUSY.
2989 2990           *
2990 2991           * If the 'cleanup' flag is set, unmap and remove the device when
2991 2992           * the last user finishes.
2992 2993           *
2993 2994           * If the 'force' flag is set, then we forcibly close the underlying
2994 2995           * file.  Subsequent operations will fail, and the DKIOCSTATE ioctl
2995 2996           * will return DKIO_DEV_GONE.  When the device is last closed, the
2996 2997           * device will be cleaned up appropriately.
2997 2998           *
2998 2999           * This is complicated by the fact that we may have outstanding
2999 3000           * dispatched I/Os.  Rather than having a single mutex to serialize all
3000 3001           * I/O, we keep a count of the number of outstanding I/O requests
3001 3002           * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
3002 3003           * should be dispatched (ls_vp_closereq).
3003 3004           *
3004 3005           * We set the flag, wait for the number of outstanding I/Os to reach 0,
3005 3006           * and then close the underlying vnode.
3006 3007           */
3007 3008          if (is_opened(lsp)) {
3008 3009                  if (klip->li_force) {
3009 3010                          /* Mark the device for cleanup. */
3010 3011                          lofi_set_cleanup(lsp);
3011 3012                          mutex_enter(&lsp->ls_vp_lock);
3012 3013                          lsp->ls_vp_closereq = B_TRUE;
3013 3014                          /* Wake up any threads waiting on dkiocstate. */
3014 3015                          cv_broadcast(&lsp->ls_vp_cv);
3015 3016                          while (lsp->ls_vp_iocount > 0)
3016 3017                                  cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
3017 3018                          mutex_exit(&lsp->ls_vp_lock);
3018 3019                  } else if (klip->li_cleanup) {
3019 3020                          lofi_set_cleanup(lsp);
3020 3021                  } else {
3021 3022                          err = EBUSY;
3022 3023                  }
3023 3024          } else {
                   /* Nobody has it open: tear it down right now. */
3024 3025                  lofi_free_dev(lsp);
3025 3026                  lofi_destroy(lsp, credp);
3026 3027          }
3027 3028  
3028 3029          /* Remove name from devlink cache */
3029 3030          mutex_enter(&lofi_devlink_cache.ln_lock);
3030 3031          (void) nvlist_remove_all(lofi_devlink_cache.ln_data, namebuf);
3031 3032          mutex_exit(&lofi_devlink_cache.ln_lock);
3032 3033  done:
3033 3034          mutex_exit(&lofi_lock);
3034 3035          if (err == 0)
3035 3036                  (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3036 3037          free_lofi_ioctl(klip);
3037 3038          return (err);
3038 3039  }
3039 3040  
3040 3041  /*
3041 3042   * get the filename given the minor number, or the minor number given
3042 3043   * the name.
3043 3044   */
           /*
            * NOTE(review): 'which' selects LOFI_GET_FILENAME (id -> file),
            * LOFI_GET_MINOR (file -> id) or LOFI_CHECK_COMPRESSED.  Each case
            * copies the result back to userland and frees klip on every path.
            */
3044 3045  /*ARGSUSED*/
3045 3046  static int
3046 3047  lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
3047 3048      struct cred *credp, int ioctl_flag)
3048 3049  {
3049 3050          struct lofi_ioctl *klip;
3050 3051          struct lofi_state *lsp;
3051 3052          int     error;
3052 3053  
3053 3054          error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
3054 3055          if (error != 0)
3055 3056                  return (error);
3056 3057  
3057 3058          switch (which) {
3058 3059          case LOFI_GET_FILENAME:
3059 3060                  if (klip->li_id == 0) {
3060 3061                          free_lofi_ioctl(klip);
3061 3062                          return (EINVAL);
3062 3063                  }
3063 3064  
3064 3065                  mutex_enter(&lofi_lock);
3065 3066                  lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
3066 3067                  if (lsp == NULL || lofi_access(lsp) != 0) {
3067 3068                          mutex_exit(&lofi_lock);
3068 3069                          free_lofi_ioctl(klip);
3069 3070                          return (ENXIO);
3070 3071                  }
3071 3072  
3072 3073                  /*
3073 3074                   * This may fail if, for example, we're trying to look
3074 3075                   * up a zoned NFS path from the global zone.
3075 3076                   */
3076 3077                  if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename,
3077 3078                      sizeof (klip->li_filename), CRED()) != 0) {
3078 3079                          (void) strlcpy(klip->li_filename, "?",
3079 3080                              sizeof (klip->li_filename));
3080 3081                  }
3081 3082  
3082 3083                  klip->li_readonly = lsp->ls_readonly;
3083 3084                  klip->li_labeled = lsp->ls_cmlbhandle != NULL;
3084 3085  
3085 3086                  (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
3086 3087                      sizeof (klip->li_algorithm));
3087 3088                  klip->li_crypto_enabled = lsp->ls_crypto_enabled;
3088 3089                  mutex_exit(&lofi_lock);
3089 3090  
                   /* Fill in li_devpath after dropping lofi_lock (may block). */
3090 3091                  lofi_copy_devpath(klip);
3091 3092                  error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3092 3093                  free_lofi_ioctl(klip);
3093 3094                  return (error);
3094 3095          case LOFI_GET_MINOR:
3095 3096                  mutex_enter(&lofi_lock);
3096 3097                  error = file_to_lofi(klip->li_filename,
3097 3098                      klip->li_readonly, &lsp);
3098 3099                  if (error != 0) {
3099 3100                          mutex_exit(&lofi_lock);
3100 3101                          free_lofi_ioctl(klip);
3101 3102                          return (error);
3102 3103                  }
3103 3104                  klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
3104 3105  
3105 3106                  klip->li_readonly = lsp->ls_readonly;
3106 3107                  klip->li_labeled = lsp->ls_cmlbhandle != NULL;
3107 3108                  mutex_exit(&lofi_lock);
3108 3109  
3109 3110                  lofi_copy_devpath(klip);
3110 3111                  error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3111 3112  
3112 3113                  free_lofi_ioctl(klip);
3113 3114                  return (error);
3114 3115          case LOFI_CHECK_COMPRESSED:
3115 3116                  mutex_enter(&lofi_lock);
3116 3117                  error = file_to_lofi(klip->li_filename,
3117 3118                      klip->li_readonly, &lsp);
3118 3119                  if (error != 0) {
3119 3120                          mutex_exit(&lofi_lock);
3120 3121                          free_lofi_ioctl(klip);
3121 3122                          return (error);
3122 3123                  }
3123 3124  
3124 3125                  klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
3125 3126                  (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
3126 3127                      sizeof (klip->li_algorithm));
3127 3128  
3128 3129                  mutex_exit(&lofi_lock);
3129 3130                  error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
3130 3131                  free_lofi_ioctl(klip);
3131 3132                  return (error);
3132 3133          default:
3133 3134                  free_lofi_ioctl(klip);
3134 3135                  return (EINVAL);
3135 3136          }
3136 3137  }
3137 3138  
3138 3139  static int
3139 3140  uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb,
3140 3141      struct uscsi_cmd *uscmd)
3141 3142  {
3142 3143          int rval;
3143 3144  
3144 3145  #ifdef  _MULTI_DATAMODEL
3145 3146          switch (ddi_model_convert_from(flag & FMODELS)) {
3146 3147          case DDI_MODEL_ILP32: {
3147 3148                  struct uscsi_cmd32 ucmd32;
3148 3149  
3149 3150                  if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) {
3150 3151                          rval = EFAULT;
3151 3152                          goto err;
3152 3153                  }
3153 3154                  uscsi_cmd32touscsi_cmd((&ucmd32), uscmd);
3154 3155                  break;
3155 3156          }
3156 3157          case DDI_MODEL_NONE:
3157 3158                  if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
3158 3159                          rval = EFAULT;
3159 3160                          goto err;
3160 3161                  }
3161 3162                  break;
3162 3163          default:
3163 3164                  rval = EFAULT;
3164 3165                  goto err;
3165 3166          }
3166 3167  #else
3167 3168          if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
3168 3169                  rval = EFAULT;
3169 3170                  goto err;
3170 3171          }
3171 3172  #endif  /* _MULTI_DATAMODEL */
3172 3173          if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) {
3173 3174                  rval = EFAULT;
3174 3175                  goto err;
3175 3176          }
3176 3177          if (cdb->scc_cmd == SCMD_INQUIRY) {
3177 3178                  return (0);
3178 3179          }
3179 3180  err:
3180 3181          return (rval);
3181 3182  }
3182 3183  
           /*
            * Ioctl entry point.  Minor 0 is the lofi control device and takes
            * the LOFI_* administrative ioctls; all other minors are mapped
            * devices and take the disk-emulation (DKIOC*/USCSICMD) ioctls.
            */
3183 3184  static int
3184 3185  lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
3185 3186      int *rvalp)
3186 3187  {
3187 3188          int     error;
3188 3189          enum dkio_state dkstate;
3189 3190          struct lofi_state *lsp;
3190 3191          int     id;
3191 3192  
3192 3193          id = LOFI_MINOR2ID(getminor(dev));
3193 3194  
3194 3195          /* lofi ioctls only apply to the master device */
3195 3196          if (id == 0) {
3196 3197                  struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;
3197 3198  
3198 3199                  /*
3199 3200                   * the query command only need read-access - i.e., normal
3200 3201                   * users are allowed to do those on the ctl device as
3201 3202                   * long as they can open it read-only.
3202 3203                   */
3203 3204                  switch (cmd) {
3204 3205                  case LOFI_MAP_FILE:
3205 3206                          if ((flag & FWRITE) == 0)
3206 3207                                  return (EPERM);
3207 3208                          return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
3208 3209                  case LOFI_MAP_FILE_MINOR:
3209 3210                          if ((flag & FWRITE) == 0)
3210 3211                                  return (EPERM);
3211 3212                          return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
3212 3213                  case LOFI_UNMAP_FILE:
3213 3214                          if ((flag & FWRITE) == 0)
3214 3215                                  return (EPERM);
3215 3216                          return (lofi_unmap_file(lip, 1, credp, flag));
3216 3217                  case LOFI_UNMAP_FILE_MINOR:
3217 3218                          if ((flag & FWRITE) == 0)
3218 3219                                  return (EPERM);
3219 3220                          return (lofi_unmap_file(lip, 0, credp, flag));
3220 3221                  case LOFI_GET_FILENAME:
3221 3222                          return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
3222 3223                              credp, flag));
3223 3224                  case LOFI_GET_MINOR:
3224 3225                          return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
3225 3226                              credp, flag));
3226 3227  
3227 3228                  /*
3228 3229                   * This API made limited sense when this value was fixed
3229 3230                   * at LOFI_MAX_FILES.  However, its use to iterate
3230 3231                   * across all possible devices in lofiadm means we don't
3231 3232                   * want to return L_MAXMIN, but the highest
3232 3233                   * *allocated* id.
3233 3234                   */
3234 3235                  case LOFI_GET_MAXMINOR:
3235 3236                          id = 0;
3236 3237  
3237 3238                          mutex_enter(&lofi_lock);
3238 3239  
3239 3240                          for (lsp = list_head(&lofi_list); lsp != NULL;
3240 3241                              lsp = list_next(&lofi_list, lsp)) {
3241 3242                                  int i;
                                   /* Skip devices invisible to this zone. */
3242 3243                                  if (lofi_access(lsp) != 0)
3243 3244                                          continue;
3244 3245  
3245 3246                                  i = ddi_get_instance(lsp->ls_dip);
3246 3247                                  if (i > id)
3247 3248                                          id = i;
3248 3249                          }
3249 3250  
3250 3251                          mutex_exit(&lofi_lock);
3251 3252  
3252 3253                          error = ddi_copyout(&id, &lip->li_id,
3253 3254                              sizeof (id), flag);
3254 3255                          if (error)
3255 3256                                  return (EFAULT);
3256 3257                          return (0);
3257 3258  
3258 3259                  case LOFI_CHECK_COMPRESSED:
3259 3260                          return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
3260 3261                              credp, flag));
3261 3262                  default:
3262 3263                          return (EINVAL);
3263 3264                  }
3264 3265          }
3265 3266  
           /* Per-device ioctl: look up state; fail if being torn down. */
3266 3267          mutex_enter(&lofi_lock);
3267 3268          lsp = ddi_get_soft_state(lofi_statep, id);
3268 3269          if (lsp == NULL || lsp->ls_cleanup) {
3269 3270                  mutex_exit(&lofi_lock);
3270 3271                  return (ENXIO);
3271 3272          }
3272 3273          mutex_exit(&lofi_lock);
3273 3274  
           /* Labeled devices are delegated to cmlb first. */
3274 3275          if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
3275 3276              "labeled") == 1) {
3276 3277                  error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
3277 3278                      credp, rvalp, 0);
3278 3279                  if (error != ENOTTY)
3279 3280                          return (error);
3280 3281          }
3281 3282  
3282 3283          /*
3283 3284           * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
3284 3285           * EIO as if the device was no longer present.
3285 3286           */
3286 3287          if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
3287 3288                  return (EIO);
3288 3289  
3289 3290          /* these are for faking out utilities like newfs */
3290 3291          switch (cmd) {
3291 3292          case DKIOCGMEDIAINFO:
3292 3293          case DKIOCGMEDIAINFOEXT: {
3293 3294                  struct dk_minfo_ext media_info;
3294 3295                  int shift = lsp->ls_lbshift;
3295 3296                  int size;
3296 3297  
3297 3298                  if (cmd == DKIOCGMEDIAINFOEXT) {
3298 3299                          media_info.dki_pbsize = 1U << lsp->ls_pbshift;
3299 3300                          size = sizeof (struct dk_minfo_ext);
3300 3301                  } else {
3301 3302                          size = sizeof (struct dk_minfo);
3302 3303                  }
3303 3304  
3304 3305                  media_info.dki_media_type = DK_FIXED_DISK;
3305 3306                  media_info.dki_lbsize = 1U << shift;
3306 3307                  media_info.dki_capacity =
3307 3308                      (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;
3308 3309  
3309 3310                  if (ddi_copyout(&media_info, (void *)arg, size, flag))
3310 3311                          return (EFAULT);
3311 3312                  return (0);
3312 3313          }
3313 3314          case DKIOCREMOVABLE: {
3314 3315                  int i = 0;
3315 3316                  if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
3316 3317                          return (EFAULT);
3317 3318                  return (0);
3318 3319          }
3319 3320  
3320 3321          case DKIOCGVTOC: {
3321 3322                  struct vtoc vt;
3322 3323                  fake_disk_vtoc(lsp, &vt);
3323 3324  
3324 3325                  switch (ddi_model_convert_from(flag & FMODELS)) {
3325 3326                  case DDI_MODEL_ILP32: {
3326 3327                          struct vtoc32 vtoc32;
3327 3328  
3328 3329                          vtoctovtoc32(vt, vtoc32);
3329 3330                          if (ddi_copyout(&vtoc32, (void *)arg,
3330 3331                              sizeof (struct vtoc32), flag))
3331 3332                                  return (EFAULT);
3332 3333                          break;
3333 3334                          }
3334 3335  
3335 3336                  case DDI_MODEL_NONE:
3336 3337                          if (ddi_copyout(&vt, (void *)arg,
3337 3338                              sizeof (struct vtoc), flag))
3338 3339                                  return (EFAULT);
3339 3340                          break;
3340 3341                  }
3341 3342                  return (0);
3342 3343          }
3343 3344          case DKIOCINFO: {
3344 3345                  struct dk_cinfo ci;
3345 3346                  fake_disk_info(dev, &ci);
3346 3347                  if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
3347 3348                          return (EFAULT);
3348 3349                  return (0);
3349 3350          }
3350 3351          case DKIOCG_VIRTGEOM:
3351 3352          case DKIOCG_PHYGEOM:
3352 3353          case DKIOCGGEOM:
3353 3354                  error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
3354 3355                      sizeof (struct dk_geom), flag);
3355 3356                  if (error)
3356 3357                          return (EFAULT);
3357 3358                  return (0);
3358 3359          case DKIOCSTATE:
3359 3360                  /*
3360 3361                   * Normally, lofi devices are always in the INSERTED state.  If
3361 3362                   * a device is forcefully unmapped, then the device transitions
3362 3363                   * to the DKIO_DEV_GONE state.
3363 3364                   */
3364 3365                  if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
3365 3366                      flag) != 0)
3366 3367                          return (EFAULT);
3367 3368  
                   /* Block until the state differs from the caller's value. */
3368 3369                  mutex_enter(&lsp->ls_vp_lock);
3369 3370                  while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
3370 3371                      (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
3371 3372                      !lsp->ls_cleanup) {
3372 3373                          /*
3373 3374                           * By virtue of having the device open, we know that
3374 3375                           * 'lsp' will remain valid when we return.
3375 3376                           */
3376 3377                          if (!cv_wait_sig(&lsp->ls_vp_cv, &lsp->ls_vp_lock)) {
3377 3378                                  mutex_exit(&lsp->ls_vp_lock);
3378 3379                                  return (EINTR);
3379 3380                          }
3380 3381                  }
3381 3382  
3382 3383                  dkstate = (!lsp->ls_cleanup && lsp->ls_vp != NULL ?
3383 3384                      DKIO_INSERTED : DKIO_DEV_GONE);
3384 3385                  mutex_exit(&lsp->ls_vp_lock);
3385 3386  
3386 3387                  if (ddi_copyout(&dkstate, (void *)arg,
3387 3388                      sizeof (dkstate), flag) != 0)
3388 3389                          return (EFAULT);
3389 3390                  return (0);
3390 3391          case USCSICMD: {
3391 3392                  struct uscsi_cmd uscmd;
3392 3393                  union scsi_cdb cdb;
3393 3394  
                   /* Minimal USCSI emulation: INQUIRY and READ CAPACITY only. */
3394 3395                  if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
3395 3396                          struct scsi_inquiry inq = {0};
3396 3397  
3397 3398                          lofi_create_inquiry(lsp, &inq);
3398 3399                          if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
3399 3400                              uscmd.uscsi_buflen, flag) != 0)
3400 3401                                  return (EFAULT);
3401 3402                          return (0);
3402 3403                  } else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
3403 3404                          struct scsi_capacity capacity;
3404 3405  
3405 3406                          capacity.capacity =
3406 3407                              BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
3407 3408                              lsp->ls_lbshift);
3408 3409                          capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
3409 3410                          if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
3410 3411                              uscmd.uscsi_buflen, flag) != 0)
3411 3412                                  return (EFAULT);
3412 3413                          return (0);
3413 3414                  }
3414 3415  
                   /* Unsupported command: report check condition via rqstatus. */
3415 3416                  uscmd.uscsi_rqstatus = 0xff;
3416 3417  #ifdef  _MULTI_DATAMODEL
3417 3418                  switch (ddi_model_convert_from(flag & FMODELS)) {
3418 3419                  case DDI_MODEL_ILP32: {
3419 3420                          struct uscsi_cmd32 ucmd32;
3420 3421                          uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
3421 3422                          if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
3422 3423                              flag) != 0)
3423 3424                                  return (EFAULT);
3424 3425                          break;
3425 3426                  }
3426 3427                  case DDI_MODEL_NONE:
3427 3428                          if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
3428 3429                              flag) != 0)
3429 3430                                  return (EFAULT);
3430 3431                          break;
3431 3432                  default:
3432 3433                          return (EFAULT);
3433 3434                  }
3434 3435  #else
3435 3436                  if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
3436 3437                          return (EFAULT);
3437 3438  #endif  /* _MULTI_DATAMODEL */
3438 3439                  return (0);
3439 3440          }
3440 3441          default:
3441 3442  #ifdef DEBUG
3442 3443                  cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
3443 3444  #endif  /* DEBUG */
3444 3445                  return (ENOTTY);
3445 3446          }
3446 3447  }
3447 3448  
           /*
            * Property entry point: with no soft state, fall straight through
            * to the generic ddi_prop_op().  Otherwise give cmlb first crack
            * (labeled-device partition properties) and fall back to the
            * generic handler if cmlb does not satisfy the request.
            */
3448 3449  static int
3449 3450  lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
3450 3451      char *name, caddr_t valuep, int *lengthp)
3451 3452  {
3452 3453          struct lofi_state *lsp;
3453 3454          int rc;
3454 3455  
3455 3456          lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
3456 3457          if (lsp == NULL) {
3457 3458                  return (ddi_prop_op(dev, dip, prop_op, mod_flags,
3458 3459                      name, valuep, lengthp));
3459 3460          }
3460 3461  
3461 3462          rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
3462 3463              name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
3463 3464          if (rc == DDI_PROP_SUCCESS)
3464 3465                  return (rc);
3465 3466  
3466 3467          return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
3467 3468              name, valuep, lengthp));
3468 3469  }
3469 3470  
3470 3471  static struct cb_ops lofi_cb_ops = {
3471 3472          lofi_open,              /* open */
3472 3473          lofi_close,             /* close */
3473 3474          lofi_strategy,          /* strategy */
3474 3475          nodev,                  /* print */
3475 3476          nodev,                  /* dump */
3476 3477          lofi_read,              /* read */
3477 3478          lofi_write,             /* write */
3478 3479          lofi_ioctl,             /* ioctl */
3479 3480          nodev,                  /* devmap */
3480 3481          nodev,                  /* mmap */
3481 3482          nodev,                  /* segmap */
3482 3483          nochpoll,               /* poll */
3483 3484          lofi_prop_op,           /* prop_op */
3484 3485          0,                      /* streamtab  */
3485 3486          D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */
3486 3487          CB_REV,
3487 3488          lofi_aread,
3488 3489          lofi_awrite
3489 3490  };
3490 3491  
3491 3492  static struct dev_ops lofi_ops = {
3492 3493          DEVO_REV,               /* devo_rev, */
3493 3494          0,                      /* refcnt  */
3494 3495          lofi_info,              /* info */
3495 3496          nulldev,                /* identify */
3496 3497          nulldev,                /* probe */
3497 3498          lofi_attach,            /* attach */
3498 3499          lofi_detach,            /* detach */
3499 3500          nodev,                  /* reset */
3500 3501          &lofi_cb_ops,           /* driver operations */
3501 3502          NULL,                   /* no bus operations */
3502 3503          NULL,                   /* power */
3503 3504          ddi_quiesce_not_needed, /* quiesce */
3504 3505  };
3505 3506  
3506 3507  static struct modldrv modldrv = {
3507 3508          &mod_driverops,
3508 3509          "loopback file driver",
3509 3510          &lofi_ops,
3510 3511  };
3511 3512  
3512 3513  static struct modlinkage modlinkage = {
3513 3514          MODREV_1,
3514 3515          &modldrv,
3515 3516          NULL
3516 3517  };
3517 3518  
3518 3519  int
3519 3520  _init(void)
3520 3521  {
3521 3522          int error;
3522 3523  
3523 3524          list_create(&lofi_list, sizeof (struct lofi_state),
3524 3525              offsetof(struct lofi_state, ls_list));
3525 3526  
3526 3527          error = ddi_soft_state_init((void **)&lofi_statep,
3527 3528              sizeof (struct lofi_state), 0);
3528 3529          if (error) {
3529 3530                  list_destroy(&lofi_list);
3530 3531                  return (error);
3531 3532          }
3532 3533  
3533 3534          /*
3534 3535           * The minor number is stored as id << LOFI_CMLB_SHIFT as
3535 3536           * we need to reserve space for cmlb minor numbers.
3536 3537           * This will leave out 4096 id values on 32bit kernel, which should
3537 3538           * still suffice.
3538 3539           */
3539 3540          lofi_id = id_space_create("lofi_id", 1,
3540 3541              (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)));
3541 3542  
3542 3543          if (lofi_id == NULL) {
3543 3544                  ddi_soft_state_fini((void **)&lofi_statep);
3544 3545                  list_destroy(&lofi_list);
3545 3546                  return (DDI_FAILURE);
3546 3547          }
3547 3548  
3548 3549          mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);
3549 3550  
3550 3551          error = mod_install(&modlinkage);
3551 3552  
3552 3553          if (error) {
3553 3554                  id_space_destroy(lofi_id);
3554 3555                  mutex_destroy(&lofi_lock);
3555 3556                  ddi_soft_state_fini((void **)&lofi_statep);
3556 3557                  list_destroy(&lofi_list);
3557 3558          }
3558 3559  
3559 3560          return (error);
3560 3561  }
3561 3562  
3562 3563  int
3563 3564  _fini(void)
3564 3565  {
3565 3566          int     error;
3566 3567  
3567 3568          mutex_enter(&lofi_lock);
3568 3569  
3569 3570          if (!list_is_empty(&lofi_list)) {
3570 3571                  mutex_exit(&lofi_lock);
3571 3572                  return (EBUSY);
3572 3573          }
3573 3574  
3574 3575          mutex_exit(&lofi_lock);
3575 3576  
3576 3577          error = mod_remove(&modlinkage);
3577 3578          if (error)
3578 3579                  return (error);
3579 3580  
3580 3581          mutex_destroy(&lofi_lock);
3581 3582          id_space_destroy(lofi_id);
3582 3583          ddi_soft_state_fini((void **)&lofi_statep);
3583 3584          list_destroy(&lofi_list);
3584 3585  
3585 3586          return (error);
3586 3587  }
3587 3588  
3588 3589  int
3589 3590  _info(struct modinfo *modinfop)
3590 3591  {
3591 3592          return (mod_info(&modlinkage, modinfop));
3592 3593  }
  
    | 
      ↓ open down ↓ | 
    645 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX