935 sv_lyr_open() misses one NULL-pointer check
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Gordon Ross <gwr@nexenta.com>
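
The fix: in sv_lyr_open(), when the underlying driver's open() routine
changes *devp, the old code unconditionally called rw_exit(&svp->sv_lock),
even though svp is set to NULL earlier in the function on the recursive-open
path. A minimal before/after sketch of the hunk that appears in context below:

    /* before: svp may be NULL here (recursive open set svp = NULL) */
    rw_exit(&svp->sv_lock);         /* possible NULL-pointer dereference */

    /* after: */
    if (svp != NULL)
            rw_exit(&svp->sv_lock);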
    
      
    
    
          --- old/usr/src/uts/common/avs/ns/sv/sv.c
          +++ new/usr/src/uts/common/avs/ns/sv/sv.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
       24 + *
       25 + * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  24   26   */
  25   27  
  26   28  /*
  27   29   * Storage Volume Character and Block Driver (SV)
  28   30   *
  29   31   * This driver implements a simplistic /dev/{r}dsk/ interface to a
  30   32   * specified disk volume that is otherwise managed by the Prism
  31   33   * software.  The SV driver layers itself onto the underlying disk
  32   34   * device driver by changing function pointers in the cb_ops
  33   35   * structure.
  34   36   *
  35   37   * CONFIGURATION:
  36   38   *
  37   39   * 1. Configure the driver using the svadm utility.
  38   40   * 2. Access the device as before through /dev/rdsk/c?t?d?s?
  39   41   *
  40   42   * LIMITATIONS:
  41   43   *
  42   44   * This driver should NOT be used to share a device between another
  43   45   * DataServices user interface module (e.g., STE) and a user accessing
  44   46   * the device through the block device in O_WRITE mode.  This is because
  45   47   * writes through the block device are asynchronous (due to the page
  46   48   * cache) and so consistency between the block device user and the
  47   49   * STE user cannot be guaranteed.
  48   50   *
  49   51   * Data is copied between system struct buf(9s) and nsc_vec_t.  This is
  50   52   * wasteful and slow.
  51   53   */
  52   54  
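A minimal sketch (with hypothetical names) of the cb_ops interposition
described above; the real swap, which also saves the original pointers in
the sv_maj_t, is performed in sv_enable() below:

    static int (*saved_strategy)(struct buf *);     /* hypothetical holder */

    static int
    my_lyr_strategy(struct buf *bp)
    {
            /* ... intercept or redirect the I/O here ... */
            return ((*saved_strategy)(bp));         /* pass through */
    }

    static void
    interpose(struct cb_ops *cb_ops)
    {
            saved_strategy = cb_ops->cb_strategy;   /* save the original */
            cb_ops->cb_strategy = my_lyr_strategy;  /* install the wrapper */
    }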
  53   55  #include <sys/debug.h>
  54   56  #include <sys/types.h>
  55   57  
  56   58  #include <sys/ksynch.h>
  57   59  #include <sys/kmem.h>
  58   60  #include <sys/errno.h>
  59   61  #include <sys/varargs.h>
  60   62  #include <sys/file.h>
  61   63  #include <sys/open.h>
  62   64  #include <sys/conf.h>
  63   65  #include <sys/cred.h>
  64   66  #include <sys/buf.h>
  65   67  #include <sys/uio.h>
  66   68  #ifndef DS_DDICT
  67   69  #include <sys/pathname.h>
  68   70  #endif
  69   71  #include <sys/aio_req.h>
  70   72  #include <sys/dkio.h>
  71   73  #include <sys/vtoc.h>
  72   74  #include <sys/cmn_err.h>
  73   75  #include <sys/modctl.h>
  74   76  #include <sys/ddi.h>
  75   77  #include <sys/sunddi.h>
  76   78  #include <sys/sunldi.h>
  77   79  #include <sys/nsctl/nsvers.h>
  78   80  
  79   81  #include <sys/nsc_thread.h>
  80   82  #include <sys/unistat/spcs_s.h>
  81   83  #include <sys/unistat/spcs_s_k.h>
  82   84  #include <sys/unistat/spcs_errors.h>
  83   85  
  84   86  #ifdef DS_DDICT
  85   87  #include "../contract.h"
  86   88  #endif
  87   89  
  88   90  #include "../nsctl.h"
  89   91  
  90   92  
  91   93  #include <sys/sdt.h>            /* dtrace is S10 or later */
  92   94  
  93   95  #include "sv.h"
  94   96  #include "sv_impl.h"
  95   97  #include "sv_efi.h"
  96   98  
  97   99  #define MAX_EINTR_COUNT 1000
  98  100  
  99  101  /*
 100  102   * sv_mod_status
 101  103   */
 102  104  #define SV_PREVENT_UNLOAD 1
 103  105  #define SV_ALLOW_UNLOAD 2
 104  106  
 105  107  static const int sv_major_rev = ISS_VERSION_MAJ;        /* Major number */
 106  108  static const int sv_minor_rev = ISS_VERSION_MIN;        /* Minor number */
 107  109  static const int sv_micro_rev = ISS_VERSION_MIC;        /* Micro number */
 108  110  static const int sv_baseline_rev = ISS_VERSION_NUM;     /* Baseline number */
 109  111  
 110  112  #ifdef DKIOCPARTITION
 111  113  /*
 112  114   * CRC32 polynomial table needed for computing the checksums
 113  115   * in an EFI vtoc.
 114  116   */
 115  117  static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
 116  118  #endif
 117  119  
 118  120  static clock_t sv_config_time;          /* Time of successful {en,dis}able */
 119  121  static int sv_debug;                    /* Set non-zero for debug to syslog */
 120  122  static int sv_mod_status;               /* Set to prevent modunload */
 121  123  
 122  124  static dev_info_t *sv_dip;              /* Single DIP for driver */
 123  125  static kmutex_t sv_mutex;               /* Protect global lists, etc. */
 124  126  
 125  127  static nsc_mem_t        *sv_mem;        /* nsctl memory allocator token */
 126  128  
 127  129  
 128  130  /*
 129  131   * Per device and per major state.
 130  132   */
 131  133  
 132  134  #ifndef _SunOS_5_6
 133  135  #define UNSAFE_ENTER()
 134  136  #define UNSAFE_EXIT()
 135  137  #else
 136  138  #define UNSAFE_ENTER()  mutex_enter(&unsafe_driver)
 137  139  #define UNSAFE_EXIT()   mutex_exit(&unsafe_driver)
 138  140  #endif
 139  141  
 140  142                                          /* hash table of major dev structures */
 141  143  static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
 142  144  static sv_dev_t *sv_devs;               /* array of per device structures */
 143  145  static int sv_max_devices;              /* SV version of nsc_max_devices() */
 144  146  static int sv_ndevices;                 /* number of SV enabled devices */
 145  147  
 146  148  /*
 147  149   * Threading.
 148  150   */
 149  151  
 150  152  int sv_threads_max = 1024;              /* maximum # to dynamically alloc */
 151  153  int sv_threads = 32;                    /* # to pre-allocate (see sv.conf) */
 152  154  int sv_threads_extra = 0;               /* addl # we would have alloc'ed */
 153  155  
 154  156  static nstset_t *sv_tset;               /* the threadset pointer */
 155  157  
 156  158  static int sv_threads_hysteresis = 4;   /* hysteresis for threadset resizing */
 157  159  static int sv_threads_dev = 2;          /* # of threads to alloc per device */
 158  160  static int sv_threads_inc = 8;          /* increment for changing the set */
 159  161  static int sv_threads_needed;           /* number of threads needed */
 160  162  static int sv_no_threads;               /* number of nsc_create errors */
 161  163  static int sv_max_nlive;                /* max number of threads running */
 162  164  
 163  165  
 164  166  
 165  167  /*
 166  168   * nsctl fd callbacks.
 167  169   */
 168  170  
 169  171  static int svattach_fd(blind_t);
 170  172  static int svdetach_fd(blind_t);
 171  173  
 172  174  static nsc_def_t sv_fd_def[] = {
 173  175          { "Attach",     (uintptr_t)svattach_fd, },
 174  176          { "Detach",     (uintptr_t)svdetach_fd, },
 175  177          { 0, 0, }
 176  178  };
 177  179  
 178  180  /*
 179  181   * cb_ops functions.
 180  182   */
 181  183  
 182  184  static int svopen(dev_t *, int, int, cred_t *);
 183  185  static int svclose(dev_t, int, int, cred_t *);
 184  186  static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 185  187  static int svprint(dev_t, char *);
 186  188  
 187  189  /*
 188  190   * These next functions are layered into the underlying driver's devops.
 189  191   */
 190  192  
 191  193  static int sv_lyr_open(dev_t *, int, int, cred_t *);
 192  194  static int sv_lyr_close(dev_t, int, int, cred_t *);
 193  195  static int sv_lyr_strategy(struct buf *);
 194  196  static int sv_lyr_read(dev_t, struct uio *, cred_t *);
 195  197  static int sv_lyr_write(dev_t, struct uio *, cred_t *);
 196  198  static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
 197  199  static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
 198  200  static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
 199  201  
 200  202  static struct cb_ops sv_cb_ops = {
 201  203          svopen,         /* open */
 202  204          svclose,        /* close */
 203  205          nulldev,        /* strategy */
 204  206          svprint,
 205  207          nodev,          /* dump */
 206  208          nodev,          /* read */
 207  209          nodev,          /* write */
 208  210          svioctl,
 209  211          nodev,          /* devmap */
 210  212          nodev,          /* mmap */
 211  213          nodev,          /* segmap */
 212  214          nochpoll,       /* poll */
 213  215          ddi_prop_op,
 214  216          NULL,           /* NOT a stream */
 215  217          D_NEW | D_MP | D_64BIT,
 216  218          CB_REV,
 217  219          nodev,          /* aread */
 218  220          nodev,          /* awrite */
 219  221  };
 220  222  
 221  223  
 222  224  /*
 223  225   * dev_ops functions.
 224  226   */
 225  227  
 226  228  static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 227  229  static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
 228  230  static int sv_detach(dev_info_t *, ddi_detach_cmd_t);
 229  231  
 230  232  static struct dev_ops sv_ops = {
 231  233          DEVO_REV,
 232  234          0,
 233  235          sv_getinfo,
 234  236          nulldev,        /* identify */
 235  237          nulldev,        /* probe */
 236  238          sv_attach,
 237  239          sv_detach,
 238  240          nodev,          /* reset */
 239  241          &sv_cb_ops,
 240  242          (struct bus_ops *)0
 241  243  };
 242  244  
 243  245  /*
 244  246   * Module linkage.
 245  247   */
 246  248  
 247  249  extern struct mod_ops mod_driverops;
 248  250  
 249  251  static struct modldrv modldrv = {
 250  252          &mod_driverops,
 251  253          "nws:Storage Volume:" ISS_VERSION_STR,
 252  254          &sv_ops
 253  255  };
 254  256  
 255  257  static struct modlinkage modlinkage = {
 256  258          MODREV_1,
 257  259          &modldrv,
 258  260          0
 259  261  };
 260  262  
 261  263  
 262  264  int
 263  265  _init(void)
 264  266  {
 265  267          int error;
 266  268  
 267  269          mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);
 268  270  
 269  271          if ((error = mod_install(&modlinkage)) != 0) {
 270  272                  mutex_destroy(&sv_mutex);
 271  273                  return (error);
 272  274          }
 273  275  
 274  276  #ifdef DEBUG
 275  277          cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
 276  278              sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
 277  279              ISS_VERSION_STR, BUILD_DATE_STR);
 278  280  #else
 279  281          if (sv_micro_rev) {
 280  282                  cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
 281  283                      sv_major_rev, sv_minor_rev, sv_micro_rev,
 282  284                      ISS_VERSION_STR, BUILD_DATE_STR);
 283  285          } else {
 284  286                  cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
 285  287                      sv_major_rev, sv_minor_rev,
 286  288                      ISS_VERSION_STR, BUILD_DATE_STR);
 287  289          }
 288  290  #endif
 289  291  
 290  292          return (error);
 291  293  }
 292  294  
 293  295  
 294  296  int
 295  297  _fini(void)
 296  298  {
 297  299          int error;
 298  300  
 299  301          if ((error = mod_remove(&modlinkage)) != 0)
 300  302                  return (error);
 301  303  
 302  304          mutex_destroy(&sv_mutex);
 303  305  
 304  306          return (error);
 305  307  }
 306  308  
 307  309  
 308  310  int
 309  311  _info(struct modinfo *modinfop)
 310  312  {
 311  313          return (mod_info(&modlinkage, modinfop));
 312  314  }
 313  315  
 314  316  
 315  317  /*
 316  318   * Locking & State.
 317  319   *
 318  320   * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 319  321   * threadset creation and sizing; sv_ndevices.
 320  322   *
 321  323   * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 322  324   * must be acquired first.
 323  325   *
 324  326   * sv_lock protects the sv_dev_t structure for an individual device.
 325  327   *
 326  328   * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 327  329   * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 328  330   * first.
 329  331   *
 330  332   * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 331  333   * I/O operations to a device simultaneously, as above.
 332  334   *
 333  335   * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 334  336   * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 335  337   * and (sv_pending == curthread) so that any recursion through
 336  338   * sv_lyr_open/sv_lyr_close can be detected.
 337  339   */
 338  340  
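A minimal sketch of the acquisition order required by the comment above;
sv_disable() below follows exactly this pattern:

    mutex_enter(&sv_mutex);                 /* global config lock first */
    rw_enter(&svp->sv_lock, RW_WRITER);     /* then the per-device lock */
    /* ... modify global and per-device state ... */
    rw_exit(&svp->sv_lock);                 /* release in reverse order */
    mutex_exit(&sv_mutex);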
 339  341  
 340  342  static int
 341  343  sv_init_devs(void)
 342  344  {
 343  345          int i;
 344  346  
 345  347          ASSERT(MUTEX_HELD(&sv_mutex));
 346  348  
 347  349          if (sv_max_devices > 0)
 348  350                  return (0);
 349  351  
 350  352          sv_max_devices = nsc_max_devices();
 351  353  
 352  354          if (sv_max_devices <= 0) {
 353  355                  /* nsctl is not attached (nskernd not running) */
 354  356                  if (sv_debug > 0)
 355  357                          cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
 356  358                  return (EAGAIN);
 357  359          }
 358  360  
 359  361          sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
 360  362              KM_NOSLEEP, sv_mem);
 361  363  
 362  364          if (sv_devs == NULL) {
 363  365                  cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
 364  366                  return (ENOMEM);
 365  367          }
 366  368  
 367  369          for (i = 0; i < sv_max_devices; i++) {
 368  370                  mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
 369  371                  rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
 370  372          }
 371  373  
 372  374          if (sv_debug > 0)
 373  375                  cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");
 374  376  
 375  377          return (0);
 376  378  }
 377  379  
 378  380  
 379  381  static int
 380  382  sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 381  383  {
 382  384          int rc;
 383  385  
 384  386          switch (cmd) {
 385  387  
 386  388          case DDI_ATTACH:
 387  389                  sv_dip = dip;
 388  390  
 389  391                  if (ddi_create_minor_node(dip, "sv", S_IFCHR,
 390  392                      0, DDI_PSEUDO, 0) != DDI_SUCCESS)
 391  393                          goto failed;
 392  394  
 393  395                  mutex_enter(&sv_mutex);
 394  396  
 395  397                  sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
 396  398                  if (sv_mem == NULL) {
 397  399                          mutex_exit(&sv_mutex);
 398  400                          goto failed;
 399  401                  }
 400  402  
 401  403                  rc = sv_init_devs();
 402  404                  if (rc != 0 && rc != EAGAIN) {
 403  405                          mutex_exit(&sv_mutex);
 404  406                          goto failed;
 405  407                  }
 406  408  
 407  409                  mutex_exit(&sv_mutex);
 408  410  
 409  411  
 410  412                  ddi_report_dev(dip);
 411  413  
 412  414                  sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
 413  415                      DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
 414  416                      "sv_threads", sv_threads);
 415  417  
 416  418                  if (sv_debug > 0)
 417  419                          cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);
 418  420  
 419  421                  if (sv_threads > sv_threads_max)
 420  422                          sv_threads_max = sv_threads;
 421  423  
 422  424                  return (DDI_SUCCESS);
 423  425  
 424  426          default:
 425  427                  return (DDI_FAILURE);
 426  428          }
 427  429  
 428  430  failed:
 429  431          DTRACE_PROBE(sv_attach_failed);
 430  432          (void) sv_detach(dip, DDI_DETACH);
 431  433          return (DDI_FAILURE);
 432  434  }
 433  435  
 434  436  
 435  437  static int
 436  438  sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 437  439  {
 438  440          sv_dev_t *svp;
 439  441          int i;
 440  442  
 441  443          switch (cmd) {
 442  444  
 443  445          case DDI_DETACH:
 444  446  
 445  447                  /*
 446  448                   * Check that everything is disabled.
 447  449                   */
 448  450  
 449  451                  mutex_enter(&sv_mutex);
 450  452  
 451  453                  if (sv_mod_status == SV_PREVENT_UNLOAD) {
 452  454                          mutex_exit(&sv_mutex);
 453  455                          DTRACE_PROBE(sv_detach_err_prevent);
 454  456                          return (DDI_FAILURE);
 455  457                  }
 456  458  
 457  459                  for (i = 0; sv_devs && i < sv_max_devices; i++) {
 458  460                          svp = &sv_devs[i];
 459  461  
 460  462                          if (svp->sv_state != SV_DISABLE) {
 461  463                                  mutex_exit(&sv_mutex);
 462  464                                  DTRACE_PROBE(sv_detach_err_busy);
 463  465                                  return (DDI_FAILURE);
 464  466                          }
 465  467                  }
 466  468  
 467  469  
 468  470                  for (i = 0; sv_devs && i < sv_max_devices; i++) {
 469  471                          mutex_destroy(&sv_devs[i].sv_olock);
 470  472                          rw_destroy(&sv_devs[i].sv_lock);
 471  473                  }
 472  474  
 473  475                  if (sv_devs) {
 474  476                          nsc_kmem_free(sv_devs,
 475  477                              (sv_max_devices * sizeof (*sv_devs)));
 476  478                          sv_devs = NULL;
 477  479                  }
 478  480                  sv_max_devices = 0;
 479  481  
 480  482                  if (sv_mem) {
 481  483                          nsc_unregister_mem(sv_mem);
 482  484                          sv_mem = NULL;
 483  485                  }
 484  486  
 485  487                  mutex_exit(&sv_mutex);
 486  488  
 487  489                  /*
 488  490                   * Remove all minor nodes.
 489  491                   */
 490  492  
 491  493                  ddi_remove_minor_node(dip, NULL);
 492  494                  sv_dip = NULL;
 493  495  
 494  496                  return (DDI_SUCCESS);
 495  497  
 496  498          default:
 497  499                  return (DDI_FAILURE);
 498  500          }
 499  501  }
 500  502  
 501  503  static sv_maj_t *
 502  504  sv_getmajor(const dev_t dev)
 503  505  {
 504  506          sv_maj_t **insert, *maj;
 505  507          major_t umaj = getmajor(dev);
 506  508  
 507  509          /*
 508  510           * See if the hash table entry, or one of the hash chains
 509  511           * is already allocated for this major number
 510  512           */
 511  513          if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
 512  514                  do {
 513  515                          if (maj->sm_major == umaj)
 514  516                                  return (maj);
 515  517                  } while ((maj = maj->sm_next) != 0);
 516  518          }
 517  519  
 518  520          /*
  519  521   * If the sv_mutex is held, there is a design flaw, as the only
  520  522   * non-mutex-held callers can be sv_enable() or sv_dev_to_sv().
  521  523   * Return an error instead of panicking the system.
 522  524           */
 523  525          if (MUTEX_HELD(&sv_mutex)) {
 524  526                  cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
 525  527                  return (NULL);
 526  528          }
 527  529  
 528  530          /*
 529  531           * Determine where to allocate a new element in the hash table
 530  532           */
 531  533          mutex_enter(&sv_mutex);
 532  534          insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
 533  535          for (maj = *insert; maj; maj = maj->sm_next) {
 534  536  
 535  537                  /* Did another thread beat us to it? */
 536  538                  if (maj->sm_major == umaj)
 537  539                          return (maj);
 538  540  
 539  541                  /* Find a NULL insert point? */
 540  542                  if (maj->sm_next == NULL)
 541  543                          insert = &maj->sm_next;
 542  544          }
 543  545  
 544  546          /*
 545  547           * Located the new insert point
 546  548           */
 547  549          *insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
 548  550          if ((maj = *insert) != 0)
 549  551                  maj->sm_major = umaj;
 550  552          else
 551  553                  cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
 552  554  
 553  555          mutex_exit(&sv_mutex);
 554  556  
 555  557          return (maj);
 556  558  }
 557  559  
 558  560  /* ARGSUSED */
 559  561  
 560  562  static int
 561  563  sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 562  564  {
 563  565          int rc = DDI_FAILURE;
 564  566  
 565  567          switch (infocmd) {
 566  568  
 567  569          case DDI_INFO_DEVT2DEVINFO:
 568  570                  *result = sv_dip;
 569  571                  rc = DDI_SUCCESS;
 570  572                  break;
 571  573  
 572  574          case DDI_INFO_DEVT2INSTANCE:
 573  575                  /*
 574  576                   * We only have a single instance.
 575  577                   */
 576  578                  *result = 0;
 577  579                  rc = DDI_SUCCESS;
 578  580                  break;
 579  581  
 580  582          default:
 581  583                  break;
 582  584          }
 583  585  
 584  586          return (rc);
 585  587  }
 586  588  
 587  589  
 588  590  /*
 589  591   * Hashing of devices onto major device structures.
 590  592   *
 591  593   * Individual device structures are hashed onto one of the sm_hash[]
 592  594   * buckets in the relevant major device structure.
 593  595   *
 594  596   * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 595  597   * searching does not require the mutex because of the sm_seq member.
 596  598   * sm_seq is incremented on each insertion (-after- hash chain pointer
 597  599   * manipulation) and each deletion (-before- hash chain pointer
 598  600   * manipulation).  When searching the hash chain, the seq number is
  599  601   * checked before accessing each device structure; if the seq number has
 600  602   * changed, then we restart the search from the top of the hash chain.
 601  603   * If we restart more than SV_HASH_RETRY times, we take sv_mutex and search
 602  604   * the hash chain (we are guaranteed that this search cannot be
 603  605   * interrupted).
 604  606   */
 605  607  
 606  608  #define SV_HASH_RETRY   16
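In outline, the lockless search described above is a sequence-counter
(seqlock-style) read with a mutex fallback; a condensed sketch of the loop
that sv_dev_to_sv() implements below:

    try = 0;
    retry:
            if (try > SV_HASH_RETRY)
                    mutex_enter(&sv_mutex);         /* fall back to the lock */
            seq = maj->sm_seq;
            for (svp = *hb; svp; svp = svp->sv_hash) {
                    if (maj->sm_seq != seq) {       /* a writer interfered */
                            try++;
                            goto retry;
                    }
                    if (svp->sv_dev == dev)
                            break;                  /* found */
            }
            if (try > SV_HASH_RETRY)
                    mutex_exit(&sv_mutex);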
 607  609  
 608  610  static sv_dev_t *
 609  611  sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
 610  612  {
 611  613          minor_t umin = getminor(dev);
 612  614          sv_dev_t **hb, *next, *svp;
 613  615          sv_maj_t *maj;
 614  616          int seq;
 615  617          int try;
 616  618  
 617  619          /* Get major hash table */
 618  620          maj = sv_getmajor(dev);
 619  621          if (majpp)
 620  622                  *majpp = maj;
 621  623          if (maj == NULL)
 622  624                  return (NULL);
 623  625  
 624  626          if (maj->sm_inuse == 0) {
 625  627                  DTRACE_PROBE1(
 626  628                      sv_dev_to_sv_end,
 627  629                      dev_t, dev);
 628  630                  return (NULL);
 629  631          }
 630  632  
 631  633          hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
 632  634          try = 0;
 633  635  
 634  636  retry:
 635  637          if (try > SV_HASH_RETRY)
 636  638                  mutex_enter(&sv_mutex);
 637  639  
 638  640          seq = maj->sm_seq;
 639  641          for (svp = *hb; svp; svp = next) {
 640  642                  next = svp->sv_hash;
 641  643  
 642  644                  nsc_membar_stld();      /* preserve register load order */
 643  645  
 644  646                  if (maj->sm_seq != seq) {
 645  647                          DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
 646  648                          try++;
 647  649                          goto retry;
 648  650                  }
 649  651  
 650  652                  if (svp->sv_dev == dev)
 651  653                          break;
 652  654          }
 653  655  
 654  656          if (try > SV_HASH_RETRY)
 655  657                  mutex_exit(&sv_mutex);
 656  658  
 657  659          return (svp);
 658  660  }
 659  661  
 660  662  
 661  663  /*
 662  664   * Must be called with sv_mutex held.
 663  665   */
 664  666  
 665  667  static int
 666  668  sv_get_state(const dev_t udev, sv_dev_t **svpp)
 667  669  {
 668  670          sv_dev_t **hb, **insert, *svp;
 669  671          sv_maj_t *maj;
 670  672          minor_t umin;
 671  673          int i;
 672  674  
 673  675          /* Get major hash table */
 674  676          if ((maj = sv_getmajor(udev)) == NULL)
 675  677                  return (NULL);
 676  678  
 677  679          /* Determine which minor hash table */
 678  680          umin = getminor(udev);
 679  681          hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
 680  682  
 681  683          /* look for clash */
 682  684  
 683  685          insert = hb;
 684  686  
 685  687          for (svp = *hb; svp; svp = svp->sv_hash) {
 686  688                  if (svp->sv_dev == udev)
 687  689                          break;
 688  690  
 689  691                  if (svp->sv_hash == NULL)
 690  692                          insert = &svp->sv_hash;
 691  693          }
 692  694  
 693  695          if (svp) {
 694  696                  DTRACE_PROBE1(
 695  697                      sv_get_state_enabled,
 696  698                      dev_t, udev);
 697  699                  return (SV_EENABLED);
 698  700          }
 699  701  
 700  702          /* look for spare sv_devs slot */
 701  703  
 702  704          for (i = 0; i < sv_max_devices; i++) {
 703  705                  svp = &sv_devs[i];
 704  706  
 705  707                  if (svp->sv_state == SV_DISABLE)
 706  708                          break;
 707  709          }
 708  710  
 709  711          if (i >= sv_max_devices) {
 710  712                  DTRACE_PROBE1(
 711  713                      sv_get_state_noslots,
 712  714                      dev_t, udev);
 713  715                  return (SV_ENOSLOTS);
 714  716          }
 715  717  
 716  718          svp->sv_state = SV_PENDING;
 717  719          svp->sv_pending = curthread;
 718  720  
 719  721          *insert = svp;
 720  722          svp->sv_hash = NULL;
 721  723          maj->sm_seq++;          /* must be after the store to the hash chain */
 722  724  
 723  725          *svpp = svp;
 724  726  
 725  727          /*
 726  728           * We do not know the size of the underlying device at
 727  729           * this stage, so initialise "nblocks" property to
 728  730           * zero, and update it whenever we succeed in
 729  731           * nsc_reserve'ing the underlying nsc_fd_t.
 730  732           */
 731  733  
 732  734          svp->sv_nblocks = 0;
 733  735  
 734  736          return (0);
 735  737  }
 736  738  
 737  739  
 738  740  /*
  739  741   * Remove a device structure from its hash chain.
 740  742   * Must be called with sv_mutex held.
 741  743   */
 742  744  
 743  745  static void
 744  746  sv_rm_hash(sv_dev_t *svp)
 745  747  {
 746  748          sv_dev_t **svpp;
 747  749          sv_maj_t *maj;
 748  750  
 749  751          /* Get major hash table */
 750  752          if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
 751  753                  return;
 752  754  
 753  755          /* remove svp from hash chain */
 754  756  
 755  757          svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
 756  758          while (*svpp) {
 757  759                  if (*svpp == svp) {
 758  760                          /*
 759  761                           * increment of sm_seq must be before the
 760  762                           * removal from the hash chain
 761  763                           */
 762  764                          maj->sm_seq++;
 763  765                          *svpp = svp->sv_hash;
 764  766                          break;
 765  767                  }
 766  768  
 767  769                  svpp = &(*svpp)->sv_hash;
 768  770          }
 769  771  
 770  772          svp->sv_hash = NULL;
 771  773  }
 772  774  
 773  775  /*
 774  776   * Free (disable) a device structure.
 775  777   * Must be called with sv_lock(RW_WRITER) and sv_mutex held, and will
 776  778   * perform the exits during its processing.
 777  779   */
 778  780  
 779  781  static int
 780  782  sv_free(sv_dev_t *svp, const int error)
 781  783  {
 782  784          struct cb_ops *cb_ops;
 783  785          sv_maj_t *maj;
 784  786  
 785  787          /* Get major hash table */
 786  788          if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
 787  789                  return (NULL);
 788  790  
 789  791          svp->sv_state = SV_PENDING;
 790  792          svp->sv_pending = curthread;
 791  793  
 792  794          /*
 793  795           * Close the fd's before removing from the hash or swapping
 794  796           * back the cb_ops pointers so that the cache flushes before new
 795  797           * io can come in.
 796  798           */
 797  799  
 798  800          if (svp->sv_fd) {
 799  801                  (void) nsc_close(svp->sv_fd);
 800  802                  svp->sv_fd = 0;
 801  803          }
 802  804  
 803  805          sv_rm_hash(svp);
 804  806  
 805  807          if (error != SV_ESDOPEN &&
 806  808              error != SV_ELYROPEN && --maj->sm_inuse == 0) {
 807  809  
 808  810                  if (maj->sm_dev_ops)
 809  811                          cb_ops = maj->sm_dev_ops->devo_cb_ops;
 810  812                  else
 811  813                          cb_ops = NULL;
 812  814  
 813  815                  if (cb_ops && maj->sm_strategy != NULL) {
 814  816                          cb_ops->cb_strategy = maj->sm_strategy;
 815  817                          cb_ops->cb_close = maj->sm_close;
 816  818                          cb_ops->cb_ioctl = maj->sm_ioctl;
 817  819                          cb_ops->cb_write = maj->sm_write;
 818  820                          cb_ops->cb_open = maj->sm_open;
 819  821                          cb_ops->cb_read = maj->sm_read;
 820  822                          cb_ops->cb_flag = maj->sm_flag;
 821  823  
 822  824                          if (maj->sm_awrite)
 823  825                                  cb_ops->cb_awrite = maj->sm_awrite;
 824  826  
 825  827                          if (maj->sm_aread)
 826  828                                  cb_ops->cb_aread = maj->sm_aread;
 827  829  
 828  830                          /*
 829  831                           * corbin XXX
 830  832                           * Leave backing device ops in maj->sm_*
 831  833                           * to handle any requests that might come
 832  834                           * in during the disable.  This could be
 833  835                           * a problem however if the backing device
 834  836                           * driver is changed while we process these
 835  837                           * requests.
 836  838                           *
 837  839                           * maj->sm_strategy = 0;
 838  840                           * maj->sm_awrite = 0;
 839  841                           * maj->sm_write = 0;
 840  842                           * maj->sm_ioctl = 0;
 841  843                           * maj->sm_close = 0;
 842  844                           * maj->sm_aread = 0;
 843  845                           * maj->sm_read = 0;
 844  846                           * maj->sm_open = 0;
 845  847                           * maj->sm_flag = 0;
 846  848                           *
 847  849                           */
 848  850                  }
 849  851  
 850  852                  if (maj->sm_dev_ops) {
 851  853                          maj->sm_dev_ops = 0;
 852  854                  }
 853  855          }
 854  856  
 855  857          if (svp->sv_lh) {
 856  858                  cred_t *crp = ddi_get_cred();
 857  859  
 858  860                  /*
 859  861                   * Close the protective layered driver open using the
 860  862                   * Sun Private layered driver i/f.
 861  863                   */
 862  864  
 863  865                  (void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
 864  866                  svp->sv_lh = NULL;
 865  867          }
 866  868  
 867  869          svp->sv_timestamp = nsc_lbolt();
 868  870          svp->sv_state = SV_DISABLE;
 869  871          svp->sv_pending = NULL;
 870  872          rw_exit(&svp->sv_lock);
 871  873          mutex_exit(&sv_mutex);
 872  874  
 873  875          return (error);
 874  876  }
 875  877  
 876  878  /*
 877  879   * Reserve the device, taking into account the possibility that
 878  880   * the reserve might have to be retried.
 879  881   */
 880  882  static int
 881  883  sv_reserve(nsc_fd_t *fd, int flags)
 882  884  {
 883  885          int eintr_count;
 884  886          int rc;
 885  887  
 886  888          eintr_count = 0;
 887  889          do {
 888  890                  rc = nsc_reserve(fd, flags);
 889  891                  if (rc == EINTR) {
 890  892                          ++eintr_count;
 891  893                          delay(2);
 892  894                  }
 893  895          } while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));
 894  896  
 895  897          return (rc);
 896  898  }
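Every successful sv_reserve() must be balanced by an nsc_release(); for
example, sv_enable() below uses the pairing to sample the volume size:

    if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
            /* reserved: the underlying nsc_fd_t is safe to use */
            nblocks = svp->sv_nblocks;
            nsc_release(svp->sv_fd);
    }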
 897  899  
 898  900  static int
 899  901  sv_enable(const caddr_t path, const int flag,
 900  902      const dev_t udev, spcs_s_info_t kstatus)
 901  903  {
 902  904          struct dev_ops *dev_ops;
 903  905          struct cb_ops *cb_ops;
 904  906          sv_dev_t *svp;
 905  907          sv_maj_t *maj;
 906  908          nsc_size_t nblocks;
 907  909          int rc;
 908  910          cred_t *crp;
 909  911          ldi_ident_t     li;
 910  912  
 911  913          if (udev == (dev_t)-1 || udev == 0) {
 912  914                  DTRACE_PROBE1(
 913  915                      sv_enable_err_baddev,
 914  916                      dev_t, udev);
 915  917                  return (SV_EBADDEV);
 916  918          }
 917  919  
 918  920          if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
 919  921                  DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
 920  922                  return (SV_EAMODE);
 921  923          }
 922  924  
 923  925          /* Get major hash table */
 924  926          if ((maj = sv_getmajor(udev)) == NULL)
 925  927                  return (SV_EBADDEV);
 926  928  
 927  929          mutex_enter(&sv_mutex);
 928  930  
 929  931          rc = sv_get_state(udev, &svp);
 930  932          if (rc) {
 931  933                  mutex_exit(&sv_mutex);
 932  934                  DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
 933  935                  return (rc);
 934  936          }
 935  937  
 936  938          rw_enter(&svp->sv_lock, RW_WRITER);
 937  939  
 938  940          /*
 939  941           * Get real fd used for io
 940  942           */
 941  943  
 942  944          svp->sv_dev = udev;
 943  945          svp->sv_flag = flag;
 944  946  
 945  947          /*
 946  948           * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
 947  949           * function pointer before sv swaps them out.
 948  950           */
 949  951  
 950  952          svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
 951  953              sv_fd_def, (blind_t)udev, &rc);
 952  954  
 953  955          if (svp->sv_fd == NULL) {
 954  956                  if (kstatus)
 955  957                          spcs_s_add(kstatus, rc);
 956  958                  DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
 957  959                  return (sv_free(svp, SV_ESDOPEN));
 958  960          }
 959  961  
 960  962          /*
 961  963           * Perform a layered driver open using the Sun Private layered
 962  964           * driver i/f to ensure that the cb_ops structure for the driver
 963  965           * is not detached out from under us whilst sv is enabled.
 964  966           *
 965  967           */
 966  968  
 967  969          crp = ddi_get_cred();
 968  970          svp->sv_lh = NULL;
 969  971  
 970  972          if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
 971  973                  rc = ldi_open_by_dev(&svp->sv_dev,
 972  974                      OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
 973  975          }
 974  976  
 975  977          if (rc != 0) {
 976  978                  if (kstatus)
 977  979                          spcs_s_add(kstatus, rc);
 978  980                  DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
 979  981                  return (sv_free(svp, SV_ELYROPEN));
 980  982          }
 981  983  
 982  984          /*
 983  985           * Do layering if required - must happen after nsc_open().
 984  986           */
 985  987  
 986  988          if (maj->sm_inuse++ == 0) {
 987  989                  maj->sm_dev_ops = nsc_get_devops(getmajor(udev));
 988  990  
 989  991                  if (maj->sm_dev_ops == NULL ||
 990  992                      maj->sm_dev_ops->devo_cb_ops == NULL) {
 991  993                          DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
 992  994                          return (sv_free(svp, SV_ELOAD));
 993  995                  }
 994  996  
 995  997                  dev_ops = maj->sm_dev_ops;
 996  998                  cb_ops = dev_ops->devo_cb_ops;
 997  999  
 998 1000                  if (cb_ops->cb_strategy == NULL ||
 999 1001                      cb_ops->cb_strategy == nodev ||
1000 1002                      cb_ops->cb_strategy == nulldev) {
1001 1003                          DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
1002 1004                          return (sv_free(svp, SV_ELOAD));
1003 1005                  }
1004 1006  
1005 1007                  if (cb_ops->cb_strategy == sv_lyr_strategy) {
1006 1008                          DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
1007 1009                          return (sv_free(svp, SV_ESTRATEGY));
1008 1010                  }
1009 1011  
1010 1012                  maj->sm_strategy = cb_ops->cb_strategy;
1011 1013                  maj->sm_close = cb_ops->cb_close;
1012 1014                  maj->sm_ioctl = cb_ops->cb_ioctl;
1013 1015                  maj->sm_write = cb_ops->cb_write;
1014 1016                  maj->sm_open = cb_ops->cb_open;
1015 1017                  maj->sm_read = cb_ops->cb_read;
1016 1018                  maj->sm_flag = cb_ops->cb_flag;
1017 1019  
1018 1020                  cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
1019 1021                  cb_ops->cb_strategy = sv_lyr_strategy;
1020 1022                  cb_ops->cb_close = sv_lyr_close;
1021 1023                  cb_ops->cb_ioctl = sv_lyr_ioctl;
1022 1024                  cb_ops->cb_write = sv_lyr_write;
1023 1025                  cb_ops->cb_open = sv_lyr_open;
1024 1026                  cb_ops->cb_read = sv_lyr_read;
1025 1027  
1026 1028                  /*
1027 1029                   * Check that the driver has async I/O entry points
1028 1030                   * before changing them.
1029 1031                   */
1030 1032  
1031 1033                  if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
1032 1034                          maj->sm_awrite = 0;
1033 1035                          maj->sm_aread = 0;
1034 1036                  } else {
1035 1037                          maj->sm_awrite = cb_ops->cb_awrite;
1036 1038                          maj->sm_aread = cb_ops->cb_aread;
1037 1039  
1038 1040                          cb_ops->cb_awrite = sv_lyr_awrite;
1039 1041                          cb_ops->cb_aread = sv_lyr_aread;
1040 1042                  }
1041 1043  
1042 1044                  /*
1043 1045                   * Bug 4645743
1044 1046                   *
1045 1047                   * Prevent sv from ever unloading after it has interposed
1046 1048                   * on a major device because there is a race between
1047 1049                   * sv removing its layered entry points from the target
1048 1050                   * dev_ops, a client coming in and accessing the driver,
1049 1051                   * and the kernel modunloading the sv text.
1050 1052                   *
1051 1053                   * To allow unload, do svboot -u, which only happens in
1052 1054                   * pkgrm time.
1053 1055                   */
1054 1056                  ASSERT(MUTEX_HELD(&sv_mutex));
1055 1057                  sv_mod_status = SV_PREVENT_UNLOAD;
1056 1058          }
1057 1059  
1058 1060  
1059 1061          svp->sv_timestamp = nsc_lbolt();
1060 1062          svp->sv_state = SV_ENABLE;
1061 1063          svp->sv_pending = NULL;
1062 1064          rw_exit(&svp->sv_lock);
1063 1065  
1064 1066          sv_ndevices++;
1065 1067          mutex_exit(&sv_mutex);
1066 1068  
1067 1069          nblocks = 0;
1068 1070          if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
1069 1071                  nblocks = svp->sv_nblocks;
1070 1072                  nsc_release(svp->sv_fd);
1071 1073          }
1072 1074  
1073 1075          cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
1074 1076              svp->sv_dev, nblocks);
1075 1077  
1076 1078          return (0);
1077 1079  }
1078 1080  
1079 1081  
1080 1082  static int
1081 1083  sv_prepare_unload()
1082 1084  {
1083 1085          int rc = 0;
1084 1086  
1085 1087          mutex_enter(&sv_mutex);
1086 1088  
1087 1089          if (sv_mod_status == SV_PREVENT_UNLOAD) {
1088 1090                  if ((sv_ndevices != 0) || (sv_tset != NULL)) {
1089 1091                          rc = EBUSY;
1090 1092                  } else {
1091 1093                          sv_mod_status = SV_ALLOW_UNLOAD;
1092 1094                          delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
1093 1095                  }
1094 1096          }
1095 1097  
1096 1098          mutex_exit(&sv_mutex);
1097 1099          return (rc);
1098 1100  }
1099 1101  
1100 1102  static int
1101 1103  svattach_fd(blind_t arg)
1102 1104  {
1103 1105          dev_t dev = (dev_t)arg;
1104 1106          sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1105 1107          int rc;
1106 1108  
1107 1109          if (sv_debug > 0)
1108 1110                  cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);
1109 1111  
1110 1112          if (svp == NULL) {
1111 1113                  cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
1112 1114                  return (0);
1113 1115          }
1114 1116  
1115 1117          if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
1116 1118                  cmn_err(CE_WARN,
1117 1119                      "!svattach_fd: nsc_partsize() failed, rc %d", rc);
1118 1120                  svp->sv_nblocks = 0;
1119 1121          }
1120 1122  
1121 1123          if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
1122 1124                  cmn_err(CE_WARN,
1123 1125                      "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
1124 1126                  svp->sv_maxfbas = 0;
1125 1127          }
1126 1128  
1127 1129          if (sv_debug > 0) {
1128 1130                  cmn_err(CE_CONT,
1129 1131                      "!svattach_fd(%p): size %" NSC_SZFMT ", "
1130 1132                      "maxfbas %" NSC_SZFMT "\n",
1131 1133                      arg, svp->sv_nblocks, svp->sv_maxfbas);
1132 1134          }
1133 1135  
1134 1136          return (0);
1135 1137  }
1136 1138  
1137 1139  
1138 1140  static int
1139 1141  svdetach_fd(blind_t arg)
1140 1142  {
1141 1143          dev_t dev = (dev_t)arg;
1142 1144          sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1143 1145  
1144 1146          if (sv_debug > 0)
1145 1147                  cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);
1146 1148  
1147 1149          /* svp can be NULL during disable of an sv */
1148 1150          if (svp == NULL)
1149 1151                  return (0);
1150 1152  
1151 1153          svp->sv_maxfbas = 0;
1152 1154          svp->sv_nblocks = 0;
1153 1155          return (0);
1154 1156  }
1155 1157  
1156 1158  
1157 1159  /*
1158 1160   * Side effect: if called with (guard != 0), then expects both sv_mutex
1159 1161   * and sv_lock(RW_WRITER) to be held, and will release them before returning.
1160 1162   */
1161 1163  
1162 1164  /* ARGSUSED */
1163 1165  static int
1164 1166  sv_disable(dev_t dev, spcs_s_info_t kstatus)
1165 1167  {
1166 1168          sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
1167 1169  
1168 1170          if (svp == NULL) {
1169 1171  
1170 1172                  DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
1171 1173                  return (SV_ENODEV);
1172 1174          }
1173 1175  
1174 1176          mutex_enter(&sv_mutex);
1175 1177          rw_enter(&svp->sv_lock, RW_WRITER);
1176 1178  
1177 1179          if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
1178 1180                  rw_exit(&svp->sv_lock);
1179 1181                  mutex_exit(&sv_mutex);
1180 1182  
1181 1183                  DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
1182 1184                  return (SV_EDISABLED);
1183 1185          }
1184 1186  
1185 1187  
1186 1188          sv_ndevices--;
1187 1189          return (sv_free(svp, 0));
1188 1190  }
1189 1191  
1190 1192  
1191 1193  
1192 1194  static int
1193 1195  sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
1194 1196  {
1195 1197          nsc_buf_t *tmph;
1196 1198          sv_dev_t *svp;
1197 1199          sv_maj_t *maj;
1198 1200          int (*fn)();
1199 1201          dev_t odev;
1200 1202          int ret;
1201 1203          int rc;
1202 1204  
1203 1205          svp = sv_dev_to_sv(*devp, &maj);
1204 1206  
1205 1207          if (svp) {
1206 1208                  if (svp->sv_state == SV_PENDING &&
1207 1209                      svp->sv_pending == curthread) {
1208 1210                          /*
1209 1211                           * This is a recursive open from a call to
1210 1212                           * ddi_lyr_open_by_devt and so we just want
1211 1213                           * to pass it straight through to the
1212 1214                           * underlying driver.
1213 1215                           */
1214 1216                          DTRACE_PROBE2(sv_lyr_open_recursive,
1215 1217                              sv_dev_t *, svp,
1216 1218                              dev_t, *devp);
1217 1219                          svp = NULL;
1218 1220                  } else
1219 1221                          rw_enter(&svp->sv_lock, RW_READER);
1220 1222          }
1221 1223  
1222 1224          odev = *devp;
1223 1225  
1224 1226          if (maj && (fn = maj->sm_open) != 0) {
1225 1227                  if (!(maj->sm_flag & D_MP)) {
1226 1228                          UNSAFE_ENTER();
1227 1229                          ret = (*fn)(devp, flag, otyp, crp);
1228 1230                          UNSAFE_EXIT();
  
  
1229 1231                  } else {
1230 1232                          ret = (*fn)(devp, flag, otyp, crp);
1231 1233                  }
1232 1234  
1233 1235                  if (ret == 0) {
1234 1236                          /*
1235 1237                           * Re-acquire svp if the driver changed *devp.
1236 1238                           */
1237 1239  
1238 1240                          if (*devp != odev) {
1239      -                                rw_exit(&svp->sv_lock);
     1241 +                                if (svp != NULL)
     1242 +                                        rw_exit(&svp->sv_lock);
1240 1243  
1241 1244                                  svp = sv_dev_to_sv(*devp, NULL);
1242 1245  
1243 1246                                  if (svp) {
1244 1247                                          rw_enter(&svp->sv_lock, RW_READER);
1245 1248                                  }
1246 1249                          }
1247 1250                  }
1248 1251          } else {
1249 1252                  ret = ENODEV;
1250 1253          }
1251 1254  
1252 1255          if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
1253 1256                  /*
1254 1257                   * Underlying DDI open failed, but we have this
1255 1258                   * device SV enabled.  If we can read some data
1256 1259                   * from the device, fake a successful open (this
1257 1260                   * probably means that this device is RDC'd and we
1258 1261                   * are getting the data from the secondary node).
1259 1262                   *
1260 1263                   * The reserve must be done with NSC_TRY|NSC_NOWAIT to
1261 1264                   * ensure that it does not deadlock if this open is
1262 1265                   * coming from nskernd:get_bsize().
1263 1266                   */
1264 1267                  rc = sv_reserve(svp->sv_fd,
1265 1268                      NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
1266 1269                  if (rc == 0) {
1267 1270                          tmph = NULL;
1268 1271  
1269 1272                          rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
1270 1273                          if (rc <= 0) {
1271 1274                                  /* success */
1272 1275                                  ret = 0;
1273 1276                          }
1274 1277  
1275 1278                          if (tmph) {
1276 1279                                  (void) nsc_free_buf(tmph);
1277 1280                                  tmph = NULL;
1278 1281                          }
1279 1282  
1280 1283                          nsc_release(svp->sv_fd);
1281 1284  
1282 1285                          /*
1283 1286                           * Count the number of layered opens that we
1284 1287                           * fake since we have to fake a matching number
1285 1288                           * of closes (OTYP_LYR open/close calls must be
1286 1289                           * paired).
1287 1290                           */
1288 1291  
1289 1292                          if (ret == 0 && otyp == OTYP_LYR) {
1290 1293                                  mutex_enter(&svp->sv_olock);
1291 1294                                  svp->sv_openlcnt++;
1292 1295                                  mutex_exit(&svp->sv_olock);
1293 1296                          }
1294 1297                  }
1295 1298          }
1296 1299  
1297 1300          if (svp) {
1298 1301                  rw_exit(&svp->sv_lock);
1299 1302          }
1300 1303  
1301 1304          return (ret);
1302 1305  }
1303 1306  
1304 1307  
1305 1308  static int
1306 1309  sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
1307 1310  {
1308 1311          sv_dev_t *svp;
1309 1312          sv_maj_t *maj;
1310 1313          int (*fn)();
1311 1314          int ret;
1312 1315  
1313 1316          svp = sv_dev_to_sv(dev, &maj);
1314 1317  
1315 1318          if (svp &&
1316 1319              svp->sv_state == SV_PENDING &&
1317 1320              svp->sv_pending == curthread) {
1318 1321                  /*
 1319 1322                   * This is a recursive close from a call to
1320 1323                   * ddi_lyr_close and so we just want
1321 1324                   * to pass it straight through to the
1322 1325                   * underlying driver.
1323 1326                   */
1324 1327                  DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
1325 1328                      dev_t, dev);
1326 1329                  svp = NULL;
1327 1330          }
1328 1331  
1329 1332          if (svp) {
1330 1333                  rw_enter(&svp->sv_lock, RW_READER);
1331 1334  
1332 1335                  if (otyp == OTYP_LYR) {
1333 1336                          mutex_enter(&svp->sv_olock);
1334 1337  
1335 1338                          if (svp->sv_openlcnt) {
1336 1339                                  /*
1337 1340                                   * Consume sufficient layered closes to
1338 1341                                   * account for the opens that we faked
1339 1342                                   * whilst the device was failed.
1340 1343                                   */
1341 1344                                  svp->sv_openlcnt--;
1342 1345                                  mutex_exit(&svp->sv_olock);
1343 1346                                  rw_exit(&svp->sv_lock);
1344 1347  
1345 1348                                  DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);
1346 1349  
1347 1350                                  return (0);
1348 1351                          }
1349 1352  
1350 1353                          mutex_exit(&svp->sv_olock);
1351 1354                  }
1352 1355          }
1353 1356  
1354 1357          if (maj && (fn = maj->sm_close) != 0) {
1355 1358                  if (!(maj->sm_flag & D_MP)) {
1356 1359                          UNSAFE_ENTER();
1357 1360                          ret = (*fn)(dev, flag, otyp, crp);
1358 1361                          UNSAFE_EXIT();
1359 1362                  } else {
1360 1363                          ret = (*fn)(dev, flag, otyp, crp);
1361 1364                  }
1362 1365          } else {
1363 1366                  ret = ENODEV;
1364 1367          }
1365 1368  
1366 1369          if (svp) {
1367 1370                  rw_exit(&svp->sv_lock);
1368 1371          }
1369 1372  
1370 1373          return (ret);
1371 1374  }
1372 1375  
1373 1376  
1374 1377  /*
1375 1378   * Convert the specified dev_t into a locked and enabled sv_dev_t, or
1376 1379   * return NULL.
1377 1380   */
1378 1381  static sv_dev_t *
1379 1382  sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
1380 1383  {
1381 1384          sv_dev_t *svp;
1382 1385  
1383 1386          while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
1384 1387                  rw_enter(&svp->sv_lock, RW_READER);
1385 1388  
1386 1389                  if (svp->sv_state == SV_ENABLE) {
1387 1390                          /* locked and enabled */
1388 1391                          break;
1389 1392                  }
1390 1393  
1391 1394                  /*
1392 1395                   * State was changed while waiting on the lock.
1393 1396                   * Wait for a stable state.
1394 1397                   */
1395 1398                  rw_exit(&svp->sv_lock);
1396 1399  
1397 1400                  DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);
1398 1401  
1399 1402                  delay(2);
1400 1403          }
1401 1404  
1402 1405          return (svp);
1403 1406  }
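The caller contract: on a non-NULL return from sv_find_enabled(), the
device's sv_lock is held as RW_READER and sv_state is SV_ENABLE, so the
caller must drop the lock when finished, as sv_lyr_uio() below does:

    if ((svp = sv_find_enabled(dev, &maj)) != NULL) {
            /* sv_lock held (RW_READER), state is SV_ENABLE */
            /* ... perform the guarded I/O ... */
            rw_exit(&svp->sv_lock);
    }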
1404 1407  
1405 1408  
1406 1409  static int
1407 1410  sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
1408 1411  {
1409 1412          sv_dev_t *svp;
1410 1413          sv_maj_t *maj;
1411 1414          int (*fn)();
1412 1415          int rc;
1413 1416  
1414 1417          svp = sv_find_enabled(dev, &maj);
1415 1418          if (svp == NULL) {
1416 1419                  if (maj) {
1417 1420                          if (rw == NSC_READ)
1418 1421                                  fn = maj->sm_read;
1419 1422                          else
1420 1423                                  fn = maj->sm_write;
1421 1424  
1422 1425                          if (fn != 0) {
1423 1426                                  if (!(maj->sm_flag & D_MP)) {
1424 1427                                          UNSAFE_ENTER();
1425 1428                                          rc = (*fn)(dev, uiop, crp);
1426 1429                                          UNSAFE_EXIT();
1427 1430                                  } else {
1428 1431                                          rc = (*fn)(dev, uiop, crp);
1429 1432                                  }
1430 1433                          }
1431 1434  
1432 1435                          return (rc);
1433 1436                  } else {
1434 1437                          return (ENODEV);
1435 1438                  }
1436 1439          }
1437 1440  
1438 1441          ASSERT(RW_READ_HELD(&svp->sv_lock));
1439 1442  
1440 1443          if (svp->sv_flag == 0) {
1441 1444                  /*
1442 1445                   * guard access mode
1443 1446                   * - prevent user level access to the device
1444 1447                   */
1445 1448                  DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
1446 1449                  rc = EPERM;
1447 1450                  goto out;
1448 1451          }
1449 1452  
1450 1453          if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
1451 1454                  DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
1452 1455                  goto out;
1453 1456          }
1454 1457  
1455 1458          if (rw == NSC_READ)
1456 1459                  rc = nsc_uread(svp->sv_fd, uiop, crp);
1457 1460          else
1458 1461                  rc = nsc_uwrite(svp->sv_fd, uiop, crp);
1459 1462  
1460 1463          nsc_release(svp->sv_fd);
1461 1464  
1462 1465  out:
1463 1466          rw_exit(&svp->sv_lock);
1464 1467  
1465 1468          return (rc);
1466 1469  }
1467 1470  
1468 1471  
1469 1472  static int
1470 1473  sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
1471 1474  {
1472 1475          return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
1473 1476  }
1474 1477  
1475 1478  
1476 1479  static int
1477 1480  sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
1478 1481  {
1479 1482          return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
1480 1483  }
1481 1484  
1482 1485  
1483 1486  /* ARGSUSED */
1484 1487  
1485 1488  static int
1486 1489  sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
1487 1490  {
1488 1491          return (aphysio(sv_lyr_strategy,
1489 1492              anocancel, dev, B_READ, minphys, aio));
1490 1493  }
1491 1494  
1492 1495  
1493 1496  /* ARGSUSED */
1494 1497  
1495 1498  static int
1496 1499  sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
1497 1500  {
1498 1501          return (aphysio(sv_lyr_strategy,
1499 1502              anocancel, dev, B_WRITE, minphys, aio));
1500 1503  }
1501 1504  
1502 1505  
1503 1506  /*
1504 1507   * Set up an array containing the list of raw path names.
1505 1508   * The array for the paths is svn and the size of the array
1506 1509   * is given by size.
1507 1510   *
1508 1511   * If there are more layered devices than will fit in the array,
1509 1512   * the number of extra layered devices is returned.  Otherwise
1510 1513   * zero is returned.
1511 1514   *
1512 1515   * Input:
1513 1516   *      svn     : array for paths
1514 1517   *      size    : size of the array
1515 1518   *
1516 1519   * Output (extra):
1517 1520   *      zero    : All paths fit in array
1518 1521   *      >0      : number of layered devices that did not fit in the array
1519 1522   */
1520 1523  
1521 1524  static int
1522 1525  sv_list(void *ptr, const int size, int *extra, const int ilp32)
1523 1526  {
1524 1527          sv_name32_t *svn32;
1525 1528          sv_name_t *svn;
1526 1529          sv_dev_t *svp;
1527 1530          int *mode, *nblocks;
1528 1531          int i, index;
1529 1532          char *path;
1530 1533  
1531 1534          *extra = 0;
1532 1535          index = 0;
1533 1536  
1534 1537          if (ilp32)
1535 1538                  svn32 = ptr;
1536 1539          else
1537 1540                  svn = ptr;
1538 1541  
1539 1542          mutex_enter(&sv_mutex);
1540 1543          for (i = 0; i < sv_max_devices; i++) {
1541 1544                  svp = &sv_devs[i];
1542 1545  
1543 1546                  rw_enter(&svp->sv_lock, RW_READER);
1544 1547  
1545 1548                  if (svp->sv_state != SV_ENABLE) {
1546 1549                          rw_exit(&svp->sv_lock);
1547 1550                          continue;
1548 1551                  }
1549 1552  
1550 1553                  if ((*extra) != 0 || ptr == NULL) {
1551 1554                          /* Another overflow entry */
1552 1555                          rw_exit(&svp->sv_lock);
1553 1556                          (*extra)++;
1554 1557                          continue;
1555 1558                  }
1556 1559  
1557 1560                  if (ilp32) {
1558 1561                          nblocks = &svn32->svn_nblocks;
1559 1562                          mode = &svn32->svn_mode;
1560 1563                          path = svn32->svn_path;
1561 1564  
1562 1565                          svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
1563 1566                          svn32++;
1564 1567                  } else {
1565 1568                          nblocks = &svn->svn_nblocks;
1566 1569                          mode = &svn->svn_mode;
1567 1570                          path = svn->svn_path;
1568 1571  
1569 1572                          svn->svn_timestamp = svp->sv_timestamp;
1570 1573                          svn++;
1571 1574                  }
1572 1575  
1573 1576                  (void) strcpy(path, nsc_pathname(svp->sv_fd));
1574 1577                  *nblocks = svp->sv_nblocks;
1575 1578                  *mode = svp->sv_flag;
1576 1579  
1577 1580                  if (*nblocks == 0) {
1578 1581                          if (sv_debug > 3)
1579 1582                                  cmn_err(CE_CONT, "!sv_list: need to reserve\n");
1580 1583  
1581 1584                          if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
1582 1585                                  *nblocks = svp->sv_nblocks;
1583 1586                                  nsc_release(svp->sv_fd);
1584 1587                          }
1585 1588                  }
1586 1589  
1587 1590                  if (++index >= size) {
1588 1591                          /* Out of space */
1589 1592                          (*extra)++;
1590 1593                  }
1591 1594  
1592 1595                  rw_exit(&svp->sv_lock);
1593 1596          }
1594 1597          mutex_exit(&sv_mutex);
1595 1598  
1596 1599          if (index < size) {
1597 1600                  /* NULL terminated list */
1598 1601                  if (ilp32)
1599 1602                          svn32->svn_path[0] = '\0';
1600 1603                  else
1601 1604                          svn->svn_path[0] = '\0';
1602 1605          }
1603 1606  
1604 1607          return (0);
1605 1608  }
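
sv_list() fills either sv_name32_t or sv_name_t records from the same walk by taking per-field pointers into whichever layout the userland data model requires, and it counts overflow entries in *extra instead of failing. Below is a minimal sketch of that dual-model marshalling, using toy record layouts rather than the real sv_name structures:

    /*
     * Sketch only: toy layouts standing in for sv_name{32,}_t.
     */
    #include <stdint.h>
    #include <string.h>

    struct rec32 {                          /* 32-bit userland layout */
            int32_t nblocks;
            int32_t mode;
            char    path[64];
    };

    struct rec {                            /* native layout */
            int32_t nblocks;
            int32_t mode;
            long    timestamp;
            char    path[64];
    };

    static int
    fill_list(void *ptr, int size, int ilp32, int ndev, int *extra)
    {
            struct rec32 *r32 = ptr;
            struct rec *r = ptr;
            int32_t *nblocks, *mode;
            char *path;
            int i, index = 0;

            *extra = 0;
            for (i = 0; i < ndev; i++) {
                    if (ptr == NULL || index >= size) {
                            (*extra)++;     /* count overflow, don't fail */
                            continue;
                    }

                    if (ilp32) {            /* write through 32-bit fields */
                            nblocks = &r32->nblocks;
                            mode = &r32->mode;
                            path = r32->path;
                            r32++;
                    } else {                /* write through native fields */
                            nblocks = &r->nblocks;
                            mode = &r->mode;
                            path = r->path;
                            r++;
                    }

                    *nblocks = 0;
                    *mode = 0;
                    (void) strcpy(path, "/dev/rdsk/...");
                    index++;
            }

            if (ptr != NULL && index < size)        /* NUL-terminate the list */
                    (ilp32 ? r32->path : r->path)[0] = '\0';

            return (0);
    }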
1606 1609  
1607 1610  
1608 1611  static void
1609 1612  sv_thread_tune(int threads)
1610 1613  {
1611 1614          int incr = (threads > 0) ? 1 : -1;
1612 1615          int change = 0;
1613 1616          int nthreads;
1614 1617  
1615 1618          ASSERT(MUTEX_HELD(&sv_mutex));
1616 1619  
1617 1620          if (sv_threads_extra) {
1618 1621                  /* keep track of any additional threads requested */
1619 1622                  if (threads > 0) {
1620 1623                          sv_threads_extra += threads;
1621 1624                          return;
1622 1625                  }
1623 1626                  threads = -threads;
1624 1627                  if (threads >= sv_threads_extra) {
1625 1628                          threads -= sv_threads_extra;
1626 1629                          sv_threads_extra = 0;
1627 1630                          /* fall through to while loop */
1628 1631                  } else {
1629 1632                          sv_threads_extra -= threads;
1630 1633                          return;
1631 1634                  }
1632 1635          } else if (threads > 0) {
1633 1636                  /*
1634 1637                   * do not increase the number of threads beyond
1635 1638                   * sv_threads_max when doing dynamic thread tuning
1636 1639                   */
1637 1640                  nthreads = nst_nthread(sv_tset);
1638 1641                  if ((nthreads + threads) > sv_threads_max) {
1639 1642                          sv_threads_extra = nthreads + threads - sv_threads_max;
1640 1643                          threads = sv_threads_max - nthreads;
1641 1644                          if (threads <= 0)
1642 1645                                  return;
1643 1646                  }
1644 1647          }
1645 1648  
1646 1649          if (threads < 0)
1647 1650                  threads = -threads;
1648 1651  
1649 1652          while (threads--) {
1650 1653                  nthreads = nst_nthread(sv_tset);
1651 1654                  sv_threads_needed += incr;
1652 1655  
1653 1656                  if (sv_threads_needed >= nthreads)
1654 1657                          change += nst_add_thread(sv_tset, sv_threads_inc);
1655 1658                  else if ((sv_threads_needed <
1656 1659                      (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
1657 1660                      ((nthreads - sv_threads_inc) >= sv_threads))
1658 1661                          change -= nst_del_thread(sv_tset, sv_threads_inc);
1659 1662          }
1660 1663  
1661 1664  #ifdef DEBUG
1662 1665          if (change) {
1663 1666                  cmn_err(CE_NOTE,
1664 1667                      "!sv_thread_tune: threads needed %d, nthreads %d, "
1665 1668                      "nthreads change %d",
1666 1669                      sv_threads_needed, nst_nthread(sv_tset), change);
1667 1670          }
1668 1671  #endif
1669 1672  }
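
The tuning logic grows the thread set eagerly whenever demand reaches the current size, but shrinks it only once demand has fallen more than one increment plus a hysteresis margin below the current size, and never below the configured floor. A reduced sketch of the same decision, with illustrative constants in place of the sv_threads_* tunables:

    /*
     * Sketch only: grow/shrink with hysteresis, one request at a time.
     */
    #define POOL_MIN    16          /* never shrink below this */
    #define POOL_INC    8           /* grow/shrink step */
    #define POOL_HYST   4           /* extra slack before shrinking */

    static int pool_nthreads = POOL_MIN;    /* current size */
    static int pool_needed = POOL_MIN;      /* demand estimate */

    static void
    pool_tune(int delta)                    /* +1 or -1 per request */
    {
            pool_needed += delta;

            if (pool_needed >= pool_nthreads) {
                    pool_nthreads += POOL_INC;      /* grow eagerly */
            } else if (pool_needed <
                pool_nthreads - (POOL_INC + POOL_HYST) &&
                pool_nthreads - POOL_INC >= POOL_MIN) {
                    pool_nthreads -= POOL_INC;      /* shrink reluctantly */
            }
    }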
1670 1673  
1671 1674  
1672 1675  /* ARGSUSED */
1673 1676  static int
1674 1677  svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
1675 1678  {
1676 1679          int rc;
1677 1680  
1678 1681          mutex_enter(&sv_mutex);
1679 1682          rc = sv_init_devs();
1680 1683          mutex_exit(&sv_mutex);
1681 1684  
1682 1685          return (rc);
1683 1686  }
1684 1687  
1685 1688  
1686 1689  /* ARGSUSED */
1687 1690  static int
1688 1691  svclose(dev_t dev, int flag, int otyp, cred_t *crp)
1689 1692  {
1690 1693          const int secs = HZ * 5;
1691 1694          const int ticks = HZ / 10;
1692 1695          int loops = secs / ticks;
1693 1696  
1694 1697          mutex_enter(&sv_mutex);
1695 1698          while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
1696 1699                  if (nst_nlive(sv_tset) <= 0) {
1697 1700                          nst_destroy(sv_tset);
1698 1701                          sv_tset = NULL;
1699 1702                          break;
1700 1703                  }
1701 1704  
1702 1705                  /* threads still active - wait for them to exit */
1703 1706                  mutex_exit(&sv_mutex);
1704 1707                  delay(ticks);
1705 1708                  loops--;
1706 1709                  mutex_enter(&sv_mutex);
1707 1710          }
1708 1711          mutex_exit(&sv_mutex);
1709 1712  
1710 1713          if (loops <= 0) {
1711 1714                  cmn_err(CE_WARN,
1712 1715  #ifndef DEBUG
1713 1716                      /* do not write to console when non-DEBUG */
1714 1717                      "!"
1715 1718  #endif
1716 1719                      "sv:svclose: threads still active "
1717 1720                      "after %d sec - leaking thread set", secs / HZ);
1718 1721          }
1719 1722  
1720 1723          return (0);
1721 1724  }
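
svclose() polls for up to five seconds for the worker threads to exit, destroys the thread set once it is idle, and deliberately leaks it on timeout rather than blocking the close forever. A user-space sketch of that bounded wait (threads_live() and set_destroy() are assumed helpers, not sv functions):

    /*
     * Sketch only: poll a condition with a bound; prefer leaking
     * to blocking forever.
     */
    #include <unistd.h>

    extern int threads_live(void);
    extern void set_destroy(void);

    static void
    drain(void)
    {
            int loops = 50;                 /* 5 sec in 100 ms steps */

            while (loops-- > 0) {
                    if (threads_live() <= 0) {
                            set_destroy();
                            return;
                    }
                    usleep(100000);         /* 100 ms */
            }
            /* timed out: leak the set rather than hang the close */
    }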
1722 1725  
1723 1726  
1724 1727  static int
1725 1728  svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
1726 1729  {
1727 1730          char itmp1[12], itmp2[12]; /* temp char array for editing ints */
1728 1731          spcs_s_info_t kstatus;  /* Kernel version of spcs status */
1729 1732          spcs_s_info_t ustatus;  /* Address of user version of spcs status */
1730 1733          sv_list32_t svl32;      /* 32 bit Initial structure for SVIOC_LIST */
1731 1734          sv_version_t svv;       /* Version structure */
1732 1735          sv_conf_t svc;          /* User config structure */
1733 1736          sv_list_t svl;          /* Initial structure for SVIOC_LIST */
1734 1737          void *usvn;             /* Address of user sv_name_t */
1735 1738          void *svn = NULL;       /* Array for SVIOC_LIST */
1736 1739          uint64_t phash;         /* pathname hash */
1737 1740          int rc = 0;             /* Return code -- errno */
1738 1741          int size;               /* Number of items in array */
1739 1742          int bytes;              /* Byte size of array */
1740 1743          int ilp32;              /* Convert data structures for ilp32 userland */
1741 1744  
1742 1745          *rvalp = 0;
1743 1746  
1744 1747          /*
1745 1748           * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, continue as
1746 1749           * normal.  Otherwise it was SV_PREVENT_UNLOAD and is now
1747 1750           * SV_ALLOW_UNLOAD, and the driver is expected to unload.
1748 1751           *
1749 1752           * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
1750 1753           */
1751 1754          if (sv_mod_status == SV_ALLOW_UNLOAD) {
1752 1755                  return (EBUSY);
1753 1756          }
1754 1757  
1755 1758          if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
1756 1759                  return (rc);
1757 1760  
1758 1761          kstatus = spcs_s_kcreate();
1759 1762          if (!kstatus) {
1760 1763                  DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
1761 1764                  return (ENOMEM);
1762 1765          }
1763 1766  
1764 1767          ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
1765 1768  
1766 1769          switch (cmd) {
1767 1770  
1768 1771          case SVIOC_ENABLE:
1769 1772  
1770 1773                  if (ilp32) {
1771 1774                          sv_conf32_t svc32;
1772 1775  
1773 1776                          if (ddi_copyin((void *)arg, &svc32,
1774 1777                              sizeof (svc32), mode) < 0) {
1775 1778                                  spcs_s_kfree(kstatus);
1776 1779                                  return (EFAULT);
1777 1780                          }
1778 1781  
1779 1782                          svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1780 1783                          (void) strcpy(svc.svc_path, svc32.svc_path);
1781 1784                          svc.svc_flag  = svc32.svc_flag;
1782 1785                          svc.svc_major = svc32.svc_major;
1783 1786                          svc.svc_minor = svc32.svc_minor;
1784 1787                  } else {
1785 1788                          if (ddi_copyin((void *)arg, &svc,
1786 1789                              sizeof (svc), mode) < 0) {
1787 1790                                  spcs_s_kfree(kstatus);
1788 1791                                  return (EFAULT);
1789 1792                          }
1790 1793                  }
1791 1794  
1792 1795                  /* force to raw access */
1793 1796                  svc.svc_flag = NSC_DEVICE;
1794 1797  
1795 1798                  if (sv_tset == NULL) {
1796 1799                          mutex_enter(&sv_mutex);
1797 1800  
1798 1801                          if (sv_tset == NULL) {
1799 1802                                  sv_tset = nst_init("sv_thr", sv_threads);
1800 1803                          }
1801 1804  
1802 1805                          mutex_exit(&sv_mutex);
1803 1806  
1804 1807                          if (sv_tset == NULL) {
1805 1808                                  cmn_err(CE_WARN,
1806 1809                                      "!sv: could not allocate %d threads",
1807 1810                                      sv_threads);
1808 1811                          }
1809 1812                  }
1810 1813  
1811 1814                  rc = sv_enable(svc.svc_path, svc.svc_flag,
1812 1815                      makedevice(svc.svc_major, svc.svc_minor), kstatus);
1813 1816  
1814 1817                  if (rc == 0) {
1815 1818                          sv_config_time = nsc_lbolt();
1816 1819  
1817 1820                          mutex_enter(&sv_mutex);
1818 1821                          sv_thread_tune(sv_threads_dev);
1819 1822                          mutex_exit(&sv_mutex);
1820 1823                  }
1821 1824  
1822 1825                  DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);
1823 1826  
1824 1827                  return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1825 1828                  /* NOTREACHED */
1826 1829  
1827 1830          case SVIOC_DISABLE:
1828 1831  
1829 1832                  if (ilp32) {
1830 1833                          sv_conf32_t svc32;
1831 1834  
1832 1835                          if (ddi_copyin((void *)arg, &svc32,
1833 1836                              sizeof (svc32), mode) < 0) {
1834 1837                                  spcs_s_kfree(kstatus);
1835 1838                                  return (EFAULT);
1836 1839                          }
1837 1840  
1838 1841                          svc.svc_error = (spcs_s_info_t)svc32.svc_error;
1839 1842                          svc.svc_major = svc32.svc_major;
1840 1843                          svc.svc_minor = svc32.svc_minor;
1841 1844                          (void) strcpy(svc.svc_path, svc32.svc_path);
1842 1845                          svc.svc_flag  = svc32.svc_flag;
1843 1846                  } else {
1844 1847                          if (ddi_copyin((void *)arg, &svc,
1845 1848                              sizeof (svc), mode) < 0) {
1846 1849                                  spcs_s_kfree(kstatus);
1847 1850                                  return (EFAULT);
1848 1851                          }
1849 1852                  }
1850 1853  
1851 1854                  if (svc.svc_major == (major_t)-1 &&
1852 1855                      svc.svc_minor == (minor_t)-1) {
1853 1856                          sv_dev_t *svp;
1854 1857                          int i;
1855 1858  
1856 1859                          /*
1857 1860                           * User level could not find the minor device
1858 1861                           * node, so do this the slow way by searching
1859 1862                           * the entire sv config for a matching pathname.
1860 1863                           */
1861 1864  
1862 1865                          phash = nsc_strhash(svc.svc_path);
1863 1866  
1864 1867                          mutex_enter(&sv_mutex);
1865 1868  
1866 1869                          for (i = 0; i < sv_max_devices; i++) {
1867 1870                                  svp = &sv_devs[i];
1868 1871  
1869 1872                                  if (svp->sv_state == SV_DISABLE ||
1870 1873                                      svp->sv_fd == NULL)
1871 1874                                          continue;
1872 1875  
1873 1876                                  if (nsc_fdpathcmp(svp->sv_fd, phash,
1874 1877                                      svc.svc_path) == 0) {
1875 1878                                          svc.svc_major = getmajor(svp->sv_dev);
1876 1879                                          svc.svc_minor = getminor(svp->sv_dev);
1877 1880                                          break;
1878 1881                                  }
1879 1882                          }
1880 1883  
1881 1884                          mutex_exit(&sv_mutex);
1882 1885  
1883 1886                          if (svc.svc_major == (major_t)-1 &&
1884 1887                              svc.svc_minor == (minor_t)-1)
1885 1888                                  return (spcs_s_ocopyoutf(&kstatus,
1886 1889                                      svc.svc_error, SV_ENODEV));
1887 1890                  }
1888 1891  
1889 1892                  rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
1890 1893                      kstatus);
1891 1894  
1892 1895                  if (rc == 0) {
1893 1896                          sv_config_time = nsc_lbolt();
1894 1897  
1895 1898                          mutex_enter(&sv_mutex);
1896 1899                          sv_thread_tune(-sv_threads_dev);
1897 1900                          mutex_exit(&sv_mutex);
1898 1901                  }
1899 1902  
1900 1903                  DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);
1901 1904  
1902 1905                  return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
1903 1906                  /* NOTREACHED */
1904 1907  
1905 1908          case SVIOC_LIST:
1906 1909  
1907 1910                  if (ilp32) {
1908 1911                          if (ddi_copyin((void *)arg, &svl32,
1909 1912                              sizeof (svl32), mode) < 0) {
1910 1913                                  spcs_s_kfree(kstatus);
1911 1914                                  return (EFAULT);
1912 1915                          }
1913 1916  
1914 1917                          ustatus = (spcs_s_info_t)svl32.svl_error;
1915 1918                          size = svl32.svl_count;
1916 1919                          usvn = (void *)(unsigned long)svl32.svl_names;
1917 1920                  } else {
1918 1921                          if (ddi_copyin((void *)arg, &svl,
1919 1922                              sizeof (svl), mode) < 0) {
1920 1923                                  spcs_s_kfree(kstatus);
1921 1924                                  return (EFAULT);
1922 1925                          }
1923 1926  
1924 1927                          ustatus = svl.svl_error;
1925 1928                          size = svl.svl_count;
1926 1929                          usvn = svl.svl_names;
1927 1930                  }
1928 1931  
1929 1932                  /* Do some boundary checking */
1930 1933                  if ((size < 0) || (size > sv_max_devices)) {
1931 1934                          /* Array size is out of range */
1932 1935                          return (spcs_s_ocopyoutf(&kstatus, ustatus,
1933 1936                              SV_EARRBOUNDS, "0",
1934 1937                              spcs_s_inttostring(sv_max_devices, itmp1,
1935 1938                              sizeof (itmp1), 0),
1936 1939                              spcs_s_inttostring(size, itmp2,
1937 1940                              sizeof (itmp2), 0)));
1938 1941                  }
1939 1942  
1940 1943                  if (ilp32)
1941 1944                          bytes = size * sizeof (sv_name32_t);
1942 1945                  else
1943 1946                          bytes = size * sizeof (sv_name_t);
1944 1947  
1945 1948                  /* Allocate memory for the array of structures */
1946 1949                  if (bytes != 0) {
1947 1950                          svn = kmem_zalloc(bytes, KM_SLEEP);
1948 1951                          if (!svn) {
1949 1952                                  return (spcs_s_ocopyoutf(&kstatus,
1950 1953                                      ustatus, ENOMEM));
1951 1954                          }
1952 1955                  }
1953 1956  
1954 1957                  rc = sv_list(svn, size, rvalp, ilp32);
1955 1958                  if (rc) {
1956 1959                          if (svn != NULL)
1957 1960                                  kmem_free(svn, bytes);
1958 1961                          return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
1959 1962                  }
1960 1963  
1961 1964                  if (ilp32) {
1962 1965                          svl32.svl_timestamp = (uint32_t)sv_config_time;
1963 1966                          svl32.svl_maxdevs = (int32_t)sv_max_devices;
1964 1967  
1965 1968                          /* Return the list structure */
1966 1969                          if (ddi_copyout(&svl32, (void *)arg,
1967 1970                              sizeof (svl32), mode) < 0) {
1968 1971                                  spcs_s_kfree(kstatus);
1969 1972                                  if (svn != NULL)
1970 1973                                          kmem_free(svn, bytes);
1971 1974                                  return (EFAULT);
1972 1975                          }
1973 1976                  } else {
1974 1977                          svl.svl_timestamp = sv_config_time;
1975 1978                          svl.svl_maxdevs = sv_max_devices;
1976 1979  
1977 1980                          /* Return the list structure */
1978 1981                          if (ddi_copyout(&svl, (void *)arg,
1979 1982                              sizeof (svl), mode) < 0) {
1980 1983                                  spcs_s_kfree(kstatus);
1981 1984                                  if (svn != NULL)
1982 1985                                          kmem_free(svn, bytes);
1983 1986                                  return (EFAULT);
1984 1987                          }
1985 1988                  }
1986 1989  
1987 1990                  /* Return the array */
1988 1991                  if (svn != NULL) {
1989 1992                          if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
1990 1993                                  kmem_free(svn, bytes);
1991 1994                                  spcs_s_kfree(kstatus);
1992 1995                                  return (EFAULT);
1993 1996                          }
1994 1997                          kmem_free(svn, bytes);
1995 1998                  }
1996 1999  
1997 2000                  DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);
1998 2001  
1999 2002                  return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2000 2003                  /* NOTREACHED */
2001 2004  
2002 2005          case SVIOC_VERSION:
2003 2006  
2004 2007                  if (ilp32) {
2005 2008                          sv_version32_t svv32;
2006 2009  
2007 2010                          if (ddi_copyin((void *)arg, &svv32,
2008 2011                              sizeof (svv32), mode) < 0) {
2009 2012                                  spcs_s_kfree(kstatus);
2010 2013                                  return (EFAULT);
2011 2014                          }
2012 2015  
2013 2016                          svv32.svv_major_rev = sv_major_rev;
2014 2017                          svv32.svv_minor_rev = sv_minor_rev;
2015 2018                          svv32.svv_micro_rev = sv_micro_rev;
2016 2019                          svv32.svv_baseline_rev = sv_baseline_rev;
2017 2020  
2018 2021                          if (ddi_copyout(&svv32, (void *)arg,
2019 2022                              sizeof (svv32), mode) < 0) {
2020 2023                                  spcs_s_kfree(kstatus);
2021 2024                                  return (EFAULT);
2022 2025                          }
2023 2026  
2024 2027                          ustatus = (spcs_s_info_t)svv32.svv_error;
2025 2028                  } else {
2026 2029                          if (ddi_copyin((void *)arg, &svv,
2027 2030                              sizeof (svv), mode) < 0) {
2028 2031                                  spcs_s_kfree(kstatus);
2029 2032                                  return (EFAULT);
2030 2033                          }
2031 2034  
2032 2035                          svv.svv_major_rev = sv_major_rev;
2033 2036                          svv.svv_minor_rev = sv_minor_rev;
2034 2037                          svv.svv_micro_rev = sv_micro_rev;
2035 2038                          svv.svv_baseline_rev = sv_baseline_rev;
2036 2039  
2037 2040                          if (ddi_copyout(&svv, (void *)arg,
2038 2041                              sizeof (svv), mode) < 0) {
2039 2042                                  spcs_s_kfree(kstatus);
2040 2043                                  return (EFAULT);
2041 2044                          }
2042 2045  
2043 2046                          ustatus = svv.svv_error;
2044 2047                  }
2045 2048  
2046 2049                  DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);
2047 2050  
2048 2051                  return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
2049 2052                  /* NOTREACHED */
2050 2053  
2051 2054          case SVIOC_UNLOAD:
2052 2055                  rc = sv_prepare_unload();
2053 2056  
2054 2057                  if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
2055 2058                          rc = EFAULT;
2056 2059                  }
2057 2060  
2058 2061                  spcs_s_kfree(kstatus);
2059 2062                  return (rc);
2060 2063  
2061 2064          default:
2062 2065                  spcs_s_kfree(kstatus);
2063 2066  
2064 2067                  DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);
2065 2068  
2066 2069                  return (EINVAL);
2067 2070                  /* NOTREACHED */
2068 2071          }
2069 2072  
2070 2073          /* NOTREACHED */
2071 2074  }
2072 2075  
2073 2076  
2074 2077  /* ARGSUSED */
2075 2078  static int
2076 2079  svprint(dev_t dev, char *str)
2077 2080  {
2078 2081          int instance = ddi_get_instance(sv_dip);
2079 2082          cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
2080 2083          return (0);
2081 2084  }
2082 2085  
2083 2086  
2084 2087  static void
2085 2088  _sv_lyr_strategy(struct buf *bp)
2086 2089  {
2087 2090          caddr_t buf_addr;               /* pointer to linear buffer in bp */
2088 2091          nsc_buf_t *bufh = NULL;
2089 2092          nsc_buf_t *hndl = NULL;
2090 2093          sv_dev_t *svp;
2091 2094          nsc_vec_t *v;
2092 2095          sv_maj_t *maj;
2093 2096          nsc_size_t fba_req, fba_len;    /* FBA lengths */
2094 2097          nsc_off_t fba_off;              /* FBA offset */
2095 2098          size_t tocopy, nbytes;          /* byte lengths */
2096 2099          int rw, rc;                     /* flags and return codes */
2097 2100          int (*fn)();
2098 2101  
2099 2102          rc = 0;
2100 2103  
2101 2104          if (sv_debug > 5)
2102 2105                  cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);
2103 2106  
2104 2107          svp = sv_find_enabled(bp->b_edev, &maj);
2105 2108          if (svp == NULL) {
2106 2109                  if (maj && (fn = maj->sm_strategy) != 0) {
2107 2110                          if (!(maj->sm_flag & D_MP)) {
2108 2111                                  UNSAFE_ENTER();
2109 2112                                  rc = (*fn)(bp);
2110 2113                                  UNSAFE_EXIT();
2111 2114                          } else {
2112 2115                                  rc = (*fn)(bp);
2113 2116                          }
2114 2117                          return;
2115 2118                  } else {
2116 2119                          bioerror(bp, ENODEV);
2117 2120                          biodone(bp);
2118 2121                          return;
2119 2122                  }
2120 2123          }
2121 2124  
2122 2125          ASSERT(RW_READ_HELD(&svp->sv_lock));
2123 2126  
2124 2127          if (svp->sv_flag == 0) {
2125 2128                  /*
2126 2129                   * guard access mode
2127 2130                   * - prevent user level access to the device
2128 2131                   */
2129 2132                  DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
2130 2133                  bioerror(bp, EPERM);
2131 2134                  goto out;
2132 2135          }
2133 2136  
2134 2137          if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
2135 2138                  DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);
2136 2139  
2137 2140                  if (rc == EINTR)
2138 2141                          cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
2139 2142                  bioerror(bp, rc);
2140 2143                  goto out;
2141 2144          }
2142 2145  
2143 2146          if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
2144 2147                  DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);
2145 2148  
2146 2149                  if (bp->b_flags & B_READ) {
2147 2150                          /* return EOF, not an error */
2148 2151                          bp->b_resid = bp->b_bcount;
2149 2152                          bioerror(bp, 0);
2150 2153                  } else
2151 2154                          bioerror(bp, EINVAL);
2152 2155  
2153 2156                  goto done;
2154 2157          }
2155 2158  
2156 2159          /*
2157 2160           * Preallocate a handle once per call to strategy.
2158 2161           * If this fails, then nsc_alloc_buf() will allocate
2159 2162           * a temporary handle per allocation/free pair.
2160 2163           */
2161 2164  
2162 2165          DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);
2163 2166  
2164 2167          bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);
2165 2168  
2166 2169          DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);
2167 2170  
2168 2171          if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
2169 2172                  DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);
2170 2173  
2171 2174                  cmn_err(CE_WARN,
2172 2175                      "!sv: allocated active handle (bufh %p, flags %x)",
2173 2176                      (void *)bufh, bufh->sb_flag);
2174 2177  
2175 2178                  bioerror(bp, ENXIO);
2176 2179                  goto done;
2177 2180          }
2178 2181  
2179 2182          fba_req = FBA_LEN(bp->b_bcount);
2180 2183          if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
2181 2184                  fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);
2182 2185  
2183 2186          rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;
2184 2187  
2185 2188          bp_mapin(bp);
2186 2189  
2187 2190          bp->b_resid = bp->b_bcount;
2188 2191          buf_addr = bp->b_un.b_addr;
2189 2192          fba_off = 0;
2190 2193  
2191 2194          /*
2192 2195           * fba_req  - requested size of transfer in FBAs after
2193 2196           *              truncation to device extent, and allowing for
2194 2197           *              possible non-FBA bounded final chunk.
2195 2198           * fba_off  - offset of start of chunk from start of bp in FBAs.
2196 2199           * fba_len  - size of this chunk in FBAs.
2197 2200           */
2198 2201  
2199 2202  loop:
2200 2203          fba_len = min(fba_req, svp->sv_maxfbas);
2201 2204          hndl = bufh;
2202 2205  
2203 2206          DTRACE_PROBE4(sv_dbg_allocb_start,
2204 2207              sv_dev_t *, svp,
2205 2208              uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
2206 2209              uint64_t, (uint64_t)fba_len,
2207 2210              int, rw);
2208 2211  
2209 2212          rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
2210 2213              fba_len, rw, &hndl);
2211 2214  
2212 2215          DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);
2213 2216  
2214 2217          if (rc > 0) {
2215 2218                  DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
2216 2219                  bioerror(bp, rc);
2217 2220                  if (hndl != bufh)
2218 2221                          (void) nsc_free_buf(hndl);
2219 2222                  hndl = NULL;
2220 2223                  goto done;
2221 2224          }
2222 2225  
2223 2226          tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
2224 2227          v = hndl->sb_vec;
2225 2228  
2226 2229          if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
2227 2230                  /*
2228 2231                   * Not overwriting all of the last FBA, so read in the
2229 2232                   * old contents now before we overwrite it with the new
2230 2233                   * data.
2231 2234                   */
2232 2235  
2233 2236                  DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
2234 2237                      uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));
2235 2238  
2236 2239                  rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
2237 2240                  if (rc > 0) {
2238 2241                          bioerror(bp, rc);
2239 2242                          goto done;
2240 2243                  }
2241 2244  
2242 2245                  DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
2243 2246          }
2244 2247  
2245 2248          DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);
2246 2249  
2247 2250          while (tocopy > 0) {
2248 2251                  nbytes = min(tocopy, (nsc_size_t)v->sv_len);
2249 2252  
2250 2253                  if (bp->b_flags & B_READ)
2251 2254                          (void) bcopy(v->sv_addr, buf_addr, nbytes);
2252 2255                  else
2253 2256                          (void) bcopy(buf_addr, v->sv_addr, nbytes);
2254 2257  
2255 2258                  bp->b_resid -= nbytes;
2256 2259                  buf_addr += nbytes;
2257 2260                  tocopy -= nbytes;
2258 2261                  v++;
2259 2262          }
2260 2263  
2261 2264          DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);
2262 2265  
2263 2266          if ((bp->b_flags & B_READ) == 0) {
2264 2267                  DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
2265 2268                      uint64_t, (uint64_t)hndl->sb_pos,
2266 2269                      uint64_t, (uint64_t)hndl->sb_len);
2267 2270  
2268 2271                  rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);
2269 2272  
2270 2273                  DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);
2271 2274  
2272 2275                  if (rc > 0) {
2273 2276                          bioerror(bp, rc);
2274 2277                          goto done;
2275 2278                  }
2276 2279          }
2277 2280  
2278 2281          /*
2279 2282           * Adjust FBA offset and requested (ie. remaining) length,
2280 2283           * loop if more data to transfer.
2281 2284           */
2282 2285  
2283 2286          fba_off += fba_len;
2284 2287          fba_req -= fba_len;
2285 2288  
2286 2289          if (fba_req > 0) {
2287 2290                  DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2288 2291  
2289 2292                  rc = nsc_free_buf(hndl);
2290 2293  
2291 2294                  DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2292 2295  
2293 2296                  if (rc > 0) {
2294 2297                          DTRACE_PROBE1(sv_lyr_strategy_err_free,
2295 2298                              struct buf *, bp);
2296 2299                          bioerror(bp, rc);
2297 2300                  }
2298 2301  
2299 2302                  hndl = NULL;
2300 2303  
2301 2304                  if (rc <= 0)
2302 2305                          goto loop;
2303 2306          }
2304 2307  
2305 2308  done:
2306 2309          if (hndl != NULL) {
2307 2310                  DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);
2308 2311  
2309 2312                  rc = nsc_free_buf(hndl);
2310 2313  
2311 2314                  DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);
2312 2315  
2313 2316                  if (rc > 0) {
2314 2317                          DTRACE_PROBE1(sv_lyr_strategy_err_free,
2315 2318                              struct buf *, bp);
2316 2319                          bioerror(bp, rc);
2317 2320                  }
2318 2321  
2319 2322                  hndl = NULL;
2320 2323          }
2321 2324  
2322 2325          if (bufh)
2323 2326                  (void) nsc_free_handle(bufh);
2324 2327  
2325 2328          DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);
2326 2329  
2327 2330          nsc_release(svp->sv_fd);
2328 2331  
2329 2332          DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);
2330 2333  
2331 2334  out:
2332 2335          if (sv_debug > 5) {
2333 2336                  cmn_err(CE_CONT,
2334 2337                      "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
2335 2338                      (void *)bp, (void *)bufh, bp->b_error);
2336 2339          }
2337 2340  
2338 2341          DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);
2339 2342  
2340 2343          rw_exit(&svp->sv_lock);
2341 2344          biodone(bp);
2342 2345  }
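
_sv_lyr_strategy() services a request in chunks of at most sv_maxfbas FBAs, and when a write ends part-way through the final FBA it first reads the old contents of that block so the unwritten tail survives (read-modify-write). The sketch below reduces this to user space; BLKSZ plays the role of an FBA, and devread()/devwrite() are assumed block-granular primitives, not sv or nsctl calls:

    /*
     * Sketch only: chunked transfer with a read-modify-write tail.
     */
    #include <stddef.h>
    #include <string.h>

    #define BLKSZ           512
    #define MAXCHUNK        64              /* blocks per transfer */
    #define NBLKS(bytes)    (((bytes) + BLKSZ - 1) / BLKSZ)

    extern void devread(size_t blkno, size_t nblks, char *buf);
    extern void devwrite(size_t blkno, size_t nblks, const char *buf);

    static void
    chunked_write(size_t blkno, const char *data, size_t bytes)
    {
            char blk[MAXCHUNK * BLKSZ];
            size_t req = NBLKS(bytes);      /* blocks; tail may be partial */
            size_t off = 0;

            while (req > 0) {
                    size_t len = req < MAXCHUNK ? req : MAXCHUNK;
                    size_t tocopy = bytes < len * BLKSZ ? bytes : len * BLKSZ;

                    if (tocopy % BLKSZ != 0) {
                            /* partial tail: fetch the old contents first */
                            devread(blkno + off + len - 1, 1,
                                blk + (len - 1) * BLKSZ);
                    }

                    memcpy(blk, data, tocopy);
                    devwrite(blkno + off, len, blk);

                    data += tocopy;
                    bytes -= tocopy;
                    off += len;
                    req -= len;
            }
    }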
2343 2346  
2344 2347  
2345 2348  static void
2346 2349  sv_async_strategy(blind_t arg)
2347 2350  {
2348 2351          struct buf *bp = (struct buf *)arg;
2349 2352          _sv_lyr_strategy(bp);
2350 2353  }
2351 2354  
2352 2355  
2353 2356  static int
2354 2357  sv_lyr_strategy(struct buf *bp)
2355 2358  {
2356 2359          nsthread_t *tp;
2357 2360          int nlive;
2358 2361  
2359 2362          /*
2360 2363           * If B_ASYNC was part of the DDI we could use it as a hint to
2361 2364           * not create a thread for synchronous i/o.
2362 2365           */
2363 2366          if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
2364 2367                  /* not sv enabled - just pass through */
2365 2368                  DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
2366 2369                  _sv_lyr_strategy(bp);
2367 2370                  return (0);
2368 2371          }
2369 2372  
2370 2373          if (sv_debug > 4) {
2371 2374                  cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
2372 2375                      nst_nthread(sv_tset), nst_nlive(sv_tset));
2373 2376          }
2374 2377  
2375 2378          /*
2376 2379           * If there are only guard devices enabled there
2377 2380           * won't be a threadset, so don't try and use it.
2378 2381           */
2379 2382          tp = NULL;
2380 2383          if (sv_tset != NULL) {
2381 2384                  tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
2382 2385          }
2383 2386  
2384 2387          if (tp == NULL) {
2385 2388                  /*
2386 2389                   * out of threads, so fall back to synchronous i/o.
2387 2390                   */
2388 2391                  if (sv_debug > 0) {
2389 2392                          cmn_err(CE_CONT,
2390 2393                              "!sv_lyr_strategy: thread alloc failed\n");
2391 2394                  }
2392 2395  
2393 2396                  DTRACE_PROBE1(sv_lyr_strategy_no_thread,
2394 2397                      struct buf *, bp);
2395 2398  
2396 2399                  _sv_lyr_strategy(bp);
2397 2400                  sv_no_threads++;
2398 2401          } else {
2399 2402                  nlive = nst_nlive(sv_tset);
2400 2403                  if (nlive > sv_max_nlive) {
2401 2404                          if (sv_debug > 0) {
2402 2405                                  cmn_err(CE_CONT,
2403 2406                                      "!sv_lyr_strategy: "
2404 2407                                      "new max nlive %d (nthread %d)\n",
2405 2408                                      nlive, nst_nthread(sv_tset));
2406 2409                          }
2407 2410  
2408 2411                          sv_max_nlive = nlive;
2409 2412                  }
2410 2413          }
2411 2414  
2412 2415          return (0);
2413 2416  }
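
The dispatch policy is simple: run the request on a pool thread when one is available, otherwise execute it synchronously in the caller rather than queueing or failing. A minimal pthread sketch of that fallback (work() is a placeholder body):

    /*
     * Sketch only: dispatch to a thread, fall back to inline execution.
     */
    #include <pthread.h>

    static void *
    work(void *arg)
    {
            /* ... the actual request ... */
            return (arg);
    }

    static void
    dispatch(void *arg)
    {
            pthread_t tid;

            if (pthread_create(&tid, NULL, work, arg) == 0) {
                    (void) pthread_detach(tid);
            } else {
                    (void) work(arg);       /* out of threads: do it inline */
            }
    }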
2414 2417  
2415 2418  
2416 2419  #ifndef offsetof
2417 2420  #define offsetof(s, m)  ((size_t)(&((s *)0)->m))
2418 2421  #endif
2419 2422  
2420 2423  /*
2421 2424   * re-write the size of the current partition
2422 2425   */
2423 2426  static int
2424 2427  sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
2425 2428  {
2426 2429          size_t offset;
2427 2430          int ilp32;
2428 2431          int pnum;
2429 2432          int rc;
2430 2433  
2431 2434          ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);
2432 2435  
2433 2436          rc = nskern_partition(svp->sv_dev, &pnum);
2434 2437          if (rc != 0) {
2435 2438                  return (rc);
2436 2439          }
2437 2440  
2438 2441          if (pnum < 0 || pnum >= V_NUMPAR) {
2439 2442                  cmn_err(CE_WARN,
2440 2443                      "!sv_gvtoc: unable to determine partition number "
2441 2444                      "for dev %lx", svp->sv_dev);
2442 2445                  return (EINVAL);
2443 2446          }
2444 2447  
2445 2448          if (ilp32) {
2446 2449                  int32_t p_size;
2447 2450  
2448 2451  #ifdef _SunOS_5_6
2449 2452                  offset = offsetof(struct vtoc, v_part);
2450 2453                  offset += sizeof (struct partition) * pnum;
2451 2454                  offset += offsetof(struct partition, p_size);
2452 2455  #else
2453 2456                  offset = offsetof(struct vtoc32, v_part);
2454 2457                  offset += sizeof (struct partition32) * pnum;
2455 2458                  offset += offsetof(struct partition32, p_size);
2456 2459  #endif
2457 2460  
2458 2461                  p_size = (int32_t)svp->sv_nblocks;
2459 2462                  if (p_size == 0) {
2460 2463                          if (sv_reserve(svp->sv_fd,
2461 2464                              NSC_MULTI|NSC_PCATCH) == 0) {
2462 2465                                  p_size = (int32_t)svp->sv_nblocks;
2463 2466                                  nsc_release(svp->sv_fd);
2464 2467                          } else {
2465 2468                                  rc = EINTR;
2466 2469                          }
2467 2470                  }
2468 2471  
2469 2472                  if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2470 2473                      sizeof (p_size), mode) != 0) {
2471 2474                          rc = EFAULT;
2472 2475                  }
2473 2476          } else {
2474 2477                  long p_size;
2475 2478  
2476 2479                  offset = offsetof(struct vtoc, v_part);
2477 2480                  offset += sizeof (struct partition) * pnum;
2478 2481                  offset += offsetof(struct partition, p_size);
2479 2482  
2480 2483                  p_size = (long)svp->sv_nblocks;
2481 2484                  if (p_size == 0) {
2482 2485                          if (sv_reserve(svp->sv_fd,
2483 2486                              NSC_MULTI|NSC_PCATCH) == 0) {
2484 2487                                  p_size = (long)svp->sv_nblocks;
2485 2488                                  nsc_release(svp->sv_fd);
2486 2489                          } else {
2487 2490                                  rc = EINTR;
2488 2491                          }
2489 2492                  }
2490 2493  
2491 2494                  if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
2492 2495                      sizeof (p_size), mode) != 0) {
2493 2496                          rc = EFAULT;
2494 2497                  }
2495 2498          }
2496 2499  
2497 2500          return (rc);
2498 2501  }
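
Rather than rewriting the whole vtoc, the fix-up computes the byte offset of the one field it owns, v_part[pnum].p_size, with offsetof() and sizeof (), and copies out just those bytes. A self-contained sketch of the technique, with toy structures standing in for struct vtoc and struct partition, and memcpy() standing in for ddi_copyout():

    /*
     * Sketch only: patch one field inside a caller's structure by
     * byte offset.
     */
    #include <stddef.h>
    #include <string.h>

    #define NUMPAR  8

    struct part { int tag; long start; long size; };
    struct toy_vtoc { int version; struct part v_part[NUMPAR]; };

    /* Overwrite v_part[pnum].size in the caller's buffer. */
    static void
    fix_psize(void *user_vtoc, int pnum, long nblocks)
    {
            size_t off = offsetof(struct toy_vtoc, v_part) +
                sizeof (struct part) * pnum +
                offsetof(struct part, size);

            /* stands in for ddi_copyout() of just this field */
            (void) memcpy((char *)user_vtoc + off, &nblocks,
                sizeof (nblocks));
    }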
2499 2502  
2500 2503  
2501 2504  #ifdef DKIOCPARTITION
2502 2505  /*
2503 2506   * re-write the size of the current partition
2504 2507   *
2505 2508   * arg is dk_efi_t.
2506 2509   *
2507 2510   * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
2508 2511   *
2509 2512   * dk_efi_t->dki_data --> efi_gpt_t (label header)
2510 2513   * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
2511 2514   *
2512 2515   * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
2513 2516   * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
2514 2517   *
2515 2518   * This assumes that sizeof (efi_gpt_t) is the same as the size of a
2516 2519   * logical block on the disk.
2517 2520   *
2518 2521   * Everything is little endian (i.e. disk format).
2519 2522   */
2520 2523  static int
2521 2524  sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
2522 2525  {
2523 2526          dk_efi_t efi;
2524 2527          efi_gpt_t gpt;
2525 2528          efi_gpe_t *gpe = NULL;
2526 2529          size_t sgpe;
2527 2530          uint64_t p_size;        /* virtual partition size from nsctl */
2528 2531          uint32_t crc;
2529 2532          int unparts;            /* number of parts in user's array */
2530 2533          int pnum;
2531 2534          int rc;
2532 2535  
2533 2536          rc = nskern_partition(svp->sv_dev, &pnum);
2534 2537          if (rc != 0) {
2535 2538                  return (rc);
2536 2539          }
2537 2540  
2538 2541          if (pnum < 0) {
2539 2542                  cmn_err(CE_WARN,
2540 2543                      "!sv_efi: unable to determine partition number for dev %lx",
2541 2544                      svp->sv_dev);
2542 2545                  return (EINVAL);
2543 2546          }
2544 2547  
2545 2548          if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
2546 2549                  return (EFAULT);
2547 2550          }
2548 2551  
2549 2552          efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
2550 2553  
2551 2554          if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
2552 2555                  return (EINVAL);
2553 2556          }
2554 2557  
2555 2558          if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
2556 2559                  rc = EFAULT;
2557 2560                  goto out;
2558 2561          }
2559 2562  
2560 2563          if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
2561 2564                  unparts = 1;
2562 2565          else if (pnum >= unparts) {
2563 2566                  cmn_err(CE_WARN,
2564 2567                      "!sv_efi: partition# beyond end of user array (%d >= %d)",
2565 2568                      pnum, unparts);
2566 2569                  return (EINVAL);
2567 2570          }
2568 2571  
2569 2572          sgpe = sizeof (*gpe) * unparts;
2570 2573          gpe = kmem_alloc(sgpe, KM_SLEEP);
2571 2574  
2572 2575          if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
2573 2576                  rc = EFAULT;
2574 2577                  goto out;
2575 2578          }
2576 2579  
2577 2580          p_size = svp->sv_nblocks;
2578 2581          if (p_size == 0) {
2579 2582                  if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2580 2583                          p_size = (diskaddr_t)svp->sv_nblocks;
2581 2584                          nsc_release(svp->sv_fd);
2582 2585                  } else {
2583 2586                          rc = EINTR;
2584 2587                  }
2585 2588          }
2586 2589  
2587 2590          gpe[pnum].efi_gpe_EndingLBA = LE_64(
2588 2591              LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);
2589 2592  
2590 2593          gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
2591 2594          CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
2592 2595          gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2593 2596  
2594 2597          gpt.efi_gpt_HeaderCRC32 = 0;
2595 2598          CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
2596 2599          gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2597 2600  
2598 2601          if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
2599 2602                  rc = EFAULT;
2600 2603                  goto out;
2601 2604          }
2602 2605  
2603 2606          if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
2604 2607                  rc = EFAULT;
2605 2608                  goto out;
2606 2609          }
2607 2610  
2608 2611  out:
2609 2612          if (gpe) {
2610 2613                  kmem_free(gpe, sgpe);
2611 2614          }
2612 2615  
2613 2616          return (rc);
2614 2617  }
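
Because the GPT is checksummed, editing a partition entry means re-sealing the label: recompute the CRC32 of the entry array, then zero the header's own CRC field and recompute it over the header. A minimal sketch of that re-sealing order, using a plain bitwise CRC32 and toy structures (the real code uses the kernel CRC32() macro with sv_crc32_table and stores the values little-endian; byte-order conversion is omitted here):

    /*
     * Sketch only: re-seal an edited GPT with fresh CRC32s.
     */
    #include <stdint.h>
    #include <stddef.h>

    static uint32_t
    crc32(const void *buf, size_t len, uint32_t crc)
    {
            const uint8_t *p = buf;
            int i;

            while (len--) {
                    crc ^= *p++;
                    for (i = 0; i < 8; i++)
                            crc = (crc >> 1) ^ (0xedb88320 & -(crc & 1));
            }
            return (crc);
    }

    struct toy_gpt {                        /* stands in for efi_gpt_t */
            uint32_t hdr_crc32;
            uint32_t nparts;
            uint32_t parts_crc32;
    };

    static void
    reseal(struct toy_gpt *gpt, void *gpe, size_t sgpe)
    {
            /* CRC of the (edited) partition entry array */
            gpt->parts_crc32 = ~crc32(gpe, sgpe, ~0U);

            /* header CRC is computed with its own CRC field zeroed */
            gpt->hdr_crc32 = 0;
            gpt->hdr_crc32 = ~crc32(gpt, sizeof (*gpt), ~0U);
    }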
2615 2618  
2616 2619  
2617 2620  /*
2618 2621   * Re-write the size of the partition specified by p_partno
2619 2622   *
2620 2623   * Note that if a DKIOCPARTITION is issued to an fd opened against a
2621 2624   * non-sv'd device, but p_partno requests the size for a different
2622 2625   * device that is sv'd, this function will *not* be called as sv is
2623 2626   * not interposed on the original device (the fd).
2624 2627   *
2625 2628   * It would not be easy to change this as we cannot get the partition
2626 2629   * number for the non-sv'd device, so cannot compute the dev_t of the
2627 2630   * (sv'd) p_partno device, and so cannot find out if it is sv'd or get
2628 2631   * its size from nsctl.
2629 2632   *
2630 2633   * See also the "Bug 4755783" comment in sv_lyr_ioctl().
2631 2634   */
2632 2635  static int
2633 2636  sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
2634 2637  {
2635 2638          struct partition64 p64;
2636 2639          sv_dev_t *nsvp = NULL;
2637 2640          diskaddr_t p_size;
2638 2641          minor_t nminor;
2639 2642          int pnum, rc;
2640 2643          dev_t ndev;
2641 2644  
2642 2645          rc = nskern_partition(svp->sv_dev, &pnum);
2643 2646          if (rc != 0) {
2644 2647                  return (rc);
2645 2648          }
2646 2649  
2647 2650          if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
2648 2651                  return (EFAULT);
2649 2652          }
2650 2653  
2651 2654          if (p64.p_partno != pnum) {
2652 2655                  /* switch to requested partition, not the current one */
2653 2656                  nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
2654 2657                  ndev = makedevice(getmajor(svp->sv_dev), nminor);
2655 2658                  nsvp = sv_find_enabled(ndev, NULL);
2656 2659                  if (nsvp == NULL) {
2657 2660                          /* not sv device - just return */
2658 2661                          return (0);
2659 2662                  }
2660 2663  
2661 2664                  svp = nsvp;
2662 2665          }
2663 2666  
2664 2667          p_size = svp->sv_nblocks;
2665 2668          if (p_size == 0) {
2666 2669                  if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
2667 2670                          p_size = (diskaddr_t)svp->sv_nblocks;
2668 2671                          nsc_release(svp->sv_fd);
2669 2672                  } else {
2670 2673                          rc = EINTR;
2671 2674                  }
2672 2675          }
2673 2676  
2674 2677          if (nsvp != NULL) {
2675 2678                  rw_exit(&nsvp->sv_lock);
2676 2679          }
2677 2680  
2678 2681          if ((rc == 0) && ddi_copyout(&p_size,
2679 2682              (void *)(arg + offsetof(struct partition64, p_size)),
2680 2683              sizeof (p_size), mode) != 0) {
2681 2684                  return (EFAULT);
2682 2685          }
2683 2686  
2684 2687          return (rc);
2685 2688  }
2686 2689  #endif /* DKIOCPARTITION */
2687 2690  
2688 2691  
2689 2692  static int
2690 2693  sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
2691 2694      const int mode, cred_t *crp, int *rvalp)
2692 2695  {
2693 2696          sv_dev_t *svp;
2694 2697          sv_maj_t *maj;
2695 2698          int (*fn)();
2696 2699          int rc = 0;
2697 2700  
2698 2701          maj = 0;
2699 2702          fn = 0;
2700 2703  
2701 2704          /*
2702 2705           * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, continue as
2703 2706           * normal.  Otherwise it was SV_PREVENT_UNLOAD and is now
2704 2707           * SV_ALLOW_UNLOAD, and the driver is expected to unload.
2705 2708           *
2706 2709           * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
2707 2710           */
2708 2711          if (sv_mod_status == SV_ALLOW_UNLOAD) {
2709 2712                  return (EBUSY);
2710 2713          }
2711 2714  
2712 2715          svp = sv_find_enabled(dev, &maj);
2713 2716          if (svp != NULL) {
2714 2717                  if (nskernd_isdaemon()) {
2715 2718                          /*
2716 2719                           * This is nskernd which always needs to see
2717 2720                           * the underlying disk device accurately.
2718 2721                           *
2719 2722                           * So just pass the ioctl straight through
2720 2723                           * to the underlying driver as though the device
2721 2724                           * was not sv enabled.
2722 2725                           */
2723 2726                          DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
2724 2727                              dev_t, dev);
2725 2728  
2726 2729                          rw_exit(&svp->sv_lock);
2727 2730                          svp = NULL;
2728 2731                  } else {
2729 2732                          ASSERT(RW_READ_HELD(&svp->sv_lock));
2730 2733                  }
2731 2734          }
2732 2735  
2733 2736          /*
2734 2737           * We now have a locked and enabled SV device, or a non-SV device.
2735 2738           */
2736 2739  
2737 2740          switch (cmd) {
2738 2741                  /*
2739 2742                   * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
2740 2743                   * and DKIOCSETEFI are intercepted and faked up, as some
2741 2744                   * I/O providers emulate volumes of a different size from
2742 2745                   * the underlying volume.
2743 2746                   *
2744 2747                   * Setting the size by rewriting the vtoc is not permitted.
2745 2748                   */
2746 2749  
2747 2750          case DKIOCSVTOC:
2748 2751  #ifdef DKIOCPARTITION
2749 2752          case DKIOCSETEFI:
2750 2753  #endif
2751 2754                  if (svp == NULL) {
2752 2755                          /* not intercepted -- allow ioctl through */
2753 2756                          break;
2754 2757                  }
2755 2758  
2756 2759                  rw_exit(&svp->sv_lock);
2757 2760  
2758 2761                  DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);
2759 2762  
2760 2763                  return (EPERM);
2761 2764  
2762 2765          default:
2763 2766                  break;
2764 2767          }
2765 2768  
2766 2769          /*
2767 2770           * Pass through the real ioctl command.
2768 2771           */
2769 2772  
2770 2773          if (maj && (fn = maj->sm_ioctl) != 0) {
2771 2774                  if (!(maj->sm_flag & D_MP)) {
2772 2775                          UNSAFE_ENTER();
2773 2776                          rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2774 2777                          UNSAFE_EXIT();
2775 2778                  } else {
2776 2779                          rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
2777 2780                  }
2778 2781          } else {
2779 2782                  rc = ENODEV;
2780 2783          }
2781 2784  
2782 2785          /*
2783 2786           * Bug 4755783
2784 2787           * Fix up the size of the current partition to allow
2785 2788           * for the virtual volume to be a different size from the
2786 2789           * physical volume (e.g. for II compact dependent shadows).
2787 2790           *
2788 2791           * Note that this only attempts to fix up the current partition
2789 2792           * - the one that the ioctl was issued against.  There could be
2790 2793           * other sv'd partitions in the same vtoc, but we cannot tell
2791 2794           * so we don't attempt to fix them up.
2792 2795           */
2793 2796  
2794 2797          if (svp != NULL && rc == 0) {
2795 2798                  switch (cmd) {
2796 2799                  case DKIOCGVTOC:
2797 2800                          rc = sv_fix_dkiocgvtoc(arg, mode, svp);
2798 2801                          break;
2799 2802  
2800 2803  #ifdef DKIOCPARTITION
2801 2804                  case DKIOCGETEFI:
2802 2805                          rc = sv_fix_dkiocgetefi(arg, mode, svp);
2803 2806                          break;
2804 2807  
2805 2808                  case DKIOCPARTITION:
2806 2809                          rc = sv_fix_dkiocpartition(arg, mode, svp);
2807 2810                          break;
2808 2811  #endif /* DKIOCPARTITION */
2809 2812                  }
2810 2813          }
2811 2814  
2812 2815          if (svp != NULL) {
2813 2816                  rw_exit(&svp->sv_lock);
2814 2817          }
2815 2818  
2816 2819          return (rc);
2817 2820  }
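
Editor's note: the pass-through path above serializes calls into drivers that are not flagged D_MP, bracketing the call with UNSAFE_ENTER()/UNSAFE_EXIT() so a non-MP-safe driver is never entered concurrently. The following user-land sketch models that convention with a single pthread mutex; it is not part of the webrev, and the flag value, struct, and function names are stand-ins, not the kernel's.

	#include <pthread.h>
	#include <stddef.h>
	#include <stdio.h>

	#define	D_MP	0x1			/* stand-in "MP-safe" flag */

	static pthread_mutex_t unsafe_lock = PTHREAD_MUTEX_INITIALIZER;

	struct drv_ops {			/* stand-in for sv_maj_t */
		int	flags;
		int	(*ioctl)(int);
	};

	static int
	pass_through(struct drv_ops *ops, int cmd)
	{
		int rc;

		if (ops->ioctl == NULL)
			return (-1);		/* models the ENODEV path */

		if (!(ops->flags & D_MP)) {
			/* serialize entry into a non-MP-safe driver */
			(void) pthread_mutex_lock(&unsafe_lock);
			rc = ops->ioctl(cmd);
			(void) pthread_mutex_unlock(&unsafe_lock);
		} else {
			rc = ops->ioctl(cmd);
		}

		return (rc);
	}

	static int
	dummy_ioctl(int cmd)
	{
		return (printf("ioctl cmd %d\n", cmd) < 0);
	}

	int
	main(void)
	{
		struct drv_ops ops = { 0, dummy_ioctl };	/* not D_MP */

		return (pass_through(&ops, 42));
	}
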
  
[ 1568 lines elided ]