11083 support NFS server in zone
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Portions contributed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Portions contributed by: Mike Zeller <mike@mikezeller.net>
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Portions contributed by: Vitaliy Gusev <gusev.vitaliy@gmail.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Jason King <jbk@joyent.com>
Reviewed by: C Fraire <cfraire@me.com>
Change-Id: I22f289d357503f9b48a0bc2482cc4328a6d43d16
    
      
    
    
--- old/usr/src/uts/common/fs/nfs/nfs_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs_client.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  
  
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
       21 +
  21   22  /*
  22   23   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - *
  24      - *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
       24 + */
       25 +
       26 +/*
       27 + *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  25   28   *      All rights reserved.
  26   29   */
  27   30  
       31 +/*
       32 + * Copyright 2018 Nexenta Systems, Inc.
       33 + */
       34 +
  28   35  #include <sys/param.h>
  29   36  #include <sys/types.h>
  30   37  #include <sys/systm.h>
  31   38  #include <sys/thread.h>
  32   39  #include <sys/t_lock.h>
  33   40  #include <sys/time.h>
  34   41  #include <sys/vnode.h>
  35   42  #include <sys/vfs.h>
  36   43  #include <sys/errno.h>
  37   44  #include <sys/buf.h>
  38   45  #include <sys/stat.h>
  39   46  #include <sys/cred.h>
  40   47  #include <sys/kmem.h>
  41   48  #include <sys/debug.h>
  42   49  #include <sys/dnlc.h>
  43   50  #include <sys/vmsystm.h>
  44   51  #include <sys/flock.h>
  45   52  #include <sys/share.h>
  46   53  #include <sys/cmn_err.h>
  47   54  #include <sys/tiuser.h>
  48   55  #include <sys/sysmacros.h>
  49   56  #include <sys/callb.h>
  50   57  #include <sys/acl.h>
  51   58  #include <sys/kstat.h>
  52   59  #include <sys/signal.h>
  
  
  53   60  #include <sys/list.h>
  54   61  #include <sys/zone.h>
  55   62  
  56   63  #include <rpc/types.h>
  57   64  #include <rpc/xdr.h>
  58   65  #include <rpc/auth.h>
  59   66  #include <rpc/clnt.h>
  60   67  
  61   68  #include <nfs/nfs.h>
  62   69  #include <nfs/nfs_clnt.h>
       70 +#include <nfs/nfs_cmd.h>
  63   71  
  64   72  #include <nfs/rnode.h>
  65   73  #include <nfs/nfs_acl.h>
  66   74  #include <nfs/lm.h>
  67   75  
  68   76  #include <vm/hat.h>
  69   77  #include <vm/as.h>
  70   78  #include <vm/page.h>
  71   79  #include <vm/pvn.h>
  72   80  #include <vm/seg.h>
  73   81  #include <vm/seg_map.h>
  74   82  #include <vm/seg_vn.h>
  75   83  
  76   84  static void     nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
  77   85                          cred_t *);
  78   86  static int      nfs_getattr_cache(vnode_t *, struct vattr *);
  79   87  static int      nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
  80   88  
  81   89  struct mi_globals {
  82   90          kmutex_t        mig_lock;  /* lock protecting mig_list */
  83   91          list_t          mig_list;  /* list of NFS v2 or v3 mounts in zone */
  84   92          boolean_t       mig_destructor_called;
  85   93  };
  86   94  
  87   95  static zone_key_t mi_list_key;
  88   96  
  89   97  /* Debugging flag for PC file shares. */
  90   98  extern int      share_debug;
  91   99  
  92  100  /*
  93  101   * Attributes caching:
  94  102   *
  95  103   * Attributes are cached in the rnode in struct vattr form.
  96  104   * There is a time associated with the cached attributes (r_attrtime)
  97  105   * which tells whether the attributes are valid. The time is initialized
  98  106   * to the difference between current time and the modify time of the vnode
  99  107   * when new attributes are cached. This allows the attributes for
 100  108   * files that have changed recently to be timed out sooner than for files
 101  109   * that have not changed for a long time. There are minimum and maximum
 102  110   * timeout values that can be set per mount point.
 103  111   */
 104  112  
 105  113  int
 106  114  nfs_waitfor_purge_complete(vnode_t *vp)
 107  115  {
 108  116          rnode_t *rp;
 109  117          k_sigset_t smask;
 110  118  
 111  119          rp = VTOR(vp);
 112  120          if (rp->r_serial != NULL && rp->r_serial != curthread) {
 113  121                  mutex_enter(&rp->r_statelock);
 114  122                  sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
 115  123                  while (rp->r_serial != NULL) {
 116  124                          if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 117  125                                  sigunintr(&smask);
 118  126                                  mutex_exit(&rp->r_statelock);
 119  127                                  return (EINTR);
 120  128                          }
 121  129                  }
 122  130                  sigunintr(&smask);
 123  131                  mutex_exit(&rp->r_statelock);
 124  132          }
 125  133          return (0);
 126  134  }
 127  135  
 128  136  /*
 129  137   * Validate caches by checking cached attributes. If the cached
 130  138   * attributes have timed out, then get new attributes from the server.
 131  139   * As a side effect, this will do cache invalidation if the attributes
 132  140   * have changed.
 133  141   *
 134  142   * If the attributes have not timed out and if there is a cache
 135  143   * invalidation being done by some other thread, then wait until that
 136  144   * thread has completed the cache invalidation.
 137  145   */
 138  146  int
 139  147  nfs_validate_caches(vnode_t *vp, cred_t *cr)
 140  148  {
 141  149          int error;
 142  150          struct vattr va;
 143  151  
 144  152          if (ATTRCACHE_VALID(vp)) {
 145  153                  error = nfs_waitfor_purge_complete(vp);
 146  154                  if (error)
 147  155                          return (error);
 148  156                  return (0);
 149  157          }
 150  158  
 151  159          va.va_mask = AT_ALL;
 152  160          return (nfs_getattr_otw(vp, &va, cr));
 153  161  }
 154  162  
 155  163  /*
 156  164   * Validate caches by checking cached attributes. If the cached
 157  165   * attributes have timed out, then get new attributes from the server.
 158  166   * As a side effect, this will do cache invalidation if the attributes
 159  167   * have changed.
 160  168   *
 161  169   * If the attributes have not timed out and if there is a cache
 162  170   * invalidation being done by some other thread, then wait until that
 163  171   * thread has completed the cache invalidation.
 164  172   */
 165  173  int
 166  174  nfs3_validate_caches(vnode_t *vp, cred_t *cr)
 167  175  {
 168  176          int error;
 169  177          struct vattr va;
 170  178  
 171  179          if (ATTRCACHE_VALID(vp)) {
 172  180                  error = nfs_waitfor_purge_complete(vp);
 173  181                  if (error)
 174  182                          return (error);
 175  183                  return (0);
 176  184          }
 177  185  
 178  186          va.va_mask = AT_ALL;
 179  187          return (nfs3_getattr_otw(vp, &va, cr));
 180  188  }
 181  189  
 182  190  /*
 183  191   * Purge all of the various NFS `data' caches.
 184  192   */
 185  193  void
 186  194  nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
 187  195  {
 188  196          rnode_t *rp;
 189  197          char *contents;
 190  198          int size;
 191  199          int error;
 192  200  
 193  201          /*
 194  202           * Purge the DNLC for any entries which refer to this file.
 195  203           * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
 196  204           */
 197  205          rp = VTOR(vp);
 198  206          mutex_enter(&rp->r_statelock);
 199  207          if (vp->v_count > 1 &&
 200  208              (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
 201  209              !(rp->r_flags & RINDNLCPURGE)) {
 202  210                  /*
 203  211                   * Set the RINDNLCPURGE flag to prevent recursive entry
 204  212                   * into dnlc_purge_vp()
 205  213                   */
 206  214                  if (vp->v_type == VDIR)
 207  215                          rp->r_flags |= RINDNLCPURGE;
 208  216                  mutex_exit(&rp->r_statelock);
 209  217                  dnlc_purge_vp(vp);
 210  218                  mutex_enter(&rp->r_statelock);
 211  219                  if (rp->r_flags & RINDNLCPURGE)
 212  220                          rp->r_flags &= ~RINDNLCPURGE;
 213  221          }
 214  222  
 215  223          /*
 216  224           * Clear any readdir state bits and purge the readlink response cache.
 217  225           */
 218  226          contents = rp->r_symlink.contents;
 219  227          size = rp->r_symlink.size;
 220  228          rp->r_symlink.contents = NULL;
 221  229          mutex_exit(&rp->r_statelock);
 222  230  
 223  231          if (contents != NULL) {
 224  232  
 225  233                  kmem_free((void *)contents, size);
 226  234          }
 227  235  
 228  236          /*
 229  237           * Flush the page cache.
 230  238           */
 231  239          if (vn_has_cached_data(vp)) {
 232  240                  error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
 233  241                  if (error && (error == ENOSPC || error == EDQUOT)) {
 234  242                          mutex_enter(&rp->r_statelock);
 235  243                          if (!rp->r_error)
 236  244                                  rp->r_error = error;
 237  245                          mutex_exit(&rp->r_statelock);
 238  246                  }
 239  247          }
 240  248  
 241  249          /*
 242  250           * Flush the readdir response cache.
 243  251           */
 244  252          if (HAVE_RDDIR_CACHE(rp))
 245  253                  nfs_purge_rddir_cache(vp);
 246  254  }
 247  255  
 248  256  /*
 249  257   * Purge the readdir cache of all entries
 250  258   */
 251  259  void
 252  260  nfs_purge_rddir_cache(vnode_t *vp)
 253  261  {
 254  262          rnode_t *rp;
 255  263          rddir_cache *rdc;
 256  264          rddir_cache *nrdc;
 257  265  
 258  266          rp = VTOR(vp);
 259  267  top:
 260  268          mutex_enter(&rp->r_statelock);
 261  269          rp->r_direof = NULL;
 262  270          rp->r_flags &= ~RLOOKUP;
 263  271          rp->r_flags |= RREADDIRPLUS;
 264  272          rdc = avl_first(&rp->r_dir);
 265  273          while (rdc != NULL) {
 266  274                  nrdc = AVL_NEXT(&rp->r_dir, rdc);
 267  275                  avl_remove(&rp->r_dir, rdc);
 268  276                  rddir_cache_rele(rdc);
 269  277                  rdc = nrdc;
 270  278          }
 271  279          mutex_exit(&rp->r_statelock);
 272  280  }
 273  281  
 274  282  /*
 275  283   * Do a cache check based on the post-operation attributes.
 276  284   * Then make them the new cached attributes.  If no attributes
 277  285   * were returned, then mark the attributes as timed out.
 278  286   */
 279  287  void
 280  288  nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
 281  289  {
 282  290          vattr_t attr;
 283  291  
 284  292          if (!poap->attributes) {
 285  293                  PURGE_ATTRCACHE(vp);
 286  294                  return;
 287  295          }
 288  296          (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
 289  297  }
 290  298  
 291  299  /*
 292  300   * Same as above, but using a vattr
 293  301   */
 294  302  void
 295  303  nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
 296  304      cred_t *cr)
 297  305  {
 298  306          if (!poap->attributes) {
 299  307                  PURGE_ATTRCACHE(vp);
 300  308                  return;
 301  309          }
 302  310          nfs_attr_cache(vp, poap->fres.vap, t, cr);
 303  311  }
 304  312  
 305  313  /*
 306  314   * Do a cache check based on the weak cache consistency attributes.
 307  315   * These consist of a small set of pre-operation attributes and the
 308  316   * full set of post-operation attributes.
 309  317   *
 310  318   * If we are given the pre-operation attributes, then use them to
 311  319   * check the validity of the various caches.  Then, if we got the
 312  320   * post-operation attributes, make them the new cached attributes.
 313  321   * If we didn't get the post-operation attributes, then mark the
 314  322   * attribute cache as timed out so that the next reference will
 315  323   * cause a GETATTR to the server to refresh with the current
 316  324   * attributes.
 317  325   *
 318  326   * Otherwise, if we didn't get the pre-operation attributes, but
 319  327   * we did get the post-operation attributes, then use these
 320  328   * attributes to check the validity of the various caches.  This
 321  329   * will probably cause a flush of the caches because if the
 322  330   * operation succeeded, the attributes of the object were changed
 323  331   * in some way from the old post-operation attributes.  This
 324  332   * should be okay because it is the safe thing to do.  After
 325  333   * checking the data caches, then we make these the new cached
 326  334   * attributes.
 327  335   *
 328  336   * Otherwise, we didn't get either the pre- or post-operation
 329  337   * attributes.  Simply mark the attribute cache as timed out so
 330  338   * the next reference will cause a GETATTR to the server to
 331  339   * refresh with the current attributes.
 332  340   *
 333  341   * If an error occurred trying to convert the over the wire
 334  342   * attributes to a vattr, then simply mark the attribute cache as
 335  343   * timed out.
 336  344   */
 337  345  void
 338  346  nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
 339  347  {
 340  348          vattr_t bva;
 341  349          vattr_t ava;
 342  350  
 343  351          if (wccp->after.attributes) {
 344  352                  if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
 345  353                          PURGE_ATTRCACHE(vp);
 346  354                          return;
 347  355                  }
 348  356                  if (wccp->before.attributes) {
 349  357                          bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
 350  358                          bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
 351  359                          bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
 352  360                          bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
 353  361                          bva.va_size = wccp->before.attr.size;
 354  362                          nfs3_attr_cache(vp, &bva, &ava, t, cr);
 355  363                  } else
 356  364                          nfs_attr_cache(vp, &ava, t, cr);
 357  365          } else {
 358  366                  PURGE_ATTRCACHE(vp);
 359  367          }
 360  368  }
 361  369  
 362  370  /*
 363  371   * Set attributes cache for given vnode using nfsattr.
 364  372   *
 365  373   * This routine does not do cache validation with the attributes.
 366  374   *
 367  375   * If an error occurred trying to convert the over the wire
 368  376   * attributes to a vattr, then simply mark the attribute cache as
 369  377   * timed out.
 370  378   */
 371  379  void
 372  380  nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
 373  381  {
 374  382          rnode_t *rp;
 375  383          struct vattr va;
 376  384  
 377  385          if (!nattr_to_vattr(vp, na, &va)) {
 378  386                  rp = VTOR(vp);
 379  387                  mutex_enter(&rp->r_statelock);
 380  388                  if (rp->r_mtime <= t)
 381  389                          nfs_attrcache_va(vp, &va);
 382  390                  mutex_exit(&rp->r_statelock);
 383  391          } else {
 384  392                  PURGE_ATTRCACHE(vp);
 385  393          }
 386  394  }
 387  395  
 388  396  /*
 389  397   * Set attributes cache for given vnode using fattr3.
 390  398   *
 391  399   * This routine does not do cache validation with the attributes.
 392  400   *
 393  401   * If an error occurred trying to convert the over the wire
 394  402   * attributes to a vattr, then simply mark the attribute cache as
 395  403   * timed out.
 396  404   */
 397  405  void
 398  406  nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
 399  407  {
 400  408          rnode_t *rp;
 401  409          struct vattr va;
 402  410  
 403  411          if (!fattr3_to_vattr(vp, na, &va)) {
 404  412                  rp = VTOR(vp);
 405  413                  mutex_enter(&rp->r_statelock);
 406  414                  if (rp->r_mtime <= t)
 407  415                          nfs_attrcache_va(vp, &va);
 408  416                  mutex_exit(&rp->r_statelock);
 409  417          } else {
 410  418                  PURGE_ATTRCACHE(vp);
 411  419          }
 412  420  }
 413  421  
 414  422  /*
 415  423   * Do a cache check based on attributes returned over the wire.  The
 416  424   * new attributes are cached.
 417  425   *
 418  426   * If an error occurred trying to convert the over the wire attributes
 419  427   * to a vattr, then just return that error.
 420  428   *
 421  429   * As a side effect, the vattr argument is filled in with the converted
 422  430   * attributes.
 423  431   */
 424  432  int
 425  433  nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
 426  434      cred_t *cr)
 427  435  {
 428  436          int error;
 429  437  
 430  438          error = nattr_to_vattr(vp, na, vap);
 431  439          if (error)
 432  440                  return (error);
 433  441          nfs_attr_cache(vp, vap, t, cr);
 434  442          return (0);
 435  443  }
 436  444  
 437  445  /*
 438  446   * Do a cache check based on attributes returned over the wire.  The
 439  447   * new attributes are cached.
 440  448   *
 441  449   * If an error occurred trying to convert the over the wire attributes
 442  450   * to a vattr, then just return that error.
 443  451   *
 444  452   * As a side effect, the vattr argument is filled in with the converted
 445  453   * attributes.
 446  454   */
 447  455  int
 448  456  nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
 449  457  {
 450  458          int error;
 451  459  
 452  460          error = fattr3_to_vattr(vp, na, vap);
 453  461          if (error)
 454  462                  return (error);
 455  463          nfs_attr_cache(vp, vap, t, cr);
 456  464          return (0);
 457  465  }
 458  466  
 459  467  /*
 460  468   * Use the passed in virtual attributes to check to see whether the
 461  469   * data and metadata caches are valid, cache the new attributes, and
 462  470   * then do the cache invalidation if required.
 463  471   *
 464  472   * The cache validation and caching of the new attributes is done
 465  473   * atomically via the use of the mutex, r_statelock.  If required,
 466  474   * the cache invalidation is done atomically w.r.t. the cache
 467  475   * validation and caching of the attributes via the pseudo lock,
 468  476   * r_serial.
 469  477   *
 470  478   * This routine is used to do cache validation and attributes caching
 471  479   * for operations with a single set of post operation attributes.
 472  480   */
 473  481  void
 474  482  nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
 475  483  {
 476  484          rnode_t *rp;
 477  485          int mtime_changed = 0;
 478  486          int ctime_changed = 0;
 479  487          vsecattr_t *vsp;
 480  488          int was_serial;
 481  489          len_t preattr_rsize;
 482  490          boolean_t writeattr_set = B_FALSE;
 483  491          boolean_t cachepurge_set = B_FALSE;
 484  492  
 485  493          rp = VTOR(vp);
 486  494  
 487  495          mutex_enter(&rp->r_statelock);
 488  496  
 489  497          if (rp->r_serial != curthread) {
 490  498                  klwp_t *lwp = ttolwp(curthread);
 491  499  
 492  500                  was_serial = 0;
 493  501                  if (lwp != NULL)
 494  502                          lwp->lwp_nostop++;
 495  503                  while (rp->r_serial != NULL) {
 496  504                          if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 497  505                                  mutex_exit(&rp->r_statelock);
 498  506                                  if (lwp != NULL)
 499  507                                          lwp->lwp_nostop--;
 500  508                                  return;
 501  509                          }
 502  510                  }
 503  511                  if (lwp != NULL)
 504  512                          lwp->lwp_nostop--;
 505  513          } else
 506  514                  was_serial = 1;
 507  515  
 508  516          if (rp->r_mtime > t) {
 509  517                  if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
 510  518                          PURGE_ATTRCACHE_LOCKED(rp);
 511  519                  mutex_exit(&rp->r_statelock);
 512  520                  return;
 513  521          }
 514  522  
 515  523          /*
 516  524           * Write thread after writing data to file on remote server,
 517  525           * will always set RWRITEATTR to indicate that file on remote
 518  526           * server was modified with a WRITE operation and would have
 519  527           * marked attribute cache as timed out. If RWRITEATTR
 520  528           * is set, then do not check for mtime and ctime change.
 521  529           */
 522  530          if (!(rp->r_flags & RWRITEATTR)) {
 523  531                  if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
 524  532                          mtime_changed = 1;
 525  533  
 526  534                  if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
 527  535                      rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
 528  536                          ctime_changed = 1;
 529  537          } else {
 530  538                  writeattr_set = B_TRUE;
 531  539          }
 532  540  
 533  541          preattr_rsize = rp->r_size;
 534  542  
 535  543          nfs_attrcache_va(vp, vap);
 536  544  
 537  545          /*
 538  546           * If we have updated filesize in nfs_attrcache_va, as soon as we
 539  547           * drop statelock we will be in transition of purging all
 540  548           * our caches and updating them. It is possible for another
 541  549           * thread to pick this new file size and read in zeroed data.
 542  550           * stall other threads till cache purge is complete.
 543  551           */
 544  552          if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
 545  553                  /*
 546  554                   * If RWRITEATTR was set and we have updated the file
 547  555                   * size, Server's returned file size need not necessarily
 548  556                   * be because of this Client's WRITE. We need to purge
 549  557                   * all caches.
 550  558                   */
 551  559                  if (writeattr_set)
 552  560                          mtime_changed = 1;
 553  561  
 554  562                  if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
 555  563                          rp->r_flags |= RINCACHEPURGE;
 556  564                          cachepurge_set = B_TRUE;
 557  565                  }
 558  566          }
 559  567  
 560  568          if (!mtime_changed && !ctime_changed) {
 561  569                  mutex_exit(&rp->r_statelock);
 562  570                  return;
 563  571          }
 564  572  
 565  573          rp->r_serial = curthread;
 566  574  
 567  575          mutex_exit(&rp->r_statelock);
 568  576  
 569  577          if (mtime_changed)
 570  578                  nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
 571  579  
 572  580          if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
 573  581                  mutex_enter(&rp->r_statelock);
 574  582                  rp->r_flags &= ~RINCACHEPURGE;
 575  583                  cv_broadcast(&rp->r_cv);
 576  584                  mutex_exit(&rp->r_statelock);
 577  585                  cachepurge_set = B_FALSE;
 578  586          }
 579  587  
 580  588          if (ctime_changed) {
 581  589                  (void) nfs_access_purge_rp(rp);
 582  590                  if (rp->r_secattr != NULL) {
 583  591                          mutex_enter(&rp->r_statelock);
 584  592                          vsp = rp->r_secattr;
 585  593                          rp->r_secattr = NULL;
 586  594                          mutex_exit(&rp->r_statelock);
 587  595                          if (vsp != NULL)
 588  596                                  nfs_acl_free(vsp);
 589  597                  }
 590  598          }
 591  599  
 592  600          if (!was_serial) {
 593  601                  mutex_enter(&rp->r_statelock);
 594  602                  rp->r_serial = NULL;
 595  603                  cv_broadcast(&rp->r_cv);
 596  604                  mutex_exit(&rp->r_statelock);
 597  605          }
 598  606  }
 599  607  
 600  608  /*
 601  609   * Use the passed in "before" virtual attributes to check to see
 602  610   * whether the data and metadata caches are valid, cache the "after"
 603  611   * new attributes, and then do the cache invalidation if required.
 604  612   *
 605  613   * The cache validation and caching of the new attributes is done
 606  614   * atomically via the use of the mutex, r_statelock.  If required,
 607  615   * the cache invalidation is done atomically w.r.t. the cache
 608  616   * validation and caching of the attributes via the pseudo lock,
 609  617   * r_serial.
 610  618   *
 611  619   * This routine is used to do cache validation and attributes caching
 612  620   * for operations with both pre operation attributes and post operation
 613  621   * attributes.
 614  622   */
 615  623  static void
 616  624  nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
 617  625      cred_t *cr)
 618  626  {
 619  627          rnode_t *rp;
 620  628          int mtime_changed = 0;
 621  629          int ctime_changed = 0;
 622  630          vsecattr_t *vsp;
 623  631          int was_serial;
 624  632          len_t preattr_rsize;
 625  633          boolean_t writeattr_set = B_FALSE;
 626  634          boolean_t cachepurge_set = B_FALSE;
 627  635  
 628  636          rp = VTOR(vp);
 629  637  
 630  638          mutex_enter(&rp->r_statelock);
 631  639  
 632  640          if (rp->r_serial != curthread) {
 633  641                  klwp_t *lwp = ttolwp(curthread);
 634  642  
 635  643                  was_serial = 0;
 636  644                  if (lwp != NULL)
 637  645                          lwp->lwp_nostop++;
 638  646                  while (rp->r_serial != NULL) {
 639  647                          if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 640  648                                  mutex_exit(&rp->r_statelock);
 641  649                                  if (lwp != NULL)
 642  650                                          lwp->lwp_nostop--;
 643  651                                  return;
 644  652                          }
 645  653                  }
 646  654                  if (lwp != NULL)
 647  655                          lwp->lwp_nostop--;
 648  656          } else
 649  657                  was_serial = 1;
 650  658  
 651  659          if (rp->r_mtime > t) {
 652  660                  if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
 653  661                          PURGE_ATTRCACHE_LOCKED(rp);
 654  662                  mutex_exit(&rp->r_statelock);
 655  663                  return;
 656  664          }
 657  665  
 658  666          /*
 659  667           * Write thread after writing data to file on remote server,
 660  668           * will always set RWRITEATTR to indicate that file on remote
 661  669           * server was modified with a WRITE operation and would have
 662  670           * marked attribute cache as timed out. If RWRITEATTR
 663  671           * is set, then do not check for mtime and ctime change.
 664  672           */
 665  673          if (!(rp->r_flags & RWRITEATTR)) {
 666  674                  if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
 667  675                          mtime_changed = 1;
 668  676  
 669  677                  if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
 670  678                      rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
 671  679                          ctime_changed = 1;
 672  680          } else {
 673  681                  writeattr_set = B_TRUE;
 674  682          }
 675  683  
 676  684          preattr_rsize = rp->r_size;
 677  685  
 678  686          nfs_attrcache_va(vp, avap);
 679  687  
 680  688          /*
 681  689           * If we have updated filesize in nfs_attrcache_va, as soon as we
 682  690           * drop statelock we will be in transition of purging all
 683  691           * our caches and updating them. It is possible for another
 684  692           * thread to pick this new file size and read in zeroed data.
 685  693           * stall other threads till cache purge is complete.
 686  694           */
 687  695          if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
 688  696                  /*
 689  697                   * If RWRITEATTR was set and we have updated the file
 690  698                   * size, Server's returned file size need not necessarily
 691  699                   * be because of this Client's WRITE. We need to purge
 692  700                   * all caches.
 693  701                   */
 694  702                  if (writeattr_set)
 695  703                          mtime_changed = 1;
 696  704  
 697  705                  if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
 698  706                          rp->r_flags |= RINCACHEPURGE;
 699  707                          cachepurge_set = B_TRUE;
 700  708                  }
 701  709          }
 702  710  
 703  711          if (!mtime_changed && !ctime_changed) {
 704  712                  mutex_exit(&rp->r_statelock);
 705  713                  return;
 706  714          }
 707  715  
 708  716          rp->r_serial = curthread;
 709  717  
 710  718          mutex_exit(&rp->r_statelock);
 711  719  
 712  720          if (mtime_changed)
 713  721                  nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
 714  722  
 715  723          if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
 716  724                  mutex_enter(&rp->r_statelock);
 717  725                  rp->r_flags &= ~RINCACHEPURGE;
 718  726                  cv_broadcast(&rp->r_cv);
 719  727                  mutex_exit(&rp->r_statelock);
 720  728                  cachepurge_set = B_FALSE;
 721  729          }
 722  730  
 723  731          if (ctime_changed) {
 724  732                  (void) nfs_access_purge_rp(rp);
 725  733                  if (rp->r_secattr != NULL) {
 726  734                          mutex_enter(&rp->r_statelock);
 727  735                          vsp = rp->r_secattr;
 728  736                          rp->r_secattr = NULL;
 729  737                          mutex_exit(&rp->r_statelock);
 730  738                          if (vsp != NULL)
 731  739                                  nfs_acl_free(vsp);
 732  740                  }
 733  741          }
 734  742  
 735  743          if (!was_serial) {
 736  744                  mutex_enter(&rp->r_statelock);
 737  745                  rp->r_serial = NULL;
 738  746                  cv_broadcast(&rp->r_cv);
 739  747                  mutex_exit(&rp->r_statelock);
 740  748          }
 741  749  }
 742  750  
 743  751  /*
 744  752   * Set attributes cache for given vnode using virtual attributes.
 745  753   *
 746  754   * Set the timeout value on the attribute cache and fill it
 747  755   * with the passed in attributes.
 748  756   *
 749  757   * The caller must be holding r_statelock.
 750  758   */
 751  759  void
 752  760  nfs_attrcache_va(vnode_t *vp, struct vattr *va)
 753  761  {
 754  762          rnode_t *rp;
 755  763          mntinfo_t *mi;
 756  764          hrtime_t delta;
 757  765          hrtime_t now;
 758  766  
 759  767          rp = VTOR(vp);
 760  768  
 761  769          ASSERT(MUTEX_HELD(&rp->r_statelock));
 762  770  
 763  771          now = gethrtime();
 764  772  
 765  773          mi = VTOMI(vp);
 766  774  
 767  775          /*
 768  776           * Delta is the number of nanoseconds that we will
 769  777           * cache the attributes of the file.  It is based on
 770  778           * the number of nanoseconds since the last time that
 771  779           * we detected a change.  The assumption is that files
 772  780           * that changed recently are likely to change again.
 773  781           * There is a minimum and a maximum for regular files
 774  782           * and for directories which is enforced though.
 775  783           *
 776  784           * Using the time since last change was detected
 777  785           * eliminates direct comparison or calculation
 778  786           * using mixed client and server times.  NFS does
 779  787           * not make any assumptions regarding the client
 780  788           * and server clocks being synchronized.
 781  789           */
 782  790          if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
 783  791              va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
 784  792              va->va_size != rp->r_attr.va_size)
 785  793                  rp->r_mtime = now;
 786  794  
 787  795          if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
 788  796                  delta = 0;
 789  797          else {
 790  798                  delta = now - rp->r_mtime;
 791  799                  if (vp->v_type == VDIR) {
 792  800                          if (delta < mi->mi_acdirmin)
 793  801                                  delta = mi->mi_acdirmin;
 794  802                          else if (delta > mi->mi_acdirmax)
 795  803                                  delta = mi->mi_acdirmax;
 796  804                  } else {
 797  805                          if (delta < mi->mi_acregmin)
 798  806                                  delta = mi->mi_acregmin;
 799  807                          else if (delta > mi->mi_acregmax)
 800  808                                  delta = mi->mi_acregmax;
 801  809                  }
 802  810          }
 803  811          rp->r_attrtime = now + delta;
 804  812          rp->r_attr = *va;
 805  813          /*
 806  814           * Update the size of the file if there is no cached data or if
 807  815           * the cached data is clean and there is no data being written
 808  816           * out.
 809  817           */
 810  818          if (rp->r_size != va->va_size &&
 811  819              (!vn_has_cached_data(vp) ||
 812  820              (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
 813  821                  rp->r_size = va->va_size;
 814  822          nfs_setswaplike(vp, va);
 815  823          rp->r_flags &= ~RWRITEATTR;
 816  824  }
 817  825  
 818  826  /*
 819  827   * Fill in attribute from the cache.
 820  828   * If valid, then return 0 to indicate that no error occurred,
 821  829   * otherwise return 1 to indicate that an error occurred.
 822  830   */
 823  831  static int
 824  832  nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
 825  833  {
 826  834          rnode_t *rp;
 827  835          uint_t mask = vap->va_mask;
 828  836  
 829  837          rp = VTOR(vp);
 830  838          mutex_enter(&rp->r_statelock);
 831  839          if (ATTRCACHE_VALID(vp)) {
 832  840                  /*
 833  841                   * Cached attributes are valid
 834  842                   */
 835  843                  *vap = rp->r_attr;
 836  844                  /*
 837  845                   * Set the caller's va_mask to the set of attributes
 838  846                   * that were requested ANDed with the attributes that
 839  847                   * are available.  If attributes were requested that
 840  848                   * are not available, those bits must be turned off
 841  849                   * in the callers va_mask.
 842  850                   */
 843  851                  vap->va_mask &= mask;
 844  852                  mutex_exit(&rp->r_statelock);
 845  853                  return (0);
 846  854          }
 847  855          mutex_exit(&rp->r_statelock);
 848  856          return (1);
 849  857  }
 850  858  
 851  859  /*
 852  860   * Get attributes over-the-wire and update attributes cache
 853  861   * if no error occurred in the over-the-wire operation.
 854  862   * Return 0 if successful, otherwise error.
 855  863   */
 856  864  int
 857  865  nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
 858  866  {
 859  867          int error;
 860  868          struct nfsattrstat ns;
 861  869          int douprintf;
 862  870          mntinfo_t *mi;
 863  871          failinfo_t fi;
 864  872          hrtime_t t;
 865  873  
 866  874          mi = VTOMI(vp);
 867  875          fi.vp = vp;
 868  876          fi.fhp = NULL;          /* no need to update, filehandle not copied */
 869  877          fi.copyproc = nfscopyfh;
 870  878          fi.lookupproc = nfslookup;
 871  879          fi.xattrdirproc = acl_getxattrdir2;
 872  880  
 873  881          if (mi->mi_flags & MI_ACL) {
 874  882                  error = acl_getattr2_otw(vp, vap, cr);
 875  883                  if (mi->mi_flags & MI_ACL)
 876  884                          return (error);
 877  885          }
 878  886  
 879  887          douprintf = 1;
 880  888  
 881  889          t = gethrtime();
 882  890  
 883  891          error = rfs2call(mi, RFS_GETATTR,
 884  892              xdr_fhandle, (caddr_t)VTOFH(vp),
 885  893              xdr_attrstat, (caddr_t)&ns, cr,
 886  894              &douprintf, &ns.ns_status, 0, &fi);
 887  895  
 888  896          if (!error) {
 889  897                  error = geterrno(ns.ns_status);
 890  898                  if (!error)
 891  899                          error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
 892  900                  else {
 893  901                          PURGE_STALE_FH(error, vp, cr);
 894  902                  }
 895  903          }
 896  904  
 897  905          return (error);
 898  906  }
 899  907  
 900  908  /*
 901  909   * Return either cached or remote attributes. If get remote attr
 902  910   * use them to check and invalidate caches, then cache the new attributes.
 903  911   */
 904  912  int
 905  913  nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
 906  914  {
 907  915          int error;
 908  916          rnode_t *rp;
 909  917  
 910  918          /*
 911  919           * If we've got cached attributes, we're done, otherwise go
 912  920           * to the server to get attributes, which will update the cache
 913  921           * in the process.
 914  922           */
 915  923          error = nfs_getattr_cache(vp, vap);
 916  924          if (error)
 917  925                  error = nfs_getattr_otw(vp, vap, cr);
 918  926  
 919  927          /* Return the client's view of file size */
 920  928          rp = VTOR(vp);
 921  929          mutex_enter(&rp->r_statelock);
 922  930          vap->va_size = rp->r_size;
 923  931          mutex_exit(&rp->r_statelock);
 924  932  
 925  933          return (error);
 926  934  }
 927  935  
 928  936  /*
 929  937   * Get attributes over-the-wire and update attributes cache
 930  938   * if no error occurred in the over-the-wire operation.
 931  939   * Return 0 if successful, otherwise error.
 932  940   */
 933  941  int
 934  942  nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
 935  943  {
 936  944          int error;
 937  945          GETATTR3args args;
 938  946          GETATTR3vres res;
 939  947          int douprintf;
 940  948          failinfo_t fi;
 941  949          hrtime_t t;
 942  950  
 943  951          args.object = *VTOFH3(vp);
 944  952          fi.vp = vp;
 945  953          fi.fhp = (caddr_t)&args.object;
 946  954          fi.copyproc = nfs3copyfh;
 947  955          fi.lookupproc = nfs3lookup;
 948  956          fi.xattrdirproc = acl_getxattrdir3;
 949  957          res.fres.vp = vp;
 950  958          res.fres.vap = vap;
 951  959  
 952  960          douprintf = 1;
 953  961  
 954  962          t = gethrtime();
 955  963  
 956  964          error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
 957  965              xdr_nfs_fh3, (caddr_t)&args,
 958  966              xdr_GETATTR3vres, (caddr_t)&res, cr,
 959  967              &douprintf, &res.status, 0, &fi);
 960  968  
 961  969          if (error)
 962  970                  return (error);
 963  971  
 964  972          error = geterrno3(res.status);
 965  973          if (error) {
 966  974                  PURGE_STALE_FH(error, vp, cr);
 967  975                  return (error);
 968  976          }
 969  977  
 970  978          /*
 971  979           * Catch status codes that indicate fattr3 to vattr translation failure
 972  980           */
 973  981          if (res.fres.status)
 974  982                  return (res.fres.status);
 975  983  
 976  984          nfs_attr_cache(vp, vap, t, cr);
 977  985          return (0);
 978  986  }
 979  987  
 980  988  /*
 981  989   * Return either cached or remote attributes. If get remote attr
 982  990   * use them to check and invalidate caches, then cache the new attributes.
 983  991   */
 984  992  int
 985  993  nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
 986  994  {
 987  995          int error;
 988  996          rnode_t *rp;
 989  997  
 990  998          /*
 991  999           * If we've got cached attributes, we're done, otherwise go
 992 1000           * to the server to get attributes, which will update the cache
 993 1001           * in the process.
 994 1002           */
 995 1003          error = nfs_getattr_cache(vp, vap);
 996 1004          if (error)
 997 1005                  error = nfs3_getattr_otw(vp, vap, cr);
 998 1006  
 999 1007          /* Return the client's view of file size */
1000 1008          rp = VTOR(vp);
1001 1009          mutex_enter(&rp->r_statelock);
1002 1010          vap->va_size = rp->r_size;
1003 1011          mutex_exit(&rp->r_statelock);
1004 1012  
1005 1013          return (error);
1006 1014  }
1007 1015  
1008 1016  vtype_t nf_to_vt[] = {
1009 1017          VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1010 1018  };
1011 1019  /*
1012 1020   * Convert NFS Version 2 over the network attributes to the local
1013 1021   * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1014 1022   * network representation and the local representation is done here.
1015 1023   * Returns 0 for success, error if failed due to overflow.
1016 1024   */
1017 1025  int
1018 1026  nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1019 1027  {
1020 1028          /* overflow in time attributes? */
1021 1029  #ifndef _LP64
1022 1030          if (!NFS2_FATTR_TIME_OK(na))
1023 1031                  return (EOVERFLOW);
1024 1032  #endif
1025 1033  
1026 1034          vap->va_mask = AT_ALL;
1027 1035  
1028 1036          if (na->na_type < NFNON || na->na_type > NFSOC)
1029 1037                  vap->va_type = VBAD;
1030 1038          else
1031 1039                  vap->va_type = nf_to_vt[na->na_type];
1032 1040          vap->va_mode = na->na_mode;
1033 1041          vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1034 1042          vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1035 1043          vap->va_fsid = vp->v_vfsp->vfs_dev;
1036 1044          vap->va_nodeid = na->na_nodeid;
1037 1045          vap->va_nlink = na->na_nlink;
1038 1046          vap->va_size = na->na_size;     /* keep for cache validation */
1039 1047          /*
1040 1048           * nfs protocol defines times as unsigned so don't extend sign,
1041 1049           * unless sysadmin set nfs_allow_preepoch_time.
1042 1050           */
1043 1051          NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1044 1052          vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1045 1053          NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1046 1054          vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1047 1055          NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1048 1056          vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1049 1057          /*
1050 1058           * Shannon's law - uncompress the received dev_t
1051 1059           * if the top half of it is zero indicating a response
1052 1060           * from an `older style' OS. Except for when it is a
1053 1061           * `new style' OS sending the maj device of zero,
1054 1062           * in which case the algorithm still works because the
1055 1063           * fact that it is a new style server
1056 1064           * is hidden by the minor device not being greater
1057 1065           * than 255 (a requirement in this case).
1058 1066           */
1059 1067          if ((na->na_rdev & 0xffff0000) == 0)
1060 1068                  vap->va_rdev = nfsv2_expdev(na->na_rdev);
1061 1069          else
1062 1070                  vap->va_rdev = expldev(na->na_rdev);
1063 1071  
1064 1072          vap->va_nblocks = na->na_blocks;
1065 1073          switch (na->na_type) {
1066 1074          case NFBLK:
1067 1075                  vap->va_blksize = DEV_BSIZE;
1068 1076                  break;
1069 1077  
1070 1078          case NFCHR:
1071 1079                  vap->va_blksize = MAXBSIZE;
1072 1080                  break;
1073 1081  
1074 1082          case NFSOC:
1075 1083          default:
1076 1084                  vap->va_blksize = na->na_blocksize;
1077 1085                  break;
1078 1086          }
1079 1087          /*
1080 1088           * This bit of ugliness is a hack to preserve the
1081 1089           * over-the-wire protocols for named-pipe vnodes.
1082 1090           * It remaps the special over-the-wire type to the
1083 1091           * VFIFO type. (see note in nfs.h)
1084 1092           */
1085 1093          if (NA_ISFIFO(na)) {
1086 1094                  vap->va_type = VFIFO;
1087 1095                  vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1088 1096                  vap->va_rdev = 0;
1089 1097                  vap->va_blksize = na->na_blocksize;
1090 1098          }
1091 1099          vap->va_seq = 0;
1092 1100          return (0);
1093 1101  }
1094 1102  
1095 1103  /*
1096 1104   * Convert NFS Version 3 over the network attributes to the local
1097 1105   * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1098 1106   * network representation and the local representation is done here.
1099 1107   */
1100 1108  vtype_t nf3_to_vt[] = {
1101 1109          VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1102 1110  };
1103 1111  
1104 1112  int
1105 1113  fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1106 1114  {
1107 1115  
1108 1116  #ifndef _LP64
1109 1117          /* overflow in time attributes? */
1110 1118          if (!NFS3_FATTR_TIME_OK(na))
1111 1119                  return (EOVERFLOW);
1112 1120  #endif
1113 1121          if (!NFS3_SIZE_OK(na->size))
1114 1122                  /* file too big */
1115 1123                  return (EFBIG);
1116 1124  
1117 1125          vap->va_mask = AT_ALL;
1118 1126  
1119 1127          if (na->type < NF3REG || na->type > NF3FIFO)
1120 1128                  vap->va_type = VBAD;
1121 1129          else
1122 1130                  vap->va_type = nf3_to_vt[na->type];
1123 1131          vap->va_mode = na->mode;
1124 1132          vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1125 1133          vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1126 1134          vap->va_fsid = vp->v_vfsp->vfs_dev;
1127 1135          vap->va_nodeid = na->fileid;
1128 1136          vap->va_nlink = na->nlink;
1129 1137          vap->va_size = na->size;
1130 1138  
1131 1139          /*
1132 1140           * nfs protocol defines times as unsigned so don't extend sign,
1133 1141           * unless sysadmin set nfs_allow_preepoch_time.
1134 1142           */
1135 1143          NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1136 1144          vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1137 1145          NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1138 1146          vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1139 1147          NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1140 1148          vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1141 1149  
1142 1150          switch (na->type) {
1143 1151          case NF3BLK:
1144 1152                  vap->va_rdev = makedevice(na->rdev.specdata1,
1145 1153                      na->rdev.specdata2);
1146 1154                  vap->va_blksize = DEV_BSIZE;
1147 1155                  vap->va_nblocks = 0;
1148 1156                  break;
1149 1157          case NF3CHR:
1150 1158                  vap->va_rdev = makedevice(na->rdev.specdata1,
1151 1159                      na->rdev.specdata2);
1152 1160                  vap->va_blksize = MAXBSIZE;
1153 1161                  vap->va_nblocks = 0;
1154 1162                  break;
1155 1163          case NF3REG:
1156 1164          case NF3DIR:
1157 1165          case NF3LNK:
1158 1166                  vap->va_rdev = 0;
1159 1167                  vap->va_blksize = MAXBSIZE;
1160 1168                  vap->va_nblocks = (u_longlong_t)
1161 1169                      ((na->used + (size3)DEV_BSIZE - (size3)1) /
1162 1170                      (size3)DEV_BSIZE);
1163 1171                  break;
1164 1172          case NF3SOCK:
1165 1173          case NF3FIFO:
1166 1174          default:
1167 1175                  vap->va_rdev = 0;
1168 1176                  vap->va_blksize = MAXBSIZE;
1169 1177                  vap->va_nblocks = 0;
1170 1178                  break;
1171 1179          }
1172 1180          vap->va_seq = 0;
1173 1181          return (0);
1174 1182  }
1175 1183  
1176 1184  /*
1177 1185   * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1178 1186   * for the demand-based allocation of async threads per-mount.  The
1179 1187   * nfs_async_timeout is the amount of time a thread will live after it
1180 1188   * becomes idle, unless new I/O requests are received before the thread
1181 1189   * dies.  See nfs_async_putpage and nfs_async_start.
1182 1190   */
1183 1191  
1184 1192  int nfs_async_timeout = -1;     /* uninitialized */
1185 1193  
1186 1194  static void     nfs_async_start(struct vfs *);
1187 1195  static void     nfs_async_pgops_start(struct vfs *);
1188 1196  static void     nfs_async_common_start(struct vfs *, int);
1189 1197  
1190 1198  static void
1191 1199  free_async_args(struct nfs_async_reqs *args)
1192 1200  {
1193 1201          rnode_t *rp;
1194 1202  
1195 1203          if (args->a_io != NFS_INACTIVE) {
1196 1204                  rp = VTOR(args->a_vp);
1197 1205                  mutex_enter(&rp->r_statelock);
1198 1206                  rp->r_count--;
1199 1207                  if (args->a_io == NFS_PUTAPAGE ||
1200 1208                      args->a_io == NFS_PAGEIO)
1201 1209                          rp->r_awcount--;
1202 1210                  cv_broadcast(&rp->r_cv);
1203 1211                  mutex_exit(&rp->r_statelock);
1204 1212                  VN_RELE(args->a_vp);
1205 1213          }
1206 1214          crfree(args->a_cred);
1207 1215          kmem_free(args, sizeof (*args));
1208 1216  }
1209 1217  
1210 1218  /*
1211 1219   * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1212 1220   * pageout(), running in the global zone, have legitimate reasons to do
1213 1221   * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1214 1222   * use of a per-mount "asynchronous requests manager thread" which is
1215 1223   * signaled by the various asynchronous work routines when there is
1216 1224   * asynchronous work to be done.  It is responsible for creating new
1217 1225   * worker threads if necessary, and notifying existing worker threads
1218 1226   * that there is work to be done.
1219 1227   *
1220 1228   * In other words, it will "take the specifications from the customers and
1221 1229   * give them to the engineers."
1222 1230   *
1223 1231   * Worker threads die off of their own accord if they are no longer
1224 1232   * needed.
1225 1233   *
1226 1234   * This thread is killed when the zone is going away or the filesystem
1227 1235   * is being unmounted.
1228 1236   */
1229 1237  void
1230 1238  nfs_async_manager(vfs_t *vfsp)
1231 1239  {
1232 1240          callb_cpr_t cprinfo;
1233 1241          mntinfo_t *mi;
1234 1242          uint_t max_threads;
1235 1243  
1236 1244          mi = VFTOMI(vfsp);
1237 1245  
1238 1246          CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1239 1247              "nfs_async_manager");
1240 1248  
1241 1249          mutex_enter(&mi->mi_async_lock);
1242 1250          /*
1243 1251           * We want to stash the max number of threads that this mount was
1244 1252           * allowed so we can use it later when the variable is set to zero as
1245 1253           * part of the zone/mount going away.
1246 1254           *
1247 1255           * We want to be able to create at least one thread to handle
1248 1256           * asynchronous inactive calls.
1249 1257           */
1250 1258          max_threads = MAX(mi->mi_max_threads, 1);
1251 1259          /*
1252 1260           * We don't want to wait for mi_max_threads to go to zero, since that
1253 1261           * happens as part of a failed unmount, but this thread should only
1254 1262           * exit when the mount/zone is really going away.
1255 1263           *
1256 1264           * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1257 1265           * attempted: the various _async_*() functions know to do things
1258 1266           * inline if mi_max_threads == 0.  Henceforth we just drain out the
1259 1267           * outstanding requests.
1260 1268           *
1261 1269           * Note that we still create zthreads even if we notice the zone is
1262 1270           * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1263 1271           * shutdown sequence to take slightly longer in some cases, but
1264 1272           * doesn't violate the protocol, as all threads will exit as soon as
1265 1273           * they're done processing the remaining requests.
1266 1274           */
1267 1275          for (;;) {
1268 1276                  while (mi->mi_async_req_count > 0) {
1269 1277                          /*
1270 1278                           * Paranoia: If the mount started out having
1271 1279                           * (mi->mi_max_threads == 0), and the value was
1272 1280                           * later changed (via a debugger or somesuch),
1273 1281                           * we could be confused since we will think we
1274 1282                           * can't create any threads, and the calling
1275 1283                           * code (which looks at the current value of
1276 1284                           * mi->mi_max_threads, now non-zero) thinks we
1277 1285                           * can.
1278 1286                           *
1279 1287                           * So, because we're paranoid, we create threads
1280 1288                           * up to the maximum of the original and the
1281 1289                           * current value. This means that future
1282 1290                           * (debugger-induced) lowerings of
1283 1291                           * mi->mi_max_threads are ignored for our
1284 1292                           * purposes, but who told them they could change
1285 1293                           * random values on a live kernel anyhow?
1286 1294                           */
1287 1295                          if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1288 1296                              MAX(mi->mi_max_threads, max_threads)) {
1289 1297                                  mi->mi_threads[NFS_ASYNC_QUEUE]++;
1290 1298                                  mutex_exit(&mi->mi_async_lock);
1291 1299                                  VFS_HOLD(vfsp); /* hold for new thread */
1292 1300                                  (void) zthread_create(NULL, 0, nfs_async_start,
1293 1301                                      vfsp, 0, minclsyspri);
1294 1302                                  mutex_enter(&mi->mi_async_lock);
1295 1303                          } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1296 1304                              NUM_ASYNC_PGOPS_THREADS) {
1297 1305                                  mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1298 1306                                  mutex_exit(&mi->mi_async_lock);
1299 1307                                  VFS_HOLD(vfsp); /* hold for new thread */
1300 1308                                  (void) zthread_create(NULL, 0,
1301 1309                                      nfs_async_pgops_start, vfsp, 0,
1302 1310                                      minclsyspri);
1303 1311                                  mutex_enter(&mi->mi_async_lock);
1304 1312                          }
1305 1313                          NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1306 1314                          ASSERT(mi->mi_async_req_count != 0);
1307 1315                          mi->mi_async_req_count--;
1308 1316                  }
1309 1317  
1310 1318                  mutex_enter(&mi->mi_lock);
1311 1319                  if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1312 1320                          mutex_exit(&mi->mi_lock);
1313 1321                          break;
1314 1322                  }
1315 1323                  mutex_exit(&mi->mi_lock);
1316 1324  
1317 1325                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
1318 1326                  cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1319 1327                  CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1320 1328          }
1321 1329          /*
1322 1330           * Let everyone know we're done.
1323 1331           */
1324 1332          mi->mi_manager_thread = NULL;
1325 1333          cv_broadcast(&mi->mi_async_cv);
1326 1334  
1327 1335          /*
1328 1336           * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1329 1337           * since CALLB_CPR_EXIT is actually responsible for releasing
1330 1338           * 'mi_async_lock'.
1331 1339           */
1332 1340          CALLB_CPR_EXIT(&cprinfo);
1333 1341          VFS_RELE(vfsp); /* release thread's hold */
1334 1342          zthread_exit();
1335 1343  }
1336 1344  
1337 1345  /*
1338 1346   * Signal (and wait for) the async manager thread to clean up and go away.
1339 1347   */
1340 1348  void
1341 1349  nfs_async_manager_stop(vfs_t *vfsp)
1342 1350  {
1343 1351          mntinfo_t *mi = VFTOMI(vfsp);
1344 1352  
1345 1353          mutex_enter(&mi->mi_async_lock);
1346 1354          mutex_enter(&mi->mi_lock);
1347 1355          mi->mi_flags |= MI_ASYNC_MGR_STOP;
1348 1356          mutex_exit(&mi->mi_lock);
1349 1357          cv_broadcast(&mi->mi_async_reqs_cv);
1350 1358          while (mi->mi_manager_thread != NULL)
1351 1359                  cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1352 1360          mutex_exit(&mi->mi_async_lock);
1353 1361  }
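
As an aside, the manager/stop pair above is an instance of a generic dispatcher pattern: a thread that sleeps on a condition variable, spawns workers on demand up to a cap, keeps draining requests after a stop has been requested, and announces its own exit so the stopper can wait for it. Below is a minimal user-space sketch of that pattern using POSIX threads; the kernel's zthread, CPR and VFS_HOLD machinery is deliberately omitted, and the names (struct mgr_state, manager(), worker(), manager_stop()) are invented for illustration rather than taken from the source.

#include <pthread.h>
#include <stdbool.h>

struct mgr_state {
        pthread_mutex_t lock;
        pthread_cond_t  reqs_cv;        /* analogue of mi_async_reqs_cv */
        pthread_cond_t  work_cv;        /* analogue of mi_async_work_cv */
        pthread_cond_t  exit_cv;        /* analogue of mi_async_cv */
        unsigned        req_count;      /* analogue of mi_async_req_count */
        unsigned        nthreads;       /* analogue of mi_threads[] */
        unsigned        max_threads;    /* analogue of mi_max_threads */
        bool            stop;           /* analogue of MI_ASYNC_MGR_STOP */
        bool            mgr_running;    /* analogue of mi_manager_thread */
};

static void *
worker(void *arg)
{
        struct mgr_state *s = arg;

        /* A real worker would pull queued requests and service them here. */
        pthread_mutex_lock(&s->lock);
        s->nthreads--;
        pthread_mutex_unlock(&s->lock);
        return (NULL);
}

static void *
manager(void *arg)
{
        struct mgr_state *s = arg;
        unsigned cap;

        pthread_mutex_lock(&s->lock);
        /* Stash the original cap; always allow at least one worker. */
        cap = (s->max_threads > 0) ? s->max_threads : 1;
        for (;;) {
                while (s->req_count > 0) {
                        if (s->nthreads < cap) {
                                pthread_t t;

                                s->nthreads++;
                                pthread_mutex_unlock(&s->lock);
                                (void) pthread_create(&t, NULL, worker, s);
                                (void) pthread_detach(t);
                                pthread_mutex_lock(&s->lock);
                        }
                        pthread_cond_signal(&s->work_cv);
                        s->req_count--;
                }
                if (s->stop)
                        break;
                pthread_cond_wait(&s->reqs_cv, &s->lock);
        }
        s->mgr_running = false;
        pthread_cond_broadcast(&s->exit_cv);
        pthread_mutex_unlock(&s->lock);
        return (NULL);
}

/* Analogue of nfs_async_manager_stop(): flag, wake, wait for the exit. */
static void
manager_stop(struct mgr_state *s)
{
        pthread_mutex_lock(&s->lock);
        s->stop = true;
        pthread_cond_broadcast(&s->reqs_cv);
        while (s->mgr_running)
                pthread_cond_wait(&s->exit_cv, &s->lock);
        pthread_mutex_unlock(&s->lock);
}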
1354 1362  
1355 1363  int
1356 1364  nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1357 1365      struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1358 1366      u_offset_t, caddr_t, struct seg *, cred_t *))
1359 1367  {
1360 1368          rnode_t *rp;
1361 1369          mntinfo_t *mi;
1362 1370          struct nfs_async_reqs *args;
1363 1371  
1364 1372          rp = VTOR(vp);
1365 1373          ASSERT(rp->r_freef == NULL);
1366 1374  
1367 1375          mi = VTOMI(vp);
1368 1376  
1369 1377          /*
1370 1378           * If addr falls in a different segment, don't bother doing readahead.
1371 1379           */
1372 1380          if (addr >= seg->s_base + seg->s_size)
1373 1381                  return (-1);
1374 1382  
1375 1383          /*
1376 1384           * If we can't allocate a request structure, punt on the readahead.
1377 1385           */
1378 1386          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1379 1387                  return (-1);
1380 1388  
1381 1389          /*
1382 1390           * If a lock operation is pending, don't initiate any new
1383 1391           * readaheads.  Otherwise, bump r_count to indicate the new
1384 1392           * asynchronous I/O.
1385 1393           */
1386 1394          if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1387 1395                  kmem_free(args, sizeof (*args));
1388 1396                  return (-1);
1389 1397          }
1390 1398          mutex_enter(&rp->r_statelock);
1391 1399          rp->r_count++;
1392 1400          mutex_exit(&rp->r_statelock);
1393 1401          nfs_rw_exit(&rp->r_lkserlock);
1394 1402  
1395 1403          args->a_next = NULL;
1396 1404  #ifdef DEBUG
1397 1405          args->a_queuer = curthread;
1398 1406  #endif
1399 1407          VN_HOLD(vp);
1400 1408          args->a_vp = vp;
1401 1409          ASSERT(cr != NULL);
1402 1410          crhold(cr);
1403 1411          args->a_cred = cr;
1404 1412          args->a_io = NFS_READ_AHEAD;
1405 1413          args->a_nfs_readahead = readahead;
1406 1414          args->a_nfs_blkoff = blkoff;
1407 1415          args->a_nfs_seg = seg;
1408 1416          args->a_nfs_addr = addr;
1409 1417  
1410 1418          mutex_enter(&mi->mi_async_lock);
1411 1419  
1412 1420          /*
1413 1421           * If asyncio has been disabled, don't bother with readahead.
1414 1422           */
1415 1423          if (mi->mi_max_threads == 0) {
1416 1424                  mutex_exit(&mi->mi_async_lock);
1417 1425                  goto noasync;
1418 1426          }
1419 1427  
1420 1428          /*
1421 1429           * Link request structure into the async list and
1422 1430           * wakeup async thread to do the i/o.
1423 1431           */
1424 1432          if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1425 1433                  mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1426 1434                  mi->mi_async_tail[NFS_READ_AHEAD] = args;
1427 1435          } else {
1428 1436                  mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1429 1437                  mi->mi_async_tail[NFS_READ_AHEAD] = args;
1430 1438          }
1431 1439  
1432 1440          if (mi->mi_io_kstats) {
1433 1441                  mutex_enter(&mi->mi_lock);
1434 1442                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1435 1443                  mutex_exit(&mi->mi_lock);
1436 1444          }
1437 1445  
1438 1446          mi->mi_async_req_count++;
1439 1447          ASSERT(mi->mi_async_req_count != 0);
1440 1448          cv_signal(&mi->mi_async_reqs_cv);
1441 1449          mutex_exit(&mi->mi_async_lock);
1442 1450          return (0);
1443 1451  
1444 1452  noasync:
1445 1453          mutex_enter(&rp->r_statelock);
1446 1454          rp->r_count--;
1447 1455          cv_broadcast(&rp->r_cv);
1448 1456          mutex_exit(&rp->r_statelock);
1449 1457          VN_RELE(vp);
1450 1458          crfree(cr);
1451 1459          kmem_free(args, sizeof (*args));
1452 1460          return (-1);
1453 1461  }
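
The producer side above repeats almost verbatim in nfs_async_putapage(), nfs_async_pageio(), nfs_async_readdir() and nfs_async_commit(): allocate the request without sleeping, fall back to the synchronous path if that fails or if async I/O was disabled in the meantime, otherwise append to the per-type tail, bump the request count and poke the manager. A hypothetical user-space rendering of just that enqueue-or-fall-back step (struct req, struct req_queue and enqueue_or_fallback() are invented names):

#include <pthread.h>
#include <stdlib.h>

struct req {
        struct req *next;
        void (*fn)(void *);             /* the deferred operation */
        void *arg;
};

struct req_queue {
        pthread_mutex_t lock;
        pthread_cond_t reqs_cv;         /* wakes the manager thread */
        struct req *head, *tail;
        unsigned req_count;
        unsigned max_threads;           /* 0 means async I/O is disabled */
};

/*
 * Returns 0 if the request was queued for a worker; -1 means the
 * caller must perform the operation synchronously itself.
 */
static int
enqueue_or_fallback(struct req_queue *q, void (*fn)(void *), void *arg)
{
        struct req *r = malloc(sizeof (*r));

        if (r == NULL)
                return (-1);            /* no memory: go synchronous */

        r->next = NULL;
        r->fn = fn;
        r->arg = arg;

        pthread_mutex_lock(&q->lock);
        if (q->max_threads == 0) {      /* async disabled: go synchronous */
                pthread_mutex_unlock(&q->lock);
                free(r);
                return (-1);
        }
        if (q->head == NULL)
                q->head = r;
        else
                q->tail->next = r;
        q->tail = r;
        q->req_count++;
        pthread_cond_signal(&q->reqs_cv);
        pthread_mutex_unlock(&q->lock);
        return (0);
}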
1454 1462  
1455 1463  int
1456 1464  nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1457 1465      int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1458 1466      u_offset_t, size_t, int, cred_t *))
1459 1467  {
1460 1468          rnode_t *rp;
1461 1469          mntinfo_t *mi;
1462 1470          struct nfs_async_reqs *args;
1463 1471  
1464 1472          ASSERT(flags & B_ASYNC);
1465 1473          ASSERT(vp->v_vfsp != NULL);
1466 1474  
1467 1475          rp = VTOR(vp);
1468 1476          ASSERT(rp->r_count > 0);
1469 1477  
1470 1478          mi = VTOMI(vp);
1471 1479  
1472 1480          /*
1473 1481           * If we can't allocate a request structure, do the putpage
1474 1482           * operation synchronously in this thread's context.
1475 1483           */
1476 1484          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1477 1485                  goto noasync;
1478 1486  
1479 1487          args->a_next = NULL;
1480 1488  #ifdef DEBUG
1481 1489          args->a_queuer = curthread;
1482 1490  #endif
1483 1491          VN_HOLD(vp);
1484 1492          args->a_vp = vp;
1485 1493          ASSERT(cr != NULL);
1486 1494          crhold(cr);
1487 1495          args->a_cred = cr;
1488 1496          args->a_io = NFS_PUTAPAGE;
1489 1497          args->a_nfs_putapage = putapage;
1490 1498          args->a_nfs_pp = pp;
1491 1499          args->a_nfs_off = off;
1492 1500          args->a_nfs_len = (uint_t)len;
1493 1501          args->a_nfs_flags = flags;
1494 1502  
1495 1503          mutex_enter(&mi->mi_async_lock);
1496 1504  
1497 1505          /*
1498 1506           * If asyncio has been disabled, then make a synchronous request.
1499 1507           * This check is done a second time in case async io was disabled
1500 1508           * while this thread was blocked waiting for memory pressure to
1501 1509           * reduce or for the queue to drain.
1502 1510           */
1503 1511          if (mi->mi_max_threads == 0) {
1504 1512                  mutex_exit(&mi->mi_async_lock);
1505 1513                  goto noasync;
1506 1514          }
1507 1515  
1508 1516          /*
1509 1517           * Link request structure into the async list and
1510 1518           * wakeup async thread to do the i/o.
1511 1519           */
1512 1520          if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1513 1521                  mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1514 1522                  mi->mi_async_tail[NFS_PUTAPAGE] = args;
1515 1523          } else {
1516 1524                  mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1517 1525                  mi->mi_async_tail[NFS_PUTAPAGE] = args;
1518 1526          }
1519 1527  
1520 1528          mutex_enter(&rp->r_statelock);
1521 1529          rp->r_count++;
1522 1530          rp->r_awcount++;
1523 1531          mutex_exit(&rp->r_statelock);
1524 1532  
1525 1533          if (mi->mi_io_kstats) {
1526 1534                  mutex_enter(&mi->mi_lock);
1527 1535                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1528 1536                  mutex_exit(&mi->mi_lock);
1529 1537          }
1530 1538  
1531 1539          mi->mi_async_req_count++;
1532 1540          ASSERT(mi->mi_async_req_count != 0);
1533 1541          cv_signal(&mi->mi_async_reqs_cv);
1534 1542          mutex_exit(&mi->mi_async_lock);
1535 1543          return (0);
1536 1544  
1537 1545  noasync:
1538 1546          if (args != NULL) {
1539 1547                  VN_RELE(vp);
1540 1548                  crfree(cr);
1541 1549                  kmem_free(args, sizeof (*args));
1542 1550          }
1543 1551  
1544 1552          if (curproc == proc_pageout || curproc == proc_fsflush) {
1545 1553                  /*
1546 1554                   * If we get here in the context of the pageout/fsflush,
1547 1555                   * we refuse to do a sync write, because this may hang
1548 1556                   * pageout (and the machine). In this case, we just
1549 1557                   * re-mark the page as dirty and punt on the page.
1550 1558                   *
1551 1559                   * Make sure B_FORCE isn't set.  We can re-mark the
1552 1560                   * pages as dirty and unlock the pages in one swoop by
1553 1561                   * passing in B_ERROR to pvn_write_done().  However,
1554 1562                   * we should make sure B_FORCE isn't set - we don't
1555 1563                   * want the page tossed before it gets written out.
1556 1564                   */
1557 1565                  if (flags & B_FORCE)
1558 1566                          flags &= ~(B_INVAL | B_FORCE);
1559 1567                  pvn_write_done(pp, flags | B_ERROR);
1560 1568                  return (0);
1561 1569          }
1562 1570          if (nfs_zone() != mi->mi_zone) {
1563 1571                  /*
1564 1572                   * So this was a cross-zone sync putpage.  We pass in B_ERROR
1565 1573                   * to pvn_write_done() to re-mark the pages as dirty and unlock
1566 1574                   * them.
1567 1575                   *
1568 1576                   * We don't want to clear B_FORCE here as the caller presumably
1569 1577                   * knows what they're doing if they set it.
1570 1578                   */
1571 1579                  pvn_write_done(pp, flags | B_ERROR);
1572 1580                  return (EPERM);
1573 1581          }
1574 1582          return ((*putapage)(vp, pp, off, len, flags, cr));
1575 1583  }
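
The three-way fallback at the noasync label (refuse a sync write when called from pageout/fsflush and just re-dirty the pages, re-dirty and fail with EPERM when called from the wrong zone, otherwise write synchronously) reappears in nfs_async_pageio() with an extra early-out for reads. A hypothetical helper that isolates just that decision; the names and flag bits below are invented stand-ins, not the real B_FORCE/B_INVAL values from <sys/buf.h>:

#include <stdbool.h>

/* Stand-in flag bits, for illustration only. */
enum { PG_FORCE = 0x1, PG_INVAL = 0x2 };

enum fallback_action {
        REMARK_DIRTY,           /* pvn_write_done(..., flags | B_ERROR) */
        REMARK_DIRTY_EPERM,     /* same, and the caller returns EPERM */
        WRITE_SYNCHRONOUSLY     /* call (*putapage)() in this context */
};

/*
 * is_pageout_or_fsflush and is_cross_zone stand in for the curproc
 * and nfs_zone() checks made above.
 */
static enum fallback_action
putapage_fallback(int *flags, bool is_pageout_or_fsflush, bool is_cross_zone)
{
        if (is_pageout_or_fsflush) {
                /* Never let pageout/fsflush block on a sync write. */
                if (*flags & PG_FORCE)
                        *flags &= ~(PG_INVAL | PG_FORCE);
                return (REMARK_DIRTY);
        }
        if (is_cross_zone)
                return (REMARK_DIRTY_EPERM);
        return (WRITE_SYNCHRONOUSLY);
}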
1576 1584  
1577 1585  int
1578 1586  nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1579 1587      int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1580 1588      size_t, int, cred_t *))
1581 1589  {
1582 1590          rnode_t *rp;
1583 1591          mntinfo_t *mi;
1584 1592          struct nfs_async_reqs *args;
1585 1593  
1586 1594          ASSERT(flags & B_ASYNC);
1587 1595          ASSERT(vp->v_vfsp != NULL);
1588 1596  
1589 1597          rp = VTOR(vp);
1590 1598          ASSERT(rp->r_count > 0);
1591 1599  
1592 1600          mi = VTOMI(vp);
1593 1601  
1594 1602          /*
1595 1603           * If we can't allocate a request structure, do the pageio
1596 1604           * request synchronously in this thread's context.
1597 1605           */
1598 1606          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1599 1607                  goto noasync;
1600 1608  
1601 1609          args->a_next = NULL;
1602 1610  #ifdef DEBUG
1603 1611          args->a_queuer = curthread;
1604 1612  #endif
1605 1613          VN_HOLD(vp);
1606 1614          args->a_vp = vp;
1607 1615          ASSERT(cr != NULL);
1608 1616          crhold(cr);
1609 1617          args->a_cred = cr;
1610 1618          args->a_io = NFS_PAGEIO;
1611 1619          args->a_nfs_pageio = pageio;
1612 1620          args->a_nfs_pp = pp;
1613 1621          args->a_nfs_off = io_off;
1614 1622          args->a_nfs_len = (uint_t)io_len;
1615 1623          args->a_nfs_flags = flags;
1616 1624  
1617 1625          mutex_enter(&mi->mi_async_lock);
1618 1626  
1619 1627          /*
1620 1628           * If asyncio has been disabled, then make a synchronous request.
1621 1629           * This check is done a second time in case async io was disabled
1622 1630           * while this thread was blocked waiting for memory pressure to
1623 1631           * reduce or for the queue to drain.
1624 1632           */
1625 1633          if (mi->mi_max_threads == 0) {
1626 1634                  mutex_exit(&mi->mi_async_lock);
1627 1635                  goto noasync;
1628 1636          }
1629 1637  
1630 1638          /*
1631 1639           * Link request structure into the async list and
1632 1640           * wakeup async thread to do the i/o.
1633 1641           */
1634 1642          if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1635 1643                  mi->mi_async_reqs[NFS_PAGEIO] = args;
1636 1644                  mi->mi_async_tail[NFS_PAGEIO] = args;
1637 1645          } else {
1638 1646                  mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1639 1647                  mi->mi_async_tail[NFS_PAGEIO] = args;
1640 1648          }
1641 1649  
1642 1650          mutex_enter(&rp->r_statelock);
1643 1651          rp->r_count++;
1644 1652          rp->r_awcount++;
1645 1653          mutex_exit(&rp->r_statelock);
1646 1654  
1647 1655          if (mi->mi_io_kstats) {
1648 1656                  mutex_enter(&mi->mi_lock);
1649 1657                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1650 1658                  mutex_exit(&mi->mi_lock);
1651 1659          }
1652 1660  
1653 1661          mi->mi_async_req_count++;
1654 1662          ASSERT(mi->mi_async_req_count != 0);
1655 1663          cv_signal(&mi->mi_async_reqs_cv);
1656 1664          mutex_exit(&mi->mi_async_lock);
1657 1665          return (0);
1658 1666  
1659 1667  noasync:
1660 1668          if (args != NULL) {
1661 1669                  VN_RELE(vp);
1662 1670                  crfree(cr);
1663 1671                  kmem_free(args, sizeof (*args));
1664 1672          }
1665 1673  
1666 1674          /*
1667 1675           * If we can't do it ASYNC, for reads we do nothing (but clean up
1668 1676           * the page list), for writes we do it synchronously, except for
1669 1677           * proc_pageout/proc_fsflush as described below.
1670 1678           */
1671 1679          if (flags & B_READ) {
1672 1680                  pvn_read_done(pp, flags | B_ERROR);
1673 1681                  return (0);
1674 1682          }
1675 1683  
1676 1684          if (curproc == proc_pageout || curproc == proc_fsflush) {
1677 1685                  /*
1678 1686                   * If we get here in the context of the pageout/fsflush,
1679 1687                   * we refuse to do a sync write, because this may hang
1680 1688                   * pageout/fsflush (and the machine). In this case, we just
1681 1689                   * re-mark the page as dirty and punt on the page.
1682 1690                   *
1683 1691                   * Make sure B_FORCE isn't set.  We can re-mark the
1684 1692                   * pages as dirty and unlock the pages in one swoop by
1685 1693                   * passing in B_ERROR to pvn_write_done().  However,
1686 1694                   * we should make sure B_FORCE isn't set - we don't
1687 1695                   * want the page tossed before it gets written out.
1688 1696                   */
1689 1697                  if (flags & B_FORCE)
1690 1698                          flags &= ~(B_INVAL | B_FORCE);
1691 1699                  pvn_write_done(pp, flags | B_ERROR);
1692 1700                  return (0);
1693 1701          }
1694 1702  
1695 1703          if (nfs_zone() != mi->mi_zone) {
1696 1704                  /*
1697 1705                   * So this was a cross-zone sync pageio.  We pass in B_ERROR
1698 1706                   * to pvn_write_done() to re-mark the pages as dirty and unlock
1699 1707                   * them.
1700 1708                   *
1701 1709                   * We don't want to clear B_FORCE here as the caller presumably
1702 1710                   * knows what they're doing if they set it.
1703 1711                   */
1704 1712                  pvn_write_done(pp, flags | B_ERROR);
1705 1713                  return (EPERM);
1706 1714          }
1707 1715          return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1708 1716  }
1709 1717  
1710 1718  void
1711 1719  nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1712 1720      int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1713 1721  {
1714 1722          rnode_t *rp;
1715 1723          mntinfo_t *mi;
1716 1724          struct nfs_async_reqs *args;
1717 1725  
1718 1726          rp = VTOR(vp);
1719 1727          ASSERT(rp->r_freef == NULL);
1720 1728  
1721 1729          mi = VTOMI(vp);
1722 1730  
1723 1731          /*
1724 1732           * If we can't allocate a request structure, do the readdir
1725 1733           * operation synchronously in this thread's context.
1726 1734           */
1727 1735          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1728 1736                  goto noasync;
1729 1737  
1730 1738          args->a_next = NULL;
1731 1739  #ifdef DEBUG
1732 1740          args->a_queuer = curthread;
1733 1741  #endif
1734 1742          VN_HOLD(vp);
1735 1743          args->a_vp = vp;
1736 1744          ASSERT(cr != NULL);
1737 1745          crhold(cr);
1738 1746          args->a_cred = cr;
1739 1747          args->a_io = NFS_READDIR;
1740 1748          args->a_nfs_readdir = readdir;
1741 1749          args->a_nfs_rdc = rdc;
1742 1750  
1743 1751          mutex_enter(&mi->mi_async_lock);
1744 1752  
1745 1753          /*
1746 1754           * If asyncio has been disabled, then make a synchronous request.
1747 1755           */
1748 1756          if (mi->mi_max_threads == 0) {
1749 1757                  mutex_exit(&mi->mi_async_lock);
1750 1758                  goto noasync;
1751 1759          }
1752 1760  
1753 1761          /*
1754 1762           * Link request structure into the async list and
1755 1763           * wakeup async thread to do the i/o.
1756 1764           */
1757 1765          if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1758 1766                  mi->mi_async_reqs[NFS_READDIR] = args;
1759 1767                  mi->mi_async_tail[NFS_READDIR] = args;
1760 1768          } else {
1761 1769                  mi->mi_async_tail[NFS_READDIR]->a_next = args;
1762 1770                  mi->mi_async_tail[NFS_READDIR] = args;
1763 1771          }
1764 1772  
1765 1773          mutex_enter(&rp->r_statelock);
1766 1774          rp->r_count++;
1767 1775          mutex_exit(&rp->r_statelock);
1768 1776  
1769 1777          if (mi->mi_io_kstats) {
1770 1778                  mutex_enter(&mi->mi_lock);
1771 1779                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1772 1780                  mutex_exit(&mi->mi_lock);
1773 1781          }
1774 1782  
1775 1783          mi->mi_async_req_count++;
1776 1784          ASSERT(mi->mi_async_req_count != 0);
1777 1785          cv_signal(&mi->mi_async_reqs_cv);
1778 1786          mutex_exit(&mi->mi_async_lock);
1779 1787          return;
1780 1788  
1781 1789  noasync:
1782 1790          if (args != NULL) {
1783 1791                  VN_RELE(vp);
1784 1792                  crfree(cr);
1785 1793                  kmem_free(args, sizeof (*args));
1786 1794          }
1787 1795  
1788 1796          rdc->entries = NULL;
1789 1797          mutex_enter(&rp->r_statelock);
1790 1798          ASSERT(rdc->flags & RDDIR);
1791 1799          rdc->flags &= ~RDDIR;
1792 1800          rdc->flags |= RDDIRREQ;
1793 1801          /*
1794 1802           * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1795 1803           * is set, wakeup the thread sleeping in cv_wait_sig().
1796 1804           * The woken up thread will reset the flag to RDDIR and will
1797 1805           * continue with the readdir operation.
1798 1806           */
1799 1807          if (rdc->flags & RDDIRWAIT) {
1800 1808                  rdc->flags &= ~RDDIRWAIT;
1801 1809                  cv_broadcast(&rdc->cv);
1802 1810          }
1803 1811          mutex_exit(&rp->r_statelock);
1804 1812          rddir_cache_rele(rdc);
1805 1813  }
1806 1814  
1807 1815  void
1808 1816  nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1809 1817      cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1810 1818  {
1811 1819          rnode_t *rp;
1812 1820          mntinfo_t *mi;
1813 1821          struct nfs_async_reqs *args;
1814 1822          page_t *pp;
1815 1823  
1816 1824          rp = VTOR(vp);
1817 1825          mi = VTOMI(vp);
1818 1826  
1819 1827          /*
1820 1828           * If we can't allocate a request structure, do the commit
1821 1829           * operation synchronously in this thread's context.
1822 1830           */
1823 1831          if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1824 1832                  goto noasync;
1825 1833  
1826 1834          args->a_next = NULL;
1827 1835  #ifdef DEBUG
1828 1836          args->a_queuer = curthread;
1829 1837  #endif
1830 1838          VN_HOLD(vp);
1831 1839          args->a_vp = vp;
1832 1840          ASSERT(cr != NULL);
1833 1841          crhold(cr);
1834 1842          args->a_cred = cr;
1835 1843          args->a_io = NFS_COMMIT;
1836 1844          args->a_nfs_commit = commit;
1837 1845          args->a_nfs_plist = plist;
1838 1846          args->a_nfs_offset = offset;
1839 1847          args->a_nfs_count = count;
1840 1848  
1841 1849          mutex_enter(&mi->mi_async_lock);
1842 1850  
1843 1851          /*
1844 1852           * If asyncio has been disabled, then make a synchronous request.
1845 1853           * This check is done a second time in case async io was disabled
1846 1854           * while this thread was blocked waiting for memory pressure to
1847 1855           * reduce or for the queue to drain.
1848 1856           */
1849 1857          if (mi->mi_max_threads == 0) {
1850 1858                  mutex_exit(&mi->mi_async_lock);
1851 1859                  goto noasync;
1852 1860          }
1853 1861  
1854 1862          /*
1855 1863           * Link request structure into the async list and
1856 1864           * wakeup async thread to do the i/o.
1857 1865           */
1858 1866          if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1859 1867                  mi->mi_async_reqs[NFS_COMMIT] = args;
1860 1868                  mi->mi_async_tail[NFS_COMMIT] = args;
1861 1869          } else {
1862 1870                  mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1863 1871                  mi->mi_async_tail[NFS_COMMIT] = args;
1864 1872          }
1865 1873  
1866 1874          mutex_enter(&rp->r_statelock);
1867 1875          rp->r_count++;
1868 1876          mutex_exit(&rp->r_statelock);
1869 1877  
1870 1878          if (mi->mi_io_kstats) {
1871 1879                  mutex_enter(&mi->mi_lock);
1872 1880                  kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1873 1881                  mutex_exit(&mi->mi_lock);
1874 1882          }
1875 1883  
1876 1884          mi->mi_async_req_count++;
1877 1885          ASSERT(mi->mi_async_req_count != 0);
1878 1886          cv_signal(&mi->mi_async_reqs_cv);
1879 1887          mutex_exit(&mi->mi_async_lock);
1880 1888          return;
1881 1889  
1882 1890  noasync:
1883 1891          if (args != NULL) {
1884 1892                  VN_RELE(vp);
1885 1893                  crfree(cr);
1886 1894                  kmem_free(args, sizeof (*args));
1887 1895          }
1888 1896  
1889 1897          if (curproc == proc_pageout || curproc == proc_fsflush ||
1890 1898              nfs_zone() != mi->mi_zone) {
1891 1899                  while (plist != NULL) {
1892 1900                          pp = plist;
1893 1901                          page_sub(&plist, pp);
1894 1902                          pp->p_fsdata = C_COMMIT;
1895 1903                          page_unlock(pp);
1896 1904                  }
1897 1905                  return;
1898 1906          }
1899 1907          (*commit)(vp, plist, offset, count, cr);
1900 1908  }
1901 1909  
1902 1910  void
1903 1911  nfs_async_inactive(vnode_t *vp, cred_t *cr,
1904 1912      void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1905 1913  {
1906 1914          mntinfo_t *mi;
1907 1915          struct nfs_async_reqs *args;
1908 1916  
1909 1917          mi = VTOMI(vp);
1910 1918  
1911 1919          args = kmem_alloc(sizeof (*args), KM_SLEEP);
1912 1920          args->a_next = NULL;
1913 1921  #ifdef DEBUG
1914 1922          args->a_queuer = curthread;
1915 1923  #endif
1916 1924          args->a_vp = vp;
1917 1925          ASSERT(cr != NULL);
1918 1926          crhold(cr);
1919 1927          args->a_cred = cr;
1920 1928          args->a_io = NFS_INACTIVE;
1921 1929          args->a_nfs_inactive = inactive;
1922 1930  
1923 1931          /*
1924 1932           * Note that we don't check mi->mi_max_threads here, since we
1925 1933           * *need* to get rid of this vnode regardless of whether someone
1926 1934           * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1927 1935           *
1928 1936           * The manager thread knows about this and is willing to create
1929 1937           * at least one thread to accommodate us.
1930 1938           */
1931 1939          mutex_enter(&mi->mi_async_lock);
1932 1940          if (mi->mi_manager_thread == NULL) {
1933 1941                  rnode_t *rp = VTOR(vp);
1934 1942  
1935 1943                  mutex_exit(&mi->mi_async_lock);
1936 1944                  crfree(cr);     /* drop our reference */
1937 1945                  kmem_free(args, sizeof (*args));
1938 1946                  /*
1939 1947                   * We can't do an over-the-wire call since we're in the wrong
1940 1948                   * zone, so we need to clean up state as best we can and then
1941 1949                   * throw away the vnode.
1942 1950                   */
1943 1951                  mutex_enter(&rp->r_statelock);
1944 1952                  if (rp->r_unldvp != NULL) {
1945 1953                          vnode_t *unldvp;
1946 1954                          char *unlname;
1947 1955                          cred_t *unlcred;
1948 1956  
1949 1957                          unldvp = rp->r_unldvp;
1950 1958                          rp->r_unldvp = NULL;
1951 1959                          unlname = rp->r_unlname;
1952 1960                          rp->r_unlname = NULL;
1953 1961                          unlcred = rp->r_unlcred;
1954 1962                          rp->r_unlcred = NULL;
1955 1963                          mutex_exit(&rp->r_statelock);
1956 1964  
1957 1965                          VN_RELE(unldvp);
1958 1966                          kmem_free(unlname, MAXNAMELEN);
1959 1967                          crfree(unlcred);
1960 1968                  } else {
1961 1969                          mutex_exit(&rp->r_statelock);
1962 1970                  }
1963 1971                  /*
1964 1972                   * No need to explicitly throw away any cached pages.  The
1965 1973                   * eventual rinactive() will attempt a synchronous
1966 1974                   * VOP_PUTPAGE() which will immediately fail since the request
1967 1975                   * is coming from the wrong zone, and then will proceed to call
1968 1976                   * nfs_invalidate_pages() which will clean things up for us.
1969 1977                   */
1970 1978                  rp_addfree(VTOR(vp), cr);
1971 1979                  return;
1972 1980          }
1973 1981  
1974 1982          if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1975 1983                  mi->mi_async_reqs[NFS_INACTIVE] = args;
1976 1984          } else {
1977 1985                  mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1978 1986          }
1979 1987          mi->mi_async_tail[NFS_INACTIVE] = args;
1980 1988          /*
1981 1989           * Don't increment r_count, since we're trying to get rid of the vnode.
1982 1990           */
1983 1991  
1984 1992          mi->mi_async_req_count++;
1985 1993          ASSERT(mi->mi_async_req_count != 0);
1986 1994          cv_signal(&mi->mi_async_reqs_cv);
1987 1995          mutex_exit(&mi->mi_async_lock);
1988 1996  }
1989 1997  
1990 1998  static void
1991 1999  nfs_async_start(struct vfs *vfsp)
1992 2000  {
1993 2001          nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
1994 2002  }
1995 2003  
1996 2004  static void
1997 2005  nfs_async_pgops_start(struct vfs *vfsp)
1998 2006  {
1999 2007          nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2000 2008  }
2001 2009  
2002 2010  /*
2003 2011   * The async queues for each mounted file system are arranged as a
2004 2012   * set of queues, one for each async i/o type.  Requests are taken
2005 2013   * from the queues in a round-robin fashion.  A number of consecutive
2006 2014   * requests are taken from each queue before moving on to the next
2007 2015   * queue.  This functionality may allow the NFS Version 2 server to do
2008 2016   * write clustering, even if the client is mixing writes and reads
2009 2017   * because it will take multiple write requests from the queue
2010 2018   * before processing any of the other async i/o types.
2011 2019   *
2012 2020   * XXX The nfs_async_common_start thread is unsafe in the light of the present
2013 2021   * model defined by cpr to suspend the system. Specifically over the
2014 2022   * wire calls are cpr-unsafe. The thread should be reevaluated in
2015 2023   * case of future updates to the cpr model.
2016 2024   */
2017 2025  static void
2018 2026  nfs_async_common_start(struct vfs *vfsp, int async_queue)
2019 2027  {
2020 2028          struct nfs_async_reqs *args;
2021 2029          mntinfo_t *mi = VFTOMI(vfsp);
2022 2030          clock_t time_left = 1;
2023 2031          callb_cpr_t cprinfo;
2024 2032          int i;
2025 2033          int async_types;
2026 2034          kcondvar_t *async_work_cv;
2027 2035  
2028 2036          if (async_queue == NFS_ASYNC_QUEUE) {
2029 2037                  async_types = NFS_ASYNC_TYPES;
2030 2038                  async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2031 2039          } else {
2032 2040                  async_types = NFS_ASYNC_PGOPS_TYPES;
2033 2041                  async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2034 2042          }
2035 2043  
2036 2044          /*
2037 2045           * Dynamic initialization of nfs_async_timeout to allow nfs to be
2038 2046           * built in an implementation independent manner.
2039 2047           */
2040 2048          if (nfs_async_timeout == -1)
2041 2049                  nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2042 2050  
2043 2051          CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2044 2052  
2045 2053          mutex_enter(&mi->mi_async_lock);
2046 2054          for (;;) {
2047 2055                  /*
2048 2056                   * Find the next queue containing an entry.  We start
2049 2057                   * at the current queue pointer and then round robin
2050 2058                   * through all of them until we either find a non-empty
2051 2059                   * queue or have looked through all of them.
2052 2060                   */
2053 2061                  for (i = 0; i < async_types; i++) {
2054 2062                          args = *mi->mi_async_curr[async_queue];
2055 2063                          if (args != NULL)
2056 2064                                  break;
2057 2065                          mi->mi_async_curr[async_queue]++;
2058 2066                          if (mi->mi_async_curr[async_queue] ==
2059 2067                              &mi->mi_async_reqs[async_types]) {
2060 2068                                  mi->mi_async_curr[async_queue] =
2061 2069                                      &mi->mi_async_reqs[0];
2062 2070                          }
2063 2071                  }
2064 2072                  /*
2065 2073                   * If we didn't find an entry, then block until woken up
2066 2074                   * again and then look through the queues again.
2067 2075                   */
2068 2076                  if (args == NULL) {
2069 2077                          /*
2070 2078                           * Exiting is considered to be safe for CPR as well
2071 2079                           */
2072 2080                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
2073 2081  
2074 2082                          /*
2075 2083                           * Wakeup thread waiting to unmount the file
2076 2084                           * system only if all async threads are inactive.
2077 2085                           *
2078 2086                           * If we've timed out and there's nothing to do,
2079 2087                           * then get rid of this thread.
2080 2088                           */
2081 2089                          if (mi->mi_max_threads == 0 || time_left <= 0) {
2082 2090                                  --mi->mi_threads[async_queue];
2083 2091  
2084 2092                                  if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2085 2093                                      mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2086 2094                                          cv_signal(&mi->mi_async_cv);
2087 2095                                  CALLB_CPR_EXIT(&cprinfo);
2088 2096                                  VFS_RELE(vfsp); /* release thread's hold */
2089 2097                                  zthread_exit();
2090 2098                                  /* NOTREACHED */
2091 2099                          }
2092 2100                          time_left = cv_reltimedwait(async_work_cv,
2093 2101                              &mi->mi_async_lock, nfs_async_timeout,
2094 2102                              TR_CLOCK_TICK);
2095 2103  
2096 2104                          CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2097 2105  
2098 2106                          continue;
2099 2107                  }
2100 2108                  time_left = 1;
2101 2109  
2102 2110                  /*
2103 2111                   * Remove the request from the async queue and then
2104 2112                   * update the current async request queue pointer.  If
2105 2113                   * the current queue is empty or we have removed enough
2106 2114                   * consecutive entries from it, then reset the counter
2107 2115                   * for this queue and then move the current pointer to
2108 2116                   * the next queue.
2109 2117                   */
2110 2118                  *mi->mi_async_curr[async_queue] = args->a_next;
2111 2119                  if (*mi->mi_async_curr[async_queue] == NULL ||
2112 2120                      --mi->mi_async_clusters[args->a_io] == 0) {
2113 2121                          mi->mi_async_clusters[args->a_io] =
2114 2122                              mi->mi_async_init_clusters;
2115 2123                          mi->mi_async_curr[async_queue]++;
2116 2124                          if (mi->mi_async_curr[async_queue] ==
2117 2125                              &mi->mi_async_reqs[async_types]) {
2118 2126                                  mi->mi_async_curr[async_queue] =
2119 2127                                      &mi->mi_async_reqs[0];
2120 2128                          }
2121 2129                  }
2122 2130  
2123 2131                  if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2124 2132                          mutex_enter(&mi->mi_lock);
2125 2133                          kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2126 2134                          mutex_exit(&mi->mi_lock);
2127 2135                  }
2128 2136  
2129 2137                  mutex_exit(&mi->mi_async_lock);
2130 2138  
2131 2139                  /*
2132 2140                   * Obtain arguments from the async request structure.
2133 2141                   */
2134 2142                  if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2135 2143                          (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2136 2144                              args->a_nfs_addr, args->a_nfs_seg,
2137 2145                              args->a_cred);
2138 2146                  } else if (args->a_io == NFS_PUTAPAGE) {
2139 2147                          (void) (*args->a_nfs_putapage)(args->a_vp,
2140 2148                              args->a_nfs_pp, args->a_nfs_off,
2141 2149                              args->a_nfs_len, args->a_nfs_flags,
2142 2150                              args->a_cred);
2143 2151                  } else if (args->a_io == NFS_PAGEIO) {
2144 2152                          (void) (*args->a_nfs_pageio)(args->a_vp,
2145 2153                              args->a_nfs_pp, args->a_nfs_off,
2146 2154                              args->a_nfs_len, args->a_nfs_flags,
2147 2155                              args->a_cred);
2148 2156                  } else if (args->a_io == NFS_READDIR) {
2149 2157                          (void) ((*args->a_nfs_readdir)(args->a_vp,
2150 2158                              args->a_nfs_rdc, args->a_cred));
2151 2159                  } else if (args->a_io == NFS_COMMIT) {
2152 2160                          (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2153 2161                              args->a_nfs_offset, args->a_nfs_count,
2154 2162                              args->a_cred);
2155 2163                  } else if (args->a_io == NFS_INACTIVE) {
2156 2164                          (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2157 2165                  }
2158 2166  
2159 2167                  /*
2160 2168                   * Now, release the vnode and free the credentials
2161 2169                   * structure.
2162 2170                   */
2163 2171                  free_async_args(args);
2164 2172                  /*
2165 2173                   * Reacquire the mutex because it will be needed above.
2166 2174                   */
2167 2175                  mutex_enter(&mi->mi_async_lock);
2168 2176          }
2169 2177  }
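
The queue-selection part of the loop above (round-robin across the per-type queues, taking up to a cluster's worth of consecutive requests from one queue before moving to the next) can be shown in isolation. The sketch below drops the locking, kstats and thread-lifetime handling and uses invented names (NTYPES, struct rr_queues, rr_next()):

#include <stddef.h>

#define NTYPES  6                       /* stand-in for NFS_ASYNC_TYPES */

struct rr_req {
        struct rr_req *next;
        int type;                       /* index of the queue it sits on */
};

struct rr_queues {
        struct rr_req *reqs[NTYPES];    /* one head per i/o type */
        struct rr_req **curr;           /* current queue pointer */
        int clusters[NTYPES];           /* consecutive takes left per type */
        int init_cluster;               /* reset value for clusters[] */
};

/*
 * Dequeue the next request round-robin, or return NULL if every queue
 * is empty.  Mirrors the selection logic of nfs_async_common_start().
 */
static struct rr_req *
rr_next(struct rr_queues *q)
{
        struct rr_req *r = NULL;
        int i;

        for (i = 0; i < NTYPES; i++) {
                r = *q->curr;
                if (r != NULL)
                        break;
                if (++q->curr == &q->reqs[NTYPES])
                        q->curr = &q->reqs[0];
        }
        if (r == NULL)
                return (NULL);

        *q->curr = r->next;
        if (*q->curr == NULL || --q->clusters[r->type] == 0) {
                q->clusters[r->type] = q->init_cluster;
                if (++q->curr == &q->reqs[NTYPES])
                        q->curr = &q->reqs[0];
        }
        return (r);
}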
2170 2178  
2171 2179  void
2172 2180  nfs_async_stop(struct vfs *vfsp)
2173 2181  {
2174 2182          mntinfo_t *mi = VFTOMI(vfsp);
2175 2183  
2176 2184          /*
2177 2185           * Wait for all outstanding async operations to complete and for the
2178 2186           * worker threads to exit.
2179 2187           */
2180 2188          mutex_enter(&mi->mi_async_lock);
2181 2189          mi->mi_max_threads = 0;
2182 2190          NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2183 2191          while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2184 2192              mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2185 2193                  cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2186 2194          mutex_exit(&mi->mi_async_lock);
2187 2195  }
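
nfs_async_stop() and nfs_async_stop_sig() share one drain idiom: zero the thread cap so workers exit as they go idle, wake them all, then wait for the per-queue thread counts to reach zero. The _sig variant differs only in using an interruptible wait and restoring the old cap if a signal arrives first. A minimal user-space analogue of the uninterruptible form (struct pool and pool_stop() are invented names):

#include <pthread.h>

struct pool {
        pthread_mutex_t lock;
        pthread_cond_t work_cv;         /* workers sleep here */
        pthread_cond_t idle_cv;         /* signalled when a worker exits */
        unsigned max_threads;
        unsigned nthreads;
};

/* Analogue of nfs_async_stop(): drain the pool and wait for it to empty. */
static void
pool_stop(struct pool *p)
{
        pthread_mutex_lock(&p->lock);
        p->max_threads = 0;             /* idle workers exit on seeing this */
        pthread_cond_broadcast(&p->work_cv);
        while (p->nthreads != 0)
                pthread_cond_wait(&p->idle_cv, &p->lock);
        pthread_mutex_unlock(&p->lock);
}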
2188 2196  
2189 2197  /*
2190 2198   * nfs_async_stop_sig:
2191 2199   * Wait for all outstanding putpage operations to complete. If a signal
2192 2200   * is delivered, we abort and return non-zero. If we can put all the
2193 2201   * pages, we return 0. This routine is called from nfs_unmount and
2194 2202   * nfs3_unmount to make these operations interruptible.
2195 2203   */
2196 2204  int
2197 2205  nfs_async_stop_sig(struct vfs *vfsp)
2198 2206  {
2199 2207          mntinfo_t *mi = VFTOMI(vfsp);
2200 2208          ushort_t omax;
2201 2209          int rval;
2202 2210  
2203 2211          /*
2204 2212           * Wait for all outstanding async operations to complete and for the
2205 2213           * worker threads to exit.
2206 2214           */
2207 2215          mutex_enter(&mi->mi_async_lock);
2208 2216          omax = mi->mi_max_threads;
2209 2217          mi->mi_max_threads = 0;
2210 2218          /*
2211 2219           * Tell all the worker threads to exit.
2212 2220           */
2213 2221          NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2214 2222          while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2215 2223              mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2216 2224                  if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2217 2225                          break;
2218 2226          }
2219 2227          rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2220 2228              mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]  != 0); /* Interrupted */
2221 2229          if (rval)
2222 2230                  mi->mi_max_threads = omax;
2223 2231          mutex_exit(&mi->mi_async_lock);
2224 2232  
2225 2233          return (rval);
2226 2234  }
2227 2235  
2228 2236  int
2229 2237  writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2230 2238  {
2231 2239          int pagecreate;
2232 2240          int n;
2233 2241          int saved_n;
2234 2242          caddr_t saved_base;
2235 2243          u_offset_t offset;
2236 2244          int error;
2237 2245          int sm_error;
2238 2246          vnode_t *vp = RTOV(rp);
2239 2247  
2240 2248          ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2241 2249          ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2242 2250          if (!vpm_enable) {
2243 2251                  ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2244 2252          }
2245 2253  
2246 2254          /*
2247 2255           * Move bytes in at most PAGESIZE chunks. We must avoid
2248 2256           * spanning pages in uiomove() because page faults may cause
2249 2257           * the cache to be invalidated out from under us. The r_size is not
2250 2258           * updated until after the uiomove. If we push the last page of a
2251 2259           * file before r_size is correct, we will lose the data written past
2252 2260           * the current (and invalid) r_size.
2253 2261           */
2254 2262          do {
2255 2263                  offset = uio->uio_loffset;
2256 2264                  pagecreate = 0;
2257 2265  
2258 2266                  /*
2259 2267                   * n is the number of bytes required to satisfy the request
2260 2268                   *   or the number of bytes to fill out the page.
2261 2269                   */
2262 2270                  n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2263 2271  
2264 2272                  /*
2265 2273                   * Check to see if we can skip reading in the page
2266 2274                   * and just allocate the memory.  We can do this
2267 2275                   * if we are going to rewrite the entire mapping
2268 2276                   * or if we are going to write to or beyond the current
2269 2277                   * end of file from the beginning of the mapping.
2270 2278                   *
2271 2279                   * The read of r_size is now protected by r_statelock.
2272 2280                   */
2273 2281                  mutex_enter(&rp->r_statelock);
2274 2282                  /*
2275 2283                   * When pgcreated is nonzero the caller has already done
2276 2284                   * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2277 2285                   * segkpm this means we already have at least one page
2278 2286                   * created and mapped at base.
2279 2287                   */
2280 2288                  pagecreate = pgcreated ||
2281 2289                      ((offset & PAGEOFFSET) == 0 &&
2282 2290                      (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2283 2291  
2284 2292                  mutex_exit(&rp->r_statelock);
2285 2293                  if (!vpm_enable && pagecreate) {
2286 2294                          /*
2287 2295                           * The last argument tells segmap_pagecreate() to
2288 2296                           * always lock the page, as opposed to sometimes
2289 2297                           * returning with the page locked. This way we avoid a
2290 2298                           * fault on the ensuing uiomove(), but also
2291 2299                           * more importantly (to fix bug 1094402) we can
2292 2300                           * call segmap_fault() to unlock the page in all
2293 2301                           * cases. An alternative would be to modify
2294 2302                           * segmap_pagecreate() to tell us when it is
2295 2303                           * locking a page, but that's a fairly major
2296 2304                           * interface change.
2297 2305                           */
2298 2306                          if (pgcreated == 0)
2299 2307                                  (void) segmap_pagecreate(segkmap, base,
2300 2308                                      (uint_t)n, 1);
2301 2309                          saved_base = base;
2302 2310                          saved_n = n;
2303 2311                  }
2304 2312  
2305 2313                  /*
2306 2314                   * The number of bytes of data in the last page cannot
2307 2315                   * be accurately determined while the page is being
2308 2316                   * uiomove'd to and the size of the file is being updated.
2309 2317                   * Thus, inform threads which need to know accurately
2310 2318                   * how much data is in the last page of the file.  They
2311 2319                   * will not do the i/o immediately, but will arrange for
2312 2320                   * the i/o to happen later when this modify operation
2313 2321                   * will have finished.
2314 2322                   */
2315 2323                  ASSERT(!(rp->r_flags & RMODINPROGRESS));
2316 2324                  mutex_enter(&rp->r_statelock);
2317 2325                  rp->r_flags |= RMODINPROGRESS;
2318 2326                  rp->r_modaddr = (offset & MAXBMASK);
2319 2327                  mutex_exit(&rp->r_statelock);
2320 2328  
2321 2329                  if (vpm_enable) {
2322 2330                          /*
2323 2331                           * Copy data. If new pages are created, part of
2324 2332                           * the page that is not written will be initialized
2325 2333                           * with zeros.
2326 2334                           */
2327 2335                          error = vpm_data_copy(vp, offset, n, uio,
2328 2336                              !pagecreate, NULL, 0, S_WRITE);
2329 2337                  } else {
2330 2338                          error = uiomove(base, n, UIO_WRITE, uio);
2331 2339                  }
2332 2340  
2333 2341                  /*
2334 2342                   * r_size is the maximum number of
2335 2343                   * bytes known to be in the file.
2336 2344                   * Make sure it is at least as high as the
2337 2345                   * first unwritten byte pointed to by uio_loffset.
2338 2346                   */
2339 2347                  mutex_enter(&rp->r_statelock);
2340 2348                  if (rp->r_size < uio->uio_loffset)
2341 2349                          rp->r_size = uio->uio_loffset;
2342 2350                  rp->r_flags &= ~RMODINPROGRESS;
2343 2351                  rp->r_flags |= RDIRTY;
2344 2352                  mutex_exit(&rp->r_statelock);
2345 2353  
2346 2354                  /* n = # of bytes written */
2347 2355                  n = (int)(uio->uio_loffset - offset);
2348 2356  
2349 2357                  if (!vpm_enable) {
2350 2358                          base += n;
2351 2359                  }
2352 2360                  tcount -= n;
2353 2361                  /*
2354 2362                   * If we created pages w/o initializing them completely,
2355 2363                   * we need to zero the part that wasn't set up.
2356 2364                   * This happens in most EOF write cases and if
2357 2365                   * we had some sort of error during the uiomove.
2358 2366                   */
2359 2367                  if (!vpm_enable && pagecreate) {
2360 2368                          if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2361 2369                                  (void) kzero(base, PAGESIZE - n);
2362 2370  
2363 2371                          if (pgcreated) {
2364 2372                                  /*
2365 2373                                   * Caller is responsible for this page,
2366 2374                                   * it was not created in this loop.
2367 2375                                   */
2368 2376                                  pgcreated = 0;
2369 2377                          } else {
2370 2378                                  /*
2371 2379                                   * For bug 1094402: segmap_pagecreate locks
2372 2380                                   * page. Unlock it. This also unlocks the
2373 2381                                   * pages allocated by page_create_va() in
2374 2382                                   * segmap_pagecreate().
2375 2383                                   */
2376 2384                                  sm_error = segmap_fault(kas.a_hat, segkmap,
2377 2385                                      saved_base, saved_n,
2378 2386                                      F_SOFTUNLOCK, S_WRITE);
2379 2387                                  if (error == 0)
2380 2388                                          error = sm_error;
2381 2389                          }
2382 2390                  }
2383 2391          } while (tcount > 0 && error == 0);
2384 2392  
2385 2393          return (error);
2386 2394  }
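
The essential invariant in writerp() is that no single uiomove() ever crosses a page boundary, which is what the MIN(PAGESIZE - (offset & PAGEOFFSET), tcount) bound enforces. A hypothetical user-space rendering of just that arithmetic (PAGE_SZ, PAGE_OFF and copy_in_page_chunks() are made-up names; segmap, r_size and the RMODINPROGRESS handshake are deliberately omitted):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SZ         4096UL          /* stand-in for PAGESIZE */
#define PAGE_OFF        (PAGE_SZ - 1)   /* stand-in for PAGEOFFSET */

/*
 * Copy tcount bytes from src to dst, where off is the destination's
 * file offset, in chunks that never cross a PAGE_SZ boundary; this is
 * the same per-iteration bound writerp() applies to each uiomove().
 */
static void
copy_in_page_chunks(char *dst, const char *src, uint64_t off, size_t tcount)
{
        while (tcount > 0) {
                size_t in_page = (size_t)(PAGE_SZ - (off & PAGE_OFF));
                size_t n = (tcount < in_page) ? tcount : in_page;

                (void) memcpy(dst, src, n);
                dst += n;
                src += n;
                off += n;
                tcount -= n;
        }
}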
2387 2395  
2388 2396  int
2389 2397  nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2390 2398  {
2391 2399          rnode_t *rp;
2392 2400          page_t *pp;
2393 2401          u_offset_t eoff;
2394 2402          u_offset_t io_off;
2395 2403          size_t io_len;
2396 2404          int error;
2397 2405          int rdirty;
2398 2406          int err;
2399 2407  
2400 2408          rp = VTOR(vp);
2401 2409          ASSERT(rp->r_count > 0);
2402 2410  
2403 2411          if (!vn_has_cached_data(vp))
2404 2412                  return (0);
2405 2413  
2406 2414          ASSERT(vp->v_type != VCHR);
2407 2415  
2408 2416          /*
2409 2417           * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2410 2418           * writes.  B_FORCE is set to force the VM system to actually
2411 2419           * invalidate the pages, even if the i/o failed.  The pages
2412 2420           * need to get invalidated because they can't be written out
2413 2421           * because there isn't any space left on either the server's
2414 2422           * file system or in the user's disk quota.  The B_FREE bit
2415 2423           * is cleared to avoid confusion as to whether this is a
2416 2424           * request to place the page on the freelist or to destroy
2417 2425           * it.
2418 2426           */
2419 2427          if ((rp->r_flags & ROUTOFSPACE) ||
2420 2428              (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2421 2429                  flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2422 2430  
2423 2431          if (len == 0) {
2424 2432                  /*
2425 2433                   * If doing a full file synchronous operation, then clear
2426 2434                   * the RDIRTY bit.  If a page gets dirtied while the flush
2427 2435                   * is happening, then RDIRTY will get set again.  The
2428 2436                   * RDIRTY bit must get cleared before the flush so that
2429 2437                   * we don't lose this information.
2430 2438                   *
2431 2439                   * If there are no full file async write operations
2432 2440                   * pending and RDIRTY bit is set, clear it.
2433 2441                   */
2434 2442                  if (off == (u_offset_t)0 &&
2435 2443                      !(flags & B_ASYNC) &&
2436 2444                      (rp->r_flags & RDIRTY)) {
2437 2445                          mutex_enter(&rp->r_statelock);
2438 2446                          rdirty = (rp->r_flags & RDIRTY);
2439 2447                          rp->r_flags &= ~RDIRTY;
2440 2448                          mutex_exit(&rp->r_statelock);
2441 2449                  } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2442 2450                          mutex_enter(&rp->r_statelock);
2443 2451                          if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2444 2452                                  rdirty = (rp->r_flags & RDIRTY);
2445 2453                                  rp->r_flags &= ~RDIRTY;
2446 2454                          }
2447 2455                          mutex_exit(&rp->r_statelock);
2448 2456                  } else
2449 2457                          rdirty = 0;
2450 2458  
2451 2459                  /*
2452 2460                   * Search the entire vp list for pages >= off, and flush
2453 2461                   * the dirty pages.
2454 2462                   */
2455 2463                  error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2456 2464                      flags, cr);
2457 2465  
2458 2466                  /*
2459 2467                   * If an error occurred and the file was marked as dirty
2460 2468                   * before and we aren't forcibly invalidating pages, then
2461 2469                   * reset the RDIRTY flag.
2462 2470                   */
2463 2471                  if (error && rdirty &&
2464 2472                      (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2465 2473                          mutex_enter(&rp->r_statelock);
2466 2474                          rp->r_flags |= RDIRTY;
2467 2475                          mutex_exit(&rp->r_statelock);
2468 2476                  }
2469 2477          } else {
2470 2478                  /*
2471 2479                   * Do a range from [off...off + len) looking for pages
2472 2480                   * to deal with.
2473 2481                   */
2474 2482                  error = 0;
2475 2483  #ifdef lint
2476 2484                  io_len = 0;
2477 2485  #endif
2478 2486                  eoff = off + len;
2479 2487                  mutex_enter(&rp->r_statelock);
2480 2488                  for (io_off = off; io_off < eoff && io_off < rp->r_size;
2481 2489                      io_off += io_len) {
2482 2490                          mutex_exit(&rp->r_statelock);
2483 2491                          /*
2484 2492                           * If we are not invalidating or synchronously
2485 2493                           * freeing/writing pages, use page_lookup_nowait()
2486 2494                           * to prevent reclaiming the pages from the
2487 2495                           * free list.
2488 2496                           */
2489 2497                          if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2490 2498                                  pp = page_lookup(vp, io_off,
2491 2499                                      (flags & (B_INVAL | B_FREE)) ?
2492 2500                                      SE_EXCL : SE_SHARED);
2493 2501                          } else {
2494 2502                                  pp = page_lookup_nowait(vp, io_off,
2495 2503                                      (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2496 2504                          }
2497 2505  
2498 2506                          if (pp == NULL || !pvn_getdirty(pp, flags))
2499 2507                                  io_len = PAGESIZE;
2500 2508                          else {
2501 2509                                  err = (*rp->r_putapage)(vp, pp, &io_off,
2502 2510                                      &io_len, flags, cr);
2503 2511                                  if (!error)
2504 2512                                          error = err;
2505 2513                                  /*
2506 2514                                   * "io_off" and "io_len" are returned as
2507 2515                                   * the range of pages we actually wrote.
2508 2516                                   * This allows us to skip ahead more quickly
2509 2517                                   * since several pages may have been dealt
2510 2518                                   * with by this iteration of the loop.
2511 2519                                   */
2512 2520                          }
2513 2521                          mutex_enter(&rp->r_statelock);
2514 2522                  }
2515 2523                  mutex_exit(&rp->r_statelock);
2516 2524          }
2517 2525  
2518 2526          return (error);
2519 2527  }
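
A minimal sketch of the skip-ahead loop used in the range case above, where io_off advances by whatever length the putapage routine reports back, so a single iteration can cover a multi-page cluster.  flush_cluster() is a made-up stand-in for (*rp->r_putapage)():

#include <stdio.h>
#include <stddef.h>

#define PAGESIZE        4096

/*
 * Hypothetical putapage: pretend every call writes a four-page cluster
 * starting at *offp and reports the length handled through *lenp.
 */
static int
flush_cluster(unsigned long long *offp, size_t *lenp)
{
        *lenp = 4 * PAGESIZE;
        printf("flushed [%llu, %llu)\n", *offp, *offp + *lenp);
        return (0);
}

int
main(void)
{
        unsigned long long off = 0, eoff = 16 * PAGESIZE;
        unsigned long long io_off;
        size_t io_len;

        /* Advance by the length actually handled, not blindly by PAGESIZE. */
        for (io_off = off; io_off < eoff; io_off += io_len) {
                io_len = PAGESIZE;      /* default if nothing was written */
                (void) flush_cluster(&io_off, &io_len);
        }
        return (0);
}
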
2520 2528  
2521 2529  void
2522 2530  nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2523 2531  {
2524 2532          rnode_t *rp;
2525 2533  
2526 2534          rp = VTOR(vp);
2527 2535          mutex_enter(&rp->r_statelock);
2528 2536          while (rp->r_flags & RTRUNCATE)
2529 2537                  cv_wait(&rp->r_cv, &rp->r_statelock);
2530 2538          rp->r_flags |= RTRUNCATE;
2531 2539          if (off == (u_offset_t)0) {
2532 2540                  rp->r_flags &= ~RDIRTY;
2533 2541                  if (!(rp->r_flags & RSTALE))
2534 2542                          rp->r_error = 0;
2535 2543          }
2536 2544          rp->r_truncaddr = off;
2537 2545          mutex_exit(&rp->r_statelock);
2538 2546          (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2539 2547              B_INVAL | B_TRUNC, cr);
2540 2548          mutex_enter(&rp->r_statelock);
2541 2549          rp->r_flags &= ~RTRUNCATE;
2542 2550          cv_broadcast(&rp->r_cv);
2543 2551          mutex_exit(&rp->r_statelock);
2544 2552  }
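
nfs_invalidate_pages() serializes truncations with the RTRUNCATE flag: wait under r_statelock while the flag is set, set it, drop the lock across the flush, then clear it and broadcast to any waiters.  A rough user-space analogy of that gate, assuming pthreads; the busy flag and function names are invented for illustration:

#include <pthread.h>

static pthread_mutex_t statelock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int busy;                        /* plays the role of RTRUNCATE */

static void
do_invalidate(void)
{
        /* the expensive page flush, done without holding the lock */
}

void
invalidate_serialized(void)
{
        pthread_mutex_lock(&statelock);
        while (busy)                    /* one truncation at a time */
                pthread_cond_wait(&cv, &statelock);
        busy = 1;
        pthread_mutex_unlock(&statelock);

        do_invalidate();

        pthread_mutex_lock(&statelock);
        busy = 0;
        pthread_cond_broadcast(&cv);    /* wake every waiting truncator */
        pthread_mutex_unlock(&statelock);
}

int
main(void)
{
        invalidate_serialized();
        return (0);
}
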
2545 2553  
2546 2554  static int nfs_write_error_to_cons_only = 0;
2547 2555  #define MSG(x)  (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2548 2556  
2549 2557  /*
2550 2558   * Print a file handle
2551 2559   */
2552 2560  void
2553 2561  nfs_printfhandle(nfs_fhandle *fhp)
2554 2562  {
2555 2563          int *ip;
2556 2564          char *buf;
2557 2565          size_t bufsize;
2558 2566          char *cp;
2559 2567  
2560 2568          /*
2561 2569           * 13 == "(file handle:"
2562 2570   * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2563 2571           *      1 == ' '
2564 2572           *      8 == maximum strlen of "%x"
2565 2573           * 3 == ")\n\0"
2566 2574           */
2567 2575          bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2568 2576          buf = kmem_alloc(bufsize, KM_NOSLEEP);
2569 2577          if (buf == NULL)
2570 2578                  return;
2571 2579  
2572 2580          cp = buf;
2573 2581          (void) strcpy(cp, "(file handle:");
2574 2582          while (*cp != '\0')
2575 2583                  cp++;
2576 2584          for (ip = (int *)fhp->fh_buf;
2577 2585              ip < (int *)&fhp->fh_buf[fhp->fh_len];
2578 2586              ip++) {
2579 2587                  (void) sprintf(cp, " %x", *ip);
2580 2588                  while (*cp != '\0')
2581 2589                          cp++;
2582 2590          }
2583 2591          (void) strcpy(cp, ")\n");
2584 2592  
2585 2593          zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2586 2594  
2587 2595          kmem_free(buf, bufsize);
2588 2596  }
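
The sizing comment above budgets 13 bytes for the "(file handle:" prefix, 9 bytes per 32-bit word (a space plus at most eight hex digits), and 3 bytes for ")\n\0".  A user-space sketch of the same worst-case arithmetic, using a made-up 32-byte handle:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FH_LEN  32                      /* hypothetical handle size */

int
main(void)
{
        unsigned int fh[FH_LEN / sizeof (unsigned int)] = { 0xdeadbeef, 0x1234 };
        size_t bufsize, i;
        char *buf, *cp;

        /* 13 for "(file handle:", 1 + 8 per word, 3 for ")\n\0" */
        bufsize = 13 + (FH_LEN / sizeof (unsigned int)) * (1 + 8) + 3;
        buf = malloc(bufsize);
        if (buf == NULL)
                return (1);

        cp = buf + sprintf(buf, "(file handle:");
        for (i = 0; i < FH_LEN / sizeof (unsigned int); i++)
                cp += sprintf(cp, " %x", fh[i]);
        (void) strcpy(cp, ")\n");

        fputs(buf, stdout);
        free(buf);
        return (0);
}
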
2589 2597  
2590 2598  /*
2591 2599   * Notify the system administrator that an NFS write error has
2592 2600   * occurred.
2593 2601   */
2594 2602  
2595 2603  /* seconds between ENOSPC/EDQUOT messages */
2596 2604  clock_t nfs_write_error_interval = 5;
2597 2605  
2598 2606  void
2599 2607  nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2600 2608  {
2601 2609          mntinfo_t *mi;
2602 2610          clock_t now;
2603 2611  
2604 2612          mi = VTOMI(vp);
2605 2613          /*
2606 2614           * In case of forced unmount or zone shutdown, do not print any
2607 2615           * messages since it can flood the console with error messages.
2608 2616           */
2609 2617          if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2610 2618                  return;
2611 2619  
2612 2620          /*
2613 2621           * No use in flooding the console with ENOSPC
2614 2622           * messages from the same file system.
2615 2623           */
2616 2624          now = ddi_get_lbolt();
2617 2625          if ((error != ENOSPC && error != EDQUOT) ||
2618 2626              now - mi->mi_printftime > 0) {
2619 2627                  zoneid_t zoneid = mi->mi_zone->zone_id;
2620 2628  
2621 2629  #ifdef DEBUG
2622 2630                  nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2623 2631                      mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2624 2632  #else
2625 2633                  nfs_perror(error, "NFS write error on host %s: %m.\n",
2626 2634                      VTOR(vp)->r_server->sv_hostname, NULL);
2627 2635  #endif
2628 2636                  if (error == ENOSPC || error == EDQUOT) {
2629 2637                          zcmn_err(zoneid, CE_CONT,
2630 2638                              MSG("^File: userid=%d, groupid=%d\n"),
2631 2639                              crgetuid(cr), crgetgid(cr));
2632 2640                          if (crgetuid(CRED()) != crgetuid(cr) ||
2633 2641                              crgetgid(CRED()) != crgetgid(cr)) {
2634 2642                                  zcmn_err(zoneid, CE_CONT,
2635 2643                                      MSG("^User: userid=%d, groupid=%d\n"),
2636 2644                                      crgetuid(CRED()), crgetgid(CRED()));
2637 2645                          }
2638 2646                          mi->mi_printftime = now +
2639 2647                              nfs_write_error_interval * hz;
2640 2648                  }
2641 2649                  nfs_printfhandle(&VTOR(vp)->r_fh);
2642 2650  #ifdef DEBUG
2643 2651                  if (error == EACCES) {
2644 2652                          zcmn_err(zoneid, CE_CONT,
2645 2653                              MSG("^nfs_bio: cred is%s kcred\n"),
2646 2654                              cr == kcred ? "" : " not");
2647 2655                  }
2648 2656  #endif
2649 2657          }
2650 2658  }
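
nfs_write_error() throttles ENOSPC/EDQUOT messages by stamping mi_printftime with now + nfs_write_error_interval * hz and staying quiet until lbolt passes it.  A user-space sketch of the same throttle, using wall-clock seconds instead of lbolt ticks (all names here are illustrative):

#include <stdio.h>
#include <time.h>

static time_t printftime;               /* next time a message is allowed */
static const int error_interval = 5;    /* seconds between messages */

void
report_write_error(const char *host, int err)
{
        time_t now = time(NULL);

        /* Drop the message if we complained about this fs recently. */
        if (now <= printftime)
                return;

        fprintf(stderr, "write error on host %s: %d\n", host, err);
        printftime = now + error_interval;
}

int
main(void)
{
        report_write_error("server1", 28);      /* printed */
        report_write_error("server1", 28);      /* suppressed */
        return (0);
}
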
2651 2659  
2652 2660  /* ARGSUSED */
2653 2661  static void *
2654 2662  nfs_mi_init(zoneid_t zoneid)
2655 2663  {
2656 2664          struct mi_globals *mig;
2657 2665  
2658 2666          mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2659 2667          mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2660 2668          list_create(&mig->mig_list, sizeof (mntinfo_t),
2661 2669              offsetof(mntinfo_t, mi_zone_node));
2662 2670          mig->mig_destructor_called = B_FALSE;
2663 2671          return (mig);
2664 2672  }
2665 2673  
2666 2674  /*
2667 2675   * Callback routine to tell all NFS mounts in the zone to stop creating new
2668 2676   * threads.  Existing threads should exit.
2669 2677   */
2670 2678  /* ARGSUSED */
2671 2679  static void
2672 2680  nfs_mi_shutdown(zoneid_t zoneid, void *data)
2673 2681  {
2674 2682          struct mi_globals *mig = data;
2675 2683          mntinfo_t *mi;
2676 2684  
2677 2685          ASSERT(mig != NULL);
2678 2686  again:
2679 2687          mutex_enter(&mig->mig_lock);
2680 2688          for (mi = list_head(&mig->mig_list); mi != NULL;
2681 2689              mi = list_next(&mig->mig_list, mi)) {
2682 2690  
2683 2691                  /*
2684 2692                   * If we've done the shutdown work for this FS, skip.
2685 2693                   * Once we go off the end of the list, we're done.
2686 2694                   */
2687 2695                  if (mi->mi_flags & MI_DEAD)
2688 2696                          continue;
2689 2697  
2690 2698                  /*
2691 2699                   * We will do work, so not done.  Get a hold on the FS.
2692 2700                   */
2693 2701                  VFS_HOLD(mi->mi_vfsp);
2694 2702  
2695 2703                  /*
2696 2704                   * purge the DNLC for this filesystem
2697 2705                   */
2698 2706                  (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2699 2707  
2700 2708                  mutex_enter(&mi->mi_async_lock);
2701 2709                  /*
2702 2710                   * Tell existing async worker threads to exit.
2703 2711                   */
2704 2712                  mi->mi_max_threads = 0;
2705 2713                  NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2706 2714                  /*
2707 2715                   * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2708 2716                   * getting ready to exit when it's done with its current work.
2709 2717                   * Also set MI_DEAD to note we've acted on this FS.
2710 2718                   */
2711 2719                  mutex_enter(&mi->mi_lock);
2712 2720                  mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2713 2721                  mutex_exit(&mi->mi_lock);
2714 2722                  /*
2715 2723                   * Wake up the async manager thread.
2716 2724                   */
2717 2725                  cv_broadcast(&mi->mi_async_reqs_cv);
2718 2726                  mutex_exit(&mi->mi_async_lock);
2719 2727  
2720 2728                  /*
2721 2729                   * Drop lock and release FS, which may change list, then repeat.
2722 2730                   * We're done when every mi has been done or the list is empty.
2723 2731                   */
2724 2732                  mutex_exit(&mig->mig_lock);
2725 2733                  VFS_RELE(mi->mi_vfsp);
2726 2734                  goto again;
2727 2735          }
2728 2736          mutex_exit(&mig->mig_lock);
2729 2737  }
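
Because releasing the vfs hold can change the mount list, nfs_mi_shutdown() never keeps walking after doing work: it marks the entry MI_DEAD, drops mig_lock, acts on the entry, and restarts from the head, finishing when a full pass finds nothing left to do.  A generic sketch of that restart-scan pattern; the list, node, and shutdown_one() below are hypothetical, and the real code additionally takes a hold on the vfs before dropping the lock:

#include <pthread.h>
#include <stddef.h>

struct node {
        struct node     *next;
        int             done;           /* plays the role of MI_DEAD */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void
shutdown_one(struct node *n)
{
        (void) n;                       /* work that may modify the list */
}

void
shutdown_all(void)
{
        struct node *n;

again:
        pthread_mutex_lock(&list_lock);
        for (n = head; n != NULL; n = n->next) {
                if (n->done)
                        continue;       /* already handled; keep scanning */
                n->done = 1;
                pthread_mutex_unlock(&list_lock);
                shutdown_one(n);        /* may change the list under us, */
                goto again;             /* so restart from the head */
        }
        pthread_mutex_unlock(&list_lock);
}
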
2730 2738  
2731 2739  static void
2732 2740  nfs_mi_free_globals(struct mi_globals *mig)
2733 2741  {
2734 2742          list_destroy(&mig->mig_list);   /* makes sure the list is empty */
2735 2743          mutex_destroy(&mig->mig_lock);
2736 2744          kmem_free(mig, sizeof (*mig));
2737 2745  
2738 2746  }
2739 2747  
2740 2748  /* ARGSUSED */
2741 2749  static void
2742 2750  nfs_mi_destroy(zoneid_t zoneid, void *data)
2743 2751  {
2744 2752          struct mi_globals *mig = data;
2745 2753  
2746 2754          ASSERT(mig != NULL);
2747 2755          mutex_enter(&mig->mig_lock);
2748 2756          if (list_head(&mig->mig_list) != NULL) {
2749 2757                  /* Still waiting for VFS_FREEVFS() */
2750 2758                  mig->mig_destructor_called = B_TRUE;
2751 2759                  mutex_exit(&mig->mig_lock);
2752 2760                  return;
2753 2761          }
2754 2762          nfs_mi_free_globals(mig);
2755 2763  }
2756 2764  
2757 2765  /*
2758 2766   * Add an NFS mount to the per-zone list of NFS mounts.
2759 2767   */
2760 2768  void
2761 2769  nfs_mi_zonelist_add(mntinfo_t *mi)
2762 2770  {
2763 2771          struct mi_globals *mig;
2764 2772  
2765 2773          mig = zone_getspecific(mi_list_key, mi->mi_zone);
2766 2774          mutex_enter(&mig->mig_lock);
2767 2775          list_insert_head(&mig->mig_list, mi);
2768 2776          mutex_exit(&mig->mig_lock);
2769 2777  }
2770 2778  
2771 2779  /*
2772 2780   * Remove an NFS mount from the per-zone list of NFS mounts.
2773 2781   */
2774 2782  static void
2775 2783  nfs_mi_zonelist_remove(mntinfo_t *mi)
2776 2784  {
2777 2785          struct mi_globals *mig;
2778 2786  
2779 2787          mig = zone_getspecific(mi_list_key, mi->mi_zone);
2780 2788          mutex_enter(&mig->mig_lock);
2781 2789          list_remove(&mig->mig_list, mi);
2782 2790          /*
2783 2791           * We can be called asynchronously by VFS_FREEVFS() after the zone
2784 2792           * shutdown/destroy callbacks have executed; if so, clean up the zone's
2785 2793           * mi globals.
2786 2794           */
2787 2795          if (list_head(&mig->mig_list) == NULL &&
2788 2796              mig->mig_destructor_called == B_TRUE) {
  
  
2789 2797                  nfs_mi_free_globals(mig);
2790 2798                  return;
2791 2799          }
2792 2800          mutex_exit(&mig->mig_lock);
2793 2801  }
2794 2802  
2795 2803  /*
2796 2804   * NFS Client initialization routine.  This routine should only be called
2797 2805   * once.  It performs the following tasks:
2798 2806   *      - Initialize all global locks
2799      - *      - Call sub-initialization routines (localize access to variables)
     2807 + *      - Call sub-initialization routines (localize access to variables)
2800 2808   */
2801 2809  int
2802 2810  nfs_clntinit(void)
2803 2811  {
2804 2812  #ifdef DEBUG
2805 2813          static boolean_t nfs_clntup = B_FALSE;
2806 2814  #endif
2807 2815          int error;
2808 2816  
2809 2817  #ifdef DEBUG
2810 2818          ASSERT(nfs_clntup == B_FALSE);
2811 2819  #endif
2812 2820  
2813 2821          error = nfs_subrinit();
2814 2822          if (error)
2815 2823                  return (error);
2816 2824  
2817 2825          error = nfs_vfsinit();
2818 2826          if (error) {
2819 2827                  /*
  
  
2820 2828                   * Cleanup nfs_subrinit() work
2821 2829                   */
2822 2830                  nfs_subrfini();
2823 2831                  return (error);
2824 2832          }
2825 2833          zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2826 2834              nfs_mi_destroy);
2827 2835  
2828 2836          nfs4_clnt_init();
2829 2837  
     2838 +        nfscmd_init();
     2839 +
2830 2840  #ifdef DEBUG
2831 2841          nfs_clntup = B_TRUE;
2832 2842  #endif
2833 2843  
2834 2844          return (0);
2835 2845  }
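
nfs_clntinit() follows the usual staged-initialization pattern: a later stage that fails unwinds only the stages that already succeeded, which is why a nfs_vfsinit() failure triggers nfs_subrfini().  A compact sketch of the pattern, with hypothetical stage_a/stage_b initializers:

/* Hypothetical stages; each init returns 0 on success or an error code. */
static int stage_a_init(void) { return (0); }
static void stage_a_fini(void) { }
static int stage_b_init(void) { return (0); }
static void stage_b_fini(void) { }

int
subsystem_init(void)
{
        int error;

        if ((error = stage_a_init()) != 0)
                return (error);                 /* nothing to unwind yet */

        if ((error = stage_b_init()) != 0) {
                stage_a_fini();                 /* undo only what succeeded */
                return (error);
        }
        return (0);
}

void
subsystem_fini(void)
{
        /* Tear down what subsystem_init() set up. */
        stage_b_fini();
        stage_a_fini();
}
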
2836 2846  
2837 2847  /*
2838 2848   * This routine is only called if the NFS Client has been initialized but
2839 2849   * the module failed to be installed. This routine will clean up the previously
2840 2850   * allocated/initialized work.
2841 2851   */
2842 2852  void
2843 2853  nfs_clntfini(void)
2844 2854  {
2845 2855          (void) zone_key_delete(mi_list_key);
2846 2856          nfs_subrfini();
2847 2857          nfs_vfsfini();
2848 2858          nfs4_clnt_fini();
     2859 +        nfscmd_fini();
2849 2860  }
2850 2861  
2851 2862  /*
2852 2863   * nfs_lockrelease:
2853 2864   *
2854 2865   * Release any locks on the given vnode that are held by the current
2855 2866   * process.
2856 2867   */
2857 2868  void
2858 2869  nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2859 2870  {
2860 2871          flock64_t ld;
2861 2872          struct shrlock shr;
2862 2873          char *buf;
2863 2874          int remote_lock_possible;
2864 2875          int ret;
2865 2876  
2866 2877          ASSERT((uintptr_t)vp > KERNELBASE);
2867 2878  
2868 2879          /*
2869 2880           * Generate an explicit unlock operation for the entire file.  As a
2870 2881           * partial optimization, only generate the unlock if there is a
2871 2882           * lock registered for the file.  We could check whether this
2872 2883           * particular process has any locks on the file, but that would
2873 2884           * require the local locking code to provide yet another query
2874 2885           * routine.  Note that no explicit synchronization is needed here.
2875 2886           * At worst, flk_has_remote_locks() will return a false positive,
2876 2887           * in which case the unlock call wastes time but doesn't harm
2877 2888           * correctness.
2878 2889           *
2879 2890           * In addition, an unlock request is generated if the process
2880 2891           * is listed as possibly having a lock on the file because the
2881 2892           * server and client lock managers may have gotten out of sync.
2882 2893           * N.B. It is important to make sure nfs_remove_locking_id() is
2883 2894           * called here even if flk_has_remote_locks(vp) reports true.
2884 2895           * If it is not called and there is an entry on the process id
2885 2896           * list, that entry will never get removed.
2886 2897           */
2887 2898          remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2888 2899              (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2889 2900          if (remote_lock_possible || flk_has_remote_locks(vp)) {
2890 2901                  ld.l_type = F_UNLCK;    /* set to unlock entire file */
2891 2902                  ld.l_whence = 0;        /* unlock from start of file */
2892 2903                  ld.l_start = 0;
2893 2904                  ld.l_len = 0;           /* do entire file */
2894 2905                  ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2895 2906                      NULL);
2896 2907  
2897 2908                  if (ret != 0) {
2898 2909                          /*
2899 2910                           * If VOP_FRLOCK fails, make sure we unregister
2900 2911                           * local locks before we continue.
2901 2912                           */
2902 2913                          ld.l_pid = ttoproc(curthread)->p_pid;
2903 2914                          lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2904 2915  #ifdef DEBUG
2905 2916                          nfs_perror(ret,
2906 2917                              "NFS lock release error on vp %p: %m.\n",
2907 2918                              (void *)vp, NULL);
2908 2919  #endif
2909 2920                  }
2910 2921  
2911 2922                  /*
2912 2923                   * The call to VOP_FRLOCK may put the pid back on the
2913 2924                   * list.  We need to remove it.
2914 2925                   */
2915 2926                  (void) nfs_remove_locking_id(vp, RLMPL_PID,
2916 2927                      (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2917 2928          }
2918 2929  
2919 2930          /*
2920 2931           * As long as the vp has a share matching our pid,
2921 2932           * pluck it off and unshare it.  There are circumstances in
2922 2933           * which the call to nfs_remove_locking_id() may put the
2923 2934           * owner back on the list, in which case we simply do a
2924 2935           * redundant and harmless unshare.
2925 2936           */
2926 2937          buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2927 2938          while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2928 2939              (char *)NULL, buf, &shr.s_own_len)) {
2929 2940                  shr.s_owner = buf;
2930 2941                  shr.s_access = 0;
2931 2942                  shr.s_deny = 0;
2932 2943                  shr.s_sysid = 0;
2933 2944                  shr.s_pid = curproc->p_pid;
2934 2945  
2935 2946                  ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2936 2947  #ifdef DEBUG
2937 2948                  if (ret != 0) {
2938 2949                          nfs_perror(ret,
2939 2950                              "NFS share release error on vp %p: %m.\n",
2940 2951                              (void *)vp, NULL);
2941 2952                  }
2942 2953  #endif
2943 2954          }
2944 2955          kmem_free(buf, MAX_SHR_OWNER_LEN);
2945 2956  }
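
The whole-file unlock built above (l_type = F_UNLCK, l_whence = 0, l_start = 0, and l_len = 0, where a zero length means "through end of file") has a direct user-space counterpart in fcntl(2).  A minimal example that releases any record locks the calling process holds on a descriptor; the file path is only illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct flock fl;
        int fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return (1);
        }

        fl.l_type = F_UNLCK;    /* release, rather than acquire */
        fl.l_whence = SEEK_SET; /* offsets relative to start of file */
        fl.l_start = 0;
        fl.l_len = 0;           /* zero length covers the whole file */

        if (fcntl(fd, F_SETLK, &fl) < 0)
                perror("fcntl(F_SETLK, F_UNLCK)");

        (void) close(fd);
        return (0);
}
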
2946 2957  
2947 2958  /*
2948 2959   * nfs_lockcompletion:
2949 2960   *
2950 2961   * If the vnode has a lock that makes it unsafe to cache the file, mark it
2951 2962   * as non-cacheable (set the VNOCACHE bit).
2952 2963   */
2953 2964  
2954 2965  void
2955 2966  nfs_lockcompletion(vnode_t *vp, int cmd)
2956 2967  {
2957 2968  #ifdef DEBUG
2958 2969          rnode_t *rp = VTOR(vp);
2959 2970  
2960 2971          ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2961 2972  #endif
2962 2973  
2963 2974          if (cmd == F_SETLK || cmd == F_SETLKW) {
2964 2975                  if (!lm_safemap(vp)) {
2965 2976                          mutex_enter(&vp->v_lock);
2966 2977                          vp->v_flag |= VNOCACHE;
2967 2978                          mutex_exit(&vp->v_lock);
2968 2979                  } else {
2969 2980                          mutex_enter(&vp->v_lock);
2970 2981                          vp->v_flag &= ~VNOCACHE;
2971 2982                          mutex_exit(&vp->v_lock);
2972 2983                  }
2973 2984          }
2974 2985          /*
2975 2986           * The cached attributes of the file are stale after acquiring
2976 2987           * the lock on the file. They were updated when the file was
2977 2988           * opened, but not updated when the lock was acquired. Therefore the
2978 2989           * cached attributes are invalidated after the lock is obtained.
2979 2990           */
2980 2991          PURGE_ATTRCACHE(vp);
2981 2992  }
2982 2993  
2983 2994  /*
2984 2995   * The lock manager holds state making it possible for the client
2985 2996   * and server to be out of sync.  For example, if the response from
2986 2997   * the server granting a lock request is lost, the server will think
2987 2998   * the lock is granted and the client will think the lock is lost.
2988 2999   * The client can tell when it is not sure whether it is in sync
2989 3000   * with the server.
2990 3001   *
2991 3002   * To deal with this, a list of processes for which the client is
2992 3003   * not sure if the server holds a lock is attached to the rnode.
2993 3004   * When such a process closes the rnode, an unlock request is sent
2994 3005   * to the server to unlock the entire file.
2995 3006   *
2996 3007   * The list is kept as a singly linked, NULL-terminated list.
2997 3008   * Because it is only added to under extreme error conditions, the
2998 3009   * list shouldn't get very big.  DEBUG kernels print a message if
2999 3010   * the list gets bigger than nfs_lmpl_high_water.  This is arbitrarily
3000 3011   * chosen to be 8, but can be tuned at runtime.
3001 3012   */
3002 3013  #ifdef DEBUG
3003 3014  /* int nfs_lmpl_high_water = 8; */
3004 3015  int nfs_lmpl_high_water = 128;
3005 3016  int nfs_cnt_add_locking_id = 0;
3006 3017  int nfs_len_add_locking_id = 0;
3007 3018  #endif /* DEBUG */
3008 3019  
3009 3020  /*
3010 3021   * Record that the nfs lock manager server may be holding a lock on
3011 3022   * a vnode for a process.
3012 3023   *
3013 3024   * Because the nfs lock manager server holds state, it is possible
3014 3025   * for the server to get out of sync with the client.  This routine is called
3015 3026   * from the client when it is no longer sure if the server is in sync
3016 3027   * with the client.  nfs_lockrelease() will then notice this and send
3017 3028   * an unlock request when the file is closed.
3018 3029   */
3019 3030  void
3020 3031  nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3021 3032  {
3022 3033          rnode_t *rp;
3023 3034          lmpl_t *new;
3024 3035          lmpl_t *cur;
3025 3036          lmpl_t **lmplp;
3026 3037  #ifdef DEBUG
3027 3038          int list_len = 1;
3028 3039  #endif /* DEBUG */
3029 3040  
3030 3041  #ifdef DEBUG
3031 3042          ++nfs_cnt_add_locking_id;
3032 3043  #endif /* DEBUG */
3033 3044          /*
3034 3045           * allocate new lmpl_t now so we don't sleep
3035 3046           * later after grabbing mutexes
3036 3047           */
3037 3048          ASSERT(len < MAX_SHR_OWNER_LEN);
3038 3049          new = kmem_alloc(sizeof (*new), KM_SLEEP);
3039 3050          new->lmpl_type = type;
3040 3051          new->lmpl_pid = pid;
3041 3052          new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3042 3053          bcopy(id, new->lmpl_owner, len);
3043 3054          new->lmpl_own_len = len;
3044 3055          new->lmpl_next = (lmpl_t *)NULL;
3045 3056  #ifdef DEBUG
3046 3057          if (type == RLMPL_PID) {
3047 3058                  ASSERT(len == sizeof (pid_t));
3048 3059                  ASSERT(pid == *(pid_t *)new->lmpl_owner);
3049 3060          } else {
3050 3061                  ASSERT(type == RLMPL_OWNER);
3051 3062          }
3052 3063  #endif
3053 3064  
3054 3065          rp = VTOR(vp);
3055 3066          mutex_enter(&rp->r_statelock);
3056 3067  
3057 3068          /*
3058 3069           * Add this id to the list for this rnode only if the
3059 3070           * rnode is active and the id is not already there.
3060 3071           */
3061 3072          ASSERT(rp->r_flags & RHASHED);
3062 3073          lmplp = &(rp->r_lmpl);
3063 3074          for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3064 3075                  if (cur->lmpl_pid == pid &&
3065 3076                      cur->lmpl_type == type &&
3066 3077                      cur->lmpl_own_len == len &&
3067 3078                      bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3068 3079                          kmem_free(new->lmpl_owner, len);
3069 3080                          kmem_free(new, sizeof (*new));
3070 3081                          break;
3071 3082                  }
3072 3083                  lmplp = &cur->lmpl_next;
3073 3084  #ifdef DEBUG
3074 3085                  ++list_len;
3075 3086  #endif /* DEBUG */
3076 3087          }
3077 3088          if (cur == (lmpl_t *)NULL) {
3078 3089                  *lmplp = new;
3079 3090  #ifdef DEBUG
3080 3091                  if (list_len > nfs_len_add_locking_id) {
3081 3092                          nfs_len_add_locking_id = list_len;
3082 3093                  }
3083 3094                  if (list_len > nfs_lmpl_high_water) {
3084 3095                          cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3085 3096                              "vp=%p is %d", (void *)vp, list_len);
3086 3097                  }
3087 3098  #endif /* DEBUG */
3088 3099          }
3089 3100  
3090 3101  #ifdef DEBUG
3091 3102          if (share_debug) {
3092 3103                  int nitems = 0;
3093 3104                  int npids = 0;
3094 3105                  int nowners = 0;
3095 3106  
3096 3107                  /*
3097 3108                   * Count the number of things left on r_lmpl after the add.
3098 3109                   */
3099 3110                  for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3100 3111                      cur = cur->lmpl_next) {
3101 3112                          nitems++;
3102 3113                          if (cur->lmpl_type == RLMPL_PID) {
3103 3114                                  npids++;
3104 3115                          } else if (cur->lmpl_type == RLMPL_OWNER) {
3105 3116                                  nowners++;
3106 3117                          } else {
3107 3118                                  cmn_err(CE_PANIC, "nfs_add_locking_id: "
3108 3119                                      "unrecognized lmpl_type %d",
3109 3120                                      cur->lmpl_type);
3110 3121                          }
3111 3122                  }
3112 3123  
3113 3124                  cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3114 3125                      "OWNs = %d items left on r_lmpl\n",
3115 3126                      (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3116 3127          }
3117 3128  #endif
3118 3129  
3119 3130          mutex_exit(&rp->r_statelock);
3120 3131  }
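
nfs_add_locking_id() allocates its lmpl_t before taking r_statelock precisely so that the KM_SLEEP allocation never happens while the mutex is held; if the entry already exists, the preallocated copy is simply freed.  A user-space sketch of that preallocate-then-insert pattern; the entry structure and list here are invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry    *next;
        int             key;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *head;

int
add_unique(int key)
{
        struct entry *new, *cur;

        /*
         * Allocate up front: the allocation may block, and (mirroring the
         * kernel rule for r_statelock) we don't want to sleep while
         * holding list_lock.
         */
        new = malloc(sizeof (*new));
        if (new == NULL)
                return (-1);
        new->key = key;
        new->next = NULL;

        pthread_mutex_lock(&list_lock);
        for (cur = head; cur != NULL; cur = cur->next) {
                if (cur->key == key) {          /* already present */
                        free(new);              /* discard the preallocation */
                        pthread_mutex_unlock(&list_lock);
                        return (0);
                }
        }
        new->next = head;                       /* not found: link it in */
        head = new;
        pthread_mutex_unlock(&list_lock);
        return (0);
}
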
3121 3132  
3122 3133  /*
3123 3134   * Remove an id from the lock manager id list.
3124 3135   *
3125 3136   * If the id is not in the list return 0.  If it was found and
3126 3137   * removed, return 1.
3127 3138   */
3128 3139  static int
3129 3140  nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3130 3141  {
3131 3142          lmpl_t *cur;
3132 3143          lmpl_t **lmplp;
3133 3144          rnode_t *rp;
3134 3145          int rv = 0;
3135 3146  
3136 3147          ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3137 3148  
3138 3149          rp = VTOR(vp);
3139 3150  
3140 3151          mutex_enter(&rp->r_statelock);
3141 3152          ASSERT(rp->r_flags & RHASHED);
3142 3153          lmplp = &(rp->r_lmpl);
3143 3154  
3144 3155          /*
3145 3156           * Search through the list and remove the entry for this id
3146 3157           * if it is there.  The special case id == NULL allows removal
3147 3158           * of the first share on the r_lmpl list belonging to the
3148 3159           * current process (if any), without regard to further details
3149 3160           * of its identity.
3150 3161           */
3151 3162          for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3152 3163                  if (cur->lmpl_type == type &&
3153 3164                      cur->lmpl_pid == curproc->p_pid &&
3154 3165                      (id == (char *)NULL ||
3155 3166                      bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3156 3167                          *lmplp = cur->lmpl_next;
3157 3168                          ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3158 3169                          if (rid != NULL) {
3159 3170                                  bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3160 3171                                  *rlen = cur->lmpl_own_len;
3161 3172                          }
3162 3173                          kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3163 3174                          kmem_free(cur, sizeof (*cur));
3164 3175                          rv = 1;
3165 3176                          break;
3166 3177                  }
3167 3178                  lmplp = &cur->lmpl_next;
3168 3179          }
3169 3180  
3170 3181  #ifdef DEBUG
3171 3182          if (share_debug) {
3172 3183                  int nitems = 0;
3173 3184                  int npids = 0;
3174 3185                  int nowners = 0;
3175 3186  
3176 3187                  /*
3177 3188                   * Count the number of things left on r_lmpl after the remove.
3178 3189                   */
3179 3190                  for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3180 3191                      cur = cur->lmpl_next) {
3181 3192                          nitems++;
3182 3193                          if (cur->lmpl_type == RLMPL_PID) {
3183 3194                                  npids++;
3184 3195                          } else if (cur->lmpl_type == RLMPL_OWNER) {
3185 3196                                  nowners++;
3186 3197                          } else {
3187 3198                                  cmn_err(CE_PANIC,
3188 3199                                      "nrli: unrecognized lmpl_type %d",
3189 3200                                      cur->lmpl_type);
3190 3201                          }
3191 3202                  }
3192 3203  
3193 3204                  cmn_err(CE_CONT,
3194 3205                  "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3195 3206                      (type == RLMPL_PID) ? "P" : "O",
3196 3207                      npids,
3197 3208                      nowners,
3198 3209                      nitems);
3199 3210          }
3200 3211  #endif
3201 3212  
3202 3213          mutex_exit(&rp->r_statelock);
3203 3214          return (rv);
3204 3215  }
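
Both walks of r_lmpl carry a pointer-to-pointer (lmplp) so that unlinking an element needs no special case for the head of the list: *lmplp is always the link that points at the current node.  A stripped-down sketch of that unlink idiom:

#include <stdlib.h>

struct item {
        struct item     *next;
        int             key;
};

/*
 * Remove the first element whose key matches, returning 1 if one was
 * found and freed, 0 otherwise.  *headp plays the role of rp->r_lmpl.
 */
int
remove_key(struct item **headp, int key)
{
        struct item **linkp, *cur;

        for (linkp = headp; (cur = *linkp) != NULL; linkp = &cur->next) {
                if (cur->key == key) {
                        *linkp = cur->next;     /* unlink: works for head too */
                        free(cur);
                        return (1);
                }
        }
        return (0);
}
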
3205 3216  
3206 3217  void
3207 3218  nfs_free_mi(mntinfo_t *mi)
3208 3219  {
3209 3220          ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3210 3221          ASSERT(mi->mi_manager_thread == NULL);
3211 3222          ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3212 3223              mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3213 3224  
3214 3225          /*
3215 3226           * Remove the node from the global list before we start tearing it down.
3216 3227           */
3217 3228          nfs_mi_zonelist_remove(mi);
3218 3229          if (mi->mi_klmconfig) {
3219 3230                  lm_free_config(mi->mi_klmconfig);
3220 3231                  kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3221 3232          }
3222 3233          mutex_destroy(&mi->mi_lock);
3223 3234          mutex_destroy(&mi->mi_remap_lock);
3224 3235          mutex_destroy(&mi->mi_async_lock);
3225 3236          mutex_destroy(&mi->mi_rnodes_lock);
3226 3237          cv_destroy(&mi->mi_failover_cv);
3227 3238          cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3228 3239          cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3229 3240          cv_destroy(&mi->mi_async_reqs_cv);
3230 3241          cv_destroy(&mi->mi_async_cv);
3231 3242          list_destroy(&mi->mi_rnodes);
3232 3243          zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3233 3244          kmem_free(mi, sizeof (*mi));
3234 3245  }
3235 3246  
3236 3247  static int
3237 3248  mnt_kstat_update(kstat_t *ksp, int rw)
3238 3249  {
3239 3250          mntinfo_t *mi;
3240 3251          struct mntinfo_kstat *mik;
3241 3252          vfs_t *vfsp;
3242 3253          int i;
3243 3254  
3244 3255          /* this is a read-only kstat. Bail out on a write */
3245 3256          if (rw == KSTAT_WRITE)
3246 3257                  return (EACCES);
3247 3258  
3248 3259          /*
3249 3260           * We don't want to wait here as kstat_chain_lock could be held by
3250 3261           * dounmount(). dounmount() takes vfs_reflock before the chain lock
3251 3262           * and thus could lead to a deadlock.
3252 3263           */
3253 3264          vfsp = (struct vfs *)ksp->ks_private;
3254 3265  
3255 3266  
3256 3267          mi = VFTOMI(vfsp);
3257 3268  
3258 3269          mik = (struct mntinfo_kstat *)ksp->ks_data;
3259 3270  
3260 3271          (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3261 3272          mik->mik_vers = (uint32_t)mi->mi_vers;
3262 3273          mik->mik_flags = mi->mi_flags;
3263 3274          mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3264 3275          mik->mik_curread = (uint32_t)mi->mi_curread;
3265 3276          mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3266 3277          mik->mik_retrans = mi->mi_retrans;
3267 3278          mik->mik_timeo = mi->mi_timeo;
3268 3279          mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3269 3280          mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3270 3281          mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3271 3282          mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3272 3283          for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3273 3284                  mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3274 3285                  mik->mik_timers[i].deviate =
3275 3286                      (uint32_t)mi->mi_timers[i].rt_deviate;
3276 3287                  mik->mik_timers[i].rtxcur =
3277 3288                      (uint32_t)mi->mi_timers[i].rt_rtxcur;
3278 3289          }
3279 3290          mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3280 3291          mik->mik_failover = (uint32_t)mi->mi_failover;
3281 3292          mik->mik_remap = (uint32_t)mi->mi_remap;
3282 3293          (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3283 3294  
3284 3295          return (0);
3285 3296  }
3286 3297  
3287 3298  void
3288 3299  nfs_mnt_kstat_init(struct vfs *vfsp)
3289 3300  {
3290 3301          mntinfo_t *mi = VFTOMI(vfsp);
3291 3302  
3292 3303          /*
3293 3304           * Create the version specific kstats.
3294 3305           *
3295 3306           * PSARC 2001/697 Contract Private Interface
3296 3307           * All nfs kstats are under SunMC contract
3297 3308           * Please refer to the PSARC listed above and contact
3298 3309           * SunMC before making any changes!
3299 3310           *
3300 3311           * Changes must be reviewed by Solaris File Sharing
3301 3312           * Changes must be communicated to contract-2001-697@sun.com
3302 3313           *
3303 3314           */
3304 3315  
3305 3316          mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3306 3317              NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3307 3318          if (mi->mi_io_kstats) {
3308 3319                  if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3309 3320                          kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3310 3321                  mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3311 3322                  kstat_install(mi->mi_io_kstats);
3312 3323          }
3313 3324  
3314 3325          if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3315 3326              getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3316 3327              sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3317 3328                  if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3318 3329                          kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3319 3330                  mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3320 3331                  mi->mi_ro_kstats->ks_private = (void *)vfsp;
3321 3332                  kstat_install(mi->mi_ro_kstats);
3322 3333          }
3323 3334  }
3324 3335  
3325 3336  nfs_delmapcall_t *
3326 3337  nfs_init_delmapcall()
3327 3338  {
3328 3339          nfs_delmapcall_t        *delmap_call;
3329 3340  
3330 3341          delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3331 3342          delmap_call->call_id = curthread;
3332 3343          delmap_call->error = 0;
3333 3344  
3334 3345          return (delmap_call);
3335 3346  }
3336 3347  
3337 3348  void
3338 3349  nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
  
  
3339 3350  {
3340 3351          kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3341 3352  }
3342 3353  
3343 3354  /*
3344 3355   * Searches for the current delmap caller (based on curthread) in the list of
3345 3356   * callers.  If it is found, we remove it and free the delmap caller.
3346 3357   * Returns:
3347 3358   *      0 if the caller wasn't found
3348 3359   *      1 if the caller was found, removed and freed.  *errp is set to what
3349      - *      the result of the delmap was.
     3360 + *      the result of the delmap was.
3350 3361   */
3351 3362  int
3352 3363  nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3353 3364  {
3354 3365          nfs_delmapcall_t        *delmap_call;
3355 3366  
3356 3367          /*
3357 3368           * If the list doesn't exist yet, we create it and return
3358 3369           * that the caller wasn't found.  No list = no callers.
3359 3370           */
3360 3371          mutex_enter(&rp->r_statelock);
3361 3372          if (!(rp->r_flags & RDELMAPLIST)) {
3362 3373                  /* The list does not exist */
3363 3374                  list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3364 3375                      offsetof(nfs_delmapcall_t, call_node));
3365 3376                  rp->r_flags |= RDELMAPLIST;
3366 3377                  mutex_exit(&rp->r_statelock);
3367 3378                  return (0);
3368 3379          } else {
3369 3380                  /* The list exists so search it */
3370 3381                  for (delmap_call = list_head(&rp->r_indelmap);
3371 3382                      delmap_call != NULL;
3372 3383                      delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3373 3384                          if (delmap_call->call_id == curthread) {
3374 3385                                  /* current caller is in the list */
3375 3386                                  *errp = delmap_call->error;
3376 3387                                  list_remove(&rp->r_indelmap, delmap_call);
3377 3388                                  mutex_exit(&rp->r_statelock);
3378 3389                                  nfs_free_delmapcall(delmap_call);
3379 3390                                  return (1);
3380 3391                          }
3381 3392                  }
3382 3393          }
3383 3394          mutex_exit(&rp->r_statelock);
3384 3395          return (0);
3385 3396  }
  