re #13613 rb4516 Tunables needs volatile keyword
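The diff below changes the declaration of the nfs_async_timeout tunable from "extern int" to "extern volatile int", so the compiler re-reads the variable on every use instead of working from a cached copy when the value is patched at run time. A minimal user-space sketch of that pattern follows; the NFS_ASYNC_TIMEOUT default value and the main() wrapper are illustrative only and are not taken from this file:

#include <stdio.h>

#define	NFS_ASYNC_TIMEOUT	6000	/* illustrative default, in clock ticks */

/*
 * Stand-in for a run-time tunable such as nfs_async_timeout: the volatile
 * qualifier forces every access to reload the current value, so a patch
 * applied while the code is running (e.g. with a kernel debugger) takes
 * effect instead of being optimized away.
 */
volatile int nfs_async_timeout = -1;

int
main(void)
{
	/* Dynamic initialization, as done in nfs4_async_common_start(). */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	printf("nfs_async_timeout = %d\n", nfs_async_timeout);
	return (0);
}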
--- old/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_client.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 * Copyright (c) 2017 by Delphix. All rights reserved.
24 25 */
25 26
26 27 /*
27 28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 29 * All Rights Reserved
29 30 */
30 31
31 32 #include <sys/param.h>
32 33 #include <sys/types.h>
33 34 #include <sys/systm.h>
34 35 #include <sys/thread.h>
35 36 #include <sys/t_lock.h>
36 37 #include <sys/time.h>
37 38 #include <sys/vnode.h>
38 39 #include <sys/vfs.h>
39 40 #include <sys/errno.h>
40 41 #include <sys/buf.h>
41 42 #include <sys/stat.h>
42 43 #include <sys/cred.h>
43 44 #include <sys/kmem.h>
44 45 #include <sys/debug.h>
45 46 #include <sys/dnlc.h>
46 47 #include <sys/vmsystm.h>
47 48 #include <sys/flock.h>
48 49 #include <sys/share.h>
49 50 #include <sys/cmn_err.h>
50 51 #include <sys/tiuser.h>
51 52 #include <sys/sysmacros.h>
52 53 #include <sys/callb.h>
53 54 #include <sys/acl.h>
54 55 #include <sys/kstat.h>
55 56 #include <sys/signal.h>
56 57 #include <sys/disp.h>
57 58 #include <sys/atomic.h>
58 59 #include <sys/list.h>
59 60 #include <sys/sdt.h>
60 61
61 62 #include <rpc/types.h>
62 63 #include <rpc/xdr.h>
63 64 #include <rpc/auth.h>
64 65 #include <rpc/clnt.h>
65 66
66 67 #include <nfs/nfs.h>
67 68 #include <nfs/nfs_clnt.h>
68 69 #include <nfs/nfs_acl.h>
69 70
70 71 #include <nfs/nfs4.h>
71 72 #include <nfs/rnode4.h>
72 73 #include <nfs/nfs4_clnt.h>
73 74
74 75 #include <vm/hat.h>
75 76 #include <vm/as.h>
76 77 #include <vm/page.h>
77 78 #include <vm/pvn.h>
78 79 #include <vm/seg.h>
79 80 #include <vm/seg_map.h>
80 81 #include <vm/seg_vn.h>
81 82
82 83 #include <sys/ddi.h>
83 84
84 85 /*
85 86 * Arguments to page-flush thread.
86 87 */
87 88 typedef struct {
88 89 vnode_t *vp;
89 90 cred_t *cr;
90 91 } pgflush_t;
91 92
92 93 #ifdef DEBUG
93 94 int nfs4_client_lease_debug;
94 95 int nfs4_sharedfh_debug;
95 96 int nfs4_fname_debug;
96 97
97 98 /* temporary: panic if v_type is inconsistent with r_attr va_type */
98 99 int nfs4_vtype_debug;
99 100
100 101 uint_t nfs4_tsd_key;
101 102 #endif
102 103
103 104 static time_t nfs4_client_resumed = 0;
104 105 static callb_id_t cid = 0;
105 106
106 107 static int nfs4renew(nfs4_server_t *);
107 108 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
108 109 static void nfs4_pgflush_thread(pgflush_t *);
109 110
110 111 static boolean_t nfs4_client_cpr_callb(void *, int);
111 112
112 113 struct mi4_globals {
113 114 kmutex_t mig_lock; /* lock protecting mig_list */
114 115 list_t mig_list; /* list of NFS v4 mounts in zone */
115 116 boolean_t mig_destructor_called;
116 117 };
117 118
118 119 static zone_key_t mi4_list_key;
119 120
120 121 /*
121 122 * Attributes caching:
122 123 *
123 124 * Attributes are cached in the rnode in struct vattr form.
124 125 * There is a time associated with the cached attributes (r_time_attr_inval)
125 126 * which tells whether the attributes are valid. The time is initialized
126 127 * to the difference between current time and the modify time of the vnode
127 128 * when new attributes are cached. This allows the attributes for
128 129 * files that have changed recently to be timed out sooner than for files
129 130 * that have not changed for a long time. There are minimum and maximum
130 131 * timeout values that can be set per mount point.
131 132 */
132 133
133 134 /*
134 135 * If a cache purge is in progress, wait for it to finish.
135 136 *
136 137 * The current thread must not be in the middle of an
137 138 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
138 139 * between this thread, a recovery thread, and the page flush thread.
139 140 */
140 141 int
141 142 nfs4_waitfor_purge_complete(vnode_t *vp)
142 143 {
143 144 rnode4_t *rp;
144 145 k_sigset_t smask;
145 146
146 147 rp = VTOR4(vp);
147 148 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 149 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 150 mutex_enter(&rp->r_statelock);
150 151 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 152 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 153 ((rp->r_flags & R4PGFLUSH) &&
153 154 rp->r_pgflush != curthread)) {
154 155 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 156 sigunintr(&smask);
156 157 mutex_exit(&rp->r_statelock);
157 158 return (EINTR);
158 159 }
159 160 }
160 161 sigunintr(&smask);
161 162 mutex_exit(&rp->r_statelock);
162 163 }
163 164 return (0);
164 165 }
165 166
166 167 /*
167 168 * Validate caches by checking cached attributes. If they have timed out,
168 169 * then get new attributes from the server. As a side effect, cache
169 170 * invalidation is done if the attributes have changed.
170 171 *
171 172 * If the attributes have not timed out and if there is a cache
172 173 * invalidation being done by some other thread, then wait until that
173 174 * thread has completed the cache invalidation.
174 175 */
175 176 int
176 177 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 178 {
178 179 int error;
179 180 nfs4_ga_res_t gar;
180 181
181 182 if (ATTRCACHE4_VALID(vp)) {
182 183 error = nfs4_waitfor_purge_complete(vp);
183 184 if (error)
184 185 return (error);
185 186 return (0);
186 187 }
187 188
188 189 return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 190 }
190 191
191 192 /*
192 193 * Fill in attribute from the cache.
193 194 * If valid, then return 0 to indicate that no error occurred,
194 195 * otherwise return 1 to indicate that an error occurred.
195 196 */
196 197 static int
197 198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 199 {
199 200 rnode4_t *rp;
200 201
201 202 rp = VTOR4(vp);
202 203 mutex_enter(&rp->r_statelock);
203 204 mutex_enter(&rp->r_statev4_lock);
204 205 if (ATTRCACHE4_VALID(vp)) {
205 206 mutex_exit(&rp->r_statev4_lock);
206 207 /*
207 208 * Cached attributes are valid
208 209 */
209 210 *vap = rp->r_attr;
210 211 mutex_exit(&rp->r_statelock);
211 212 return (0);
212 213 }
213 214 mutex_exit(&rp->r_statev4_lock);
214 215 mutex_exit(&rp->r_statelock);
215 216 return (1);
216 217 }
217 218
218 219
219 220 /*
220 221 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
221 222 * call is synchronous because all the pages were invalidated by the
222 223 * nfs4_invalidate_pages() call.
223 224 */
224 225 void
225 226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 227 {
227 228 struct rnode4 *rp = VTOR4(vp);
228 229
229 230 /* Ensure that the ..._end_op() call has been done */
230 231 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 232
232 233 if (errno != ESTALE)
233 234 return;
234 235
235 236 mutex_enter(&rp->r_statelock);
236 237 rp->r_flags |= R4STALE;
237 238 if (!rp->r_error)
238 239 rp->r_error = errno;
239 240 mutex_exit(&rp->r_statelock);
240 241 if (nfs4_has_pages(vp))
241 242 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 243 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 244 }
244 245
245 246 /*
246 247 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
247 248 * page purge is done asynchronously.
248 249 */
249 250 void
250 251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 252 {
252 253 rnode4_t *rp;
253 254 char *contents;
254 255 vnode_t *xattr;
255 256 int size;
256 257 int pgflush; /* are we the page flush thread? */
257 258
258 259 /*
259 260 * Purge the DNLC for any entries which refer to this file.
260 261 */
261 262 if (vp->v_count > 1 &&
262 263 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 264 dnlc_purge_vp(vp);
264 265
265 266 /*
266 267 * Clear any readdir state bits and purge the readlink response cache.
267 268 */
268 269 rp = VTOR4(vp);
269 270 mutex_enter(&rp->r_statelock);
270 271 rp->r_flags &= ~R4LOOKUP;
271 272 contents = rp->r_symlink.contents;
272 273 size = rp->r_symlink.size;
273 274 rp->r_symlink.contents = NULL;
274 275
275 276 xattr = rp->r_xattr_dir;
276 277 rp->r_xattr_dir = NULL;
277 278
278 279 /*
279 280 * Purge pathconf cache too.
280 281 */
281 282 rp->r_pathconf.pc4_xattr_valid = 0;
282 283 rp->r_pathconf.pc4_cache_valid = 0;
283 284
284 285 pgflush = (curthread == rp->r_pgflush);
285 286 mutex_exit(&rp->r_statelock);
286 287
287 288 if (contents != NULL) {
288 289
289 290 kmem_free((void *)contents, size);
290 291 }
291 292
292 293 if (xattr != NULL)
293 294 VN_RELE(xattr);
294 295
295 296 /*
296 297 * Flush the page cache. If the current thread is the page flush
297 298 * thread, don't initiate a new page flush. There's no need for
298 299 * it, and doing it correctly is hard.
299 300 */
300 301 if (nfs4_has_pages(vp) && !pgflush) {
301 302 if (!asyncpg) {
302 303 (void) nfs4_waitfor_purge_complete(vp);
303 304 nfs4_flush_pages(vp, cr);
304 305 } else {
305 306 pgflush_t *args;
306 307
307 308 /*
308 309 * We don't hold r_statelock while creating the
309 310 * thread, in case the call blocks. So we use a
310 311 * flag to indicate that a page flush thread is
311 312 * active.
312 313 */
313 314 mutex_enter(&rp->r_statelock);
314 315 if (rp->r_flags & R4PGFLUSH) {
315 316 mutex_exit(&rp->r_statelock);
316 317 } else {
317 318 rp->r_flags |= R4PGFLUSH;
318 319 mutex_exit(&rp->r_statelock);
319 320
320 321 args = kmem_alloc(sizeof (pgflush_t),
321 322 KM_SLEEP);
322 323 args->vp = vp;
323 324 VN_HOLD(args->vp);
324 325 args->cr = cr;
325 326 crhold(args->cr);
326 327 (void) zthread_create(NULL, 0,
327 328 nfs4_pgflush_thread, args, 0,
328 329 minclsyspri);
329 330 }
330 331 }
331 332 }
332 333
333 334 /*
334 335 * Flush the readdir response cache.
335 336 */
336 337 nfs4_purge_rddir_cache(vp);
337 338 }
338 339
339 340 /*
340 341 * Invalidate all pages for the given file, after writing back the dirty
341 342 * ones.
342 343 */
343 344
344 345 void
345 346 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 347 {
347 348 int error;
348 349 rnode4_t *rp = VTOR4(vp);
349 350
350 351 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 352 if (error == ENOSPC || error == EDQUOT) {
352 353 mutex_enter(&rp->r_statelock);
353 354 if (!rp->r_error)
354 355 rp->r_error = error;
355 356 mutex_exit(&rp->r_statelock);
356 357 }
357 358 }
358 359
359 360 /*
360 361 * Page flush thread.
361 362 */
362 363
363 364 static void
364 365 nfs4_pgflush_thread(pgflush_t *args)
365 366 {
366 367 rnode4_t *rp = VTOR4(args->vp);
367 368
368 369 /* remember which thread we are, so we don't deadlock ourselves */
369 370 mutex_enter(&rp->r_statelock);
370 371 ASSERT(rp->r_pgflush == NULL);
371 372 rp->r_pgflush = curthread;
372 373 mutex_exit(&rp->r_statelock);
373 374
374 375 nfs4_flush_pages(args->vp, args->cr);
375 376
376 377 mutex_enter(&rp->r_statelock);
377 378 rp->r_pgflush = NULL;
378 379 rp->r_flags &= ~R4PGFLUSH;
379 380 cv_broadcast(&rp->r_cv);
380 381 mutex_exit(&rp->r_statelock);
381 382
382 383 VN_RELE(args->vp);
383 384 crfree(args->cr);
384 385 kmem_free(args, sizeof (pgflush_t));
385 386 zthread_exit();
386 387 }
387 388
388 389 /*
389 390 * Purge the readdir cache of all entries which are not currently
390 391 * being filled.
391 392 */
392 393 void
393 394 nfs4_purge_rddir_cache(vnode_t *vp)
394 395 {
395 396 rnode4_t *rp;
396 397
397 398 rp = VTOR4(vp);
398 399
399 400 mutex_enter(&rp->r_statelock);
400 401 rp->r_direof = NULL;
401 402 rp->r_flags &= ~R4LOOKUP;
402 403 rp->r_flags |= R4READDIRWATTR;
403 404 rddir4_cache_purge(rp);
404 405 mutex_exit(&rp->r_statelock);
405 406 }
406 407
407 408 /*
408 409 * Set attributes cache for given vnode using virtual attributes. There is
409 410 * no cache validation, but if the attributes are deemed to be stale, they
410 411 * are ignored. This corresponds to nfs3_attrcache().
411 412 *
412 413 * Set the timeout value on the attribute cache and fill it
413 414 * with the passed in attributes.
414 415 */
415 416 void
416 417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 418 {
418 419 rnode4_t *rp = VTOR4(vp);
419 420
420 421 mutex_enter(&rp->r_statelock);
421 422 if (rp->r_time_attr_saved <= t)
422 423 nfs4_attrcache_va(vp, garp, FALSE);
423 424 mutex_exit(&rp->r_statelock);
424 425 }
425 426
426 427 /*
427 428 * Use the passed in virtual attributes to check to see whether the
428 429 * data and metadata caches are valid, cache the new attributes, and
429 430 * then do the cache invalidation if required.
430 431 *
431 432 * The cache validation and caching of the new attributes is done
432 433 * atomically via the use of the mutex, r_statelock. If required,
433 434 * the cache invalidation is done atomically w.r.t. the cache
434 435 * validation and caching of the attributes via the pseudo lock,
435 436 * r_serial.
436 437 *
437 438 * This routine is used to do cache validation and attributes caching
438 439 * for operations with a single set of post operation attributes.
439 440 */
440 441
441 442 void
442 443 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443 444 hrtime_t t, cred_t *cr, int async,
444 445 change_info4 *cinfo)
445 446 {
446 447 rnode4_t *rp;
447 448 int mtime_changed = 0;
448 449 int ctime_changed = 0;
449 450 vsecattr_t *vsp;
450 451 int was_serial, set_time_cache_inval, recov;
451 452 vattr_t *vap = &garp->n4g_va;
452 453 mntinfo4_t *mi = VTOMI4(vp);
453 454 len_t preattr_rsize;
454 455 boolean_t writemodify_set = B_FALSE;
455 456 boolean_t cachepurge_set = B_FALSE;
456 457
457 458 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458 459
459 460 /* Is curthread the recovery thread? */
460 461 mutex_enter(&mi->mi_lock);
461 462 recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 463 mutex_exit(&mi->mi_lock);
463 464
464 465 rp = VTOR4(vp);
465 466 mutex_enter(&rp->r_statelock);
466 467 was_serial = (rp->r_serial == curthread);
467 468 if (rp->r_serial && !was_serial) {
468 469 klwp_t *lwp = ttolwp(curthread);
469 470
470 471 /*
471 472 * If we're the recovery thread, then purge current attrs
472 473 * and bail out to avoid potential deadlock between another
473 474 * thread caching attrs (r_serial thread), recov thread,
474 475 * and an async writer thread.
475 476 */
476 477 if (recov) {
477 478 PURGE_ATTRCACHE4_LOCKED(rp);
478 479 mutex_exit(&rp->r_statelock);
479 480 return;
480 481 }
481 482
482 483 if (lwp != NULL)
483 484 lwp->lwp_nostop++;
484 485 while (rp->r_serial != NULL) {
485 486 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
486 487 mutex_exit(&rp->r_statelock);
487 488 if (lwp != NULL)
488 489 lwp->lwp_nostop--;
489 490 return;
490 491 }
491 492 }
492 493 if (lwp != NULL)
493 494 lwp->lwp_nostop--;
494 495 }
495 496
496 497 /*
497 498 * If there is a page flush thread, the current thread needs to
498 499 * bail out, to prevent a possible deadlock between the current
499 500 * thread (which might be in a start_op/end_op region), the
500 501 * recovery thread, and the page flush thread. Expire the
501 502 * attribute cache, so that any attributes the current thread was
502 503 * going to set are not lost.
503 504 */
504 505 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
505 506 PURGE_ATTRCACHE4_LOCKED(rp);
506 507 mutex_exit(&rp->r_statelock);
507 508 return;
508 509 }
509 510
510 511 if (rp->r_time_attr_saved > t) {
511 512 /*
512 513 * Attributes have been cached since these attributes were
513 514 * probably made. If there is an inconsistency in what is
514 515 * cached, mark them invalid. If not, don't act on them.
515 516 */
516 517 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
517 518 PURGE_ATTRCACHE4_LOCKED(rp);
518 519 mutex_exit(&rp->r_statelock);
519 520 return;
520 521 }
521 522 set_time_cache_inval = 0;
522 523 if (cinfo) {
523 524 /*
524 525 * Only directory modifying callers pass non-NULL cinfo.
525 526 */
526 527 ASSERT(vp->v_type == VDIR);
527 528 /*
528 529 * If the cache timeout either doesn't exist or hasn't expired,
529 530 * and dir didn't change on server before dirmod op
530 531 * and dir didn't change after dirmod op but before getattr
531 532 * then there's a chance that the client's cached data for
532 533 * this object is current (not stale). No immediate cache
533 534 * flush is required.
534 535 *
535 536 */
536 537 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
537 538 cinfo->before == rp->r_change &&
538 539 (garp->n4g_change_valid &&
539 540 cinfo->after == garp->n4g_change)) {
540 541
541 542 /*
542 543 * If atomic isn't set, then the before/after info
543 544 * cannot be blindly trusted. For this case, we tell
544 545 * nfs4_attrcache_va to cache the attrs but also
545 546 * establish an absolute maximum cache timeout. When
546 547 * the timeout is reached, caches will be flushed.
547 548 */
548 549 if (! cinfo->atomic)
549 550 set_time_cache_inval = 1;
550 551 } else {
551 552
552 553 /*
553 554 * We're not sure exactly what changed, but we know
554 555 * what to do. flush all caches for dir. remove the
555 556 * attr timeout.
556 557 *
557 558 * a) timeout expired. flush all caches.
558 559 * b) r_change != cinfo.before. flush all caches.
559 560 * c) r_change == cinfo.before, but cinfo.after !=
560 561 * post-op getattr(change). flush all caches.
561 562 * d) post-op getattr(change) not provided by server.
562 563 * flush all caches.
563 564 */
564 565 mtime_changed = 1;
565 566 ctime_changed = 1;
566 567 rp->r_time_cache_inval = 0;
567 568 }
568 569 } else {
569 570 /*
570 571 * Write thread after writing data to file on remote server,
571 572 * will always set R4WRITEMODIFIED to indicate that file on
572 573 * remote server was modified with a WRITE operation and would
573 574 * have marked attribute cache as timed out. If R4WRITEMODIFIED
574 575 * is set, then do not check for mtime and ctime change.
575 576 */
576 577 if (!(rp->r_flags & R4WRITEMODIFIED)) {
577 578 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
578 579 mtime_changed = 1;
579 580
580 581 if (rp->r_attr.va_ctime.tv_sec !=
581 582 vap->va_ctime.tv_sec ||
582 583 rp->r_attr.va_ctime.tv_nsec !=
583 584 vap->va_ctime.tv_nsec)
584 585 ctime_changed = 1;
585 586
586 587 /*
587 588 * If the change attribute was not provided by server
588 589 * or it differs, then flush all caches.
589 590 */
590 591 if (!garp->n4g_change_valid ||
591 592 rp->r_change != garp->n4g_change) {
592 593 mtime_changed = 1;
593 594 ctime_changed = 1;
594 595 }
595 596 } else {
596 597 writemodify_set = B_TRUE;
597 598 }
598 599 }
599 600
600 601 preattr_rsize = rp->r_size;
601 602
602 603 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
603 604
604 605 /*
605 606 * If we have updated filesize in nfs4_attrcache_va, as soon as we
606 607 * drop statelock we will be in transition of purging all
607 608 * our caches and updating them. It is possible for another
608 609 * thread to pick up this new file size and read in zeroed data.
609 610 * Stall other threads until the cache purge is complete.
610 611 */
611 612 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
612 613 /*
613 614 * If R4WRITEMODIFIED was set and we have updated the file
614 615 * size, Server's returned file size need not necessarily
615 616 * be because of this Client's WRITE. We need to purge
616 617 * all caches.
617 618 */
618 619 if (writemodify_set)
619 620 mtime_changed = 1;
620 621
621 622 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
622 623 rp->r_flags |= R4INCACHEPURGE;
623 624 cachepurge_set = B_TRUE;
624 625 }
625 626 }
626 627
627 628 if (!mtime_changed && !ctime_changed) {
628 629 mutex_exit(&rp->r_statelock);
629 630 return;
630 631 }
631 632
632 633 rp->r_serial = curthread;
633 634
634 635 mutex_exit(&rp->r_statelock);
635 636
636 637 /*
637 638 * If we're the recov thread, then force async nfs4_purge_caches
638 639 * to avoid potential deadlock.
639 640 */
640 641 if (mtime_changed)
641 642 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
642 643
643 644 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
644 645 mutex_enter(&rp->r_statelock);
645 646 rp->r_flags &= ~R4INCACHEPURGE;
646 647 cv_broadcast(&rp->r_cv);
647 648 mutex_exit(&rp->r_statelock);
648 649 cachepurge_set = B_FALSE;
649 650 }
650 651
651 652 if (ctime_changed) {
652 653 (void) nfs4_access_purge_rp(rp);
653 654 if (rp->r_secattr != NULL) {
654 655 mutex_enter(&rp->r_statelock);
655 656 vsp = rp->r_secattr;
656 657 rp->r_secattr = NULL;
657 658 mutex_exit(&rp->r_statelock);
658 659 if (vsp != NULL)
659 660 nfs4_acl_free_cache(vsp);
660 661 }
661 662 }
662 663
663 664 if (!was_serial) {
664 665 mutex_enter(&rp->r_statelock);
665 666 rp->r_serial = NULL;
666 667 cv_broadcast(&rp->r_cv);
667 668 mutex_exit(&rp->r_statelock);
668 669 }
669 670 }
670 671
671 672 /*
672 673 * Set attributes cache for given vnode using virtual attributes.
673 674 *
674 675 * Set the timeout value on the attribute cache and fill it
675 676 * with the passed in attributes.
676 677 *
677 678 * The caller must be holding r_statelock.
678 679 */
679 680 static void
680 681 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
681 682 {
682 683 rnode4_t *rp;
683 684 mntinfo4_t *mi;
684 685 hrtime_t delta;
685 686 hrtime_t now;
686 687 vattr_t *vap = &garp->n4g_va;
687 688
688 689 rp = VTOR4(vp);
689 690
690 691 ASSERT(MUTEX_HELD(&rp->r_statelock));
691 692 ASSERT(vap->va_mask == AT_ALL);
692 693
693 694 /* Switch to master before checking v_flag */
694 695 if (IS_SHADOW(vp, rp))
695 696 vp = RTOV4(rp);
696 697
697 698 now = gethrtime();
698 699
699 700 mi = VTOMI4(vp);
700 701
701 702 /*
702 703 * Only establish a new cache timeout (if requested). Never
703 704 * extend a timeout. Never clear a timeout. Clearing a timeout
704 705 * is done by nfs4_update_dircaches (ancestor in our call chain)
705 706 */
706 707 if (set_cache_timeout && ! rp->r_time_cache_inval)
707 708 rp->r_time_cache_inval = now + mi->mi_acdirmax;
708 709
709 710 /*
710 711 * Delta is the number of nanoseconds that we will
711 712 * cache the attributes of the file. It is based on
712 713 * the number of nanoseconds since the last time that
713 714 * we detected a change. The assumption is that files
714 715 * that changed recently are likely to change again.
715 716 * There is a minimum and a maximum for regular files
716 717 * and for directories which is enforced though.
717 718 *
718 719 * Using the time since last change was detected
719 720 * eliminates direct comparison or calculation
720 721 * using mixed client and server times. NFS does
721 722 * not make any assumptions regarding the client
722 723 * and server clocks being synchronized.
723 724 */
724 725 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
725 726 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
726 727 vap->va_size != rp->r_attr.va_size) {
727 728 rp->r_time_attr_saved = now;
728 729 }
729 730
730 731 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
731 732 delta = 0;
732 733 else {
733 734 delta = now - rp->r_time_attr_saved;
734 735 if (vp->v_type == VDIR) {
735 736 if (delta < mi->mi_acdirmin)
736 737 delta = mi->mi_acdirmin;
737 738 else if (delta > mi->mi_acdirmax)
738 739 delta = mi->mi_acdirmax;
739 740 } else {
740 741 if (delta < mi->mi_acregmin)
741 742 delta = mi->mi_acregmin;
742 743 else if (delta > mi->mi_acregmax)
743 744 delta = mi->mi_acregmax;
744 745 }
745 746 }
746 747 rp->r_time_attr_inval = now + delta;
747 748
748 749 rp->r_attr = *vap;
749 750 if (garp->n4g_change_valid)
750 751 rp->r_change = garp->n4g_change;
751 752
752 753 /*
753 754 * The attributes that were returned may be valid and can
754 755 * be used, but they may not be allowed to be cached.
755 756 * Reset the timers to cause immediate invalidation and
756 757 * clear r_change so no VERIFY operations will succeed
757 758 */
758 759 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
759 760 rp->r_time_attr_inval = now;
760 761 rp->r_time_attr_saved = now;
761 762 rp->r_change = 0;
762 763 }
763 764
764 765 /*
765 766 * If mounted_on_fileid returned AND the object is a stub,
766 767 * then set object's va_nodeid to the mounted over fid
767 768 * returned by server.
768 769 *
769 770 * If mounted_on_fileid not provided/supported, then
770 771 * just set it to 0 for now. Eventually it would be
771 772 * better to set it to a hashed version of FH. This
772 773 * would probably be good enough to provide a unique
773 774 * fid/d_ino within a dir.
774 775 *
775 776 * We don't need to carry mounted_on_fileid in the
776 777 * rnode as long as the client never requests fileid
777 778 * without also requesting mounted_on_fileid. For
778 779 * now, it stays.
779 780 */
780 781 if (garp->n4g_mon_fid_valid) {
781 782 rp->r_mntd_fid = garp->n4g_mon_fid;
782 783
783 784 if (RP_ISSTUB(rp))
784 785 rp->r_attr.va_nodeid = rp->r_mntd_fid;
785 786 }
786 787
787 788 /*
788 789 * Check to see if there are valid pathconf bits to
789 790 * cache in the rnode.
790 791 */
791 792 if (garp->n4g_ext_res) {
792 793 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
793 794 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
794 795 } else {
795 796 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
796 797 rp->r_pathconf.pc4_xattr_valid = TRUE;
797 798 rp->r_pathconf.pc4_xattr_exists =
798 799 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
799 800 }
800 801 }
801 802 }
802 803 /*
803 804 * Update the size of the file if there is no cached data or if
804 805 * the cached data is clean and there is no data being written
805 806 * out.
806 807 */
807 808 if (rp->r_size != vap->va_size &&
808 809 (!vn_has_cached_data(vp) ||
809 810 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
810 811 rp->r_size = vap->va_size;
811 812 }
812 813 nfs_setswaplike(vp, vap);
813 814 rp->r_flags &= ~R4WRITEMODIFIED;
814 815 }
815 816
816 817 /*
817 818 * Get attributes over-the-wire and update attributes cache
818 819 * if no error occurred in the over-the-wire operation.
819 820 * Return 0 if successful, otherwise error.
820 821 */
821 822 int
822 823 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
823 824 {
824 825 mntinfo4_t *mi = VTOMI4(vp);
825 826 hrtime_t t;
826 827 nfs4_recov_state_t recov_state;
827 828 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
828 829
829 830 recov_state.rs_flags = 0;
830 831 recov_state.rs_num_retry_despite_err = 0;
831 832
832 833 /* Save the original mount point security flavor */
833 834 (void) save_mnt_secinfo(mi->mi_curr_serv);
834 835
835 836 recov_retry:
836 837
837 838 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
838 839 &recov_state, NULL))) {
839 840 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
840 841 return (e.error);
841 842 }
842 843
843 844 t = gethrtime();
844 845
845 846 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
846 847
847 848 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
848 849 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
849 850 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
850 851 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
851 852 &recov_state, 1);
852 853 goto recov_retry;
853 854 }
854 855 }
855 856
856 857 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
857 858
858 859 if (!e.error) {
859 860 if (e.stat == NFS4_OK) {
860 861 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
861 862 } else {
862 863 e.error = geterrno4(e.stat);
863 864
864 865 nfs4_purge_stale_fh(e.error, vp, cr);
865 866 }
866 867 }
867 868
868 869 /*
869 870 * If getattr a node that is a stub for a crossed
870 871 * mount point, keep the original secinfo flavor for
871 872 * the current file system, not the crossed one.
872 873 */
873 874 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
874 875
875 876 return (e.error);
876 877 }
877 878
878 879 /*
879 880 * Generate a compound to get attributes over-the-wire.
880 881 */
881 882 void
882 883 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
883 884 nfs4_error_t *ep, cred_t *cr, int get_acl)
884 885 {
885 886 COMPOUND4args_clnt args;
886 887 COMPOUND4res_clnt res;
887 888 int doqueue;
888 889 rnode4_t *rp = VTOR4(vp);
889 890 nfs_argop4 argop[2];
890 891
891 892 args.ctag = TAG_GETATTR;
892 893
893 894 args.array_len = 2;
894 895 args.array = argop;
895 896
896 897 /* putfh */
897 898 argop[0].argop = OP_CPUTFH;
898 899 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
899 900
900 901 /* getattr */
901 902 /*
902 903 * Unlike nfs version 2 and 3, where getattr returns all the
903 904 * attributes, nfs version 4 returns only the ones explicitly
904 905 * asked for. This creates problems, as some system functions
905 906 * (e.g. cache check) require certain attributes and if the
906 907 * cached node lacks some attributes such as uid/gid, it can
907 908 * affect system utilities (e.g. "ls") that rely on the information
908 909 * to be there. This can lead to anything from system crashes to
909 910 * corrupted information processed by user apps.
910 911 * So to ensure that all bases are covered, request at least
911 912 * the AT_ALL attribute mask.
912 913 */
913 914 argop[1].argop = OP_GETATTR;
914 915 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
915 916 if (get_acl)
916 917 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
917 918 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
918 919
919 920 doqueue = 1;
920 921
921 922 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
922 923
923 924 if (ep->error)
924 925 return;
925 926
926 927 if (res.status != NFS4_OK) {
927 928 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
928 929 return;
929 930 }
930 931
931 932 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
932 933
933 934 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
934 935 }
935 936
936 937 /*
937 938 * Return either cached or remote attributes. If get remote attr
938 939 * use them to check and invalidate caches, then cache the new attributes.
939 940 */
940 941 int
941 942 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
942 943 {
943 944 int error;
944 945 rnode4_t *rp;
945 946 nfs4_ga_res_t gar;
946 947
947 948 ASSERT(nfs4_consistent_type(vp));
948 949
949 950 /*
950 951 * If we've got cached attributes, we're done, otherwise go
951 952 * to the server to get attributes, which will update the cache
952 953 * in the process. Either way, use the cached attributes for
953 954 * the caller's vattr_t.
954 955 *
955 956 * Note that we ignore the gar set by the OTW call: the attr caching
956 957 * code may make adjustments when storing to the rnode, and we want
957 958 * to see those changes here.
958 959 */
959 960 rp = VTOR4(vp);
960 961 error = 0;
961 962 mutex_enter(&rp->r_statelock);
962 963 if (!ATTRCACHE4_VALID(vp)) {
963 964 mutex_exit(&rp->r_statelock);
964 965 error = nfs4_getattr_otw(vp, &gar, cr, 0);
965 966 mutex_enter(&rp->r_statelock);
966 967 }
967 968
968 969 if (!error)
969 970 *vap = rp->r_attr;
970 971
971 972 /* Return the client's view of file size */
972 973 vap->va_size = rp->r_size;
973 974
974 975 mutex_exit(&rp->r_statelock);
975 976
976 977 ASSERT(nfs4_consistent_type(vp));
977 978
978 979 return (error);
979 980 }
980 981
981 982 int
982 983 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
983 984 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
984 985 {
985 986 COMPOUND4args_clnt args;
986 987 COMPOUND4res_clnt res;
987 988 int doqueue;
988 989 nfs_argop4 argop[2];
989 990 mntinfo4_t *mi = VTOMI4(vp);
990 991 bool_t needrecov = FALSE;
991 992 nfs4_recov_state_t recov_state;
992 993 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
993 994 nfs4_ga_ext_res_t *gerp;
994 995
995 996 recov_state.rs_flags = 0;
996 997 recov_state.rs_num_retry_despite_err = 0;
997 998
998 999 recov_retry:
999 1000 args.ctag = tag_type;
1000 1001
1001 1002 args.array_len = 2;
1002 1003 args.array = argop;
1003 1004
1004 1005 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1005 1006 if (e.error)
1006 1007 return (e.error);
1007 1008
1008 1009 /* putfh */
1009 1010 argop[0].argop = OP_CPUTFH;
1010 1011 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1011 1012
1012 1013 /* getattr */
1013 1014 argop[1].argop = OP_GETATTR;
1014 1015 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1015 1016 argop[1].nfs_argop4_u.opgetattr.mi = mi;
1016 1017
1017 1018 doqueue = 1;
1018 1019
1019 1020 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1020 1021 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1021 1022 rnode4info(VTOR4(vp))));
1022 1023
1023 1024 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1024 1025
1025 1026 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1026 1027 if (!needrecov && e.error) {
1027 1028 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1028 1029 needrecov);
1029 1030 return (e.error);
1030 1031 }
1031 1032
1032 1033 if (needrecov) {
1033 1034 bool_t abort;
1034 1035
1035 1036 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1036 1037 "nfs4_attr_otw: initiating recovery\n"));
1037 1038
1038 1039 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1039 1040 NULL, OP_GETATTR, NULL, NULL, NULL);
1040 1041 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1041 1042 needrecov);
1042 1043 if (!e.error) {
1043 1044 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1044 1045 e.error = geterrno4(res.status);
1045 1046 }
1046 1047 if (abort == FALSE)
1047 1048 goto recov_retry;
1048 1049 return (e.error);
1049 1050 }
1050 1051
1051 1052 if (res.status) {
1052 1053 e.error = geterrno4(res.status);
1053 1054 } else {
1054 1055 gerp = garp->n4g_ext_res;
1055 1056 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1056 1057 garp, sizeof (nfs4_ga_res_t));
1057 1058 garp->n4g_ext_res = gerp;
1058 1059 if (garp->n4g_ext_res &&
1059 1060 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1060 1061 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1061 1062 ga_res.n4g_ext_res,
1062 1063 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1063 1064 }
1064 1065 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1065 1066 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1066 1067 needrecov);
1067 1068 return (e.error);
1068 1069 }
1069 1070
1070 1071 /*
1071 1072 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1072 1073 * for the demand-based allocation of async threads per-mount. The
1073 1074 * nfs_async_timeout is the amount of time a thread will live after it
1074 1075 * becomes idle, unless new I/O requests are received before the thread
1075 1076 * dies. See nfs4_async_putpage and nfs4_async_start.
1076 1077 */
1077 1078
1078 1079 static void nfs4_async_start(struct vfs *);
1079 1080 static void nfs4_async_pgops_start(struct vfs *);
1080 1081 static void nfs4_async_common_start(struct vfs *, int);
1081 1082
1082 1083 static void
1083 1084 free_async_args4(struct nfs4_async_reqs *args)
1084 1085 {
1085 1086 rnode4_t *rp;
1086 1087
1087 1088 if (args->a_io != NFS4_INACTIVE) {
1088 1089 rp = VTOR4(args->a_vp);
1089 1090 mutex_enter(&rp->r_statelock);
1090 1091 rp->r_count--;
1091 1092 if (args->a_io == NFS4_PUTAPAGE ||
1092 1093 args->a_io == NFS4_PAGEIO)
1093 1094 rp->r_awcount--;
1094 1095 cv_broadcast(&rp->r_cv);
1095 1096 mutex_exit(&rp->r_statelock);
1096 1097 VN_RELE(args->a_vp);
1097 1098 }
1098 1099 crfree(args->a_cred);
1099 1100 kmem_free(args, sizeof (*args));
1100 1101 }
1101 1102
1102 1103 /*
1103 1104 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1104 1105 * pageout(), running in the global zone, have legitimate reasons to do
1105 1106 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1106 1107 * use of a per-mount "asynchronous requests manager thread" which is
1107 1108 * signaled by the various asynchronous work routines when there is
1108 1109 * asynchronous work to be done. It is responsible for creating new
1109 1110 * worker threads if necessary, and notifying existing worker threads
1110 1111 * that there is work to be done.
1111 1112 *
1112 1113 * In other words, it will "take the specifications from the customers and
1113 1114 * give them to the engineers."
1114 1115 *
1115 1116 * Worker threads die off of their own accord if they are no longer
1116 1117 * needed.
1117 1118 *
1118 1119 * This thread is killed when the zone is going away or the filesystem
1119 1120 * is being unmounted.
1120 1121 */
1121 1122 void
1122 1123 nfs4_async_manager(vfs_t *vfsp)
1123 1124 {
1124 1125 callb_cpr_t cprinfo;
1125 1126 mntinfo4_t *mi;
1126 1127 uint_t max_threads;
1127 1128
1128 1129 mi = VFTOMI4(vfsp);
1129 1130
1130 1131 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1131 1132 "nfs4_async_manager");
1132 1133
1133 1134 mutex_enter(&mi->mi_async_lock);
1134 1135 /*
1135 1136 * We want to stash the max number of threads that this mount was
1136 1137 * allowed so we can use it later when the variable is set to zero as
1137 1138 * part of the zone/mount going away.
1138 1139 *
1139 1140 * We want to be able to create at least one thread to handle
1140 1141 * asynchronous inactive calls.
1141 1142 */
1142 1143 max_threads = MAX(mi->mi_max_threads, 1);
1143 1144 /*
1144 1145 * We don't want to wait for mi_max_threads to go to zero, since that
1145 1146 * happens as part of a failed unmount, but this thread should only
1146 1147 * exit when the mount is really going away.
1147 1148 *
1148 1149 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1149 1150 * attempted: the various _async_*() functions know to do things
1150 1151 * inline if mi_max_threads == 0. Henceforth we just drain out the
1151 1152 * outstanding requests.
1152 1153 *
1153 1154 * Note that we still create zthreads even if we notice the zone is
1154 1155 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1155 1156 * shutdown sequence to take slightly longer in some cases, but
1156 1157 * doesn't violate the protocol, as all threads will exit as soon as
1157 1158 * they're done processing the remaining requests.
1158 1159 */
1159 1160 for (;;) {
1160 1161 while (mi->mi_async_req_count > 0) {
1161 1162 /*
1162 1163 * Paranoia: If the mount started out having
1163 1164 * (mi->mi_max_threads == 0), and the value was
1164 1165 * later changed (via a debugger or somesuch),
1165 1166 * we could be confused since we will think we
1166 1167 * can't create any threads, and the calling
1167 1168 * code (which looks at the current value of
1168 1169 * mi->mi_max_threads, now non-zero) thinks we
1169 1170 * can.
1170 1171 *
1171 1172 * So, because we're paranoid, we create threads
1172 1173 * up to the maximum of the original and the
1173 1174 * current value. This means that future
1174 1175 * (debugger-induced) alterations of
1175 1176 * mi->mi_max_threads are ignored for our
1176 1177 * purposes, but who told them they could change
1177 1178 * random values on a live kernel anyhow?
1178 1179 */
1179 1180 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1180 1181 MAX(mi->mi_max_threads, max_threads)) {
1181 1182 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1182 1183 mutex_exit(&mi->mi_async_lock);
1183 1184 MI4_HOLD(mi);
1184 1185 VFS_HOLD(vfsp); /* hold for new thread */
1185 1186 (void) zthread_create(NULL, 0, nfs4_async_start,
1186 1187 vfsp, 0, minclsyspri);
1187 1188 mutex_enter(&mi->mi_async_lock);
1188 1189 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1189 1190 NUM_ASYNC_PGOPS_THREADS) {
1190 1191 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1191 1192 mutex_exit(&mi->mi_async_lock);
1192 1193 MI4_HOLD(mi);
1193 1194 VFS_HOLD(vfsp); /* hold for new thread */
1194 1195 (void) zthread_create(NULL, 0,
1195 1196 nfs4_async_pgops_start, vfsp, 0,
1196 1197 minclsyspri);
1197 1198 mutex_enter(&mi->mi_async_lock);
1198 1199 }
1199 1200 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1200 1201 ASSERT(mi->mi_async_req_count != 0);
1201 1202 mi->mi_async_req_count--;
1202 1203 }
1203 1204
1204 1205 mutex_enter(&mi->mi_lock);
1205 1206 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1206 1207 mutex_exit(&mi->mi_lock);
1207 1208 break;
1208 1209 }
1209 1210 mutex_exit(&mi->mi_lock);
1210 1211
1211 1212 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1212 1213 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1213 1214 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1214 1215 }
1215 1216
1216 1217 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1217 1218 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1218 1219 /*
1219 1220 * Let everyone know we're done.
1220 1221 */
1221 1222 mi->mi_manager_thread = NULL;
1222 1223 /*
1223 1224 * Wake up the inactive thread.
1224 1225 */
1225 1226 cv_broadcast(&mi->mi_inact_req_cv);
1226 1227 /*
1227 1228 * Wake up anyone sitting in nfs4_async_manager_stop()
1228 1229 */
1229 1230 cv_broadcast(&mi->mi_async_cv);
1230 1231 /*
1231 1232 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1232 1233 * since CALLB_CPR_EXIT is actually responsible for releasing
1233 1234 * 'mi_async_lock'.
1234 1235 */
1235 1236 CALLB_CPR_EXIT(&cprinfo);
1236 1237 VFS_RELE(vfsp); /* release thread's hold */
1237 1238 MI4_RELE(mi);
1238 1239 zthread_exit();
1239 1240 }
1240 1241
1241 1242 /*
1242 1243 * Signal (and wait for) the async manager thread to clean up and go away.
1243 1244 */
1244 1245 void
1245 1246 nfs4_async_manager_stop(vfs_t *vfsp)
1246 1247 {
1247 1248 mntinfo4_t *mi = VFTOMI4(vfsp);
1248 1249
1249 1250 mutex_enter(&mi->mi_async_lock);
1250 1251 mutex_enter(&mi->mi_lock);
1251 1252 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1252 1253 mutex_exit(&mi->mi_lock);
1253 1254 cv_broadcast(&mi->mi_async_reqs_cv);
1254 1255 /*
1255 1256 * Wait for the async manager thread to die.
1256 1257 */
1257 1258 while (mi->mi_manager_thread != NULL)
1258 1259 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1259 1260 mutex_exit(&mi->mi_async_lock);
1260 1261 }
1261 1262
1262 1263 int
1263 1264 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1264 1265 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1265 1266 u_offset_t, caddr_t, struct seg *, cred_t *))
1266 1267 {
1267 1268 rnode4_t *rp;
1268 1269 mntinfo4_t *mi;
1269 1270 struct nfs4_async_reqs *args;
1270 1271
1271 1272 rp = VTOR4(vp);
1272 1273 ASSERT(rp->r_freef == NULL);
1273 1274
1274 1275 mi = VTOMI4(vp);
1275 1276
1276 1277 /*
1277 1278 * If addr falls in a different segment, don't bother doing readahead.
1278 1279 */
1279 1280 if (addr >= seg->s_base + seg->s_size)
1280 1281 return (-1);
1281 1282
1282 1283 /*
1283 1284 * If we can't allocate a request structure, punt on the readahead.
1284 1285 */
1285 1286 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1286 1287 return (-1);
1287 1288
1288 1289 /*
1289 1290 * If a lock operation is pending, don't initiate any new
1290 1291 * readaheads. Otherwise, bump r_count to indicate the new
1291 1292 * asynchronous I/O.
1292 1293 */
1293 1294 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1294 1295 kmem_free(args, sizeof (*args));
1295 1296 return (-1);
1296 1297 }
1297 1298 mutex_enter(&rp->r_statelock);
1298 1299 rp->r_count++;
1299 1300 mutex_exit(&rp->r_statelock);
1300 1301 nfs_rw_exit(&rp->r_lkserlock);
1301 1302
1302 1303 args->a_next = NULL;
1303 1304 #ifdef DEBUG
1304 1305 args->a_queuer = curthread;
1305 1306 #endif
1306 1307 VN_HOLD(vp);
1307 1308 args->a_vp = vp;
1308 1309 ASSERT(cr != NULL);
1309 1310 crhold(cr);
1310 1311 args->a_cred = cr;
1311 1312 args->a_io = NFS4_READ_AHEAD;
1312 1313 args->a_nfs4_readahead = readahead;
1313 1314 args->a_nfs4_blkoff = blkoff;
1314 1315 args->a_nfs4_seg = seg;
1315 1316 args->a_nfs4_addr = addr;
1316 1317
1317 1318 mutex_enter(&mi->mi_async_lock);
1318 1319
1319 1320 /*
1320 1321 * If asyncio has been disabled, don't bother readahead.
1321 1322 */
1322 1323 if (mi->mi_max_threads == 0) {
1323 1324 mutex_exit(&mi->mi_async_lock);
1324 1325 goto noasync;
1325 1326 }
1326 1327
1327 1328 /*
1328 1329 * Link request structure into the async list and
1329 1330 * wakeup async thread to do the i/o.
1330 1331 */
1331 1332 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1332 1333 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1333 1334 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1334 1335 } else {
1335 1336 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1336 1337 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1337 1338 }
1338 1339
1339 1340 if (mi->mi_io_kstats) {
1340 1341 mutex_enter(&mi->mi_lock);
1341 1342 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1342 1343 mutex_exit(&mi->mi_lock);
1343 1344 }
1344 1345
1345 1346 mi->mi_async_req_count++;
1346 1347 ASSERT(mi->mi_async_req_count != 0);
1347 1348 cv_signal(&mi->mi_async_reqs_cv);
1348 1349 mutex_exit(&mi->mi_async_lock);
1349 1350 return (0);
1350 1351
1351 1352 noasync:
1352 1353 mutex_enter(&rp->r_statelock);
1353 1354 rp->r_count--;
1354 1355 cv_broadcast(&rp->r_cv);
1355 1356 mutex_exit(&rp->r_statelock);
1356 1357 VN_RELE(vp);
1357 1358 crfree(cr);
1358 1359 kmem_free(args, sizeof (*args));
1359 1360 return (-1);
1360 1361 }
1361 1362
1362 1363 static void
1363 1364 nfs4_async_start(struct vfs *vfsp)
1364 1365 {
1365 1366 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1366 1367 }
1367 1368
1368 1369 static void
1369 1370 nfs4_async_pgops_start(struct vfs *vfsp)
1370 1371 {
1371 1372 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1372 1373 }
1373 1374
1374 1375 /*
1375 1376 * The async queues for each mounted file system are arranged as a
1376 1377 * set of queues, one for each async i/o type. Requests are taken
1377 1378 * from the queues in a round-robin fashion. A number of consecutive
1378 1379 * requests are taken from each queue before moving on to the next
1379 1380 * queue. This functionality may allow the NFS Version 2 server to do
1380 1381 * write clustering, even if the client is mixing writes and reads
1381 1382 * because it will take multiple write requests from the queue
1382 1383 * before processing any of the other async i/o types.
1383 1384 *
1384 1385 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1385 1386 * model defined by cpr to suspend the system. Specifically over the
1386 1387 * wire calls are cpr-unsafe. The thread should be reevaluated in
1387 1388 * case of future updates to the cpr model.
1388 1389 */
1389 1390 static void
1390 1391 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1391 1392 {
1392 1393 struct nfs4_async_reqs *args;
1393 1394 mntinfo4_t *mi = VFTOMI4(vfsp);
1394 1395 clock_t time_left = 1;
1395 1396 callb_cpr_t cprinfo;
1396 1397 int i;
1397 - extern int nfs_async_timeout;
1398 + extern volatile int nfs_async_timeout;
1398 1399 int async_types;
1399 1400 kcondvar_t *async_work_cv;
1400 1401
1401 1402 if (async_queue == NFS4_ASYNC_QUEUE) {
1402 1403 async_types = NFS4_ASYNC_TYPES;
1403 1404 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1404 1405 } else {
1405 1406 async_types = NFS4_ASYNC_PGOPS_TYPES;
1406 1407 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1407 1408 }
1408 1409
1409 1410 /*
1410 1411 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1411 1412 * built in an implementation independent manner.
1412 1413 */
1413 1414 if (nfs_async_timeout == -1)
1414 1415 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1415 1416
1416 1417 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1417 1418
1418 1419 mutex_enter(&mi->mi_async_lock);
1419 1420 for (;;) {
1420 1421 /*
1421 1422 * Find the next queue containing an entry. We start
1422 1423 * at the current queue pointer and then round robin
1423 1424 * through all of them until we either find a non-empty
1424 1425 * queue or have looked through all of them.
1425 1426 */
1426 1427 for (i = 0; i < async_types; i++) {
1427 1428 args = *mi->mi_async_curr[async_queue];
1428 1429 if (args != NULL)
1429 1430 break;
1430 1431 mi->mi_async_curr[async_queue]++;
1431 1432 if (mi->mi_async_curr[async_queue] ==
1432 1433 &mi->mi_async_reqs[async_types]) {
1433 1434 mi->mi_async_curr[async_queue] =
1434 1435 &mi->mi_async_reqs[0];
1435 1436 }
1436 1437 }
1437 1438 /*
1438 1439 * If we didn't find an entry, then block until woken up
1439 1440 * again and then look through the queues again.
1440 1441 */
1441 1442 if (args == NULL) {
1442 1443 /*
1443 1444 * Exiting is considered to be safe for CPR as well
1444 1445 */
1445 1446 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1446 1447
1447 1448 /*
1448 1449 * Wakeup thread waiting to unmount the file
1449 1450 * system only if all async threads are inactive.
1450 1451 *
1451 1452 * If we've timed-out and there's nothing to do,
1452 1453 * then get rid of this thread.
1453 1454 */
1454 1455 if (mi->mi_max_threads == 0 || time_left <= 0) {
1455 1456 --mi->mi_threads[async_queue];
1456 1457
1457 1458 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1458 1459 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1459 1460 cv_signal(&mi->mi_async_cv);
1460 1461 CALLB_CPR_EXIT(&cprinfo);
1461 1462 VFS_RELE(vfsp); /* release thread's hold */
1462 1463 MI4_RELE(mi);
1463 1464 zthread_exit();
1464 1465 /* NOTREACHED */
1465 1466 }
1466 1467 time_left = cv_reltimedwait(async_work_cv,
1467 1468 &mi->mi_async_lock, nfs_async_timeout,
1468 1469 TR_CLOCK_TICK);
1469 1470
1470 1471 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1471 1472
1472 1473 continue;
1473 1474 } else {
1474 1475 time_left = 1;
1475 1476 }
1476 1477
1477 1478 /*
1478 1479 * Remove the request from the async queue and then
1479 1480 * update the current async request queue pointer. If
1480 1481 * the current queue is empty or we have removed enough
1481 1482 * consecutive entries from it, then reset the counter
1482 1483 * for this queue and then move the current pointer to
1483 1484 * the next queue.
1484 1485 */
1485 1486 *mi->mi_async_curr[async_queue] = args->a_next;
1486 1487 if (*mi->mi_async_curr[async_queue] == NULL ||
1487 1488 --mi->mi_async_clusters[args->a_io] == 0) {
1488 1489 mi->mi_async_clusters[args->a_io] =
1489 1490 mi->mi_async_init_clusters;
1490 1491 mi->mi_async_curr[async_queue]++;
1491 1492 if (mi->mi_async_curr[async_queue] ==
1492 1493 &mi->mi_async_reqs[async_types]) {
1493 1494 mi->mi_async_curr[async_queue] =
1494 1495 &mi->mi_async_reqs[0];
1495 1496 }
1496 1497 }
1497 1498
1498 1499 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1499 1500 mutex_enter(&mi->mi_lock);
1500 1501 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1501 1502 mutex_exit(&mi->mi_lock);
1502 1503 }
1503 1504
1504 1505 mutex_exit(&mi->mi_async_lock);
1505 1506
1506 1507 /*
1507 1508 * Obtain arguments from the async request structure.
1508 1509 */
1509 1510 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1510 1511 (*args->a_nfs4_readahead)(args->a_vp,
1511 1512 args->a_nfs4_blkoff, args->a_nfs4_addr,
1512 1513 args->a_nfs4_seg, args->a_cred);
1513 1514 } else if (args->a_io == NFS4_PUTAPAGE) {
1514 1515 (void) (*args->a_nfs4_putapage)(args->a_vp,
1515 1516 args->a_nfs4_pp, args->a_nfs4_off,
1516 1517 args->a_nfs4_len, args->a_nfs4_flags,
1517 1518 args->a_cred);
1518 1519 } else if (args->a_io == NFS4_PAGEIO) {
1519 1520 (void) (*args->a_nfs4_pageio)(args->a_vp,
1520 1521 args->a_nfs4_pp, args->a_nfs4_off,
1521 1522 args->a_nfs4_len, args->a_nfs4_flags,
1522 1523 args->a_cred);
1523 1524 } else if (args->a_io == NFS4_READDIR) {
1524 1525 (void) ((*args->a_nfs4_readdir)(args->a_vp,
1525 1526 args->a_nfs4_rdc, args->a_cred));
1526 1527 } else if (args->a_io == NFS4_COMMIT) {
1527 1528 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1528 1529 args->a_nfs4_offset, args->a_nfs4_count,
1529 1530 args->a_cred);
1530 1531 } else if (args->a_io == NFS4_INACTIVE) {
1531 1532 nfs4_inactive_otw(args->a_vp, args->a_cred);
1532 1533 }
1533 1534
1534 1535 /*
1535 1536 * Now, release the vnode and free the credentials
1536 1537 * structure.
1537 1538 */
1538 1539 free_async_args4(args);
1539 1540 /*
1540 1541 * Reacquire the mutex because it will be needed above.
1541 1542 */
1542 1543 mutex_enter(&mi->mi_async_lock);
1543 1544 }
1544 1545 }
1545 1546
1546 1547 /*
1547 1548 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1548 1549 * part of VOP_INACTIVE.
1549 1550 */
1550 1551
1551 1552 void
1552 1553 nfs4_inactive_thread(mntinfo4_t *mi)
1553 1554 {
1554 1555 struct nfs4_async_reqs *args;
1555 1556 callb_cpr_t cprinfo;
1556 1557 vfs_t *vfsp = mi->mi_vfsp;
1557 1558
1558 1559 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1559 1560 "nfs4_inactive_thread");
1560 1561
1561 1562 for (;;) {
1562 1563 mutex_enter(&mi->mi_async_lock);
1563 1564 args = mi->mi_async_reqs[NFS4_INACTIVE];
1564 1565 if (args == NULL) {
1565 1566 mutex_enter(&mi->mi_lock);
1566 1567 /*
1567 1568 * We don't want to exit until the async manager is done
1568 1569 * with its work; hence the check for mi_manager_thread
1569 1570 * being NULL.
1570 1571 *
1571 1572 * The async manager thread will cv_broadcast() on
1572 1573 * mi_inact_req_cv when it's done, at which point we'll
1573 1574 * wake up and exit.
1574 1575 */
1575 1576 if (mi->mi_manager_thread == NULL)
1576 1577 goto die;
1577 1578 mi->mi_flags |= MI4_INACTIVE_IDLE;
1578 1579 mutex_exit(&mi->mi_lock);
1579 1580 cv_signal(&mi->mi_async_cv);
1580 1581 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1581 1582 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1582 1583 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1583 1584 mutex_exit(&mi->mi_async_lock);
1584 1585 } else {
1585 1586 mutex_enter(&mi->mi_lock);
1586 1587 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1587 1588 mutex_exit(&mi->mi_lock);
1588 1589 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1589 1590 mutex_exit(&mi->mi_async_lock);
1590 1591 nfs4_inactive_otw(args->a_vp, args->a_cred);
1591 1592 crfree(args->a_cred);
1592 1593 kmem_free(args, sizeof (*args));
1593 1594 }
1594 1595 }
1595 1596 die:
1596 1597 mutex_exit(&mi->mi_lock);
1597 1598 mi->mi_inactive_thread = NULL;
1598 1599 cv_signal(&mi->mi_async_cv);
1599 1600
1600 1601 /*
1601 1602 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1602 1603 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1603 1604 */
1604 1605 CALLB_CPR_EXIT(&cprinfo);
1605 1606
1606 1607 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1607 1608 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1608 1609
1609 1610 MI4_RELE(mi);
1610 1611 zthread_exit();
1611 1612 /* NOTREACHED */
1612 1613 }
1613 1614
1614 1615 /*
1615 1616 * nfs_async_stop:
1616 1617 * Wait for all outstanding putpage operations and the inactive thread to
1617 1618 * complete; nfs4_async_stop_sig() without interruptibility.
1618 1619 */
1619 1620 void
1620 1621 nfs4_async_stop(struct vfs *vfsp)
1621 1622 {
1622 1623 mntinfo4_t *mi = VFTOMI4(vfsp);
1623 1624
1624 1625 /*
1625 1626 * Wait for all outstanding async operations to complete and for
1626 1627 * worker threads to exit.
1627 1628 */
1628 1629 mutex_enter(&mi->mi_async_lock);
1629 1630 mi->mi_max_threads = 0;
1630 1631 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1631 1632 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1632 1633 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1633 1634 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1634 1635
1635 1636 /*
1636 1637 * Wait for the inactive thread to finish doing what it's doing. It
1637 1638 * won't exit until the last reference to the vfs_t goes away.
1638 1639 */
1639 1640 if (mi->mi_inactive_thread != NULL) {
1640 1641 mutex_enter(&mi->mi_lock);
1641 1642 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1642 1643 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1643 1644 mutex_exit(&mi->mi_lock);
1644 1645 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1645 1646 mutex_enter(&mi->mi_lock);
1646 1647 }
1647 1648 mutex_exit(&mi->mi_lock);
1648 1649 }
1649 1650 mutex_exit(&mi->mi_async_lock);
1650 1651 }
1651 1652
1652 1653 /*
1653 1654  * nfs4_async_stop_sig:
1654 1655 * Wait for all outstanding putpage operations and the inactive thread to
1655 1656 * complete. If a signal is delivered we will abort and return non-zero;
1656 1657 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1657 1658 * need to make it interruptible.
1658 1659 */
1659 1660 int
1660 1661 nfs4_async_stop_sig(struct vfs *vfsp)
1661 1662 {
1662 1663 mntinfo4_t *mi = VFTOMI4(vfsp);
1663 1664 ushort_t omax;
1664 1665 bool_t intr = FALSE;
1665 1666
1666 1667 /*
1667 1668 * Wait for all outstanding putpage operations to complete and for
1668 1669 * worker threads to exit.
1669 1670 */
1670 1671 mutex_enter(&mi->mi_async_lock);
1671 1672 omax = mi->mi_max_threads;
1672 1673 mi->mi_max_threads = 0;
1673 1674 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1674 1675 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1675 1676 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1676 1677 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1677 1678 intr = TRUE;
1678 1679 goto interrupted;
1679 1680 }
1680 1681 }
1681 1682
1682 1683 /*
1683 1684 * Wait for the inactive thread to finish doing what it's doing. It
1684 1685  * won't exit until the last reference to the vfs_t goes away.
1685 1686 */
1686 1687 if (mi->mi_inactive_thread != NULL) {
1687 1688 mutex_enter(&mi->mi_lock);
1688 1689 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1689 1690 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1690 1691 mutex_exit(&mi->mi_lock);
1691 1692 if (!cv_wait_sig(&mi->mi_async_cv,
1692 1693 &mi->mi_async_lock)) {
1693 1694 intr = TRUE;
1694 1695 goto interrupted;
1695 1696 }
1696 1697 mutex_enter(&mi->mi_lock);
1697 1698 }
1698 1699 mutex_exit(&mi->mi_lock);
1699 1700 }
1700 1701 interrupted:
1701 1702 if (intr)
1702 1703 mi->mi_max_threads = omax;
1703 1704 mutex_exit(&mi->mi_async_lock);
1704 1705
1705 1706 return (intr);
1706 1707 }
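/*
 * Illustrative caller sketch (hypothetical, not part of this change): a
 * non-forced unmount would typically abort when the wait above is
 * interrupted, e.g.
 *
 *	if (nfs4_async_stop_sig(vfsp))
 *		return (EINTR);
 */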
1707 1708
1708 1709 int
1709 1710 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1710 1711 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1711 1712 u_offset_t, size_t, int, cred_t *))
1712 1713 {
1713 1714 rnode4_t *rp;
1714 1715 mntinfo4_t *mi;
1715 1716 struct nfs4_async_reqs *args;
1716 1717
1717 1718 ASSERT(flags & B_ASYNC);
1718 1719 ASSERT(vp->v_vfsp != NULL);
1719 1720
1720 1721 rp = VTOR4(vp);
1721 1722 ASSERT(rp->r_count > 0);
1722 1723
1723 1724 mi = VTOMI4(vp);
1724 1725
1725 1726 /*
1726 1727 * If we can't allocate a request structure, do the putpage
1727 1728 * operation synchronously in this thread's context.
1728 1729 */
1729 1730 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1730 1731 goto noasync;
1731 1732
1732 1733 args->a_next = NULL;
1733 1734 #ifdef DEBUG
1734 1735 args->a_queuer = curthread;
1735 1736 #endif
1736 1737 VN_HOLD(vp);
1737 1738 args->a_vp = vp;
1738 1739 ASSERT(cr != NULL);
1739 1740 crhold(cr);
1740 1741 args->a_cred = cr;
1741 1742 args->a_io = NFS4_PUTAPAGE;
1742 1743 args->a_nfs4_putapage = putapage;
1743 1744 args->a_nfs4_pp = pp;
1744 1745 args->a_nfs4_off = off;
1745 1746 args->a_nfs4_len = (uint_t)len;
1746 1747 args->a_nfs4_flags = flags;
1747 1748
1748 1749 mutex_enter(&mi->mi_async_lock);
1749 1750
1750 1751 /*
1751 1752 * If asyncio has been disabled, then make a synchronous request.
1752 1753 	 * This check is done a second time in case async io was disabled
1753 1754 * while this thread was blocked waiting for memory pressure to
1754 1755 * reduce or for the queue to drain.
1755 1756 */
1756 1757 if (mi->mi_max_threads == 0) {
1757 1758 mutex_exit(&mi->mi_async_lock);
1758 1759
1759 1760 VN_RELE(vp);
1760 1761 crfree(cr);
1761 1762 kmem_free(args, sizeof (*args));
1762 1763 goto noasync;
1763 1764 }
1764 1765
1765 1766 /*
1766 1767 * Link request structure into the async list and
1767 1768 * wakeup async thread to do the i/o.
1768 1769 */
1769 1770 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1770 1771 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1771 1772 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1772 1773 } else {
1773 1774 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1774 1775 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1775 1776 }
1776 1777
1777 1778 mutex_enter(&rp->r_statelock);
1778 1779 rp->r_count++;
1779 1780 rp->r_awcount++;
1780 1781 mutex_exit(&rp->r_statelock);
1781 1782
1782 1783 if (mi->mi_io_kstats) {
1783 1784 mutex_enter(&mi->mi_lock);
1784 1785 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1785 1786 mutex_exit(&mi->mi_lock);
1786 1787 }
1787 1788
1788 1789 mi->mi_async_req_count++;
1789 1790 ASSERT(mi->mi_async_req_count != 0);
1790 1791 cv_signal(&mi->mi_async_reqs_cv);
1791 1792 mutex_exit(&mi->mi_async_lock);
1792 1793 return (0);
1793 1794
1794 1795 noasync:
1795 1796
1796 1797 if (curproc == proc_pageout || curproc == proc_fsflush) {
1797 1798 /*
1798 1799 		 * If we get here in the context of pageout/fsflush, or we
1799 1800 		 * have run out of memory, or we're attempting to unmount,
1800 1801 		 * we refuse to do a sync write, because this may hang
1801 1802 		 * pageout/fsflush and the machine. In this case,
1802 1803 * we just re-mark the page as dirty and punt on the page.
1803 1804 *
1804 1805 * Make sure B_FORCE isn't set. We can re-mark the
1805 1806 * pages as dirty and unlock the pages in one swoop by
1806 1807 * passing in B_ERROR to pvn_write_done(). However,
1807 1808 * we should make sure B_FORCE isn't set - we don't
1808 1809 * want the page tossed before it gets written out.
1809 1810 */
1810 1811 if (flags & B_FORCE)
1811 1812 flags &= ~(B_INVAL | B_FORCE);
1812 1813 pvn_write_done(pp, flags | B_ERROR);
1813 1814 return (0);
1814 1815 }
1815 1816
1816 1817 if (nfs_zone() != mi->mi_zone) {
1817 1818 /*
1818 1819 * So this was a cross-zone sync putpage.
1819 1820 *
1820 1821 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1821 1822 * as dirty and unlock them.
1822 1823 *
1823 1824 * We don't want to clear B_FORCE here as the caller presumably
1824 1825 * knows what they're doing if they set it.
1825 1826 */
1826 1827 pvn_write_done(pp, flags | B_ERROR);
1827 1828 return (EPERM);
1828 1829 }
1829 1830 return ((*putapage)(vp, pp, off, len, flags, cr));
1830 1831 }
1831 1832
1832 1833 int
1833 1834 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1834 1835 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1835 1836 size_t, int, cred_t *))
1836 1837 {
1837 1838 rnode4_t *rp;
1838 1839 mntinfo4_t *mi;
1839 1840 struct nfs4_async_reqs *args;
1840 1841
1841 1842 ASSERT(flags & B_ASYNC);
1842 1843 ASSERT(vp->v_vfsp != NULL);
1843 1844
1844 1845 rp = VTOR4(vp);
1845 1846 ASSERT(rp->r_count > 0);
1846 1847
1847 1848 mi = VTOMI4(vp);
1848 1849
1849 1850 /*
1850 1851 * If we can't allocate a request structure, do the pageio
1851 1852 * request synchronously in this thread's context.
1852 1853 */
1853 1854 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1854 1855 goto noasync;
1855 1856
1856 1857 args->a_next = NULL;
1857 1858 #ifdef DEBUG
1858 1859 args->a_queuer = curthread;
1859 1860 #endif
1860 1861 VN_HOLD(vp);
1861 1862 args->a_vp = vp;
1862 1863 ASSERT(cr != NULL);
1863 1864 crhold(cr);
1864 1865 args->a_cred = cr;
1865 1866 args->a_io = NFS4_PAGEIO;
1866 1867 args->a_nfs4_pageio = pageio;
1867 1868 args->a_nfs4_pp = pp;
1868 1869 args->a_nfs4_off = io_off;
1869 1870 args->a_nfs4_len = (uint_t)io_len;
1870 1871 args->a_nfs4_flags = flags;
1871 1872
1872 1873 mutex_enter(&mi->mi_async_lock);
1873 1874
1874 1875 /*
1875 1876 * If asyncio has been disabled, then make a synchronous request.
1876 1877 	 * This check is done a second time in case async io was disabled
1877 1878 * while this thread was blocked waiting for memory pressure to
1878 1879 * reduce or for the queue to drain.
1879 1880 */
1880 1881 if (mi->mi_max_threads == 0) {
1881 1882 mutex_exit(&mi->mi_async_lock);
1882 1883
1883 1884 VN_RELE(vp);
1884 1885 crfree(cr);
1885 1886 kmem_free(args, sizeof (*args));
1886 1887 goto noasync;
1887 1888 }
1888 1889
1889 1890 /*
1890 1891 * Link request structure into the async list and
1891 1892 * wakeup async thread to do the i/o.
1892 1893 */
1893 1894 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1894 1895 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1895 1896 mi->mi_async_tail[NFS4_PAGEIO] = args;
1896 1897 } else {
1897 1898 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1898 1899 mi->mi_async_tail[NFS4_PAGEIO] = args;
1899 1900 }
1900 1901
1901 1902 mutex_enter(&rp->r_statelock);
1902 1903 rp->r_count++;
1903 1904 rp->r_awcount++;
1904 1905 mutex_exit(&rp->r_statelock);
1905 1906
1906 1907 if (mi->mi_io_kstats) {
1907 1908 mutex_enter(&mi->mi_lock);
1908 1909 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1909 1910 mutex_exit(&mi->mi_lock);
1910 1911 }
1911 1912
1912 1913 mi->mi_async_req_count++;
1913 1914 ASSERT(mi->mi_async_req_count != 0);
1914 1915 cv_signal(&mi->mi_async_reqs_cv);
1915 1916 mutex_exit(&mi->mi_async_lock);
1916 1917 return (0);
1917 1918
1918 1919 noasync:
1919 1920 /*
1920 1921 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1921 1922 * the page list), for writes we do it synchronously, except for
1922 1923 * proc_pageout/proc_fsflush as described below.
1923 1924 */
1924 1925 if (flags & B_READ) {
1925 1926 pvn_read_done(pp, flags | B_ERROR);
1926 1927 return (0);
1927 1928 }
1928 1929
1929 1930 if (curproc == proc_pageout || curproc == proc_fsflush) {
1930 1931 /*
1931 1932 		 * If we get here in the context of pageout/fsflush,
1932 1933 * we refuse to do a sync write, because this may hang
1933 1934 * pageout/fsflush (and the machine). In this case, we just
1934 1935 * re-mark the page as dirty and punt on the page.
1935 1936 *
1936 1937 * Make sure B_FORCE isn't set. We can re-mark the
1937 1938 * pages as dirty and unlock the pages in one swoop by
1938 1939 * passing in B_ERROR to pvn_write_done(). However,
1939 1940 * we should make sure B_FORCE isn't set - we don't
1940 1941 * want the page tossed before it gets written out.
1941 1942 */
1942 1943 if (flags & B_FORCE)
1943 1944 flags &= ~(B_INVAL | B_FORCE);
1944 1945 pvn_write_done(pp, flags | B_ERROR);
1945 1946 return (0);
1946 1947 }
1947 1948
1948 1949 if (nfs_zone() != mi->mi_zone) {
1949 1950 /*
1950 1951 * So this was a cross-zone sync pageio. We pass in B_ERROR
1951 1952 * to pvn_write_done() to re-mark the pages as dirty and unlock
1952 1953 * them.
1953 1954 *
1954 1955 * We don't want to clear B_FORCE here as the caller presumably
1955 1956 * knows what they're doing if they set it.
1956 1957 */
1957 1958 pvn_write_done(pp, flags | B_ERROR);
1958 1959 return (EPERM);
1959 1960 }
1960 1961 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1961 1962 }
1962 1963
1963 1964 void
1964 1965 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1965 1966 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1966 1967 {
1967 1968 rnode4_t *rp;
1968 1969 mntinfo4_t *mi;
1969 1970 struct nfs4_async_reqs *args;
1970 1971
1971 1972 rp = VTOR4(vp);
1972 1973 ASSERT(rp->r_freef == NULL);
1973 1974
1974 1975 mi = VTOMI4(vp);
1975 1976
1976 1977 /*
1977 1978 * If we can't allocate a request structure, skip the readdir.
1978 1979 */
1979 1980 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1980 1981 goto noasync;
1981 1982
1982 1983 args->a_next = NULL;
1983 1984 #ifdef DEBUG
1984 1985 args->a_queuer = curthread;
1985 1986 #endif
1986 1987 VN_HOLD(vp);
1987 1988 args->a_vp = vp;
1988 1989 ASSERT(cr != NULL);
1989 1990 crhold(cr);
1990 1991 args->a_cred = cr;
1991 1992 args->a_io = NFS4_READDIR;
1992 1993 args->a_nfs4_readdir = readdir;
1993 1994 args->a_nfs4_rdc = rdc;
1994 1995
1995 1996 mutex_enter(&mi->mi_async_lock);
1996 1997
1997 1998 /*
1998 1999 * If asyncio has been disabled, then skip this request
1999 2000 */
2000 2001 if (mi->mi_max_threads == 0) {
2001 2002 mutex_exit(&mi->mi_async_lock);
2002 2003
2003 2004 VN_RELE(vp);
2004 2005 crfree(cr);
2005 2006 kmem_free(args, sizeof (*args));
2006 2007 goto noasync;
2007 2008 }
2008 2009
2009 2010 /*
2010 2011 * Link request structure into the async list and
2011 2012 * wakeup async thread to do the i/o.
2012 2013 */
2013 2014 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2014 2015 mi->mi_async_reqs[NFS4_READDIR] = args;
2015 2016 mi->mi_async_tail[NFS4_READDIR] = args;
2016 2017 } else {
2017 2018 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2018 2019 mi->mi_async_tail[NFS4_READDIR] = args;
2019 2020 }
2020 2021
2021 2022 mutex_enter(&rp->r_statelock);
2022 2023 rp->r_count++;
2023 2024 mutex_exit(&rp->r_statelock);
2024 2025
2025 2026 if (mi->mi_io_kstats) {
2026 2027 mutex_enter(&mi->mi_lock);
2027 2028 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2028 2029 mutex_exit(&mi->mi_lock);
2029 2030 }
2030 2031
2031 2032 mi->mi_async_req_count++;
2032 2033 ASSERT(mi->mi_async_req_count != 0);
2033 2034 cv_signal(&mi->mi_async_reqs_cv);
2034 2035 mutex_exit(&mi->mi_async_lock);
2035 2036 return;
2036 2037
2037 2038 noasync:
2038 2039 mutex_enter(&rp->r_statelock);
2039 2040 rdc->entries = NULL;
2040 2041 /*
2041 2042 * Indicate that no one is trying to fill this entry and
2042 2043 * it still needs to be filled.
2043 2044 */
2044 2045 rdc->flags &= ~RDDIR;
2045 2046 rdc->flags |= RDDIRREQ;
2046 2047 rddir4_cache_rele(rp, rdc);
2047 2048 mutex_exit(&rp->r_statelock);
2048 2049 }
2049 2050
2050 2051 void
2051 2052 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2052 2053 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2053 2054 cred_t *))
2054 2055 {
2055 2056 rnode4_t *rp;
2056 2057 mntinfo4_t *mi;
2057 2058 struct nfs4_async_reqs *args;
2058 2059 page_t *pp;
2059 2060
2060 2061 rp = VTOR4(vp);
2061 2062 mi = VTOMI4(vp);
2062 2063
2063 2064 /*
2064 2065 * If we can't allocate a request structure, do the commit
2065 2066 * operation synchronously in this thread's context.
2066 2067 */
2067 2068 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2068 2069 goto noasync;
2069 2070
2070 2071 args->a_next = NULL;
2071 2072 #ifdef DEBUG
2072 2073 args->a_queuer = curthread;
2073 2074 #endif
2074 2075 VN_HOLD(vp);
2075 2076 args->a_vp = vp;
2076 2077 ASSERT(cr != NULL);
2077 2078 crhold(cr);
2078 2079 args->a_cred = cr;
2079 2080 args->a_io = NFS4_COMMIT;
2080 2081 args->a_nfs4_commit = commit;
2081 2082 args->a_nfs4_plist = plist;
2082 2083 args->a_nfs4_offset = offset;
2083 2084 args->a_nfs4_count = count;
2084 2085
2085 2086 mutex_enter(&mi->mi_async_lock);
2086 2087
2087 2088 /*
2088 2089 * If asyncio has been disabled, then make a synchronous request.
2089 2090 	 * This check is done a second time in case async io was disabled
2090 2091 * while this thread was blocked waiting for memory pressure to
2091 2092 * reduce or for the queue to drain.
2092 2093 */
2093 2094 if (mi->mi_max_threads == 0) {
2094 2095 mutex_exit(&mi->mi_async_lock);
2095 2096
2096 2097 VN_RELE(vp);
2097 2098 crfree(cr);
2098 2099 kmem_free(args, sizeof (*args));
2099 2100 goto noasync;
2100 2101 }
2101 2102
2102 2103 /*
2103 2104 * Link request structure into the async list and
2104 2105 * wakeup async thread to do the i/o.
2105 2106 */
2106 2107 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2107 2108 mi->mi_async_reqs[NFS4_COMMIT] = args;
2108 2109 mi->mi_async_tail[NFS4_COMMIT] = args;
2109 2110 } else {
2110 2111 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2111 2112 mi->mi_async_tail[NFS4_COMMIT] = args;
2112 2113 }
2113 2114
2114 2115 mutex_enter(&rp->r_statelock);
2115 2116 rp->r_count++;
2116 2117 mutex_exit(&rp->r_statelock);
2117 2118
2118 2119 if (mi->mi_io_kstats) {
2119 2120 mutex_enter(&mi->mi_lock);
2120 2121 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2121 2122 mutex_exit(&mi->mi_lock);
2122 2123 }
2123 2124
2124 2125 mi->mi_async_req_count++;
2125 2126 ASSERT(mi->mi_async_req_count != 0);
2126 2127 cv_signal(&mi->mi_async_reqs_cv);
2127 2128 mutex_exit(&mi->mi_async_lock);
2128 2129 return;
2129 2130
2130 2131 noasync:
2131 2132 if (curproc == proc_pageout || curproc == proc_fsflush ||
2132 2133 nfs_zone() != mi->mi_zone) {
2133 2134 while (plist != NULL) {
2134 2135 pp = plist;
2135 2136 page_sub(&plist, pp);
2136 2137 pp->p_fsdata = C_COMMIT;
2137 2138 page_unlock(pp);
2138 2139 }
2139 2140 return;
2140 2141 }
2141 2142 (*commit)(vp, plist, offset, count, cr);
2142 2143 }
2143 2144
2144 2145 /*
2145 2146 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2146 2147 * reference to the vnode is handed over to the thread; the caller should
2147 2148 * no longer refer to the vnode.
2148 2149 *
2149 2150 * Unlike most of the async routines, this handoff is needed for
2150 2151 * correctness reasons, not just performance. So doing operations in the
2151 2152 * context of the current thread is not an option.
2152 2153 */
2153 2154 void
2154 2155 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2155 2156 {
2156 2157 mntinfo4_t *mi;
2157 2158 struct nfs4_async_reqs *args;
2158 2159 boolean_t signal_inactive_thread = B_FALSE;
2159 2160
2160 2161 mi = VTOMI4(vp);
2161 2162
2162 2163 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2163 2164 args->a_next = NULL;
2164 2165 #ifdef DEBUG
2165 2166 args->a_queuer = curthread;
2166 2167 #endif
2167 2168 args->a_vp = vp;
2168 2169 ASSERT(cr != NULL);
2169 2170 crhold(cr);
2170 2171 args->a_cred = cr;
2171 2172 args->a_io = NFS4_INACTIVE;
2172 2173
2173 2174 /*
2174 2175 * Note that we don't check mi->mi_max_threads here, since we
2175 2176 * *need* to get rid of this vnode regardless of whether someone
2176 2177 * set nfs4_max_threads to zero in /etc/system.
2177 2178 *
2178 2179 * The manager thread knows about this and is willing to create
2179 2180 * at least one thread to accommodate us.
2180 2181 */
2181 2182 mutex_enter(&mi->mi_async_lock);
2182 2183 if (mi->mi_inactive_thread == NULL) {
2183 2184 rnode4_t *rp;
2184 2185 vnode_t *unldvp = NULL;
2185 2186 char *unlname;
2186 2187 cred_t *unlcred;
2187 2188
2188 2189 mutex_exit(&mi->mi_async_lock);
2189 2190 /*
2190 2191 * We just need to free up the memory associated with the
2191 2192 * vnode, which can be safely done from within the current
2192 2193 * context.
2193 2194 */
2194 2195 crfree(cr); /* drop our reference */
2195 2196 kmem_free(args, sizeof (*args));
2196 2197 rp = VTOR4(vp);
2197 2198 mutex_enter(&rp->r_statelock);
2198 2199 if (rp->r_unldvp != NULL) {
2199 2200 unldvp = rp->r_unldvp;
2200 2201 rp->r_unldvp = NULL;
2201 2202 unlname = rp->r_unlname;
2202 2203 rp->r_unlname = NULL;
2203 2204 unlcred = rp->r_unlcred;
2204 2205 rp->r_unlcred = NULL;
2205 2206 }
2206 2207 mutex_exit(&rp->r_statelock);
2207 2208 /*
2208 2209 * No need to explicitly throw away any cached pages. The
2209 2210 * eventual r4inactive() will attempt a synchronous
2210 2211 * VOP_PUTPAGE() which will immediately fail since the request
2211 2212 * is coming from the wrong zone, and then will proceed to call
2212 2213 * nfs4_invalidate_pages() which will clean things up for us.
2213 2214 *
2214 2215 * Throw away the delegation here so rp4_addfree()'s attempt to
2215 2216 * return any existing delegations becomes a no-op.
2216 2217 */
2217 2218 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2218 2219 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2219 2220 FALSE);
2220 2221 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2221 2222 nfs_rw_exit(&mi->mi_recovlock);
2222 2223 }
2223 2224 nfs4_clear_open_streams(rp);
2224 2225
2225 2226 rp4_addfree(rp, cr);
2226 2227 if (unldvp != NULL) {
2227 2228 kmem_free(unlname, MAXNAMELEN);
2228 2229 VN_RELE(unldvp);
2229 2230 crfree(unlcred);
2230 2231 }
2231 2232 return;
2232 2233 }
2233 2234
2234 2235 if (mi->mi_manager_thread == NULL) {
2235 2236 /*
2236 2237 * We want to talk to the inactive thread.
2237 2238 */
2238 2239 signal_inactive_thread = B_TRUE;
2239 2240 }
2240 2241
2241 2242 /*
2242 2243 * Enqueue the vnode and wake up either the special thread (empty
2243 2244 * list) or an async thread.
2244 2245 */
2245 2246 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2246 2247 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2247 2248 mi->mi_async_tail[NFS4_INACTIVE] = args;
2248 2249 signal_inactive_thread = B_TRUE;
2249 2250 } else {
2250 2251 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2251 2252 mi->mi_async_tail[NFS4_INACTIVE] = args;
2252 2253 }
2253 2254 if (signal_inactive_thread) {
2254 2255 cv_signal(&mi->mi_inact_req_cv);
2255 2256 } else {
2256 2257 mi->mi_async_req_count++;
2257 2258 ASSERT(mi->mi_async_req_count != 0);
2258 2259 cv_signal(&mi->mi_async_reqs_cv);
2259 2260 }
2260 2261
2261 2262 mutex_exit(&mi->mi_async_lock);
2262 2263 }
2263 2264
2264 2265 int
2265 2266 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2266 2267 {
2267 2268 int pagecreate;
2268 2269 int n;
2269 2270 int saved_n;
2270 2271 caddr_t saved_base;
2271 2272 u_offset_t offset;
2272 2273 int error;
2273 2274 int sm_error;
2274 2275 vnode_t *vp = RTOV(rp);
2275 2276
2276 2277 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2277 2278 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2278 2279 if (!vpm_enable) {
2279 2280 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2280 2281 }
2281 2282
2282 2283 /*
2283 2284 * Move bytes in at most PAGESIZE chunks. We must avoid
2284 2285 * spanning pages in uiomove() because page faults may cause
2285 2286 * the cache to be invalidated out from under us. The r_size is not
2286 2287 * updated until after the uiomove. If we push the last page of a
2287 2288 * file before r_size is correct, we will lose the data written past
2288 2289 * the current (and invalid) r_size.
2289 2290 */
2290 2291 do {
2291 2292 offset = uio->uio_loffset;
2292 2293 pagecreate = 0;
2293 2294
2294 2295 /*
2295 2296 * n is the number of bytes required to satisfy the request
2296 2297 * or the number of bytes to fill out the page.
2297 2298 */
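		/*
		 * Illustrative example (hypothetical values): with PAGESIZE
		 * 0x1000, offset 0x1234 and tcount 0x2000, n becomes
		 * MIN(0x1000 - 0x234, 0x2000) = 0xdcc, i.e. just enough to
		 * reach the next page boundary.
		 */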
2298 2299 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2299 2300
2300 2301 /*
2301 2302 * Check to see if we can skip reading in the page
2302 2303 * and just allocate the memory. We can do this
2303 2304 * if we are going to rewrite the entire mapping
2304 2305 * or if we are going to write to or beyond the current
2305 2306 * end of file from the beginning of the mapping.
2306 2307 *
2307 2308 * The read of r_size is now protected by r_statelock.
2308 2309 */
2309 2310 mutex_enter(&rp->r_statelock);
2310 2311 /*
2311 2312 * When pgcreated is nonzero the caller has already done
2312 2313 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2313 2314 * segkpm this means we already have at least one page
2314 2315 * created and mapped at base.
2315 2316 */
2316 2317 pagecreate = pgcreated ||
2317 2318 ((offset & PAGEOFFSET) == 0 &&
2318 2319 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2319 2320
2320 2321 mutex_exit(&rp->r_statelock);
2321 2322
2322 2323 if (!vpm_enable && pagecreate) {
2323 2324 /*
2324 2325 * The last argument tells segmap_pagecreate() to
2325 2326 * always lock the page, as opposed to sometimes
2326 2327 * returning with the page locked. This way we avoid a
2327 2328 * fault on the ensuing uiomove(), but also
2328 2329 * more importantly (to fix bug 1094402) we can
2329 2330 * call segmap_fault() to unlock the page in all
2330 2331 * cases. An alternative would be to modify
2331 2332 * segmap_pagecreate() to tell us when it is
2332 2333 * locking a page, but that's a fairly major
2333 2334 * interface change.
2334 2335 */
2335 2336 if (pgcreated == 0)
2336 2337 (void) segmap_pagecreate(segkmap, base,
2337 2338 (uint_t)n, 1);
2338 2339 saved_base = base;
2339 2340 saved_n = n;
2340 2341 }
2341 2342
2342 2343 /*
2343 2344 		 * The number of bytes of data in the last page cannot
2344 2345 		 * be accurately determined while the page is being
2345 2346 		 * uiomove'd to and the size of the file is being updated.
2346 2347 * Thus, inform threads which need to know accurately
2347 2348 * how much data is in the last page of the file. They
2348 2349 * will not do the i/o immediately, but will arrange for
2349 2350 * the i/o to happen later when this modify operation
2350 2351 * will have finished.
2351 2352 */
2352 2353 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2353 2354 mutex_enter(&rp->r_statelock);
2354 2355 rp->r_flags |= R4MODINPROGRESS;
2355 2356 rp->r_modaddr = (offset & MAXBMASK);
2356 2357 mutex_exit(&rp->r_statelock);
2357 2358
2358 2359 if (vpm_enable) {
2359 2360 /*
2360 2361 * Copy data. If new pages are created, part of
2361 2362 			 * the page that is not written will be initialized
2362 2363 * with zeros.
2363 2364 */
2364 2365 error = vpm_data_copy(vp, offset, n, uio,
2365 2366 !pagecreate, NULL, 0, S_WRITE);
2366 2367 } else {
2367 2368 error = uiomove(base, n, UIO_WRITE, uio);
2368 2369 }
2369 2370
2370 2371 /*
2371 2372 * r_size is the maximum number of
2372 2373 * bytes known to be in the file.
2373 2374 * Make sure it is at least as high as the
2374 2375 * first unwritten byte pointed to by uio_loffset.
2375 2376 */
2376 2377 mutex_enter(&rp->r_statelock);
2377 2378 if (rp->r_size < uio->uio_loffset)
2378 2379 rp->r_size = uio->uio_loffset;
2379 2380 rp->r_flags &= ~R4MODINPROGRESS;
2380 2381 rp->r_flags |= R4DIRTY;
2381 2382 mutex_exit(&rp->r_statelock);
2382 2383
2383 2384 /* n = # of bytes written */
2384 2385 n = (int)(uio->uio_loffset - offset);
2385 2386
2386 2387 if (!vpm_enable) {
2387 2388 base += n;
2388 2389 }
2389 2390
2390 2391 tcount -= n;
2391 2392 /*
2392 2393 * If we created pages w/o initializing them completely,
2393 2394 * we need to zero the part that wasn't set up.
2394 2395 		 * This happens in most EOF write cases and if
2395 2396 * we had some sort of error during the uiomove.
2396 2397 */
2397 2398 if (!vpm_enable && pagecreate) {
2398 2399 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2399 2400 (void) kzero(base, PAGESIZE - n);
2400 2401
2401 2402 if (pgcreated) {
2402 2403 /*
2403 2404 * Caller is responsible for this page,
2404 2405 * it was not created in this loop.
2405 2406 */
2406 2407 pgcreated = 0;
2407 2408 } else {
2408 2409 /*
2409 2410 * For bug 1094402: segmap_pagecreate locks
2410 2411 * page. Unlock it. This also unlocks the
2411 2412 * pages allocated by page_create_va() in
2412 2413 * segmap_pagecreate().
2413 2414 */
2414 2415 sm_error = segmap_fault(kas.a_hat, segkmap,
2415 2416 saved_base, saved_n,
2416 2417 F_SOFTUNLOCK, S_WRITE);
2417 2418 if (error == 0)
2418 2419 error = sm_error;
2419 2420 }
2420 2421 }
2421 2422 } while (tcount > 0 && error == 0);
2422 2423
2423 2424 return (error);
2424 2425 }
2425 2426
2426 2427 int
2427 2428 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2428 2429 {
2429 2430 rnode4_t *rp;
2430 2431 page_t *pp;
2431 2432 u_offset_t eoff;
2432 2433 u_offset_t io_off;
2433 2434 size_t io_len;
2434 2435 int error;
2435 2436 int rdirty;
2436 2437 int err;
2437 2438
2438 2439 rp = VTOR4(vp);
2439 2440 ASSERT(rp->r_count > 0);
2440 2441
2441 2442 if (!nfs4_has_pages(vp))
2442 2443 return (0);
2443 2444
2444 2445 ASSERT(vp->v_type != VCHR);
2445 2446
2446 2447 /*
2447 2448 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2448 2449 * writes. B_FORCE is set to force the VM system to actually
2449 2450 * invalidate the pages, even if the i/o failed. The pages
2450 2451 * need to get invalidated because they can't be written out
2451 2452 * because there isn't any space left on either the server's
2452 2453 * file system or in the user's disk quota. The B_FREE bit
2453 2454 * is cleared to avoid confusion as to whether this is a
2454 2455 * request to place the page on the freelist or to destroy
2455 2456 * it.
2456 2457 */
2457 2458 if ((rp->r_flags & R4OUTOFSPACE) ||
2458 2459 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2459 2460 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2460 2461
2461 2462 if (len == 0) {
2462 2463 /*
2463 2464 * If doing a full file synchronous operation, then clear
2464 2465 * the R4DIRTY bit. If a page gets dirtied while the flush
2465 2466 * is happening, then R4DIRTY will get set again. The
2466 2467 * R4DIRTY bit must get cleared before the flush so that
2467 2468 * we don't lose this information.
2468 2469 *
2469 2470 * If there are no full file async write operations
2470 2471 		 * pending and the R4DIRTY bit is set, clear it.
2471 2472 */
2472 2473 if (off == (u_offset_t)0 &&
2473 2474 !(flags & B_ASYNC) &&
2474 2475 (rp->r_flags & R4DIRTY)) {
2475 2476 mutex_enter(&rp->r_statelock);
2476 2477 rdirty = (rp->r_flags & R4DIRTY);
2477 2478 rp->r_flags &= ~R4DIRTY;
2478 2479 mutex_exit(&rp->r_statelock);
2479 2480 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2480 2481 mutex_enter(&rp->r_statelock);
2481 2482 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2482 2483 rdirty = (rp->r_flags & R4DIRTY);
2483 2484 rp->r_flags &= ~R4DIRTY;
2484 2485 }
2485 2486 mutex_exit(&rp->r_statelock);
2486 2487 } else
2487 2488 rdirty = 0;
2488 2489
2489 2490 /*
2490 2491 * Search the entire vp list for pages >= off, and flush
2491 2492 * the dirty pages.
2492 2493 */
2493 2494 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2494 2495 flags, cr);
2495 2496
2496 2497 /*
2497 2498 * If an error occurred and the file was marked as dirty
2498 2499 * before and we aren't forcibly invalidating pages, then
2499 2500 * reset the R4DIRTY flag.
2500 2501 */
2501 2502 if (error && rdirty &&
2502 2503 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2503 2504 mutex_enter(&rp->r_statelock);
2504 2505 rp->r_flags |= R4DIRTY;
2505 2506 mutex_exit(&rp->r_statelock);
2506 2507 }
2507 2508 } else {
2508 2509 /*
2509 2510 * Do a range from [off...off + len) looking for pages
2510 2511 * to deal with.
2511 2512 */
2512 2513 error = 0;
2513 2514 io_len = 0;
2514 2515 eoff = off + len;
2515 2516 mutex_enter(&rp->r_statelock);
2516 2517 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2517 2518 io_off += io_len) {
2518 2519 mutex_exit(&rp->r_statelock);
2519 2520 /*
2520 2521 			 * If we are not invalidating, synchronously
2521 2522 			 * freeing, or writing pages, use the routine
2522 2523 			 * page_lookup_nowait() to prevent reclaiming
2523 2524 			 * them from the free list.
2524 2525 */
2525 2526 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2526 2527 pp = page_lookup(vp, io_off,
2527 2528 (flags & (B_INVAL | B_FREE)) ?
2528 2529 SE_EXCL : SE_SHARED);
2529 2530 } else {
2530 2531 pp = page_lookup_nowait(vp, io_off,
2531 2532 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2532 2533 }
2533 2534
2534 2535 if (pp == NULL || !pvn_getdirty(pp, flags))
2535 2536 io_len = PAGESIZE;
2536 2537 else {
2537 2538 err = (*rp->r_putapage)(vp, pp, &io_off,
2538 2539 &io_len, flags, cr);
2539 2540 if (!error)
2540 2541 error = err;
2541 2542 /*
2542 2543 * "io_off" and "io_len" are returned as
2543 2544 * the range of pages we actually wrote.
2544 2545 * This allows us to skip ahead more quickly
2545 2546 * since several pages may've been dealt
2546 2547 * with by this iteration of the loop.
2547 2548 */
2548 2549 }
2549 2550 mutex_enter(&rp->r_statelock);
2550 2551 }
2551 2552 mutex_exit(&rp->r_statelock);
2552 2553 }
2553 2554
2554 2555 return (error);
2555 2556 }
2556 2557
2557 2558 void
2558 2559 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2559 2560 {
2560 2561 rnode4_t *rp;
2561 2562
2562 2563 rp = VTOR4(vp);
2563 2564 if (IS_SHADOW(vp, rp))
2564 2565 vp = RTOV4(rp);
2565 2566 mutex_enter(&rp->r_statelock);
2566 2567 while (rp->r_flags & R4TRUNCATE)
2567 2568 cv_wait(&rp->r_cv, &rp->r_statelock);
2568 2569 rp->r_flags |= R4TRUNCATE;
2569 2570 if (off == (u_offset_t)0) {
2570 2571 rp->r_flags &= ~R4DIRTY;
2571 2572 if (!(rp->r_flags & R4STALE))
2572 2573 rp->r_error = 0;
2573 2574 }
2574 2575 rp->r_truncaddr = off;
2575 2576 mutex_exit(&rp->r_statelock);
2576 2577 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2577 2578 B_INVAL | B_TRUNC, cr);
2578 2579 mutex_enter(&rp->r_statelock);
2579 2580 rp->r_flags &= ~R4TRUNCATE;
2580 2581 cv_broadcast(&rp->r_cv);
2581 2582 mutex_exit(&rp->r_statelock);
2582 2583 }
2583 2584
2584 2585 static int
2585 2586 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2586 2587 {
2587 2588 mntinfo4_t *mi;
2588 2589 struct mntinfo_kstat *mik;
2589 2590 vfs_t *vfsp;
2590 2591
2591 2592 /* this is a read-only kstat. Bail out on a write */
2592 2593 if (rw == KSTAT_WRITE)
2593 2594 return (EACCES);
2594 2595
2595 2596
2596 2597 /*
2597 2598 * We don't want to wait here as kstat_chain_lock could be held by
2598 2599 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2599 2600 * and thus could lead to a deadlock.
2600 2601 */
2601 2602 vfsp = (struct vfs *)ksp->ks_private;
2602 2603
2603 2604 mi = VFTOMI4(vfsp);
2604 2605 mik = (struct mntinfo_kstat *)ksp->ks_data;
2605 2606
2606 2607 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2607 2608
2608 2609 mik->mik_vers = (uint32_t)mi->mi_vers;
2609 2610 mik->mik_flags = mi->mi_flags;
2610 2611 /*
2611 2612 * The sv_secdata holds the flavor the client specifies.
2612 2613 * If the client uses default and a security negotiation
2613 2614 * occurs, sv_currsec will point to the current flavor
2614 2615 * selected from the server flavor list.
2615 2616 * sv_currsec is NULL if no security negotiation takes place.
2616 2617 */
2617 2618 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2618 2619 mi->mi_curr_serv->sv_currsec->secmod :
2619 2620 mi->mi_curr_serv->sv_secdata->secmod;
2620 2621 mik->mik_curread = (uint32_t)mi->mi_curread;
2621 2622 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2622 2623 mik->mik_retrans = mi->mi_retrans;
2623 2624 mik->mik_timeo = mi->mi_timeo;
2624 2625 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2625 2626 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2626 2627 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2627 2628 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2628 2629 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2629 2630 mik->mik_failover = (uint32_t)mi->mi_failover;
2630 2631 mik->mik_remap = (uint32_t)mi->mi_remap;
2631 2632
2632 2633 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2633 2634
2634 2635 return (0);
2635 2636 }
2636 2637
2637 2638 void
2638 2639 nfs4_mnt_kstat_init(struct vfs *vfsp)
2639 2640 {
2640 2641 mntinfo4_t *mi = VFTOMI4(vfsp);
2641 2642
2642 2643 /*
2643 2644 * PSARC 2001/697 Contract Private Interface
2644 2645 * All nfs kstats are under SunMC contract
2645 2646 * Please refer to the PSARC listed above and contact
2646 2647 * SunMC before making any changes!
2647 2648 *
2648 2649 * Changes must be reviewed by Solaris File Sharing
2649 2650 * Changes must be communicated to contract-2001-697@sun.com
2650 2651 *
2651 2652 */
2652 2653
2653 2654 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2654 2655 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2655 2656 if (mi->mi_io_kstats) {
2656 2657 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2657 2658 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2658 2659 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2659 2660 kstat_install(mi->mi_io_kstats);
2660 2661 }
2661 2662
2662 2663 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2663 2664 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2664 2665 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2665 2666 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2666 2667 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2667 2668 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2668 2669 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2669 2670 kstat_install(mi->mi_ro_kstats);
2670 2671 }
2671 2672
2672 2673 nfs4_mnt_recov_kstat_init(vfsp);
2673 2674 }
2674 2675
2675 2676 void
2676 2677 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2677 2678 {
2678 2679 mntinfo4_t *mi;
2679 2680 clock_t now = ddi_get_lbolt();
2680 2681
2681 2682 mi = VTOMI4(vp);
2682 2683 /*
2683 2684 * In case of forced unmount, do not print any messages
2684 2685 * since it can flood the console with error messages.
2685 2686 */
2686 2687 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2687 2688 return;
2688 2689
2689 2690 /*
2690 2691 	 * If the mount point is dead and not recoverable, do not
2691 2692 * print error messages that can flood the console.
2692 2693 */
2693 2694 if (mi->mi_flags & MI4_RECOV_FAIL)
2694 2695 return;
2695 2696
2696 2697 /*
2697 2698 * No use in flooding the console with ENOSPC
2698 2699 * messages from the same file system.
2699 2700 */
2700 2701 if ((error != ENOSPC && error != EDQUOT) ||
2701 2702 now - mi->mi_printftime > 0) {
2702 2703 zoneid_t zoneid = mi->mi_zone->zone_id;
2703 2704
2704 2705 #ifdef DEBUG
2705 2706 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2706 2707 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2707 2708 #else
2708 2709 nfs_perror(error, "NFS write error on host %s: %m.\n",
2709 2710 VTOR4(vp)->r_server->sv_hostname, NULL);
2710 2711 #endif
2711 2712 if (error == ENOSPC || error == EDQUOT) {
2712 2713 zcmn_err(zoneid, CE_CONT,
2713 2714 "^File: userid=%d, groupid=%d\n",
2714 2715 crgetuid(cr), crgetgid(cr));
2715 2716 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2716 2717 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2717 2718 zcmn_err(zoneid, CE_CONT,
2718 2719 "^User: userid=%d, groupid=%d\n",
2719 2720 crgetuid(curthread->t_cred),
2720 2721 crgetgid(curthread->t_cred));
2721 2722 }
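			/*
			 * Illustrative example (hypothetical values): with a
			 * write error interval of 5 seconds and hz of 100,
			 * further ENOSPC/EDQUOT messages from this filesystem
			 * are suppressed for the next 500 ticks.
			 */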
2722 2723 mi->mi_printftime = now +
2723 2724 nfs_write_error_interval * hz;
2724 2725 }
2725 2726 sfh4_printfhandle(VTOR4(vp)->r_fh);
2726 2727 #ifdef DEBUG
2727 2728 if (error == EACCES) {
2728 2729 zcmn_err(zoneid, CE_CONT,
2729 2730 "nfs_bio: cred is%s kcred\n",
2730 2731 cr == kcred ? "" : " not");
2731 2732 }
2732 2733 #endif
2733 2734 }
2734 2735 }
2735 2736
2736 2737 /*
2737 2738 * Return non-zero if the given file can be safely memory mapped. Locks
2738 2739 * are safe if whole-file (length and offset are both zero).
2739 2740 */
2740 2741
2741 2742 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
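/*
 * For example (illustrative only): a whole-file lock (l_start == 0,
 * l_len == 0) is considered safe, whereas a byte-range lock such as
 * (l_start == 100, l_len == 10) is not.
 */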
2742 2743
2743 2744 static int
2744 2745 nfs4_safemap(const vnode_t *vp)
2745 2746 {
2746 2747 locklist_t *llp, *next_llp;
2747 2748 int safe = 1;
2748 2749 rnode4_t *rp = VTOR4(vp);
2749 2750
2750 2751 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2751 2752
2752 2753 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2753 2754 "vp = %p", (void *)vp));
2754 2755
2755 2756 /*
2756 2757 * Review all the locks for the vnode, both ones that have been
2757 2758 * acquired and ones that are pending. We assume that
2758 2759 * flk_active_locks_for_vp() has merged any locks that can be
2759 2760 * merged (so that if a process has the entire file locked, it is
2760 2761 * represented as a single lock).
2761 2762 *
2762 2763 * Note that we can't bail out of the loop if we find a non-safe
2763 2764 * lock, because we have to free all the elements in the llp list.
2764 2765 * We might be able to speed up this code slightly by not looking
2765 2766 * at each lock's l_start and l_len fields once we've found a
2766 2767 * non-safe lock.
2767 2768 */
2768 2769
2769 2770 llp = flk_active_locks_for_vp(vp);
2770 2771 while (llp) {
2771 2772 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2772 2773 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2773 2774 llp->ll_flock.l_start, llp->ll_flock.l_len));
2774 2775 if (!SAFE_LOCK(llp->ll_flock)) {
2775 2776 safe = 0;
2776 2777 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2777 2778 "nfs4_safemap: unsafe active lock (%" PRId64
2778 2779 ", %" PRId64 ")", llp->ll_flock.l_start,
2779 2780 llp->ll_flock.l_len));
2780 2781 }
2781 2782 next_llp = llp->ll_next;
2782 2783 VN_RELE(llp->ll_vp);
2783 2784 kmem_free(llp, sizeof (*llp));
2784 2785 llp = next_llp;
2785 2786 }
2786 2787
2787 2788 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2788 2789 safe ? "safe" : "unsafe"));
2789 2790 return (safe);
2790 2791 }
2791 2792
2792 2793 /*
2793 2794 * Return whether there is a lost LOCK or LOCKU queued up for the given
2794 2795 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2795 2796 */
2796 2797
2797 2798 bool_t
2798 2799 nfs4_map_lost_lock_conflict(vnode_t *vp)
2799 2800 {
2800 2801 bool_t conflict = FALSE;
2801 2802 nfs4_lost_rqst_t *lrp;
2802 2803 mntinfo4_t *mi = VTOMI4(vp);
2803 2804
2804 2805 mutex_enter(&mi->mi_lock);
2805 2806 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2806 2807 lrp = list_next(&mi->mi_lost_state, lrp)) {
2807 2808 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2808 2809 continue;
2809 2810 ASSERT(lrp->lr_vp != NULL);
2810 2811 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2811 2812 continue; /* different file */
2812 2813 if (!SAFE_LOCK(*lrp->lr_flk)) {
2813 2814 conflict = TRUE;
2814 2815 break;
2815 2816 }
2816 2817 }
2817 2818
2818 2819 mutex_exit(&mi->mi_lock);
2819 2820 return (conflict);
2820 2821 }
2821 2822
2822 2823 /*
2823 2824 * nfs_lockcompletion:
2824 2825 *
2825 2826 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2826 2827  * as non-cacheable (set the VNOCACHE bit).
2827 2828 */
2828 2829
2829 2830 void
2830 2831 nfs4_lockcompletion(vnode_t *vp, int cmd)
2831 2832 {
2832 2833 rnode4_t *rp = VTOR4(vp);
2833 2834
2834 2835 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2835 2836 ASSERT(!IS_SHADOW(vp, rp));
2836 2837
2837 2838 if (cmd == F_SETLK || cmd == F_SETLKW) {
2838 2839
2839 2840 if (!nfs4_safemap(vp)) {
2840 2841 mutex_enter(&vp->v_lock);
2841 2842 vp->v_flag |= VNOCACHE;
2842 2843 mutex_exit(&vp->v_lock);
2843 2844 } else {
2844 2845 mutex_enter(&vp->v_lock);
2845 2846 vp->v_flag &= ~VNOCACHE;
2846 2847 mutex_exit(&vp->v_lock);
2847 2848 }
2848 2849 }
2849 2850 /*
2850 2851 * The cached attributes of the file are stale after acquiring
2851 2852 * the lock on the file. They were updated when the file was
2852 2853 * opened, but not updated when the lock was acquired. Therefore the
2853 2854 * cached attributes are invalidated after the lock is obtained.
2854 2855 */
2855 2856 PURGE_ATTRCACHE4(vp);
2856 2857 }
2857 2858
2858 2859 /* ARGSUSED */
2859 2860 static void *
2860 2861 nfs4_mi_init(zoneid_t zoneid)
2861 2862 {
2862 2863 struct mi4_globals *mig;
2863 2864
2864 2865 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2865 2866 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2866 2867 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2867 2868 offsetof(mntinfo4_t, mi_zone_node));
2868 2869 mig->mig_destructor_called = B_FALSE;
2869 2870 return (mig);
2870 2871 }
2871 2872
2872 2873 /*
2873 2874 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2874 2875 * state and killing off threads.
2875 2876 */
2876 2877 /* ARGSUSED */
2877 2878 static void
2878 2879 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2879 2880 {
2880 2881 struct mi4_globals *mig = data;
2881 2882 mntinfo4_t *mi;
2882 2883 nfs4_server_t *np;
2883 2884
2884 2885 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2885 2886 "nfs4_mi_shutdown zone %d\n", zoneid));
2886 2887 ASSERT(mig != NULL);
2887 2888 for (;;) {
2888 2889 mutex_enter(&mig->mig_lock);
2889 2890 mi = list_head(&mig->mig_list);
2890 2891 if (mi == NULL) {
2891 2892 mutex_exit(&mig->mig_lock);
2892 2893 break;
2893 2894 }
2894 2895
2895 2896 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2896 2897 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2897 2898 /*
2898 2899 * purge the DNLC for this filesystem
2899 2900 */
2900 2901 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2901 2902 /*
2902 2903 * Tell existing async worker threads to exit.
2903 2904 */
2904 2905 mutex_enter(&mi->mi_async_lock);
2905 2906 mi->mi_max_threads = 0;
2906 2907 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2907 2908 /*
2908 2909 * Set the appropriate flags, signal and wait for both the
2909 2910 * async manager and the inactive thread to exit when they're
2910 2911 * done with their current work.
2911 2912 */
2912 2913 mutex_enter(&mi->mi_lock);
2913 2914 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2914 2915 mutex_exit(&mi->mi_lock);
2915 2916 mutex_exit(&mi->mi_async_lock);
2916 2917 if (mi->mi_manager_thread) {
2917 2918 nfs4_async_manager_stop(mi->mi_vfsp);
2918 2919 }
2919 2920 if (mi->mi_inactive_thread) {
2920 2921 mutex_enter(&mi->mi_async_lock);
2921 2922 cv_signal(&mi->mi_inact_req_cv);
2922 2923 /*
2923 2924 * Wait for the inactive thread to exit.
2924 2925 */
2925 2926 while (mi->mi_inactive_thread != NULL) {
2926 2927 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2927 2928 }
2928 2929 mutex_exit(&mi->mi_async_lock);
2929 2930 }
2930 2931 /*
2931 2932 * Wait for the recovery thread to complete, that is, it will
2932 2933 * signal when it is done using the "mi" structure and about
2933 2934 * to exit
2934 2935 */
2935 2936 mutex_enter(&mi->mi_lock);
2936 2937 while (mi->mi_in_recovery > 0)
2937 2938 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2938 2939 mutex_exit(&mi->mi_lock);
2939 2940 /*
2940 2941 * We're done when every mi has been done or the list is empty.
2941 2942 * This one is done, remove it from the list.
2942 2943 */
2943 2944 list_remove(&mig->mig_list, mi);
2944 2945 mutex_exit(&mig->mig_lock);
2945 2946 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2946 2947
2947 2948 		 * Release the holds on the vfs and mi that were taken in
2948 2949 		 * nfs4_mi_zonelist_add() to prevent a race with zone shutdown.
2949 2950 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2950 2951 */
2951 2952 VFS_RELE(mi->mi_vfsp);
2952 2953 MI4_RELE(mi);
2953 2954 }
2954 2955 /*
2955 2956 * Tell each renew thread in the zone to exit
2956 2957 */
2957 2958 mutex_enter(&nfs4_server_lst_lock);
2958 2959 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2959 2960 mutex_enter(&np->s_lock);
2960 2961 if (np->zoneid == zoneid) {
2961 2962 /*
2962 2963 * We add another hold onto the nfs4_server_t
2963 2964 			 * because this will make sure that the nfs4_server_t
2964 2965 * stays around until nfs4_callback_fini_zone destroys
2965 2966 * the zone. This way, the renew thread can
2966 2967 * unconditionally release its holds on the
2967 2968 * nfs4_server_t.
2968 2969 */
2969 2970 np->s_refcnt++;
2970 2971 nfs4_mark_srv_dead(np);
2971 2972 }
2972 2973 mutex_exit(&np->s_lock);
2973 2974 }
2974 2975 mutex_exit(&nfs4_server_lst_lock);
2975 2976 }
2976 2977
2977 2978 static void
2978 2979 nfs4_mi_free_globals(struct mi4_globals *mig)
2979 2980 {
2980 2981 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2981 2982 mutex_destroy(&mig->mig_lock);
2982 2983 kmem_free(mig, sizeof (*mig));
2983 2984 }
2984 2985
2985 2986 /* ARGSUSED */
2986 2987 static void
2987 2988 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2988 2989 {
2989 2990 struct mi4_globals *mig = data;
2990 2991
2991 2992 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2992 2993 "nfs4_mi_destroy zone %d\n", zoneid));
2993 2994 ASSERT(mig != NULL);
2994 2995 mutex_enter(&mig->mig_lock);
2995 2996 if (list_head(&mig->mig_list) != NULL) {
2996 2997 /* Still waiting for VFS_FREEVFS() */
2997 2998 mig->mig_destructor_called = B_TRUE;
2998 2999 mutex_exit(&mig->mig_lock);
2999 3000 return;
3000 3001 }
3001 3002 nfs4_mi_free_globals(mig);
3002 3003 }
3003 3004
3004 3005 /*
3005 3006 * Add an NFS mount to the per-zone list of NFS mounts.
3006 3007 */
3007 3008 void
3008 3009 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3009 3010 {
3010 3011 struct mi4_globals *mig;
3011 3012
3012 3013 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3013 3014 mutex_enter(&mig->mig_lock);
3014 3015 list_insert_head(&mig->mig_list, mi);
3015 3016 /*
3016 3017 	 * Holds are added to eliminate a race with zone shutdown; they
3017 3018 	 * will be released in nfs4_mi_shutdown().
3018 3019 */
3019 3020 MI4_HOLD(mi);
3020 3021 VFS_HOLD(mi->mi_vfsp);
3021 3022 mutex_exit(&mig->mig_lock);
3022 3023 }
3023 3024
3024 3025 /*
3025 3026 * Remove an NFS mount from the per-zone list of NFS mounts.
3026 3027 */
3027 3028 int
3028 3029 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3029 3030 {
3030 3031 struct mi4_globals *mig;
3031 3032 int ret = 0;
3032 3033
3033 3034 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3034 3035 mutex_enter(&mig->mig_lock);
3035 3036 mutex_enter(&mi->mi_lock);
3036 3037 /* if this mi is marked dead, then the zone already released it */
3037 3038 if (!(mi->mi_flags & MI4_DEAD)) {
3038 3039 list_remove(&mig->mig_list, mi);
3039 3040 mutex_exit(&mi->mi_lock);
3040 3041
3041 3042 /* release the holds put on in zonelist_add(). */
3042 3043 VFS_RELE(mi->mi_vfsp);
3043 3044 MI4_RELE(mi);
3044 3045 ret = 1;
3045 3046 } else {
3046 3047 mutex_exit(&mi->mi_lock);
3047 3048 }
3048 3049
3049 3050 /*
3050 3051 * We can be called asynchronously by VFS_FREEVFS() after the zone
3051 3052 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3052 3053 * mi globals.
3053 3054 */
3054 3055 if (list_head(&mig->mig_list) == NULL &&
3055 3056 mig->mig_destructor_called == B_TRUE) {
3056 3057 nfs4_mi_free_globals(mig);
3057 3058 return (ret);
3058 3059 }
3059 3060 mutex_exit(&mig->mig_lock);
3060 3061 return (ret);
3061 3062 }
3062 3063
3063 3064 void
3064 3065 nfs_free_mi4(mntinfo4_t *mi)
3065 3066 {
3066 3067 nfs4_open_owner_t *foop;
3067 3068 nfs4_oo_hash_bucket_t *bucketp;
3068 3069 nfs4_debug_msg_t *msgp;
3069 3070 int i;
3070 3071 servinfo4_t *svp;
3071 3072
3072 3073 /*
3073 3074 * Code introduced here should be carefully evaluated to make
3074 3075 * sure none of the freed resources are accessed either directly
3075 3076 	 * or indirectly after freeing them. For example: introducing
3076 3077 	 * calls to NFS4_DEBUG that use a mntinfo4_t structure member
3077 3078 	 * after the members have been freed, or other routines calling
3078 3079 	 * back into NFS that access a freed mntinfo4_t structure member.
3079 3080 */
3080 3081 mutex_enter(&mi->mi_lock);
3081 3082 ASSERT(mi->mi_recovthread == NULL);
3082 3083 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3083 3084 mutex_exit(&mi->mi_lock);
3084 3085 mutex_enter(&mi->mi_async_lock);
3085 3086 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3086 3087 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3087 3088 ASSERT(mi->mi_manager_thread == NULL);
3088 3089 mutex_exit(&mi->mi_async_lock);
3089 3090 if (mi->mi_io_kstats) {
3090 3091 kstat_delete(mi->mi_io_kstats);
3091 3092 mi->mi_io_kstats = NULL;
3092 3093 }
3093 3094 if (mi->mi_ro_kstats) {
3094 3095 kstat_delete(mi->mi_ro_kstats);
3095 3096 mi->mi_ro_kstats = NULL;
3096 3097 }
3097 3098 if (mi->mi_recov_ksp) {
3098 3099 kstat_delete(mi->mi_recov_ksp);
3099 3100 mi->mi_recov_ksp = NULL;
3100 3101 }
3101 3102 mutex_enter(&mi->mi_msg_list_lock);
3102 3103 while (msgp = list_head(&mi->mi_msg_list)) {
3103 3104 list_remove(&mi->mi_msg_list, msgp);
3104 3105 nfs4_free_msg(msgp);
3105 3106 }
3106 3107 mutex_exit(&mi->mi_msg_list_lock);
3107 3108 list_destroy(&mi->mi_msg_list);
3108 3109 if (mi->mi_fname != NULL)
3109 3110 fn_rele(&mi->mi_fname);
3110 3111 if (mi->mi_rootfh != NULL)
3111 3112 sfh4_rele(&mi->mi_rootfh);
3112 3113 if (mi->mi_srvparentfh != NULL)
3113 3114 sfh4_rele(&mi->mi_srvparentfh);
3114 3115 svp = mi->mi_servers;
3115 3116 sv4_free(svp);
3116 3117 mutex_destroy(&mi->mi_lock);
3117 3118 mutex_destroy(&mi->mi_async_lock);
3118 3119 mutex_destroy(&mi->mi_msg_list_lock);
3119 3120 nfs_rw_destroy(&mi->mi_recovlock);
3120 3121 nfs_rw_destroy(&mi->mi_rename_lock);
3121 3122 nfs_rw_destroy(&mi->mi_fh_lock);
3122 3123 cv_destroy(&mi->mi_failover_cv);
3123 3124 cv_destroy(&mi->mi_async_reqs_cv);
3124 3125 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3125 3126 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3126 3127 cv_destroy(&mi->mi_async_cv);
3127 3128 cv_destroy(&mi->mi_inact_req_cv);
3128 3129 /*
3129 3130 * Destroy the oo hash lists and mutexes for the cred hash table.
3130 3131 */
3131 3132 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3132 3133 bucketp = &(mi->mi_oo_list[i]);
3133 3134 /* Destroy any remaining open owners on the list */
3134 3135 foop = list_head(&bucketp->b_oo_hash_list);
3135 3136 while (foop != NULL) {
3136 3137 list_remove(&bucketp->b_oo_hash_list, foop);
3137 3138 nfs4_destroy_open_owner(foop);
3138 3139 foop = list_head(&bucketp->b_oo_hash_list);
3139 3140 }
3140 3141 list_destroy(&bucketp->b_oo_hash_list);
3141 3142 mutex_destroy(&bucketp->b_lock);
3142 3143 }
3143 3144 /*
3144 3145 * Empty and destroy the freed open owner list.
3145 3146 */
3146 3147 foop = list_head(&mi->mi_foo_list);
3147 3148 while (foop != NULL) {
3148 3149 list_remove(&mi->mi_foo_list, foop);
3149 3150 nfs4_destroy_open_owner(foop);
3150 3151 foop = list_head(&mi->mi_foo_list);
3151 3152 }
3152 3153 list_destroy(&mi->mi_foo_list);
3153 3154 list_destroy(&mi->mi_bseqid_list);
3154 3155 list_destroy(&mi->mi_lost_state);
3155 3156 avl_destroy(&mi->mi_filehandles);
3156 3157 kmem_free(mi, sizeof (*mi));
3157 3158 }
3158 3159 void
3159 3160 mi_hold(mntinfo4_t *mi)
3160 3161 {
3161 3162 atomic_inc_32(&mi->mi_count);
3162 3163 ASSERT(mi->mi_count != 0);
3163 3164 }
3164 3165
3165 3166 void
3166 3167 mi_rele(mntinfo4_t *mi)
3167 3168 {
3168 3169 ASSERT(mi->mi_count != 0);
3169 3170 if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3170 3171 nfs_free_mi4(mi);
3171 3172 }
3172 3173 }
3173 3174
3174 3175 vnode_t nfs4_xattr_notsupp_vnode;
3175 3176
3176 3177 void
3177 3178 nfs4_clnt_init(void)
3178 3179 {
3179 3180 nfs4_vnops_init();
3180 3181 (void) nfs4_rnode_init();
3181 3182 (void) nfs4_shadow_init();
3182 3183 (void) nfs4_acache_init();
3183 3184 (void) nfs4_subr_init();
3184 3185 nfs4_acl_init();
3185 3186 nfs_idmap_init();
3186 3187 nfs4_callback_init();
3187 3188 nfs4_secinfo_init();
3188 3189 #ifdef DEBUG
3189 3190 tsd_create(&nfs4_tsd_key, NULL);
3190 3191 #endif
3191 3192
3192 3193 /*
3193 3194 * Add a CPR callback so that we can update client
3194 3195 * lease after a suspend and resume.
3195 3196 */
3196 3197 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3197 3198
3198 3199 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3199 3200 nfs4_mi_destroy);
3200 3201
3201 3202 /*
3202 3203 * Initialize the reference count of the notsupp xattr cache vnode to 1
3203 3204 * so that it never goes away (VOP_INACTIVE isn't called on it).
3204 3205 */
3205 3206 vn_reinit(&nfs4_xattr_notsupp_vnode);
3206 3207 }
3207 3208
3208 3209 void
3209 3210 nfs4_clnt_fini(void)
3210 3211 {
3211 3212 (void) zone_key_delete(mi4_list_key);
3212 3213 nfs4_vnops_fini();
3213 3214 (void) nfs4_rnode_fini();
3214 3215 (void) nfs4_shadow_fini();
3215 3216 (void) nfs4_acache_fini();
3216 3217 (void) nfs4_subr_fini();
3217 3218 nfs_idmap_fini();
3218 3219 nfs4_callback_fini();
3219 3220 nfs4_secinfo_fini();
3220 3221 #ifdef DEBUG
3221 3222 tsd_destroy(&nfs4_tsd_key);
3222 3223 #endif
3223 3224 if (cid)
3224 3225 (void) callb_delete(cid);
3225 3226 }
3226 3227
3227 3228 /*ARGSUSED*/
3228 3229 static boolean_t
3229 3230 nfs4_client_cpr_callb(void *arg, int code)
3230 3231 {
3231 3232 /*
3232 3233 * We get called for Suspend and Resume events.
3233 3234 * For the suspend case we simply don't care!
3234 3235 */
3235 3236 if (code == CB_CODE_CPR_CHKPT) {
3236 3237 return (B_TRUE);
3237 3238 }
3238 3239
3239 3240 /*
3240 3241 * When we get to here we are in the process of
3241 3242 * resuming the system from a previous suspend.
3242 3243 */
3243 3244 nfs4_client_resumed = gethrestime_sec();
3244 3245 return (B_TRUE);
3245 3246 }
3246 3247
3247 3248 void
3248 3249 nfs4_renew_lease_thread(nfs4_server_t *sp)
3249 3250 {
3250 3251 int error = 0;
3251 3252 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3252 3253 clock_t tick_delay = 0;
3253 3254 clock_t time_left = 0;
3254 3255 callb_cpr_t cpr_info;
3255 3256 kmutex_t cpr_lock;
3256 3257
3257 3258 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3258 3259 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3259 3260 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3260 3261 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3261 3262
3262 3263 mutex_enter(&sp->s_lock);
3263 3264 /* sp->s_lease_time is set via a GETATTR */
3264 3265 sp->last_renewal_time = gethrestime_sec();
3265 3266 sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3266 3267 ASSERT(sp->s_refcnt >= 1);
3267 3268
3268 3269 for (;;) {
3269 3270 if (!sp->state_ref_count ||
3270 3271 sp->lease_valid != NFS4_LEASE_VALID) {
3271 3272
3272 3273 kip_secs = MAX((sp->s_lease_time >> 1) -
3273 3274 (3 * sp->propagation_delay.tv_sec), 1);
3274 3275
3275 3276 tick_delay = SEC_TO_TICK(kip_secs);
3276 3277
3277 3278 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3278 3279 "nfs4_renew_lease_thread: no renew : thread "
3279 3280 "wait %ld secs", kip_secs));
3280 3281
3281 3282 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3282 3283 "nfs4_renew_lease_thread: no renew : "
3283 3284 "state_ref_count %d, lease_valid %d",
3284 3285 sp->state_ref_count, sp->lease_valid));
3285 3286
3286 3287 mutex_enter(&cpr_lock);
3287 3288 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3288 3289 mutex_exit(&cpr_lock);
3289 3290 time_left = cv_reltimedwait(&sp->cv_thread_exit,
3290 3291 &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3291 3292 mutex_enter(&cpr_lock);
3292 3293 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3293 3294 mutex_exit(&cpr_lock);
3294 3295
3295 3296 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3296 3297 "nfs4_renew_lease_thread: no renew: "
3297 3298 "time left %ld", time_left));
3298 3299
3299 3300 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3300 3301 goto die;
3301 3302 continue;
3302 3303 }
3303 3304
3304 3305 tmp_last_renewal_time = sp->last_renewal_time;
3305 3306
3306 3307 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3307 3308 (3 * sp->propagation_delay.tv_sec);
3308 3309
3309 3310 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3310 3311 "nfs4_renew_lease_thread: tmp_time %ld, "
3311 3312 "sp->last_renewal_time %ld", tmp_time,
3312 3313 sp->last_renewal_time));
3313 3314
3314 3315 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3315 3316
3316 3317 tick_delay = SEC_TO_TICK(kip_secs);
3317 3318
3318 3319 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3319 3320 "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3320 3321 "secs", kip_secs));
3321 3322
3322 3323 mutex_enter(&cpr_lock);
3323 3324 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3324 3325 mutex_exit(&cpr_lock);
3325 3326 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3326 3327 tick_delay, TR_CLOCK_TICK);
3327 3328 mutex_enter(&cpr_lock);
3328 3329 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3329 3330 mutex_exit(&cpr_lock);
3330 3331
3331 3332 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3332 3333 "nfs4_renew_lease_thread: valid lease: time left %ld :"
3333 3334 "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3334 3335 "tmp_last_renewal_time %ld", time_left,
3335 3336 sp->last_renewal_time, nfs4_client_resumed,
3336 3337 tmp_last_renewal_time));
3337 3338
3338 3339 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3339 3340 goto die;
3340 3341
3341 3342 if (tmp_last_renewal_time == sp->last_renewal_time ||
3342 3343 (nfs4_client_resumed != 0 &&
3343 3344 nfs4_client_resumed > sp->last_renewal_time)) {
3344 3345 /*
3345 3346 			 * Issue a RENEW op, since the lease hasn't been
3346 3347 			 * renewed while we slept.
3347 3348 */
3348 3349 tmp_now_time = gethrestime_sec();
3349 3350 error = nfs4renew(sp);
3350 3351 /*
3351 3352 * Need to re-acquire sp's lock, nfs4renew()
3352 3353 		 * relinquishes it.
3353 3354 */
3354 3355 mutex_enter(&sp->s_lock);
3355 3356
3356 3357 /*
3357 3358 * See if someone changed s_thread_exit while we gave
3358 3359 * up s_lock.
3359 3360 */
3360 3361 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3361 3362 goto die;
3362 3363
3363 3364 if (!error) {
3364 3365 /*
3365 3366 * check to see if we implicitly renewed while
3366 3367 				 * we waited for a reply to our RENEW call.
3367 3368 */
3368 3369 if (tmp_last_renewal_time ==
3369 3370 sp->last_renewal_time) {
3370 3371 /* no implicit renew came */
3371 3372 sp->last_renewal_time = tmp_now_time;
3372 3373 } else {
3373 3374 NFS4_DEBUG(nfs4_client_lease_debug,
3374 3375 (CE_NOTE, "renew_thread: did "
3375 3376 "implicit renewal before reply "
3376 3377 "from server for RENEW"));
3377 3378 }
3378 3379 } else {
3379 3380 /* figure out error */
3380 3381 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3381 3382 "renew_thread: nfs4renew returned error"
3382 3383 " %d", error));
3383 3384 }
3384 3385
3385 3386 }
3386 3387 }
3387 3388
3388 3389 die:
3389 3390 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3390 3391 "nfs4_renew_lease_thread: thread exiting"));
3391 3392
3392 3393 while (sp->s_otw_call_count != 0) {
3393 3394 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3394 3395 "nfs4_renew_lease_thread: waiting for outstanding "
3395 3396 "otw calls to finish for sp 0x%p, current "
3396 3397 "s_otw_call_count %d", (void *)sp,
3397 3398 sp->s_otw_call_count));
3398 3399 mutex_enter(&cpr_lock);
3399 3400 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3400 3401 mutex_exit(&cpr_lock);
3401 3402 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3402 3403 mutex_enter(&cpr_lock);
3403 3404 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3404 3405 mutex_exit(&cpr_lock);
3405 3406 }
3406 3407 mutex_exit(&sp->s_lock);
3407 3408
3408 3409 nfs4_server_rele(sp); /* free the thread's reference */
3409 3410 nfs4_server_rele(sp); /* free the list's reference */
3410 3411 sp = NULL;
3411 3412
3412 3413 done:
3413 3414 mutex_enter(&cpr_lock);
3414 3415 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3415 3416 mutex_destroy(&cpr_lock);
3416 3417
3417 3418 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3418 3419 "nfs4_renew_lease_thread: renew thread exit officially"));
3419 3420
3420 3421 zthread_exit();
3421 3422 /* NOT REACHED */
3422 3423 }
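/*
 * Worked example of the sleep interval computed above (the 60 second
 * lease and 1 second propagation delay are assumed values, chosen only
 * to illustrate the arithmetic):
 *
 *	s_lease_time = 60, propagation_delay.tv_sec = 1
 *	tmp_time = now - last_renewal_time + 3 * 1 = 3	(just renewed)
 *	kip_secs = MAX((60 >> 1) - 3, 1) = 27
 *
 * so the thread wakes up roughly half a lease period after the last
 * renewal, minus three propagation-delay allowances, and never sleeps
 * for less than one second.
 */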
3423 3424
3424 3425 /*
3425 3426 * Send out a RENEW op to the server.
3426 3427 * Assumes sp is locked down.
3427 3428 */
3428 3429 static int
3429 3430 nfs4renew(nfs4_server_t *sp)
3430 3431 {
3431 3432 COMPOUND4args_clnt args;
3432 3433 COMPOUND4res_clnt res;
3433 3434 nfs_argop4 argop[1];
3434 3435 int doqueue = 1;
3435 3436 int rpc_error;
3436 3437 cred_t *cr;
3437 3438 mntinfo4_t *mi;
3438 3439 timespec_t prop_time, after_time;
3439 3440 int needrecov = FALSE;
3440 3441 nfs4_recov_state_t recov_state;
3441 3442 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3442 3443
3443 3444 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3444 3445
3445 3446 recov_state.rs_flags = 0;
3446 3447 recov_state.rs_num_retry_despite_err = 0;
3447 3448
3448 3449 recov_retry:
3449 3450 	mi = sp->mntinfo4_list;
3450 3451 	ASSERT(mi != NULL);
3451 3452 	VFS_HOLD(mi->mi_vfsp);
3452 3453 	mutex_exit(&sp->s_lock);
3453 3454
3454 3455 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3455 3456 if (e.error) {
3456 3457 VFS_RELE(mi->mi_vfsp);
3457 3458 return (e.error);
3458 3459 }
3459 3460
3460 3461 /* Check to see if we're dealing with a marked-dead sp */
3461 3462 mutex_enter(&sp->s_lock);
3462 3463 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3463 3464 mutex_exit(&sp->s_lock);
3464 3465 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3465 3466 VFS_RELE(mi->mi_vfsp);
3466 3467 return (0);
3467 3468 }
3468 3469
3469 3470 /* Make sure mi hasn't changed on us */
3470 3471 if (mi != sp->mntinfo4_list) {
3471 3472 /* Must drop sp's lock to avoid a recursive mutex enter */
3472 3473 mutex_exit(&sp->s_lock);
3473 3474 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3474 3475 VFS_RELE(mi->mi_vfsp);
3475 3476 mutex_enter(&sp->s_lock);
3476 3477 goto recov_retry;
3477 3478 }
3478 3479 mutex_exit(&sp->s_lock);
3479 3480
3480 3481 args.ctag = TAG_RENEW;
3481 3482
3482 3483 args.array_len = 1;
3483 3484 args.array = argop;
3484 3485
3485 3486 argop[0].argop = OP_RENEW;
3486 3487
3487 3488 mutex_enter(&sp->s_lock);
3488 3489 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3489 3490 cr = sp->s_cred;
3490 3491 crhold(cr);
3491 3492 mutex_exit(&sp->s_lock);
3492 3493
3493 3494 ASSERT(cr != NULL);
3494 3495
3495 3496 /* used to figure out RTT for sp */
3496 3497 gethrestime(&prop_time);
3497 3498
3498 3499 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3499 3500 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3500 3501 (void*)sp));
3501 3502 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3502 3503 prop_time.tv_sec, prop_time.tv_nsec));
3503 3504
3504 3505 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3505 3506 mntinfo4_t *, mi);
3506 3507
3507 3508 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3508 3509 crfree(cr);
3509 3510
3510 3511 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3511 3512 mntinfo4_t *, mi);
3512 3513
3513 3514 gethrestime(&after_time);
3514 3515
3515 3516 mutex_enter(&sp->s_lock);
3516 3517 sp->propagation_delay.tv_sec =
3517 3518 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3518 3519 mutex_exit(&sp->s_lock);
3519 3520
3520 3521 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3521 3522 after_time.tv_sec, after_time.tv_nsec));
3522 3523
3523 3524 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3524 3525 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3525 3526 nfs4_delegreturn_all(sp);
3526 3527 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3527 3528 VFS_RELE(mi->mi_vfsp);
3528 3529 /*
3529 3530 * If the server returns CB_PATH_DOWN, it has renewed
3530 3531 * the lease and informed us that the callback path is
3531 3532 * down. Since the lease is renewed, just return 0 and
3532 3533 * let the renew thread proceed as normal.
3533 3534 */
3534 3535 return (0);
3535 3536 }
3536 3537
3537 3538 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3538 3539 if (!needrecov && e.error) {
3539 3540 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3540 3541 VFS_RELE(mi->mi_vfsp);
3541 3542 return (e.error);
3542 3543 }
3543 3544
3544 3545 rpc_error = e.error;
3545 3546
3546 3547 if (needrecov) {
3547 3548 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3548 3549 "nfs4renew: initiating recovery\n"));
3549 3550
3550 3551 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3551 3552 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3552 3553 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3553 3554 VFS_RELE(mi->mi_vfsp);
3554 3555 if (!e.error)
3555 3556 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3556 3557 mutex_enter(&sp->s_lock);
3557 3558 goto recov_retry;
3558 3559 }
3559 3560 /* fall through for res.status case */
3560 3561 }
3561 3562
3562 3563 if (res.status) {
3563 3564 if (res.status == NFS4ERR_LEASE_MOVED) {
3564 3565 /*EMPTY*/
3565 3566 /*
3566 3567 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3567 3568 * to renew the lease on that server
3568 3569 */
3569 3570 }
3570 3571 e.error = geterrno4(res.status);
3571 3572 }
3572 3573
3573 3574 if (!rpc_error)
3574 3575 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3575 3576
3576 3577 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3577 3578
3578 3579 VFS_RELE(mi->mi_vfsp);
3579 3580
3580 3581 return (e.error);
3581 3582 }
3582 3583
3583 3584 void
3584 3585 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3585 3586 {
3586 3587 nfs4_server_t *sp;
3587 3588
3588 3589 /* this locks down sp if it is found */
3589 3590 sp = find_nfs4_server(mi);
3590 3591
3591 3592 if (sp != NULL) {
3592 3593 nfs4_inc_state_ref_count_nolock(sp, mi);
3593 3594 mutex_exit(&sp->s_lock);
3594 3595 nfs4_server_rele(sp);
3595 3596 }
3596 3597 }
3597 3598
3598 3599 /*
3599 3600  * Bump the number of OPEN files (i.e., those with state) so we know if this
3600 3601 * nfs4_server has any state to maintain a lease for or not.
3601 3602 *
3602 3603 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3603 3604 */
3604 3605 void
3605 3606 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3606 3607 {
3607 3608 ASSERT(mutex_owned(&sp->s_lock));
3608 3609
3609 3610 sp->state_ref_count++;
3610 3611 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3611 3612 "nfs4_inc_state_ref_count: state_ref_count now %d",
3612 3613 sp->state_ref_count));
3613 3614
3614 3615 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3615 3616 sp->lease_valid = NFS4_LEASE_VALID;
3616 3617
3617 3618 /*
3618 3619 * If this call caused the lease to be marked valid and/or
3619 3620 	 * took the state_ref_count from 0 to 1, then start the clock
3620 3621 	 * on lease renewal.
3621 3622 */
3622 3623 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3623 3624 sp->last_renewal_time = gethrestime_sec();
3624 3625
3625 3626 /* update the number of open files for mi */
3626 3627 mi->mi_open_files++;
3627 3628 }
3628 3629
3629 3630 void
3630 3631 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3631 3632 {
3632 3633 nfs4_server_t *sp;
3633 3634
3634 3635 /* this locks down sp if it is found */
3635 3636 sp = find_nfs4_server_all(mi, 1);
3636 3637
3637 3638 if (sp != NULL) {
3638 3639 nfs4_dec_state_ref_count_nolock(sp, mi);
3639 3640 mutex_exit(&sp->s_lock);
3640 3641 nfs4_server_rele(sp);
3641 3642 }
3642 3643 }
3643 3644
3644 3645 /*
3645 3646  * Decrement the number of OPEN files (i.e., those with state) so we know if
3646 3647 * this nfs4_server has any state to maintain a lease for or not.
3647 3648 */
3648 3649 void
3649 3650 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3650 3651 {
3651 3652 ASSERT(mutex_owned(&sp->s_lock));
3652 3653 ASSERT(sp->state_ref_count != 0);
3653 3654 sp->state_ref_count--;
3654 3655
3655 3656 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3656 3657 "nfs4_dec_state_ref_count: state ref count now %d",
3657 3658 sp->state_ref_count));
3658 3659
3659 3660 mi->mi_open_files--;
3660 3661 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3661 3662 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3662 3663 mi->mi_open_files, mi->mi_flags));
3663 3664
3664 3665 /* We don't have to hold the mi_lock to test mi_flags */
3665 3666 if (mi->mi_open_files == 0 &&
3666 3667 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3667 3668 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3668 3669 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3669 3670 "we have closed the last open file", (void*)mi));
3670 3671 nfs4_remove_mi_from_server(mi, sp);
3671 3672 }
3672 3673 }
3673 3674
3674 3675 bool_t
3675 3676 inlease(nfs4_server_t *sp)
3676 3677 {
3677 3678 bool_t result;
3678 3679
3679 3680 ASSERT(mutex_owned(&sp->s_lock));
3680 3681
3681 3682 if (sp->lease_valid == NFS4_LEASE_VALID &&
3682 3683 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3683 3684 result = TRUE;
3684 3685 else
3685 3686 result = FALSE;
3686 3687
3687 3688 return (result);
3688 3689 }
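/*
 * Example (assumed numbers, for illustration only): with a valid
 * lease, last_renewal_time = 1000 and s_lease_time = 60, inlease()
 * returns TRUE while gethrestime_sec() is below 1060 and FALSE once
 * the lease interval has elapsed.
 */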
3689 3690
3690 3691
3691 3692 /*
3692 3693 * Return non-zero if the given nfs4_server_t is going through recovery.
3693 3694 */
3694 3695
3695 3696 int
3696 3697 nfs4_server_in_recovery(nfs4_server_t *sp)
3697 3698 {
3698 3699 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3699 3700 }
3700 3701
3701 3702 /*
3702 3703 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3703 3704 * first is less than, equal to, or greater than the second.
3704 3705 */
3705 3706
3706 3707 int
3707 3708 sfh4cmp(const void *p1, const void *p2)
3708 3709 {
3709 3710 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3710 3711 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3711 3712
3712 3713 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3713 3714 }
3714 3715
3715 3716 /*
3716 3717 * Create a table for shared filehandle objects.
3717 3718 */
3718 3719
3719 3720 void
3720 3721 sfh4_createtab(avl_tree_t *tab)
3721 3722 {
3722 3723 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3723 3724 offsetof(nfs4_sharedfh_t, sfh_tree));
3724 3725 }
3725 3726
3726 3727 /*
3727 3728 * Return a shared filehandle object for the given filehandle. The caller
3728 3729 * is responsible for eventually calling sfh4_rele().
3729 3730 */
3730 3731
3731 3732 nfs4_sharedfh_t *
3732 3733 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3733 3734 {
3734 3735 nfs4_sharedfh_t *sfh, *nsfh;
3735 3736 avl_index_t where;
3736 3737 nfs4_sharedfh_t skey;
3737 3738
3738 3739 if (!key) {
3739 3740 skey.sfh_fh = *fh;
3740 3741 key = &skey;
3741 3742 }
3742 3743
3743 3744 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3744 3745 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3745 3746 /*
3746 3747 * We allocate the largest possible filehandle size because it's
3747 3748 * not that big, and it saves us from possibly having to resize the
3748 3749 * buffer later.
3749 3750 */
3750 3751 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3751 3752 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3752 3753 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3753 3754 nsfh->sfh_refcnt = 1;
3754 3755 nsfh->sfh_flags = SFH4_IN_TREE;
3755 3756 nsfh->sfh_mi = mi;
3756 3757 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3757 3758 (void *)nsfh));
3758 3759
3759 3760 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3760 3761 sfh = avl_find(&mi->mi_filehandles, key, &where);
3761 3762 if (sfh != NULL) {
3762 3763 mutex_enter(&sfh->sfh_lock);
3763 3764 sfh->sfh_refcnt++;
3764 3765 mutex_exit(&sfh->sfh_lock);
3765 3766 nfs_rw_exit(&mi->mi_fh_lock);
3766 3767 /* free our speculative allocs */
3767 3768 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3768 3769 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3769 3770 return (sfh);
3770 3771 }
3771 3772
3772 3773 avl_insert(&mi->mi_filehandles, nsfh, where);
3773 3774 nfs_rw_exit(&mi->mi_fh_lock);
3774 3775
3775 3776 return (nsfh);
3776 3777 }
3777 3778
3778 3779 /*
3779 3780 * Return a shared filehandle object for the given filehandle. The caller
3780 3781 * is responsible for eventually calling sfh4_rele().
3781 3782 */
3782 3783
3783 3784 nfs4_sharedfh_t *
3784 3785 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3785 3786 {
3786 3787 nfs4_sharedfh_t *sfh;
3787 3788 nfs4_sharedfh_t key;
3788 3789
3789 3790 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3790 3791
3791 3792 #ifdef DEBUG
3792 3793 if (nfs4_sharedfh_debug) {
3793 3794 nfs4_fhandle_t fhandle;
3794 3795
3795 3796 fhandle.fh_len = fh->nfs_fh4_len;
3796 3797 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3797 3798 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3798 3799 nfs4_printfhandle(&fhandle);
3799 3800 }
3800 3801 #endif
3801 3802
3802 3803 /*
3803 3804 * If there's already an object for the given filehandle, bump the
3804 3805 * reference count and return it. Otherwise, create a new object
3805 3806 * and add it to the AVL tree.
3806 3807 */
3807 3808
3808 3809 key.sfh_fh = *fh;
3809 3810
3810 3811 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3811 3812 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3812 3813 if (sfh != NULL) {
3813 3814 mutex_enter(&sfh->sfh_lock);
3814 3815 sfh->sfh_refcnt++;
3815 3816 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3816 3817 "sfh4_get: found existing %p, new refcnt=%d",
3817 3818 (void *)sfh, sfh->sfh_refcnt));
3818 3819 mutex_exit(&sfh->sfh_lock);
3819 3820 nfs_rw_exit(&mi->mi_fh_lock);
3820 3821 return (sfh);
3821 3822 }
3822 3823 nfs_rw_exit(&mi->mi_fh_lock);
3823 3824
3824 3825 return (sfh4_put(fh, mi, &key));
3825 3826 }
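/*
 * Usage sketch for the sfh4_get()/sfh4_rele() contract (a hypothetical
 * caller; "fh" and "mi" are assumed to be a filehandle and mount the
 * caller already holds):
 *
 *	nfs4_sharedfh_t *sfh;
 *
 *	sfh = sfh4_get(&fh, mi);	(returns with a reference held)
 *	... use sfh, e.g. sfh->sfh_fh ...
 *	sfh4_rele(&sfh);		(drops the reference, NULLs sfh)
 */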
3826 3827
3827 3828 /*
3828 3829 * Get a reference to the given shared filehandle object.
3829 3830 */
3830 3831
3831 3832 void
3832 3833 sfh4_hold(nfs4_sharedfh_t *sfh)
3833 3834 {
3834 3835 ASSERT(sfh->sfh_refcnt > 0);
3835 3836
3836 3837 mutex_enter(&sfh->sfh_lock);
3837 3838 sfh->sfh_refcnt++;
3838 3839 NFS4_DEBUG(nfs4_sharedfh_debug,
3839 3840 (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3840 3841 (void *)sfh, sfh->sfh_refcnt));
3841 3842 mutex_exit(&sfh->sfh_lock);
3842 3843 }
3843 3844
3844 3845 /*
3845 3846 * Release a reference to the given shared filehandle object and null out
3846 3847 * the given pointer.
3847 3848 */
3848 3849
3849 3850 void
3850 3851 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3851 3852 {
3852 3853 mntinfo4_t *mi;
3853 3854 nfs4_sharedfh_t *sfh = *sfhpp;
3854 3855
3855 3856 ASSERT(sfh->sfh_refcnt > 0);
3856 3857
3857 3858 mutex_enter(&sfh->sfh_lock);
3858 3859 if (sfh->sfh_refcnt > 1) {
3859 3860 sfh->sfh_refcnt--;
3860 3861 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3861 3862 "sfh4_rele %p, new refcnt=%d",
3862 3863 (void *)sfh, sfh->sfh_refcnt));
3863 3864 mutex_exit(&sfh->sfh_lock);
3864 3865 goto finish;
3865 3866 }
3866 3867 mutex_exit(&sfh->sfh_lock);
3867 3868
3868 3869 /*
3869 3870 * Possibly the last reference, so get the lock for the table in
3870 3871 * case it's time to remove the object from the table.
3871 3872 */
3872 3873 mi = sfh->sfh_mi;
3873 3874 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3874 3875 mutex_enter(&sfh->sfh_lock);
3875 3876 sfh->sfh_refcnt--;
3876 3877 if (sfh->sfh_refcnt > 0) {
3877 3878 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3878 3879 "sfh4_rele %p, new refcnt=%d",
3879 3880 (void *)sfh, sfh->sfh_refcnt));
3880 3881 mutex_exit(&sfh->sfh_lock);
3881 3882 nfs_rw_exit(&mi->mi_fh_lock);
3882 3883 goto finish;
3883 3884 }
3884 3885
3885 3886 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3886 3887 "sfh4_rele %p, last ref", (void *)sfh));
3887 3888 if (sfh->sfh_flags & SFH4_IN_TREE) {
3888 3889 avl_remove(&mi->mi_filehandles, sfh);
3889 3890 sfh->sfh_flags &= ~SFH4_IN_TREE;
3890 3891 }
3891 3892 mutex_exit(&sfh->sfh_lock);
3892 3893 nfs_rw_exit(&mi->mi_fh_lock);
3893 3894 mutex_destroy(&sfh->sfh_lock);
3894 3895 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3895 3896 kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3896 3897
3897 3898 finish:
3898 3899 *sfhpp = NULL;
3899 3900 }
3900 3901
3901 3902 /*
3902 3903 * Update the filehandle for the given shared filehandle object.
3903 3904 */
3904 3905
3905 3906 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */
3906 3907
3907 3908 void
3908 3909 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3909 3910 {
3910 3911 mntinfo4_t *mi = sfh->sfh_mi;
3911 3912 nfs4_sharedfh_t *dupsfh;
3912 3913 avl_index_t where;
3913 3914 nfs4_sharedfh_t key;
3914 3915
3915 3916 #ifdef DEBUG
3916 3917 mutex_enter(&sfh->sfh_lock);
3917 3918 ASSERT(sfh->sfh_refcnt > 0);
3918 3919 mutex_exit(&sfh->sfh_lock);
3919 3920 #endif
3920 3921 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3921 3922
3922 3923 /*
3923 3924 * The basic plan is to remove the shared filehandle object from
3924 3925 * the table, update it to have the new filehandle, then reinsert
3925 3926 * it.
3926 3927 */
3927 3928
3928 3929 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3929 3930 mutex_enter(&sfh->sfh_lock);
3930 3931 if (sfh->sfh_flags & SFH4_IN_TREE) {
3931 3932 avl_remove(&mi->mi_filehandles, sfh);
3932 3933 sfh->sfh_flags &= ~SFH4_IN_TREE;
3933 3934 }
3934 3935 mutex_exit(&sfh->sfh_lock);
3935 3936 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3936 3937 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3937 3938 sfh->sfh_fh.nfs_fh4_len);
3938 3939
3939 3940 /*
3940 3941 * XXX If there is already a shared filehandle object with the new
3941 3942 * filehandle, we're in trouble, because the rnode code assumes
3942 3943 * that there is only one shared filehandle object for a given
3943 3944 * filehandle. So issue a warning (for read-write mounts only)
3944 3945 * and don't try to re-insert the given object into the table.
3945 3946 * Hopefully the given object will quickly go away and everyone
3946 3947 * will use the new object.
3947 3948 */
3948 3949 key.sfh_fh = *newfh;
3949 3950 dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3950 3951 if (dupsfh != NULL) {
3951 3952 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3952 3953 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3953 3954 "duplicate filehandle detected");
3954 3955 sfh4_printfhandle(dupsfh);
3955 3956 }
3956 3957 } else {
3957 3958 avl_insert(&mi->mi_filehandles, sfh, where);
3958 3959 mutex_enter(&sfh->sfh_lock);
3959 3960 sfh->sfh_flags |= SFH4_IN_TREE;
3960 3961 mutex_exit(&sfh->sfh_lock);
3961 3962 }
3962 3963 nfs_rw_exit(&mi->mi_fh_lock);
3963 3964 }
3964 3965
3965 3966 /*
3966 3967 * Copy out the current filehandle for the given shared filehandle object.
3967 3968 */
3968 3969
3969 3970 void
3970 3971 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3971 3972 {
3972 3973 mntinfo4_t *mi = sfh->sfh_mi;
3973 3974
3974 3975 ASSERT(sfh->sfh_refcnt > 0);
3975 3976
3976 3977 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3977 3978 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3978 3979 ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3979 3980 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3980 3981 nfs_rw_exit(&mi->mi_fh_lock);
3981 3982 }
3982 3983
3983 3984 /*
3984 3985 * Print out the filehandle for the given shared filehandle object.
3985 3986 */
3986 3987
3987 3988 void
3988 3989 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3989 3990 {
3990 3991 nfs4_fhandle_t fhandle;
3991 3992
3992 3993 sfh4_copyval(sfh, &fhandle);
3993 3994 nfs4_printfhandle(&fhandle);
3994 3995 }
3995 3996
3996 3997 /*
3997 3998 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
3998 3999 * if they're the same, +1 if the first is "greater" than the second. The
3999 4000 * caller (or whoever's calling the AVL package) is responsible for
4000 4001 * handling locking issues.
4001 4002 */
4002 4003
4003 4004 static int
4004 4005 fncmp(const void *p1, const void *p2)
4005 4006 {
4006 4007 const nfs4_fname_t *f1 = p1;
4007 4008 const nfs4_fname_t *f2 = p2;
4008 4009 int res;
4009 4010
4010 4011 res = strcmp(f1->fn_name, f2->fn_name);
4011 4012 /*
4012 4013 * The AVL package wants +/-1, not arbitrary positive or negative
4013 4014 * integers.
4014 4015 */
4015 4016 if (res > 0)
4016 4017 res = 1;
4017 4018 else if (res < 0)
4018 4019 res = -1;
4019 4020 return (res);
4020 4021 }
4021 4022
4022 4023 /*
4023 4024 * Get or create an fname with the given name, as a child of the given
4024 4025 * fname. The caller is responsible for eventually releasing the reference
4025 4026 * (fn_rele()). parent may be NULL.
4026 4027 */
4027 4028
4028 4029 nfs4_fname_t *
4029 4030 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4030 4031 {
4031 4032 nfs4_fname_t key;
4032 4033 nfs4_fname_t *fnp;
4033 4034 avl_index_t where;
4034 4035
4035 4036 key.fn_name = name;
4036 4037
4037 4038 /*
4038 4039 * If there's already an fname registered with the given name, bump
4039 4040 * its reference count and return it. Otherwise, create a new one
4040 4041 * and add it to the parent's AVL tree.
4041 4042 *
4042 4043 * fname entries we are looking for should match both name
4043 4044 * and sfh stored in the fname.
4044 4045 */
4045 4046 again:
4046 4047 if (parent != NULL) {
4047 4048 mutex_enter(&parent->fn_lock);
4048 4049 fnp = avl_find(&parent->fn_children, &key, &where);
4049 4050 if (fnp != NULL) {
4050 4051 /*
4051 4052 			 * This hold on fnp is released below,
4052 4053 			 * in case this is not the fnp we want.
4053 4054 */
4054 4055 fn_hold(fnp);
4055 4056
4056 4057 if (fnp->fn_sfh == sfh) {
4057 4058 /*
4058 4059 				 * We have found our entry; return it
4059 4060 				 * with the hold taken above.
4060 4061 */
4061 4062 mutex_exit(&parent->fn_lock);
4062 4063 return (fnp);
4063 4064 }
4064 4065
4065 4066 /*
4066 4067 * We have found an entry that has a mismatching
4067 4068 * fn_sfh. This could be a stale entry due to
4068 4069 * server side rename. We will remove this entry
4069 4070 * and make sure no such entries exist.
4070 4071 */
4071 4072 mutex_exit(&parent->fn_lock);
4072 4073 mutex_enter(&fnp->fn_lock);
4073 4074 if (fnp->fn_parent == parent) {
4074 4075 /*
4075 4076 * Remove ourselves from parent's
4076 4077 * fn_children tree.
4077 4078 */
4078 4079 mutex_enter(&parent->fn_lock);
4079 4080 avl_remove(&parent->fn_children, fnp);
4080 4081 mutex_exit(&parent->fn_lock);
4081 4082 fn_rele(&fnp->fn_parent);
4082 4083 }
4083 4084 mutex_exit(&fnp->fn_lock);
4084 4085 fn_rele(&fnp);
4085 4086 goto again;
4086 4087 }
4087 4088 }
4088 4089
4089 4090 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4090 4091 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4091 4092 fnp->fn_parent = parent;
4092 4093 if (parent != NULL)
4093 4094 fn_hold(parent);
4094 4095 fnp->fn_len = strlen(name);
4095 4096 ASSERT(fnp->fn_len < MAXNAMELEN);
4096 4097 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4097 4098 (void) strcpy(fnp->fn_name, name);
4098 4099 fnp->fn_refcnt = 1;
4099 4100
4100 4101 /*
4101 4102 * This hold on sfh is later released
4102 4103 * when we do the final fn_rele() on this fname.
4103 4104 */
4104 4105 sfh4_hold(sfh);
4105 4106 fnp->fn_sfh = sfh;
4106 4107
4107 4108 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4108 4109 offsetof(nfs4_fname_t, fn_tree));
4109 4110 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4110 4111 "fn_get %p:%s, a new nfs4_fname_t!",
4111 4112 (void *)fnp, fnp->fn_name));
4112 4113 if (parent != NULL) {
4113 4114 avl_insert(&parent->fn_children, fnp, where);
4114 4115 mutex_exit(&parent->fn_lock);
4115 4116 }
4116 4117
4117 4118 return (fnp);
4118 4119 }
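/*
 * Usage sketch (hypothetical caller; "parentfn", "sfh" and the name
 * "foo" are illustrative and assumed to be held/valid already):
 *
 *	nfs4_fname_t *nm;
 *
 *	nm = fn_get(parentfn, "foo", sfh);	(one reference for caller)
 *	... associate nm with the object being named ...
 *	fn_rele(&nm);				(when that association ends)
 */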
4119 4120
4120 4121 void
4121 4122 fn_hold(nfs4_fname_t *fnp)
4122 4123 {
4123 4124 atomic_inc_32(&fnp->fn_refcnt);
4124 4125 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4125 4126 "fn_hold %p:%s, new refcnt=%d",
4126 4127 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4127 4128 }
4128 4129
4129 4130 /*
4130 4131 * Decrement the reference count of the given fname, and destroy it if its
4131 4132 * reference count goes to zero. Nulls out the given pointer.
4132 4133 */
4133 4134
4134 4135 void
4135 4136 fn_rele(nfs4_fname_t **fnpp)
4136 4137 {
4137 4138 nfs4_fname_t *parent;
4138 4139 uint32_t newref;
4139 4140 nfs4_fname_t *fnp;
4140 4141
4141 4142 recur:
4142 4143 fnp = *fnpp;
4143 4144 *fnpp = NULL;
4144 4145
4145 4146 mutex_enter(&fnp->fn_lock);
4146 4147 parent = fnp->fn_parent;
4147 4148 if (parent != NULL)
4148 4149 mutex_enter(&parent->fn_lock); /* prevent new references */
4149 4150 newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4150 4151 if (newref > 0) {
4151 4152 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152 4153 "fn_rele %p:%s, new refcnt=%d",
4153 4154 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4154 4155 if (parent != NULL)
4155 4156 mutex_exit(&parent->fn_lock);
4156 4157 mutex_exit(&fnp->fn_lock);
4157 4158 return;
4158 4159 }
4159 4160
4160 4161 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4161 4162 "fn_rele %p:%s, last reference, deleting...",
4162 4163 (void *)fnp, fnp->fn_name));
4163 4164 if (parent != NULL) {
4164 4165 avl_remove(&parent->fn_children, fnp);
4165 4166 mutex_exit(&parent->fn_lock);
4166 4167 }
4167 4168 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4168 4169 sfh4_rele(&fnp->fn_sfh);
4169 4170 mutex_destroy(&fnp->fn_lock);
4170 4171 avl_destroy(&fnp->fn_children);
4171 4172 kmem_free(fnp, sizeof (nfs4_fname_t));
4172 4173 /*
4173 4174 	 * Recursively fn_rele the parent.
4174 4175 * Use goto instead of a recursive call to avoid stack overflow.
4175 4176 */
4176 4177 if (parent != NULL) {
4177 4178 fnpp = &parent;
4178 4179 goto recur;
4179 4180 }
4180 4181 }
4181 4182
4182 4183 /*
4183 4184 * Returns the single component name of the given fname, in a MAXNAMELEN
4184 4185 * string buffer, which the caller is responsible for freeing. Note that
4185 4186 * the name may become invalid as a result of fn_move().
4186 4187 */
4187 4188
4188 4189 char *
4189 4190 fn_name(nfs4_fname_t *fnp)
4190 4191 {
4191 4192 char *name;
4192 4193
4193 4194 ASSERT(fnp->fn_len < MAXNAMELEN);
4194 4195 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4195 4196 mutex_enter(&fnp->fn_lock);
4196 4197 (void) strcpy(name, fnp->fn_name);
4197 4198 mutex_exit(&fnp->fn_lock);
4198 4199
4199 4200 return (name);
4200 4201 }
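/*
 * Usage sketch (hypothetical caller): the returned buffer is always
 * MAXNAMELEN bytes regardless of the name's actual length, so it must
 * be freed with that size:
 *
 *	char *nm = fn_name(fnp);
 *	... use nm ...
 *	kmem_free(nm, MAXNAMELEN);
 */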
4201 4202
4202 4203
4203 4204 /*
4204 4205 * fn_path_realloc
4205 4206 *
4206 4207 * This function, used only by fn_path, constructs
4207 4208  * a new string which looks like "prepend" + "/" + "current",
4208 4209  * by allocating a new string and freeing the old one.
4209 4210 */
4210 4211 static void
4211 4212 fn_path_realloc(char **curses, char *prepend)
4212 4213 {
4213 4214 int len, curlen = 0;
4214 4215 char *news;
4215 4216
4216 4217 if (*curses == NULL) {
4217 4218 /*
4218 4219 * Prime the pump, allocate just the
4219 4220 * space for prepend and return that.
4220 4221 */
4221 4222 len = strlen(prepend) + 1;
4222 4223 news = kmem_alloc(len, KM_SLEEP);
4223 4224 (void) strncpy(news, prepend, len);
4224 4225 } else {
4225 4226 /*
4226 4227 * Allocate the space for a new string
4227 4228 * +1 +1 is for the "/" and the NULL
4228 4229 * byte at the end of it all.
4229 4230 */
4230 4231 curlen = strlen(*curses);
4231 4232 len = curlen + strlen(prepend) + 1 + 1;
4232 4233 news = kmem_alloc(len, KM_SLEEP);
4233 4234 (void) strncpy(news, prepend, len);
4234 4235 (void) strcat(news, "/");
4235 4236 (void) strcat(news, *curses);
4236 4237 kmem_free(*curses, curlen + 1);
4237 4238 }
4238 4239 *curses = news;
4239 4240 }
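/*
 * Example of how repeated calls build a path from the leaf upward
 * (component names are illustrative):
 *
 *	path = NULL;
 *	fn_path_realloc(&path, "c");	path is now "c"
 *	fn_path_realloc(&path, "b");	path is now "b/c"
 *	fn_path_realloc(&path, "a");	path is now "a/b/c"
 *
 * Each call allocates a fresh buffer large enough for the combined
 * string and frees the previous one.
 */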
4240 4241
4241 4242 /*
4242 4243 * Returns the path name (starting from the fs root) for the given fname.
4243 4244 * The caller is responsible for freeing. Note that the path may be or
4244 4245 * become invalid as a result of fn_move().
4245 4246 */
4246 4247
4247 4248 char *
4248 4249 fn_path(nfs4_fname_t *fnp)
4249 4250 {
4250 4251 char *path;
4251 4252 nfs4_fname_t *nextfnp;
4252 4253
4253 4254 if (fnp == NULL)
4254 4255 return (NULL);
4255 4256
4256 4257 path = NULL;
4257 4258
4258 4259 /* walk up the tree constructing the pathname. */
4259 4260
4260 4261 fn_hold(fnp); /* adjust for later rele */
4261 4262 do {
4262 4263 mutex_enter(&fnp->fn_lock);
4263 4264 /*
4264 4265 * Add fn_name in front of the current path
4265 4266 */
4266 4267 fn_path_realloc(&path, fnp->fn_name);
4267 4268 nextfnp = fnp->fn_parent;
4268 4269 if (nextfnp != NULL)
4269 4270 fn_hold(nextfnp);
4270 4271 mutex_exit(&fnp->fn_lock);
4271 4272 fn_rele(&fnp);
4272 4273 fnp = nextfnp;
4273 4274 } while (fnp != NULL);
4274 4275
4275 4276 return (path);
4276 4277 }
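/*
 * Usage sketch (hypothetical caller): fn_path_realloc() sizes the
 * buffer to hold exactly the string plus its terminating NUL, so the
 * matching free is
 *
 *	char *path = fn_path(fnp);
 *
 *	if (path != NULL) {
 *		... use path ...
 *		kmem_free(path, strlen(path) + 1);
 *	}
 */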
4277 4278
4278 4279 /*
4279 4280 * Return a reference to the parent of the given fname, which the caller is
4280 4281 * responsible for eventually releasing.
4281 4282 */
4282 4283
4283 4284 nfs4_fname_t *
4284 4285 fn_parent(nfs4_fname_t *fnp)
4285 4286 {
4286 4287 nfs4_fname_t *parent;
4287 4288
4288 4289 mutex_enter(&fnp->fn_lock);
4289 4290 parent = fnp->fn_parent;
4290 4291 if (parent != NULL)
4291 4292 fn_hold(parent);
4292 4293 mutex_exit(&fnp->fn_lock);
4293 4294
4294 4295 return (parent);
4295 4296 }
4296 4297
4297 4298 /*
4298 4299 * Update fnp so that its parent is newparent and its name is newname.
4299 4300 */
4300 4301
4301 4302 void
4302 4303 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4303 4304 {
4304 4305 nfs4_fname_t *parent, *tmpfnp;
4305 4306 ssize_t newlen;
4306 4307 nfs4_fname_t key;
4307 4308 avl_index_t where;
4308 4309
4309 4310 /*
4310 4311 * This assert exists to catch the client trying to rename
4311 4312 * a dir to be a child of itself. This happened at a recent
4312 4313 * bakeoff against a 3rd party (broken) server which allowed
4313 4314 * the rename to succeed. If it trips it means that:
4314 4315 * a) the code in nfs4rename that detects this case is broken
4315 4316 * b) the server is broken (since it allowed the bogus rename)
4316 4317 *
4317 4318 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4318 4319 * panic below from: mutex_enter(&newparent->fn_lock);
4319 4320 */
4320 4321 ASSERT(fnp != newparent);
4321 4322
4322 4323 /*
4323 4324 * Remove fnp from its current parent, change its name, then add it
4324 4325 * to newparent. It might happen that fnp was replaced by another
4325 4326 * nfs4_fname_t with the same fn_name in parent->fn_children.
4326 4327 * In such case, fnp->fn_parent is NULL and we skip the removal
4327 4328 * of fnp from its current parent.
4328 4329 */
4329 4330 mutex_enter(&fnp->fn_lock);
4330 4331 parent = fnp->fn_parent;
4331 4332 if (parent != NULL) {
4332 4333 mutex_enter(&parent->fn_lock);
4333 4334 avl_remove(&parent->fn_children, fnp);
4334 4335 mutex_exit(&parent->fn_lock);
4335 4336 fn_rele(&fnp->fn_parent);
4336 4337 }
4337 4338
4338 4339 newlen = strlen(newname);
4339 4340 if (newlen != fnp->fn_len) {
4340 4341 ASSERT(newlen < MAXNAMELEN);
4341 4342 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4342 4343 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4343 4344 fnp->fn_len = newlen;
4344 4345 }
4345 4346 (void) strcpy(fnp->fn_name, newname);
4346 4347
4347 4348 again:
4348 4349 mutex_enter(&newparent->fn_lock);
4349 4350 key.fn_name = fnp->fn_name;
4350 4351 tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4351 4352 if (tmpfnp != NULL) {
4352 4353 /*
4353 4354 * This could be due to a file that was unlinked while
4354 4355 * open, or perhaps the rnode is in the free list. Remove
4355 4356 * it from newparent and let it go away on its own. The
4356 4357 * contorted code is to deal with lock order issues and
4357 4358 * race conditions.
4358 4359 */
4359 4360 fn_hold(tmpfnp);
4360 4361 mutex_exit(&newparent->fn_lock);
4361 4362 mutex_enter(&tmpfnp->fn_lock);
4362 4363 if (tmpfnp->fn_parent == newparent) {
4363 4364 mutex_enter(&newparent->fn_lock);
4364 4365 avl_remove(&newparent->fn_children, tmpfnp);
4365 4366 mutex_exit(&newparent->fn_lock);
4366 4367 fn_rele(&tmpfnp->fn_parent);
4367 4368 }
4368 4369 mutex_exit(&tmpfnp->fn_lock);
4369 4370 fn_rele(&tmpfnp);
4370 4371 goto again;
4371 4372 }
4372 4373 fnp->fn_parent = newparent;
4373 4374 fn_hold(newparent);
4374 4375 avl_insert(&newparent->fn_children, fnp, where);
4375 4376 mutex_exit(&newparent->fn_lock);
4376 4377 mutex_exit(&fnp->fn_lock);
4377 4378 }
4378 4379
4379 4380 #ifdef DEBUG
4380 4381 /*
4381 4382 * Return non-zero if the type information makes sense for the given vnode.
4382 4383 * Otherwise panic.
4383 4384 */
4384 4385 int
4385 4386 nfs4_consistent_type(vnode_t *vp)
4386 4387 {
4387 4388 rnode4_t *rp = VTOR4(vp);
4388 4389
4389 4390 if (nfs4_vtype_debug && vp->v_type != VNON &&
4390 4391 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4391 4392 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4392 4393 "rnode attr type=%d", (void *)vp, vp->v_type,
4393 4394 rp->r_attr.va_type);
4394 4395 }
4395 4396
4396 4397 return (1);
4397 4398 }
4398 4399 #endif /* DEBUG */
|
↓ open down ↓ |
2991 lines elided |
↑ open up ↑ |