8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages()
Reviewed by: Arne Jansen <arne@die-jansens.de>
Reviewed by: Vitaliy Gusev <gusev.vitaliy@icloud.com>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
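The substantive change is in nfs4_attr_cache(). Previously, a thread that found another thread holding the r_serial pseudo lock would cv_wait_sig() on r_cv until that thread finished caching attributes (with lwp_nostop bookkeeping and a special case for the recovery thread). That wait is what could close the cycle between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages(). With this change the thread no longer waits: it purges its cached attributes and returns, so they are simply refetched on the next access. A minimal sketch of the new behavior, paraphrased from the nfs4_attr_cache() hunk below (declarations and the rest of the function elided, comments reworded):

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial != NULL && !was_serial) {
		/*
		 * Another thread is caching attributes for this rnode.
		 * Instead of sleeping on r_cv for it to finish (the old
		 * behavior, which could deadlock against a delegreturn or
		 * page-flush thread), discard our attributes and bail out.
		 */
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}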
--- old/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_client.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2017 by Delphix. All rights reserved.
24 24 */
25 25
26 26 /*
27 - * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 + * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 28 * All Rights Reserved
29 29 */
30 30
31 31 #include <sys/param.h>
32 32 #include <sys/types.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/thread.h>
35 35 #include <sys/t_lock.h>
36 36 #include <sys/time.h>
37 37 #include <sys/vnode.h>
38 38 #include <sys/vfs.h>
39 39 #include <sys/errno.h>
40 40 #include <sys/buf.h>
41 41 #include <sys/stat.h>
42 42 #include <sys/cred.h>
43 43 #include <sys/kmem.h>
44 44 #include <sys/debug.h>
45 45 #include <sys/dnlc.h>
46 46 #include <sys/vmsystm.h>
47 47 #include <sys/flock.h>
48 48 #include <sys/share.h>
49 49 #include <sys/cmn_err.h>
50 50 #include <sys/tiuser.h>
51 51 #include <sys/sysmacros.h>
52 52 #include <sys/callb.h>
53 53 #include <sys/acl.h>
54 54 #include <sys/kstat.h>
55 55 #include <sys/signal.h>
56 56 #include <sys/disp.h>
57 57 #include <sys/atomic.h>
58 58 #include <sys/list.h>
59 59 #include <sys/sdt.h>
60 60
61 61 #include <rpc/types.h>
62 62 #include <rpc/xdr.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/clnt.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/nfs_clnt.h>
68 68 #include <nfs/nfs_acl.h>
69 69
70 70 #include <nfs/nfs4.h>
71 71 #include <nfs/rnode4.h>
72 72 #include <nfs/nfs4_clnt.h>
73 73
74 74 #include <vm/hat.h>
75 75 #include <vm/as.h>
76 76 #include <vm/page.h>
77 77 #include <vm/pvn.h>
78 78 #include <vm/seg.h>
79 79 #include <vm/seg_map.h>
80 80 #include <vm/seg_vn.h>
81 81
82 82 #include <sys/ddi.h>
83 83
84 84 /*
85 85 * Arguments to page-flush thread.
86 86 */
87 87 typedef struct {
88 88 vnode_t *vp;
89 89 cred_t *cr;
90 90 } pgflush_t;
91 91
92 92 #ifdef DEBUG
93 93 int nfs4_client_lease_debug;
94 94 int nfs4_sharedfh_debug;
95 95 int nfs4_fname_debug;
96 96
97 97 /* temporary: panic if v_type is inconsistent with r_attr va_type */
98 98 int nfs4_vtype_debug;
99 99
100 100 uint_t nfs4_tsd_key;
101 101 #endif
102 102
103 103 static time_t nfs4_client_resumed = 0;
104 104 static callb_id_t cid = 0;
105 105
106 106 static int nfs4renew(nfs4_server_t *);
107 107 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
108 108 static void nfs4_pgflush_thread(pgflush_t *);
109 109
110 110 static boolean_t nfs4_client_cpr_callb(void *, int);
111 111
112 112 struct mi4_globals {
113 113 kmutex_t mig_lock; /* lock protecting mig_list */
114 114 list_t mig_list; /* list of NFS v4 mounts in zone */
115 115 boolean_t mig_destructor_called;
116 116 };
117 117
118 118 static zone_key_t mi4_list_key;
119 119
120 120 /*
121 121 * Attributes caching:
122 122 *
123 123 * Attributes are cached in the rnode in struct vattr form.
124 124 * There is a time associated with the cached attributes (r_time_attr_inval)
125 125 * which tells whether the attributes are valid. The time is initialized
126 126 * to the difference between current time and the modify time of the vnode
127 127 * when new attributes are cached. This allows the attributes for
128 128 * files that have changed recently to be timed out sooner than for files
129 129 * that have not changed for a long time. There are minimum and maximum
130 130 * timeout values that can be set per mount point.
131 131 */
132 132
133 133 /*
134 134 * If a cache purge is in progress, wait for it to finish.
135 135 *
136 136 * The current thread must not be in the middle of an
137 137 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
138 138 * between this thread, a recovery thread, and the page flush thread.
139 139 */
140 140 int
141 141 nfs4_waitfor_purge_complete(vnode_t *vp)
142 142 {
143 143 rnode4_t *rp;
144 144 k_sigset_t smask;
145 145
146 146 rp = VTOR4(vp);
147 147 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 148 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 149 mutex_enter(&rp->r_statelock);
150 150 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 151 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 152 ((rp->r_flags & R4PGFLUSH) &&
153 153 rp->r_pgflush != curthread)) {
154 154 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 155 sigunintr(&smask);
156 156 mutex_exit(&rp->r_statelock);
157 157 return (EINTR);
158 158 }
159 159 }
160 160 sigunintr(&smask);
161 161 mutex_exit(&rp->r_statelock);
162 162 }
163 163 return (0);
164 164 }
165 165
166 166 /*
167 167 * Validate caches by checking cached attributes. If they have timed out,
168 168 * then get new attributes from the server. As a side effect, cache
169 169 * invalidation is done if the attributes have changed.
170 170 *
171 171 * If the attributes have not timed out and if there is a cache
172 172 * invalidation being done by some other thread, then wait until that
173 173 * thread has completed the cache invalidation.
174 174 */
175 175 int
176 176 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 177 {
178 178 int error;
179 179 nfs4_ga_res_t gar;
180 180
181 181 if (ATTRCACHE4_VALID(vp)) {
182 182 error = nfs4_waitfor_purge_complete(vp);
183 183 if (error)
184 184 return (error);
185 185 return (0);
186 186 }
187 187
188 188 return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 189 }
190 190
191 191 /*
192 192 * Fill in attribute from the cache.
193 193 * If valid, then return 0 to indicate that no error occurred,
194 194 * otherwise return 1 to indicate that an error occurred.
195 195 */
196 196 static int
197 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 198 {
199 199 rnode4_t *rp;
200 200
201 201 rp = VTOR4(vp);
202 202 mutex_enter(&rp->r_statelock);
203 203 mutex_enter(&rp->r_statev4_lock);
204 204 if (ATTRCACHE4_VALID(vp)) {
205 205 mutex_exit(&rp->r_statev4_lock);
206 206 /*
207 207 * Cached attributes are valid
208 208 */
209 209 *vap = rp->r_attr;
210 210 mutex_exit(&rp->r_statelock);
211 211 return (0);
212 212 }
213 213 mutex_exit(&rp->r_statev4_lock);
214 214 mutex_exit(&rp->r_statelock);
215 215 return (1);
216 216 }
217 217
218 218
219 219 /*
220 220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
221 221 * call is synchronous because all the pages were invalidated by the
222 222 * nfs4_invalidate_pages() call.
223 223 */
224 224 void
225 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 226 {
227 227 struct rnode4 *rp = VTOR4(vp);
228 228
229 229 /* Ensure that the ..._end_op() call has been done */
230 230 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 231
232 232 if (errno != ESTALE)
233 233 return;
234 234
235 235 mutex_enter(&rp->r_statelock);
236 236 rp->r_flags |= R4STALE;
237 237 if (!rp->r_error)
238 238 rp->r_error = errno;
239 239 mutex_exit(&rp->r_statelock);
240 240 if (nfs4_has_pages(vp))
241 241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 243 }
244 244
245 245 /*
246 246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
247 247 * page purge is done asynchronously.
248 248 */
249 249 void
250 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 251 {
252 252 rnode4_t *rp;
253 253 char *contents;
254 254 vnode_t *xattr;
255 255 int size;
256 256 int pgflush; /* are we the page flush thread? */
257 257
258 258 /*
259 259 * Purge the DNLC for any entries which refer to this file.
260 260 */
261 261 if (vp->v_count > 1 &&
262 262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 263 dnlc_purge_vp(vp);
264 264
265 265 /*
266 266 * Clear any readdir state bits and purge the readlink response cache.
267 267 */
268 268 rp = VTOR4(vp);
269 269 mutex_enter(&rp->r_statelock);
270 270 rp->r_flags &= ~R4LOOKUP;
271 271 contents = rp->r_symlink.contents;
272 272 size = rp->r_symlink.size;
273 273 rp->r_symlink.contents = NULL;
274 274
275 275 xattr = rp->r_xattr_dir;
276 276 rp->r_xattr_dir = NULL;
277 277
278 278 /*
279 279 * Purge pathconf cache too.
280 280 */
281 281 rp->r_pathconf.pc4_xattr_valid = 0;
282 282 rp->r_pathconf.pc4_cache_valid = 0;
283 283
284 284 pgflush = (curthread == rp->r_pgflush);
285 285 mutex_exit(&rp->r_statelock);
286 286
287 287 if (contents != NULL) {
288 288
289 289 kmem_free((void *)contents, size);
290 290 }
291 291
292 292 if (xattr != NULL)
293 293 VN_RELE(xattr);
294 294
295 295 /*
296 296 * Flush the page cache. If the current thread is the page flush
297 297 * thread, don't initiate a new page flush. There's no need for
298 298 * it, and doing it correctly is hard.
299 299 */
300 300 if (nfs4_has_pages(vp) && !pgflush) {
301 301 if (!asyncpg) {
302 302 (void) nfs4_waitfor_purge_complete(vp);
303 303 nfs4_flush_pages(vp, cr);
304 304 } else {
305 305 pgflush_t *args;
306 306
307 307 /*
308 308 * We don't hold r_statelock while creating the
309 309 * thread, in case the call blocks. So we use a
310 310 * flag to indicate that a page flush thread is
311 311 * active.
312 312 */
313 313 mutex_enter(&rp->r_statelock);
314 314 if (rp->r_flags & R4PGFLUSH) {
315 315 mutex_exit(&rp->r_statelock);
316 316 } else {
317 317 rp->r_flags |= R4PGFLUSH;
318 318 mutex_exit(&rp->r_statelock);
319 319
320 320 args = kmem_alloc(sizeof (pgflush_t),
321 321 KM_SLEEP);
322 322 args->vp = vp;
323 323 VN_HOLD(args->vp);
324 324 args->cr = cr;
325 325 crhold(args->cr);
326 326 (void) zthread_create(NULL, 0,
327 327 nfs4_pgflush_thread, args, 0,
328 328 minclsyspri);
329 329 }
330 330 }
331 331 }
332 332
333 333 /*
334 334 * Flush the readdir response cache.
335 335 */
336 336 nfs4_purge_rddir_cache(vp);
337 337 }
338 338
339 339 /*
340 340 * Invalidate all pages for the given file, after writing back the dirty
341 341 * ones.
342 342 */
343 343
344 344 void
345 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 346 {
347 347 int error;
348 348 rnode4_t *rp = VTOR4(vp);
349 349
350 350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 351 if (error == ENOSPC || error == EDQUOT) {
352 352 mutex_enter(&rp->r_statelock);
353 353 if (!rp->r_error)
354 354 rp->r_error = error;
355 355 mutex_exit(&rp->r_statelock);
356 356 }
357 357 }
358 358
359 359 /*
360 360 * Page flush thread.
361 361 */
362 362
363 363 static void
364 364 nfs4_pgflush_thread(pgflush_t *args)
365 365 {
366 366 rnode4_t *rp = VTOR4(args->vp);
367 367
368 368 /* remember which thread we are, so we don't deadlock ourselves */
369 369 mutex_enter(&rp->r_statelock);
370 370 ASSERT(rp->r_pgflush == NULL);
371 371 rp->r_pgflush = curthread;
372 372 mutex_exit(&rp->r_statelock);
373 373
374 374 nfs4_flush_pages(args->vp, args->cr);
375 375
376 376 mutex_enter(&rp->r_statelock);
377 377 rp->r_pgflush = NULL;
378 378 rp->r_flags &= ~R4PGFLUSH;
379 379 cv_broadcast(&rp->r_cv);
380 380 mutex_exit(&rp->r_statelock);
381 381
382 382 VN_RELE(args->vp);
383 383 crfree(args->cr);
384 384 kmem_free(args, sizeof (pgflush_t));
385 385 zthread_exit();
386 386 }
387 387
388 388 /*
389 389 * Purge the readdir cache of all entries which are not currently
390 390 * being filled.
391 391 */
392 392 void
393 393 nfs4_purge_rddir_cache(vnode_t *vp)
394 394 {
395 395 rnode4_t *rp;
396 396
397 397 rp = VTOR4(vp);
398 398
399 399 mutex_enter(&rp->r_statelock);
400 400 rp->r_direof = NULL;
401 401 rp->r_flags &= ~R4LOOKUP;
402 402 rp->r_flags |= R4READDIRWATTR;
403 403 rddir4_cache_purge(rp);
404 404 mutex_exit(&rp->r_statelock);
405 405 }
406 406
407 407 /*
408 408 * Set attributes cache for given vnode using virtual attributes. There is
409 409 * no cache validation, but if the attributes are deemed to be stale, they
410 410 * are ignored. This corresponds to nfs3_attrcache().
411 411 *
412 412 * Set the timeout value on the attribute cache and fill it
413 413 * with the passed in attributes.
414 414 */
415 415 void
416 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 417 {
418 418 rnode4_t *rp = VTOR4(vp);
419 419
420 420 mutex_enter(&rp->r_statelock);
421 421 if (rp->r_time_attr_saved <= t)
422 422 nfs4_attrcache_va(vp, garp, FALSE);
423 423 mutex_exit(&rp->r_statelock);
424 424 }
425 425
426 426 /*
427 427 * Use the passed in virtual attributes to check to see whether the
428 428 * data and metadata caches are valid, cache the new attributes, and
429 429 * then do the cache invalidation if required.
430 430 *
431 431 * The cache validation and caching of the new attributes is done
432 432 * atomically via the use of the mutex, r_statelock. If required,
433 433 * the cache invalidation is done atomically w.r.t. the cache
434 434 * validation and caching of the attributes via the pseudo lock,
435 435 * r_serial.
436 436 *
437 437 * This routine is used to do cache validation and attributes caching
438 438 * for operations with a single set of post operation attributes.
439 439 */
440 440
441 441 void
442 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443 443 hrtime_t t, cred_t *cr, int async,
444 444 change_info4 *cinfo)
445 445 {
446 446 rnode4_t *rp;
447 447 int mtime_changed = 0;
448 448 int ctime_changed = 0;
449 449 vsecattr_t *vsp;
450 450 int was_serial, set_time_cache_inval, recov;
451 451 vattr_t *vap = &garp->n4g_va;
452 452 mntinfo4_t *mi = VTOMI4(vp);
453 453 len_t preattr_rsize;
454 454 boolean_t writemodify_set = B_FALSE;
455 455 boolean_t cachepurge_set = B_FALSE;
456 456
457 457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458 458
459 459 /* Is curthread the recovery thread? */
460 460 mutex_enter(&mi->mi_lock);
461 461 recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 462 mutex_exit(&mi->mi_lock);
463 463
464 464 rp = VTOR4(vp);
465 465 mutex_enter(&rp->r_statelock);
466 466 was_serial = (rp->r_serial == curthread);
467 - if (rp->r_serial && !was_serial) {
468 - klwp_t *lwp = ttolwp(curthread);
469 -
467 + if (rp->r_serial != NULL && !was_serial) {
470 468 /*
471 - * If we're the recovery thread, then purge current attrs
472 - * and bail out to avoid potential deadlock between another
473 - * thread caching attrs (r_serial thread), recov thread,
474 - * and an async writer thread.
469 + * Purge current attrs and bail out to avoid potential deadlock
470 + * between another thread caching attrs (r_serial thread), this
471 + * thread, and a thread trying to read or write pages.
475 472 */
476 - if (recov) {
477 - PURGE_ATTRCACHE4_LOCKED(rp);
478 - mutex_exit(&rp->r_statelock);
479 - return;
480 - }
481 -
482 - if (lwp != NULL)
483 - lwp->lwp_nostop++;
484 - while (rp->r_serial != NULL) {
485 - if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
486 - mutex_exit(&rp->r_statelock);
487 - if (lwp != NULL)
488 - lwp->lwp_nostop--;
489 - return;
490 - }
491 - }
492 - if (lwp != NULL)
493 - lwp->lwp_nostop--;
473 + PURGE_ATTRCACHE4_LOCKED(rp);
474 + mutex_exit(&rp->r_statelock);
475 + return;
494 476 }
495 477
496 478 /*
497 479 * If there is a page flush thread, the current thread needs to
498 480 * bail out, to prevent a possible deadlock between the current
499 481 * thread (which might be in a start_op/end_op region), the
500 482 * recovery thread, and the page flush thread. Expire the
501 483 * attribute cache, so that any attributes the current thread was
502 484 * going to set are not lost.
503 485 */
504 486 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
505 487 PURGE_ATTRCACHE4_LOCKED(rp);
506 488 mutex_exit(&rp->r_statelock);
507 489 return;
508 490 }
509 491
510 492 if (rp->r_time_attr_saved > t) {
511 493 /*
512 494 * Attributes have been cached since these attributes were
513 495 * probably made. If there is an inconsistency in what is
514 496 * cached, mark them invalid. If not, don't act on them.
515 497 */
516 498 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
517 499 PURGE_ATTRCACHE4_LOCKED(rp);
518 500 mutex_exit(&rp->r_statelock);
519 501 return;
520 502 }
521 503 set_time_cache_inval = 0;
522 504 if (cinfo) {
523 505 /*
524 506 * Only directory modifying callers pass non-NULL cinfo.
525 507 */
526 508 ASSERT(vp->v_type == VDIR);
527 509 /*
528 510 * If the cache timeout either doesn't exist or hasn't expired,
529 511 	 * and dir didn't change on server before dirmod op
530 512 * and dir didn't change after dirmod op but before getattr
531 513 * then there's a chance that the client's cached data for
532 514 * this object is current (not stale). No immediate cache
533 515 * flush is required.
534 516 *
535 517 */
536 518 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
537 519 cinfo->before == rp->r_change &&
538 520 (garp->n4g_change_valid &&
539 521 cinfo->after == garp->n4g_change)) {
540 522
541 523 /*
542 524 * If atomic isn't set, then the before/after info
543 525 * cannot be blindly trusted. For this case, we tell
544 526 * nfs4_attrcache_va to cache the attrs but also
545 527 * establish an absolute maximum cache timeout. When
546 528 * the timeout is reached, caches will be flushed.
547 529 */
548 530 if (! cinfo->atomic)
549 531 set_time_cache_inval = 1;
550 532 } else {
551 533
552 534 /*
553 535 * We're not sure exactly what changed, but we know
554 536 * what to do. flush all caches for dir. remove the
555 537 * attr timeout.
556 538 *
557 539 * a) timeout expired. flush all caches.
558 540 * b) r_change != cinfo.before. flush all caches.
559 541 * c) r_change == cinfo.before, but cinfo.after !=
560 542 * post-op getattr(change). flush all caches.
561 543 * d) post-op getattr(change) not provided by server.
562 544 * flush all caches.
563 545 */
564 546 mtime_changed = 1;
565 547 ctime_changed = 1;
566 548 rp->r_time_cache_inval = 0;
567 549 }
568 550 } else {
569 551 /*
570 552 * Write thread after writing data to file on remote server,
571 553 * will always set R4WRITEMODIFIED to indicate that file on
572 554 * remote server was modified with a WRITE operation and would
573 555 * have marked attribute cache as timed out. If R4WRITEMODIFIED
574 556 * is set, then do not check for mtime and ctime change.
575 557 */
576 558 if (!(rp->r_flags & R4WRITEMODIFIED)) {
577 559 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
578 560 mtime_changed = 1;
579 561
580 562 if (rp->r_attr.va_ctime.tv_sec !=
581 563 vap->va_ctime.tv_sec ||
582 564 rp->r_attr.va_ctime.tv_nsec !=
583 565 vap->va_ctime.tv_nsec)
584 566 ctime_changed = 1;
585 567
586 568 /*
587 569 * If the change attribute was not provided by server
588 570 * or it differs, then flush all caches.
589 571 */
590 572 if (!garp->n4g_change_valid ||
591 573 rp->r_change != garp->n4g_change) {
592 574 mtime_changed = 1;
593 575 ctime_changed = 1;
594 576 }
595 577 } else {
596 578 writemodify_set = B_TRUE;
597 579 }
598 580 }
599 581
600 582 preattr_rsize = rp->r_size;
601 583
602 584 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
603 585
604 586 /*
605 587 * If we have updated filesize in nfs4_attrcache_va, as soon as we
606 588 * drop statelock we will be in transition of purging all
607 589 * our caches and updating them. It is possible for another
608 590 * thread to pick this new file size and read in zeroed data.
609 591 * stall other threads till cache purge is complete.
610 592 */
611 593 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
612 594 /*
613 595 * If R4WRITEMODIFIED was set and we have updated the file
614 596 * size, Server's returned file size need not necessarily
615 597 * be because of this Client's WRITE. We need to purge
616 598 * all caches.
617 599 */
618 600 if (writemodify_set)
619 601 mtime_changed = 1;
620 602
621 603 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
622 604 rp->r_flags |= R4INCACHEPURGE;
623 605 cachepurge_set = B_TRUE;
624 606 }
625 607 }
626 608
627 609 if (!mtime_changed && !ctime_changed) {
628 610 mutex_exit(&rp->r_statelock);
629 611 return;
630 612 }
631 613
632 614 rp->r_serial = curthread;
633 615
634 616 mutex_exit(&rp->r_statelock);
635 617
636 618 /*
637 619 * If we're the recov thread, then force async nfs4_purge_caches
638 620 * to avoid potential deadlock.
639 621 */
640 622 if (mtime_changed)
641 623 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
642 624
643 625 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
644 626 mutex_enter(&rp->r_statelock);
645 627 rp->r_flags &= ~R4INCACHEPURGE;
646 628 cv_broadcast(&rp->r_cv);
647 629 mutex_exit(&rp->r_statelock);
648 630 cachepurge_set = B_FALSE;
649 631 }
650 632
651 633 if (ctime_changed) {
652 634 (void) nfs4_access_purge_rp(rp);
653 635 if (rp->r_secattr != NULL) {
654 636 mutex_enter(&rp->r_statelock);
655 637 vsp = rp->r_secattr;
656 638 rp->r_secattr = NULL;
657 639 mutex_exit(&rp->r_statelock);
658 640 if (vsp != NULL)
659 641 nfs4_acl_free_cache(vsp);
660 642 }
661 643 }
662 644
663 645 if (!was_serial) {
664 646 mutex_enter(&rp->r_statelock);
665 647 rp->r_serial = NULL;
666 648 cv_broadcast(&rp->r_cv);
667 649 mutex_exit(&rp->r_statelock);
668 650 }
669 651 }
670 652
671 653 /*
672 654 * Set attributes cache for given vnode using virtual attributes.
673 655 *
674 656 * Set the timeout value on the attribute cache and fill it
675 657 * with the passed in attributes.
676 658 *
677 659 * The caller must be holding r_statelock.
678 660 */
679 661 static void
680 662 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
681 663 {
682 664 rnode4_t *rp;
683 665 mntinfo4_t *mi;
684 666 hrtime_t delta;
685 667 hrtime_t now;
686 668 vattr_t *vap = &garp->n4g_va;
687 669
688 670 rp = VTOR4(vp);
689 671
690 672 ASSERT(MUTEX_HELD(&rp->r_statelock));
691 673 ASSERT(vap->va_mask == AT_ALL);
692 674
693 675 /* Switch to master before checking v_flag */
694 676 if (IS_SHADOW(vp, rp))
695 677 vp = RTOV4(rp);
696 678
697 679 now = gethrtime();
698 680
699 681 mi = VTOMI4(vp);
700 682
701 683 /*
702 684 * Only establish a new cache timeout (if requested). Never
703 685 * extend a timeout. Never clear a timeout. Clearing a timeout
704 686 * is done by nfs4_update_dircaches (ancestor in our call chain)
705 687 */
706 688 if (set_cache_timeout && ! rp->r_time_cache_inval)
707 689 rp->r_time_cache_inval = now + mi->mi_acdirmax;
708 690
709 691 /*
710 692 * Delta is the number of nanoseconds that we will
711 693 * cache the attributes of the file. It is based on
712 694 * the number of nanoseconds since the last time that
713 695 * we detected a change. The assumption is that files
714 696 * that changed recently are likely to change again.
715 697 	 * There are minimum and maximum values, though, which are
716 698 	 * enforced for regular files and for directories.
717 699 *
718 700 * Using the time since last change was detected
719 701 * eliminates direct comparison or calculation
720 702 * using mixed client and server times. NFS does
721 703 * not make any assumptions regarding the client
722 704 * and server clocks being synchronized.
723 705 */
724 706 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
725 707 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
726 708 vap->va_size != rp->r_attr.va_size) {
727 709 rp->r_time_attr_saved = now;
728 710 }
729 711
730 712 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
731 713 delta = 0;
732 714 else {
733 715 delta = now - rp->r_time_attr_saved;
734 716 if (vp->v_type == VDIR) {
735 717 if (delta < mi->mi_acdirmin)
736 718 delta = mi->mi_acdirmin;
737 719 else if (delta > mi->mi_acdirmax)
738 720 delta = mi->mi_acdirmax;
739 721 } else {
740 722 if (delta < mi->mi_acregmin)
741 723 delta = mi->mi_acregmin;
742 724 else if (delta > mi->mi_acregmax)
743 725 delta = mi->mi_acregmax;
744 726 }
745 727 }
746 728 rp->r_time_attr_inval = now + delta;
747 729
748 730 rp->r_attr = *vap;
749 731 if (garp->n4g_change_valid)
750 732 rp->r_change = garp->n4g_change;
751 733
752 734 /*
753 735 * The attributes that were returned may be valid and can
754 736 * be used, but they may not be allowed to be cached.
755 737 * Reset the timers to cause immediate invalidation and
756 738 	 * clear r_change so no VERIFY operations will succeed
757 739 */
758 740 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
759 741 rp->r_time_attr_inval = now;
760 742 rp->r_time_attr_saved = now;
761 743 rp->r_change = 0;
762 744 }
763 745
764 746 /*
765 747 * If mounted_on_fileid returned AND the object is a stub,
766 748 * then set object's va_nodeid to the mounted over fid
767 749 * returned by server.
768 750 *
769 751 * If mounted_on_fileid not provided/supported, then
770 752 * just set it to 0 for now. Eventually it would be
771 753 * better to set it to a hashed version of FH. This
772 754 * would probably be good enough to provide a unique
773 755 * fid/d_ino within a dir.
774 756 *
775 757 * We don't need to carry mounted_on_fileid in the
776 758 * rnode as long as the client never requests fileid
777 759 * without also requesting mounted_on_fileid. For
778 760 * now, it stays.
779 761 */
780 762 if (garp->n4g_mon_fid_valid) {
781 763 rp->r_mntd_fid = garp->n4g_mon_fid;
782 764
783 765 if (RP_ISSTUB(rp))
784 766 rp->r_attr.va_nodeid = rp->r_mntd_fid;
785 767 }
786 768
787 769 /*
788 770 * Check to see if there are valid pathconf bits to
789 771 * cache in the rnode.
790 772 */
791 773 if (garp->n4g_ext_res) {
792 774 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
793 775 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
794 776 } else {
795 777 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
796 778 rp->r_pathconf.pc4_xattr_valid = TRUE;
797 779 rp->r_pathconf.pc4_xattr_exists =
798 780 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
799 781 }
800 782 }
801 783 }
802 784 /*
803 785 * Update the size of the file if there is no cached data or if
804 786 * the cached data is clean and there is no data being written
805 787 * out.
806 788 */
807 789 if (rp->r_size != vap->va_size &&
808 790 (!vn_has_cached_data(vp) ||
809 791 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
810 792 rp->r_size = vap->va_size;
811 793 }
812 794 nfs_setswaplike(vp, vap);
813 795 rp->r_flags &= ~R4WRITEMODIFIED;
814 796 }
815 797
816 798 /*
817 799 * Get attributes over-the-wire and update attributes cache
818 800 * if no error occurred in the over-the-wire operation.
819 801 * Return 0 if successful, otherwise error.
820 802 */
821 803 int
822 804 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
823 805 {
824 806 mntinfo4_t *mi = VTOMI4(vp);
825 807 hrtime_t t;
826 808 nfs4_recov_state_t recov_state;
827 809 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
828 810
829 811 recov_state.rs_flags = 0;
830 812 recov_state.rs_num_retry_despite_err = 0;
831 813
832 814 /* Save the original mount point security flavor */
833 815 (void) save_mnt_secinfo(mi->mi_curr_serv);
834 816
835 817 recov_retry:
836 818
837 819 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
838 820 &recov_state, NULL))) {
839 821 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
840 822 return (e.error);
841 823 }
842 824
843 825 t = gethrtime();
844 826
845 827 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
846 828
847 829 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
848 830 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
849 831 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
850 832 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
851 833 &recov_state, 1);
852 834 goto recov_retry;
853 835 }
854 836 }
855 837
856 838 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
857 839
858 840 if (!e.error) {
859 841 if (e.stat == NFS4_OK) {
860 842 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
861 843 } else {
862 844 e.error = geterrno4(e.stat);
863 845
864 846 nfs4_purge_stale_fh(e.error, vp, cr);
865 847 }
866 848 }
867 849
868 850 /*
869 851 * If getattr a node that is a stub for a crossed
870 852 * mount point, keep the original secinfo flavor for
871 853 * the current file system, not the crossed one.
872 854 */
873 855 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
874 856
875 857 return (e.error);
876 858 }
877 859
878 860 /*
879 861 * Generate a compound to get attributes over-the-wire.
880 862 */
881 863 void
882 864 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
883 865 nfs4_error_t *ep, cred_t *cr, int get_acl)
884 866 {
885 867 COMPOUND4args_clnt args;
886 868 COMPOUND4res_clnt res;
887 869 int doqueue;
888 870 rnode4_t *rp = VTOR4(vp);
889 871 nfs_argop4 argop[2];
890 872
891 873 args.ctag = TAG_GETATTR;
892 874
893 875 args.array_len = 2;
894 876 args.array = argop;
895 877
896 878 /* putfh */
897 879 argop[0].argop = OP_CPUTFH;
898 880 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
899 881
900 882 /* getattr */
901 883 /*
902 884 * Unlike nfs version 2 and 3, where getattr returns all the
903 885 * attributes, nfs version 4 returns only the ones explicitly
904 886 * asked for. This creates problems, as some system functions
905 887 * (e.g. cache check) require certain attributes and if the
906 888 * cached node lacks some attributes such as uid/gid, it can
907 889 * affect system utilities (e.g. "ls") that rely on the information
908 890 * to be there. This can lead to anything from system crashes to
909 891 * corrupted information processed by user apps.
910 892 * So to ensure that all bases are covered, request at least
911 893 * the AT_ALL attribute mask.
912 894 */
913 895 argop[1].argop = OP_GETATTR;
914 896 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
915 897 if (get_acl)
916 898 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
917 899 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
918 900
919 901 doqueue = 1;
920 902
921 903 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
922 904
923 905 if (ep->error)
924 906 return;
925 907
926 908 if (res.status != NFS4_OK) {
927 909 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
928 910 return;
929 911 }
930 912
931 913 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
932 914
933 915 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
934 916 }
935 917
936 918 /*
937 919 * Return either cached or remote attributes. If get remote attr
938 920 * use them to check and invalidate caches, then cache the new attributes.
939 921 */
940 922 int
941 923 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
942 924 {
943 925 int error;
944 926 rnode4_t *rp;
945 927 nfs4_ga_res_t gar;
946 928
947 929 ASSERT(nfs4_consistent_type(vp));
948 930
949 931 /*
950 932 * If we've got cached attributes, we're done, otherwise go
951 933 * to the server to get attributes, which will update the cache
952 934 * in the process. Either way, use the cached attributes for
953 935 * the caller's vattr_t.
954 936 *
955 937 * Note that we ignore the gar set by the OTW call: the attr caching
956 938 * code may make adjustments when storing to the rnode, and we want
957 939 * to see those changes here.
958 940 */
959 941 rp = VTOR4(vp);
960 942 error = 0;
961 943 mutex_enter(&rp->r_statelock);
962 944 if (!ATTRCACHE4_VALID(vp)) {
963 945 mutex_exit(&rp->r_statelock);
964 946 error = nfs4_getattr_otw(vp, &gar, cr, 0);
965 947 mutex_enter(&rp->r_statelock);
966 948 }
967 949
968 950 if (!error)
969 951 *vap = rp->r_attr;
970 952
971 953 /* Return the client's view of file size */
972 954 vap->va_size = rp->r_size;
973 955
974 956 mutex_exit(&rp->r_statelock);
975 957
976 958 ASSERT(nfs4_consistent_type(vp));
977 959
978 960 return (error);
979 961 }
980 962
981 963 int
982 964 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
983 965 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
984 966 {
985 967 COMPOUND4args_clnt args;
986 968 COMPOUND4res_clnt res;
987 969 int doqueue;
988 970 nfs_argop4 argop[2];
989 971 mntinfo4_t *mi = VTOMI4(vp);
990 972 bool_t needrecov = FALSE;
991 973 nfs4_recov_state_t recov_state;
992 974 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
993 975 nfs4_ga_ext_res_t *gerp;
994 976
995 977 recov_state.rs_flags = 0;
996 978 recov_state.rs_num_retry_despite_err = 0;
997 979
998 980 recov_retry:
999 981 args.ctag = tag_type;
1000 982
1001 983 args.array_len = 2;
1002 984 args.array = argop;
1003 985
1004 986 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1005 987 if (e.error)
1006 988 return (e.error);
1007 989
1008 990 /* putfh */
1009 991 argop[0].argop = OP_CPUTFH;
1010 992 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1011 993
1012 994 /* getattr */
1013 995 argop[1].argop = OP_GETATTR;
1014 996 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1015 997 argop[1].nfs_argop4_u.opgetattr.mi = mi;
1016 998
1017 999 doqueue = 1;
1018 1000
1019 1001 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1020 1002 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1021 1003 rnode4info(VTOR4(vp))));
1022 1004
1023 1005 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1024 1006
1025 1007 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1026 1008 if (!needrecov && e.error) {
1027 1009 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1028 1010 needrecov);
1029 1011 return (e.error);
1030 1012 }
1031 1013
1032 1014 if (needrecov) {
1033 1015 bool_t abort;
1034 1016
1035 1017 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1036 1018 "nfs4_attr_otw: initiating recovery\n"));
1037 1019
1038 1020 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1039 1021 NULL, OP_GETATTR, NULL, NULL, NULL);
1040 1022 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1041 1023 needrecov);
1042 1024 if (!e.error) {
1043 1025 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1044 1026 e.error = geterrno4(res.status);
1045 1027 }
1046 1028 if (abort == FALSE)
1047 1029 goto recov_retry;
1048 1030 return (e.error);
1049 1031 }
1050 1032
1051 1033 if (res.status) {
1052 1034 e.error = geterrno4(res.status);
1053 1035 } else {
1054 1036 gerp = garp->n4g_ext_res;
1055 1037 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1056 1038 garp, sizeof (nfs4_ga_res_t));
1057 1039 garp->n4g_ext_res = gerp;
1058 1040 if (garp->n4g_ext_res &&
1059 1041 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1060 1042 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1061 1043 ga_res.n4g_ext_res,
1062 1044 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1063 1045 }
1064 1046 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1065 1047 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1066 1048 needrecov);
1067 1049 return (e.error);
1068 1050 }
1069 1051
1070 1052 /*
1071 1053 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1072 1054 * for the demand-based allocation of async threads per-mount. The
1073 1055 * nfs_async_timeout is the amount of time a thread will live after it
1074 1056 * becomes idle, unless new I/O requests are received before the thread
1075 1057 * dies. See nfs4_async_putpage and nfs4_async_start.
1076 1058 */
1077 1059
1078 1060 static void nfs4_async_start(struct vfs *);
1079 1061 static void nfs4_async_pgops_start(struct vfs *);
1080 1062 static void nfs4_async_common_start(struct vfs *, int);
1081 1063
1082 1064 static void
1083 1065 free_async_args4(struct nfs4_async_reqs *args)
1084 1066 {
1085 1067 rnode4_t *rp;
1086 1068
1087 1069 if (args->a_io != NFS4_INACTIVE) {
1088 1070 rp = VTOR4(args->a_vp);
1089 1071 mutex_enter(&rp->r_statelock);
1090 1072 rp->r_count--;
1091 1073 if (args->a_io == NFS4_PUTAPAGE ||
1092 1074 args->a_io == NFS4_PAGEIO)
1093 1075 rp->r_awcount--;
1094 1076 cv_broadcast(&rp->r_cv);
1095 1077 mutex_exit(&rp->r_statelock);
1096 1078 VN_RELE(args->a_vp);
1097 1079 }
1098 1080 crfree(args->a_cred);
1099 1081 kmem_free(args, sizeof (*args));
1100 1082 }
1101 1083
1102 1084 /*
1103 1085 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1104 1086 * pageout(), running in the global zone, have legitimate reasons to do
1105 1087 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1106 1088 * use of a per-mount "asynchronous requests manager thread" which is
1107 1089 * signaled by the various asynchronous work routines when there is
1108 1090 * asynchronous work to be done. It is responsible for creating new
1109 1091 * worker threads if necessary, and notifying existing worker threads
1110 1092 * that there is work to be done.
1111 1093 *
1112 1094 * In other words, it will "take the specifications from the customers and
1113 1095 * give them to the engineers."
1114 1096 *
1115 1097 * Worker threads die off of their own accord if they are no longer
1116 1098 * needed.
1117 1099 *
1118 1100 * This thread is killed when the zone is going away or the filesystem
1119 1101 * is being unmounted.
1120 1102 */
1121 1103 void
1122 1104 nfs4_async_manager(vfs_t *vfsp)
1123 1105 {
1124 1106 callb_cpr_t cprinfo;
1125 1107 mntinfo4_t *mi;
1126 1108 uint_t max_threads;
1127 1109
1128 1110 mi = VFTOMI4(vfsp);
1129 1111
1130 1112 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1131 1113 "nfs4_async_manager");
1132 1114
1133 1115 mutex_enter(&mi->mi_async_lock);
1134 1116 /*
1135 1117 * We want to stash the max number of threads that this mount was
1136 1118 * allowed so we can use it later when the variable is set to zero as
1137 1119 * part of the zone/mount going away.
1138 1120 *
1139 1121 * We want to be able to create at least one thread to handle
1140 1122 * asynchronous inactive calls.
1141 1123 */
1142 1124 max_threads = MAX(mi->mi_max_threads, 1);
1143 1125 /*
1144 1126 * We don't want to wait for mi_max_threads to go to zero, since that
1145 1127 * happens as part of a failed unmount, but this thread should only
1146 1128 * exit when the mount is really going away.
1147 1129 *
1148 1130 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1149 1131 * attempted: the various _async_*() functions know to do things
1150 1132 * inline if mi_max_threads == 0. Henceforth we just drain out the
1151 1133 * outstanding requests.
1152 1134 *
1153 1135 * Note that we still create zthreads even if we notice the zone is
1154 1136 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1155 1137 * shutdown sequence to take slightly longer in some cases, but
1156 1138 * doesn't violate the protocol, as all threads will exit as soon as
1157 1139 * they're done processing the remaining requests.
1158 1140 */
1159 1141 for (;;) {
1160 1142 while (mi->mi_async_req_count > 0) {
1161 1143 /*
1162 1144 * Paranoia: If the mount started out having
1163 1145 * (mi->mi_max_threads == 0), and the value was
1164 1146 * later changed (via a debugger or somesuch),
1165 1147 * we could be confused since we will think we
1166 1148 * can't create any threads, and the calling
1167 1149 * code (which looks at the current value of
1168 1150 * mi->mi_max_threads, now non-zero) thinks we
1169 1151 * can.
1170 1152 *
1171 1153 * So, because we're paranoid, we create threads
1172 1154 * up to the maximum of the original and the
1173 1155 * current value. This means that future
1174 1156 * (debugger-induced) alterations of
1175 1157 * mi->mi_max_threads are ignored for our
1176 1158 * purposes, but who told them they could change
1177 1159 * random values on a live kernel anyhow?
1178 1160 */
1179 1161 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1180 1162 MAX(mi->mi_max_threads, max_threads)) {
1181 1163 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1182 1164 mutex_exit(&mi->mi_async_lock);
1183 1165 MI4_HOLD(mi);
1184 1166 VFS_HOLD(vfsp); /* hold for new thread */
1185 1167 (void) zthread_create(NULL, 0, nfs4_async_start,
1186 1168 vfsp, 0, minclsyspri);
1187 1169 mutex_enter(&mi->mi_async_lock);
1188 1170 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1189 1171 NUM_ASYNC_PGOPS_THREADS) {
1190 1172 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1191 1173 mutex_exit(&mi->mi_async_lock);
1192 1174 MI4_HOLD(mi);
1193 1175 VFS_HOLD(vfsp); /* hold for new thread */
1194 1176 (void) zthread_create(NULL, 0,
1195 1177 nfs4_async_pgops_start, vfsp, 0,
1196 1178 minclsyspri);
1197 1179 mutex_enter(&mi->mi_async_lock);
1198 1180 }
1199 1181 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1200 1182 ASSERT(mi->mi_async_req_count != 0);
1201 1183 mi->mi_async_req_count--;
1202 1184 }
1203 1185
1204 1186 mutex_enter(&mi->mi_lock);
1205 1187 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1206 1188 mutex_exit(&mi->mi_lock);
1207 1189 break;
1208 1190 }
1209 1191 mutex_exit(&mi->mi_lock);
1210 1192
1211 1193 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1212 1194 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1213 1195 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1214 1196 }
1215 1197
1216 1198 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1217 1199 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1218 1200 /*
1219 1201 * Let everyone know we're done.
1220 1202 */
1221 1203 mi->mi_manager_thread = NULL;
1222 1204 /*
1223 1205 * Wake up the inactive thread.
1224 1206 */
1225 1207 cv_broadcast(&mi->mi_inact_req_cv);
1226 1208 /*
1227 1209 * Wake up anyone sitting in nfs4_async_manager_stop()
1228 1210 */
1229 1211 cv_broadcast(&mi->mi_async_cv);
1230 1212 /*
1231 1213 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1232 1214 * since CALLB_CPR_EXIT is actually responsible for releasing
1233 1215 * 'mi_async_lock'.
1234 1216 */
1235 1217 CALLB_CPR_EXIT(&cprinfo);
1236 1218 VFS_RELE(vfsp); /* release thread's hold */
1237 1219 MI4_RELE(mi);
1238 1220 zthread_exit();
1239 1221 }
1240 1222
1241 1223 /*
1242 1224 * Signal (and wait for) the async manager thread to clean up and go away.
1243 1225 */
1244 1226 void
1245 1227 nfs4_async_manager_stop(vfs_t *vfsp)
1246 1228 {
1247 1229 mntinfo4_t *mi = VFTOMI4(vfsp);
1248 1230
1249 1231 mutex_enter(&mi->mi_async_lock);
1250 1232 mutex_enter(&mi->mi_lock);
1251 1233 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1252 1234 mutex_exit(&mi->mi_lock);
1253 1235 cv_broadcast(&mi->mi_async_reqs_cv);
1254 1236 /*
1255 1237 * Wait for the async manager thread to die.
1256 1238 */
1257 1239 while (mi->mi_manager_thread != NULL)
1258 1240 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1259 1241 mutex_exit(&mi->mi_async_lock);
1260 1242 }
1261 1243
1262 1244 int
1263 1245 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1264 1246 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1265 1247 u_offset_t, caddr_t, struct seg *, cred_t *))
1266 1248 {
1267 1249 rnode4_t *rp;
1268 1250 mntinfo4_t *mi;
1269 1251 struct nfs4_async_reqs *args;
1270 1252
1271 1253 rp = VTOR4(vp);
1272 1254 ASSERT(rp->r_freef == NULL);
1273 1255
1274 1256 mi = VTOMI4(vp);
1275 1257
1276 1258 /*
1277 1259 * If addr falls in a different segment, don't bother doing readahead.
1278 1260 */
1279 1261 if (addr >= seg->s_base + seg->s_size)
1280 1262 return (-1);
1281 1263
1282 1264 /*
1283 1265 * If we can't allocate a request structure, punt on the readahead.
1284 1266 */
1285 1267 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1286 1268 return (-1);
1287 1269
1288 1270 /*
1289 1271 * If a lock operation is pending, don't initiate any new
1290 1272 * readaheads. Otherwise, bump r_count to indicate the new
1291 1273 * asynchronous I/O.
1292 1274 */
1293 1275 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1294 1276 kmem_free(args, sizeof (*args));
1295 1277 return (-1);
1296 1278 }
1297 1279 mutex_enter(&rp->r_statelock);
1298 1280 rp->r_count++;
1299 1281 mutex_exit(&rp->r_statelock);
1300 1282 nfs_rw_exit(&rp->r_lkserlock);
1301 1283
1302 1284 args->a_next = NULL;
1303 1285 #ifdef DEBUG
1304 1286 args->a_queuer = curthread;
1305 1287 #endif
1306 1288 VN_HOLD(vp);
1307 1289 args->a_vp = vp;
1308 1290 ASSERT(cr != NULL);
1309 1291 crhold(cr);
1310 1292 args->a_cred = cr;
1311 1293 args->a_io = NFS4_READ_AHEAD;
1312 1294 args->a_nfs4_readahead = readahead;
1313 1295 args->a_nfs4_blkoff = blkoff;
1314 1296 args->a_nfs4_seg = seg;
1315 1297 args->a_nfs4_addr = addr;
1316 1298
1317 1299 mutex_enter(&mi->mi_async_lock);
1318 1300
1319 1301 /*
1320 1302 * If asyncio has been disabled, don't bother readahead.
1321 1303 */
1322 1304 if (mi->mi_max_threads == 0) {
1323 1305 mutex_exit(&mi->mi_async_lock);
1324 1306 goto noasync;
1325 1307 }
1326 1308
1327 1309 /*
1328 1310 * Link request structure into the async list and
1329 1311 * wakeup async thread to do the i/o.
1330 1312 */
1331 1313 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1332 1314 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1333 1315 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1334 1316 } else {
1335 1317 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1336 1318 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1337 1319 }
1338 1320
1339 1321 if (mi->mi_io_kstats) {
1340 1322 mutex_enter(&mi->mi_lock);
1341 1323 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1342 1324 mutex_exit(&mi->mi_lock);
1343 1325 }
1344 1326
1345 1327 mi->mi_async_req_count++;
1346 1328 ASSERT(mi->mi_async_req_count != 0);
1347 1329 cv_signal(&mi->mi_async_reqs_cv);
1348 1330 mutex_exit(&mi->mi_async_lock);
1349 1331 return (0);
1350 1332
1351 1333 noasync:
1352 1334 mutex_enter(&rp->r_statelock);
1353 1335 rp->r_count--;
1354 1336 cv_broadcast(&rp->r_cv);
1355 1337 mutex_exit(&rp->r_statelock);
1356 1338 VN_RELE(vp);
1357 1339 crfree(cr);
1358 1340 kmem_free(args, sizeof (*args));
1359 1341 return (-1);
1360 1342 }
1361 1343
1362 1344 static void
1363 1345 nfs4_async_start(struct vfs *vfsp)
1364 1346 {
1365 1347 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1366 1348 }
1367 1349
1368 1350 static void
1369 1351 nfs4_async_pgops_start(struct vfs *vfsp)
1370 1352 {
1371 1353 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1372 1354 }
1373 1355
1374 1356 /*
1375 1357 * The async queues for each mounted file system are arranged as a
1376 1358 * set of queues, one for each async i/o type. Requests are taken
1377 1359 * from the queues in a round-robin fashion. A number of consecutive
1378 1360 * requests are taken from each queue before moving on to the next
1379 1361 * queue. This functionality may allow the NFS Version 2 server to do
1380 1362 * write clustering, even if the client is mixing writes and reads
1381 1363 * because it will take multiple write requests from the queue
1382 1364 * before processing any of the other async i/o types.
1383 1365 *
1384 1366 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1385 1367 * model defined by cpr to suspend the system. Specifically over the
1386 1368 * wire calls are cpr-unsafe. The thread should be reevaluated in
1387 1369 * case of future updates to the cpr model.
1388 1370 */
1389 1371 static void
1390 1372 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1391 1373 {
1392 1374 struct nfs4_async_reqs *args;
1393 1375 mntinfo4_t *mi = VFTOMI4(vfsp);
1394 1376 clock_t time_left = 1;
1395 1377 callb_cpr_t cprinfo;
1396 1378 int i;
1397 1379 extern int nfs_async_timeout;
1398 1380 int async_types;
1399 1381 kcondvar_t *async_work_cv;
1400 1382
1401 1383 if (async_queue == NFS4_ASYNC_QUEUE) {
1402 1384 async_types = NFS4_ASYNC_TYPES;
1403 1385 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1404 1386 } else {
1405 1387 async_types = NFS4_ASYNC_PGOPS_TYPES;
1406 1388 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1407 1389 }
1408 1390
1409 1391 /*
1410 1392 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1411 1393 * built in an implementation independent manner.
1412 1394 */
1413 1395 if (nfs_async_timeout == -1)
1414 1396 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1415 1397
1416 1398 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1417 1399
1418 1400 mutex_enter(&mi->mi_async_lock);
1419 1401 for (;;) {
1420 1402 /*
1421 1403 * Find the next queue containing an entry. We start
1422 1404 * at the current queue pointer and then round robin
1423 1405 * through all of them until we either find a non-empty
1424 1406 * queue or have looked through all of them.
1425 1407 */
1426 1408 for (i = 0; i < async_types; i++) {
1427 1409 args = *mi->mi_async_curr[async_queue];
1428 1410 if (args != NULL)
1429 1411 break;
1430 1412 mi->mi_async_curr[async_queue]++;
1431 1413 if (mi->mi_async_curr[async_queue] ==
1432 1414 &mi->mi_async_reqs[async_types]) {
1433 1415 mi->mi_async_curr[async_queue] =
1434 1416 &mi->mi_async_reqs[0];
1435 1417 }
1436 1418 }
1437 1419 /*
1438 1420 		 * If we didn't find an entry, then block until woken up
1439 1421 * again and then look through the queues again.
1440 1422 */
1441 1423 if (args == NULL) {
1442 1424 /*
1443 1425 * Exiting is considered to be safe for CPR as well
1444 1426 */
1445 1427 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1446 1428
1447 1429 /*
1448 1430 * Wakeup thread waiting to unmount the file
1449 1431 * system only if all async threads are inactive.
1450 1432 *
1451 1433 * If we've timed-out and there's nothing to do,
1452 1434 * then get rid of this thread.
1453 1435 */
1454 1436 if (mi->mi_max_threads == 0 || time_left <= 0) {
1455 1437 --mi->mi_threads[async_queue];
1456 1438
1457 1439 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1458 1440 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1459 1441 cv_signal(&mi->mi_async_cv);
1460 1442 CALLB_CPR_EXIT(&cprinfo);
1461 1443 VFS_RELE(vfsp); /* release thread's hold */
1462 1444 MI4_RELE(mi);
1463 1445 zthread_exit();
1464 1446 /* NOTREACHED */
1465 1447 }
1466 1448 time_left = cv_reltimedwait(async_work_cv,
1467 1449 &mi->mi_async_lock, nfs_async_timeout,
1468 1450 TR_CLOCK_TICK);
1469 1451
1470 1452 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1471 1453
1472 1454 continue;
1473 1455 } else {
1474 1456 time_left = 1;
1475 1457 }
1476 1458
1477 1459 /*
1478 1460 * Remove the request from the async queue and then
1479 1461 * update the current async request queue pointer. If
1480 1462 * the current queue is empty or we have removed enough
1481 1463 * consecutive entries from it, then reset the counter
1482 1464 * for this queue and then move the current pointer to
1483 1465 * the next queue.
1484 1466 */
1485 1467 *mi->mi_async_curr[async_queue] = args->a_next;
1486 1468 if (*mi->mi_async_curr[async_queue] == NULL ||
1487 1469 --mi->mi_async_clusters[args->a_io] == 0) {
1488 1470 mi->mi_async_clusters[args->a_io] =
1489 1471 mi->mi_async_init_clusters;
1490 1472 mi->mi_async_curr[async_queue]++;
1491 1473 if (mi->mi_async_curr[async_queue] ==
1492 1474 &mi->mi_async_reqs[async_types]) {
1493 1475 mi->mi_async_curr[async_queue] =
1494 1476 &mi->mi_async_reqs[0];
1495 1477 }
1496 1478 }
1497 1479
1498 1480 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1499 1481 mutex_enter(&mi->mi_lock);
1500 1482 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1501 1483 mutex_exit(&mi->mi_lock);
1502 1484 }
1503 1485
1504 1486 mutex_exit(&mi->mi_async_lock);
1505 1487
1506 1488 /*
1507 1489 * Obtain arguments from the async request structure.
1508 1490 */
1509 1491 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1510 1492 (*args->a_nfs4_readahead)(args->a_vp,
1511 1493 args->a_nfs4_blkoff, args->a_nfs4_addr,
1512 1494 args->a_nfs4_seg, args->a_cred);
1513 1495 } else if (args->a_io == NFS4_PUTAPAGE) {
1514 1496 (void) (*args->a_nfs4_putapage)(args->a_vp,
1515 1497 args->a_nfs4_pp, args->a_nfs4_off,
1516 1498 args->a_nfs4_len, args->a_nfs4_flags,
1517 1499 args->a_cred);
1518 1500 } else if (args->a_io == NFS4_PAGEIO) {
1519 1501 (void) (*args->a_nfs4_pageio)(args->a_vp,
1520 1502 args->a_nfs4_pp, args->a_nfs4_off,
1521 1503 args->a_nfs4_len, args->a_nfs4_flags,
1522 1504 args->a_cred);
1523 1505 } else if (args->a_io == NFS4_READDIR) {
1524 1506 (void) ((*args->a_nfs4_readdir)(args->a_vp,
1525 1507 args->a_nfs4_rdc, args->a_cred));
1526 1508 } else if (args->a_io == NFS4_COMMIT) {
1527 1509 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1528 1510 args->a_nfs4_offset, args->a_nfs4_count,
1529 1511 args->a_cred);
1530 1512 } else if (args->a_io == NFS4_INACTIVE) {
1531 1513 nfs4_inactive_otw(args->a_vp, args->a_cred);
1532 1514 }
1533 1515
1534 1516 /*
1535 1517 * Now, release the vnode and free the credentials
1536 1518 * structure.
1537 1519 */
1538 1520 free_async_args4(args);
1539 1521 /*
1540 1522 * Reacquire the mutex because it will be needed above.
1541 1523 */
1542 1524 mutex_enter(&mi->mi_async_lock);
1543 1525 }
1544 1526 }
1545 1527
1546 1528 /*
1547 1529 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1548 1530 * part of VOP_INACTIVE.
1549 1531 */
1550 1532
1551 1533 void
1552 1534 nfs4_inactive_thread(mntinfo4_t *mi)
1553 1535 {
1554 1536 struct nfs4_async_reqs *args;
1555 1537 callb_cpr_t cprinfo;
1556 1538 vfs_t *vfsp = mi->mi_vfsp;
1557 1539
1558 1540 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1559 1541 "nfs4_inactive_thread");
1560 1542
1561 1543 for (;;) {
1562 1544 mutex_enter(&mi->mi_async_lock);
1563 1545 args = mi->mi_async_reqs[NFS4_INACTIVE];
1564 1546 if (args == NULL) {
1565 1547 mutex_enter(&mi->mi_lock);
1566 1548 /*
1567 1549 * We don't want to exit until the async manager is done
1568 1550 * with its work; hence the check for mi_manager_thread
1569 1551 * being NULL.
1570 1552 *
1571 1553 * The async manager thread will cv_broadcast() on
1572 1554 * mi_inact_req_cv when it's done, at which point we'll
1573 1555 * wake up and exit.
1574 1556 */
1575 1557 if (mi->mi_manager_thread == NULL)
1576 1558 goto die;
1577 1559 mi->mi_flags |= MI4_INACTIVE_IDLE;
1578 1560 mutex_exit(&mi->mi_lock);
1579 1561 cv_signal(&mi->mi_async_cv);
1580 1562 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1581 1563 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1582 1564 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1583 1565 mutex_exit(&mi->mi_async_lock);
1584 1566 } else {
1585 1567 mutex_enter(&mi->mi_lock);
1586 1568 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1587 1569 mutex_exit(&mi->mi_lock);
1588 1570 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1589 1571 mutex_exit(&mi->mi_async_lock);
1590 1572 nfs4_inactive_otw(args->a_vp, args->a_cred);
1591 1573 crfree(args->a_cred);
1592 1574 kmem_free(args, sizeof (*args));
1593 1575 }
1594 1576 }
1595 1577 die:
1596 1578 mutex_exit(&mi->mi_lock);
1597 1579 mi->mi_inactive_thread = NULL;
1598 1580 cv_signal(&mi->mi_async_cv);
1599 1581
1600 1582 /*
1601 1583 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1602 1584 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1603 1585 */
1604 1586 CALLB_CPR_EXIT(&cprinfo);
1605 1587
1606 1588 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1607 1589 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1608 1590
1609 1591 MI4_RELE(mi);
1610 1592 zthread_exit();
1611 1593 /* NOTREACHED */
1612 1594 }
1613 1595
1614 1596 /*
1615 1597 * nfs_async_stop:
1616 1598 * Wait for all outstanding putpage operations and the inactive thread to
1617 1599 * complete; nfs4_async_stop_sig() without interruptibility.
1618 1600 */
1619 1601 void
1620 1602 nfs4_async_stop(struct vfs *vfsp)
1621 1603 {
1622 1604 mntinfo4_t *mi = VFTOMI4(vfsp);
1623 1605
1624 1606 /*
1625 1607 * Wait for all outstanding async operations to complete and for
1626 1608 * worker threads to exit.
1627 1609 */
1628 1610 mutex_enter(&mi->mi_async_lock);
1629 1611 mi->mi_max_threads = 0;
1630 1612 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1631 1613 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1632 1614 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1633 1615 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1634 1616
1635 1617 /*
1636 1618 * Wait for the inactive thread to finish doing what it's doing. It
1637 1619 * won't exit until the last reference to the vfs_t goes away.
1638 1620 */
1639 1621 if (mi->mi_inactive_thread != NULL) {
1640 1622 mutex_enter(&mi->mi_lock);
1641 1623 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1642 1624 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1643 1625 mutex_exit(&mi->mi_lock);
1644 1626 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1645 1627 mutex_enter(&mi->mi_lock);
1646 1628 }
1647 1629 mutex_exit(&mi->mi_lock);
1648 1630 }
1649 1631 mutex_exit(&mi->mi_async_lock);
1650 1632 }
1651 1633
1652 1634 /*
1653 1635 * nfs4_async_stop_sig:
1654 1636 * Wait for all outstanding putpage operations and the inactive thread to
1655 1637 * complete. If a signal is delivered we will abort and return non-zero;
1656 1638 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1657 1639 * need to make it interruptible.
1658 1640 */
1659 1641 int
1660 1642 nfs4_async_stop_sig(struct vfs *vfsp)
1661 1643 {
1662 1644 mntinfo4_t *mi = VFTOMI4(vfsp);
1663 1645 ushort_t omax;
1664 1646 bool_t intr = FALSE;
1665 1647
1666 1648 /*
1667 1649 * Wait for all outstanding putpage operations to complete and for
1668 1650 * worker threads to exit.
1669 1651 */
1670 1652 mutex_enter(&mi->mi_async_lock);
1671 1653 omax = mi->mi_max_threads;
1672 1654 mi->mi_max_threads = 0;
1673 1655 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1674 1656 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1675 1657 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1676 1658 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1677 1659 intr = TRUE;
1678 1660 goto interrupted;
1679 1661 }
1680 1662 }
1681 1663
1682 1664 /*
1683 1665 * Wait for the inactive thread to finish doing what it's doing. It
1684 1666 * won't exit until the last reference to the vfs_t goes away.
1685 1667 */
1686 1668 if (mi->mi_inactive_thread != NULL) {
1687 1669 mutex_enter(&mi->mi_lock);
1688 1670 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1689 1671 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1690 1672 mutex_exit(&mi->mi_lock);
1691 1673 if (!cv_wait_sig(&mi->mi_async_cv,
1692 1674 &mi->mi_async_lock)) {
1693 1675 intr = TRUE;
1694 1676 goto interrupted;
1695 1677 }
1696 1678 mutex_enter(&mi->mi_lock);
1697 1679 }
1698 1680 mutex_exit(&mi->mi_lock);
1699 1681 }
1700 1682 interrupted:
1701 1683 if (intr)
1702 1684 mi->mi_max_threads = omax;
1703 1685 mutex_exit(&mi->mi_async_lock);
1704 1686
1705 1687 return (intr);
1706 1688 }
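
/*
 * Illustrative sketch only (not part of this change): how an unmount
 * path might use nfs4_async_stop_sig() above.  The helper name and the
 * EINTR mapping are assumptions for illustration.
 */
static int
example_umount_quiesce(struct vfs *vfsp)
{
	/* Interruptible wait for async workers and the inactive thread. */
	if (nfs4_async_stop_sig(vfsp))
		return (EINTR);		/* caller may abort the unmount */
	return (0);
}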
1707 1689
1708 1690 int
1709 1691 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1710 1692 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1711 1693 u_offset_t, size_t, int, cred_t *))
1712 1694 {
1713 1695 rnode4_t *rp;
1714 1696 mntinfo4_t *mi;
1715 1697 struct nfs4_async_reqs *args;
1716 1698
1717 1699 ASSERT(flags & B_ASYNC);
1718 1700 ASSERT(vp->v_vfsp != NULL);
1719 1701
1720 1702 rp = VTOR4(vp);
1721 1703 ASSERT(rp->r_count > 0);
1722 1704
1723 1705 mi = VTOMI4(vp);
1724 1706
1725 1707 /*
1726 1708 * If we can't allocate a request structure, do the putpage
1727 1709 * operation synchronously in this thread's context.
1728 1710 */
1729 1711 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1730 1712 goto noasync;
1731 1713
1732 1714 args->a_next = NULL;
1733 1715 #ifdef DEBUG
1734 1716 args->a_queuer = curthread;
1735 1717 #endif
1736 1718 VN_HOLD(vp);
1737 1719 args->a_vp = vp;
1738 1720 ASSERT(cr != NULL);
1739 1721 crhold(cr);
1740 1722 args->a_cred = cr;
1741 1723 args->a_io = NFS4_PUTAPAGE;
1742 1724 args->a_nfs4_putapage = putapage;
1743 1725 args->a_nfs4_pp = pp;
1744 1726 args->a_nfs4_off = off;
1745 1727 args->a_nfs4_len = (uint_t)len;
1746 1728 args->a_nfs4_flags = flags;
1747 1729
1748 1730 mutex_enter(&mi->mi_async_lock);
1749 1731
1750 1732 /*
1751 1733 * If asyncio has been disabled, then make a synchronous request.
1752 1734 * This check is done a second time in case async io was disabled
1753 1735 * while this thread was blocked waiting for memory pressure to
1754 1736 * reduce or for the queue to drain.
1755 1737 */
1756 1738 if (mi->mi_max_threads == 0) {
1757 1739 mutex_exit(&mi->mi_async_lock);
1758 1740
1759 1741 VN_RELE(vp);
1760 1742 crfree(cr);
1761 1743 kmem_free(args, sizeof (*args));
1762 1744 goto noasync;
1763 1745 }
1764 1746
1765 1747 /*
1766 1748 * Link request structure into the async list and
1767 1749 * wakeup async thread to do the i/o.
1768 1750 */
1769 1751 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1770 1752 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1771 1753 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1772 1754 } else {
1773 1755 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1774 1756 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1775 1757 }
1776 1758
1777 1759 mutex_enter(&rp->r_statelock);
1778 1760 rp->r_count++;
1779 1761 rp->r_awcount++;
1780 1762 mutex_exit(&rp->r_statelock);
1781 1763
1782 1764 if (mi->mi_io_kstats) {
1783 1765 mutex_enter(&mi->mi_lock);
1784 1766 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1785 1767 mutex_exit(&mi->mi_lock);
1786 1768 }
1787 1769
1788 1770 mi->mi_async_req_count++;
1789 1771 ASSERT(mi->mi_async_req_count != 0);
1790 1772 cv_signal(&mi->mi_async_reqs_cv);
1791 1773 mutex_exit(&mi->mi_async_lock);
1792 1774 return (0);
1793 1775
1794 1776 noasync:
1795 1777
1796 1778 if (curproc == proc_pageout || curproc == proc_fsflush) {
1797 1779 /*
1798 1780 * If we get here in the context of the pageout/fsflush,
1799 1781 * or we have run out of memory, or we're attempting to
1800 1782 * unmount, we refuse to do a sync write, because this may
1801 1783 * hang pageout/fsflush and the machine. In this case,
1802 1784 * we just re-mark the page as dirty and punt on the page.
1803 1785 *
1804 1786 * Make sure B_FORCE isn't set. We can re-mark the
1805 1787 * pages as dirty and unlock the pages in one swoop by
1806 1788 * passing in B_ERROR to pvn_write_done(). However,
1807 1789 * we should make sure B_FORCE isn't set - we don't
1808 1790 * want the page tossed before it gets written out.
1809 1791 */
1810 1792 if (flags & B_FORCE)
1811 1793 flags &= ~(B_INVAL | B_FORCE);
1812 1794 pvn_write_done(pp, flags | B_ERROR);
1813 1795 return (0);
1814 1796 }
1815 1797
1816 1798 if (nfs_zone() != mi->mi_zone) {
1817 1799 /*
1818 1800 * So this was a cross-zone sync putpage.
1819 1801 *
1820 1802 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1821 1803 * as dirty and unlock them.
1822 1804 *
1823 1805 * We don't want to clear B_FORCE here as the caller presumably
1824 1806 * knows what they're doing if they set it.
1825 1807 */
1826 1808 pvn_write_done(pp, flags | B_ERROR);
1827 1809 return (EPERM);
1828 1810 }
1829 1811 return ((*putapage)(vp, pp, off, len, flags, cr));
1830 1812 }
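
/*
 * Illustrative sketch only: the tail-insert queueing pattern shared by
 * the async request routines above ('q' would be NFS4_PUTAPAGE,
 * NFS4_PAGEIO, etc.).  The helper name is hypothetical.
 */
static void
example_async_enqueue(mntinfo4_t *mi, int q, struct nfs4_async_reqs *args)
{
	ASSERT(MUTEX_HELD(&mi->mi_async_lock));

	if (mi->mi_async_reqs[q] == NULL) {
		/* empty queue: head and tail both point at the new request */
		mi->mi_async_reqs[q] = args;
		mi->mi_async_tail[q] = args;
	} else {
		/* append to the tail */
		mi->mi_async_tail[q]->a_next = args;
		mi->mi_async_tail[q] = args;
	}
	mi->mi_async_req_count++;
	cv_signal(&mi->mi_async_reqs_cv);
}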
1831 1813
1832 1814 int
1833 1815 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1834 1816 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1835 1817 size_t, int, cred_t *))
1836 1818 {
1837 1819 rnode4_t *rp;
1838 1820 mntinfo4_t *mi;
1839 1821 struct nfs4_async_reqs *args;
1840 1822
1841 1823 ASSERT(flags & B_ASYNC);
1842 1824 ASSERT(vp->v_vfsp != NULL);
1843 1825
1844 1826 rp = VTOR4(vp);
1845 1827 ASSERT(rp->r_count > 0);
1846 1828
1847 1829 mi = VTOMI4(vp);
1848 1830
1849 1831 /*
1850 1832 * If we can't allocate a request structure, do the pageio
1851 1833 * request synchronously in this thread's context.
1852 1834 */
1853 1835 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1854 1836 goto noasync;
1855 1837
1856 1838 args->a_next = NULL;
1857 1839 #ifdef DEBUG
1858 1840 args->a_queuer = curthread;
1859 1841 #endif
1860 1842 VN_HOLD(vp);
1861 1843 args->a_vp = vp;
1862 1844 ASSERT(cr != NULL);
1863 1845 crhold(cr);
1864 1846 args->a_cred = cr;
1865 1847 args->a_io = NFS4_PAGEIO;
1866 1848 args->a_nfs4_pageio = pageio;
1867 1849 args->a_nfs4_pp = pp;
1868 1850 args->a_nfs4_off = io_off;
1869 1851 args->a_nfs4_len = (uint_t)io_len;
1870 1852 args->a_nfs4_flags = flags;
1871 1853
1872 1854 mutex_enter(&mi->mi_async_lock);
1873 1855
1874 1856 /*
1875 1857 * If asyncio has been disabled, then make a synchronous request.
1876 1858 * This check is done a second time in case async io was disabled
1877 1859 * while this thread was blocked waiting for memory pressure to
1878 1860 * reduce or for the queue to drain.
1879 1861 */
1880 1862 if (mi->mi_max_threads == 0) {
1881 1863 mutex_exit(&mi->mi_async_lock);
1882 1864
1883 1865 VN_RELE(vp);
1884 1866 crfree(cr);
1885 1867 kmem_free(args, sizeof (*args));
1886 1868 goto noasync;
1887 1869 }
1888 1870
1889 1871 /*
1890 1872 * Link request structure into the async list and
1891 1873 * wakeup async thread to do the i/o.
1892 1874 */
1893 1875 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1894 1876 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1895 1877 mi->mi_async_tail[NFS4_PAGEIO] = args;
1896 1878 } else {
1897 1879 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1898 1880 mi->mi_async_tail[NFS4_PAGEIO] = args;
1899 1881 }
1900 1882
1901 1883 mutex_enter(&rp->r_statelock);
1902 1884 rp->r_count++;
1903 1885 rp->r_awcount++;
1904 1886 mutex_exit(&rp->r_statelock);
1905 1887
1906 1888 if (mi->mi_io_kstats) {
1907 1889 mutex_enter(&mi->mi_lock);
1908 1890 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1909 1891 mutex_exit(&mi->mi_lock);
1910 1892 }
1911 1893
1912 1894 mi->mi_async_req_count++;
1913 1895 ASSERT(mi->mi_async_req_count != 0);
1914 1896 cv_signal(&mi->mi_async_reqs_cv);
1915 1897 mutex_exit(&mi->mi_async_lock);
1916 1898 return (0);
1917 1899
1918 1900 noasync:
1919 1901 /*
1920 1902 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1921 1903 * the page list), for writes we do it synchronously, except for
1922 1904 * proc_pageout/proc_fsflush as described below.
1923 1905 */
1924 1906 if (flags & B_READ) {
1925 1907 pvn_read_done(pp, flags | B_ERROR);
1926 1908 return (0);
1927 1909 }
1928 1910
1929 1911 if (curproc == proc_pageout || curproc == proc_fsflush) {
1930 1912 /*
1931 1913 * If we get here in the context of the pageout/fsflush,
1932 1914 * we refuse to do a sync write, because this may hang
1933 1915 * pageout/fsflush (and the machine). In this case, we just
1934 1916 * re-mark the page as dirty and punt on the page.
1935 1917 *
1936 1918 * Make sure B_FORCE isn't set. We can re-mark the
1937 1919 * pages as dirty and unlock the pages in one swoop by
1938 1920 * passing in B_ERROR to pvn_write_done(). However,
1939 1921 * we should make sure B_FORCE isn't set - we don't
1940 1922 * want the page tossed before it gets written out.
1941 1923 */
1942 1924 if (flags & B_FORCE)
1943 1925 flags &= ~(B_INVAL | B_FORCE);
1944 1926 pvn_write_done(pp, flags | B_ERROR);
1945 1927 return (0);
1946 1928 }
1947 1929
1948 1930 if (nfs_zone() != mi->mi_zone) {
1949 1931 /*
1950 1932 * So this was a cross-zone sync pageio. We pass in B_ERROR
1951 1933 * to pvn_write_done() to re-mark the pages as dirty and unlock
1952 1934 * them.
1953 1935 *
1954 1936 * We don't want to clear B_FORCE here as the caller presumably
1955 1937 * knows what they're doing if they set it.
1956 1938 */
1957 1939 pvn_write_done(pp, flags | B_ERROR);
1958 1940 return (EPERM);
1959 1941 }
1960 1942 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1961 1943 }
1962 1944
1963 1945 void
1964 1946 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1965 1947 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1966 1948 {
1967 1949 rnode4_t *rp;
1968 1950 mntinfo4_t *mi;
1969 1951 struct nfs4_async_reqs *args;
1970 1952
1971 1953 rp = VTOR4(vp);
1972 1954 ASSERT(rp->r_freef == NULL);
1973 1955
1974 1956 mi = VTOMI4(vp);
1975 1957
1976 1958 /*
1977 1959 * If we can't allocate a request structure, skip the readdir.
1978 1960 */
1979 1961 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1980 1962 goto noasync;
1981 1963
1982 1964 args->a_next = NULL;
1983 1965 #ifdef DEBUG
1984 1966 args->a_queuer = curthread;
1985 1967 #endif
1986 1968 VN_HOLD(vp);
1987 1969 args->a_vp = vp;
1988 1970 ASSERT(cr != NULL);
1989 1971 crhold(cr);
1990 1972 args->a_cred = cr;
1991 1973 args->a_io = NFS4_READDIR;
1992 1974 args->a_nfs4_readdir = readdir;
1993 1975 args->a_nfs4_rdc = rdc;
1994 1976
1995 1977 mutex_enter(&mi->mi_async_lock);
1996 1978
1997 1979 /*
1998 1980 * If asyncio has been disabled, then skip this request
1999 1981 */
2000 1982 if (mi->mi_max_threads == 0) {
2001 1983 mutex_exit(&mi->mi_async_lock);
2002 1984
2003 1985 VN_RELE(vp);
2004 1986 crfree(cr);
2005 1987 kmem_free(args, sizeof (*args));
2006 1988 goto noasync;
2007 1989 }
2008 1990
2009 1991 /*
2010 1992 * Link request structure into the async list and
2011 1993 * wakeup async thread to do the i/o.
2012 1994 */
2013 1995 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2014 1996 mi->mi_async_reqs[NFS4_READDIR] = args;
2015 1997 mi->mi_async_tail[NFS4_READDIR] = args;
2016 1998 } else {
2017 1999 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2018 2000 mi->mi_async_tail[NFS4_READDIR] = args;
2019 2001 }
2020 2002
2021 2003 mutex_enter(&rp->r_statelock);
2022 2004 rp->r_count++;
2023 2005 mutex_exit(&rp->r_statelock);
2024 2006
2025 2007 if (mi->mi_io_kstats) {
2026 2008 mutex_enter(&mi->mi_lock);
2027 2009 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2028 2010 mutex_exit(&mi->mi_lock);
2029 2011 }
2030 2012
2031 2013 mi->mi_async_req_count++;
2032 2014 ASSERT(mi->mi_async_req_count != 0);
2033 2015 cv_signal(&mi->mi_async_reqs_cv);
2034 2016 mutex_exit(&mi->mi_async_lock);
2035 2017 return;
2036 2018
2037 2019 noasync:
2038 2020 mutex_enter(&rp->r_statelock);
2039 2021 rdc->entries = NULL;
2040 2022 /*
2041 2023 * Indicate that no one is trying to fill this entry and
2042 2024 * it still needs to be filled.
2043 2025 */
2044 2026 rdc->flags &= ~RDDIR;
2045 2027 rdc->flags |= RDDIRREQ;
2046 2028 rddir4_cache_rele(rp, rdc);
2047 2029 mutex_exit(&rp->r_statelock);
2048 2030 }
2049 2031
2050 2032 void
2051 2033 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2052 2034 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2053 2035 cred_t *))
2054 2036 {
2055 2037 rnode4_t *rp;
2056 2038 mntinfo4_t *mi;
2057 2039 struct nfs4_async_reqs *args;
2058 2040 page_t *pp;
2059 2041
2060 2042 rp = VTOR4(vp);
2061 2043 mi = VTOMI4(vp);
2062 2044
2063 2045 /*
2064 2046 * If we can't allocate a request structure, do the commit
2065 2047 * operation synchronously in this thread's context.
2066 2048 */
2067 2049 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2068 2050 goto noasync;
2069 2051
2070 2052 args->a_next = NULL;
2071 2053 #ifdef DEBUG
2072 2054 args->a_queuer = curthread;
2073 2055 #endif
2074 2056 VN_HOLD(vp);
2075 2057 args->a_vp = vp;
2076 2058 ASSERT(cr != NULL);
2077 2059 crhold(cr);
2078 2060 args->a_cred = cr;
2079 2061 args->a_io = NFS4_COMMIT;
2080 2062 args->a_nfs4_commit = commit;
2081 2063 args->a_nfs4_plist = plist;
2082 2064 args->a_nfs4_offset = offset;
2083 2065 args->a_nfs4_count = count;
2084 2066
2085 2067 mutex_enter(&mi->mi_async_lock);
2086 2068
2087 2069 /*
2088 2070 * If asyncio has been disabled, then make a synchronous request.
2089 2071 * This check is done a second time in case async io was disabled
2090 2072 * while this thread was blocked waiting for memory pressure to
2091 2073 * reduce or for the queue to drain.
2092 2074 */
2093 2075 if (mi->mi_max_threads == 0) {
2094 2076 mutex_exit(&mi->mi_async_lock);
2095 2077
2096 2078 VN_RELE(vp);
2097 2079 crfree(cr);
2098 2080 kmem_free(args, sizeof (*args));
2099 2081 goto noasync;
2100 2082 }
2101 2083
2102 2084 /*
2103 2085 * Link request structure into the async list and
2104 2086 * wakeup async thread to do the i/o.
2105 2087 */
2106 2088 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2107 2089 mi->mi_async_reqs[NFS4_COMMIT] = args;
2108 2090 mi->mi_async_tail[NFS4_COMMIT] = args;
2109 2091 } else {
2110 2092 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2111 2093 mi->mi_async_tail[NFS4_COMMIT] = args;
2112 2094 }
2113 2095
2114 2096 mutex_enter(&rp->r_statelock);
2115 2097 rp->r_count++;
2116 2098 mutex_exit(&rp->r_statelock);
2117 2099
2118 2100 if (mi->mi_io_kstats) {
2119 2101 mutex_enter(&mi->mi_lock);
2120 2102 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2121 2103 mutex_exit(&mi->mi_lock);
2122 2104 }
2123 2105
2124 2106 mi->mi_async_req_count++;
2125 2107 ASSERT(mi->mi_async_req_count != 0);
2126 2108 cv_signal(&mi->mi_async_reqs_cv);
2127 2109 mutex_exit(&mi->mi_async_lock);
2128 2110 return;
2129 2111
2130 2112 noasync:
2131 2113 if (curproc == proc_pageout || curproc == proc_fsflush ||
2132 2114 nfs_zone() != mi->mi_zone) {
2133 2115 while (plist != NULL) {
2134 2116 pp = plist;
2135 2117 page_sub(&plist, pp);
2136 2118 pp->p_fsdata = C_COMMIT;
2137 2119 page_unlock(pp);
2138 2120 }
2139 2121 return;
2140 2122 }
2141 2123 (*commit)(vp, plist, offset, count, cr);
2142 2124 }
2143 2125
2144 2126 /*
2145 2127 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2146 2128 * reference to the vnode is handed over to the thread; the caller should
2147 2129 * no longer refer to the vnode.
2148 2130 *
2149 2131 * Unlike most of the async routines, this handoff is needed for
2150 2132 * correctness reasons, not just performance. So doing operations in the
2151 2133 * context of the current thread is not an option.
2152 2134 */
2153 2135 void
2154 2136 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2155 2137 {
2156 2138 mntinfo4_t *mi;
2157 2139 struct nfs4_async_reqs *args;
2158 2140 boolean_t signal_inactive_thread = B_FALSE;
2159 2141
2160 2142 mi = VTOMI4(vp);
2161 2143
2162 2144 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2163 2145 args->a_next = NULL;
2164 2146 #ifdef DEBUG
2165 2147 args->a_queuer = curthread;
2166 2148 #endif
2167 2149 args->a_vp = vp;
2168 2150 ASSERT(cr != NULL);
2169 2151 crhold(cr);
2170 2152 args->a_cred = cr;
2171 2153 args->a_io = NFS4_INACTIVE;
2172 2154
2173 2155 /*
2174 2156 * Note that we don't check mi->mi_max_threads here, since we
2175 2157 * *need* to get rid of this vnode regardless of whether someone
2176 2158 * set nfs4_max_threads to zero in /etc/system.
2177 2159 *
2178 2160 * The manager thread knows about this and is willing to create
2179 2161 * at least one thread to accommodate us.
2180 2162 */
2181 2163 mutex_enter(&mi->mi_async_lock);
2182 2164 if (mi->mi_inactive_thread == NULL) {
2183 2165 rnode4_t *rp;
2184 2166 vnode_t *unldvp = NULL;
2185 2167 char *unlname;
2186 2168 cred_t *unlcred;
2187 2169
2188 2170 mutex_exit(&mi->mi_async_lock);
2189 2171 /*
2190 2172 * We just need to free up the memory associated with the
2191 2173 * vnode, which can be safely done from within the current
2192 2174 * context.
2193 2175 */
2194 2176 crfree(cr); /* drop our reference */
2195 2177 kmem_free(args, sizeof (*args));
2196 2178 rp = VTOR4(vp);
2197 2179 mutex_enter(&rp->r_statelock);
2198 2180 if (rp->r_unldvp != NULL) {
2199 2181 unldvp = rp->r_unldvp;
2200 2182 rp->r_unldvp = NULL;
2201 2183 unlname = rp->r_unlname;
2202 2184 rp->r_unlname = NULL;
2203 2185 unlcred = rp->r_unlcred;
2204 2186 rp->r_unlcred = NULL;
2205 2187 }
2206 2188 mutex_exit(&rp->r_statelock);
2207 2189 /*
2208 2190 * No need to explicitly throw away any cached pages. The
2209 2191 * eventual r4inactive() will attempt a synchronous
2210 2192 * VOP_PUTPAGE() which will immediately fail since the request
2211 2193 * is coming from the wrong zone, and then will proceed to call
2212 2194 * nfs4_invalidate_pages() which will clean things up for us.
2213 2195 *
2214 2196 * Throw away the delegation here so rp4_addfree()'s attempt to
2215 2197 * return any existing delegations becomes a no-op.
2216 2198 */
2217 2199 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2218 2200 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2219 2201 FALSE);
2220 2202 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2221 2203 nfs_rw_exit(&mi->mi_recovlock);
2222 2204 }
2223 2205 nfs4_clear_open_streams(rp);
2224 2206
2225 2207 rp4_addfree(rp, cr);
2226 2208 if (unldvp != NULL) {
2227 2209 kmem_free(unlname, MAXNAMELEN);
2228 2210 VN_RELE(unldvp);
2229 2211 crfree(unlcred);
2230 2212 }
2231 2213 return;
2232 2214 }
2233 2215
2234 2216 if (mi->mi_manager_thread == NULL) {
2235 2217 /*
2236 2218 * We want to talk to the inactive thread.
2237 2219 */
2238 2220 signal_inactive_thread = B_TRUE;
2239 2221 }
2240 2222
2241 2223 /*
2242 2224 * Enqueue the vnode and wake up either the special thread (empty
2243 2225 * list) or an async thread.
2244 2226 */
2245 2227 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2246 2228 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2247 2229 mi->mi_async_tail[NFS4_INACTIVE] = args;
2248 2230 signal_inactive_thread = B_TRUE;
2249 2231 } else {
2250 2232 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2251 2233 mi->mi_async_tail[NFS4_INACTIVE] = args;
2252 2234 }
2253 2235 if (signal_inactive_thread) {
2254 2236 cv_signal(&mi->mi_inact_req_cv);
2255 2237 } else {
2256 2238 mi->mi_async_req_count++;
2257 2239 ASSERT(mi->mi_async_req_count != 0);
2258 2240 cv_signal(&mi->mi_async_reqs_cv);
2259 2241 }
2260 2242
2261 2243 mutex_exit(&mi->mi_async_lock);
2262 2244 }
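
/*
 * Illustrative sketch only: the handoff described above.  The caller's
 * reference to 'vp' is consumed by nfs4_async_inactive(), so the vnode
 * must not be touched afterwards.  The helper name is hypothetical.
 */
static void
example_inactive_handoff(vnode_t *vp, cred_t *cr)
{
	nfs4_async_inactive(vp, cr);
	/* vp must not be dereferenced here; the inactive thread owns it */
}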
2263 2245
2264 2246 int
2265 2247 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2266 2248 {
2267 2249 int pagecreate;
2268 2250 int n;
2269 2251 int saved_n;
2270 2252 caddr_t saved_base;
2271 2253 u_offset_t offset;
2272 2254 int error;
2273 2255 int sm_error;
2274 2256 vnode_t *vp = RTOV(rp);
2275 2257
2276 2258 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2277 2259 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2278 2260 if (!vpm_enable) {
2279 2261 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2280 2262 }
2281 2263
2282 2264 /*
2283 2265 * Move bytes in at most PAGESIZE chunks. We must avoid
2284 2266 * spanning pages in uiomove() because page faults may cause
2285 2267 * the cache to be invalidated out from under us. The r_size is not
2286 2268 * updated until after the uiomove. If we push the last page of a
2287 2269 * file before r_size is correct, we will lose the data written past
2288 2270 * the current (and invalid) r_size.
2289 2271 */
2290 2272 do {
2291 2273 offset = uio->uio_loffset;
2292 2274 pagecreate = 0;
2293 2275
2294 2276 /*
2295 2277 * n is the number of bytes required to satisfy the request
2296 2278 * or the number of bytes to fill out the page.
2297 2279 */
2298 2280 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2299 2281
2300 2282 /*
2301 2283 * Check to see if we can skip reading in the page
2302 2284 * and just allocate the memory. We can do this
2303 2285 * if we are going to rewrite the entire mapping
2304 2286 * or if we are going to write to or beyond the current
2305 2287 * end of file from the beginning of the mapping.
2306 2288 *
2307 2289 * The read of r_size is now protected by r_statelock.
2308 2290 */
2309 2291 mutex_enter(&rp->r_statelock);
2310 2292 /*
2311 2293 * When pgcreated is nonzero the caller has already done
2312 2294 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2313 2295 * segkpm this means we already have at least one page
2314 2296 * created and mapped at base.
2315 2297 */
2316 2298 pagecreate = pgcreated ||
2317 2299 ((offset & PAGEOFFSET) == 0 &&
2318 2300 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2319 2301
2320 2302 mutex_exit(&rp->r_statelock);
2321 2303
2322 2304 if (!vpm_enable && pagecreate) {
2323 2305 /*
2324 2306 * The last argument tells segmap_pagecreate() to
2325 2307 * always lock the page, as opposed to sometimes
2326 2308 * returning with the page locked. This way we avoid a
2327 2309 * fault on the ensuing uiomove(), but also
2328 2310 * more importantly (to fix bug 1094402) we can
2329 2311 * call segmap_fault() to unlock the page in all
2330 2312 * cases. An alternative would be to modify
2331 2313 * segmap_pagecreate() to tell us when it is
2332 2314 * locking a page, but that's a fairly major
2333 2315 * interface change.
2334 2316 */
2335 2317 if (pgcreated == 0)
2336 2318 (void) segmap_pagecreate(segkmap, base,
2337 2319 (uint_t)n, 1);
2338 2320 saved_base = base;
2339 2321 saved_n = n;
2340 2322 }
2341 2323
2342 2324 /*
2343 2325 * The number of bytes of data in the last page cannot
2344 2326 * be determined accurately while the page is being
2345 2327 * uiomove'd to and the size of the file is being updated.
2346 2328 * Thus, inform threads which need to know accurately
2347 2329 * how much data is in the last page of the file. They
2348 2330 * will not do the i/o immediately, but will arrange for
2349 2331 * the i/o to happen later when this modify operation
2350 2332 * has finished.
2351 2333 */
2352 2334 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2353 2335 mutex_enter(&rp->r_statelock);
2354 2336 rp->r_flags |= R4MODINPROGRESS;
2355 2337 rp->r_modaddr = (offset & MAXBMASK);
2356 2338 mutex_exit(&rp->r_statelock);
2357 2339
2358 2340 if (vpm_enable) {
2359 2341 /*
2360 2342 * Copy data. If new pages are created, part of
2361 2343 * the page that is not written will be initialized
2362 2344 * with zeros.
2363 2345 */
2364 2346 error = vpm_data_copy(vp, offset, n, uio,
2365 2347 !pagecreate, NULL, 0, S_WRITE);
2366 2348 } else {
2367 2349 error = uiomove(base, n, UIO_WRITE, uio);
2368 2350 }
2369 2351
2370 2352 /*
2371 2353 * r_size is the maximum number of
2372 2354 * bytes known to be in the file.
2373 2355 * Make sure it is at least as high as the
2374 2356 * first unwritten byte pointed to by uio_loffset.
2375 2357 */
2376 2358 mutex_enter(&rp->r_statelock);
2377 2359 if (rp->r_size < uio->uio_loffset)
2378 2360 rp->r_size = uio->uio_loffset;
2379 2361 rp->r_flags &= ~R4MODINPROGRESS;
2380 2362 rp->r_flags |= R4DIRTY;
2381 2363 mutex_exit(&rp->r_statelock);
2382 2364
2383 2365 /* n = # of bytes written */
2384 2366 n = (int)(uio->uio_loffset - offset);
2385 2367
2386 2368 if (!vpm_enable) {
2387 2369 base += n;
2388 2370 }
2389 2371
2390 2372 tcount -= n;
2391 2373 /*
2392 2374 * If we created pages w/o initializing them completely,
2393 2375 * we need to zero the part that wasn't set up.
2394 2376 * This happens in most EOF write cases and if
2395 2377 * we had some sort of error during the uiomove.
2396 2378 */
2397 2379 if (!vpm_enable && pagecreate) {
2398 2380 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2399 2381 (void) kzero(base, PAGESIZE - n);
2400 2382
2401 2383 if (pgcreated) {
2402 2384 /*
2403 2385 * Caller is responsible for this page,
2404 2386 * it was not created in this loop.
2405 2387 */
2406 2388 pgcreated = 0;
2407 2389 } else {
2408 2390 /*
2409 2391 * For bug 1094402: segmap_pagecreate locks
2410 2392 * page. Unlock it. This also unlocks the
2411 2393 * pages allocated by page_create_va() in
2412 2394 * segmap_pagecreate().
2413 2395 */
2414 2396 sm_error = segmap_fault(kas.a_hat, segkmap,
2415 2397 saved_base, saved_n,
2416 2398 F_SOFTUNLOCK, S_WRITE);
2417 2399 if (error == 0)
2418 2400 error = sm_error;
2419 2401 }
2420 2402 }
2421 2403 } while (tcount > 0 && error == 0);
2422 2404
2423 2405 return (error);
2424 2406 }
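
/*
 * Illustrative sketch only: the per-iteration chunk size computed in
 * writerp4() above never spans a page boundary.  For example, with
 * PAGESIZE 0x1000, offset 0x1f80 and tcount 0x200 yield a chunk of 0x80.
 * The helper name is hypothetical.
 */
static int
example_write_chunk(u_offset_t offset, int tcount)
{
	return ((int)MIN(PAGESIZE - (offset & PAGEOFFSET), tcount));
}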
2425 2407
2426 2408 int
2427 2409 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2428 2410 {
2429 2411 rnode4_t *rp;
2430 2412 page_t *pp;
2431 2413 u_offset_t eoff;
2432 2414 u_offset_t io_off;
2433 2415 size_t io_len;
2434 2416 int error;
2435 2417 int rdirty;
2436 2418 int err;
2437 2419
2438 2420 rp = VTOR4(vp);
2439 2421 ASSERT(rp->r_count > 0);
2440 2422
2441 2423 if (!nfs4_has_pages(vp))
2442 2424 return (0);
2443 2425
2444 2426 ASSERT(vp->v_type != VCHR);
2445 2427
2446 2428 /*
2447 2429 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2448 2430 * writes. B_FORCE is set to force the VM system to actually
2449 2431 * invalidate the pages, even if the i/o failed. The pages
2450 2432 * need to get invalidated because they can't be written out
2451 2433 * because there isn't any space left on either the server's
2452 2434 * file system or in the user's disk quota. The B_FREE bit
2453 2435 * is cleared to avoid confusion as to whether this is a
2454 2436 * request to place the page on the freelist or to destroy
2455 2437 * it.
2456 2438 */
2457 2439 if ((rp->r_flags & R4OUTOFSPACE) ||
2458 2440 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2459 2441 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2460 2442
2461 2443 if (len == 0) {
2462 2444 /*
2463 2445 * If doing a full file synchronous operation, then clear
2464 2446 * the R4DIRTY bit. If a page gets dirtied while the flush
2465 2447 * is happening, then R4DIRTY will get set again. The
2466 2448 * R4DIRTY bit must get cleared before the flush so that
2467 2449 * we don't lose this information.
2468 2450 *
2469 2451 * If there are no full file async write operations
2470 2452 * pending and the R4DIRTY bit is set, clear it.
2471 2453 */
2472 2454 if (off == (u_offset_t)0 &&
2473 2455 !(flags & B_ASYNC) &&
2474 2456 (rp->r_flags & R4DIRTY)) {
2475 2457 mutex_enter(&rp->r_statelock);
2476 2458 rdirty = (rp->r_flags & R4DIRTY);
2477 2459 rp->r_flags &= ~R4DIRTY;
2478 2460 mutex_exit(&rp->r_statelock);
2479 2461 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2480 2462 mutex_enter(&rp->r_statelock);
2481 2463 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2482 2464 rdirty = (rp->r_flags & R4DIRTY);
2483 2465 rp->r_flags &= ~R4DIRTY;
2484 2466 }
2485 2467 mutex_exit(&rp->r_statelock);
2486 2468 } else
2487 2469 rdirty = 0;
2488 2470
2489 2471 /*
2490 2472 * Search the entire vp list for pages >= off, and flush
2491 2473 * the dirty pages.
2492 2474 */
2493 2475 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2494 2476 flags, cr);
2495 2477
2496 2478 /*
2497 2479 * If an error occurred and the file was marked as dirty
2498 2480 * before and we aren't forcibly invalidating pages, then
2499 2481 * reset the R4DIRTY flag.
2500 2482 */
2501 2483 if (error && rdirty &&
2502 2484 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2503 2485 mutex_enter(&rp->r_statelock);
2504 2486 rp->r_flags |= R4DIRTY;
2505 2487 mutex_exit(&rp->r_statelock);
2506 2488 }
2507 2489 } else {
2508 2490 /*
2509 2491 * Do a range from [off...off + len) looking for pages
2510 2492 * to deal with.
2511 2493 */
2512 2494 error = 0;
2513 2495 io_len = 0;
2514 2496 eoff = off + len;
2515 2497 mutex_enter(&rp->r_statelock);
2516 2498 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2517 2499 io_off += io_len) {
2518 2500 mutex_exit(&rp->r_statelock);
2519 2501 /*
2520 2502 * If we are not invalidating, synchronously
2521 2503 * freeing or writing pages use the routine
2522 2504 * page_lookup_nowait() to prevent reclaiming
2523 2505 * them from the free list.
2524 2506 */
2525 2507 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2526 2508 pp = page_lookup(vp, io_off,
2527 2509 (flags & (B_INVAL | B_FREE)) ?
2528 2510 SE_EXCL : SE_SHARED);
2529 2511 } else {
2530 2512 pp = page_lookup_nowait(vp, io_off,
2531 2513 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2532 2514 }
2533 2515
2534 2516 if (pp == NULL || !pvn_getdirty(pp, flags))
2535 2517 io_len = PAGESIZE;
2536 2518 else {
2537 2519 err = (*rp->r_putapage)(vp, pp, &io_off,
2538 2520 &io_len, flags, cr);
2539 2521 if (!error)
2540 2522 error = err;
2541 2523 /*
2542 2524 * "io_off" and "io_len" are returned as
2543 2525 * the range of pages we actually wrote.
2544 2526 * This allows us to skip ahead more quickly
2545 2527 * since several pages may've been dealt
2546 2528 * with by this iteration of the loop.
2547 2529 */
2548 2530 }
2549 2531 mutex_enter(&rp->r_statelock);
2550 2532 }
2551 2533 mutex_exit(&rp->r_statelock);
2552 2534 }
2553 2535
2554 2536 return (error);
2555 2537 }
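
/*
 * Illustrative sketch only: a full-file, synchronous flush through the
 * routine above.  off == 0 and len == 0 select the whole file, and a
 * flags value of 0 (no B_ASYNC, no B_INVAL) requests a plain write-back.
 * The helper name is hypothetical.
 */
static int
example_flush_whole_file(vnode_t *vp, cred_t *cr)
{
	return (nfs4_putpages(vp, (u_offset_t)0, 0, 0, cr));
}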
2556 2538
2557 2539 void
2558 2540 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2559 2541 {
2560 2542 rnode4_t *rp;
2561 2543
2562 2544 rp = VTOR4(vp);
2563 2545 if (IS_SHADOW(vp, rp))
2564 2546 vp = RTOV4(rp);
2565 2547 mutex_enter(&rp->r_statelock);
2566 2548 while (rp->r_flags & R4TRUNCATE)
2567 2549 cv_wait(&rp->r_cv, &rp->r_statelock);
2568 2550 rp->r_flags |= R4TRUNCATE;
2569 2551 if (off == (u_offset_t)0) {
2570 2552 rp->r_flags &= ~R4DIRTY;
2571 2553 if (!(rp->r_flags & R4STALE))
2572 2554 rp->r_error = 0;
2573 2555 }
2574 2556 rp->r_truncaddr = off;
2575 2557 mutex_exit(&rp->r_statelock);
2576 2558 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2577 2559 B_INVAL | B_TRUNC, cr);
2578 2560 mutex_enter(&rp->r_statelock);
2579 2561 rp->r_flags &= ~R4TRUNCATE;
2580 2562 cv_broadcast(&rp->r_cv);
2581 2563 mutex_exit(&rp->r_statelock);
2582 2564 }
2583 2565
2584 2566 static int
2585 2567 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2586 2568 {
2587 2569 mntinfo4_t *mi;
2588 2570 struct mntinfo_kstat *mik;
2589 2571 vfs_t *vfsp;
2590 2572
2591 2573 /* this is a read-only kstat. Bail out on a write */
2592 2574 if (rw == KSTAT_WRITE)
2593 2575 return (EACCES);
2594 2576
2595 2577
2596 2578 /*
2597 2579 * We don't want to wait here as kstat_chain_lock could be held by
2598 2580 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2599 2581 * and thus could lead to a deadlock.
2600 2582 */
2601 2583 vfsp = (struct vfs *)ksp->ks_private;
2602 2584
2603 2585 mi = VFTOMI4(vfsp);
2604 2586 mik = (struct mntinfo_kstat *)ksp->ks_data;
2605 2587
2606 2588 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2607 2589
2608 2590 mik->mik_vers = (uint32_t)mi->mi_vers;
2609 2591 mik->mik_flags = mi->mi_flags;
2610 2592 /*
2611 2593 * The sv_secdata holds the flavor the client specifies.
2612 2594 * If the client uses default and a security negotiation
2613 2595 * occurs, sv_currsec will point to the current flavor
2614 2596 * selected from the server flavor list.
2615 2597 * sv_currsec is NULL if no security negotiation takes place.
2616 2598 */
2617 2599 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2618 2600 mi->mi_curr_serv->sv_currsec->secmod :
2619 2601 mi->mi_curr_serv->sv_secdata->secmod;
2620 2602 mik->mik_curread = (uint32_t)mi->mi_curread;
2621 2603 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2622 2604 mik->mik_retrans = mi->mi_retrans;
2623 2605 mik->mik_timeo = mi->mi_timeo;
2624 2606 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2625 2607 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2626 2608 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2627 2609 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2628 2610 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2629 2611 mik->mik_failover = (uint32_t)mi->mi_failover;
2630 2612 mik->mik_remap = (uint32_t)mi->mi_remap;
2631 2613
2632 2614 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2633 2615
2634 2616 return (0);
2635 2617 }
2636 2618
2637 2619 void
2638 2620 nfs4_mnt_kstat_init(struct vfs *vfsp)
2639 2621 {
2640 2622 mntinfo4_t *mi = VFTOMI4(vfsp);
2641 2623
2642 2624 /*
2643 2625 * PSARC 2001/697 Contract Private Interface
2644 2626 * All nfs kstats are under SunMC contract
2645 2627 * Please refer to the PSARC listed above and contact
2646 2628 * SunMC before making any changes!
2647 2629 *
2648 2630 * Changes must be reviewed by Solaris File Sharing
2649 2631 * Changes must be communicated to contract-2001-697@sun.com
2650 2632 *
2651 2633 */
2652 2634
2653 2635 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2654 2636 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2655 2637 if (mi->mi_io_kstats) {
2656 2638 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2657 2639 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2658 2640 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2659 2641 kstat_install(mi->mi_io_kstats);
2660 2642 }
2661 2643
2662 2644 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2663 2645 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2664 2646 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2665 2647 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2666 2648 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2667 2649 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2668 2650 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2669 2651 kstat_install(mi->mi_ro_kstats);
2670 2652 }
2671 2653
2672 2654 nfs4_mnt_recov_kstat_init(vfsp);
2673 2655 }
2674 2656
2675 2657 void
2676 2658 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2677 2659 {
2678 2660 mntinfo4_t *mi;
2679 2661 clock_t now = ddi_get_lbolt();
2680 2662
2681 2663 mi = VTOMI4(vp);
2682 2664 /*
2683 2665 * In case of forced unmount, do not print any messages
2684 2666 * since it can flood the console with error messages.
2685 2667 */
2686 2668 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2687 2669 return;
2688 2670
2689 2671 /*
2690 2672 * If the mount point is dead and not recoverable, do not
2691 2673 * print error messages that can flood the console.
2692 2674 */
2693 2675 if (mi->mi_flags & MI4_RECOV_FAIL)
2694 2676 return;
2695 2677
2696 2678 /*
2697 2679 * No use in flooding the console with ENOSPC
2698 2680 * messages from the same file system.
2699 2681 */
2700 2682 if ((error != ENOSPC && error != EDQUOT) ||
2701 2683 now - mi->mi_printftime > 0) {
2702 2684 zoneid_t zoneid = mi->mi_zone->zone_id;
2703 2685
2704 2686 #ifdef DEBUG
2705 2687 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2706 2688 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2707 2689 #else
2708 2690 nfs_perror(error, "NFS write error on host %s: %m.\n",
2709 2691 VTOR4(vp)->r_server->sv_hostname, NULL);
2710 2692 #endif
2711 2693 if (error == ENOSPC || error == EDQUOT) {
2712 2694 zcmn_err(zoneid, CE_CONT,
2713 2695 "^File: userid=%d, groupid=%d\n",
2714 2696 crgetuid(cr), crgetgid(cr));
2715 2697 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2716 2698 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2717 2699 zcmn_err(zoneid, CE_CONT,
2718 2700 "^User: userid=%d, groupid=%d\n",
2719 2701 crgetuid(curthread->t_cred),
2720 2702 crgetgid(curthread->t_cred));
2721 2703 }
2722 2704 mi->mi_printftime = now +
2723 2705 nfs_write_error_interval * hz;
2724 2706 }
2725 2707 sfh4_printfhandle(VTOR4(vp)->r_fh);
2726 2708 #ifdef DEBUG
2727 2709 if (error == EACCES) {
2728 2710 zcmn_err(zoneid, CE_CONT,
2729 2711 "nfs_bio: cred is%s kcred\n",
2730 2712 cr == kcred ? "" : " not");
2731 2713 }
2732 2714 #endif
2733 2715 }
2734 2716 }
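
/*
 * Illustrative sketch only of the throttling used above: ENOSPC/EDQUOT
 * messages are suppressed until mi_printftime (set to "now" plus
 * nfs_write_error_interval seconds) has passed.  The helper name is
 * hypothetical.
 */
static boolean_t
example_should_log(mntinfo4_t *mi, int error, clock_t now)
{
	if ((error == ENOSPC || error == EDQUOT) &&
	    now - mi->mi_printftime <= 0)
		return (B_FALSE);	/* still inside the quiet interval */
	return (B_TRUE);
}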
2735 2717
2736 2718 /*
2737 2719 * Return non-zero if the given file can be safely memory mapped. Locks
2738 2720 * are safe if whole-file (length and offset are both zero).
2739 2721 */
2740 2722
2741 2723 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
2742 2724
2743 2725 static int
2744 2726 nfs4_safemap(const vnode_t *vp)
2745 2727 {
2746 2728 locklist_t *llp, *next_llp;
2747 2729 int safe = 1;
2748 2730 rnode4_t *rp = VTOR4(vp);
2749 2731
2750 2732 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2751 2733
2752 2734 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2753 2735 "vp = %p", (void *)vp));
2754 2736
2755 2737 /*
2756 2738 * Review all the locks for the vnode, both ones that have been
2757 2739 * acquired and ones that are pending. We assume that
2758 2740 * flk_active_locks_for_vp() has merged any locks that can be
2759 2741 * merged (so that if a process has the entire file locked, it is
2760 2742 * represented as a single lock).
2761 2743 *
2762 2744 * Note that we can't bail out of the loop if we find a non-safe
2763 2745 * lock, because we have to free all the elements in the llp list.
2764 2746 * We might be able to speed up this code slightly by not looking
2765 2747 * at each lock's l_start and l_len fields once we've found a
2766 2748 * non-safe lock.
2767 2749 */
2768 2750
2769 2751 llp = flk_active_locks_for_vp(vp);
2770 2752 while (llp) {
2771 2753 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2772 2754 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2773 2755 llp->ll_flock.l_start, llp->ll_flock.l_len));
2774 2756 if (!SAFE_LOCK(llp->ll_flock)) {
2775 2757 safe = 0;
2776 2758 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2777 2759 "nfs4_safemap: unsafe active lock (%" PRId64
2778 2760 ", %" PRId64 ")", llp->ll_flock.l_start,
2779 2761 llp->ll_flock.l_len));
2780 2762 }
2781 2763 next_llp = llp->ll_next;
2782 2764 VN_RELE(llp->ll_vp);
2783 2765 kmem_free(llp, sizeof (*llp));
2784 2766 llp = next_llp;
2785 2767 }
2786 2768
2787 2769 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2788 2770 safe ? "safe" : "unsafe"));
2789 2771 return (safe);
2790 2772 }
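
/*
 * Illustrative sketch only: what SAFE_LOCK() above accepts.  A whole-file
 * lock (l_start == 0, l_len == 0) is safe for mapping; any sub-range lock
 * is not.  The helper name is hypothetical.
 */
static void
example_safe_lock_check(void)
{
	struct flock64 whole_file = { 0 };	/* l_start = 0, l_len = 0 */
	struct flock64 sub_range = { 0 };

	sub_range.l_start = 0;
	sub_range.l_len = PAGESIZE;

	ASSERT(SAFE_LOCK(whole_file));
	ASSERT(!SAFE_LOCK(sub_range));
}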
2791 2773
2792 2774 /*
2793 2775 * Return whether there is a lost LOCK or LOCKU queued up for the given
2794 2776 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2795 2777 */
2796 2778
2797 2779 bool_t
2798 2780 nfs4_map_lost_lock_conflict(vnode_t *vp)
2799 2781 {
2800 2782 bool_t conflict = FALSE;
2801 2783 nfs4_lost_rqst_t *lrp;
2802 2784 mntinfo4_t *mi = VTOMI4(vp);
2803 2785
2804 2786 mutex_enter(&mi->mi_lock);
2805 2787 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2806 2788 lrp = list_next(&mi->mi_lost_state, lrp)) {
2807 2789 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2808 2790 continue;
2809 2791 ASSERT(lrp->lr_vp != NULL);
2810 2792 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2811 2793 continue; /* different file */
2812 2794 if (!SAFE_LOCK(*lrp->lr_flk)) {
2813 2795 conflict = TRUE;
2814 2796 break;
2815 2797 }
2816 2798 }
2817 2799
2818 2800 mutex_exit(&mi->mi_lock);
2819 2801 return (conflict);
2820 2802 }
2821 2803
2822 2804 /*
2823 2805 * nfs_lockcompletion:
2824 2806 *
2825 2807 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2826 2808 * as non cachable (set VNOCACHE bit).
2827 2809 */
2828 2810
2829 2811 void
2830 2812 nfs4_lockcompletion(vnode_t *vp, int cmd)
2831 2813 {
2832 2814 rnode4_t *rp = VTOR4(vp);
2833 2815
2834 2816 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2835 2817 ASSERT(!IS_SHADOW(vp, rp));
2836 2818
2837 2819 if (cmd == F_SETLK || cmd == F_SETLKW) {
2838 2820
2839 2821 if (!nfs4_safemap(vp)) {
2840 2822 mutex_enter(&vp->v_lock);
2841 2823 vp->v_flag |= VNOCACHE;
2842 2824 mutex_exit(&vp->v_lock);
2843 2825 } else {
2844 2826 mutex_enter(&vp->v_lock);
2845 2827 vp->v_flag &= ~VNOCACHE;
2846 2828 mutex_exit(&vp->v_lock);
2847 2829 }
2848 2830 }
2849 2831 /*
2850 2832 * The cached attributes of the file are stale after acquiring
2851 2833 * the lock on the file. They were updated when the file was
2852 2834 * opened, but not updated when the lock was acquired. Therefore the
2853 2835 * cached attributes are invalidated after the lock is obtained.
2854 2836 */
2855 2837 PURGE_ATTRCACHE4(vp);
2856 2838 }
2857 2839
2858 2840 /* ARGSUSED */
2859 2841 static void *
2860 2842 nfs4_mi_init(zoneid_t zoneid)
2861 2843 {
2862 2844 struct mi4_globals *mig;
2863 2845
2864 2846 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2865 2847 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2866 2848 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2867 2849 offsetof(mntinfo4_t, mi_zone_node));
2868 2850 mig->mig_destructor_called = B_FALSE;
2869 2851 return (mig);
2870 2852 }
2871 2853
2872 2854 /*
2873 2855 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2874 2856 * state and killing off threads.
2875 2857 */
2876 2858 /* ARGSUSED */
2877 2859 static void
2878 2860 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2879 2861 {
2880 2862 struct mi4_globals *mig = data;
2881 2863 mntinfo4_t *mi;
2882 2864 nfs4_server_t *np;
2883 2865
2884 2866 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2885 2867 "nfs4_mi_shutdown zone %d\n", zoneid));
2886 2868 ASSERT(mig != NULL);
2887 2869 for (;;) {
2888 2870 mutex_enter(&mig->mig_lock);
2889 2871 mi = list_head(&mig->mig_list);
2890 2872 if (mi == NULL) {
2891 2873 mutex_exit(&mig->mig_lock);
2892 2874 break;
2893 2875 }
2894 2876
2895 2877 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2896 2878 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2897 2879 /*
2898 2880 * purge the DNLC for this filesystem
2899 2881 */
2900 2882 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2901 2883 /*
2902 2884 * Tell existing async worker threads to exit.
2903 2885 */
2904 2886 mutex_enter(&mi->mi_async_lock);
2905 2887 mi->mi_max_threads = 0;
2906 2888 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2907 2889 /*
2908 2890 * Set the appropriate flags, signal and wait for both the
2909 2891 * async manager and the inactive thread to exit when they're
2910 2892 * done with their current work.
2911 2893 */
2912 2894 mutex_enter(&mi->mi_lock);
2913 2895 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2914 2896 mutex_exit(&mi->mi_lock);
2915 2897 mutex_exit(&mi->mi_async_lock);
2916 2898 if (mi->mi_manager_thread) {
2917 2899 nfs4_async_manager_stop(mi->mi_vfsp);
2918 2900 }
2919 2901 if (mi->mi_inactive_thread) {
2920 2902 mutex_enter(&mi->mi_async_lock);
2921 2903 cv_signal(&mi->mi_inact_req_cv);
2922 2904 /*
2923 2905 * Wait for the inactive thread to exit.
2924 2906 */
2925 2907 while (mi->mi_inactive_thread != NULL) {
2926 2908 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2927 2909 }
2928 2910 mutex_exit(&mi->mi_async_lock);
2929 2911 }
2930 2912 /*
2931 2913 * Wait for the recovery thread to complete, that is, it will
2932 2914 * signal when it is done using the "mi" structure and about
2933 2915 * to exit
2934 2916 */
2935 2917 mutex_enter(&mi->mi_lock);
2936 2918 while (mi->mi_in_recovery > 0)
2937 2919 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2938 2920 mutex_exit(&mi->mi_lock);
2939 2921 /*
2940 2922 * We're done when every mi has been done or the list is empty.
2941 2923 * This one is done, remove it from the list.
2942 2924 */
2943 2925 list_remove(&mig->mig_list, mi);
2944 2926 mutex_exit(&mig->mig_lock);
2945 2927 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2946 2928
2947 2929 /*
2948 2930 * Release the hold on the vfs and the mi taken to prevent a race
2949 2931 * with zone shutdown. This releases the hold in nfs4_mi_zonelist_add.
2950 2932 */
2951 2933 VFS_RELE(mi->mi_vfsp);
2952 2934 MI4_RELE(mi);
2953 2935 }
2954 2936 /*
2955 2937 * Tell each renew thread in the zone to exit
2956 2938 */
2957 2939 mutex_enter(&nfs4_server_lst_lock);
2958 2940 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2959 2941 mutex_enter(&np->s_lock);
2960 2942 if (np->zoneid == zoneid) {
2961 2943 /*
2962 2944 * We add another hold onto the nfs4_server_t
2963 2945 * because this will make sure that the nfs4_server_t
2964 2946 * stays around until nfs4_callback_fini_zone destroys
2965 2947 * the zone. This way, the renew thread can
2966 2948 * unconditionally release its holds on the
2967 2949 * nfs4_server_t.
2968 2950 */
2969 2951 np->s_refcnt++;
2970 2952 nfs4_mark_srv_dead(np);
2971 2953 }
2972 2954 mutex_exit(&np->s_lock);
2973 2955 }
2974 2956 mutex_exit(&nfs4_server_lst_lock);
2975 2957 }
2976 2958
2977 2959 static void
2978 2960 nfs4_mi_free_globals(struct mi4_globals *mig)
2979 2961 {
2980 2962 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2981 2963 mutex_destroy(&mig->mig_lock);
2982 2964 kmem_free(mig, sizeof (*mig));
2983 2965 }
2984 2966
2985 2967 /* ARGSUSED */
2986 2968 static void
2987 2969 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2988 2970 {
2989 2971 struct mi4_globals *mig = data;
2990 2972
2991 2973 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2992 2974 "nfs4_mi_destroy zone %d\n", zoneid));
2993 2975 ASSERT(mig != NULL);
2994 2976 mutex_enter(&mig->mig_lock);
2995 2977 if (list_head(&mig->mig_list) != NULL) {
2996 2978 /* Still waiting for VFS_FREEVFS() */
2997 2979 mig->mig_destructor_called = B_TRUE;
2998 2980 mutex_exit(&mig->mig_lock);
2999 2981 return;
3000 2982 }
3001 2983 nfs4_mi_free_globals(mig);
3002 2984 }
3003 2985
3004 2986 /*
3005 2987 * Add an NFS mount to the per-zone list of NFS mounts.
3006 2988 */
3007 2989 void
3008 2990 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3009 2991 {
3010 2992 struct mi4_globals *mig;
3011 2993
3012 2994 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3013 2995 mutex_enter(&mig->mig_lock);
3014 2996 list_insert_head(&mig->mig_list, mi);
3015 2997 /*
3016 2998 * hold added to eliminate race with zone shutdown - this will be
3017 2999 * released in nfs4_mi_shutdown
3018 3000 */
3019 3001 MI4_HOLD(mi);
3020 3002 VFS_HOLD(mi->mi_vfsp);
3021 3003 mutex_exit(&mig->mig_lock);
3022 3004 }
3023 3005
3024 3006 /*
3025 3007 * Remove an NFS mount from the per-zone list of NFS mounts.
3026 3008 */
3027 3009 int
3028 3010 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3029 3011 {
3030 3012 struct mi4_globals *mig;
3031 3013 int ret = 0;
3032 3014
3033 3015 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3034 3016 mutex_enter(&mig->mig_lock);
3035 3017 mutex_enter(&mi->mi_lock);
3036 3018 /* if this mi is marked dead, then the zone already released it */
3037 3019 if (!(mi->mi_flags & MI4_DEAD)) {
3038 3020 list_remove(&mig->mig_list, mi);
3039 3021 mutex_exit(&mi->mi_lock);
3040 3022
3041 3023 /* release the holds put on in zonelist_add(). */
3042 3024 VFS_RELE(mi->mi_vfsp);
3043 3025 MI4_RELE(mi);
3044 3026 ret = 1;
3045 3027 } else {
3046 3028 mutex_exit(&mi->mi_lock);
3047 3029 }
3048 3030
3049 3031 /*
3050 3032 * We can be called asynchronously by VFS_FREEVFS() after the zone
3051 3033 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3052 3034 * mi globals.
3053 3035 */
3054 3036 if (list_head(&mig->mig_list) == NULL &&
3055 3037 mig->mig_destructor_called == B_TRUE) {
3056 3038 nfs4_mi_free_globals(mig);
3057 3039 return (ret);
3058 3040 }
3059 3041 mutex_exit(&mig->mig_lock);
3060 3042 return (ret);
3061 3043 }
3062 3044
3063 3045 void
3064 3046 nfs_free_mi4(mntinfo4_t *mi)
3065 3047 {
3066 3048 nfs4_open_owner_t *foop;
3067 3049 nfs4_oo_hash_bucket_t *bucketp;
3068 3050 nfs4_debug_msg_t *msgp;
3069 3051 int i;
3070 - servinfo4_t *svp;
3052 + servinfo4_t *svp;
3071 3053
3072 3054 /*
3073 3055 * Code introduced here should be carefully evaluated to make
3074 3056 * sure none of the freed resources are accessed either directly
3075 3057 * or indirectly after freeing them. For example: introducing calls to
3076 3058 * NFS4_DEBUG that use mntinfo4_t structure members after they have been
3077 3059 * freed, or other routines that call back into NFS and access freed
3078 3060 * mntinfo4_t structure members.
3079 3061 */
3080 3062 mutex_enter(&mi->mi_lock);
3081 3063 ASSERT(mi->mi_recovthread == NULL);
3082 3064 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3083 3065 mutex_exit(&mi->mi_lock);
3084 3066 mutex_enter(&mi->mi_async_lock);
3085 3067 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3086 3068 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3087 3069 ASSERT(mi->mi_manager_thread == NULL);
3088 3070 mutex_exit(&mi->mi_async_lock);
3089 3071 if (mi->mi_io_kstats) {
3090 3072 kstat_delete(mi->mi_io_kstats);
3091 3073 mi->mi_io_kstats = NULL;
3092 3074 }
3093 3075 if (mi->mi_ro_kstats) {
3094 3076 kstat_delete(mi->mi_ro_kstats);
3095 3077 mi->mi_ro_kstats = NULL;
3096 3078 }
3097 3079 if (mi->mi_recov_ksp) {
3098 3080 kstat_delete(mi->mi_recov_ksp);
3099 3081 mi->mi_recov_ksp = NULL;
3100 3082 }
3101 3083 mutex_enter(&mi->mi_msg_list_lock);
3102 3084 while (msgp = list_head(&mi->mi_msg_list)) {
3103 3085 list_remove(&mi->mi_msg_list, msgp);
3104 3086 nfs4_free_msg(msgp);
3105 3087 }
3106 3088 mutex_exit(&mi->mi_msg_list_lock);
3107 3089 list_destroy(&mi->mi_msg_list);
3108 3090 if (mi->mi_fname != NULL)
3109 3091 fn_rele(&mi->mi_fname);
3110 3092 if (mi->mi_rootfh != NULL)
3111 3093 sfh4_rele(&mi->mi_rootfh);
3112 3094 if (mi->mi_srvparentfh != NULL)
3113 3095 sfh4_rele(&mi->mi_srvparentfh);
3114 3096 svp = mi->mi_servers;
3115 3097 sv4_free(svp);
3116 3098 mutex_destroy(&mi->mi_lock);
3117 3099 mutex_destroy(&mi->mi_async_lock);
3118 3100 mutex_destroy(&mi->mi_msg_list_lock);
3119 3101 mutex_destroy(&mi->mi_rnodes_lock);
3120 3102 nfs_rw_destroy(&mi->mi_recovlock);
3121 3103 nfs_rw_destroy(&mi->mi_rename_lock);
3122 3104 nfs_rw_destroy(&mi->mi_fh_lock);
3123 3105 cv_destroy(&mi->mi_failover_cv);
3124 3106 cv_destroy(&mi->mi_async_reqs_cv);
3125 3107 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3126 3108 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3127 3109 cv_destroy(&mi->mi_async_cv);
3128 3110 cv_destroy(&mi->mi_inact_req_cv);
3129 3111 /*
3130 3112 * Destroy the oo hash lists and mutexes for the cred hash table.
3131 3113 */
3132 3114 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3133 3115 bucketp = &(mi->mi_oo_list[i]);
3134 3116 /* Destroy any remaining open owners on the list */
3135 3117 foop = list_head(&bucketp->b_oo_hash_list);
3136 3118 while (foop != NULL) {
3137 3119 list_remove(&bucketp->b_oo_hash_list, foop);
3138 3120 nfs4_destroy_open_owner(foop);
3139 3121 foop = list_head(&bucketp->b_oo_hash_list);
3140 3122 }
3141 3123 list_destroy(&bucketp->b_oo_hash_list);
3142 3124 mutex_destroy(&bucketp->b_lock);
3143 3125 }
3144 3126 /*
3145 3127 * Empty and destroy the freed open owner list.
3146 3128 */
3147 3129 foop = list_head(&mi->mi_foo_list);
3148 3130 while (foop != NULL) {
3149 3131 list_remove(&mi->mi_foo_list, foop);
3150 3132 nfs4_destroy_open_owner(foop);
3151 3133 foop = list_head(&mi->mi_foo_list);
3152 3134 }
3153 3135 list_destroy(&mi->mi_foo_list);
3154 3136 list_destroy(&mi->mi_bseqid_list);
3155 3137 list_destroy(&mi->mi_lost_state);
3156 3138 list_destroy(&mi->mi_rnodes);
3157 3139 avl_destroy(&mi->mi_filehandles);
3158 3140 kmem_free(mi, sizeof (*mi));
3159 3141 }
3160 3142 void
3161 3143 mi_hold(mntinfo4_t *mi)
3162 3144 {
3163 3145 atomic_inc_32(&mi->mi_count);
3164 3146 ASSERT(mi->mi_count != 0);
3165 3147 }
3166 3148
3167 3149 void
3168 3150 mi_rele(mntinfo4_t *mi)
3169 3151 {
3170 3152 ASSERT(mi->mi_count != 0);
3171 3153 if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3172 3154 nfs_free_mi4(mi);
3173 3155 }
3174 3156 }
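
/*
 * Illustrative sketch only: the hold discipline enforced by mi_hold()/
 * mi_rele() above (normally used through the MI4_HOLD()/MI4_RELE()
 * macros).  A thread that keeps a mntinfo4_t pointer takes a hold first;
 * the final release frees the structure via nfs_free_mi4().  The helper
 * name is hypothetical.
 */
static void
example_mi_reference(mntinfo4_t *mi)
{
	MI4_HOLD(mi);
	/* ... mi may be dereferenced safely here ... */
	MI4_RELE(mi);
}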
3175 3157
3176 3158 vnode_t nfs4_xattr_notsupp_vnode;
3177 3159
3178 3160 void
3179 3161 nfs4_clnt_init(void)
3180 3162 {
3181 3163 nfs4_vnops_init();
3182 3164 (void) nfs4_rnode_init();
3183 3165 (void) nfs4_shadow_init();
3184 3166 (void) nfs4_acache_init();
3185 3167 (void) nfs4_subr_init();
3186 3168 nfs4_acl_init();
3187 3169 nfs_idmap_init();
3188 3170 nfs4_callback_init();
3189 3171 nfs4_secinfo_init();
3190 3172 #ifdef DEBUG
3191 3173 tsd_create(&nfs4_tsd_key, NULL);
3192 3174 #endif
3193 3175
3194 3176 /*
3195 3177 * Add a CPR callback so that we can update client
3196 3178 * lease after a suspend and resume.
3197 3179 */
3198 3180 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3199 3181
3200 3182 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3201 3183 nfs4_mi_destroy);
3202 3184
3203 3185 /*
3204 3186 * Initialize the reference count of the notsupp xattr cache vnode to 1
3205 3187 * so that it never goes away (VOP_INACTIVE isn't called on it).
3206 3188 */
3207 3189 vn_reinit(&nfs4_xattr_notsupp_vnode);
3208 3190 }
3209 3191
3210 3192 void
3211 3193 nfs4_clnt_fini(void)
3212 3194 {
3213 3195 (void) zone_key_delete(mi4_list_key);
3214 3196 nfs4_vnops_fini();
3215 3197 (void) nfs4_rnode_fini();
3216 3198 (void) nfs4_shadow_fini();
3217 3199 (void) nfs4_acache_fini();
3218 3200 (void) nfs4_subr_fini();
3219 3201 nfs_idmap_fini();
3220 3202 nfs4_callback_fini();
3221 3203 nfs4_secinfo_fini();
3222 3204 #ifdef DEBUG
3223 3205 tsd_destroy(&nfs4_tsd_key);
3224 3206 #endif
3225 3207 if (cid)
3226 3208 (void) callb_delete(cid);
3227 3209 }
3228 3210
3229 3211 /*ARGSUSED*/
3230 3212 static boolean_t
3231 3213 nfs4_client_cpr_callb(void *arg, int code)
3232 3214 {
3233 3215 /*
3234 3216 * We get called for Suspend and Resume events.
3235 3217 * For the suspend case we simply don't care!
3236 3218 */
3237 3219 if (code == CB_CODE_CPR_CHKPT) {
3238 3220 return (B_TRUE);
3239 3221 }
3240 3222
3241 3223 /*
3242 3224 * When we get to here we are in the process of
3243 3225 * resuming the system from a previous suspend.
3244 3226 */
3245 3227 nfs4_client_resumed = gethrestime_sec();
3246 3228 return (B_TRUE);
3247 3229 }
3248 3230
3249 3231 void
3250 3232 nfs4_renew_lease_thread(nfs4_server_t *sp)
3251 3233 {
3252 3234 int error = 0;
3253 3235 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3254 3236 clock_t tick_delay = 0;
3255 3237 clock_t time_left = 0;
3256 3238 callb_cpr_t cpr_info;
3257 3239 kmutex_t cpr_lock;
3258 3240
3259 3241 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3260 3242 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3261 3243 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3262 3244 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3263 3245
3264 3246 mutex_enter(&sp->s_lock);
3265 3247 /* sp->s_lease_time is set via a GETATTR */
3266 3248 sp->last_renewal_time = gethrestime_sec();
3267 3249 sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3268 3250 ASSERT(sp->s_refcnt >= 1);
3269 3251
3270 3252 for (;;) {
3271 3253 if (!sp->state_ref_count ||
3272 3254 sp->lease_valid != NFS4_LEASE_VALID) {
3273 3255
3274 3256 kip_secs = MAX((sp->s_lease_time >> 1) -
3275 3257 (3 * sp->propagation_delay.tv_sec), 1);
3276 3258
3277 3259 tick_delay = SEC_TO_TICK(kip_secs);
3278 3260
3279 3261 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3280 3262 "nfs4_renew_lease_thread: no renew : thread "
3281 3263 "wait %ld secs", kip_secs));
3282 3264
3283 3265 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3284 3266 "nfs4_renew_lease_thread: no renew : "
3285 3267 "state_ref_count %d, lease_valid %d",
3286 3268 sp->state_ref_count, sp->lease_valid));
3287 3269
3288 3270 mutex_enter(&cpr_lock);
3289 3271 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3290 3272 mutex_exit(&cpr_lock);
3291 3273 time_left = cv_reltimedwait(&sp->cv_thread_exit,
3292 3274 &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3293 3275 mutex_enter(&cpr_lock);
3294 3276 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3295 3277 mutex_exit(&cpr_lock);
3296 3278
3297 3279 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3298 3280 "nfs4_renew_lease_thread: no renew: "
3299 3281 "time left %ld", time_left));
3300 3282
3301 3283 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3302 3284 goto die;
3303 3285 continue;
3304 3286 }
3305 3287
3306 3288 tmp_last_renewal_time = sp->last_renewal_time;
3307 3289
3308 3290 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3309 3291 (3 * sp->propagation_delay.tv_sec);
3310 3292
3311 3293 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3312 3294 "nfs4_renew_lease_thread: tmp_time %ld, "
3313 3295 "sp->last_renewal_time %ld", tmp_time,
3314 3296 sp->last_renewal_time));
3315 3297
3316 3298 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3317 3299
3318 3300 tick_delay = SEC_TO_TICK(kip_secs);
3319 3301
3320 3302 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3321 3303 "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3322 3304 "secs", kip_secs));
3323 3305
3324 3306 mutex_enter(&cpr_lock);
3325 3307 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3326 3308 mutex_exit(&cpr_lock);
3327 3309 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3328 3310 tick_delay, TR_CLOCK_TICK);
3329 3311 mutex_enter(&cpr_lock);
3330 3312 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3331 3313 mutex_exit(&cpr_lock);
3332 3314
3333 3315 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3334 3316 "nfs4_renew_lease_thread: valid lease: time left %ld :"
3335 3317 "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3336 3318 "tmp_last_renewal_time %ld", time_left,
3337 3319 sp->last_renewal_time, nfs4_client_resumed,
3338 3320 tmp_last_renewal_time));
3339 3321
3340 3322 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3341 3323 goto die;
3342 3324
3343 3325 if (tmp_last_renewal_time == sp->last_renewal_time ||
3344 3326 (nfs4_client_resumed != 0 &&
3345 3327 nfs4_client_resumed > sp->last_renewal_time)) {
3346 3328 /*
3347 3329 * Issue RENEW op since we haven't renewed the lease
3348 3330 * since we slept.
3349 3331 */
3350 3332 tmp_now_time = gethrestime_sec();
3351 3333 error = nfs4renew(sp);
3352 3334 /*
3353 3335 * Need to re-acquire sp's lock, nfs4renew()
3354 3336 			 * relinquishes it.
3355 3337 */
3356 3338 mutex_enter(&sp->s_lock);
3357 3339
3358 3340 /*
3359 3341 * See if someone changed s_thread_exit while we gave
3360 3342 * up s_lock.
3361 3343 */
3362 3344 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3363 3345 goto die;
3364 3346
3365 3347 if (!error) {
3366 3348 /*
3367 3349 * check to see if we implicitly renewed while
3368 3350 * we waited for a reply for our RENEW call.
3369 3351 */
3370 3352 if (tmp_last_renewal_time ==
3371 3353 sp->last_renewal_time) {
3372 3354 /* no implicit renew came */
3373 3355 sp->last_renewal_time = tmp_now_time;
3374 3356 } else {
3375 3357 NFS4_DEBUG(nfs4_client_lease_debug,
3376 3358 (CE_NOTE, "renew_thread: did "
3377 3359 "implicit renewal before reply "
3378 3360 "from server for RENEW"));
3379 3361 }
3380 3362 } else {
3381 3363 /* figure out error */
3382 3364 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3383 3365 "renew_thread: nfs4renew returned error"
3384 3366 " %d", error));
3385 3367 }
3386 3368
3387 3369 }
3388 3370 }
3389 3371
3390 3372 die:
3391 3373 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3392 3374 "nfs4_renew_lease_thread: thread exiting"));
3393 3375
3394 3376 while (sp->s_otw_call_count != 0) {
3395 3377 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3396 3378 "nfs4_renew_lease_thread: waiting for outstanding "
3397 3379 "otw calls to finish for sp 0x%p, current "
3398 3380 "s_otw_call_count %d", (void *)sp,
3399 3381 sp->s_otw_call_count));
3400 3382 mutex_enter(&cpr_lock);
3401 3383 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3402 3384 mutex_exit(&cpr_lock);
3403 3385 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3404 3386 mutex_enter(&cpr_lock);
3405 3387 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3406 3388 mutex_exit(&cpr_lock);
3407 3389 }
3408 3390 mutex_exit(&sp->s_lock);
3409 3391
3410 3392 nfs4_server_rele(sp); /* free the thread's reference */
3411 3393 nfs4_server_rele(sp); /* free the list's reference */
3412 3394 sp = NULL;
3413 3395
3414 3396 done:
3415 3397 mutex_enter(&cpr_lock);
3416 3398 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3417 3399 mutex_destroy(&cpr_lock);
3418 3400
3419 3401 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3420 3402 "nfs4_renew_lease_thread: renew thread exit officially"));
3421 3403
3422 3404 zthread_exit();
3423 3405 /* NOT REACHED */
3424 3406 }
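
The sleep interval the renew thread picks above is half the server's lease time, minus the time already elapsed since the last renewal and three times the measured RENEW propagation delay, clamped to a minimum of one second. The standalone sketch below reproduces that arithmetic with assumed values (a 90-second lease, a 1-second propagation delay, and a renewal that just happened are illustrative only, not values taken from this change):

    #include <stdio.h>

    #define MAX(a, b)   ((a) > (b) ? (a) : (b))

    int main(void)
    {
        long s_lease_time = 90;   /* assumed lease duration granted by the server, seconds */
        long prop_delay = 1;      /* assumed propagation_delay.tv_sec, seconds */
        long elapsed = 0;         /* assumed: the lease was just renewed */

        /* same expression the renew thread uses to choose its sleep time */
        long kip_secs = MAX((s_lease_time >> 1) - (elapsed + 3 * prop_delay), 1);

        printf("renew thread sleeps for %ld seconds\n", kip_secs);   /* prints 42 */
        return 0;
    }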
3425 3407
3426 3408 /*
3427 3409 * Send out a RENEW op to the server.
3428 3410 * Assumes sp is locked down.
3429 3411 */
3430 3412 static int
3431 3413 nfs4renew(nfs4_server_t *sp)
3432 3414 {
3433 3415 COMPOUND4args_clnt args;
3434 3416 COMPOUND4res_clnt res;
3435 3417 nfs_argop4 argop[1];
3436 3418 int doqueue = 1;
3437 3419 int rpc_error;
3438 3420 cred_t *cr;
3439 3421 mntinfo4_t *mi;
3440 3422 timespec_t prop_time, after_time;
3441 3423 int needrecov = FALSE;
3442 3424 nfs4_recov_state_t recov_state;
3443 3425 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3444 3426
3445 3427 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3446 3428
3447 3429 recov_state.rs_flags = 0;
3448 3430 recov_state.rs_num_retry_despite_err = 0;
3449 3431
3450 3432 recov_retry:
3451 3433 mi = sp->mntinfo4_list;
3452 3434 VFS_HOLD(mi->mi_vfsp);
3453 3435 mutex_exit(&sp->s_lock);
3454 3436 ASSERT(mi != NULL);
3455 3437
3456 3438 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3457 3439 if (e.error) {
3458 3440 VFS_RELE(mi->mi_vfsp);
3459 3441 return (e.error);
3460 3442 }
3461 3443
3462 3444 /* Check to see if we're dealing with a marked-dead sp */
3463 3445 mutex_enter(&sp->s_lock);
3464 3446 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3465 3447 mutex_exit(&sp->s_lock);
3466 3448 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3467 3449 VFS_RELE(mi->mi_vfsp);
3468 3450 return (0);
3469 3451 }
3470 3452
3471 3453 /* Make sure mi hasn't changed on us */
3472 3454 if (mi != sp->mntinfo4_list) {
3473 3455 /* Must drop sp's lock to avoid a recursive mutex enter */
3474 3456 mutex_exit(&sp->s_lock);
3475 3457 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3476 3458 VFS_RELE(mi->mi_vfsp);
3477 3459 mutex_enter(&sp->s_lock);
3478 3460 goto recov_retry;
3479 3461 }
3480 3462 mutex_exit(&sp->s_lock);
3481 3463
3482 3464 args.ctag = TAG_RENEW;
3483 3465
3484 3466 args.array_len = 1;
3485 3467 args.array = argop;
3486 3468
3487 3469 argop[0].argop = OP_RENEW;
3488 3470
3489 3471 mutex_enter(&sp->s_lock);
3490 3472 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3491 3473 cr = sp->s_cred;
3492 3474 crhold(cr);
3493 3475 mutex_exit(&sp->s_lock);
3494 3476
3495 3477 ASSERT(cr != NULL);
3496 3478
3497 3479 /* used to figure out RTT for sp */
3498 3480 gethrestime(&prop_time);
3499 3481
3500 3482 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3501 3483 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3502 3484 (void*)sp));
3503 3485 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3504 3486 prop_time.tv_sec, prop_time.tv_nsec));
3505 3487
3506 3488 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3507 3489 mntinfo4_t *, mi);
3508 3490
3509 3491 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3510 3492 crfree(cr);
3511 3493
3512 3494 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3513 3495 mntinfo4_t *, mi);
3514 3496
3515 3497 gethrestime(&after_time);
3516 3498
3517 3499 mutex_enter(&sp->s_lock);
3518 3500 sp->propagation_delay.tv_sec =
3519 3501 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3520 3502 mutex_exit(&sp->s_lock);
3521 3503
3522 3504 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3523 3505 after_time.tv_sec, after_time.tv_nsec));
3524 3506
3525 3507 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3526 3508 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3527 3509 nfs4_delegreturn_all(sp);
3528 3510 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3529 3511 VFS_RELE(mi->mi_vfsp);
3530 3512 /*
3531 3513 * If the server returns CB_PATH_DOWN, it has renewed
3532 3514 * the lease and informed us that the callback path is
3533 3515 * down. Since the lease is renewed, just return 0 and
3534 3516 * let the renew thread proceed as normal.
3535 3517 */
3536 3518 return (0);
3537 3519 }
3538 3520
3539 3521 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3540 3522 if (!needrecov && e.error) {
3541 3523 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3542 3524 VFS_RELE(mi->mi_vfsp);
3543 3525 return (e.error);
3544 3526 }
3545 3527
3546 3528 rpc_error = e.error;
3547 3529
3548 3530 if (needrecov) {
3549 3531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3550 3532 "nfs4renew: initiating recovery\n"));
3551 3533
3552 3534 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3553 3535 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3554 3536 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3555 3537 VFS_RELE(mi->mi_vfsp);
3556 3538 if (!e.error)
3557 3539 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3558 3540 mutex_enter(&sp->s_lock);
3559 3541 goto recov_retry;
3560 3542 }
3561 3543 /* fall through for res.status case */
3562 3544 }
3563 3545
3564 3546 if (res.status) {
3565 3547 if (res.status == NFS4ERR_LEASE_MOVED) {
3566 3548 /*EMPTY*/
3567 3549 /*
3568 3550 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3569 3551 * to renew the lease on that server
3570 3552 */
3571 3553 }
3572 3554 e.error = geterrno4(res.status);
3573 3555 }
3574 3556
3575 3557 if (!rpc_error)
3576 3558 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3577 3559
3578 3560 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3579 3561
3580 3562 VFS_RELE(mi->mi_vfsp);
3581 3563
3582 3564 return (e.error);
3583 3565 }

3584 3566
3585 3567 void
3586 3568 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3587 3569 {
3588 3570 nfs4_server_t *sp;
3589 3571
3590 3572 /* this locks down sp if it is found */
3591 3573 sp = find_nfs4_server(mi);
3592 3574
3593 3575 if (sp != NULL) {
3594 3576 nfs4_inc_state_ref_count_nolock(sp, mi);
3595 3577 mutex_exit(&sp->s_lock);
3596 3578 nfs4_server_rele(sp);
3597 3579 }
3598 3580 }
3599 3581
3600 3582 /*
3601 3583  * Bump the number of OPEN files (i.e., those with state) so we know if this
3602 3584 * nfs4_server has any state to maintain a lease for or not.
3603 3585 *
3604 3586 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3605 3587 */
3606 3588 void
3607 3589 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3608 3590 {
3609 3591 ASSERT(mutex_owned(&sp->s_lock));
3610 3592
3611 3593 sp->state_ref_count++;
3612 3594 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3613 3595 "nfs4_inc_state_ref_count: state_ref_count now %d",
3614 3596 sp->state_ref_count));
3615 3597
3616 3598 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3617 3599 sp->lease_valid = NFS4_LEASE_VALID;
3618 3600
3619 3601 /*
3620 3602 * If this call caused the lease to be marked valid and/or
3621 3603 	 * took the state_ref_count from 0 to 1, then start the clock
3622 3604 	 * on lease renewal.
3623 3605 */
3624 3606 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3625 3607 sp->last_renewal_time = gethrestime_sec();
3626 3608
3627 3609 /* update the number of open files for mi */
3628 3610 mi->mi_open_files++;
3629 3611 }
3630 3612
3631 3613 void
3632 3614 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3633 3615 {
3634 3616 nfs4_server_t *sp;
3635 3617
3636 3618 /* this locks down sp if it is found */
3637 3619 sp = find_nfs4_server_all(mi, 1);
3638 3620
3639 3621 if (sp != NULL) {
3640 3622 nfs4_dec_state_ref_count_nolock(sp, mi);
3641 3623 mutex_exit(&sp->s_lock);
3642 3624 nfs4_server_rele(sp);
3643 3625 }
3644 3626 }
3645 3627
3646 3628 /*
3647 3629  * Decrement the number of OPEN files (i.e., those with state) so we know if
3648 3630 * this nfs4_server has any state to maintain a lease for or not.
3649 3631 */
3650 3632 void
3651 3633 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3652 3634 {
3653 3635 ASSERT(mutex_owned(&sp->s_lock));
3654 3636 ASSERT(sp->state_ref_count != 0);
3655 3637 sp->state_ref_count--;
3656 3638
3657 3639 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3658 3640 "nfs4_dec_state_ref_count: state ref count now %d",
3659 3641 sp->state_ref_count));
3660 3642
3661 3643 mi->mi_open_files--;
3662 3644 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3663 3645 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3664 3646 mi->mi_open_files, mi->mi_flags));
3665 3647
3666 3648 /* We don't have to hold the mi_lock to test mi_flags */
3667 3649 if (mi->mi_open_files == 0 &&
3668 3650 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3669 3651 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3670 3652 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3671 3653 "we have closed the last open file", (void*)mi));
3672 3654 nfs4_remove_mi_from_server(mi, sp);
3673 3655 }
3674 3656 }
3675 3657
3676 3658 bool_t
3677 3659 inlease(nfs4_server_t *sp)
3678 3660 {
3679 3661 bool_t result;
3680 3662
3681 3663 ASSERT(mutex_owned(&sp->s_lock));
3682 3664
3683 3665 if (sp->lease_valid == NFS4_LEASE_VALID &&
3684 3666 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3685 3667 result = TRUE;
3686 3668 else
3687 3669 result = FALSE;
3688 3670
3689 3671 return (result);
3690 3672 }
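
inlease() is the client's local view of lease validity: given a lease already marked NFS4_LEASE_VALID, the current time must still fall before last_renewal_time plus the server's lease duration. A small sketch of that comparison with assumed timestamps (the 30-second age and 90-second lease are illustrative):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        time_t now = time(NULL);
        time_t last_renewal_time = now - 30;   /* assumed: last renewed 30 seconds ago */
        long s_lease_time = 90;                /* assumed lease duration, seconds */

        /* same comparison inlease() makes */
        if (now < last_renewal_time + s_lease_time)
            printf("still within the lease\n");
        else
            printf("lease has expired locally\n");
        return 0;
    }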
3691 3673
3692 3674
3693 3675 /*
3694 3676 * Return non-zero if the given nfs4_server_t is going through recovery.
3695 3677 */
3696 3678
3697 3679 int
3698 3680 nfs4_server_in_recovery(nfs4_server_t *sp)
3699 3681 {
3700 3682 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3701 3683 }
3702 3684
3703 3685 /*
3704 3686 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3705 3687 * first is less than, equal to, or greater than the second.
3706 3688 */
3707 3689
3708 3690 int
3709 3691 sfh4cmp(const void *p1, const void *p2)
3710 3692 {
3711 3693 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3712 3694 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3713 3695
3714 3696 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3715 3697 }
3716 3698
3717 3699 /*
3718 3700 * Create a table for shared filehandle objects.
3719 3701 */
3720 3702
3721 3703 void
3722 3704 sfh4_createtab(avl_tree_t *tab)
3723 3705 {
3724 3706 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3725 3707 offsetof(nfs4_sharedfh_t, sfh_tree));
3726 3708 }
3727 3709
3728 3710 /*
3729 3711 * Return a shared filehandle object for the given filehandle. The caller
3730 3712 * is responsible for eventually calling sfh4_rele().
3731 3713 */
3732 3714
3733 3715 nfs4_sharedfh_t *
3734 3716 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3735 3717 {
3736 3718 nfs4_sharedfh_t *sfh, *nsfh;
3737 3719 avl_index_t where;
3738 3720 nfs4_sharedfh_t skey;
3739 3721
3740 3722 if (!key) {
3741 3723 skey.sfh_fh = *fh;
3742 3724 key = &skey;
3743 3725 }
3744 3726
3745 3727 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3746 3728 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3747 3729 /*
3748 3730 * We allocate the largest possible filehandle size because it's
3749 3731 * not that big, and it saves us from possibly having to resize the
3750 3732 * buffer later.
3751 3733 */
3752 3734 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3753 3735 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3754 3736 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3755 3737 nsfh->sfh_refcnt = 1;
3756 3738 nsfh->sfh_flags = SFH4_IN_TREE;
3757 3739 nsfh->sfh_mi = mi;
3758 3740 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3759 3741 (void *)nsfh));
3760 3742
3761 3743 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3762 3744 sfh = avl_find(&mi->mi_filehandles, key, &where);
3763 3745 if (sfh != NULL) {
3764 3746 mutex_enter(&sfh->sfh_lock);
3765 3747 sfh->sfh_refcnt++;
3766 3748 mutex_exit(&sfh->sfh_lock);
3767 3749 nfs_rw_exit(&mi->mi_fh_lock);
3768 3750 /* free our speculative allocs */
3769 3751 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3770 3752 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3771 3753 return (sfh);
3772 3754 }
3773 3755
3774 3756 avl_insert(&mi->mi_filehandles, nsfh, where);
3775 3757 nfs_rw_exit(&mi->mi_fh_lock);
3776 3758
3777 3759 return (nsfh);
3778 3760 }
3779 3761
3780 3762 /*
3781 3763 * Return a shared filehandle object for the given filehandle. The caller
3782 3764 * is responsible for eventually calling sfh4_rele().
3783 3765 */
3784 3766
3785 3767 nfs4_sharedfh_t *
3786 3768 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3787 3769 {
3788 3770 nfs4_sharedfh_t *sfh;
3789 3771 nfs4_sharedfh_t key;
3790 3772
3791 3773 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3792 3774
3793 3775 #ifdef DEBUG
3794 3776 if (nfs4_sharedfh_debug) {
3795 3777 nfs4_fhandle_t fhandle;
3796 3778
3797 3779 fhandle.fh_len = fh->nfs_fh4_len;
3798 3780 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3799 3781 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3800 3782 nfs4_printfhandle(&fhandle);
3801 3783 }
3802 3784 #endif
3803 3785
3804 3786 /*
3805 3787 * If there's already an object for the given filehandle, bump the
3806 3788 * reference count and return it. Otherwise, create a new object
3807 3789 * and add it to the AVL tree.
3808 3790 */
3809 3791
3810 3792 key.sfh_fh = *fh;
3811 3793
3812 3794 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3813 3795 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3814 3796 if (sfh != NULL) {
3815 3797 mutex_enter(&sfh->sfh_lock);
3816 3798 sfh->sfh_refcnt++;
3817 3799 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3818 3800 "sfh4_get: found existing %p, new refcnt=%d",
3819 3801 (void *)sfh, sfh->sfh_refcnt));
3820 3802 mutex_exit(&sfh->sfh_lock);
3821 3803 nfs_rw_exit(&mi->mi_fh_lock);
3822 3804 return (sfh);
3823 3805 }
3824 3806 nfs_rw_exit(&mi->mi_fh_lock);
3825 3807
3826 3808 return (sfh4_put(fh, mi, &key));
3827 3809 }
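
As the comments above describe, sfh4_get() either finds an existing shared filehandle object and bumps its reference count, or falls through to sfh4_put() to insert a new one; every successful call must eventually be balanced by sfh4_rele(). A minimal caller-side sketch follows, assuming a filehandle fh and a mount mi that already exist in the caller (both hypothetical); it is only meaningful inside the kernel and is not a runnable program.

    /* Illustrative fragment only; fh (nfs_fh4) and mi are assumed to be set up by the caller. */
    nfs4_sharedfh_t *sfh;

    sfh = sfh4_get(&fh, mi);    /* find-or-create; returns with one reference held */
    /* ... use sfh->sfh_fh while the reference is held ... */
    sfh4_rele(&sfh);            /* drop the reference; sfh is NULLed out for us */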
3828 3810
3829 3811 /*
3830 3812 * Get a reference to the given shared filehandle object.
3831 3813 */
3832 3814
3833 3815 void
3834 3816 sfh4_hold(nfs4_sharedfh_t *sfh)
3835 3817 {
3836 3818 ASSERT(sfh->sfh_refcnt > 0);
3837 3819
3838 3820 mutex_enter(&sfh->sfh_lock);
3839 3821 sfh->sfh_refcnt++;
3840 3822 NFS4_DEBUG(nfs4_sharedfh_debug,
3841 3823 (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3842 3824 (void *)sfh, sfh->sfh_refcnt));
3843 3825 mutex_exit(&sfh->sfh_lock);
3844 3826 }
3845 3827
3846 3828 /*
3847 3829 * Release a reference to the given shared filehandle object and null out
3848 3830 * the given pointer.
3849 3831 */
3850 3832
3851 3833 void
3852 3834 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3853 3835 {
3854 3836 mntinfo4_t *mi;
3855 3837 nfs4_sharedfh_t *sfh = *sfhpp;
3856 3838
3857 3839 ASSERT(sfh->sfh_refcnt > 0);
3858 3840
3859 3841 mutex_enter(&sfh->sfh_lock);
3860 3842 if (sfh->sfh_refcnt > 1) {
3861 3843 sfh->sfh_refcnt--;
3862 3844 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3863 3845 "sfh4_rele %p, new refcnt=%d",
3864 3846 (void *)sfh, sfh->sfh_refcnt));
3865 3847 mutex_exit(&sfh->sfh_lock);
3866 3848 goto finish;
3867 3849 }
3868 3850 mutex_exit(&sfh->sfh_lock);
3869 3851
3870 3852 /*
3871 3853 * Possibly the last reference, so get the lock for the table in
3872 3854 * case it's time to remove the object from the table.
3873 3855 */
3874 3856 mi = sfh->sfh_mi;
3875 3857 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3876 3858 mutex_enter(&sfh->sfh_lock);
3877 3859 sfh->sfh_refcnt--;
3878 3860 if (sfh->sfh_refcnt > 0) {
3879 3861 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3880 3862 "sfh4_rele %p, new refcnt=%d",
3881 3863 (void *)sfh, sfh->sfh_refcnt));
3882 3864 mutex_exit(&sfh->sfh_lock);
3883 3865 nfs_rw_exit(&mi->mi_fh_lock);
3884 3866 goto finish;
3885 3867 }
3886 3868
3887 3869 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3888 3870 "sfh4_rele %p, last ref", (void *)sfh));
3889 3871 if (sfh->sfh_flags & SFH4_IN_TREE) {
3890 3872 avl_remove(&mi->mi_filehandles, sfh);
3891 3873 sfh->sfh_flags &= ~SFH4_IN_TREE;
3892 3874 }
3893 3875 mutex_exit(&sfh->sfh_lock);
3894 3876 nfs_rw_exit(&mi->mi_fh_lock);
3895 3877 mutex_destroy(&sfh->sfh_lock);
3896 3878 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3897 3879 kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3898 3880
3899 3881 finish:
3900 3882 *sfhpp = NULL;
3901 3883 }
3902 3884
3903 3885 /*
3904 3886 * Update the filehandle for the given shared filehandle object.
3905 3887 */
3906 3888
3907 3889 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */
3908 3890
3909 3891 void
3910 3892 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3911 3893 {
3912 3894 mntinfo4_t *mi = sfh->sfh_mi;
3913 3895 nfs4_sharedfh_t *dupsfh;
3914 3896 avl_index_t where;
3915 3897 nfs4_sharedfh_t key;
3916 3898
3917 3899 #ifdef DEBUG
3918 3900 mutex_enter(&sfh->sfh_lock);
3919 3901 ASSERT(sfh->sfh_refcnt > 0);
3920 3902 mutex_exit(&sfh->sfh_lock);
3921 3903 #endif
3922 3904 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3923 3905
3924 3906 /*
3925 3907 * The basic plan is to remove the shared filehandle object from
3926 3908 * the table, update it to have the new filehandle, then reinsert
3927 3909 * it.
3928 3910 */
3929 3911
3930 3912 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3931 3913 mutex_enter(&sfh->sfh_lock);
3932 3914 if (sfh->sfh_flags & SFH4_IN_TREE) {
3933 3915 avl_remove(&mi->mi_filehandles, sfh);
3934 3916 sfh->sfh_flags &= ~SFH4_IN_TREE;
3935 3917 }
3936 3918 mutex_exit(&sfh->sfh_lock);
3937 3919 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3938 3920 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3939 3921 sfh->sfh_fh.nfs_fh4_len);
3940 3922
3941 3923 /*
3942 3924 * XXX If there is already a shared filehandle object with the new
3943 3925 * filehandle, we're in trouble, because the rnode code assumes
3944 3926 * that there is only one shared filehandle object for a given
3945 3927 * filehandle. So issue a warning (for read-write mounts only)
3946 3928 * and don't try to re-insert the given object into the table.
3947 3929 * Hopefully the given object will quickly go away and everyone
3948 3930 * will use the new object.
3949 3931 */
3950 3932 key.sfh_fh = *newfh;
3951 3933 dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3952 3934 if (dupsfh != NULL) {
3953 3935 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3954 3936 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3955 3937 "duplicate filehandle detected");
3956 3938 sfh4_printfhandle(dupsfh);
3957 3939 }
3958 3940 } else {
3959 3941 avl_insert(&mi->mi_filehandles, sfh, where);
3960 3942 mutex_enter(&sfh->sfh_lock);
3961 3943 sfh->sfh_flags |= SFH4_IN_TREE;
3962 3944 mutex_exit(&sfh->sfh_lock);
3963 3945 }
3964 3946 nfs_rw_exit(&mi->mi_fh_lock);
3965 3947 }
3966 3948
3967 3949 /*
3968 3950 * Copy out the current filehandle for the given shared filehandle object.
3969 3951 */
3970 3952
3971 3953 void
3972 3954 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3973 3955 {
3974 3956 mntinfo4_t *mi = sfh->sfh_mi;
3975 3957
3976 3958 ASSERT(sfh->sfh_refcnt > 0);
3977 3959
3978 3960 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3979 3961 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3980 3962 ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3981 3963 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3982 3964 nfs_rw_exit(&mi->mi_fh_lock);
3983 3965 }
3984 3966
3985 3967 /*
3986 3968 * Print out the filehandle for the given shared filehandle object.
3987 3969 */
3988 3970
3989 3971 void
3990 3972 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3991 3973 {
3992 3974 nfs4_fhandle_t fhandle;
3993 3975
3994 3976 sfh4_copyval(sfh, &fhandle);
3995 3977 nfs4_printfhandle(&fhandle);
3996 3978 }
3997 3979
3998 3980 /*
3999 3981 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
4000 3982 * if they're the same, +1 if the first is "greater" than the second. The
4001 3983 * caller (or whoever's calling the AVL package) is responsible for
4002 3984 * handling locking issues.
4003 3985 */
4004 3986
4005 3987 static int
4006 3988 fncmp(const void *p1, const void *p2)
4007 3989 {
4008 3990 const nfs4_fname_t *f1 = p1;
4009 3991 const nfs4_fname_t *f2 = p2;
4010 3992 int res;
4011 3993
4012 3994 res = strcmp(f1->fn_name, f2->fn_name);
4013 3995 /*
4014 3996 * The AVL package wants +/-1, not arbitrary positive or negative
4015 3997 * integers.
4016 3998 */
4017 3999 if (res > 0)
4018 4000 res = 1;
4019 4001 else if (res < 0)
4020 4002 res = -1;
4021 4003 return (res);
4022 4004 }
4023 4005
4024 4006 /*
4025 4007 * Get or create an fname with the given name, as a child of the given
4026 4008 * fname. The caller is responsible for eventually releasing the reference
4027 4009 * (fn_rele()). parent may be NULL.
4028 4010 */
4029 4011
4030 4012 nfs4_fname_t *
4031 4013 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4032 4014 {
4033 4015 nfs4_fname_t key;
4034 4016 nfs4_fname_t *fnp;
4035 4017 avl_index_t where;
4036 4018
4037 4019 key.fn_name = name;
4038 4020
4039 4021 /*
4040 4022 * If there's already an fname registered with the given name, bump
4041 4023 * its reference count and return it. Otherwise, create a new one
4042 4024 * and add it to the parent's AVL tree.
4043 4025 *
4044 4026 * fname entries we are looking for should match both name
4045 4027 * and sfh stored in the fname.
4046 4028 */
4047 4029 again:
4048 4030 if (parent != NULL) {
4049 4031 mutex_enter(&parent->fn_lock);
4050 4032 fnp = avl_find(&parent->fn_children, &key, &where);
4051 4033 if (fnp != NULL) {
4052 4034 /*
4053 4035 			 * This hold on fnp is released below if this
4054 4036 			 * turns out not to be the fnp we want.
4055 4037 */
4056 4038 fn_hold(fnp);
4057 4039
4058 4040 if (fnp->fn_sfh == sfh) {
4059 4041 /*
4060 4042 				 * We have found our entry;
4061 4043 				 * return it with the hold taken above.
4062 4044 */
4063 4045 mutex_exit(&parent->fn_lock);
4064 4046 return (fnp);
4065 4047 }
4066 4048
4067 4049 /*
4068 4050 * We have found an entry that has a mismatching
4069 4051 * fn_sfh. This could be a stale entry due to
4070 4052 * server side rename. We will remove this entry
4071 4053 * and make sure no such entries exist.
4072 4054 */
4073 4055 mutex_exit(&parent->fn_lock);
4074 4056 mutex_enter(&fnp->fn_lock);
4075 4057 if (fnp->fn_parent == parent) {
4076 4058 /*
4077 4059 * Remove ourselves from parent's
4078 4060 * fn_children tree.
4079 4061 */
4080 4062 mutex_enter(&parent->fn_lock);
4081 4063 avl_remove(&parent->fn_children, fnp);
4082 4064 mutex_exit(&parent->fn_lock);
4083 4065 fn_rele(&fnp->fn_parent);
4084 4066 }
4085 4067 mutex_exit(&fnp->fn_lock);
4086 4068 fn_rele(&fnp);
4087 4069 goto again;
4088 4070 }
4089 4071 }
4090 4072
4091 4073 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4092 4074 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4093 4075 fnp->fn_parent = parent;
4094 4076 if (parent != NULL)
4095 4077 fn_hold(parent);
4096 4078 fnp->fn_len = strlen(name);
4097 4079 ASSERT(fnp->fn_len < MAXNAMELEN);
4098 4080 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4099 4081 (void) strcpy(fnp->fn_name, name);
4100 4082 fnp->fn_refcnt = 1;
4101 4083
4102 4084 /*
4103 4085 * This hold on sfh is later released
4104 4086 * when we do the final fn_rele() on this fname.
4105 4087 */
4106 4088 sfh4_hold(sfh);
4107 4089 fnp->fn_sfh = sfh;
4108 4090
4109 4091 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4110 4092 offsetof(nfs4_fname_t, fn_tree));
4111 4093 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4112 4094 "fn_get %p:%s, a new nfs4_fname_t!",
4113 4095 (void *)fnp, fnp->fn_name));
4114 4096 if (parent != NULL) {
4115 4097 avl_insert(&parent->fn_children, fnp, where);
4116 4098 mutex_exit(&parent->fn_lock);
4117 4099 }
4118 4100
4119 4101 return (fnp);
4120 4102 }
4121 4103
4122 4104 void
4123 4105 fn_hold(nfs4_fname_t *fnp)
4124 4106 {
4125 4107 atomic_inc_32(&fnp->fn_refcnt);
4126 4108 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4127 4109 "fn_hold %p:%s, new refcnt=%d",
4128 4110 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4129 4111 }
4130 4112
4131 4113 /*
4132 4114 * Decrement the reference count of the given fname, and destroy it if its
4133 4115 * reference count goes to zero. Nulls out the given pointer.
4134 4116 */
4135 4117
4136 4118 void
4137 4119 fn_rele(nfs4_fname_t **fnpp)
4138 4120 {
4139 4121 nfs4_fname_t *parent;
4140 4122 uint32_t newref;
4141 4123 nfs4_fname_t *fnp;
4142 4124
4143 4125 recur:
4144 4126 fnp = *fnpp;
4145 4127 *fnpp = NULL;
4146 4128
4147 4129 mutex_enter(&fnp->fn_lock);
4148 4130 parent = fnp->fn_parent;
4149 4131 if (parent != NULL)
4150 4132 mutex_enter(&parent->fn_lock); /* prevent new references */
4151 4133 newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4152 4134 if (newref > 0) {
4153 4135 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4154 4136 "fn_rele %p:%s, new refcnt=%d",
4155 4137 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4156 4138 if (parent != NULL)
4157 4139 mutex_exit(&parent->fn_lock);
4158 4140 mutex_exit(&fnp->fn_lock);
4159 4141 return;
4160 4142 }
4161 4143
4162 4144 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4163 4145 "fn_rele %p:%s, last reference, deleting...",
4164 4146 (void *)fnp, fnp->fn_name));
4165 4147 if (parent != NULL) {
4166 4148 avl_remove(&parent->fn_children, fnp);
4167 4149 mutex_exit(&parent->fn_lock);
4168 4150 }
4169 4151 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4170 4152 sfh4_rele(&fnp->fn_sfh);
4171 4153 mutex_destroy(&fnp->fn_lock);
4172 4154 avl_destroy(&fnp->fn_children);
4173 4155 kmem_free(fnp, sizeof (nfs4_fname_t));
4174 4156 /*
4175 4157 	 * Recursively fn_rele the parent.
4176 4158 * Use goto instead of a recursive call to avoid stack overflow.
4177 4159 */
4178 4160 if (parent != NULL) {
4179 4161 fnpp = &parent;
4180 4162 goto recur;
4181 4163 }
4182 4164 }
4183 4165
4184 4166 /*
4185 4167 * Returns the single component name of the given fname, in a MAXNAMELEN
4186 4168 * string buffer, which the caller is responsible for freeing. Note that
4187 4169 * the name may become invalid as a result of fn_move().
4188 4170 */
4189 4171
4190 4172 char *
4191 4173 fn_name(nfs4_fname_t *fnp)
4192 4174 {
4193 4175 char *name;
4194 4176
4195 4177 ASSERT(fnp->fn_len < MAXNAMELEN);
4196 4178 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4197 4179 mutex_enter(&fnp->fn_lock);
4198 4180 (void) strcpy(name, fnp->fn_name);
4199 4181 mutex_exit(&fnp->fn_lock);
4200 4182
4201 4183 return (name);
4202 4184 }
4203 4185
4204 4186
4205 4187 /*
4206 4188 * fn_path_realloc
4207 4189 *
4208 4190 * This function, used only by fn_path, constructs
4209 4191  * a new string which looks like "prepend" + "/" + "current",
4210 4192  * by allocating a new string and freeing the old one.
4211 4193 */
4212 4194 static void
4213 4195 fn_path_realloc(char **curses, char *prepend)
4214 4196 {
4215 4197 int len, curlen = 0;
4216 4198 char *news;
4217 4199
4218 4200 if (*curses == NULL) {
4219 4201 /*
4220 4202 * Prime the pump, allocate just the
4221 4203 * space for prepend and return that.
4222 4204 */
4223 4205 len = strlen(prepend) + 1;
4224 4206 news = kmem_alloc(len, KM_SLEEP);
4225 4207 (void) strncpy(news, prepend, len);
4226 4208 } else {
4227 4209 /*
4228 4210 * Allocate the space for a new string
4229 4211 * +1 +1 is for the "/" and the NULL
4230 4212 * byte at the end of it all.
4231 4213 */
4232 4214 curlen = strlen(*curses);
4233 4215 len = curlen + strlen(prepend) + 1 + 1;
4234 4216 news = kmem_alloc(len, KM_SLEEP);
4235 4217 (void) strncpy(news, prepend, len);
4236 4218 (void) strcat(news, "/");
4237 4219 (void) strcat(news, *curses);
4238 4220 kmem_free(*curses, curlen + 1);
4239 4221 }
4240 4222 *curses = news;
4241 4223 }
4242 4224
4243 4225 /*
4244 4226 * Returns the path name (starting from the fs root) for the given fname.
4245 4227 * The caller is responsible for freeing. Note that the path may be or
4246 4228 * become invalid as a result of fn_move().
4247 4229 */
4248 4230
4249 4231 char *
4250 4232 fn_path(nfs4_fname_t *fnp)
4251 4233 {
4252 4234 char *path;
4253 4235 nfs4_fname_t *nextfnp;
4254 4236
4255 4237 if (fnp == NULL)
4256 4238 return (NULL);
4257 4239
4258 4240 path = NULL;
4259 4241
4260 4242 /* walk up the tree constructing the pathname. */
4261 4243
4262 4244 fn_hold(fnp); /* adjust for later rele */
4263 4245 do {
4264 4246 mutex_enter(&fnp->fn_lock);
4265 4247 /*
4266 4248 * Add fn_name in front of the current path
4267 4249 */
4268 4250 fn_path_realloc(&path, fnp->fn_name);
4269 4251 nextfnp = fnp->fn_parent;
4270 4252 if (nextfnp != NULL)
4271 4253 fn_hold(nextfnp);
4272 4254 mutex_exit(&fnp->fn_lock);
4273 4255 fn_rele(&fnp);
4274 4256 fnp = nextfnp;
4275 4257 } while (fnp != NULL);
4276 4258
4277 4259 return (path);
4278 4260 }
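
fn_get(), fn_path() and fn_rele() together maintain the client's name cache: each fname holds a reference on its parent, and fn_path() walks that chain upward, prepending one component per step. A caller-side sketch, assuming an existing root fname rootfn and shared filehandles dirsfh/filesfh (all hypothetical); like the sfh4 fragment above, it is only meaningful inside the kernel.

    /* Illustrative fragment only; rootfn, dirsfh and filesfh are assumed to exist. */
    nfs4_fname_t *dirfn, *filefn;
    char *path;

    dirfn = fn_get(rootfn, "dir", dirsfh);     /* get-or-create "dir" under the root */
    filefn = fn_get(dirfn, "file", filesfh);   /* get-or-create "file" under "dir" */

    path = fn_path(filefn);                    /* e.g. "<root>/dir/file", built bottom-up */
    /* ... */
    kmem_free(path, strlen(path) + 1);         /* fn_path() allocates; the caller frees */

    fn_rele(&filefn);                          /* drop our references when done */
    fn_rele(&dirfn);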
4279 4261
4280 4262 /*
4281 4263 * Return a reference to the parent of the given fname, which the caller is
4282 4264 * responsible for eventually releasing.
4283 4265 */
4284 4266
4285 4267 nfs4_fname_t *
4286 4268 fn_parent(nfs4_fname_t *fnp)
4287 4269 {
4288 4270 nfs4_fname_t *parent;
4289 4271
4290 4272 mutex_enter(&fnp->fn_lock);
4291 4273 parent = fnp->fn_parent;
4292 4274 if (parent != NULL)
4293 4275 fn_hold(parent);
4294 4276 mutex_exit(&fnp->fn_lock);
4295 4277
4296 4278 return (parent);
4297 4279 }
4298 4280
4299 4281 /*
4300 4282 * Update fnp so that its parent is newparent and its name is newname.
4301 4283 */
4302 4284
4303 4285 void
4304 4286 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4305 4287 {
4306 4288 nfs4_fname_t *parent, *tmpfnp;
4307 4289 ssize_t newlen;
4308 4290 nfs4_fname_t key;
4309 4291 avl_index_t where;
4310 4292
4311 4293 /*
4312 4294 * This assert exists to catch the client trying to rename
4313 4295 * a dir to be a child of itself. This happened at a recent
4314 4296 * bakeoff against a 3rd party (broken) server which allowed
4315 4297 * the rename to succeed. If it trips it means that:
4316 4298 * a) the code in nfs4rename that detects this case is broken
4317 4299 * b) the server is broken (since it allowed the bogus rename)
4318 4300 *
4319 4301 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4320 4302 * panic below from: mutex_enter(&newparent->fn_lock);
4321 4303 */
4322 4304 ASSERT(fnp != newparent);
4323 4305
4324 4306 /*
4325 4307 * Remove fnp from its current parent, change its name, then add it
4326 4308 * to newparent. It might happen that fnp was replaced by another
4327 4309 * nfs4_fname_t with the same fn_name in parent->fn_children.
4328 4310 	 * In such a case, fnp->fn_parent is NULL and we skip the removal
4329 4311 * of fnp from its current parent.
4330 4312 */
4331 4313 mutex_enter(&fnp->fn_lock);
4332 4314 parent = fnp->fn_parent;
4333 4315 if (parent != NULL) {
4334 4316 mutex_enter(&parent->fn_lock);
4335 4317 avl_remove(&parent->fn_children, fnp);
4336 4318 mutex_exit(&parent->fn_lock);
4337 4319 fn_rele(&fnp->fn_parent);
4338 4320 }
4339 4321
4340 4322 newlen = strlen(newname);
4341 4323 if (newlen != fnp->fn_len) {
4342 4324 ASSERT(newlen < MAXNAMELEN);
4343 4325 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4344 4326 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4345 4327 fnp->fn_len = newlen;
4346 4328 }
4347 4329 (void) strcpy(fnp->fn_name, newname);
4348 4330
4349 4331 again:
4350 4332 mutex_enter(&newparent->fn_lock);
4351 4333 key.fn_name = fnp->fn_name;
4352 4334 tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4353 4335 if (tmpfnp != NULL) {
4354 4336 /*
4355 4337 * This could be due to a file that was unlinked while
4356 4338 * open, or perhaps the rnode is in the free list. Remove
4357 4339 * it from newparent and let it go away on its own. The
4358 4340 * contorted code is to deal with lock order issues and
4359 4341 * race conditions.
4360 4342 */
4361 4343 fn_hold(tmpfnp);
4362 4344 mutex_exit(&newparent->fn_lock);
4363 4345 mutex_enter(&tmpfnp->fn_lock);
4364 4346 if (tmpfnp->fn_parent == newparent) {
4365 4347 mutex_enter(&newparent->fn_lock);
4366 4348 avl_remove(&newparent->fn_children, tmpfnp);
4367 4349 mutex_exit(&newparent->fn_lock);
4368 4350 fn_rele(&tmpfnp->fn_parent);
4369 4351 }
4370 4352 mutex_exit(&tmpfnp->fn_lock);
4371 4353 fn_rele(&tmpfnp);
4372 4354 goto again;
4373 4355 }
4374 4356 fnp->fn_parent = newparent;
4375 4357 fn_hold(newparent);
4376 4358 avl_insert(&newparent->fn_children, fnp, where);
4377 4359 mutex_exit(&newparent->fn_lock);
4378 4360 mutex_exit(&fnp->fn_lock);
4379 4361 }
4380 4362
4381 4363 #ifdef DEBUG
4382 4364 /*
4383 4365 * Return non-zero if the type information makes sense for the given vnode.
4384 4366 * Otherwise panic.
4385 4367 */
4386 4368 int
4387 4369 nfs4_consistent_type(vnode_t *vp)
4388 4370 {
4389 4371 rnode4_t *rp = VTOR4(vp);
4390 4372
4391 4373 if (nfs4_vtype_debug && vp->v_type != VNON &&
4392 4374 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4393 4375 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4394 4376 "rnode attr type=%d", (void *)vp, vp->v_type,
4395 4377 rp->r_attr.va_type);
4396 4378 }
4397 4379
4398 4380 return (1);
4399 4381 }
4400 4382 #endif /* DEBUG */
|
↓ open down ↓ |
1320 lines elided |
↑ open up ↑ |