1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2017 by Delphix. All rights reserved.
  25  */
  26 
  27 /*
  28  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29  *      All Rights Reserved
  30  */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/thread.h>
  36 #include <sys/t_lock.h>
  37 #include <sys/time.h>
  38 #include <sys/vnode.h>
  39 #include <sys/vfs.h>
  40 #include <sys/errno.h>
  41 #include <sys/buf.h>
  42 #include <sys/stat.h>
  43 #include <sys/cred.h>
  44 #include <sys/kmem.h>
  45 #include <sys/debug.h>
  46 #include <sys/dnlc.h>
  47 #include <sys/vmsystm.h>
  48 #include <sys/flock.h>
  49 #include <sys/share.h>
  50 #include <sys/cmn_err.h>
  51 #include <sys/tiuser.h>
  52 #include <sys/sysmacros.h>
  53 #include <sys/callb.h>
  54 #include <sys/acl.h>
  55 #include <sys/kstat.h>
  56 #include <sys/signal.h>
  57 #include <sys/disp.h>
  58 #include <sys/atomic.h>
  59 #include <sys/list.h>
  60 #include <sys/sdt.h>
  61 
  62 #include <rpc/types.h>
  63 #include <rpc/xdr.h>
  64 #include <rpc/auth.h>
  65 #include <rpc/clnt.h>
  66 
  67 #include <nfs/nfs.h>
  68 #include <nfs/nfs_clnt.h>
  69 #include <nfs/nfs_acl.h>
  70 
  71 #include <nfs/nfs4.h>
  72 #include <nfs/rnode4.h>
  73 #include <nfs/nfs4_clnt.h>
  74 
  75 #include <vm/hat.h>
  76 #include <vm/as.h>
  77 #include <vm/page.h>
  78 #include <vm/pvn.h>
  79 #include <vm/seg.h>
  80 #include <vm/seg_map.h>
  81 #include <vm/seg_vn.h>
  82 
  83 #include <sys/ddi.h>
  84 
  85 /*
  86  * Arguments to page-flush thread.
  87  */
  88 typedef struct {
  89         vnode_t *vp;
  90         cred_t *cr;
  91 } pgflush_t;
  92 
  93 #ifdef DEBUG
  94 int nfs4_client_lease_debug;
  95 int nfs4_sharedfh_debug;
  96 int nfs4_fname_debug;
  97 
  98 /* temporary: panic if v_type is inconsistent with r_attr va_type */
  99 int nfs4_vtype_debug;
 100 
 101 uint_t nfs4_tsd_key;
 102 #endif
 103 
 104 static time_t   nfs4_client_resumed = 0;
 105 static  callb_id_t cid = 0;
 106 
 107 static int      nfs4renew(nfs4_server_t *);
 108 static void     nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
 109 static void     nfs4_pgflush_thread(pgflush_t *);
 110 
 111 static boolean_t nfs4_client_cpr_callb(void *, int);
 112 
 113 struct mi4_globals {
 114         kmutex_t        mig_lock;  /* lock protecting mig_list */
 115         list_t          mig_list;  /* list of NFS v4 mounts in zone */
 116         boolean_t       mig_destructor_called;
 117 };
 118 
 119 static zone_key_t mi4_list_key;
 120 
 121 /*
 122  * Attributes caching:
 123  *
  124  * Attributes are cached in the rnode in struct vattr form.
  125  * There is a time associated with the cached attributes (r_time_attr_inval)
  126  * which tells whether the attributes are valid. When new attributes are
  127  * cached, this time is set to the current time plus a delta based on how
  128  * long ago a change to the file was last detected. This allows the
  129  * attributes of files that have changed recently to be timed out sooner
  130  * than those of files that have not changed for a long time. There are
  131  * minimum and maximum timeout values that can be set per mount point.
 132  */
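
/*
 * As a rough sketch of the resulting policy (see nfs4_attrcache_va() below
 * for the authoritative version, including the no-cache cases), the cache
 * lifetime "delta" is the time since the last detected change, clamped per
 * mount point:
 *
 *	delta = now - rp->r_time_attr_saved;
 *	if (vp->v_type == VDIR)
 *		delta = MIN(MAX(delta, mi->mi_acdirmin), mi->mi_acdirmax);
 *	else
 *		delta = MIN(MAX(delta, mi->mi_acregmin), mi->mi_acregmax);
 *	rp->r_time_attr_inval = now + delta;
 */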
 133 
 134 /*
 135  * If a cache purge is in progress, wait for it to finish.
 136  *
 137  * The current thread must not be in the middle of an
 138  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
 139  * between this thread, a recovery thread, and the page flush thread.
 140  */
 141 int
 142 nfs4_waitfor_purge_complete(vnode_t *vp)
 143 {
 144         rnode4_t *rp;
 145         k_sigset_t smask;
 146 
 147         rp = VTOR4(vp);
 148         if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
 149             ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
 150                 mutex_enter(&rp->r_statelock);
 151                 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
 152                 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
 153                     ((rp->r_flags & R4PGFLUSH) &&
 154                     rp->r_pgflush != curthread)) {
 155                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 156                                 sigunintr(&smask);
 157                                 mutex_exit(&rp->r_statelock);
 158                                 return (EINTR);
 159                         }
 160                 }
 161                 sigunintr(&smask);
 162                 mutex_exit(&rp->r_statelock);
 163         }
 164         return (0);
 165 }
 166 
 167 /*
 168  * Validate caches by checking cached attributes. If they have timed out,
 169  * then get new attributes from the server.  As a side effect, cache
 170  * invalidation is done if the attributes have changed.
 171  *
 172  * If the attributes have not timed out and if there is a cache
 173  * invalidation being done by some other thread, then wait until that
 174  * thread has completed the cache invalidation.
 175  */
 176 int
 177 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
 178 {
 179         int error;
 180         nfs4_ga_res_t gar;
 181 
 182         if (ATTRCACHE4_VALID(vp)) {
 183                 error = nfs4_waitfor_purge_complete(vp);
 184                 if (error)
 185                         return (error);
 186                 return (0);
 187         }
 188 
 189         return (nfs4_getattr_otw(vp, &gar, cr, 0));
 190 }
 191 
 192 /*
  193  * Fill in attributes from the cache.
  194  * If the cached attributes are valid, return 0 to indicate that no error
  195  * occurred; otherwise return 1 to indicate that an error occurred.
 196  */
 197 static int
 198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
 199 {
 200         rnode4_t *rp;
 201 
 202         rp = VTOR4(vp);
 203         mutex_enter(&rp->r_statelock);
 204         mutex_enter(&rp->r_statev4_lock);
 205         if (ATTRCACHE4_VALID(vp)) {
 206                 mutex_exit(&rp->r_statev4_lock);
 207                 /*
 208                  * Cached attributes are valid
 209                  */
 210                 *vap = rp->r_attr;
 211                 mutex_exit(&rp->r_statelock);
 212                 return (0);
 213         }
 214         mutex_exit(&rp->r_statev4_lock);
 215         mutex_exit(&rp->r_statelock);
 216         return (1);
 217 }
 218 
 219 
 220 /*
  221  * If the returned error is ESTALE, flush all caches.  The nfs4_purge_caches()
 222  * call is synchronous because all the pages were invalidated by the
 223  * nfs4_invalidate_pages() call.
 224  */
 225 void
 226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
 227 {
 228         struct rnode4 *rp = VTOR4(vp);
 229 
 230         /* Ensure that the ..._end_op() call has been done */
 231         ASSERT(tsd_get(nfs4_tsd_key) == NULL);
 232 
 233         if (errno != ESTALE)
 234                 return;
 235 
 236         mutex_enter(&rp->r_statelock);
 237         rp->r_flags |= R4STALE;
 238         if (!rp->r_error)
 239                 rp->r_error = errno;
 240         mutex_exit(&rp->r_statelock);
 241         if (nfs4_has_pages(vp))
 242                 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
 243         nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
 244 }
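
/*
 * A minimal usage sketch, assuming the over-the-wire status has already
 * been mapped to an errno (cf. nfs4_getattr_otw() below):
 *
 *	e.error = geterrno4(e.stat);
 *	nfs4_purge_stale_fh(e.error, vp, cr);
 */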
 245 
 246 /*
 247  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
 248  * page purge is done asynchronously.
 249  */
 250 void
 251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
 252 {
 253         rnode4_t *rp;
 254         char *contents;
 255         vnode_t *xattr;
 256         int size;
 257         int pgflush;                    /* are we the page flush thread? */
 258 
 259         /*
 260          * Purge the DNLC for any entries which refer to this file.
 261          */
 262         if (vp->v_count > 1 &&
 263             (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
 264                 dnlc_purge_vp(vp);
 265 
 266         /*
 267          * Clear any readdir state bits and purge the readlink response cache.
 268          */
 269         rp = VTOR4(vp);
 270         mutex_enter(&rp->r_statelock);
 271         rp->r_flags &= ~R4LOOKUP;
 272         contents = rp->r_symlink.contents;
 273         size = rp->r_symlink.size;
 274         rp->r_symlink.contents = NULL;
 275 
 276         xattr = rp->r_xattr_dir;
 277         rp->r_xattr_dir = NULL;
 278 
 279         /*
 280          * Purge pathconf cache too.
 281          */
 282         rp->r_pathconf.pc4_xattr_valid = 0;
 283         rp->r_pathconf.pc4_cache_valid = 0;
 284 
 285         pgflush = (curthread == rp->r_pgflush);
 286         mutex_exit(&rp->r_statelock);
 287 
  288         if (contents != NULL)
  290                 kmem_free((void *)contents, size);
 292 
 293         if (xattr != NULL)
 294                 VN_RELE(xattr);
 295 
 296         /*
 297          * Flush the page cache.  If the current thread is the page flush
 298          * thread, don't initiate a new page flush.  There's no need for
 299          * it, and doing it correctly is hard.
 300          */
 301         if (nfs4_has_pages(vp) && !pgflush) {
 302                 if (!asyncpg) {
 303                         (void) nfs4_waitfor_purge_complete(vp);
 304                         nfs4_flush_pages(vp, cr);
 305                 } else {
 306                         pgflush_t *args;
 307 
 308                         /*
 309                          * We don't hold r_statelock while creating the
 310                          * thread, in case the call blocks.  So we use a
 311                          * flag to indicate that a page flush thread is
 312                          * active.
 313                          */
 314                         mutex_enter(&rp->r_statelock);
 315                         if (rp->r_flags & R4PGFLUSH) {
 316                                 mutex_exit(&rp->r_statelock);
 317                         } else {
 318                                 rp->r_flags |= R4PGFLUSH;
 319                                 mutex_exit(&rp->r_statelock);
 320 
 321                                 args = kmem_alloc(sizeof (pgflush_t),
 322                                     KM_SLEEP);
 323                                 args->vp = vp;
 324                                 VN_HOLD(args->vp);
 325                                 args->cr = cr;
 326                                 crhold(args->cr);
 327                                 (void) zthread_create(NULL, 0,
 328                                     nfs4_pgflush_thread, args, 0,
 329                                     minclsyspri);
 330                         }
 331                 }
 332         }
 333 
 334         /*
 335          * Flush the readdir response cache.
 336          */
 337         nfs4_purge_rddir_cache(vp);
 338 }
 339 
 340 /*
 341  * Invalidate all pages for the given file, after writing back the dirty
 342  * ones.
 343  */
 344 
 345 void
 346 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
 347 {
 348         int error;
 349         rnode4_t *rp = VTOR4(vp);
 350 
 351         error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
 352         if (error == ENOSPC || error == EDQUOT) {
 353                 mutex_enter(&rp->r_statelock);
 354                 if (!rp->r_error)
 355                         rp->r_error = error;
 356                 mutex_exit(&rp->r_statelock);
 357         }
 358 }
 359 
 360 /*
 361  * Page flush thread.
 362  */
 363 
 364 static void
 365 nfs4_pgflush_thread(pgflush_t *args)
 366 {
 367         rnode4_t *rp = VTOR4(args->vp);
 368 
 369         /* remember which thread we are, so we don't deadlock ourselves */
 370         mutex_enter(&rp->r_statelock);
 371         ASSERT(rp->r_pgflush == NULL);
 372         rp->r_pgflush = curthread;
 373         mutex_exit(&rp->r_statelock);
 374 
 375         nfs4_flush_pages(args->vp, args->cr);
 376 
 377         mutex_enter(&rp->r_statelock);
 378         rp->r_pgflush = NULL;
 379         rp->r_flags &= ~R4PGFLUSH;
 380         cv_broadcast(&rp->r_cv);
 381         mutex_exit(&rp->r_statelock);
 382 
 383         VN_RELE(args->vp);
 384         crfree(args->cr);
 385         kmem_free(args, sizeof (pgflush_t));
 386         zthread_exit();
 387 }
 388 
 389 /*
 390  * Purge the readdir cache of all entries which are not currently
 391  * being filled.
 392  */
 393 void
 394 nfs4_purge_rddir_cache(vnode_t *vp)
 395 {
 396         rnode4_t *rp;
 397 
 398         rp = VTOR4(vp);
 399 
 400         mutex_enter(&rp->r_statelock);
 401         rp->r_direof = NULL;
 402         rp->r_flags &= ~R4LOOKUP;
 403         rp->r_flags |= R4READDIRWATTR;
 404         rddir4_cache_purge(rp);
 405         mutex_exit(&rp->r_statelock);
 406 }
 407 
 408 /*
 409  * Set attributes cache for given vnode using virtual attributes.  There is
 410  * no cache validation, but if the attributes are deemed to be stale, they
 411  * are ignored.  This corresponds to nfs3_attrcache().
 412  *
 413  * Set the timeout value on the attribute cache and fill it
 414  * with the passed in attributes.
 415  */
 416 void
 417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
 418 {
 419         rnode4_t *rp = VTOR4(vp);
 420 
 421         mutex_enter(&rp->r_statelock);
 422         if (rp->r_time_attr_saved <= t)
 423                 nfs4_attrcache_va(vp, garp, FALSE);
 424         mutex_exit(&rp->r_statelock);
 425 }
 426 
 427 /*
  428  * Use the passed-in virtual attributes to check whether the
  429  * data and metadata caches are valid, cache the new attributes, and
 430  * then do the cache invalidation if required.
 431  *
 432  * The cache validation and caching of the new attributes is done
 433  * atomically via the use of the mutex, r_statelock.  If required,
 434  * the cache invalidation is done atomically w.r.t. the cache
 435  * validation and caching of the attributes via the pseudo lock,
 436  * r_serial.
 437  *
 438  * This routine is used to do cache validation and attributes caching
 439  * for operations with a single set of post operation attributes.
 440  */
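
/*
 * A rough sketch of the r_serial handshake used below, assuming the simple
 * case (no recovery thread and no page-flush thread involved):
 *
 *	mutex_enter(&rp->r_statelock);
 *	while (rp->r_serial != NULL && rp->r_serial != curthread)
 *		cv_wait_sig(&rp->r_cv, &rp->r_statelock);
 *	... validate the caches and store the new attributes ...
 *	rp->r_serial = curthread;	(only if invalidation is needed)
 *	mutex_exit(&rp->r_statelock);
 *	... purge the data caches without holding r_statelock ...
 *	mutex_enter(&rp->r_statelock);
 *	rp->r_serial = NULL;
 *	cv_broadcast(&rp->r_cv);
 *	mutex_exit(&rp->r_statelock);
 */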
 441 
 442 void
 443 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
 444     hrtime_t t, cred_t *cr, int async,
 445     change_info4 *cinfo)
 446 {
 447         rnode4_t *rp;
 448         int mtime_changed = 0;
 449         int ctime_changed = 0;
 450         vsecattr_t *vsp;
 451         int was_serial, set_time_cache_inval, recov;
 452         vattr_t *vap = &garp->n4g_va;
 453         mntinfo4_t *mi = VTOMI4(vp);
 454         len_t preattr_rsize;
 455         boolean_t writemodify_set = B_FALSE;
 456         boolean_t cachepurge_set = B_FALSE;
 457 
 458         ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
 459 
 460         /* Is curthread the recovery thread? */
 461         mutex_enter(&mi->mi_lock);
 462         recov = (VTOMI4(vp)->mi_recovthread == curthread);
 463         mutex_exit(&mi->mi_lock);
 464 
 465         rp = VTOR4(vp);
 466         mutex_enter(&rp->r_statelock);
 467         was_serial = (rp->r_serial == curthread);
 468         if (rp->r_serial && !was_serial) {
 469                 klwp_t *lwp = ttolwp(curthread);
 470 
 471                 /*
 472                  * If we're the recovery thread, then purge current attrs
 473                  * and bail out to avoid potential deadlock between another
 474                  * thread caching attrs (r_serial thread), recov thread,
 475                  * and an async writer thread.
 476                  */
 477                 if (recov) {
 478                         PURGE_ATTRCACHE4_LOCKED(rp);
 479                         mutex_exit(&rp->r_statelock);
 480                         return;
 481                 }
 482 
 483                 if (lwp != NULL)
 484                         lwp->lwp_nostop++;
 485                 while (rp->r_serial != NULL) {
 486                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 487                                 mutex_exit(&rp->r_statelock);
 488                                 if (lwp != NULL)
 489                                         lwp->lwp_nostop--;
 490                                 return;
 491                         }
 492                 }
 493                 if (lwp != NULL)
 494                         lwp->lwp_nostop--;
 495         }
 496 
 497         /*
 498          * If there is a page flush thread, the current thread needs to
 499          * bail out, to prevent a possible deadlock between the current
 500          * thread (which might be in a start_op/end_op region), the
 501          * recovery thread, and the page flush thread.  Expire the
 502          * attribute cache, so that any attributes the current thread was
 503          * going to set are not lost.
 504          */
 505         if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
 506                 PURGE_ATTRCACHE4_LOCKED(rp);
 507                 mutex_exit(&rp->r_statelock);
 508                 return;
 509         }
 510 
 511         if (rp->r_time_attr_saved > t) {
 512                 /*
  513                  * Newer attributes have been cached since these attributes
  514                  * were generated.  If they are inconsistent with what is
  515                  * cached, mark the cache invalid.  If not, don't act on them.
 516                  */
 517                 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
 518                         PURGE_ATTRCACHE4_LOCKED(rp);
 519                 mutex_exit(&rp->r_statelock);
 520                 return;
 521         }
 522         set_time_cache_inval = 0;
 523         if (cinfo) {
 524                 /*
 525                  * Only directory modifying callers pass non-NULL cinfo.
 526                  */
 527                 ASSERT(vp->v_type == VDIR);
 528                 /*
  529                  * If the cache timeout either doesn't exist or hasn't expired,
  530                  * and the dir didn't change on the server before the dirmod op,
  531                  * and the dir didn't change after the dirmod op but before the
  532                  * getattr, then there's a chance that the client's cached data
  533                  * for this object is current (not stale).  No immediate cache
  534                  * flush is required.
  536                  */
 537                 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
 538                     cinfo->before == rp->r_change &&
 539                     (garp->n4g_change_valid &&
 540                     cinfo->after == garp->n4g_change)) {
 541 
 542                         /*
 543                          * If atomic isn't set, then the before/after info
 544                          * cannot be blindly trusted.  For this case, we tell
 545                          * nfs4_attrcache_va to cache the attrs but also
 546                          * establish an absolute maximum cache timeout.  When
 547                          * the timeout is reached, caches will be flushed.
 548                          */
 549                         if (! cinfo->atomic)
 550                                 set_time_cache_inval = 1;
 551                 } else {
 552 
 553                         /*
  554                          * We're not sure exactly what changed, but we know
  555                          * what to do: flush all caches for the dir and remove
  556                          * the attr timeout.
 557                          *
 558                          * a) timeout expired.  flush all caches.
 559                          * b) r_change != cinfo.before.  flush all caches.
 560                          * c) r_change == cinfo.before, but cinfo.after !=
 561                          *    post-op getattr(change).  flush all caches.
 562                          * d) post-op getattr(change) not provided by server.
 563                          *    flush all caches.
 564                          */
 565                         mtime_changed = 1;
 566                         ctime_changed = 1;
 567                         rp->r_time_cache_inval = 0;
 568                 }
 569         } else {
 570                 /*
  571                  * The write thread, after writing data to the file on the
  572                  * remote server, always sets R4WRITEMODIFIED to indicate that
  573                  * the file on the remote server was modified with a WRITE
  574                  * operation and marks the attribute cache as timed out.  If
  575                  * R4WRITEMODIFIED is set, do not check for mtime and ctime change.
 576                  */
 577                 if (!(rp->r_flags & R4WRITEMODIFIED)) {
 578                         if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
 579                                 mtime_changed = 1;
 580 
 581                         if (rp->r_attr.va_ctime.tv_sec !=
 582                             vap->va_ctime.tv_sec ||
 583                             rp->r_attr.va_ctime.tv_nsec !=
 584                             vap->va_ctime.tv_nsec)
 585                                 ctime_changed = 1;
 586 
 587                         /*
 588                          * If the change attribute was not provided by server
 589                          * or it differs, then flush all caches.
 590                          */
 591                         if (!garp->n4g_change_valid ||
 592                             rp->r_change != garp->n4g_change) {
 593                                 mtime_changed = 1;
 594                                 ctime_changed = 1;
 595                         }
 596                 } else {
 597                         writemodify_set = B_TRUE;
 598                 }
 599         }
 600 
 601         preattr_rsize = rp->r_size;
 602 
 603         nfs4_attrcache_va(vp, garp, set_time_cache_inval);
 604 
 605         /*
  606          * If we have updated the file size in nfs4_attrcache_va, then as
  607          * soon as we drop r_statelock we will be in the process of purging
  608          * all of our caches and updating them. It is possible for another
  609          * thread to pick up this new file size and read in zeroed data.
  610          * Stall other threads until the cache purge is complete.
 611          */
 612         if ((!cinfo) && (rp->r_size != preattr_rsize)) {
 613                 /*
  614                  * If R4WRITEMODIFIED was set and we have updated the file
  615                  * size, the file size returned by the server is not
  616                  * necessarily the result of this client's WRITE. We need
  617                  * to purge all caches.
 618                  */
 619                 if (writemodify_set)
 620                         mtime_changed = 1;
 621 
 622                 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
 623                         rp->r_flags |= R4INCACHEPURGE;
 624                         cachepurge_set = B_TRUE;
 625                 }
 626         }
 627 
 628         if (!mtime_changed && !ctime_changed) {
 629                 mutex_exit(&rp->r_statelock);
 630                 return;
 631         }
 632 
 633         rp->r_serial = curthread;
 634 
 635         mutex_exit(&rp->r_statelock);
 636 
 637         /*
 638          * If we're the recov thread, then force async nfs4_purge_caches
 639          * to avoid potential deadlock.
 640          */
 641         if (mtime_changed)
 642                 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
 643 
 644         if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
 645                 mutex_enter(&rp->r_statelock);
 646                 rp->r_flags &= ~R4INCACHEPURGE;
 647                 cv_broadcast(&rp->r_cv);
 648                 mutex_exit(&rp->r_statelock);
 649                 cachepurge_set = B_FALSE;
 650         }
 651 
 652         if (ctime_changed) {
 653                 (void) nfs4_access_purge_rp(rp);
 654                 if (rp->r_secattr != NULL) {
 655                         mutex_enter(&rp->r_statelock);
 656                         vsp = rp->r_secattr;
 657                         rp->r_secattr = NULL;
 658                         mutex_exit(&rp->r_statelock);
 659                         if (vsp != NULL)
 660                                 nfs4_acl_free_cache(vsp);
 661                 }
 662         }
 663 
 664         if (!was_serial) {
 665                 mutex_enter(&rp->r_statelock);
 666                 rp->r_serial = NULL;
 667                 cv_broadcast(&rp->r_cv);
 668                 mutex_exit(&rp->r_statelock);
 669         }
 670 }
 671 
 672 /*
 673  * Set attributes cache for given vnode using virtual attributes.
 674  *
 675  * Set the timeout value on the attribute cache and fill it
 676  * with the passed in attributes.
 677  *
 678  * The caller must be holding r_statelock.
 679  */
 680 static void
 681 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
 682 {
 683         rnode4_t *rp;
 684         mntinfo4_t *mi;
 685         hrtime_t delta;
 686         hrtime_t now;
 687         vattr_t *vap = &garp->n4g_va;
 688 
 689         rp = VTOR4(vp);
 690 
 691         ASSERT(MUTEX_HELD(&rp->r_statelock));
 692         ASSERT(vap->va_mask == AT_ALL);
 693 
 694         /* Switch to master before checking v_flag */
 695         if (IS_SHADOW(vp, rp))
 696                 vp = RTOV4(rp);
 697 
 698         now = gethrtime();
 699 
 700         mi = VTOMI4(vp);
 701 
 702         /*
 703          * Only establish a new cache timeout (if requested).  Never
 704          * extend a timeout.  Never clear a timeout.  Clearing a timeout
  705          * is done by nfs4_update_dircaches (an ancestor in our call chain).
 706          */
 707         if (set_cache_timeout && ! rp->r_time_cache_inval)
 708                 rp->r_time_cache_inval = now + mi->mi_acdirmax;
 709 
 710         /*
 711          * Delta is the number of nanoseconds that we will
 712          * cache the attributes of the file.  It is based on
 713          * the number of nanoseconds since the last time that
 714          * we detected a change.  The assumption is that files
 715          * that changed recently are likely to change again.
  716          * Minimum and maximum bounds are enforced, however, separately
  717          * for regular files and for directories.
 718          *
 719          * Using the time since last change was detected
 720          * eliminates direct comparison or calculation
 721          * using mixed client and server times.  NFS does
 722          * not make any assumptions regarding the client
 723          * and server clocks being synchronized.
 724          */
 725         if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
 726             vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
 727             vap->va_size != rp->r_attr.va_size) {
 728                 rp->r_time_attr_saved = now;
 729         }
 730 
 731         if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
 732                 delta = 0;
 733         else {
 734                 delta = now - rp->r_time_attr_saved;
 735                 if (vp->v_type == VDIR) {
 736                         if (delta < mi->mi_acdirmin)
 737                                 delta = mi->mi_acdirmin;
 738                         else if (delta > mi->mi_acdirmax)
 739                                 delta = mi->mi_acdirmax;
 740                 } else {
 741                         if (delta < mi->mi_acregmin)
 742                                 delta = mi->mi_acregmin;
 743                         else if (delta > mi->mi_acregmax)
 744                                 delta = mi->mi_acregmax;
 745                 }
 746         }
 747         rp->r_time_attr_inval = now + delta;
 748 
 749         rp->r_attr = *vap;
 750         if (garp->n4g_change_valid)
 751                 rp->r_change = garp->n4g_change;
 752 
 753         /*
 754          * The attributes that were returned may be valid and can
 755          * be used, but they may not be allowed to be cached.
 756          * Reset the timers to cause immediate invalidation and
  757          * clear r_change so no VERIFY operations will succeed.
 758          */
 759         if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
 760                 rp->r_time_attr_inval = now;
 761                 rp->r_time_attr_saved = now;
 762                 rp->r_change = 0;
 763         }
 764 
 765         /*
  766          * If mounted_on_fileid was returned AND the object is a stub,
  767          * then set the object's va_nodeid to the mounted-over fid
  768          * returned by the server.
  769          *
  770          * If mounted_on_fileid is not provided/supported, then
  771          * just set it to 0 for now.  Eventually it would be
 772          * better to set it to a hashed version of FH.  This
 773          * would probably be good enough to provide a unique
 774          * fid/d_ino within a dir.
 775          *
 776          * We don't need to carry mounted_on_fileid in the
 777          * rnode as long as the client never requests fileid
 778          * without also requesting mounted_on_fileid.  For
 779          * now, it stays.
 780          */
 781         if (garp->n4g_mon_fid_valid) {
 782                 rp->r_mntd_fid = garp->n4g_mon_fid;
 783 
 784                 if (RP_ISSTUB(rp))
 785                         rp->r_attr.va_nodeid = rp->r_mntd_fid;
 786         }
 787 
 788         /*
 789          * Check to see if there are valid pathconf bits to
 790          * cache in the rnode.
 791          */
 792         if (garp->n4g_ext_res) {
 793                 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
 794                         rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
 795                 } else {
 796                         if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
 797                                 rp->r_pathconf.pc4_xattr_valid = TRUE;
 798                                 rp->r_pathconf.pc4_xattr_exists =
 799                                     garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
 800                         }
 801                 }
 802         }
 803         /*
 804          * Update the size of the file if there is no cached data or if
 805          * the cached data is clean and there is no data being written
 806          * out.
 807          */
 808         if (rp->r_size != vap->va_size &&
 809             (!vn_has_cached_data(vp) ||
 810             (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
 811                 rp->r_size = vap->va_size;
 812         }
 813         nfs_setswaplike(vp, vap);
 814         rp->r_flags &= ~R4WRITEMODIFIED;
 815 }
 816 
 817 /*
 818  * Get attributes over-the-wire and update attributes cache
 819  * if no error occurred in the over-the-wire operation.
 820  * Return 0 if successful, otherwise error.
 821  */
 822 int
 823 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
 824 {
 825         mntinfo4_t *mi = VTOMI4(vp);
 826         hrtime_t t;
 827         nfs4_recov_state_t recov_state;
 828         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 829 
 830         recov_state.rs_flags = 0;
 831         recov_state.rs_num_retry_despite_err = 0;
 832 
 833         /* Save the original mount point security flavor */
 834         (void) save_mnt_secinfo(mi->mi_curr_serv);
 835 
 836 recov_retry:
 837 
 838         if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
 839             &recov_state, NULL))) {
 840                 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 841                 return (e.error);
 842         }
 843 
 844         t = gethrtime();
 845 
 846         nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
 847 
 848         if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
 849                 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
 850                     NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
 851                         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
 852                             &recov_state, 1);
 853                         goto recov_retry;
 854                 }
 855         }
 856 
 857         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
 858 
 859         if (!e.error) {
 860                 if (e.stat == NFS4_OK) {
 861                         nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
 862                 } else {
 863                         e.error = geterrno4(e.stat);
 864 
 865                         nfs4_purge_stale_fh(e.error, vp, cr);
 866                 }
 867         }
 868 
 869         /*
  870          * If we did a getattr on a node that is a stub for a crossed
  871          * mount point, keep the original secinfo flavor for
  872          * the current file system, not the crossed one.
 873          */
 874         (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
 875 
 876         return (e.error);
 877 }
 878 
 879 /*
 880  * Generate a compound to get attributes over-the-wire.
 881  */
 882 void
 883 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
 884     nfs4_error_t *ep, cred_t *cr, int get_acl)
 885 {
 886         COMPOUND4args_clnt args;
 887         COMPOUND4res_clnt res;
 888         int doqueue;
 889         rnode4_t *rp = VTOR4(vp);
 890         nfs_argop4 argop[2];
 891 
 892         args.ctag = TAG_GETATTR;
 893 
 894         args.array_len = 2;
 895         args.array = argop;
 896 
 897         /* putfh */
 898         argop[0].argop = OP_CPUTFH;
 899         argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
 900 
 901         /* getattr */
 902         /*
  903          * Unlike NFS versions 2 and 3, where getattr returns all the
  904          * attributes, NFS version 4 returns only the ones explicitly
 905          * asked for. This creates problems, as some system functions
 906          * (e.g. cache check) require certain attributes and if the
 907          * cached node lacks some attributes such as uid/gid, it can
 908          * affect system utilities (e.g. "ls") that rely on the information
 909          * to be there. This can lead to anything from system crashes to
 910          * corrupted information processed by user apps.
 911          * So to ensure that all bases are covered, request at least
 912          * the AT_ALL attribute mask.
 913          */
 914         argop[1].argop = OP_GETATTR;
 915         argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
 916         if (get_acl)
 917                 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
 918         argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
 919 
 920         doqueue = 1;
 921 
 922         rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
 923 
 924         if (ep->error)
 925                 return;
 926 
 927         if (res.status != NFS4_OK) {
 928                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 929                 return;
 930         }
 931 
 932         *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
 933 
 934         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
 935 }
 936 
 937 /*
  938  * Return either cached or remote attributes. If we get remote attributes,
  939  * use them to check and invalidate caches, then cache the new attributes.
 940  */
 941 int
 942 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
 943 {
 944         int error;
 945         rnode4_t *rp;
 946         nfs4_ga_res_t gar;
 947 
 948         ASSERT(nfs4_consistent_type(vp));
 949 
 950         /*
 951          * If we've got cached attributes, we're done, otherwise go
 952          * to the server to get attributes, which will update the cache
 953          * in the process. Either way, use the cached attributes for
 954          * the caller's vattr_t.
 955          *
 956          * Note that we ignore the gar set by the OTW call: the attr caching
 957          * code may make adjustments when storing to the rnode, and we want
 958          * to see those changes here.
 959          */
 960         rp = VTOR4(vp);
 961         error = 0;
 962         mutex_enter(&rp->r_statelock);
 963         if (!ATTRCACHE4_VALID(vp)) {
 964                 mutex_exit(&rp->r_statelock);
 965                 error = nfs4_getattr_otw(vp, &gar, cr, 0);
 966                 mutex_enter(&rp->r_statelock);
 967         }
 968 
 969         if (!error)
 970                 *vap = rp->r_attr;
 971 
 972         /* Return the client's view of file size */
 973         vap->va_size = rp->r_size;
 974 
 975         mutex_exit(&rp->r_statelock);
 976 
 977         ASSERT(nfs4_consistent_type(vp));
 978 
 979         return (error);
 980 }
 981 
 982 int
 983 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
 984     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
 985 {
 986         COMPOUND4args_clnt args;
 987         COMPOUND4res_clnt res;
 988         int doqueue;
 989         nfs_argop4 argop[2];
 990         mntinfo4_t *mi = VTOMI4(vp);
 991         bool_t needrecov = FALSE;
 992         nfs4_recov_state_t recov_state;
 993         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 994         nfs4_ga_ext_res_t *gerp;
 995 
 996         recov_state.rs_flags = 0;
 997         recov_state.rs_num_retry_despite_err = 0;
 998 
 999 recov_retry:
1000         args.ctag = tag_type;
1001 
1002         args.array_len = 2;
1003         args.array = argop;
1004 
1005         e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1006         if (e.error)
1007                 return (e.error);
1008 
1009         /* putfh */
1010         argop[0].argop = OP_CPUTFH;
1011         argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1012 
1013         /* getattr */
1014         argop[1].argop = OP_GETATTR;
1015         argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1016         argop[1].nfs_argop4_u.opgetattr.mi = mi;
1017 
1018         doqueue = 1;
1019 
1020         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1021             "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1022             rnode4info(VTOR4(vp))));
1023 
1024         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1025 
1026         needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1027         if (!needrecov && e.error) {
1028                 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1029                     needrecov);
1030                 return (e.error);
1031         }
1032 
1033         if (needrecov) {
1034                 bool_t abort;
1035 
1036                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1037                     "nfs4_attr_otw: initiating recovery\n"));
1038 
1039                 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1040                     NULL, OP_GETATTR, NULL, NULL, NULL);
1041                 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1042                     needrecov);
1043                 if (!e.error) {
1044                         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1045                         e.error = geterrno4(res.status);
1046                 }
1047                 if (abort == FALSE)
1048                         goto recov_retry;
1049                 return (e.error);
1050         }
1051 
1052         if (res.status) {
1053                 e.error = geterrno4(res.status);
1054         } else {
1055                 gerp = garp->n4g_ext_res;
1056                 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1057                     garp, sizeof (nfs4_ga_res_t));
1058                 garp->n4g_ext_res = gerp;
1059                 if (garp->n4g_ext_res &&
1060                     res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1061                         bcopy(res.array[1].nfs_resop4_u.opgetattr.
1062                             ga_res.n4g_ext_res,
1063                             garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1064         }
1065         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1066         nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1067             needrecov);
1068         return (e.error);
1069 }
1070 
1071 /*
1072  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1073  * for the demand-based allocation of async threads per-mount.  The
1074  * nfs_async_timeout is the amount of time a thread will live after it
1075  * becomes idle, unless new I/O requests are received before the thread
1076  * dies.  See nfs4_async_putpage and nfs4_async_start.
1077  */
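
/*
 * A minimal sketch of how the async work routines queue a request for the
 * worker threads (see nfs4_async_readahead() below for a complete example):
 *
 *	mutex_enter(&mi->mi_async_lock);
 *	... link args onto mi->mi_async_reqs[]/mi_async_tail[] for its type ...
 *	mi->mi_async_req_count++;
 *	cv_signal(&mi->mi_async_reqs_cv);	(wakes the manager thread)
 *	mutex_exit(&mi->mi_async_lock);
 *
 * The manager thread then creates or wakes a worker, which dequeues the
 * request in nfs4_async_common_start().
 */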
1078 
1079 static void     nfs4_async_start(struct vfs *);
1080 static void     nfs4_async_pgops_start(struct vfs *);
1081 static void     nfs4_async_common_start(struct vfs *, int);
1082 
1083 static void
1084 free_async_args4(struct nfs4_async_reqs *args)
1085 {
1086         rnode4_t *rp;
1087 
1088         if (args->a_io != NFS4_INACTIVE) {
1089                 rp = VTOR4(args->a_vp);
1090                 mutex_enter(&rp->r_statelock);
1091                 rp->r_count--;
1092                 if (args->a_io == NFS4_PUTAPAGE ||
1093                     args->a_io == NFS4_PAGEIO)
1094                         rp->r_awcount--;
1095                 cv_broadcast(&rp->r_cv);
1096                 mutex_exit(&rp->r_statelock);
1097                 VN_RELE(args->a_vp);
1098         }
1099         crfree(args->a_cred);
1100         kmem_free(args, sizeof (*args));
1101 }
1102 
1103 /*
 1104  * Cross-zone thread creation and NFS access are disallowed, yet fsflush() and
 1105  * pageout(), running in the global zone, have legitimate reasons to do
 1106  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 1107  * use of a per-mount "asynchronous requests manager thread" which is
1108  * signaled by the various asynchronous work routines when there is
1109  * asynchronous work to be done.  It is responsible for creating new
1110  * worker threads if necessary, and notifying existing worker threads
1111  * that there is work to be done.
1112  *
1113  * In other words, it will "take the specifications from the customers and
1114  * give them to the engineers."
1115  *
1116  * Worker threads die off of their own accord if they are no longer
1117  * needed.
1118  *
1119  * This thread is killed when the zone is going away or the filesystem
1120  * is being unmounted.
1121  */
1122 void
1123 nfs4_async_manager(vfs_t *vfsp)
1124 {
1125         callb_cpr_t cprinfo;
1126         mntinfo4_t *mi;
1127         uint_t max_threads;
1128 
1129         mi = VFTOMI4(vfsp);
1130 
1131         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1132             "nfs4_async_manager");
1133 
1134         mutex_enter(&mi->mi_async_lock);
1135         /*
1136          * We want to stash the max number of threads that this mount was
1137          * allowed so we can use it later when the variable is set to zero as
1138          * part of the zone/mount going away.
1139          *
1140          * We want to be able to create at least one thread to handle
1141          * asynchronous inactive calls.
1142          */
1143         max_threads = MAX(mi->mi_max_threads, 1);
1144         /*
1145          * We don't want to wait for mi_max_threads to go to zero, since that
1146          * happens as part of a failed unmount, but this thread should only
1147          * exit when the mount is really going away.
1148          *
1149          * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1150          * attempted: the various _async_*() functions know to do things
1151          * inline if mi_max_threads == 0.  Henceforth we just drain out the
1152          * outstanding requests.
1153          *
1154          * Note that we still create zthreads even if we notice the zone is
1155          * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1156          * shutdown sequence to take slightly longer in some cases, but
1157          * doesn't violate the protocol, as all threads will exit as soon as
1158          * they're done processing the remaining requests.
1159          */
1160         for (;;) {
1161                 while (mi->mi_async_req_count > 0) {
1162                         /*
1163                          * Paranoia: If the mount started out having
1164                          * (mi->mi_max_threads == 0), and the value was
1165                          * later changed (via a debugger or somesuch),
1166                          * we could be confused since we will think we
1167                          * can't create any threads, and the calling
1168                          * code (which looks at the current value of
1169                          * mi->mi_max_threads, now non-zero) thinks we
1170                          * can.
1171                          *
1172                          * So, because we're paranoid, we create threads
1173                          * up to the maximum of the original and the
1174                          * current value. This means that future
1175                          * (debugger-induced) alterations of
1176                          * mi->mi_max_threads are ignored for our
1177                          * purposes, but who told them they could change
1178                          * random values on a live kernel anyhow?
1179                          */
1180                         if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1181                             MAX(mi->mi_max_threads, max_threads)) {
1182                                 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1183                                 mutex_exit(&mi->mi_async_lock);
1184                                 MI4_HOLD(mi);
1185                                 VFS_HOLD(vfsp); /* hold for new thread */
1186                                 (void) zthread_create(NULL, 0, nfs4_async_start,
1187                                     vfsp, 0, minclsyspri);
1188                                 mutex_enter(&mi->mi_async_lock);
1189                         } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1190                             NUM_ASYNC_PGOPS_THREADS) {
1191                                 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1192                                 mutex_exit(&mi->mi_async_lock);
1193                                 MI4_HOLD(mi);
1194                                 VFS_HOLD(vfsp); /* hold for new thread */
1195                                 (void) zthread_create(NULL, 0,
1196                                     nfs4_async_pgops_start, vfsp, 0,
1197                                     minclsyspri);
1198                                 mutex_enter(&mi->mi_async_lock);
1199                         }
1200                         NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1201                         ASSERT(mi->mi_async_req_count != 0);
1202                         mi->mi_async_req_count--;
1203                 }
1204 
1205                 mutex_enter(&mi->mi_lock);
1206                 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1207                         mutex_exit(&mi->mi_lock);
1208                         break;
1209                 }
1210                 mutex_exit(&mi->mi_lock);
1211 
1212                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1213                 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1214                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1215         }
1216 
1217         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1218             "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1219         /*
1220          * Let everyone know we're done.
1221          */
1222         mi->mi_manager_thread = NULL;
1223         /*
1224          * Wake up the inactive thread.
1225          */
1226         cv_broadcast(&mi->mi_inact_req_cv);
1227         /*
1228          * Wake up anyone sitting in nfs4_async_manager_stop()
1229          */
1230         cv_broadcast(&mi->mi_async_cv);
1231         /*
1232          * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1233          * since CALLB_CPR_EXIT is actually responsible for releasing
1234          * 'mi_async_lock'.
1235          */
1236         CALLB_CPR_EXIT(&cprinfo);
1237         VFS_RELE(vfsp); /* release thread's hold */
1238         MI4_RELE(mi);
1239         zthread_exit();
1240 }
1241 
1242 /*
1243  * Signal (and wait for) the async manager thread to clean up and go away.
1244  */
1245 void
1246 nfs4_async_manager_stop(vfs_t *vfsp)
1247 {
1248         mntinfo4_t *mi = VFTOMI4(vfsp);
1249 
1250         mutex_enter(&mi->mi_async_lock);
1251         mutex_enter(&mi->mi_lock);
1252         mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1253         mutex_exit(&mi->mi_lock);
1254         cv_broadcast(&mi->mi_async_reqs_cv);
1255         /*
1256          * Wait for the async manager thread to die.
1257          */
1258         while (mi->mi_manager_thread != NULL)
1259                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1260         mutex_exit(&mi->mi_async_lock);
1261 }
1262 
1263 int
1264 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1265     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1266     u_offset_t, caddr_t, struct seg *, cred_t *))
1267 {
1268         rnode4_t *rp;
1269         mntinfo4_t *mi;
1270         struct nfs4_async_reqs *args;
1271 
1272         rp = VTOR4(vp);
1273         ASSERT(rp->r_freef == NULL);
1274 
1275         mi = VTOMI4(vp);
1276 
1277         /*
1278          * If addr falls in a different segment, don't bother doing readahead.
1279          */
1280         if (addr >= seg->s_base + seg->s_size)
1281                 return (-1);
1282 
1283         /*
1284          * If we can't allocate a request structure, punt on the readahead.
1285          */
1286         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1287                 return (-1);
1288 
1289         /*
1290          * If a lock operation is pending, don't initiate any new
1291          * readaheads.  Otherwise, bump r_count to indicate the new
1292          * asynchronous I/O.
1293          */
1294         if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1295                 kmem_free(args, sizeof (*args));
1296                 return (-1);
1297         }
1298         mutex_enter(&rp->r_statelock);
1299         rp->r_count++;
1300         mutex_exit(&rp->r_statelock);
1301         nfs_rw_exit(&rp->r_lkserlock);
1302 
1303         args->a_next = NULL;
1304 #ifdef DEBUG
1305         args->a_queuer = curthread;
1306 #endif
1307         VN_HOLD(vp);
1308         args->a_vp = vp;
1309         ASSERT(cr != NULL);
1310         crhold(cr);
1311         args->a_cred = cr;
1312         args->a_io = NFS4_READ_AHEAD;
1313         args->a_nfs4_readahead = readahead;
1314         args->a_nfs4_blkoff = blkoff;
1315         args->a_nfs4_seg = seg;
1316         args->a_nfs4_addr = addr;
1317 
1318         mutex_enter(&mi->mi_async_lock);
1319 
1320         /*
 1321          * If asyncio has been disabled, don't bother with readahead.
1322          */
1323         if (mi->mi_max_threads == 0) {
1324                 mutex_exit(&mi->mi_async_lock);
1325                 goto noasync;
1326         }
1327 
1328         /*
1329          * Link request structure into the async list and
1330          * wakeup async thread to do the i/o.
1331          */
1332         if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1333                 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1334                 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1335         } else {
1336                 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1337                 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1338         }
1339 
1340         if (mi->mi_io_kstats) {
1341                 mutex_enter(&mi->mi_lock);
1342                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1343                 mutex_exit(&mi->mi_lock);
1344         }
1345 
1346         mi->mi_async_req_count++;
1347         ASSERT(mi->mi_async_req_count != 0);
1348         cv_signal(&mi->mi_async_reqs_cv);
1349         mutex_exit(&mi->mi_async_lock);
1350         return (0);
1351 
1352 noasync:
1353         mutex_enter(&rp->r_statelock);
1354         rp->r_count--;
1355         cv_broadcast(&rp->r_cv);
1356         mutex_exit(&rp->r_statelock);
1357         VN_RELE(vp);
1358         crfree(cr);
1359         kmem_free(args, sizeof (*args));
1360         return (-1);
1361 }
1362 
1363 static void
1364 nfs4_async_start(struct vfs *vfsp)
1365 {
1366         nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1367 }
1368 
1369 static void
1370 nfs4_async_pgops_start(struct vfs *vfsp)
1371 {
1372         nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1373 }
1374 
1375 /*
1376  * The async queues for each mounted file system are arranged as a
1377  * set of queues, one for each async i/o type.  Requests are taken
1378  * from the queues in a round-robin fashion.  A number of consecutive
1379  * requests are taken from each queue before moving on to the next
 1380  * queue.  This functionality may allow the NFS server to do
1381  * write clustering, even if the client is mixing writes and reads
1382  * because it will take multiple write requests from the queue
1383  * before processing any of the other async i/o types.
1384  *
1385  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1386  * model defined by cpr to suspend the system. Specifically over the
1387  * wire calls are cpr-unsafe. The thread should be reevaluated in
1388  * case of future updates to the cpr model.
1389  */
1390 static void
1391 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1392 {
1393         struct nfs4_async_reqs *args;
1394         mntinfo4_t *mi = VFTOMI4(vfsp);
1395         clock_t time_left = 1;
1396         callb_cpr_t cprinfo;
1397         int i;
1398         extern volatile int nfs_async_timeout;
1399         int async_types;
1400         kcondvar_t *async_work_cv;
1401 
1402         if (async_queue == NFS4_ASYNC_QUEUE) {
1403                 async_types = NFS4_ASYNC_TYPES;
1404                 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1405         } else {
1406                 async_types = NFS4_ASYNC_PGOPS_TYPES;
1407                 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1408         }
1409 
1410         /*
1411          * Dynamic initialization of nfs_async_timeout to allow nfs to be
 1412          * built in an implementation-independent manner.
1413          */
1414         if (nfs_async_timeout == -1)
1415                 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1416 
1417         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1418 
1419         mutex_enter(&mi->mi_async_lock);
1420         for (;;) {
1421                 /*
1422                  * Find the next queue containing an entry.  We start
1423                  * at the current queue pointer and then round robin
1424                  * through all of them until we either find a non-empty
1425                  * queue or have looked through all of them.
1426                  */
1427                 for (i = 0; i < async_types; i++) {
1428                         args = *mi->mi_async_curr[async_queue];
1429                         if (args != NULL)
1430                                 break;
1431                         mi->mi_async_curr[async_queue]++;
1432                         if (mi->mi_async_curr[async_queue] ==
1433                             &mi->mi_async_reqs[async_types]) {
1434                                 mi->mi_async_curr[async_queue] =
1435                                     &mi->mi_async_reqs[0];
1436                         }
1437                 }
1438                 /*
                 * If we didn't find an entry, then block until woken up
1440                  * again and then look through the queues again.
1441                  */
1442                 if (args == NULL) {
1443                         /*
1444                          * Exiting is considered to be safe for CPR as well
1445                          */
1446                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1447 
1448                         /*
                         * Wake up the thread waiting to unmount the file
                         * system, but only if all async threads are inactive.
                         *
                         * If we've timed out and there's nothing to do,
1453                          * then get rid of this thread.
1454                          */
1455                         if (mi->mi_max_threads == 0 || time_left <= 0) {
1456                                 --mi->mi_threads[async_queue];
1457 
1458                                 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1459                                     mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1460                                         cv_signal(&mi->mi_async_cv);
1461                                 CALLB_CPR_EXIT(&cprinfo);
1462                                 VFS_RELE(vfsp); /* release thread's hold */
1463                                 MI4_RELE(mi);
1464                                 zthread_exit();
1465                                 /* NOTREACHED */
1466                         }
1467                         time_left = cv_reltimedwait(async_work_cv,
1468                             &mi->mi_async_lock, nfs_async_timeout,
1469                             TR_CLOCK_TICK);
1470 
1471                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1472 
1473                         continue;
1474                 } else {
1475                         time_left = 1;
1476                 }
1477 
1478                 /*
1479                  * Remove the request from the async queue and then
1480                  * update the current async request queue pointer.  If
1481                  * the current queue is empty or we have removed enough
1482                  * consecutive entries from it, then reset the counter
1483                  * for this queue and then move the current pointer to
1484                  * the next queue.
1485                  */
1486                 *mi->mi_async_curr[async_queue] = args->a_next;
1487                 if (*mi->mi_async_curr[async_queue] == NULL ||
1488                     --mi->mi_async_clusters[args->a_io] == 0) {
1489                         mi->mi_async_clusters[args->a_io] =
1490                             mi->mi_async_init_clusters;
1491                         mi->mi_async_curr[async_queue]++;
1492                         if (mi->mi_async_curr[async_queue] ==
1493                             &mi->mi_async_reqs[async_types]) {
1494                                 mi->mi_async_curr[async_queue] =
1495                                     &mi->mi_async_reqs[0];
1496                         }
1497                 }
1498 
1499                 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1500                         mutex_enter(&mi->mi_lock);
1501                         kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1502                         mutex_exit(&mi->mi_lock);
1503                 }
1504 
1505                 mutex_exit(&mi->mi_async_lock);
1506 
1507                 /*
                 * Obtain the arguments from the async request structure
                 * and dispatch based on the i/o type.
1509                  */
1510                 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1511                         (*args->a_nfs4_readahead)(args->a_vp,
1512                             args->a_nfs4_blkoff, args->a_nfs4_addr,
1513                             args->a_nfs4_seg, args->a_cred);
1514                 } else if (args->a_io == NFS4_PUTAPAGE) {
1515                         (void) (*args->a_nfs4_putapage)(args->a_vp,
1516                             args->a_nfs4_pp, args->a_nfs4_off,
1517                             args->a_nfs4_len, args->a_nfs4_flags,
1518                             args->a_cred);
1519                 } else if (args->a_io == NFS4_PAGEIO) {
1520                         (void) (*args->a_nfs4_pageio)(args->a_vp,
1521                             args->a_nfs4_pp, args->a_nfs4_off,
1522                             args->a_nfs4_len, args->a_nfs4_flags,
1523                             args->a_cred);
1524                 } else if (args->a_io == NFS4_READDIR) {
1525                         (void) ((*args->a_nfs4_readdir)(args->a_vp,
1526                             args->a_nfs4_rdc, args->a_cred));
1527                 } else if (args->a_io == NFS4_COMMIT) {
1528                         (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1529                             args->a_nfs4_offset, args->a_nfs4_count,
1530                             args->a_cred);
1531                 } else if (args->a_io == NFS4_INACTIVE) {
1532                         nfs4_inactive_otw(args->a_vp, args->a_cred);
1533                 }
1534 
1535                 /*
1536                  * Now, release the vnode and free the credentials
1537                  * structure.
1538                  */
1539                 free_async_args4(args);
1540                 /*
                 * Reacquire the mutex because it will be needed at the
                 * top of the loop.
1542                  */
1543                 mutex_enter(&mi->mi_async_lock);
1544         }
1545 }
1546 
1547 /*
1548  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1549  * part of VOP_INACTIVE.
1550  */
1551 
1552 void
1553 nfs4_inactive_thread(mntinfo4_t *mi)
1554 {
1555         struct nfs4_async_reqs *args;
1556         callb_cpr_t cprinfo;
1557         vfs_t *vfsp = mi->mi_vfsp;
1558 
1559         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1560             "nfs4_inactive_thread");
1561 
1562         for (;;) {
1563                 mutex_enter(&mi->mi_async_lock);
1564                 args = mi->mi_async_reqs[NFS4_INACTIVE];
1565                 if (args == NULL) {
1566                         mutex_enter(&mi->mi_lock);
1567                         /*
1568                          * We don't want to exit until the async manager is done
1569                          * with its work; hence the check for mi_manager_thread
1570                          * being NULL.
1571                          *
1572                          * The async manager thread will cv_broadcast() on
1573                          * mi_inact_req_cv when it's done, at which point we'll
1574                          * wake up and exit.
1575                          */
1576                         if (mi->mi_manager_thread == NULL)
1577                                 goto die;
1578                         mi->mi_flags |= MI4_INACTIVE_IDLE;
1579                         mutex_exit(&mi->mi_lock);
1580                         cv_signal(&mi->mi_async_cv);
1581                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1582                         cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1583                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1584                         mutex_exit(&mi->mi_async_lock);
1585                 } else {
1586                         mutex_enter(&mi->mi_lock);
1587                         mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1588                         mutex_exit(&mi->mi_lock);
1589                         mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1590                         mutex_exit(&mi->mi_async_lock);
1591                         nfs4_inactive_otw(args->a_vp, args->a_cred);
1592                         crfree(args->a_cred);
1593                         kmem_free(args, sizeof (*args));
1594                 }
1595         }
1596 die:
1597         mutex_exit(&mi->mi_lock);
1598         mi->mi_inactive_thread = NULL;
1599         cv_signal(&mi->mi_async_cv);
1600 
1601         /*
1602          * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1603          * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1604          */
1605         CALLB_CPR_EXIT(&cprinfo);
1606 
1607         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1608             "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1609 
1610         MI4_RELE(mi);
1611         zthread_exit();
1612         /* NOTREACHED */
1613 }
1614 
1615 /*
 * nfs4_async_stop:
1617  * Wait for all outstanding putpage operations and the inactive thread to
1618  * complete; nfs4_async_stop_sig() without interruptibility.
1619  */
1620 void
1621 nfs4_async_stop(struct vfs *vfsp)
1622 {
1623         mntinfo4_t *mi = VFTOMI4(vfsp);
1624 
1625         /*
1626          * Wait for all outstanding async operations to complete and for
1627          * worker threads to exit.
1628          */
1629         mutex_enter(&mi->mi_async_lock);
1630         mi->mi_max_threads = 0;
1631         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1632         while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1633             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1634                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635 
1636         /*
1637          * Wait for the inactive thread to finish doing what it's doing.  It
1638          * won't exit until the last reference to the vfs_t goes away.
1639          */
1640         if (mi->mi_inactive_thread != NULL) {
1641                 mutex_enter(&mi->mi_lock);
1642                 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1643                     (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1644                         mutex_exit(&mi->mi_lock);
1645                         cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1646                         mutex_enter(&mi->mi_lock);
1647                 }
1648                 mutex_exit(&mi->mi_lock);
1649         }
1650         mutex_exit(&mi->mi_async_lock);
1651 }
1652 
1653 /*
 * nfs4_async_stop_sig:
1655  * Wait for all outstanding putpage operations and the inactive thread to
1656  * complete. If a signal is delivered we will abort and return non-zero;
1657  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1658  * need to make it interruptible.
1659  */
1660 int
1661 nfs4_async_stop_sig(struct vfs *vfsp)
1662 {
1663         mntinfo4_t *mi = VFTOMI4(vfsp);
1664         ushort_t omax;
1665         bool_t intr = FALSE;
1666 
1667         /*
1668          * Wait for all outstanding putpage operations to complete and for
1669          * worker threads to exit.
1670          */
1671         mutex_enter(&mi->mi_async_lock);
1672         omax = mi->mi_max_threads;
1673         mi->mi_max_threads = 0;
1674         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1675         while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1676             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1677                 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1678                         intr = TRUE;
1679                         goto interrupted;
1680                 }
1681         }
1682 
1683         /*
1684          * Wait for the inactive thread to finish doing what it's doing.  It
         * won't exit until the last reference to the vfs_t goes away.
1686          */
1687         if (mi->mi_inactive_thread != NULL) {
1688                 mutex_enter(&mi->mi_lock);
1689                 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1690                     (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1691                         mutex_exit(&mi->mi_lock);
1692                         if (!cv_wait_sig(&mi->mi_async_cv,
1693                             &mi->mi_async_lock)) {
1694                                 intr = TRUE;
1695                                 goto interrupted;
1696                         }
1697                         mutex_enter(&mi->mi_lock);
1698                 }
1699                 mutex_exit(&mi->mi_lock);
1700         }
1701 interrupted:
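        /*
         * If the wait was interrupted, restore the saved thread limit so
         * that async i/o can resume, since the interrupted unmount will
         * not proceed.  Otherwise leave mi_max_threads at zero for the
         * caller tearing down the file system.
         */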
1702         if (intr)
1703                 mi->mi_max_threads = omax;
1704         mutex_exit(&mi->mi_async_lock);
1705 
1706         return (intr);
1707 }
1708 
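/*
 * nfs4_async_putapage(), nfs4_async_pageio(), nfs4_async_readdir() and
 * nfs4_async_commit() below all follow the same enqueue pattern: allocate
 * a request structure (falling back to a synchronous or degraded path if
 * the allocation fails or async i/o is unavailable), take holds on the
 * vnode and credentials, link the request onto the tail of the matching
 * mi_async_reqs[] list, and signal mi_async_reqs_cv to notify the async
 * manager thread that new work is queued.
 */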
1709 int
1710 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1711     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1712     u_offset_t, size_t, int, cred_t *))
1713 {
1714         rnode4_t *rp;
1715         mntinfo4_t *mi;
1716         struct nfs4_async_reqs *args;
1717 
1718         ASSERT(flags & B_ASYNC);
1719         ASSERT(vp->v_vfsp != NULL);
1720 
1721         rp = VTOR4(vp);
1722         ASSERT(rp->r_count > 0);
1723 
1724         mi = VTOMI4(vp);
1725 
1726         /*
1727          * If we can't allocate a request structure, do the putpage
1728          * operation synchronously in this thread's context.
1729          */
1730         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1731                 goto noasync;
1732 
1733         args->a_next = NULL;
1734 #ifdef DEBUG
1735         args->a_queuer = curthread;
1736 #endif
1737         VN_HOLD(vp);
1738         args->a_vp = vp;
1739         ASSERT(cr != NULL);
1740         crhold(cr);
1741         args->a_cred = cr;
1742         args->a_io = NFS4_PUTAPAGE;
1743         args->a_nfs4_putapage = putapage;
1744         args->a_nfs4_pp = pp;
1745         args->a_nfs4_off = off;
1746         args->a_nfs4_len = (uint_t)len;
1747         args->a_nfs4_flags = flags;
1748 
1749         mutex_enter(&mi->mi_async_lock);
1750 
1751         /*
1752          * If asyncio has been disabled, then make a synchronous request.
         * This check is done a second time in case async i/o was disabled
1754          * while this thread was blocked waiting for memory pressure to
1755          * reduce or for the queue to drain.
1756          */
1757         if (mi->mi_max_threads == 0) {
1758                 mutex_exit(&mi->mi_async_lock);
1759 
1760                 VN_RELE(vp);
1761                 crfree(cr);
1762                 kmem_free(args, sizeof (*args));
1763                 goto noasync;
1764         }
1765 
1766         /*
1767          * Link request structure into the async list and
1768          * wakeup async thread to do the i/o.
1769          */
1770         if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1771                 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1772                 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1773         } else {
1774                 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1775                 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1776         }
1777 
1778         mutex_enter(&rp->r_statelock);
1779         rp->r_count++;
1780         rp->r_awcount++;
1781         mutex_exit(&rp->r_statelock);
1782 
1783         if (mi->mi_io_kstats) {
1784                 mutex_enter(&mi->mi_lock);
1785                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1786                 mutex_exit(&mi->mi_lock);
1787         }
1788 
1789         mi->mi_async_req_count++;
1790         ASSERT(mi->mi_async_req_count != 0);
1791         cv_signal(&mi->mi_async_reqs_cv);
1792         mutex_exit(&mi->mi_async_lock);
1793         return (0);
1794 
1795 noasync:
1796 
1797         if (curproc == proc_pageout || curproc == proc_fsflush) {
1798                 /*
1799                  * If we get here in the context of the pageout/fsflush,
1800                  * or we have run out of memory or we're attempting to
1801                  * unmount we refuse to do a sync write, because this may
1802                  * hang pageout/fsflush and the machine. In this case,
1803                  * we just re-mark the page as dirty and punt on the page.
1804                  *
1805                  * Make sure B_FORCE isn't set.  We can re-mark the
1806                  * pages as dirty and unlock the pages in one swoop by
1807                  * passing in B_ERROR to pvn_write_done().  However,
1808                  * we should make sure B_FORCE isn't set - we don't
1809                  * want the page tossed before it gets written out.
1810                  */
1811                 if (flags & B_FORCE)
1812                         flags &= ~(B_INVAL | B_FORCE);
1813                 pvn_write_done(pp, flags | B_ERROR);
1814                 return (0);
1815         }
1816 
1817         if (nfs_zone() != mi->mi_zone) {
1818                 /*
1819                  * So this was a cross-zone sync putpage.
1820                  *
1821                  * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1822                  * as dirty and unlock them.
1823                  *
1824                  * We don't want to clear B_FORCE here as the caller presumably
1825                  * knows what they're doing if they set it.
1826                  */
1827                 pvn_write_done(pp, flags | B_ERROR);
1828                 return (EPERM);
1829         }
1830         return ((*putapage)(vp, pp, off, len, flags, cr));
1831 }
1832 
1833 int
1834 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1835     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1836     size_t, int, cred_t *))
1837 {
1838         rnode4_t *rp;
1839         mntinfo4_t *mi;
1840         struct nfs4_async_reqs *args;
1841 
1842         ASSERT(flags & B_ASYNC);
1843         ASSERT(vp->v_vfsp != NULL);
1844 
1845         rp = VTOR4(vp);
1846         ASSERT(rp->r_count > 0);
1847 
1848         mi = VTOMI4(vp);
1849 
1850         /*
1851          * If we can't allocate a request structure, do the pageio
1852          * request synchronously in this thread's context.
1853          */
1854         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1855                 goto noasync;
1856 
1857         args->a_next = NULL;
1858 #ifdef DEBUG
1859         args->a_queuer = curthread;
1860 #endif
1861         VN_HOLD(vp);
1862         args->a_vp = vp;
1863         ASSERT(cr != NULL);
1864         crhold(cr);
1865         args->a_cred = cr;
1866         args->a_io = NFS4_PAGEIO;
1867         args->a_nfs4_pageio = pageio;
1868         args->a_nfs4_pp = pp;
1869         args->a_nfs4_off = io_off;
1870         args->a_nfs4_len = (uint_t)io_len;
1871         args->a_nfs4_flags = flags;
1872 
1873         mutex_enter(&mi->mi_async_lock);
1874 
1875         /*
1876          * If asyncio has been disabled, then make a synchronous request.
         * This check is done a second time in case async i/o was disabled
1878          * while this thread was blocked waiting for memory pressure to
1879          * reduce or for the queue to drain.
1880          */
1881         if (mi->mi_max_threads == 0) {
1882                 mutex_exit(&mi->mi_async_lock);
1883 
1884                 VN_RELE(vp);
1885                 crfree(cr);
1886                 kmem_free(args, sizeof (*args));
1887                 goto noasync;
1888         }
1889 
1890         /*
1891          * Link request structure into the async list and
1892          * wakeup async thread to do the i/o.
1893          */
1894         if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1895                 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1896                 mi->mi_async_tail[NFS4_PAGEIO] = args;
1897         } else {
1898                 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1899                 mi->mi_async_tail[NFS4_PAGEIO] = args;
1900         }
1901 
1902         mutex_enter(&rp->r_statelock);
1903         rp->r_count++;
1904         rp->r_awcount++;
1905         mutex_exit(&rp->r_statelock);
1906 
1907         if (mi->mi_io_kstats) {
1908                 mutex_enter(&mi->mi_lock);
1909                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1910                 mutex_exit(&mi->mi_lock);
1911         }
1912 
1913         mi->mi_async_req_count++;
1914         ASSERT(mi->mi_async_req_count != 0);
1915         cv_signal(&mi->mi_async_reqs_cv);
1916         mutex_exit(&mi->mi_async_lock);
1917         return (0);
1918 
1919 noasync:
1920         /*
1921          * If we can't do it ASYNC, for reads we do nothing (but cleanup
1922          * the page list), for writes we do it synchronously, except for
1923          * proc_pageout/proc_fsflush as described below.
1924          */
1925         if (flags & B_READ) {
1926                 pvn_read_done(pp, flags | B_ERROR);
1927                 return (0);
1928         }
1929 
1930         if (curproc == proc_pageout || curproc == proc_fsflush) {
1931                 /*
1932                  * If we get here in the context of the pageout/fsflush,
1933                  * we refuse to do a sync write, because this may hang
1934                  * pageout/fsflush (and the machine). In this case, we just
1935                  * re-mark the page as dirty and punt on the page.
1936                  *
1937                  * Make sure B_FORCE isn't set.  We can re-mark the
1938                  * pages as dirty and unlock the pages in one swoop by
1939                  * passing in B_ERROR to pvn_write_done().  However,
1940                  * we should make sure B_FORCE isn't set - we don't
1941                  * want the page tossed before it gets written out.
1942                  */
1943                 if (flags & B_FORCE)
1944                         flags &= ~(B_INVAL | B_FORCE);
1945                 pvn_write_done(pp, flags | B_ERROR);
1946                 return (0);
1947         }
1948 
1949         if (nfs_zone() != mi->mi_zone) {
1950                 /*
1951                  * So this was a cross-zone sync pageio.  We pass in B_ERROR
1952                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1953                  * them.
1954                  *
1955                  * We don't want to clear B_FORCE here as the caller presumably
1956                  * knows what they're doing if they set it.
1957                  */
1958                 pvn_write_done(pp, flags | B_ERROR);
1959                 return (EPERM);
1960         }
1961         return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1962 }
1963 
1964 void
1965 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1966     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1967 {
1968         rnode4_t *rp;
1969         mntinfo4_t *mi;
1970         struct nfs4_async_reqs *args;
1971 
1972         rp = VTOR4(vp);
1973         ASSERT(rp->r_freef == NULL);
1974 
1975         mi = VTOMI4(vp);
1976 
1977         /*
1978          * If we can't allocate a request structure, skip the readdir.
1979          */
1980         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1981                 goto noasync;
1982 
1983         args->a_next = NULL;
1984 #ifdef DEBUG
1985         args->a_queuer = curthread;
1986 #endif
1987         VN_HOLD(vp);
1988         args->a_vp = vp;
1989         ASSERT(cr != NULL);
1990         crhold(cr);
1991         args->a_cred = cr;
1992         args->a_io = NFS4_READDIR;
1993         args->a_nfs4_readdir = readdir;
1994         args->a_nfs4_rdc = rdc;
1995 
1996         mutex_enter(&mi->mi_async_lock);
1997 
1998         /*
1999          * If asyncio has been disabled, then skip this request
2000          */
2001         if (mi->mi_max_threads == 0) {
2002                 mutex_exit(&mi->mi_async_lock);
2003 
2004                 VN_RELE(vp);
2005                 crfree(cr);
2006                 kmem_free(args, sizeof (*args));
2007                 goto noasync;
2008         }
2009 
2010         /*
2011          * Link request structure into the async list and
2012          * wakeup async thread to do the i/o.
2013          */
2014         if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2015                 mi->mi_async_reqs[NFS4_READDIR] = args;
2016                 mi->mi_async_tail[NFS4_READDIR] = args;
2017         } else {
2018                 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2019                 mi->mi_async_tail[NFS4_READDIR] = args;
2020         }
2021 
2022         mutex_enter(&rp->r_statelock);
2023         rp->r_count++;
2024         mutex_exit(&rp->r_statelock);
2025 
2026         if (mi->mi_io_kstats) {
2027                 mutex_enter(&mi->mi_lock);
2028                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2029                 mutex_exit(&mi->mi_lock);
2030         }
2031 
2032         mi->mi_async_req_count++;
2033         ASSERT(mi->mi_async_req_count != 0);
2034         cv_signal(&mi->mi_async_reqs_cv);
2035         mutex_exit(&mi->mi_async_lock);
2036         return;
2037 
2038 noasync:
2039         mutex_enter(&rp->r_statelock);
2040         rdc->entries = NULL;
2041         /*
2042          * Indicate that no one is trying to fill this entry and
2043          * it still needs to be filled.
2044          */
2045         rdc->flags &= ~RDDIR;
2046         rdc->flags |= RDDIRREQ;
2047         rddir4_cache_rele(rp, rdc);
2048         mutex_exit(&rp->r_statelock);
2049 }
2050 
2051 void
2052 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2053     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2054     cred_t *))
2055 {
2056         rnode4_t *rp;
2057         mntinfo4_t *mi;
2058         struct nfs4_async_reqs *args;
2059         page_t *pp;
2060 
2061         rp = VTOR4(vp);
2062         mi = VTOMI4(vp);
2063 
2064         /*
2065          * If we can't allocate a request structure, do the commit
2066          * operation synchronously in this thread's context.
2067          */
2068         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2069                 goto noasync;
2070 
2071         args->a_next = NULL;
2072 #ifdef DEBUG
2073         args->a_queuer = curthread;
2074 #endif
2075         VN_HOLD(vp);
2076         args->a_vp = vp;
2077         ASSERT(cr != NULL);
2078         crhold(cr);
2079         args->a_cred = cr;
2080         args->a_io = NFS4_COMMIT;
2081         args->a_nfs4_commit = commit;
2082         args->a_nfs4_plist = plist;
2083         args->a_nfs4_offset = offset;
2084         args->a_nfs4_count = count;
2085 
2086         mutex_enter(&mi->mi_async_lock);
2087 
2088         /*
2089          * If asyncio has been disabled, then make a synchronous request.
         * This check is done a second time in case async i/o was disabled
2091          * while this thread was blocked waiting for memory pressure to
2092          * reduce or for the queue to drain.
2093          */
2094         if (mi->mi_max_threads == 0) {
2095                 mutex_exit(&mi->mi_async_lock);
2096 
2097                 VN_RELE(vp);
2098                 crfree(cr);
2099                 kmem_free(args, sizeof (*args));
2100                 goto noasync;
2101         }
2102 
2103         /*
2104          * Link request structure into the async list and
2105          * wakeup async thread to do the i/o.
2106          */
2107         if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2108                 mi->mi_async_reqs[NFS4_COMMIT] = args;
2109                 mi->mi_async_tail[NFS4_COMMIT] = args;
2110         } else {
2111                 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2112                 mi->mi_async_tail[NFS4_COMMIT] = args;
2113         }
2114 
2115         mutex_enter(&rp->r_statelock);
2116         rp->r_count++;
2117         mutex_exit(&rp->r_statelock);
2118 
2119         if (mi->mi_io_kstats) {
2120                 mutex_enter(&mi->mi_lock);
2121                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2122                 mutex_exit(&mi->mi_lock);
2123         }
2124 
2125         mi->mi_async_req_count++;
2126         ASSERT(mi->mi_async_req_count != 0);
2127         cv_signal(&mi->mi_async_reqs_cv);
2128         mutex_exit(&mi->mi_async_lock);
2129         return;
2130 
2131 noasync:
2132         if (curproc == proc_pageout || curproc == proc_fsflush ||
2133             nfs_zone() != mi->mi_zone) {
2134                 while (plist != NULL) {
2135                         pp = plist;
2136                         page_sub(&plist, pp);
2137                         pp->p_fsdata = C_COMMIT;
2138                         page_unlock(pp);
2139                 }
2140                 return;
2141         }
2142         (*commit)(vp, plist, offset, count, cr);
2143 }
2144 
2145 /*
2146  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2147  * reference to the vnode is handed over to the thread; the caller should
2148  * no longer refer to the vnode.
2149  *
2150  * Unlike most of the async routines, this handoff is needed for
2151  * correctness reasons, not just performance.  So doing operations in the
2152  * context of the current thread is not an option.
2153  */
2154 void
2155 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2156 {
2157         mntinfo4_t *mi;
2158         struct nfs4_async_reqs *args;
2159         boolean_t signal_inactive_thread = B_FALSE;
2160 
2161         mi = VTOMI4(vp);
2162 
2163         args = kmem_alloc(sizeof (*args), KM_SLEEP);
2164         args->a_next = NULL;
2165 #ifdef DEBUG
2166         args->a_queuer = curthread;
2167 #endif
2168         args->a_vp = vp;
2169         ASSERT(cr != NULL);
2170         crhold(cr);
2171         args->a_cred = cr;
2172         args->a_io = NFS4_INACTIVE;
2173 
2174         /*
2175          * Note that we don't check mi->mi_max_threads here, since we
2176          * *need* to get rid of this vnode regardless of whether someone
2177          * set nfs4_max_threads to zero in /etc/system.
2178          *
2179          * The manager thread knows about this and is willing to create
2180          * at least one thread to accommodate us.
2181          */
2182         mutex_enter(&mi->mi_async_lock);
2183         if (mi->mi_inactive_thread == NULL) {
2184                 rnode4_t *rp;
2185                 vnode_t *unldvp = NULL;
2186                 char *unlname;
2187                 cred_t *unlcred;
2188 
2189                 mutex_exit(&mi->mi_async_lock);
2190                 /*
2191                  * We just need to free up the memory associated with the
2192                  * vnode, which can be safely done from within the current
2193                  * context.
2194                  */
2195                 crfree(cr);     /* drop our reference */
2196                 kmem_free(args, sizeof (*args));
2197                 rp = VTOR4(vp);
2198                 mutex_enter(&rp->r_statelock);
2199                 if (rp->r_unldvp != NULL) {
2200                         unldvp = rp->r_unldvp;
2201                         rp->r_unldvp = NULL;
2202                         unlname = rp->r_unlname;
2203                         rp->r_unlname = NULL;
2204                         unlcred = rp->r_unlcred;
2205                         rp->r_unlcred = NULL;
2206                 }
2207                 mutex_exit(&rp->r_statelock);
2208                 /*
2209                  * No need to explicitly throw away any cached pages.  The
2210                  * eventual r4inactive() will attempt a synchronous
2211                  * VOP_PUTPAGE() which will immediately fail since the request
2212                  * is coming from the wrong zone, and then will proceed to call
2213                  * nfs4_invalidate_pages() which will clean things up for us.
2214                  *
2215                  * Throw away the delegation here so rp4_addfree()'s attempt to
2216                  * return any existing delegations becomes a no-op.
2217                  */
2218                 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2219                         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2220                             FALSE);
2221                         (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2222                         nfs_rw_exit(&mi->mi_recovlock);
2223                 }
2224                 nfs4_clear_open_streams(rp);
2225 
2226                 rp4_addfree(rp, cr);
2227                 if (unldvp != NULL) {
2228                         kmem_free(unlname, MAXNAMELEN);
2229                         VN_RELE(unldvp);
2230                         crfree(unlcred);
2231                 }
2232                 return;
2233         }
2234 
2235         if (mi->mi_manager_thread == NULL) {
2236                 /*
2237                  * We want to talk to the inactive thread.
2238                  */
2239                 signal_inactive_thread = B_TRUE;
2240         }
2241 
2242         /*
2243          * Enqueue the vnode and wake up either the special thread (empty
2244          * list) or an async thread.
2245          */
2246         if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2247                 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2248                 mi->mi_async_tail[NFS4_INACTIVE] = args;
2249                 signal_inactive_thread = B_TRUE;
2250         } else {
2251                 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2252                 mi->mi_async_tail[NFS4_INACTIVE] = args;
2253         }
2254         if (signal_inactive_thread) {
2255                 cv_signal(&mi->mi_inact_req_cv);
2256         } else  {
2257                 mi->mi_async_req_count++;
2258                 ASSERT(mi->mi_async_req_count != 0);
2259                 cv_signal(&mi->mi_async_reqs_cv);
2260         }
2261 
2262         mutex_exit(&mi->mi_async_lock);
2263 }
2264 
2265 int
2266 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2267 {
2268         int pagecreate;
2269         int n;
2270         int saved_n;
2271         caddr_t saved_base;
2272         u_offset_t offset;
2273         int error;
2274         int sm_error;
2275         vnode_t *vp = RTOV(rp);
2276 
2277         ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2278         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2279         if (!vpm_enable) {
2280                 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2281         }
2282 
2283         /*
2284          * Move bytes in at most PAGESIZE chunks. We must avoid
2285          * spanning pages in uiomove() because page faults may cause
2286          * the cache to be invalidated out from under us. The r_size is not
2287          * updated until after the uiomove. If we push the last page of a
2288          * file before r_size is correct, we will lose the data written past
2289          * the current (and invalid) r_size.
2290          */
2291         do {
2292                 offset = uio->uio_loffset;
2293                 pagecreate = 0;
2294 
2295                 /*
2296                  * n is the number of bytes required to satisfy the request
2297                  *   or the number of bytes to fill out the page.
2298                  */
2299                 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
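                /*
                 * For example, with PAGESIZE 0x1000 and uio_loffset
                 * 0x1234, (PAGESIZE - (offset & PAGEOFFSET)) is
                 * 0x1000 - 0x234 = 0xdcc, so at most 0xdcc bytes are
                 * moved to finish out that page (less if tcount is
                 * smaller).
                 */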
2300 
2301                 /*
2302                  * Check to see if we can skip reading in the page
2303                  * and just allocate the memory.  We can do this
2304                  * if we are going to rewrite the entire mapping
2305                  * or if we are going to write to or beyond the current
2306                  * end of file from the beginning of the mapping.
2307                  *
2308                  * The read of r_size is now protected by r_statelock.
2309                  */
2310                 mutex_enter(&rp->r_statelock);
2311                 /*
2312                  * When pgcreated is nonzero the caller has already done
2313                  * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2314                  * segkpm this means we already have at least one page
2315                  * created and mapped at base.
2316                  */
2317                 pagecreate = pgcreated ||
2318                     ((offset & PAGEOFFSET) == 0 &&
2319                     (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2320 
2321                 mutex_exit(&rp->r_statelock);
2322 
2323                 if (!vpm_enable && pagecreate) {
2324                         /*
2325                          * The last argument tells segmap_pagecreate() to
2326                          * always lock the page, as opposed to sometimes
2327                          * returning with the page locked. This way we avoid a
2328                          * fault on the ensuing uiomove(), but also
2329                          * more importantly (to fix bug 1094402) we can
2330                          * call segmap_fault() to unlock the page in all
2331                          * cases. An alternative would be to modify
2332                          * segmap_pagecreate() to tell us when it is
2333                          * locking a page, but that's a fairly major
2334                          * interface change.
2335                          */
2336                         if (pgcreated == 0)
2337                                 (void) segmap_pagecreate(segkmap, base,
2338                                     (uint_t)n, 1);
2339                         saved_base = base;
2340                         saved_n = n;
2341                 }
2342 
2343                 /*
2344                  * The number of bytes of data in the last page can not
2345                  * be accurately be determined while page is being
2346                  * uiomove'd to and the size of the file being updated.
2347                  * Thus, inform threads which need to know accurately
2348                  * how much data is in the last page of the file.  They
2349                  * will not do the i/o immediately, but will arrange for
2350                  * the i/o to happen later when this modify operation
2351                  * will have finished.
2352                  */
2353                 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2354                 mutex_enter(&rp->r_statelock);
2355                 rp->r_flags |= R4MODINPROGRESS;
2356                 rp->r_modaddr = (offset & MAXBMASK);
2357                 mutex_exit(&rp->r_statelock);
2358 
2359                 if (vpm_enable) {
2360                         /*
2361                          * Copy data. If new pages are created, part of
                         * the page that is not written will be initialized
2363                          * with zeros.
2364                          */
2365                         error = vpm_data_copy(vp, offset, n, uio,
2366                             !pagecreate, NULL, 0, S_WRITE);
2367                 } else {
2368                         error = uiomove(base, n, UIO_WRITE, uio);
2369                 }
2370 
2371                 /*
2372                  * r_size is the maximum number of
2373                  * bytes known to be in the file.
2374                  * Make sure it is at least as high as the
2375                  * first unwritten byte pointed to by uio_loffset.
2376                  */
2377                 mutex_enter(&rp->r_statelock);
2378                 if (rp->r_size < uio->uio_loffset)
2379                         rp->r_size = uio->uio_loffset;
2380                 rp->r_flags &= ~R4MODINPROGRESS;
2381                 rp->r_flags |= R4DIRTY;
2382                 mutex_exit(&rp->r_statelock);
2383 
2384                 /* n = # of bytes written */
2385                 n = (int)(uio->uio_loffset - offset);
2386 
2387                 if (!vpm_enable) {
2388                         base += n;
2389                 }
2390 
2391                 tcount -= n;
2392                 /*
2393                  * If we created pages w/o initializing them completely,
2394                  * we need to zero the part that wasn't set up.
                 * This happens in most EOF write cases and if
2396                  * we had some sort of error during the uiomove.
2397                  */
2398                 if (!vpm_enable && pagecreate) {
2399                         if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2400                                 (void) kzero(base, PAGESIZE - n);
2401 
2402                         if (pgcreated) {
2403                                 /*
                                 * The caller is responsible for this page;
                                 * it was not created in this loop.
2406                                  */
2407                                 pgcreated = 0;
2408                         } else {
2409                                 /*
2410                                  * For bug 1094402: segmap_pagecreate locks
2411                                  * page. Unlock it. This also unlocks the
2412                                  * pages allocated by page_create_va() in
2413                                  * segmap_pagecreate().
2414                                  */
2415                                 sm_error = segmap_fault(kas.a_hat, segkmap,
2416                                     saved_base, saved_n,
2417                                     F_SOFTUNLOCK, S_WRITE);
2418                                 if (error == 0)
2419                                         error = sm_error;
2420                         }
2421                 }
2422         } while (tcount > 0 && error == 0);
2423 
2424         return (error);
2425 }
2426 
2427 int
2428 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2429 {
2430         rnode4_t *rp;
2431         page_t *pp;
2432         u_offset_t eoff;
2433         u_offset_t io_off;
2434         size_t io_len;
2435         int error;
2436         int rdirty;
2437         int err;
2438 
2439         rp = VTOR4(vp);
2440         ASSERT(rp->r_count > 0);
2441 
2442         if (!nfs4_has_pages(vp))
2443                 return (0);
2444 
2445         ASSERT(vp->v_type != VCHR);
2446 
2447         /*
2448          * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2449          * writes.  B_FORCE is set to force the VM system to actually
2450          * invalidate the pages, even if the i/o failed.  The pages
2451          * need to get invalidated because they can't be written out
2452          * because there isn't any space left on either the server's
2453          * file system or in the user's disk quota.  The B_FREE bit
2454          * is cleared to avoid confusion as to whether this is a
2455          * request to place the page on the freelist or to destroy
2456          * it.
2457          */
2458         if ((rp->r_flags & R4OUTOFSPACE) ||
2459             (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2460                 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2461 
2462         if (len == 0) {
2463                 /*
2464                  * If doing a full file synchronous operation, then clear
2465                  * the R4DIRTY bit.  If a page gets dirtied while the flush
2466                  * is happening, then R4DIRTY will get set again.  The
2467                  * R4DIRTY bit must get cleared before the flush so that
2468                  * we don't lose this information.
2469                  *
                 * If there are no full file async write operations
                 * pending and the R4DIRTY bit is set, clear it.
2472                  */
2473                 if (off == (u_offset_t)0 &&
2474                     !(flags & B_ASYNC) &&
2475                     (rp->r_flags & R4DIRTY)) {
2476                         mutex_enter(&rp->r_statelock);
2477                         rdirty = (rp->r_flags & R4DIRTY);
2478                         rp->r_flags &= ~R4DIRTY;
2479                         mutex_exit(&rp->r_statelock);
2480                 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2481                         mutex_enter(&rp->r_statelock);
2482                         if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2483                                 rdirty = (rp->r_flags & R4DIRTY);
2484                                 rp->r_flags &= ~R4DIRTY;
2485                         }
2486                         mutex_exit(&rp->r_statelock);
2487                 } else
2488                         rdirty = 0;
2489 
2490                 /*
2491                  * Search the entire vp list for pages >= off, and flush
2492                  * the dirty pages.
2493                  */
2494                 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2495                     flags, cr);
2496 
2497                 /*
2498                  * If an error occurred and the file was marked as dirty
2499                  * before and we aren't forcibly invalidating pages, then
2500                  * reset the R4DIRTY flag.
2501                  */
2502                 if (error && rdirty &&
2503                     (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2504                         mutex_enter(&rp->r_statelock);
2505                         rp->r_flags |= R4DIRTY;
2506                         mutex_exit(&rp->r_statelock);
2507                 }
2508         } else {
2509                 /*
2510                  * Do a range from [off...off + len) looking for pages
2511                  * to deal with.
2512                  */
2513                 error = 0;
2514                 io_len = 0;
2515                 eoff = off + len;
2516                 mutex_enter(&rp->r_statelock);
2517                 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2518                     io_off += io_len) {
2519                         mutex_exit(&rp->r_statelock);
2520                         /*
2521                          * If we are not invalidating, synchronously
2522                          * freeing or writing pages use the routine
2523                          * page_lookup_nowait() to prevent reclaiming
2524                          * them from the free list.
2525                          */
2526                         if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2527                                 pp = page_lookup(vp, io_off,
2528                                     (flags & (B_INVAL | B_FREE)) ?
2529                                     SE_EXCL : SE_SHARED);
2530                         } else {
2531                                 pp = page_lookup_nowait(vp, io_off,
2532                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2533                         }
2534 
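                        /*
                         * If there is no page at this offset or the page
                         * isn't dirty, there is nothing to write; just
                         * advance io_off by one page on the next pass.
                         */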
2535                         if (pp == NULL || !pvn_getdirty(pp, flags))
2536                                 io_len = PAGESIZE;
2537                         else {
2538                                 err = (*rp->r_putapage)(vp, pp, &io_off,
2539                                     &io_len, flags, cr);
2540                                 if (!error)
2541                                         error = err;
2542                                 /*
2543                                  * "io_off" and "io_len" are returned as
2544                                  * the range of pages we actually wrote.
2545                                  * This allows us to skip ahead more quickly
2546                                  * since several pages may've been dealt
2547                                  * with by this iteration of the loop.
2548                                  */
2549                         }
2550                         mutex_enter(&rp->r_statelock);
2551                 }
2552                 mutex_exit(&rp->r_statelock);
2553         }
2554 
2555         return (error);
2556 }
2557 
2558 void
2559 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2560 {
2561         rnode4_t *rp;
2562 
2563         rp = VTOR4(vp);
2564         if (IS_SHADOW(vp, rp))
2565                 vp = RTOV4(rp);
2566         mutex_enter(&rp->r_statelock);
2567         while (rp->r_flags & R4TRUNCATE)
2568                 cv_wait(&rp->r_cv, &rp->r_statelock);
2569         rp->r_flags |= R4TRUNCATE;
2570         if (off == (u_offset_t)0) {
2571                 rp->r_flags &= ~R4DIRTY;
2572                 if (!(rp->r_flags & R4STALE))
2573                         rp->r_error = 0;
2574         }
2575         rp->r_truncaddr = off;
2576         mutex_exit(&rp->r_statelock);
2577         (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2578             B_INVAL | B_TRUNC, cr);
2579         mutex_enter(&rp->r_statelock);
2580         rp->r_flags &= ~R4TRUNCATE;
2581         cv_broadcast(&rp->r_cv);
2582         mutex_exit(&rp->r_statelock);
2583 }
2584 
2585 static int
2586 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2587 {
2588         mntinfo4_t *mi;
2589         struct mntinfo_kstat *mik;
2590         vfs_t *vfsp;
2591 
2592         /* this is a read-only kstat. Bail out on a write */
2593         if (rw == KSTAT_WRITE)
2594                 return (EACCES);
2595 
2596 
2597         /*
2598          * We don't want to wait here as kstat_chain_lock could be held by
2599          * dounmount(). dounmount() takes vfs_reflock before the chain lock
2600          * and thus could lead to a deadlock.
2601          */
2602         vfsp = (struct vfs *)ksp->ks_private;
2603 
2604         mi = VFTOMI4(vfsp);
2605         mik = (struct mntinfo_kstat *)ksp->ks_data;
2606 
2607         (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2608 
2609         mik->mik_vers = (uint32_t)mi->mi_vers;
2610         mik->mik_flags = mi->mi_flags;
2611         /*
2612          * The sv_secdata holds the flavor the client specifies.
2613          * If the client uses default and a security negotiation
2614          * occurs, sv_currsec will point to the current flavor
2615          * selected from the server flavor list.
2616          * sv_currsec is NULL if no security negotiation takes place.
2617          */
2618         mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2619             mi->mi_curr_serv->sv_currsec->secmod :
2620             mi->mi_curr_serv->sv_secdata->secmod;
2621         mik->mik_curread = (uint32_t)mi->mi_curread;
2622         mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2623         mik->mik_retrans = mi->mi_retrans;
2624         mik->mik_timeo = mi->mi_timeo;
2625         mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2626         mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2627         mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2628         mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2629         mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2630         mik->mik_failover = (uint32_t)mi->mi_failover;
2631         mik->mik_remap = (uint32_t)mi->mi_remap;
2632 
2633         (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2634 
2635         return (0);
2636 }
2637 
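/*
 * Create and install the per-mount kstats: an i/o kstat for read/write
 * statistics and a raw mntinfo kstat (filled in by nfs4_mnt_kstat_update()
 * above) describing the mount options and current server.  These are
 * exported through the kstat framework to user-level consumers such as
 * nfsstat(1M).
 */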
2638 void
2639 nfs4_mnt_kstat_init(struct vfs *vfsp)
2640 {
2641         mntinfo4_t *mi = VFTOMI4(vfsp);
2642 
2643         /*
2644          * PSARC 2001/697 Contract Private Interface
2645          * All nfs kstats are under SunMC contract
2646          * Please refer to the PSARC listed above and contact
2647          * SunMC before making any changes!
2648          *
2649          * Changes must be reviewed by Solaris File Sharing
2650          * Changes must be communicated to contract-2001-697@sun.com
2651          *
2652          */
2653 
2654         mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2655             NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2656         if (mi->mi_io_kstats) {
2657                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2658                         kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2659                 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2660                 kstat_install(mi->mi_io_kstats);
2661         }
2662 
2663         if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2664             getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2665             sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2666                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2667                         kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2668                 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2669                 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2670                 kstat_install(mi->mi_ro_kstats);
2671         }
2672 
2673         nfs4_mnt_recov_kstat_init(vfsp);
2674 }
2675 
2676 void
2677 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2678 {
2679         mntinfo4_t *mi;
2680         clock_t now = ddi_get_lbolt();
2681 
2682         mi = VTOMI4(vp);
2683         /*
2684          * In case of forced unmount, do not print any messages
2685          * since it can flood the console with error messages.
2686          */
2687         if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2688                 return;
2689 
2690         /*
         * If the mount point is dead (not recoverable), do not
         * print error messages that can flood the console.
2693          */
2694         if (mi->mi_flags & MI4_RECOV_FAIL)
2695                 return;
2696 
2697         /*
2698          * No use in flooding the console with ENOSPC
2699          * messages from the same file system.
2700          */
2701         if ((error != ENOSPC && error != EDQUOT) ||
2702             now - mi->mi_printftime > 0) {
2703                 zoneid_t zoneid = mi->mi_zone->zone_id;
2704 
2705 #ifdef DEBUG
2706                 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2707                     mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2708 #else
2709                 nfs_perror(error, "NFS write error on host %s: %m.\n",
2710                     VTOR4(vp)->r_server->sv_hostname, NULL);
2711 #endif
2712                 if (error == ENOSPC || error == EDQUOT) {
2713                         zcmn_err(zoneid, CE_CONT,
2714                             "^File: userid=%d, groupid=%d\n",
2715                             crgetuid(cr), crgetgid(cr));
2716                         if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2717                             crgetgid(curthread->t_cred) != crgetgid(cr)) {
2718                                 zcmn_err(zoneid, CE_CONT,
2719                                     "^User: userid=%d, groupid=%d\n",
2720                                     crgetuid(curthread->t_cred),
2721                                     crgetgid(curthread->t_cred));
2722                         }
2723                         mi->mi_printftime = now +
2724                             nfs_write_error_interval * hz;
2725                 }
2726                 sfh4_printfhandle(VTOR4(vp)->r_fh);
2727 #ifdef DEBUG
2728                 if (error == EACCES) {
2729                         zcmn_err(zoneid, CE_CONT,
2730                             "nfs_bio: cred is%s kcred\n",
2731                             cr == kcred ? "" : " not");
2732                 }
2733 #endif
2734         }
2735 }
2736 
2737 /*
2738  * Return non-zero if the given file can be safely memory mapped.  Locks
2739  * are safe if whole-file (length and offset are both zero).
2740  */
2741 
2742 #define SAFE_LOCK(flk)  ((flk).l_start == 0 && (flk).l_len == 0)
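/*
 * For example, a lock set via fcntl(F_SETLK) with l_whence == SEEK_SET,
 * l_start == 0 and l_len == 0 covers the entire file and is therefore
 * considered safe for mapping.
 */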
2743 
2744 static int
2745 nfs4_safemap(const vnode_t *vp)
2746 {
2747         locklist_t      *llp, *next_llp;
2748         int             safe = 1;
2749         rnode4_t        *rp = VTOR4(vp);
2750 
2751         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2752 
2753         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2754             "vp = %p", (void *)vp));
2755 
2756         /*
2757          * Review all the locks for the vnode, both ones that have been
2758          * acquired and ones that are pending.  We assume that
2759          * flk_active_locks_for_vp() has merged any locks that can be
2760          * merged (so that if a process has the entire file locked, it is
2761          * represented as a single lock).
2762          *
2763          * Note that we can't bail out of the loop if we find a non-safe
2764          * lock, because we have to free all the elements in the llp list.
2765          * We might be able to speed up this code slightly by not looking
2766          * at each lock's l_start and l_len fields once we've found a
2767          * non-safe lock.
2768          */
2769 
2770         llp = flk_active_locks_for_vp(vp);
2771         while (llp) {
2772                 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2773                     "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2774                     llp->ll_flock.l_start, llp->ll_flock.l_len));
2775                 if (!SAFE_LOCK(llp->ll_flock)) {
2776                         safe = 0;
2777                         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2778                             "nfs4_safemap: unsafe active lock (%" PRId64
2779                             ", %" PRId64 ")", llp->ll_flock.l_start,
2780                             llp->ll_flock.l_len));
2781                 }
2782                 next_llp = llp->ll_next;
2783                 VN_RELE(llp->ll_vp);
2784                 kmem_free(llp, sizeof (*llp));
2785                 llp = next_llp;
2786         }
2787 
2788         NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2789             safe ? "safe" : "unsafe"));
2790         return (safe);
2791 }
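
     /*
      * Illustrative sketch (not part of the original code): a caller that
      * wants to decide whether caching/mapping is currently safe would do
      * something like the following, holding r_lkserlock as a writer:
      *
      *         (void) nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, 0);
      *         safe = nfs4_safemap(vp);
      *         nfs_rw_exit(&rp->r_lkserlock);
      *
      * where rp is VTOR4(vp) and "safe" is a local int.
      */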
2792 
2793 /*
2794  * Return whether there is a lost LOCK or LOCKU queued up for the given
2795  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2796  */
2797 
2798 bool_t
2799 nfs4_map_lost_lock_conflict(vnode_t *vp)
2800 {
2801         bool_t conflict = FALSE;
2802         nfs4_lost_rqst_t *lrp;
2803         mntinfo4_t *mi = VTOMI4(vp);
2804 
2805         mutex_enter(&mi->mi_lock);
2806         for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2807             lrp = list_next(&mi->mi_lost_state, lrp)) {
2808                 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2809                         continue;
2810                 ASSERT(lrp->lr_vp != NULL);
2811                 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2812                         continue;       /* different file */
2813                 if (!SAFE_LOCK(*lrp->lr_flk)) {
2814                         conflict = TRUE;
2815                         break;
2816                 }
2817         }
2818 
2819         mutex_exit(&mi->mi_lock);
2820         return (conflict);
2821 }
2822 
2823 /*
2824  * nfs_lockcompletion:
2825  *
2826  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2827  * as non-cacheable (set the VNOCACHE bit).
2828  */
2829 
2830 void
2831 nfs4_lockcompletion(vnode_t *vp, int cmd)
2832 {
2833         rnode4_t *rp = VTOR4(vp);
2834 
2835         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2836         ASSERT(!IS_SHADOW(vp, rp));
2837 
2838         if (cmd == F_SETLK || cmd == F_SETLKW) {
2839 
2840                 if (!nfs4_safemap(vp)) {
2841                         mutex_enter(&vp->v_lock);
2842                         vp->v_flag |= VNOCACHE;
2843                         mutex_exit(&vp->v_lock);
2844                 } else {
2845                         mutex_enter(&vp->v_lock);
2846                         vp->v_flag &= ~VNOCACHE;
2847                         mutex_exit(&vp->v_lock);
2848                 }
2849         }
2850         /*
2851          * The cached attributes may be stale once the lock is acquired:
2852          * they were fetched when the file was opened, not when the lock
2853          * was granted.  Purge them here so that the next access fetches
2854          * fresh attributes from the server.
2855          */
2856         PURGE_ATTRCACHE4(vp);
2857 }
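
     /*
      * Illustrative sketch (assumed caller, not verbatim from the lock
      * path): after a SETLK/SETLKW request completes successfully, the
      * locking code is expected to call this routine while still holding
      * r_lkserlock as a writer, e.g.:
      *
      *         if (error == 0)
      *                 nfs4_lockcompletion(vp, cmd);
      *
      * where "error" is the result of the over-the-wire lock request.
      */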
2858 
2859 /* ARGSUSED */
2860 static void *
2861 nfs4_mi_init(zoneid_t zoneid)
2862 {
2863         struct mi4_globals *mig;
2864 
2865         mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2866         mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2867         list_create(&mig->mig_list, sizeof (mntinfo4_t),
2868             offsetof(mntinfo4_t, mi_zone_node));
2869         mig->mig_destructor_called = B_FALSE;
2870         return (mig);
2871 }
2872 
2873 /*
2874  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2875  * state and killing off threads.
2876  */
2877 /* ARGSUSED */
2878 static void
2879 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2880 {
2881         struct mi4_globals *mig = data;
2882         mntinfo4_t *mi;
2883         nfs4_server_t *np;
2884 
2885         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2886             "nfs4_mi_shutdown zone %d\n", zoneid));
2887         ASSERT(mig != NULL);
2888         for (;;) {
2889                 mutex_enter(&mig->mig_lock);
2890                 mi = list_head(&mig->mig_list);
2891                 if (mi == NULL) {
2892                         mutex_exit(&mig->mig_lock);
2893                         break;
2894                 }
2895 
2896                 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2897                     "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2898                 /*
2899                  * purge the DNLC for this filesystem
2900                  */
2901                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2902                 /*
2903                  * Tell existing async worker threads to exit.
2904                  */
2905                 mutex_enter(&mi->mi_async_lock);
2906                 mi->mi_max_threads = 0;
2907                 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2908                 /*
2909                  * Set the appropriate flags, signal and wait for both the
2910                  * async manager and the inactive thread to exit when they're
2911                  * done with their current work.
2912                  */
2913                 mutex_enter(&mi->mi_lock);
2914                 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2915                 mutex_exit(&mi->mi_lock);
2916                 mutex_exit(&mi->mi_async_lock);
2917                 if (mi->mi_manager_thread) {
2918                         nfs4_async_manager_stop(mi->mi_vfsp);
2919                 }
2920                 if (mi->mi_inactive_thread) {
2921                         mutex_enter(&mi->mi_async_lock);
2922                         cv_signal(&mi->mi_inact_req_cv);
2923                         /*
2924                          * Wait for the inactive thread to exit.
2925                          */
2926                         while (mi->mi_inactive_thread != NULL) {
2927                                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2928                         }
2929                         mutex_exit(&mi->mi_async_lock);
2930                 }
2931                 /*
2932                  * Wait for the recovery thread to complete; it signals
2933                  * when it is done using the "mi" structure and is about
2934                  * to exit.
2935                  */
2936                 mutex_enter(&mi->mi_lock);
2937                 while (mi->mi_in_recovery > 0)
2938                         cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2939                 mutex_exit(&mi->mi_lock);
2940                 /*
2941                  * This mi is done; remove it from the list.  The loop
2942                  * ends once every mi has been handled and the list is empty.
2943                  */
2944                 list_remove(&mig->mig_list, mi);
2945                 mutex_exit(&mig->mig_lock);
2946                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2947 
2948                 /*
2949                  * Release the holds on the vfs and the mi taken to prevent
2950                  * a race with zone shutdown (see nfs4_mi_zonelist_add()).
2951                  */
2952                 VFS_RELE(mi->mi_vfsp);
2953                 MI4_RELE(mi);
2954         }
2955         /*
2956          * Tell each renew thread in the zone to exit
2957          */
2958         mutex_enter(&nfs4_server_lst_lock);
2959         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2960                 mutex_enter(&np->s_lock);
2961                 if (np->zoneid == zoneid) {
2962                         /*
2963                          * We add another hold onto the nfs4_server_t
2964                          * because this will make sure that the nfs4_server_t
2965                          * stays around until nfs4_callback_fini_zone destroys
2966                          * the zone. This way, the renew thread can
2967                          * unconditionally release its holds on the
2968                          * nfs4_server_t.
2969                          */
2970                         np->s_refcnt++;
2971                         nfs4_mark_srv_dead(np);
2972                 }
2973                 mutex_exit(&np->s_lock);
2974         }
2975         mutex_exit(&nfs4_server_lst_lock);
2976 }
2977 
2978 static void
2979 nfs4_mi_free_globals(struct mi4_globals *mig)
2980 {
2981         list_destroy(&mig->mig_list);    /* makes sure the list is empty */
2982         mutex_destroy(&mig->mig_lock);
2983         kmem_free(mig, sizeof (*mig));
2984 }
2985 
2986 /* ARGSUSED */
2987 static void
2988 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2989 {
2990         struct mi4_globals *mig = data;
2991 
2992         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2993             "nfs4_mi_destroy zone %d\n", zoneid));
2994         ASSERT(mig != NULL);
2995         mutex_enter(&mig->mig_lock);
2996         if (list_head(&mig->mig_list) != NULL) {
2997                 /* Still waiting for VFS_FREEVFS() */
2998                 mig->mig_destructor_called = B_TRUE;
2999                 mutex_exit(&mig->mig_lock);
3000                 return;
3001         }
3002         nfs4_mi_free_globals(mig);
3003 }
3004 
3005 /*
3006  * Add an NFS mount to the per-zone list of NFS mounts.
3007  */
3008 void
3009 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3010 {
3011         struct mi4_globals *mig;
3012 
3013         mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3014         mutex_enter(&mig->mig_lock);
3015         list_insert_head(&mig->mig_list, mi);
3016         /*
3017          * Holds added to eliminate a race with zone shutdown; they are
3018          * released in nfs4_mi_shutdown().
3019          */
3020         MI4_HOLD(mi);
3021         VFS_HOLD(mi->mi_vfsp);
3022         mutex_exit(&mig->mig_lock);
3023 }
3024 
3025 /*
3026  * Remove an NFS mount from the per-zone list of NFS mounts.
3027  */
3028 int
3029 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3030 {
3031         struct mi4_globals *mig;
3032         int ret = 0;
3033 
3034         mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3035         mutex_enter(&mig->mig_lock);
3036         mutex_enter(&mi->mi_lock);
3037         /* if this mi is marked dead, then the zone already released it */
3038         if (!(mi->mi_flags & MI4_DEAD)) {
3039                 list_remove(&mig->mig_list, mi);
3040                 mutex_exit(&mi->mi_lock);
3041 
3042                 /* release the holds put on in zonelist_add(). */
3043                 VFS_RELE(mi->mi_vfsp);
3044                 MI4_RELE(mi);
3045                 ret = 1;
3046         } else {
3047                 mutex_exit(&mi->mi_lock);
3048         }
3049 
3050         /*
3051          * We can be called asynchronously by VFS_FREEVFS() after the zone
3052          * shutdown/destroy callbacks have executed; if so, clean up the zone's
3053          * mi globals.
3054          */
3055         if (list_head(&mig->mig_list) == NULL &&
3056             mig->mig_destructor_called == B_TRUE) {
3057                 nfs4_mi_free_globals(mig);
3058                 return (ret);
3059         }
3060         mutex_exit(&mig->mig_lock);
3061         return (ret);
3062 }
3063 
3064 void
3065 nfs_free_mi4(mntinfo4_t *mi)
3066 {
3067         nfs4_open_owner_t       *foop;
3068         nfs4_oo_hash_bucket_t   *bucketp;
3069         nfs4_debug_msg_t        *msgp;
3070         int i;
3071         servinfo4_t             *svp;
3072 
3073         /*
3074          * Code introduced here should be carefully evaluated to make
3075          * sure none of the freed resources are accessed, either directly
3076          * or indirectly, after they are freed.  For example: calls to
3077          * NFS4_DEBUG that use a mntinfo4_t member after that member has
3078          * been freed, or routines that call back into NFS and touch a
3079          * freed mntinfo4_t member.
3080          */
3081         mutex_enter(&mi->mi_lock);
3082         ASSERT(mi->mi_recovthread == NULL);
3083         ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3084         mutex_exit(&mi->mi_lock);
3085         mutex_enter(&mi->mi_async_lock);
3086         ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3087             mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3088         ASSERT(mi->mi_manager_thread == NULL);
3089         mutex_exit(&mi->mi_async_lock);
3090         if (mi->mi_io_kstats) {
3091                 kstat_delete(mi->mi_io_kstats);
3092                 mi->mi_io_kstats = NULL;
3093         }
3094         if (mi->mi_ro_kstats) {
3095                 kstat_delete(mi->mi_ro_kstats);
3096                 mi->mi_ro_kstats = NULL;
3097         }
3098         if (mi->mi_recov_ksp) {
3099                 kstat_delete(mi->mi_recov_ksp);
3100                 mi->mi_recov_ksp = NULL;
3101         }
3102         mutex_enter(&mi->mi_msg_list_lock);
3103         while (msgp = list_head(&mi->mi_msg_list)) {
3104                 list_remove(&mi->mi_msg_list, msgp);
3105                 nfs4_free_msg(msgp);
3106         }
3107         mutex_exit(&mi->mi_msg_list_lock);
3108         list_destroy(&mi->mi_msg_list);
3109         if (mi->mi_fname != NULL)
3110                 fn_rele(&mi->mi_fname);
3111         if (mi->mi_rootfh != NULL)
3112                 sfh4_rele(&mi->mi_rootfh);
3113         if (mi->mi_srvparentfh != NULL)
3114                 sfh4_rele(&mi->mi_srvparentfh);
3115         svp = mi->mi_servers;
3116         sv4_free(svp);
3117         mutex_destroy(&mi->mi_lock);
3118         mutex_destroy(&mi->mi_async_lock);
3119         mutex_destroy(&mi->mi_msg_list_lock);
3120         nfs_rw_destroy(&mi->mi_recovlock);
3121         nfs_rw_destroy(&mi->mi_rename_lock);
3122         nfs_rw_destroy(&mi->mi_fh_lock);
3123         cv_destroy(&mi->mi_failover_cv);
3124         cv_destroy(&mi->mi_async_reqs_cv);
3125         cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3126         cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3127         cv_destroy(&mi->mi_async_cv);
3128         cv_destroy(&mi->mi_inact_req_cv);
3129         /*
3130          * Destroy the oo hash lists and mutexes for the cred hash table.
3131          */
3132         for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3133                 bucketp = &(mi->mi_oo_list[i]);
3134                 /* Destroy any remaining open owners on the list */
3135                 foop = list_head(&bucketp->b_oo_hash_list);
3136                 while (foop != NULL) {
3137                         list_remove(&bucketp->b_oo_hash_list, foop);
3138                         nfs4_destroy_open_owner(foop);
3139                         foop = list_head(&bucketp->b_oo_hash_list);
3140                 }
3141                 list_destroy(&bucketp->b_oo_hash_list);
3142                 mutex_destroy(&bucketp->b_lock);
3143         }
3144         /*
3145          * Empty and destroy the freed open owner list.
3146          */
3147         foop = list_head(&mi->mi_foo_list);
3148         while (foop != NULL) {
3149                 list_remove(&mi->mi_foo_list, foop);
3150                 nfs4_destroy_open_owner(foop);
3151                 foop = list_head(&mi->mi_foo_list);
3152         }
3153         list_destroy(&mi->mi_foo_list);
3154         list_destroy(&mi->mi_bseqid_list);
3155         list_destroy(&mi->mi_lost_state);
3156         avl_destroy(&mi->mi_filehandles);
3157         kmem_free(mi, sizeof (*mi));
3158 }

3159 void
3160 mi_hold(mntinfo4_t *mi)
3161 {
3162         atomic_inc_32(&mi->mi_count);
3163         ASSERT(mi->mi_count != 0);
3164 }
3165 
3166 void
3167 mi_rele(mntinfo4_t *mi)
3168 {
3169         ASSERT(mi->mi_count != 0);
3170         if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3171                 nfs_free_mi4(mi);
3172         }
3173 }
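
     /*
      * Illustrative sketch (not from the original source): mi_hold() and
      * mi_rele() are normally reached through the MI4_HOLD()/MI4_RELE()
      * macros (as in nfs4_mi_zonelist_add() above) and must be paired:
      *
      *         MI4_HOLD(mi);           keeps mi from being freed
      *         ... use mi ...
      *         MI4_RELE(mi);           last release calls nfs_free_mi4()
      */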
3174 
3175 vnode_t    nfs4_xattr_notsupp_vnode;
3176 
3177 void
3178 nfs4_clnt_init(void)
3179 {
3180         nfs4_vnops_init();
3181         (void) nfs4_rnode_init();
3182         (void) nfs4_shadow_init();
3183         (void) nfs4_acache_init();
3184         (void) nfs4_subr_init();
3185         nfs4_acl_init();
3186         nfs_idmap_init();
3187         nfs4_callback_init();
3188         nfs4_secinfo_init();
3189 #ifdef  DEBUG
3190         tsd_create(&nfs4_tsd_key, NULL);
3191 #endif
3192 
3193         /*
3194          * Add a CPR callback so that we can update client
3195          * lease after a suspend and resume.
3196          */
3197         cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3198 
3199         zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3200             nfs4_mi_destroy);
3201 
3202         /*
3203          * Initialize the reference count of the notsupp xattr cache vnode to 1
3204          * so that it never goes away (VOP_INACTIVE isn't called on it).
3205          */
3206         vn_reinit(&nfs4_xattr_notsupp_vnode);
3207 }
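
     /*
      * Illustrative note (not from the original source): the zone framework
      * drives the callbacks registered with zone_key_create() above roughly
      * as follows:
      *
      *         zone creation   -> nfs4_mi_init()      allocates mi4_globals
      *         zone shutdown   -> nfs4_mi_shutdown()  tears down each NFSv4 mount
      *         zone destroy    -> nfs4_mi_destroy()   frees mi4_globals
      */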
3208 
3209 void
3210 nfs4_clnt_fini(void)
3211 {
3212         (void) zone_key_delete(mi4_list_key);
3213         nfs4_vnops_fini();
3214         (void) nfs4_rnode_fini();
3215         (void) nfs4_shadow_fini();
3216         (void) nfs4_acache_fini();
3217         (void) nfs4_subr_fini();
3218         nfs_idmap_fini();
3219         nfs4_callback_fini();
3220         nfs4_secinfo_fini();
3221 #ifdef  DEBUG
3222         tsd_destroy(&nfs4_tsd_key);
3223 #endif
3224         if (cid)
3225                 (void) callb_delete(cid);
3226 }
3227 
3228 /*ARGSUSED*/
3229 static boolean_t
3230 nfs4_client_cpr_callb(void *arg, int code)
3231 {
3232         /*
3233          * We get called for Suspend and Resume events.
3234          * For the suspend case we simply don't care!
3235          */
3236         if (code == CB_CODE_CPR_CHKPT) {
3237                 return (B_TRUE);
3238         }
3239 
3240         /*
3241          * When we get to here we are in the process of
3242          * resuming the system from a previous suspend.
3243          */
3244         nfs4_client_resumed = gethrestime_sec();
3245         return (B_TRUE);
3246 }
3247 
3248 void
3249 nfs4_renew_lease_thread(nfs4_server_t *sp)
3250 {
3251         int     error = 0;
3252         time_t  tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3253         clock_t tick_delay = 0;
3254         clock_t time_left = 0;
3255         callb_cpr_t cpr_info;
3256         kmutex_t cpr_lock;
3257 
3258         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3259             "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3260         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3261         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3262 
3263         mutex_enter(&sp->s_lock);
3264         /* sp->s_lease_time is set via a GETATTR */
3265         sp->last_renewal_time = gethrestime_sec();
3266         sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3267         ASSERT(sp->s_refcnt >= 1);
3268 
3269         for (;;) {
3270                 if (!sp->state_ref_count ||
3271                     sp->lease_valid != NFS4_LEASE_VALID) {
3272 
3273                         kip_secs = MAX((sp->s_lease_time >> 1) -
3274                             (3 * sp->propagation_delay.tv_sec), 1);
3275 
3276                         tick_delay = SEC_TO_TICK(kip_secs);
3277 
3278                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3279                             "nfs4_renew_lease_thread: no renew : thread "
3280                             "wait %ld secs", kip_secs));
3281 
3282                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3283                             "nfs4_renew_lease_thread: no renew : "
3284                             "state_ref_count %d, lease_valid %d",
3285                             sp->state_ref_count, sp->lease_valid));
3286 
3287                         mutex_enter(&cpr_lock);
3288                         CALLB_CPR_SAFE_BEGIN(&cpr_info);
3289                         mutex_exit(&cpr_lock);
3290                         time_left = cv_reltimedwait(&sp->cv_thread_exit,
3291                             &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3292                         mutex_enter(&cpr_lock);
3293                         CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3294                         mutex_exit(&cpr_lock);
3295 
3296                         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3297                             "nfs4_renew_lease_thread: no renew: "
3298                             "time left %ld", time_left));
3299 
3300                         if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3301                                 goto die;
3302                         continue;
3303                 }
3304 
3305                 tmp_last_renewal_time = sp->last_renewal_time;
3306 
3307                 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3308                     (3 * sp->propagation_delay.tv_sec);
3309 
3310                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3311                     "nfs4_renew_lease_thread: tmp_time %ld, "
3312                     "sp->last_renewal_time %ld", tmp_time,
3313                     sp->last_renewal_time));
3314 
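                     /*
                      * Worked example (illustrative numbers only): with a 90
                      * second lease, a 1 second propagation delay and a last
                      * renewal 5 seconds ago, tmp_time above is 5 + 3*1 = 8,
                      * so kip_secs below is MAX(45 - 8, 1) = 37 and the
                      * thread sleeps roughly 37 seconds before rechecking.
                      */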
3315                 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3316 
3317                 tick_delay = SEC_TO_TICK(kip_secs);
3318 
3319                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3320                     "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3321                     "secs", kip_secs));
3322 
3323                 mutex_enter(&cpr_lock);
3324                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3325                 mutex_exit(&cpr_lock);
3326                 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3327                     tick_delay, TR_CLOCK_TICK);
3328                 mutex_enter(&cpr_lock);
3329                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3330                 mutex_exit(&cpr_lock);
3331 
3332                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3333                     "nfs4_renew_lease_thread: valid lease: time left %ld :"
3334                     "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3335                     "tmp_last_renewal_time %ld", time_left,
3336                     sp->last_renewal_time, nfs4_client_resumed,
3337                     tmp_last_renewal_time));
3338 
3339                 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3340                         goto die;
3341 
3342                 if (tmp_last_renewal_time == sp->last_renewal_time ||
3343                     (nfs4_client_resumed != 0 &&
3344                     nfs4_client_resumed > sp->last_renewal_time)) {
3345                         /*
3346                          * Issue RENEW op since we haven't renewed the lease
3347                          * since we slept.
3348                          */
3349                         tmp_now_time = gethrestime_sec();
3350                         error = nfs4renew(sp);
3351                         /*
3352                          * Need to re-acquire sp's lock; nfs4renew()
3353                          * relinquishes it.
3354                          */
3355                         mutex_enter(&sp->s_lock);
3356 
3357                         /*
3358                          * See if someone changed s_thread_exit while we gave
3359                          * up s_lock.
3360                          */
3361                         if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3362                                 goto die;
3363 
3364                         if (!error) {
3365                                 /*
3366                                  * Check whether we implicitly renewed while
3367                                  * we waited for a reply to our RENEW call.
3368                                  */
3369                                 if (tmp_last_renewal_time ==
3370                                     sp->last_renewal_time) {
3371                                         /* no implicit renew came */
3372                                         sp->last_renewal_time = tmp_now_time;
3373                                 } else {
3374                                         NFS4_DEBUG(nfs4_client_lease_debug,
3375                                             (CE_NOTE, "renew_thread: did "
3376                                             "implicit renewal before reply "
3377                                             "from server for RENEW"));
3378                                 }
3379                         } else {
3380                                 /* figure out error */
3381                                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3382                                     "renew_thread: nfs4renew returned error"
3383                                     " %d", error));
3384                         }
3385 
3386                 }
3387         }
3388 
3389 die:
3390         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3391             "nfs4_renew_lease_thread: thread exiting"));
3392 
3393         while (sp->s_otw_call_count != 0) {
3394                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3395                     "nfs4_renew_lease_thread: waiting for outstanding "
3396                     "otw calls to finish for sp 0x%p, current "
3397                     "s_otw_call_count %d", (void *)sp,
3398                     sp->s_otw_call_count));
3399                 mutex_enter(&cpr_lock);
3400                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3401                 mutex_exit(&cpr_lock);
3402                 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3403                 mutex_enter(&cpr_lock);
3404                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3405                 mutex_exit(&cpr_lock);
3406         }
3407         mutex_exit(&sp->s_lock);
3408 
3409         nfs4_server_rele(sp);           /* free the thread's reference */
3410         nfs4_server_rele(sp);           /* free the list's reference */
3411         sp = NULL;
3412 
3413 done:
3414         mutex_enter(&cpr_lock);
3415         CALLB_CPR_EXIT(&cpr_info);  /* drops cpr_lock */
3416         mutex_destroy(&cpr_lock);
3417 
3418         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3419             "nfs4_renew_lease_thread: renew thread exit officially"));
3420 
3421         zthread_exit();
3422         /* NOT REACHED */
3423 }
3424 
3425 /*
3426  * Send out a RENEW op to the server.
3427  * Assumes sp is locked down.
3428  */
3429 static int
3430 nfs4renew(nfs4_server_t *sp)
3431 {
3432         COMPOUND4args_clnt args;
3433         COMPOUND4res_clnt res;
3434         nfs_argop4 argop[1];
3435         int doqueue = 1;
3436         int rpc_error;
3437         cred_t *cr;
3438         mntinfo4_t *mi;
3439         timespec_t prop_time, after_time;
3440         int needrecov = FALSE;
3441         nfs4_recov_state_t recov_state;
3442         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3443 
3444         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3445 
3446         recov_state.rs_flags = 0;
3447         recov_state.rs_num_retry_despite_err = 0;
3448 
3449 recov_retry:
3450         mi = sp->mntinfo4_list;
3451         VFS_HOLD(mi->mi_vfsp);
3452         mutex_exit(&sp->s_lock);
3453         ASSERT(mi != NULL);
3454 
3455         e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3456         if (e.error) {
3457                 VFS_RELE(mi->mi_vfsp);
3458                 return (e.error);
3459         }
3460 
3461         /* Check to see if we're dealing with a marked-dead sp */
3462         mutex_enter(&sp->s_lock);
3463         if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3464                 mutex_exit(&sp->s_lock);
3465                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3466                 VFS_RELE(mi->mi_vfsp);
3467                 return (0);
3468         }
3469 
3470         /* Make sure mi hasn't changed on us */
3471         if (mi != sp->mntinfo4_list) {
3472                 /* Must drop sp's lock to avoid a recursive mutex enter */
3473                 mutex_exit(&sp->s_lock);
3474                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3475                 VFS_RELE(mi->mi_vfsp);
3476                 mutex_enter(&sp->s_lock);
3477                 goto recov_retry;
3478         }
3479         mutex_exit(&sp->s_lock);
3480 
3481         args.ctag = TAG_RENEW;
3482 
3483         args.array_len = 1;
3484         args.array = argop;
3485 
3486         argop[0].argop = OP_RENEW;
3487 
3488         mutex_enter(&sp->s_lock);
3489         argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3490         cr = sp->s_cred;
3491         crhold(cr);
3492         mutex_exit(&sp->s_lock);
3493 
3494         ASSERT(cr != NULL);
3495 
3496         /* used to figure out RTT for sp */
3497         gethrestime(&prop_time);
3498 
3499         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3500             "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3501             (void*)sp));
3502         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3503             prop_time.tv_sec, prop_time.tv_nsec));
3504 
3505         DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3506             mntinfo4_t *, mi);
3507 
3508         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3509         crfree(cr);
3510 
3511         DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3512             mntinfo4_t *, mi);
3513 
3514         gethrestime(&after_time);
3515 
3516         mutex_enter(&sp->s_lock);
3517         sp->propagation_delay.tv_sec =
3518             MAX(1, after_time.tv_sec - prop_time.tv_sec);
3519         mutex_exit(&sp->s_lock);
3520 
3521         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3522             after_time.tv_sec, after_time.tv_nsec));
3523 
3524         if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3525                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3526                 nfs4_delegreturn_all(sp);
3527                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3528                 VFS_RELE(mi->mi_vfsp);
3529                 /*
3530                  * If the server returns CB_PATH_DOWN, it has renewed
3531                  * the lease and informed us that the callback path is
3532                  * down.  Since the lease is renewed, just return 0 and
3533                  * let the renew thread proceed as normal.
3534                  */
3535                 return (0);
3536         }
3537 
3538         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3539         if (!needrecov && e.error) {
3540                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3541                 VFS_RELE(mi->mi_vfsp);
3542                 return (e.error);
3543         }
3544 
3545         rpc_error = e.error;
3546 
3547         if (needrecov) {
3548                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3549                     "nfs4renew: initiating recovery\n"));
3550 
3551                 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3552                     OP_RENEW, NULL, NULL, NULL) == FALSE) {
3553                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3554                         VFS_RELE(mi->mi_vfsp);
3555                         if (!e.error)
3556                                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3557                         mutex_enter(&sp->s_lock);
3558                         goto recov_retry;
3559                 }
3560                 /* fall through for res.status case */
3561         }
3562 
3563         if (res.status) {
3564                 if (res.status == NFS4ERR_LEASE_MOVED) {
3565                         /*EMPTY*/
3566                         /*
3567                          * XXX need to try every mntinfo4 in sp->mntinfo4_list
3568                          * to renew the lease on that server
3569                          */
3570                 }
3571                 e.error = geterrno4(res.status);
3572         }
3573 
3574         if (!rpc_error)
3575                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3576 
3577         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3578 
3579         VFS_RELE(mi->mi_vfsp);
3580 
3581         return (e.error);
3582 }
3583 
3584 void
3585 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3586 {
3587         nfs4_server_t   *sp;
3588 
3589         /* this locks down sp if it is found */
3590         sp = find_nfs4_server(mi);
3591 
3592         if (sp != NULL) {
3593                 nfs4_inc_state_ref_count_nolock(sp, mi);
3594                 mutex_exit(&sp->s_lock);
3595                 nfs4_server_rele(sp);
3596         }
3597 }
3598 
3599 /*
3600  * Bump the number of OPEN files (i.e., those with state) so we know if this
3601  * nfs4_server has any state to maintain a lease for or not.
3602  *
3603  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3604  */
3605 void
3606 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3607 {
3608         ASSERT(mutex_owned(&sp->s_lock));
3609 
3610         sp->state_ref_count++;
3611         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3612             "nfs4_inc_state_ref_count: state_ref_count now %d",
3613             sp->state_ref_count));
3614 
3615         if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3616                 sp->lease_valid = NFS4_LEASE_VALID;
3617 
3618         /*
3619          * If this call caused the lease to be marked valid and/or
3620          * took the state_ref_count from 0 to 1, then start the clock
3621          * for lease renewal.
3622          */
3623         if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3624                 sp->last_renewal_time = gethrestime_sec();
3625 
3626         /* update the number of open files for mi */
3627         mi->mi_open_files++;
3628 }
3629 
3630 void
3631 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3632 {
3633         nfs4_server_t   *sp;
3634 
3635         /* this locks down sp if it is found */
3636         sp = find_nfs4_server_all(mi, 1);
3637 
3638         if (sp != NULL) {
3639                 nfs4_dec_state_ref_count_nolock(sp, mi);
3640                 mutex_exit(&sp->s_lock);
3641                 nfs4_server_rele(sp);
3642         }
3643 }
3644 
3645 /*
3646  * Decrement the number of OPEN files (i.e., those with state) so we know if
3647  * this nfs4_server has any state to maintain a lease for or not.
3648  */
3649 void
3650 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3651 {
3652         ASSERT(mutex_owned(&sp->s_lock));
3653         ASSERT(sp->state_ref_count != 0);
3654         sp->state_ref_count--;
3655 
3656         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3657             "nfs4_dec_state_ref_count: state ref count now %d",
3658             sp->state_ref_count));
3659 
3660         mi->mi_open_files--;
3661         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3662             "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3663             mi->mi_open_files, mi->mi_flags));
3664 
3665         /* We don't have to hold the mi_lock to test mi_flags */
3666         if (mi->mi_open_files == 0 &&
3667             (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3668                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3669                     "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3670                     "we have closed the last open file", (void*)mi));
3671                 nfs4_remove_mi_from_server(mi, sp);
3672         }
3673 }
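
     /*
      * Illustrative sketch (assumed usage, not verbatim from the OPEN/CLOSE
      * code): the state reference count is expected to stay balanced, e.g.:
      *
      *         nfs4_inc_state_ref_count(mi);   when OPEN state is established
      *         ...
      *         nfs4_dec_state_ref_count(mi);   when that state is torn down
      */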
3674 
3675 bool_t
3676 inlease(nfs4_server_t *sp)
3677 {
3678         bool_t result;
3679 
3680         ASSERT(mutex_owned(&sp->s_lock));
3681 
3682         if (sp->lease_valid == NFS4_LEASE_VALID &&
3683             gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3684                 result = TRUE;
3685         else
3686                 result = FALSE;
3687 
3688         return (result);
3689 }
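
     /*
      * Illustrative sketch (not from the original source): inlease() must be
      * called with sp->s_lock held, e.g.:
      *
      *         mutex_enter(&sp->s_lock);
      *         if (inlease(sp)) {
      *                 ... lease is valid and has not yet expired ...
      *         }
      *         mutex_exit(&sp->s_lock);
      */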
3690 
3691 
3692 /*
3693  * Return non-zero if the given nfs4_server_t is going through recovery.
3694  */
3695 
3696 int
3697 nfs4_server_in_recovery(nfs4_server_t *sp)
3698 {
3699         return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3700 }
3701 
3702 /*
3703  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3704  * first is less than, equal to, or greater than the second.
3705  */
3706 
3707 int
3708 sfh4cmp(const void *p1, const void *p2)
3709 {
3710         const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3711         const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3712 
3713         return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3714 }
3715 
3716 /*
3717  * Create a table for shared filehandle objects.
3718  */
3719 
3720 void
3721 sfh4_createtab(avl_tree_t *tab)
3722 {
3723         avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3724             offsetof(nfs4_sharedfh_t, sfh_tree));
3725 }
3726 
3727 /*
3728  * Return a shared filehandle object for the given filehandle.  The caller
3729  * is responsible for eventually calling sfh4_rele().
3730  */
3731 
3732 nfs4_sharedfh_t *
3733 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3734 {
3735         nfs4_sharedfh_t *sfh, *nsfh;
3736         avl_index_t where;
3737         nfs4_sharedfh_t skey;
3738 
3739         if (!key) {
3740                 skey.sfh_fh = *fh;
3741                 key = &skey;
3742         }
3743 
3744         nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3745         nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3746         /*
3747          * We allocate the largest possible filehandle size because it's
3748          * not that big, and it saves us from possibly having to resize the
3749          * buffer later.
3750          */
3751         nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3752         bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3753         mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3754         nsfh->sfh_refcnt = 1;
3755         nsfh->sfh_flags = SFH4_IN_TREE;
3756         nsfh->sfh_mi = mi;
3757         NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3758             (void *)nsfh));
3759 
3760         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3761         sfh = avl_find(&mi->mi_filehandles, key, &where);
3762         if (sfh != NULL) {
3763                 mutex_enter(&sfh->sfh_lock);
3764                 sfh->sfh_refcnt++;
3765                 mutex_exit(&sfh->sfh_lock);
3766                 nfs_rw_exit(&mi->mi_fh_lock);
3767                 /* free our speculative allocs */
3768                 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3769                 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3770                 return (sfh);
3771         }
3772 
3773         avl_insert(&mi->mi_filehandles, nsfh, where);
3774         nfs_rw_exit(&mi->mi_fh_lock);
3775 
3776         return (nsfh);
3777 }
3778 
3779 /*
3780  * Return a shared filehandle object for the given filehandle.  The caller
3781  * is responsible for eventually calling sfh4_rele().
3782  */
3783 
3784 nfs4_sharedfh_t *
3785 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3786 {
3787         nfs4_sharedfh_t *sfh;
3788         nfs4_sharedfh_t key;
3789 
3790         ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3791 
3792 #ifdef DEBUG
3793         if (nfs4_sharedfh_debug) {
3794                 nfs4_fhandle_t fhandle;
3795 
3796                 fhandle.fh_len = fh->nfs_fh4_len;
3797                 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3798                 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3799                 nfs4_printfhandle(&fhandle);
3800         }
3801 #endif
3802 
3803         /*
3804          * If there's already an object for the given filehandle, bump the
3805          * reference count and return it.  Otherwise, create a new object
3806          * and add it to the AVL tree.
3807          */
3808 
3809         key.sfh_fh = *fh;
3810 
3811         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3812         sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3813         if (sfh != NULL) {
3814                 mutex_enter(&sfh->sfh_lock);
3815                 sfh->sfh_refcnt++;
3816                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3817                     "sfh4_get: found existing %p, new refcnt=%d",
3818                     (void *)sfh, sfh->sfh_refcnt));
3819                 mutex_exit(&sfh->sfh_lock);
3820                 nfs_rw_exit(&mi->mi_fh_lock);
3821                 return (sfh);
3822         }
3823         nfs_rw_exit(&mi->mi_fh_lock);
3824 
3825         return (sfh4_put(fh, mi, &key));
3826 }
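
     /*
      * Illustrative sketch (not from the original source): a typical consumer
      * pairs sfh4_get() with sfh4_rele(), which also NULLs out the pointer:
      *
      *         nfs4_sharedfh_t *sfh;
      *
      *         sfh = sfh4_get(&fh, mi);        reference taken for the caller
      *         ... use sfh->sfh_fh ...
      *         sfh4_rele(&sfh);                reference dropped, sfh set to NULL
      *
      * where "fh" is the nfs_fh4 returned by the server and "mi" is the
      * mount's mntinfo4_t.
      */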
3827 
3828 /*
3829  * Get a reference to the given shared filehandle object.
3830  */
3831 
3832 void
3833 sfh4_hold(nfs4_sharedfh_t *sfh)
3834 {
3835         ASSERT(sfh->sfh_refcnt > 0);
3836 
3837         mutex_enter(&sfh->sfh_lock);
3838         sfh->sfh_refcnt++;
3839         NFS4_DEBUG(nfs4_sharedfh_debug,
3840             (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3841             (void *)sfh, sfh->sfh_refcnt));
3842         mutex_exit(&sfh->sfh_lock);
3843 }
3844 
3845 /*
3846  * Release a reference to the given shared filehandle object and null out
3847  * the given pointer.
3848  */
3849 
3850 void
3851 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3852 {
3853         mntinfo4_t *mi;
3854         nfs4_sharedfh_t *sfh = *sfhpp;
3855 
3856         ASSERT(sfh->sfh_refcnt > 0);
3857 
3858         mutex_enter(&sfh->sfh_lock);
3859         if (sfh->sfh_refcnt > 1) {
3860                 sfh->sfh_refcnt--;
3861                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3862                     "sfh4_rele %p, new refcnt=%d",
3863                     (void *)sfh, sfh->sfh_refcnt));
3864                 mutex_exit(&sfh->sfh_lock);
3865                 goto finish;
3866         }
3867         mutex_exit(&sfh->sfh_lock);
3868 
3869         /*
3870          * Possibly the last reference, so get the lock for the table in
3871          * case it's time to remove the object from the table.
3872          */
3873         mi = sfh->sfh_mi;
3874         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3875         mutex_enter(&sfh->sfh_lock);
3876         sfh->sfh_refcnt--;
3877         if (sfh->sfh_refcnt > 0) {
3878                 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3879                     "sfh4_rele %p, new refcnt=%d",
3880                     (void *)sfh, sfh->sfh_refcnt));
3881                 mutex_exit(&sfh->sfh_lock);
3882                 nfs_rw_exit(&mi->mi_fh_lock);
3883                 goto finish;
3884         }
3885 
3886         NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3887             "sfh4_rele %p, last ref", (void *)sfh));
3888         if (sfh->sfh_flags & SFH4_IN_TREE) {
3889                 avl_remove(&mi->mi_filehandles, sfh);
3890                 sfh->sfh_flags &= ~SFH4_IN_TREE;
3891         }
3892         mutex_exit(&sfh->sfh_lock);
3893         nfs_rw_exit(&mi->mi_fh_lock);
3894         mutex_destroy(&sfh->sfh_lock);
3895         kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3896         kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3897 
3898 finish:
3899         *sfhpp = NULL;
3900 }
3901 
3902 /*
3903  * Update the filehandle for the given shared filehandle object.
3904  */
3905 
3906 int nfs4_warn_dupfh = 0;        /* if set, always warn about dup fhs below */
3907 
3908 void
3909 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3910 {
3911         mntinfo4_t *mi = sfh->sfh_mi;
3912         nfs4_sharedfh_t *dupsfh;
3913         avl_index_t where;
3914         nfs4_sharedfh_t key;
3915 
3916 #ifdef DEBUG
3917         mutex_enter(&sfh->sfh_lock);
3918         ASSERT(sfh->sfh_refcnt > 0);
3919         mutex_exit(&sfh->sfh_lock);
3920 #endif
3921         ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3922 
3923         /*
3924          * The basic plan is to remove the shared filehandle object from
3925          * the table, update it to have the new filehandle, then reinsert
3926          * it.
3927          */
3928 
3929         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3930         mutex_enter(&sfh->sfh_lock);
3931         if (sfh->sfh_flags & SFH4_IN_TREE) {
3932                 avl_remove(&mi->mi_filehandles, sfh);
3933                 sfh->sfh_flags &= ~SFH4_IN_TREE;
3934         }
3935         mutex_exit(&sfh->sfh_lock);
3936         sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3937         bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3938             sfh->sfh_fh.nfs_fh4_len);
3939 
3940         /*
3941          * XXX If there is already a shared filehandle object with the new
3942          * filehandle, we're in trouble, because the rnode code assumes
3943          * that there is only one shared filehandle object for a given
3944          * filehandle.  So issue a warning (for read-write mounts only)
3945          * and don't try to re-insert the given object into the table.
3946          * Hopefully the given object will quickly go away and everyone
3947          * will use the new object.
3948          */
3949         key.sfh_fh = *newfh;
3950         dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3951         if (dupsfh != NULL) {
3952                 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3953                         zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3954                             "duplicate filehandle detected");
3955                         sfh4_printfhandle(dupsfh);
3956                 }
3957         } else {
3958                 avl_insert(&mi->mi_filehandles, sfh, where);
3959                 mutex_enter(&sfh->sfh_lock);
3960                 sfh->sfh_flags |= SFH4_IN_TREE;
3961                 mutex_exit(&sfh->sfh_lock);
3962         }
3963         nfs_rw_exit(&mi->mi_fh_lock);
3964 }
3965 
3966 /*
3967  * Copy out the current filehandle for the given shared filehandle object.
3968  */
3969 
3970 void
3971 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3972 {
3973         mntinfo4_t *mi = sfh->sfh_mi;
3974 
3975         ASSERT(sfh->sfh_refcnt > 0);
3976 
3977         (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3978         fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3979         ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3980         bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3981         nfs_rw_exit(&mi->mi_fh_lock);
3982 }
3983 
3984 /*
3985  * Print out the filehandle for the given shared filehandle object.
3986  */
3987 
3988 void
3989 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3990 {
3991         nfs4_fhandle_t fhandle;
3992 
3993         sfh4_copyval(sfh, &fhandle);
3994         nfs4_printfhandle(&fhandle);
3995 }
3996 
3997 /*
3998  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3999  * if they're the same, +1 if the first is "greater" than the second.  The
4000  * caller (or whoever's calling the AVL package) is responsible for
4001  * handling locking issues.
4002  */
4003 
4004 static int
4005 fncmp(const void *p1, const void *p2)
4006 {
4007         const nfs4_fname_t *f1 = p1;
4008         const nfs4_fname_t *f2 = p2;
4009         int res;
4010 
4011         res = strcmp(f1->fn_name, f2->fn_name);
4012         /*
4013          * The AVL package wants +/-1, not arbitrary positive or negative
4014          * integers.
4015          */
4016         if (res > 0)
4017                 res = 1;
4018         else if (res < 0)
4019                 res = -1;
4020         return (res);
4021 }
4022 
4023 /*
4024  * Get or create an fname with the given name, as a child of the given
4025  * fname.  The caller is responsible for eventually releasing the reference
4026  * (fn_rele()).  parent may be NULL.
4027  */
4028 
4029 nfs4_fname_t *
4030 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4031 {
4032         nfs4_fname_t key;
4033         nfs4_fname_t *fnp;
4034         avl_index_t where;
4035 
4036         key.fn_name = name;
4037 
4038         /*
4039          * If there's already an fname registered with the given name, bump
4040          * its reference count and return it.  Otherwise, create a new one
4041          * and add it to the parent's AVL tree.
4042          *
4043          * The fname entry we are looking for must match both the name
4044          * and the sfh stored in the fname.
4045          */
4046 again:
4047         if (parent != NULL) {
4048                 mutex_enter(&parent->fn_lock);
4049                 fnp = avl_find(&parent->fn_children, &key, &where);
4050                 if (fnp != NULL) {
4051                         /*
4052                          * Take a hold on fnp now; it is released below
4053                          * if this turns out not to be the fnp we want.
4054                          */
4055                         fn_hold(fnp);
4056 
4057                         if (fnp->fn_sfh == sfh) {
4058                                 /*
4059                                  * We have found our entry; return it
4060                                  * with the hold taken above.
4061                                  */
4062                                 mutex_exit(&parent->fn_lock);
4063                                 return (fnp);
4064                         }
4065 
4066                         /*
4067                          * We have found an entry with a mismatching
4068                          * fn_sfh.  This could be a stale entry left over
4069                          * from a server-side rename.  Remove it and retry
4070                          * so that no such entries remain.
4071                          */
4072                         mutex_exit(&parent->fn_lock);
4073                         mutex_enter(&fnp->fn_lock);
4074                         if (fnp->fn_parent == parent) {
4075                                 /*
4076                                  * Remove ourselves from parent's
4077                                  * fn_children tree.
4078                                  */
4079                                 mutex_enter(&parent->fn_lock);
4080                                 avl_remove(&parent->fn_children, fnp);
4081                                 mutex_exit(&parent->fn_lock);
4082                                 fn_rele(&fnp->fn_parent);
4083                         }
4084                         mutex_exit(&fnp->fn_lock);
4085                         fn_rele(&fnp);
4086                         goto again;
4087                 }
4088         }
4089 
4090         fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4091         mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4092         fnp->fn_parent = parent;
4093         if (parent != NULL)
4094                 fn_hold(parent);
4095         fnp->fn_len = strlen(name);
4096         ASSERT(fnp->fn_len < MAXNAMELEN);
4097         fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4098         (void) strcpy(fnp->fn_name, name);
4099         fnp->fn_refcnt = 1;
4100 
4101         /*
4102          * This hold on sfh is later released
4103          * when we do the final fn_rele() on this fname.
4104          */
4105         sfh4_hold(sfh);
4106         fnp->fn_sfh = sfh;
4107 
4108         avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4109             offsetof(nfs4_fname_t, fn_tree));
4110         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4111             "fn_get %p:%s, a new nfs4_fname_t!",
4112             (void *)fnp, fnp->fn_name));
4113         if (parent != NULL) {
4114                 avl_insert(&parent->fn_children, fnp, where);
4115                 mutex_exit(&parent->fn_lock);
4116         }
4117 
4118         return (fnp);
4119 }
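
     /*
      * Illustrative sketch (not from the original source): a lookup-style
      * caller holding a parent fname and the child's shared filehandle
      * would use fn_get()/fn_rele() as a pair:
      *
      *         nfs4_fname_t *fnp;
      *
      *         fnp = fn_get(parentfn, name, sfh);      reference taken
      *         ... use fnp ...
      *         fn_rele(&fnp);                          reference dropped, fnp NULLed
      *
      * where "parentfn", "name" and "sfh" are the caller's directory fname,
      * component name and shared filehandle.
      */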
4120 
4121 void
4122 fn_hold(nfs4_fname_t *fnp)
4123 {
4124         atomic_inc_32(&fnp->fn_refcnt);
4125         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4126             "fn_hold %p:%s, new refcnt=%d",
4127             (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4128 }
4129 
4130 /*
4131  * Decrement the reference count of the given fname, and destroy it if its
4132  * reference count goes to zero.  Nulls out the given pointer.
4133  */
4134 
4135 void
4136 fn_rele(nfs4_fname_t **fnpp)
4137 {
4138         nfs4_fname_t *parent;
4139         uint32_t newref;
4140         nfs4_fname_t *fnp;
4141 
4142 recur:
4143         fnp = *fnpp;
4144         *fnpp = NULL;
4145 
4146         mutex_enter(&fnp->fn_lock);
4147         parent = fnp->fn_parent;
4148         if (parent != NULL)
4149                 mutex_enter(&parent->fn_lock);   /* prevent new references */
4150         newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4151         if (newref > 0) {
4152                 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4153                     "fn_rele %p:%s, new refcnt=%d",
4154                     (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4155                 if (parent != NULL)
4156                         mutex_exit(&parent->fn_lock);
4157                 mutex_exit(&fnp->fn_lock);
4158                 return;
4159         }
4160 
4161         NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4162             "fn_rele %p:%s, last reference, deleting...",
4163             (void *)fnp, fnp->fn_name));
4164         if (parent != NULL) {
4165                 avl_remove(&parent->fn_children, fnp);
4166                 mutex_exit(&parent->fn_lock);
4167         }
4168         kmem_free(fnp->fn_name, fnp->fn_len + 1);
4169         sfh4_rele(&fnp->fn_sfh);
4170         mutex_destroy(&fnp->fn_lock);
4171         avl_destroy(&fnp->fn_children);
4172         kmem_free(fnp, sizeof (nfs4_fname_t));
4173         /*
4174          * Recursively fn_rele the parent.
4175          * Use goto instead of a recursive call to avoid stack overflow.
4176          */
4177         if (parent != NULL) {
4178                 fnpp = &parent;
4179                 goto recur;
4180         }
4181 }
4182 
4183 /*
4184  * Returns the single component name of the given fname, in a MAXNAMELEN
4185  * string buffer, which the caller is responsible for freeing.  Note that
4186  * the name may become invalid as a result of fn_move().
4187  */
4188 
4189 char *
4190 fn_name(nfs4_fname_t *fnp)
4191 {
4192         char *name;
4193 
4194         ASSERT(fnp->fn_len < MAXNAMELEN);
4195         name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4196         mutex_enter(&fnp->fn_lock);
4197         (void) strcpy(name, fnp->fn_name);
4198         mutex_exit(&fnp->fn_lock);
4199 
4200         return (name);
4201 }
4202 
4203 
4204 /*
4205  * fn_path_realloc
4206  *
4207  * This function, used only by fn_path, constructs
4208  * a new string which looks like "prepend" + "/" + "current"
4209  * by allocating a new string and freeing the old one.
4210  */
4211 static void
4212 fn_path_realloc(char **curses, char *prepend)
4213 {
4214         int len, curlen = 0;
4215         char *news;
4216 
4217         if (*curses == NULL) {
4218                 /*
4219                  * Prime the pump, allocate just the
4220                  * space for prepend and return that.
4221                  */
4222                 len = strlen(prepend) + 1;
4223                 news = kmem_alloc(len, KM_SLEEP);
4224                 (void) strncpy(news, prepend, len);
4225         } else {
4226                 /*
4227                  * Allocate the space for the new string;
4228                  * the +1 +1 is for the "/" and the NUL
4229                  * byte at the end of it all.
4230                  */
4231                 curlen = strlen(*curses);
4232                 len = curlen + strlen(prepend) + 1 + 1;
4233                 news = kmem_alloc(len, KM_SLEEP);
4234                 (void) strncpy(news, prepend, len);
4235                 (void) strcat(news, "/");
4236                 (void) strcat(news, *curses);
4237                 kmem_free(*curses, curlen + 1);
4238         }
4239         *curses = news;
4240 }
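
     /*
      * Illustrative example (not from the original source): fn_path() below
      * walks from the leaf toward the root, so for fnames "export" -> "home"
      * -> "file" the successive calls build the path up as:
      *
      *         fn_path_realloc(&path, "file");         path = "file"
      *         fn_path_realloc(&path, "home");         path = "home/file"
      *         fn_path_realloc(&path, "export");       path = "export/home/file"
      */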
4241 
4242 /*
4243  * Returns the path name (starting from the fs root) for the given fname.
4244  * The caller is responsible for freeing.  Note that the path may be or
4245  * become invalid as a result of fn_move().
4246  */
4247 
4248 char *
4249 fn_path(nfs4_fname_t *fnp)
4250 {
4251         char *path;
4252         nfs4_fname_t *nextfnp;
4253 
4254         if (fnp == NULL)
4255                 return (NULL);
4256 
4257         path = NULL;
4258 
4259         /* walk up the tree constructing the pathname.  */
4260 
4261         fn_hold(fnp);                   /* adjust for later rele */
4262         do {
4263                 mutex_enter(&fnp->fn_lock);
4264                 /*
4265                  * Add fn_name in front of the current path
4266                  */
4267                 fn_path_realloc(&path, fnp->fn_name);
4268                 nextfnp = fnp->fn_parent;
4269                 if (nextfnp != NULL)
4270                         fn_hold(nextfnp);
4271                 mutex_exit(&fnp->fn_lock);
4272                 fn_rele(&fnp);
4273                 fnp = nextfnp;
4274         } while (fnp != NULL);
4275 
4276         return (path);
4277 }
4278 
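/*
 * Illustrative usage sketch (the rnode pointer "rp" is hypothetical): the
 * returned path buffer is sized to the string itself, so it is freed using
 * the string length rather than a fixed size:
 *
 *	char *path;
 *
 *	path = fn_path(rp->r_svnode.sv_name);
 *	if (path != NULL) {
 *		... use path, e.g. for debug output ...
 *		kmem_free(path, strlen(path) + 1);
 *	}
 */
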
4279 /*
4280  * Return a reference to the parent of the given fname, which the caller is
4281  * responsible for eventually releasing.
4282  */
4283 
4284 nfs4_fname_t *
4285 fn_parent(nfs4_fname_t *fnp)
4286 {
4287         nfs4_fname_t *parent;
4288 
4289         mutex_enter(&fnp->fn_lock);
4290         parent = fnp->fn_parent;
4291         if (parent != NULL)
4292                 fn_hold(parent);
4293         mutex_exit(&fnp->fn_lock);
4294 
4295         return (parent);
4296 }
4297 
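/*
 * Illustrative sketch (hypothetical "fnp"): the reference returned by
 * fn_parent() must eventually be balanced with fn_rele():
 *
 *	nfs4_fname_t *pfnp;
 *
 *	pfnp = fn_parent(fnp);
 *	if (pfnp != NULL) {
 *		... use pfnp ...
 *		fn_rele(&pfnp);
 *	}
 */
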
4298 /*
4299  * Update fnp so that its parent is newparent and its name is newname.
4300  */
4301 
4302 void
4303 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4304 {
4305         nfs4_fname_t *parent, *tmpfnp;
4306         ssize_t newlen;
4307         nfs4_fname_t key;
4308         avl_index_t where;
4309 
4310         /*
4311          * This assert exists to catch the client trying to rename
4312          * a dir to be a child of itself.  This happened at a recent
4313          * bakeoff against a third-party (broken) server which allowed
4314          * the rename to succeed.  If it trips, it means that either:
4315          *      a) the code in nfs4rename that detects this case is broken, or
4316          *      b) the server is broken (since it allowed the bogus rename)
4317          *
4318          * For non-DEBUG kernels, prepare for a recursive mutex_enter
4319          * panic below from:  mutex_enter(&newparent->fn_lock);
4320          */
4321         ASSERT(fnp != newparent);
4322 
4323         /*
4324          * Remove fnp from its current parent, change its name, then add it
4325          * to newparent. It might happen that fnp was replaced by another
4326          * nfs4_fname_t with the same fn_name in parent->fn_children.
4327          * In such case, fnp->fn_parent is NULL and we skip the removal
4328          * of fnp from its current parent.
4329          */
4330         mutex_enter(&fnp->fn_lock);
4331         parent = fnp->fn_parent;
4332         if (parent != NULL) {
4333                 mutex_enter(&parent->fn_lock);
4334                 avl_remove(&parent->fn_children, fnp);
4335                 mutex_exit(&parent->fn_lock);
4336                 fn_rele(&fnp->fn_parent);
4337         }
4338 
4339         newlen = strlen(newname);
4340         if (newlen != fnp->fn_len) {
4341                 ASSERT(newlen < MAXNAMELEN);
4342                 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4343                 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4344                 fnp->fn_len = newlen;
4345         }
4346         (void) strcpy(fnp->fn_name, newname);
4347 
4348 again:
4349         mutex_enter(&newparent->fn_lock);
4350         key.fn_name = fnp->fn_name;
4351         tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4352         if (tmpfnp != NULL) {
4353                 /*
4354                  * This could be due to a file that was unlinked while
4355                  * open, or perhaps the rnode is in the free list.  Remove
4356                  * it from newparent and let it go away on its own.  The
4357                  * contorted code is to deal with lock order issues and
4358                  * race conditions.
4359                  */
4360                 fn_hold(tmpfnp);
4361                 mutex_exit(&newparent->fn_lock);
4362                 mutex_enter(&tmpfnp->fn_lock);
4363                 if (tmpfnp->fn_parent == newparent) {
4364                         mutex_enter(&newparent->fn_lock);
4365                         avl_remove(&newparent->fn_children, tmpfnp);
4366                         mutex_exit(&newparent->fn_lock);
4367                         fn_rele(&tmpfnp->fn_parent);
4368                 }
4369                 mutex_exit(&tmpfnp->fn_lock);
4370                 fn_rele(&tmpfnp);
4371                 goto again;
4372         }
4373         fnp->fn_parent = newparent;
4374         fn_hold(newparent);
4375         avl_insert(&newparent->fn_children, fnp, where);
4376         mutex_exit(&newparent->fn_lock);
4377         mutex_exit(&fnp->fn_lock);
4378 }
4379 
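/*
 * Illustrative caller sketch (variable names are hypothetical): after a
 * successful over-the-wire rename, the client re-parents the cached fname
 * so that later fn_path()/fn_name() calls reflect the new location:
 *
 *	fn_move(VTOSV(renamed_vp)->sv_name, VTOSV(new_dvp)->sv_name, new_nm);
 */
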
4380 #ifdef DEBUG
4381 /*
4382  * Return non-zero if the type information makes sense for the given vnode.
4383  * Otherwise panic.
4384  */
4385 int
4386 nfs4_consistent_type(vnode_t *vp)
4387 {
4388         rnode4_t *rp = VTOR4(vp);
4389 
4390         if (nfs4_vtype_debug && vp->v_type != VNON &&
4391             rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4392                 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4393                     "rnode attr type=%d", (void *)vp, vp->v_type,
4394                     rp->r_attr.va_type);
4395         }
4396 
4397         return (1);
4398 }
4399 #endif /* DEBUG */
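
/*
 * Illustrative usage note: since nfs4_consistent_type() exists only in DEBUG
 * kernels and always returns 1, callers are expected to wrap it in ASSERT(),
 * for example:
 *
 *	ASSERT(nfs4_consistent_type(vp));
 *
 * so both the check and the call compile away in non-DEBUG builds.
 */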