1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  *
  24  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  25  *      All rights reserved.
  26  */
  27 
  28 #include <sys/param.h>
  29 #include <sys/types.h>
  30 #include <sys/systm.h>
  31 #include <sys/thread.h>
  32 #include <sys/t_lock.h>
  33 #include <sys/time.h>
  34 #include <sys/vnode.h>
  35 #include <sys/vfs.h>
  36 #include <sys/errno.h>
  37 #include <sys/buf.h>
  38 #include <sys/stat.h>
  39 #include <sys/cred.h>
  40 #include <sys/kmem.h>
  41 #include <sys/debug.h>
  42 #include <sys/dnlc.h>
  43 #include <sys/vmsystm.h>
  44 #include <sys/flock.h>
  45 #include <sys/share.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/tiuser.h>
  48 #include <sys/sysmacros.h>
  49 #include <sys/callb.h>
  50 #include <sys/acl.h>
  51 #include <sys/kstat.h>
  52 #include <sys/signal.h>
  53 #include <sys/list.h>
  54 #include <sys/zone.h>
  55 
  56 #include <rpc/types.h>
  57 #include <rpc/xdr.h>
  58 #include <rpc/auth.h>
  59 #include <rpc/clnt.h>
  60 
  61 #include <nfs/nfs.h>
  62 #include <nfs/nfs_clnt.h>
  63 
  64 #include <nfs/rnode.h>
  65 #include <nfs/nfs_acl.h>
  66 #include <nfs/lm.h>
  67 
  68 #include <vm/hat.h>
  69 #include <vm/as.h>
  70 #include <vm/page.h>
  71 #include <vm/pvn.h>
  72 #include <vm/seg.h>
  73 #include <vm/seg_map.h>
  74 #include <vm/seg_vn.h>
  75 
  76 static void     nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
  77                         cred_t *);
  78 static int      nfs_getattr_cache(vnode_t *, struct vattr *);
  79 static int      nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
  80 
  81 struct mi_globals {
  82         kmutex_t        mig_lock;  /* lock protecting mig_list */
  83         list_t          mig_list;  /* list of NFS v2 or v3 mounts in zone */
  84         boolean_t       mig_destructor_called;
  85 };
  86 
  87 static zone_key_t mi_list_key;
  88 
  89 /* Debugging flag for PC file shares. */
  90 extern int      share_debug;
  91 
  92 /*
  93  * Attributes caching:
  94  *
  95  * Attributes are cached in the rnode in struct vattr form.
  96  * There is an expiry time associated with the cached attributes
  97  * (r_attrtime) which tells whether the attributes are still valid.
  98  * When new attributes are cached, the timeout is based on how long
  99  * ago the vnode was last observed to change. This allows the attributes for
 100  * files that have changed recently to be timed out sooner than for files
 101  * that have not changed for a long time. There are minimum and maximum
 102  * timeout values that can be set per mount point.
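      * The minimum and maximum are controlled by the acregmin/acregmax
      * (regular files) and acdirmin/acdirmax (directories) mount options.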
 103  */
 104 
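     /*
      * Wait for any attribute cache purge being carried out by another
      * thread (indicated by r_serial) to complete.  Returns 0 once no
      * purge is in progress, or EINTR if the wait was interrupted by a
      * signal.
      */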
 105 int
 106 nfs_waitfor_purge_complete(vnode_t *vp)
 107 {
 108         rnode_t *rp;
 109         k_sigset_t smask;
 110 
 111         rp = VTOR(vp);
 112         if (rp->r_serial != NULL && rp->r_serial != curthread) {
 113                 mutex_enter(&rp->r_statelock);
 114                 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
 115                 while (rp->r_serial != NULL) {
 116                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 117                                 sigunintr(&smask);
 118                                 mutex_exit(&rp->r_statelock);
 119                                 return (EINTR);
 120                         }
 121                 }
 122                 sigunintr(&smask);
 123                 mutex_exit(&rp->r_statelock);
 124         }
 125         return (0);
 126 }
 127 
 128 /*
 129  * Validate caches by checking cached attributes. If the cached
 130  * attributes have timed out, then get new attributes from the server.
 131  * As a side effect, this will do cache invalidation if the attributes
 132  * have changed.
 133  *
 134  * If the attributes have not timed out and if there is a cache
 135  * invalidation being done by some other thread, then wait until that
 136  * thread has completed the cache invalidation.
 137  */
 138 int
 139 nfs_validate_caches(vnode_t *vp, cred_t *cr)
 140 {
 141         int error;
 142         struct vattr va;
 143 
 144         if (ATTRCACHE_VALID(vp)) {
 145                 error = nfs_waitfor_purge_complete(vp);
 146                 if (error)
 147                         return (error);
 148                 return (0);
 149         }
 150 
 151         va.va_mask = AT_ALL;
 152         return (nfs_getattr_otw(vp, &va, cr));
 153 }
 154 
 155 /*
 156  * Validate caches by checking cached attributes. If the cached
 157  * attributes have timed out, then get new attributes from the server.
 158  * As a side effect, this will do cache invalidation if the attributes
 159  * have changed.
 160  *
 161  * If the attributes have not timed out and if there is a cache
 162  * invalidation being done by some other thread, then wait until that
 163  * thread has completed the cache invalidation.
 164  */
 165 int
 166 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
 167 {
 168         int error;
 169         struct vattr va;
 170 
 171         if (ATTRCACHE_VALID(vp)) {
 172                 error = nfs_waitfor_purge_complete(vp);
 173                 if (error)
 174                         return (error);
 175                 return (0);
 176         }
 177 
 178         va.va_mask = AT_ALL;
 179         return (nfs3_getattr_otw(vp, &va, cr));
 180 }
 181 
 182 /*
 183  * Purge all of the various NFS `data' caches.
 184  */
 185 void
 186 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
 187 {
 188         rnode_t *rp;
 189         char *contents;
 190         int size;
 191         int error;
 192 
 193         /*
 194          * Purge the DNLC for any entries which refer to this file.
 195          * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
 196          */
 197         rp = VTOR(vp);
 198         mutex_enter(&rp->r_statelock);
 199         if (vp->v_count > 1 &&
 200             (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
 201             !(rp->r_flags & RINDNLCPURGE)) {
 202                 /*
 203                  * Set the RINDNLCPURGE flag to prevent recursive entry
 204                  * into dnlc_purge_vp()
 205                  */
 206                 if (vp->v_type == VDIR)
 207                         rp->r_flags |= RINDNLCPURGE;
 208                 mutex_exit(&rp->r_statelock);
 209                 dnlc_purge_vp(vp);
 210                 mutex_enter(&rp->r_statelock);
 211                 if (rp->r_flags & RINDNLCPURGE)
 212                         rp->r_flags &= ~RINDNLCPURGE;
 213         }
 214 
 215         /*
 216          * Clear any readdir state bits and purge the readlink response cache.
 217          */
 218         contents = rp->r_symlink.contents;
 219         size = rp->r_symlink.size;
 220         rp->r_symlink.contents = NULL;
 221         mutex_exit(&rp->r_statelock);
 222 
 223         if (contents != NULL) {
 224 
 225                 kmem_free((void *)contents, size);
 226         }
 227 
 228         /*
 229          * Flush the page cache.
 230          */
 231         if (vn_has_cached_data(vp)) {
 232                 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
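                     /*
                      * Remember ENOSPC and EDQUOT errors in r_error so that
                      * the failure can be reported to the application later.
                      */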
 233                 if (error && (error == ENOSPC || error == EDQUOT)) {
 234                         mutex_enter(&rp->r_statelock);
 235                         if (!rp->r_error)
 236                                 rp->r_error = error;
 237                         mutex_exit(&rp->r_statelock);
 238                 }
 239         }
 240 
 241         /*
 242          * Flush the readdir response cache.
 243          */
 244         if (HAVE_RDDIR_CACHE(rp))
 245                 nfs_purge_rddir_cache(vp);
 246 }
 247 
 248 /*
 249  * Purge the readdir cache of all entries
 250  */
 251 void
 252 nfs_purge_rddir_cache(vnode_t *vp)
 253 {
 254         rnode_t *rp;
 255         rddir_cache *rdc;
 256         rddir_cache *nrdc;
 257 
 258         rp = VTOR(vp);
 259 top:
 260         mutex_enter(&rp->r_statelock);
 261         rp->r_direof = NULL;
 262         rp->r_flags &= ~RLOOKUP;
 263         rp->r_flags |= RREADDIRPLUS;
 264         rdc = avl_first(&rp->r_dir);
 265         while (rdc != NULL) {
 266                 nrdc = AVL_NEXT(&rp->r_dir, rdc);
 267                 avl_remove(&rp->r_dir, rdc);
 268                 rddir_cache_rele(rdc);
 269                 rdc = nrdc;
 270         }
 271         mutex_exit(&rp->r_statelock);
 272 }
 273 
 274 /*
 275  * Do a cache check based on the post-operation attributes.
 276  * Then make them the new cached attributes.  If no attributes
 277  * were returned, then mark the attributes as timed out.
 278  */
 279 void
 280 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
 281 {
 282         vattr_t attr;
 283 
 284         if (!poap->attributes) {
 285                 PURGE_ATTRCACHE(vp);
 286                 return;
 287         }
 288         (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
 289 }
 290 
 291 /*
 292  * Same as above, but using a vattr
 293  */
 294 void
 295 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
 296     cred_t *cr)
 297 {
 298         if (!poap->attributes) {
 299                 PURGE_ATTRCACHE(vp);
 300                 return;
 301         }
 302         nfs_attr_cache(vp, poap->fres.vap, t, cr);
 303 }
 304 
 305 /*
 306  * Do a cache check based on the weak cache consistency attributes.
 307  * These consist of a small set of pre-operation attributes and the
 308  * full set of post-operation attributes.
 309  *
 310  * If we are given the pre-operation attributes, then use them to
 311  * check the validity of the various caches.  Then, if we got the
 312  * post-operation attributes, make them the new cached attributes.
 313  * If we didn't get the post-operation attributes, then mark the
 314  * attribute cache as timed out so that the next reference will
 315  * cause a GETATTR to the server to refresh with the current
 316  * attributes.
 317  *
 318  * Otherwise, if we didn't get the pre-operation attributes, but
 319  * we did get the post-operation attributes, then use these
 320  * attributes to check the validity of the various caches.  This
 321  * will probably cause a flush of the caches because if the
 322  * operation succeeded, the attributes of the object were changed
 323  * in some way from the old post-operation attributes.  This
 324  * should be okay because it is the safe thing to do.  After
 325  * checking the data caches, then we make these the new cached
 326  * attributes.
 327  *
 328  * Otherwise, we didn't get either the pre- or post-operation
 329  * attributes.  Simply mark the attribute cache as timed out so
 330  * the next reference will cause a GETATTR to the server to
 331  * refresh with the current attributes.
 332  *
 333  * If an error occurred trying to convert the over the wire
 334  * attributes to a vattr, then simply mark the attribute cache as
 335  * timed out.
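      *
      * In short:
      *     pre + post attributes  - validate caches using pre, cache post
      *     post attributes only   - validate caches using post, cache post
      *     neither                - mark the attribute cache as timed out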
 336  */
 337 void
 338 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
 339 {
 340         vattr_t bva;
 341         vattr_t ava;
 342 
 343         if (wccp->after.attributes) {
 344                 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
 345                         PURGE_ATTRCACHE(vp);
 346                         return;
 347                 }
 348                 if (wccp->before.attributes) {
 349                         bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
 350                         bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
 351                         bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
 352                         bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
 353                         bva.va_size = wccp->before.attr.size;
 354                         nfs3_attr_cache(vp, &bva, &ava, t, cr);
 355                 } else
 356                         nfs_attr_cache(vp, &ava, t, cr);
 357         } else {
 358                 PURGE_ATTRCACHE(vp);
 359         }
 360 }
 361 
 362 /*
 363  * Set attributes cache for given vnode using nfsattr.
 364  *
 365  * This routine does not do cache validation with the attributes.
 366  *
 367  * If an error occurred trying to convert the over the wire
 368  * attributes to a vattr, then simply mark the attribute cache as
 369  * timed out.
 370  */
 371 void
 372 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
 373 {
 374         rnode_t *rp;
 375         struct vattr va;
 376 
 377         if (!nattr_to_vattr(vp, na, &va)) {
 378                 rp = VTOR(vp);
 379                 mutex_enter(&rp->r_statelock);
 380                 if (rp->r_mtime <= t)
 381                         nfs_attrcache_va(vp, &va);
 382                 mutex_exit(&rp->r_statelock);
 383         } else {
 384                 PURGE_ATTRCACHE(vp);
 385         }
 386 }
 387 
 388 /*
 389  * Set attributes cache for given vnode using fattr3.
 390  *
 391  * This routine does not do cache validation with the attributes.
 392  *
 393  * If an error occurred trying to convert the over the wire
 394  * attributes to a vattr, then simply mark the attribute cache as
 395  * timed out.
 396  */
 397 void
 398 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
 399 {
 400         rnode_t *rp;
 401         struct vattr va;
 402 
 403         if (!fattr3_to_vattr(vp, na, &va)) {
 404                 rp = VTOR(vp);
 405                 mutex_enter(&rp->r_statelock);
 406                 if (rp->r_mtime <= t)
 407                         nfs_attrcache_va(vp, &va);
 408                 mutex_exit(&rp->r_statelock);
 409         } else {
 410                 PURGE_ATTRCACHE(vp);
 411         }
 412 }
 413 
 414 /*
 415  * Do a cache check based on attributes returned over the wire.  The
 416  * new attributes are cached.
 417  *
 418  * If an error occurred trying to convert the over the wire attributes
 419  * to a vattr, then just return that error.
 420  *
 421  * As a side effect, the vattr argument is filled in with the converted
 422  * attributes.
 423  */
 424 int
 425 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
 426     cred_t *cr)
 427 {
 428         int error;
 429 
 430         error = nattr_to_vattr(vp, na, vap);
 431         if (error)
 432                 return (error);
 433         nfs_attr_cache(vp, vap, t, cr);
 434         return (0);
 435 }
 436 
 437 /*
 438  * Do a cache check based on attributes returned over the wire.  The
 439  * new attributes are cached.
 440  *
 441  * If an error occurred trying to convert the over the wire attributes
 442  * to a vattr, then just return that error.
 443  *
 444  * As a side effect, the vattr argument is filled in with the converted
 445  * attributes.
 446  */
 447 int
 448 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
 449 {
 450         int error;
 451 
 452         error = fattr3_to_vattr(vp, na, vap);
 453         if (error)
 454                 return (error);
 455         nfs_attr_cache(vp, vap, t, cr);
 456         return (0);
 457 }
 458 
 459 /*
 460  * Use the passed in virtual attributes to check to see whether the
 461  * data and metadata caches are valid, cache the new attributes, and
 462  * then do the cache invalidation if required.
 463  *
 464  * The cache validation and caching of the new attributes is done
 465  * atomically via the use of the mutex, r_statelock.  If required,
 466  * the cache invalidation is done atomically w.r.t. the cache
 467  * validation and caching of the attributes via the pseudo lock,
 468  * r_serial.
 469  *
 470  * This routine is used to do cache validation and attributes caching
 471  * for operations with a single set of post operation attributes.
 472  */
 473 void
 474 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
 475 {
 476         rnode_t *rp;
 477         int mtime_changed = 0;
 478         int ctime_changed = 0;
 479         vsecattr_t *vsp;
 480         int was_serial;
 481         len_t preattr_rsize;
 482         boolean_t writeattr_set = B_FALSE;
 483         boolean_t cachepurge_set = B_FALSE;
 484 
 485         rp = VTOR(vp);
 486 
 487         mutex_enter(&rp->r_statelock);
 488 
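             /*
              * If another thread is already serializing attribute and cache
              * updates for this rnode (r_serial acts as a pseudo lock), wait
              * for it to finish.  Give up quietly if the wait is interrupted
              * by a signal.
              */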
 489         if (rp->r_serial != curthread) {
 490                 klwp_t *lwp = ttolwp(curthread);
 491 
 492                 was_serial = 0;
 493                 if (lwp != NULL)
 494                         lwp->lwp_nostop++;
 495                 while (rp->r_serial != NULL) {
 496                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 497                                 mutex_exit(&rp->r_statelock);
 498                                 if (lwp != NULL)
 499                                         lwp->lwp_nostop--;
 500                                 return;
 501                         }
 502                 }
 503                 if (lwp != NULL)
 504                         lwp->lwp_nostop--;
 505         } else
 506                 was_serial = 1;
 507 
 508         if (rp->r_mtime > t) {
 509                 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
 510                         PURGE_ATTRCACHE_LOCKED(rp);
 511                 mutex_exit(&rp->r_statelock);
 512                 return;
 513         }
 514 
 515         /*
 516          * The write thread, after writing data to the file on the
 517          * remote server, always sets RWRITEATTR to indicate that the
 518          * file on the remote server was modified with a WRITE operation
 519          * and that the attribute cache was marked as timed out. If
 520          * RWRITEATTR is set, then do not check for mtime and ctime change.
 521          */
 522         if (!(rp->r_flags & RWRITEATTR)) {
 523                 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
 524                         mtime_changed = 1;
 525 
 526                 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
 527                     rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
 528                         ctime_changed = 1;
 529         } else {
 530                 writeattr_set = B_TRUE;
 531         }
 532 
 533         preattr_rsize = rp->r_size;
 534 
 535         nfs_attrcache_va(vp, vap);
 536 
 537         /*
 538          * If we have updated the file size in nfs_attrcache_va, then as
 539          * soon as we drop r_statelock we will be in the middle of purging
 540          * and updating our caches. It is possible for another thread to
 541          * pick up this new file size and read in zeroed data.  Stall
 542          * other threads until the cache purge is complete.
 543          */
 544         if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
 545                 /*
 546                  * If RWRITEATTR was set and we have updated the file
 547                  * size, the file size returned by the server is not
 548                  * necessarily the result of this client's WRITE. We
 549                  * need to purge all caches.
 550                  */
 551                 if (writeattr_set)
 552                         mtime_changed = 1;
 553 
 554                 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
 555                         rp->r_flags |= RINCACHEPURGE;
 556                         cachepurge_set = B_TRUE;
 557                 }
 558         }
 559 
 560         if (!mtime_changed && !ctime_changed) {
 561                 mutex_exit(&rp->r_statelock);
 562                 return;
 563         }
 564 
 565         rp->r_serial = curthread;
 566 
 567         mutex_exit(&rp->r_statelock);
 568 
 569         if (mtime_changed)
 570                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
 571 
 572         if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
 573                 mutex_enter(&rp->r_statelock);
 574                 rp->r_flags &= ~RINCACHEPURGE;
 575                 cv_broadcast(&rp->r_cv);
 576                 mutex_exit(&rp->r_statelock);
 577                 cachepurge_set = B_FALSE;
 578         }
 579 
 580         if (ctime_changed) {
 581                 (void) nfs_access_purge_rp(rp);
 582                 if (rp->r_secattr != NULL) {
 583                         mutex_enter(&rp->r_statelock);
 584                         vsp = rp->r_secattr;
 585                         rp->r_secattr = NULL;
 586                         mutex_exit(&rp->r_statelock);
 587                         if (vsp != NULL)
 588                                 nfs_acl_free(vsp);
 589                 }
 590         }
 591 
 592         if (!was_serial) {
 593                 mutex_enter(&rp->r_statelock);
 594                 rp->r_serial = NULL;
 595                 cv_broadcast(&rp->r_cv);
 596                 mutex_exit(&rp->r_statelock);
 597         }
 598 }
 599 
 600 /*
 601  * Use the passed in "before" virtual attributes to check to see
 602  * whether the data and metadata caches are valid, cache the "after"
 603  * new attributes, and then do the cache invalidation if required.
 604  *
 605  * The cache validation and caching of the new attributes is done
 606  * atomically via the use of the mutex, r_statelock.  If required,
 607  * the cache invalidation is done atomically w.r.t. the cache
 608  * validation and caching of the attributes via the pseudo lock,
 609  * r_serial.
 610  *
 611  * This routine is used to do cache validation and attributes caching
 612  * for operations with both pre operation attributes and post operation
 613  * attributes.
 614  */
 615 static void
 616 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
 617     cred_t *cr)
 618 {
 619         rnode_t *rp;
 620         int mtime_changed = 0;
 621         int ctime_changed = 0;
 622         vsecattr_t *vsp;
 623         int was_serial;
 624         len_t preattr_rsize;
 625         boolean_t writeattr_set = B_FALSE;
 626         boolean_t cachepurge_set = B_FALSE;
 627 
 628         rp = VTOR(vp);
 629 
 630         mutex_enter(&rp->r_statelock);
 631 
 632         if (rp->r_serial != curthread) {
 633                 klwp_t *lwp = ttolwp(curthread);
 634 
 635                 was_serial = 0;
 636                 if (lwp != NULL)
 637                         lwp->lwp_nostop++;
 638                 while (rp->r_serial != NULL) {
 639                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 640                                 mutex_exit(&rp->r_statelock);
 641                                 if (lwp != NULL)
 642                                         lwp->lwp_nostop--;
 643                                 return;
 644                         }
 645                 }
 646                 if (lwp != NULL)
 647                         lwp->lwp_nostop--;
 648         } else
 649                 was_serial = 1;
 650 
 651         if (rp->r_mtime > t) {
 652                 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
 653                         PURGE_ATTRCACHE_LOCKED(rp);
 654                 mutex_exit(&rp->r_statelock);
 655                 return;
 656         }
 657 
 658         /*
 659          * The write thread, after writing data to the file on the
 660          * remote server, always sets RWRITEATTR to indicate that the
 661          * file on the remote server was modified with a WRITE operation
 662          * and that the attribute cache was marked as timed out. If
 663          * RWRITEATTR is set, then do not check for mtime and ctime change.
 664          */
 665         if (!(rp->r_flags & RWRITEATTR)) {
 666                 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
 667                         mtime_changed = 1;
 668 
 669                 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
 670                     rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
 671                         ctime_changed = 1;
 672         } else {
 673                 writeattr_set = B_TRUE;
 674         }
 675 
 676         preattr_rsize = rp->r_size;
 677 
 678         nfs_attrcache_va(vp, avap);
 679 
 680         /*
 681          * If we have updated the file size in nfs_attrcache_va, then as
 682          * soon as we drop r_statelock we will be in the middle of purging
 683          * and updating our caches. It is possible for another thread to
 684          * pick up this new file size and read in zeroed data.  Stall
 685          * other threads until the cache purge is complete.
 686          */
 687         if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
 688                 /*
 689                  * If RWRITEATTR was set and we have updated the file
 690                  * size, the file size returned by the server is not
 691                  * necessarily the result of this client's WRITE. We
 692                  * need to purge all caches.
 693                  */
 694                 if (writeattr_set)
 695                         mtime_changed = 1;
 696 
 697                 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
 698                         rp->r_flags |= RINCACHEPURGE;
 699                         cachepurge_set = B_TRUE;
 700                 }
 701         }
 702 
 703         if (!mtime_changed && !ctime_changed) {
 704                 mutex_exit(&rp->r_statelock);
 705                 return;
 706         }
 707 
 708         rp->r_serial = curthread;
 709 
 710         mutex_exit(&rp->r_statelock);
 711 
 712         if (mtime_changed)
 713                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
 714 
 715         if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
 716                 mutex_enter(&rp->r_statelock);
 717                 rp->r_flags &= ~RINCACHEPURGE;
 718                 cv_broadcast(&rp->r_cv);
 719                 mutex_exit(&rp->r_statelock);
 720                 cachepurge_set = B_FALSE;
 721         }
 722 
 723         if (ctime_changed) {
 724                 (void) nfs_access_purge_rp(rp);
 725                 if (rp->r_secattr != NULL) {
 726                         mutex_enter(&rp->r_statelock);
 727                         vsp = rp->r_secattr;
 728                         rp->r_secattr = NULL;
 729                         mutex_exit(&rp->r_statelock);
 730                         if (vsp != NULL)
 731                                 nfs_acl_free(vsp);
 732                 }
 733         }
 734 
 735         if (!was_serial) {
 736                 mutex_enter(&rp->r_statelock);
 737                 rp->r_serial = NULL;
 738                 cv_broadcast(&rp->r_cv);
 739                 mutex_exit(&rp->r_statelock);
 740         }
 741 }
 742 
 743 /*
 744  * Set attributes cache for given vnode using virtual attributes.
 745  *
 746  * Set the timeout value on the attribute cache and fill it
 747  * with the passed in attributes.
 748  *
 749  * The caller must be holding r_statelock.
 750  */
 751 void
 752 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
 753 {
 754         rnode_t *rp;
 755         mntinfo_t *mi;
 756         hrtime_t delta;
 757         hrtime_t now;
 758 
 759         rp = VTOR(vp);
 760 
 761         ASSERT(MUTEX_HELD(&rp->r_statelock));
 762 
 763         now = gethrtime();
 764 
 765         mi = VTOMI(vp);
 766 
 767         /*
 768          * Delta is the number of nanoseconds that we will
 769          * cache the attributes of the file.  It is based on
 770          * the number of nanoseconds since the last time that
 771          * we detected a change.  The assumption is that files
 772          * that changed recently are likely to change again.
 773          * However, minimum and maximum values, maintained separately for
 774          * regular files and for directories, are always enforced.
 775          *
 776          * Using the time since last change was detected
 777          * eliminates direct comparison or calculation
 778          * using mixed client and server times.  NFS does
 779          * not make any assumptions regarding the client
 780          * and server clocks being synchronized.
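              *
              * For example, with an acregmin of 3 seconds and an acregmax of
              * 60 seconds, a regular file seen to change 10 seconds ago has
              * its attributes cached for 10 seconds, while one that has not
              * changed for several minutes is cached for the full 60 seconds.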
 781          */
 782         if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
 783             va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
 784             va->va_size != rp->r_attr.va_size)
 785                 rp->r_mtime = now;
 786 
 787         if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
 788                 delta = 0;
 789         else {
 790                 delta = now - rp->r_mtime;
 791                 if (vp->v_type == VDIR) {
 792                         if (delta < mi->mi_acdirmin)
 793                                 delta = mi->mi_acdirmin;
 794                         else if (delta > mi->mi_acdirmax)
 795                                 delta = mi->mi_acdirmax;
 796                 } else {
 797                         if (delta < mi->mi_acregmin)
 798                                 delta = mi->mi_acregmin;
 799                         else if (delta > mi->mi_acregmax)
 800                                 delta = mi->mi_acregmax;
 801                 }
 802         }
 803         rp->r_attrtime = now + delta;
 804         rp->r_attr = *va;
 805         /*
 806          * Update the size of the file if there is no cached data or if
 807          * the cached data is clean and there is no data being written
 808          * out.
 809          */
 810         if (rp->r_size != va->va_size &&
 811             (!vn_has_cached_data(vp) ||
 812             (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
 813                 rp->r_size = va->va_size;
 814         nfs_setswaplike(vp, va);
 815         rp->r_flags &= ~RWRITEATTR;
 816 }
 817 
 818 /*
 819  * Fill in attribute from the cache.
 820  * If valid, then return 0 to indicate that no error occurred,
 821  * otherwise return 1 to indicate that an error occurred.
 822  */
 823 static int
 824 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
 825 {
 826         rnode_t *rp;
 827         uint_t mask = vap->va_mask;
 828 
 829         rp = VTOR(vp);
 830         mutex_enter(&rp->r_statelock);
 831         if (ATTRCACHE_VALID(vp)) {
 832                 /*
 833                  * Cached attributes are valid
 834                  */
 835                 *vap = rp->r_attr;
 836                 /*
 837                  * Set the caller's va_mask to the set of attributes
 838                  * that were requested ANDed with the attributes that
 839                  * are available.  If attributes were requested that
 840                  * are not available, those bits must be turned off
 841                  * in the callers va_mask.
 842                  */
 843                 vap->va_mask &= mask;
 844                 mutex_exit(&rp->r_statelock);
 845                 return (0);
 846         }
 847         mutex_exit(&rp->r_statelock);
 848         return (1);
 849 }
 850 
 851 /*
 852  * Get attributes over-the-wire and update attributes cache
 853  * if no error occurred in the over-the-wire operation.
 854  * Return 0 if successful, otherwise error.
 855  */
 856 int
 857 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
 858 {
 859         int error;
 860         struct nfsattrstat ns;
 861         int douprintf;
 862         mntinfo_t *mi;
 863         failinfo_t fi;
 864         hrtime_t t;
 865 
 866         mi = VTOMI(vp);
 867         fi.vp = vp;
 868         fi.fhp = NULL;          /* no need to update, filehandle not copied */
 869         fi.copyproc = nfscopyfh;
 870         fi.lookupproc = nfslookup;
 871         fi.xattrdirproc = acl_getxattrdir2;
 872 
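             /*
              * Try to get the attributes via the ACL protocol first.  The
              * call below may clear MI_ACL if the server turns out not to
              * support the ACL protocol, in which case we fall through to
              * the plain NFS GETATTR.
              */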
 873         if (mi->mi_flags & MI_ACL) {
 874                 error = acl_getattr2_otw(vp, vap, cr);
 875                 if (mi->mi_flags & MI_ACL)
 876                         return (error);
 877         }
 878 
 879         douprintf = 1;
 880 
 881         t = gethrtime();
 882 
 883         error = rfs2call(mi, RFS_GETATTR,
 884             xdr_fhandle, (caddr_t)VTOFH(vp),
 885             xdr_attrstat, (caddr_t)&ns, cr,
 886             &douprintf, &ns.ns_status, 0, &fi);
 887 
 888         if (!error) {
 889                 error = geterrno(ns.ns_status);
 890                 if (!error)
 891                         error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
 892                 else {
 893                         PURGE_STALE_FH(error, vp, cr);
 894                 }
 895         }
 896 
 897         return (error);
 898 }
 899 
 900 /*
 901  * Return either cached or remote attributes. If we get remote attributes,
 902  * use them to check and invalidate caches, then cache the new attributes.
 903  */
 904 int
 905 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
 906 {
 907         int error;
 908         rnode_t *rp;
 909 
 910         /*
 911          * If we've got cached attributes, we're done, otherwise go
 912          * to the server to get attributes, which will update the cache
 913          * in the process.
 914          */
 915         error = nfs_getattr_cache(vp, vap);
 916         if (error)
 917                 error = nfs_getattr_otw(vp, vap, cr);
 918 
 919         /* Return the client's view of file size */
 920         rp = VTOR(vp);
 921         mutex_enter(&rp->r_statelock);
 922         vap->va_size = rp->r_size;
 923         mutex_exit(&rp->r_statelock);
 924 
 925         return (error);
 926 }
 927 
 928 /*
 929  * Get attributes over-the-wire and update attributes cache
 930  * if no error occurred in the over-the-wire operation.
 931  * Return 0 if successful, otherwise error.
 932  */
 933 int
 934 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
 935 {
 936         int error;
 937         GETATTR3args args;
 938         GETATTR3vres res;
 939         int douprintf;
 940         failinfo_t fi;
 941         hrtime_t t;
 942 
 943         args.object = *VTOFH3(vp);
 944         fi.vp = vp;
 945         fi.fhp = (caddr_t)&args.object;
 946         fi.copyproc = nfs3copyfh;
 947         fi.lookupproc = nfs3lookup;
 948         fi.xattrdirproc = acl_getxattrdir3;
 949         res.fres.vp = vp;
 950         res.fres.vap = vap;
 951 
 952         douprintf = 1;
 953 
 954         t = gethrtime();
 955 
 956         error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
 957             xdr_nfs_fh3, (caddr_t)&args,
 958             xdr_GETATTR3vres, (caddr_t)&res, cr,
 959             &douprintf, &res.status, 0, &fi);
 960 
 961         if (error)
 962                 return (error);
 963 
 964         error = geterrno3(res.status);
 965         if (error) {
 966                 PURGE_STALE_FH(error, vp, cr);
 967                 return (error);
 968         }
 969 
 970         /*
 971          * Catch status codes that indicate fattr3 to vattr translation failure
 972          */
 973         if (res.fres.status)
 974                 return (res.fres.status);
 975 
 976         nfs_attr_cache(vp, vap, t, cr);
 977         return (0);
 978 }
 979 
 980 /*
 981  * Return either cached or remote attributes. If we get remote attributes,
 982  * use them to check and invalidate caches, then cache the new attributes.
 983  */
 984 int
 985 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
 986 {
 987         int error;
 988         rnode_t *rp;
 989 
 990         /*
 991          * If we've got cached attributes, we're done, otherwise go
 992          * to the server to get attributes, which will update the cache
 993          * in the process.
 994          */
 995         error = nfs_getattr_cache(vp, vap);
 996         if (error)
 997                 error = nfs3_getattr_otw(vp, vap, cr);
 998 
 999         /* Return the client's view of file size */
1000         rp = VTOR(vp);
1001         mutex_enter(&rp->r_statelock);
1002         vap->va_size = rp->r_size;
1003         mutex_exit(&rp->r_statelock);
1004 
1005         return (error);
1006 }
1007 
1008 vtype_t nf_to_vt[] = {
1009         VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1010 };
1011 /*
1012  * Convert NFS Version 2 over the network attributes to the local
1013  * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1014  * network representation and the local representation is done here.
1015  * Returns 0 for success, error if failed due to overflow.
1016  */
1017 int
1018 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1019 {
1020         /* overflow in time attributes? */
1021 #ifndef _LP64
1022         if (!NFS2_FATTR_TIME_OK(na))
1023                 return (EOVERFLOW);
1024 #endif
1025 
1026         vap->va_mask = AT_ALL;
1027 
1028         if (na->na_type < NFNON || na->na_type > NFSOC)
1029                 vap->va_type = VBAD;
1030         else
1031                 vap->va_type = nf_to_vt[na->na_type];
1032         vap->va_mode = na->na_mode;
1033         vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1034         vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1035         vap->va_fsid = vp->v_vfsp->vfs_dev;
1036         vap->va_nodeid = na->na_nodeid;
1037         vap->va_nlink = na->na_nlink;
1038         vap->va_size = na->na_size;       /* keep for cache validation */
1039         /*
1040          * nfs protocol defines times as unsigned so don't extend sign,
1041          * unless sysadmin set nfs_allow_preepoch_time.
1042          */
1043         NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1044         vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1045         NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1046         vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1047         NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1048         vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1049         /*
1050          * Shannon's law - uncompress the received dev_t
1051          * if the top half of it is zero, indicating a response
1052          * from an `older style' OS. Except for when it is a
1053          * `new style' OS sending a major device of zero,
1054          * in which case the algorithm still works because the
1055          * fact that it is a new style server
1056          * is hidden by the minor device not being greater
1057          * than 255 (a requirement in this case).
1058          */
1059         if ((na->na_rdev & 0xffff0000) == 0)
1060                 vap->va_rdev = nfsv2_expdev(na->na_rdev);
1061         else
1062                 vap->va_rdev = expldev(na->na_rdev);
1063 
1064         vap->va_nblocks = na->na_blocks;
1065         switch (na->na_type) {
1066         case NFBLK:
1067                 vap->va_blksize = DEV_BSIZE;
1068                 break;
1069 
1070         case NFCHR:
1071                 vap->va_blksize = MAXBSIZE;
1072                 break;
1073 
1074         case NFSOC:
1075         default:
1076                 vap->va_blksize = na->na_blocksize;
1077                 break;
1078         }
1079         /*
1080          * This bit of ugliness is a hack to preserve the
1081          * over-the-wire protocols for named-pipe vnodes.
1082          * It remaps the special over-the-wire type to the
1083          * VFIFO type. (see note in nfs.h)
1084          */
1085         if (NA_ISFIFO(na)) {
1086                 vap->va_type = VFIFO;
1087                 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1088                 vap->va_rdev = 0;
1089                 vap->va_blksize = na->na_blocksize;
1090         }
1091         vap->va_seq = 0;
1092         return (0);
1093 }
1094 
1095 /*
1096  * Convert NFS Version 3 over the network attributes to the local
1097  * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
1098  * network representation and the local representation is done here.
1099  */
1100 vtype_t nf3_to_vt[] = {
1101         VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1102 };
1103 
1104 int
1105 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1106 {
1107 
1108 #ifndef _LP64
1109         /* overflow in time attributes? */
1110         if (!NFS3_FATTR_TIME_OK(na))
1111                 return (EOVERFLOW);
1112 #endif
1113         if (!NFS3_SIZE_OK(na->size))
1114                 /* file too big */
1115                 return (EFBIG);
1116 
1117         vap->va_mask = AT_ALL;
1118 
1119         if (na->type < NF3REG || na->type > NF3FIFO)
1120                 vap->va_type = VBAD;
1121         else
1122                 vap->va_type = nf3_to_vt[na->type];
1123         vap->va_mode = na->mode;
1124         vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1125         vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1126         vap->va_fsid = vp->v_vfsp->vfs_dev;
1127         vap->va_nodeid = na->fileid;
1128         vap->va_nlink = na->nlink;
1129         vap->va_size = na->size;
1130 
1131         /*
1132          * nfs protocol defines times as unsigned so don't extend sign,
1133          * unless sysadmin set nfs_allow_preepoch_time.
1134          */
1135         NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1136         vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1137         NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1138         vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1139         NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1140         vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1141 
1142         switch (na->type) {
1143         case NF3BLK:
1144                 vap->va_rdev = makedevice(na->rdev.specdata1,
1145                     na->rdev.specdata2);
1146                 vap->va_blksize = DEV_BSIZE;
1147                 vap->va_nblocks = 0;
1148                 break;
1149         case NF3CHR:
1150                 vap->va_rdev = makedevice(na->rdev.specdata1,
1151                     na->rdev.specdata2);
1152                 vap->va_blksize = MAXBSIZE;
1153                 vap->va_nblocks = 0;
1154                 break;
1155         case NF3REG:
1156         case NF3DIR:
1157         case NF3LNK:
1158                 vap->va_rdev = 0;
1159                 vap->va_blksize = MAXBSIZE;
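                     /*
                      * Convert the space used from bytes to DEV_BSIZE
                      * (512-byte) blocks, rounding up.
                      */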
1160                 vap->va_nblocks = (u_longlong_t)
1161                     ((na->used + (size3)DEV_BSIZE - (size3)1) /
1162                     (size3)DEV_BSIZE);
1163                 break;
1164         case NF3SOCK:
1165         case NF3FIFO:
1166         default:
1167                 vap->va_rdev = 0;
1168                 vap->va_blksize = MAXBSIZE;
1169                 vap->va_nblocks = 0;
1170                 break;
1171         }
1172         vap->va_seq = 0;
1173         return (0);
1174 }
1175 
1176 /*
1177  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1178  * for the demand-based allocation of async threads per-mount.  The
1179  * nfs_async_timeout is the amount of time a thread will live after it
1180  * becomes idle, unless new I/O requests are received before the thread
1181  * dies.  See nfs_async_putpage and nfs_async_start.
1182  */
1183 
1184 int nfs_async_timeout = -1;     /* uninitialized */
1185 
1186 static void     nfs_async_start(struct vfs *);
1187 static void     nfs_async_pgops_start(struct vfs *);
1188 static void     nfs_async_common_start(struct vfs *, int);
1189 
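     /*
      * Tear down an asynchronous request structure once the request has
      * been processed: update the rnode's I/O counts (except for
      * NFS_INACTIVE requests) and drop the vnode and credential holds
      * before freeing the structure.
      */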
1190 static void
1191 free_async_args(struct nfs_async_reqs *args)
1192 {
1193         rnode_t *rp;
1194 
1195         if (args->a_io != NFS_INACTIVE) {
1196                 rp = VTOR(args->a_vp);
1197                 mutex_enter(&rp->r_statelock);
1198                 rp->r_count--;
1199                 if (args->a_io == NFS_PUTAPAGE ||
1200                     args->a_io == NFS_PAGEIO)
1201                         rp->r_awcount--;
1202                 cv_broadcast(&rp->r_cv);
1203                 mutex_exit(&rp->r_statelock);
1204                 VN_RELE(args->a_vp);
1205         }
1206         crfree(args->a_cred);
1207         kmem_free(args, sizeof (*args));
1208 }
1209 
1210 /*
1211  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1212  * pageout(), running in the global zone, have legitimate reasons to do
1213  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
1214  * use of a per-mount "asynchronous requests manager thread" which is
1215  * signaled by the various asynchronous work routines when there is
1216  * asynchronous work to be done.  It is responsible for creating new
1217  * worker threads if necessary, and notifying existing worker threads
1218  * that there is work to be done.
1219  *
1220  * In other words, it will "take the specifications from the customers and
1221  * give them to the engineers."
1222  *
1223  * Worker threads die off of their own accord if they are no longer
1224  * needed.
1225  *
1226  * This thread is killed when the zone is going away or the filesystem
1227  * is being unmounted.
1228  */
1229 void
1230 nfs_async_manager(vfs_t *vfsp)
1231 {
1232         callb_cpr_t cprinfo;
1233         mntinfo_t *mi;
1234         uint_t max_threads;
1235 
1236         mi = VFTOMI(vfsp);
1237 
1238         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1239             "nfs_async_manager");
1240 
1241         mutex_enter(&mi->mi_async_lock);
1242         /*
1243          * We want to stash the max number of threads that this mount was
1244          * allowed so we can use it later when the variable is set to zero as
1245          * part of the zone/mount going away.
1246          *
1247          * We want to be able to create at least one thread to handle
1248          * asynchronous inactive calls.
1249          */
1250         max_threads = MAX(mi->mi_max_threads, 1);
1251         /*
1252          * We don't want to wait for mi_max_threads to go to zero, since that
1253          * happens as part of a failed unmount, but this thread should only
1254          * exit when the mount/zone is really going away.
1255          *
1256          * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1257          * attempted: the various _async_*() functions know to do things
1258          * inline if mi_max_threads == 0.  Henceforth we just drain out the
1259          * outstanding requests.
1260          *
1261          * Note that we still create zthreads even if we notice the zone is
1262          * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1263          * shutdown sequence to take slightly longer in some cases, but
1264          * doesn't violate the protocol, as all threads will exit as soon as
1265          * they're done processing the remaining requests.
1266          */
1267         for (;;) {
1268                 while (mi->mi_async_req_count > 0) {
1269                         /*
1270                          * Paranoia: If the mount started out having
1271                          * (mi->mi_max_threads == 0), and the value was
1272                          * later changed (via a debugger or somesuch),
1273                          * we could be confused since we will think we
1274                          * can't create any threads, and the calling
1275                          * code (which looks at the current value of
1276                          * mi->mi_max_threads, now non-zero) thinks we
1277                          * can.
1278                          *
1279                          * So, because we're paranoid, we create threads
1280                          * up to the maximum of the original and the
1281                          * current value. This means that future
1282                          * (debugger-induced) lowerings of
1283                          * mi->mi_max_threads are ignored for our
1284                          * purposes, but who told them they could change
1285                          * random values on a live kernel anyhow?
1286                          */
1287                         if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1288                             MAX(mi->mi_max_threads, max_threads)) {
1289                                 mi->mi_threads[NFS_ASYNC_QUEUE]++;
1290                                 mutex_exit(&mi->mi_async_lock);
1291                                 VFS_HOLD(vfsp); /* hold for new thread */
1292                                 (void) zthread_create(NULL, 0, nfs_async_start,
1293                                     vfsp, 0, minclsyspri);
1294                                 mutex_enter(&mi->mi_async_lock);
1295                         } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1296                             NUM_ASYNC_PGOPS_THREADS) {
1297                                 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1298                                 mutex_exit(&mi->mi_async_lock);
1299                                 VFS_HOLD(vfsp); /* hold for new thread */
1300                                 (void) zthread_create(NULL, 0,
1301                                     nfs_async_pgops_start, vfsp, 0,
1302                                     minclsyspri);
1303                                 mutex_enter(&mi->mi_async_lock);
1304                         }
1305                         NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1306                         ASSERT(mi->mi_async_req_count != 0);
1307                         mi->mi_async_req_count--;
1308                 }
1309 
1310                 mutex_enter(&mi->mi_lock);
1311                 if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1312                         mutex_exit(&mi->mi_lock);
1313                         break;
1314                 }
1315                 mutex_exit(&mi->mi_lock);
1316 
1317                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1318                 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1319                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1320         }
1321         /*
1322          * Let everyone know we're done.
1323          */
1324         mi->mi_manager_thread = NULL;
1325         cv_broadcast(&mi->mi_async_cv);
1326 
1327         /*
1328          * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1329          * since CALLB_CPR_EXIT is actually responsible for releasing
1330          * 'mi_async_lock'.
1331          */
1332         CALLB_CPR_EXIT(&cprinfo);
1333         VFS_RELE(vfsp); /* release thread's hold */
1334         zthread_exit();
1335 }
1336 
1337 /*
1338  * Signal (and wait for) the async manager thread to clean up and go away.
1339  */
1340 void
1341 nfs_async_manager_stop(vfs_t *vfsp)
1342 {
1343         mntinfo_t *mi = VFTOMI(vfsp);
1344 
1345         mutex_enter(&mi->mi_async_lock);
1346         mutex_enter(&mi->mi_lock);
1347         mi->mi_flags |= MI_ASYNC_MGR_STOP;
1348         mutex_exit(&mi->mi_lock);
1349         cv_broadcast(&mi->mi_async_reqs_cv);
1350         while (mi->mi_manager_thread != NULL)
1351                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1352         mutex_exit(&mi->mi_async_lock);
1353 }
1354 
1355 int
1356 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1357     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1358     u_offset_t, caddr_t, struct seg *, cred_t *))
1359 {
1360         rnode_t *rp;
1361         mntinfo_t *mi;
1362         struct nfs_async_reqs *args;
1363 
1364         rp = VTOR(vp);
1365         ASSERT(rp->r_freef == NULL);
1366 
1367         mi = VTOMI(vp);
1368 
1369         /*
1370          * If addr falls in a different segment, don't bother doing readahead.
1371          */
1372         if (addr >= seg->s_base + seg->s_size)
1373                 return (-1);
1374 
1375         /*
1376          * If we can't allocate a request structure, punt on the readahead.
1377          */
1378         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1379                 return (-1);
1380 
1381         /*
1382          * If a lock operation is pending, don't initiate any new
1383          * readaheads.  Otherwise, bump r_count to indicate the new
1384          * asynchronous I/O.
1385          */
1386         if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1387                 kmem_free(args, sizeof (*args));
1388                 return (-1);
1389         }
1390         mutex_enter(&rp->r_statelock);
1391         rp->r_count++;
1392         mutex_exit(&rp->r_statelock);
1393         nfs_rw_exit(&rp->r_lkserlock);
1394 
1395         args->a_next = NULL;
1396 #ifdef DEBUG
1397         args->a_queuer = curthread;
1398 #endif
1399         VN_HOLD(vp);
1400         args->a_vp = vp;
1401         ASSERT(cr != NULL);
1402         crhold(cr);
1403         args->a_cred = cr;
1404         args->a_io = NFS_READ_AHEAD;
1405         args->a_nfs_readahead = readahead;
1406         args->a_nfs_blkoff = blkoff;
1407         args->a_nfs_seg = seg;
1408         args->a_nfs_addr = addr;
1409 
1410         mutex_enter(&mi->mi_async_lock);
1411 
1412         /*
1413          * If asyncio has been disabled, don't bother with readahead.
1414          */
1415         if (mi->mi_max_threads == 0) {
1416                 mutex_exit(&mi->mi_async_lock);
1417                 goto noasync;
1418         }
1419 
1420         /*
1421          * Link request structure into the async list and
1422          * wakeup async thread to do the i/o.
1423          */
1424         if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1425                 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1426                 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1427         } else {
1428                 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1429                 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1430         }
1431 
1432         if (mi->mi_io_kstats) {
1433                 mutex_enter(&mi->mi_lock);
1434                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1435                 mutex_exit(&mi->mi_lock);
1436         }
1437 
1438         mi->mi_async_req_count++;
1439         ASSERT(mi->mi_async_req_count != 0);
1440         cv_signal(&mi->mi_async_reqs_cv);
1441         mutex_exit(&mi->mi_async_lock);
1442         return (0);
1443 
1444 noasync:
1445         mutex_enter(&rp->r_statelock);
1446         rp->r_count--;
1447         cv_broadcast(&rp->r_cv);
1448         mutex_exit(&rp->r_statelock);
1449         VN_RELE(vp);
1450         crfree(cr);
1451         kmem_free(args, sizeof (*args));
1452         return (-1);
1453 }
1454 
1455 int
1456 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1457     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1458     u_offset_t, size_t, int, cred_t *))
1459 {
1460         rnode_t *rp;
1461         mntinfo_t *mi;
1462         struct nfs_async_reqs *args;
1463 
1464         ASSERT(flags & B_ASYNC);
1465         ASSERT(vp->v_vfsp != NULL);
1466 
1467         rp = VTOR(vp);
1468         ASSERT(rp->r_count > 0);
1469 
1470         mi = VTOMI(vp);
1471 
1472         /*
1473          * If we can't allocate a request structure, do the putpage
1474          * operation synchronously in this thread's context.
1475          */
1476         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1477                 goto noasync;
1478 
1479         args->a_next = NULL;
1480 #ifdef DEBUG
1481         args->a_queuer = curthread;
1482 #endif
1483         VN_HOLD(vp);
1484         args->a_vp = vp;
1485         ASSERT(cr != NULL);
1486         crhold(cr);
1487         args->a_cred = cr;
1488         args->a_io = NFS_PUTAPAGE;
1489         args->a_nfs_putapage = putapage;
1490         args->a_nfs_pp = pp;
1491         args->a_nfs_off = off;
1492         args->a_nfs_len = (uint_t)len;
1493         args->a_nfs_flags = flags;
1494 
1495         mutex_enter(&mi->mi_async_lock);
1496 
1497         /*
1498          * If asyncio has been disabled, then make a synchronous request.
1499          * This check is done a second time in case async io was disabled
1500          * while this thread was blocked waiting for memory pressure to
1501          * reduce or for the queue to drain.
1502          */
1503         if (mi->mi_max_threads == 0) {
1504                 mutex_exit(&mi->mi_async_lock);
1505                 goto noasync;
1506         }
1507 
1508         /*
1509          * Link request structure into the async list and
1510          * wakeup async thread to do the i/o.
1511          */
1512         if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1513                 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1514                 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1515         } else {
1516                 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1517                 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1518         }
1519 
1520         mutex_enter(&rp->r_statelock);
1521         rp->r_count++;
1522         rp->r_awcount++;
1523         mutex_exit(&rp->r_statelock);
1524 
1525         if (mi->mi_io_kstats) {
1526                 mutex_enter(&mi->mi_lock);
1527                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1528                 mutex_exit(&mi->mi_lock);
1529         }
1530 
1531         mi->mi_async_req_count++;
1532         ASSERT(mi->mi_async_req_count != 0);
1533         cv_signal(&mi->mi_async_reqs_cv);
1534         mutex_exit(&mi->mi_async_lock);
1535         return (0);
1536 
1537 noasync:
1538         if (args != NULL) {
1539                 VN_RELE(vp);
1540                 crfree(cr);
1541                 kmem_free(args, sizeof (*args));
1542         }
1543 
1544         if (curproc == proc_pageout || curproc == proc_fsflush) {
1545                 /*
1546                  * If we get here in the context of the pageout/fsflush,
1547                  * we refuse to do a sync write, because this may hang
1548                  * pageout (and the machine). In this case, we just
1549                  * re-mark the page as dirty and punt on the page.
1550                  *
1551                  * Make sure B_FORCE isn't set.  We can re-mark the
1552                  * pages as dirty and unlock the pages in one swoop by
1553                  * passing in B_ERROR to pvn_write_done().  However,
1554                  * we should make sure B_FORCE isn't set - we don't
1555                  * want the page tossed before it gets written out.
1556                  */
1557                 if (flags & B_FORCE)
1558                         flags &= ~(B_INVAL | B_FORCE);
1559                 pvn_write_done(pp, flags | B_ERROR);
1560                 return (0);
1561         }
1562         if (nfs_zone() != mi->mi_zone) {
1563                 /*
1564                  * So this was a cross-zone sync putpage.  We pass in B_ERROR
1565                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1566                  * them.
1567                  *
1568                  * We don't want to clear B_FORCE here as the caller presumably
1569                  * knows what they're doing if they set it.
1570                  */
1571                 pvn_write_done(pp, flags | B_ERROR);
1572                 return (EPERM);
1573         }
1574         return ((*putapage)(vp, pp, off, len, flags, cr));
1575 }
1576 
1577 int
1578 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1579     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1580     size_t, int, cred_t *))
1581 {
1582         rnode_t *rp;
1583         mntinfo_t *mi;
1584         struct nfs_async_reqs *args;
1585 
1586         ASSERT(flags & B_ASYNC);
1587         ASSERT(vp->v_vfsp != NULL);
1588 
1589         rp = VTOR(vp);
1590         ASSERT(rp->r_count > 0);
1591 
1592         mi = VTOMI(vp);
1593 
1594         /*
1595          * If we can't allocate a request structure, do the pageio
1596          * request synchronously in this thread's context.
1597          */
1598         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1599                 goto noasync;
1600 
1601         args->a_next = NULL;
1602 #ifdef DEBUG
1603         args->a_queuer = curthread;
1604 #endif
1605         VN_HOLD(vp);
1606         args->a_vp = vp;
1607         ASSERT(cr != NULL);
1608         crhold(cr);
1609         args->a_cred = cr;
1610         args->a_io = NFS_PAGEIO;
1611         args->a_nfs_pageio = pageio;
1612         args->a_nfs_pp = pp;
1613         args->a_nfs_off = io_off;
1614         args->a_nfs_len = (uint_t)io_len;
1615         args->a_nfs_flags = flags;
1616 
1617         mutex_enter(&mi->mi_async_lock);
1618 
1619         /*
1620          * If asyncio has been disabled, then make a synchronous request.
1621          * This check is done a second time in case async io was disabled
1622          * while this thread was blocked waiting for memory pressure to
1623          * reduce or for the queue to drain.
1624          */
1625         if (mi->mi_max_threads == 0) {
1626                 mutex_exit(&mi->mi_async_lock);
1627                 goto noasync;
1628         }
1629 
1630         /*
1631          * Link request structure into the async list and
1632          * wakeup async thread to do the i/o.
1633          */
1634         if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1635                 mi->mi_async_reqs[NFS_PAGEIO] = args;
1636                 mi->mi_async_tail[NFS_PAGEIO] = args;
1637         } else {
1638                 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1639                 mi->mi_async_tail[NFS_PAGEIO] = args;
1640         }
1641 
1642         mutex_enter(&rp->r_statelock);
1643         rp->r_count++;
1644         rp->r_awcount++;
1645         mutex_exit(&rp->r_statelock);
1646 
1647         if (mi->mi_io_kstats) {
1648                 mutex_enter(&mi->mi_lock);
1649                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1650                 mutex_exit(&mi->mi_lock);
1651         }
1652 
1653         mi->mi_async_req_count++;
1654         ASSERT(mi->mi_async_req_count != 0);
1655         cv_signal(&mi->mi_async_reqs_cv);
1656         mutex_exit(&mi->mi_async_lock);
1657         return (0);
1658 
1659 noasync:
1660         if (args != NULL) {
1661                 VN_RELE(vp);
1662                 crfree(cr);
1663                 kmem_free(args, sizeof (*args));
1664         }
1665 
1666         /*
1667          * If we can't do it async, then for reads we do nothing (except
1668          * clean up the page list), and for writes we do it synchronously,
1669          * except for proc_pageout/proc_fsflush as described below.
1670          */
1671         if (flags & B_READ) {
1672                 pvn_read_done(pp, flags | B_ERROR);
1673                 return (0);
1674         }
1675 
1676         if (curproc == proc_pageout || curproc == proc_fsflush) {
1677                 /*
1678                  * If we get here in the context of the pageout/fsflush,
1679                  * we refuse to do a sync write, because this may hang
1680                  * pageout/fsflush (and the machine). In this case, we just
1681                  * re-mark the page as dirty and punt on the page.
1682                  *
1683                  * Make sure B_FORCE isn't set.  We can re-mark the
1684                  * pages as dirty and unlock the pages in one swoop by
1685                  * passing in B_ERROR to pvn_write_done().  However,
1686                  * we should make sure B_FORCE isn't set - we don't
1687                  * want the page tossed before it gets written out.
1688                  */
1689                 if (flags & B_FORCE)
1690                         flags &= ~(B_INVAL | B_FORCE);
1691                 pvn_write_done(pp, flags | B_ERROR);
1692                 return (0);
1693         }
1694 
1695         if (nfs_zone() != mi->mi_zone) {
1696                 /*
1697                  * So this was a cross-zone sync pageio.  We pass in B_ERROR
1698                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1699                  * them.
1700                  *
1701                  * We don't want to clear B_FORCE here as the caller presumably
1702                  * knows what they're doing if they set it.
1703                  */
1704                 pvn_write_done(pp, flags | B_ERROR);
1705                 return (EPERM);
1706         }
1707         return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1708 }
1709 
1710 void
1711 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1712     int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1713 {
1714         rnode_t *rp;
1715         mntinfo_t *mi;
1716         struct nfs_async_reqs *args;
1717 
1718         rp = VTOR(vp);
1719         ASSERT(rp->r_freef == NULL);
1720 
1721         mi = VTOMI(vp);
1722 
1723         /*
1724          * If we can't allocate a request structure, do the readdir
1725          * operation synchronously in this thread's context.
1726          */
1727         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1728                 goto noasync;
1729 
1730         args->a_next = NULL;
1731 #ifdef DEBUG
1732         args->a_queuer = curthread;
1733 #endif
1734         VN_HOLD(vp);
1735         args->a_vp = vp;
1736         ASSERT(cr != NULL);
1737         crhold(cr);
1738         args->a_cred = cr;
1739         args->a_io = NFS_READDIR;
1740         args->a_nfs_readdir = readdir;
1741         args->a_nfs_rdc = rdc;
1742 
1743         mutex_enter(&mi->mi_async_lock);
1744 
1745         /*
1746          * If asyncio has been disabled, then make a synchronous request.
1747          */
1748         if (mi->mi_max_threads == 0) {
1749                 mutex_exit(&mi->mi_async_lock);
1750                 goto noasync;
1751         }
1752 
1753         /*
1754          * Link request structure into the async list and
1755          * wakeup async thread to do the i/o.
1756          */
1757         if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1758                 mi->mi_async_reqs[NFS_READDIR] = args;
1759                 mi->mi_async_tail[NFS_READDIR] = args;
1760         } else {
1761                 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1762                 mi->mi_async_tail[NFS_READDIR] = args;
1763         }
1764 
1765         mutex_enter(&rp->r_statelock);
1766         rp->r_count++;
1767         mutex_exit(&rp->r_statelock);
1768 
1769         if (mi->mi_io_kstats) {
1770                 mutex_enter(&mi->mi_lock);
1771                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1772                 mutex_exit(&mi->mi_lock);
1773         }
1774 
1775         mi->mi_async_req_count++;
1776         ASSERT(mi->mi_async_req_count != 0);
1777         cv_signal(&mi->mi_async_reqs_cv);
1778         mutex_exit(&mi->mi_async_lock);
1779         return;
1780 
1781 noasync:
1782         if (args != NULL) {
1783                 VN_RELE(vp);
1784                 crfree(cr);
1785                 kmem_free(args, sizeof (*args));
1786         }
1787 
1788         rdc->entries = NULL;
1789         mutex_enter(&rp->r_statelock);
1790         ASSERT(rdc->flags & RDDIR);
1791         rdc->flags &= ~RDDIR;
1792         rdc->flags |= RDDIRREQ;
1793         /*
1794          * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1795          * is set, wakeup the thread sleeping in cv_wait_sig().
1796          * The woken up thread will reset the flag to RDDIR and will
1797          * continue with the readdir operation.
1798          */
1799         if (rdc->flags & RDDIRWAIT) {
1800                 rdc->flags &= ~RDDIRWAIT;
1801                 cv_broadcast(&rdc->cv);
1802         }
1803         mutex_exit(&rp->r_statelock);
1804         rddir_cache_rele(rdc);
1805 }
1806 
1807 void
1808 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1809     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1810 {
1811         rnode_t *rp;
1812         mntinfo_t *mi;
1813         struct nfs_async_reqs *args;
1814         page_t *pp;
1815 
1816         rp = VTOR(vp);
1817         mi = VTOMI(vp);
1818 
1819         /*
1820          * If we can't allocate a request structure, do the commit
1821          * operation synchronously in this thread's context.
1822          */
1823         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1824                 goto noasync;
1825 
1826         args->a_next = NULL;
1827 #ifdef DEBUG
1828         args->a_queuer = curthread;
1829 #endif
1830         VN_HOLD(vp);
1831         args->a_vp = vp;
1832         ASSERT(cr != NULL);
1833         crhold(cr);
1834         args->a_cred = cr;
1835         args->a_io = NFS_COMMIT;
1836         args->a_nfs_commit = commit;
1837         args->a_nfs_plist = plist;
1838         args->a_nfs_offset = offset;
1839         args->a_nfs_count = count;
1840 
1841         mutex_enter(&mi->mi_async_lock);
1842 
1843         /*
1844          * If asyncio has been disabled, then make a synchronous request.
1845          * This check is done a second time in case async io was disabled
1846          * while this thread was blocked waiting for memory pressure to
1847          * reduce or for the queue to drain.
1848          */
1849         if (mi->mi_max_threads == 0) {
1850                 mutex_exit(&mi->mi_async_lock);
1851                 goto noasync;
1852         }
1853 
1854         /*
1855          * Link request structure into the async list and
1856          * wakeup async thread to do the i/o.
1857          */
1858         if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1859                 mi->mi_async_reqs[NFS_COMMIT] = args;
1860                 mi->mi_async_tail[NFS_COMMIT] = args;
1861         } else {
1862                 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1863                 mi->mi_async_tail[NFS_COMMIT] = args;
1864         }
1865 
1866         mutex_enter(&rp->r_statelock);
1867         rp->r_count++;
1868         mutex_exit(&rp->r_statelock);
1869 
1870         if (mi->mi_io_kstats) {
1871                 mutex_enter(&mi->mi_lock);
1872                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1873                 mutex_exit(&mi->mi_lock);
1874         }
1875 
1876         mi->mi_async_req_count++;
1877         ASSERT(mi->mi_async_req_count != 0);
1878         cv_signal(&mi->mi_async_reqs_cv);
1879         mutex_exit(&mi->mi_async_lock);
1880         return;
1881 
1882 noasync:
1883         if (args != NULL) {
1884                 VN_RELE(vp);
1885                 crfree(cr);
1886                 kmem_free(args, sizeof (*args));
1887         }
1888 
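             /*
              * We can't do the commit from pageout/fsflush context or from
              * a foreign zone.  Mark each page as still needing a commit
              * (C_COMMIT) and unlock it so the commit can be done later.
              */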
1889         if (curproc == proc_pageout || curproc == proc_fsflush ||
1890             nfs_zone() != mi->mi_zone) {
1891                 while (plist != NULL) {
1892                         pp = plist;
1893                         page_sub(&plist, pp);
1894                         pp->p_fsdata = C_COMMIT;
1895                         page_unlock(pp);
1896                 }
1897                 return;
1898         }
1899         (*commit)(vp, plist, offset, count, cr);
1900 }
1901 
1902 void
1903 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1904     void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1905 {
1906         mntinfo_t *mi;
1907         struct nfs_async_reqs *args;
1908 
1909         mi = VTOMI(vp);
1910 
1911         args = kmem_alloc(sizeof (*args), KM_SLEEP);
1912         args->a_next = NULL;
1913 #ifdef DEBUG
1914         args->a_queuer = curthread;
1915 #endif
1916         args->a_vp = vp;
1917         ASSERT(cr != NULL);
1918         crhold(cr);
1919         args->a_cred = cr;
1920         args->a_io = NFS_INACTIVE;
1921         args->a_nfs_inactive = inactive;
1922 
1923         /*
1924          * Note that we don't check mi->mi_max_threads here, since we
1925          * *need* to get rid of this vnode regardless of whether someone
1926          * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1927          *
1928          * The manager thread knows about this and is willing to create
1929          * at least one thread to accommodate us.
1930          */
1931         mutex_enter(&mi->mi_async_lock);
1932         if (mi->mi_manager_thread == NULL) {
1933                 rnode_t *rp = VTOR(vp);
1934 
1935                 mutex_exit(&mi->mi_async_lock);
1936                 crfree(cr);     /* drop our reference */
1937                 kmem_free(args, sizeof (*args));
1938                 /*
1939                  * We can't do an over-the-wire call since we're in the wrong
1940                  * zone, so we need to clean up state as best we can and then
1941                  * throw away the vnode.
1942                  */
1943                 mutex_enter(&rp->r_statelock);
1944                 if (rp->r_unldvp != NULL) {
1945                         vnode_t *unldvp;
1946                         char *unlname;
1947                         cred_t *unlcred;
1948 
1949                         unldvp = rp->r_unldvp;
1950                         rp->r_unldvp = NULL;
1951                         unlname = rp->r_unlname;
1952                         rp->r_unlname = NULL;
1953                         unlcred = rp->r_unlcred;
1954                         rp->r_unlcred = NULL;
1955                         mutex_exit(&rp->r_statelock);
1956 
1957                         VN_RELE(unldvp);
1958                         kmem_free(unlname, MAXNAMELEN);
1959                         crfree(unlcred);
1960                 } else {
1961                         mutex_exit(&rp->r_statelock);
1962                 }
1963                 /*
1964                  * No need to explicitly throw away any cached pages.  The
1965                  * eventual rinactive() will attempt a synchronous
1966                  * VOP_PUTPAGE() which will immediately fail since the request
1967                  * is coming from the wrong zone, and then will proceed to call
1968                  * nfs_invalidate_pages() which will clean things up for us.
1969                  */
1970                 rp_addfree(VTOR(vp), cr);
1971                 return;
1972         }
1973 
1974         if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1975                 mi->mi_async_reqs[NFS_INACTIVE] = args;
1976         } else {
1977                 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1978         }
1979         mi->mi_async_tail[NFS_INACTIVE] = args;
1980         /*
1981          * Don't increment r_count, since we're trying to get rid of the vnode.
1982          */
1983 
1984         mi->mi_async_req_count++;
1985         ASSERT(mi->mi_async_req_count != 0);
1986         cv_signal(&mi->mi_async_reqs_cv);
1987         mutex_exit(&mi->mi_async_lock);
1988 }
1989 
1990 static void
1991 nfs_async_start(struct vfs *vfsp)
1992 {
1993         nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
1994 }
1995 
1996 static void
1997 nfs_async_pgops_start(struct vfs *vfsp)
1998 {
1999         nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2000 }
2001 
2002 /*
2003  * The async queues for each mounted file system are arranged as a
2004  * set of queues, one for each async i/o type.  Requests are taken
2005  * from the queues in a round-robin fashion.  A number of consecutive
2006  * requests are taken from each queue before moving on to the next
2007  * queue.  This functionality may allow the NFS Version 2 server to do
2008  * write clustering, even if the client is mixing writes and reads
2009  * because it will take multiple write requests from the queue
2010  * before processing any of the other async i/o types.
2011  *
2012  * XXX The nfs_async_common_start thread is unsafe in light of the present
2013  * model defined by cpr to suspend the system. Specifically, over-the-wire
2014  * calls are cpr-unsafe. The thread should be reevaluated in
2015  * case of future updates to the cpr model.
2016  */
2017 static void
2018 nfs_async_common_start(struct vfs *vfsp, int async_queue)
2019 {
2020         struct nfs_async_reqs *args;
2021         mntinfo_t *mi = VFTOMI(vfsp);
2022         clock_t time_left = 1;
2023         callb_cpr_t cprinfo;
2024         int i;
2025         int async_types;
2026         kcondvar_t *async_work_cv;
2027 
2028         if (async_queue == NFS_ASYNC_QUEUE) {
2029                 async_types = NFS_ASYNC_TYPES;
2030                 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2031         } else {
2032                 async_types = NFS_ASYNC_PGOPS_TYPES;
2033                 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2034         }
2035 
2036         /*
2037          * Dynamic initialization of nfs_async_timeout to allow nfs to be
2038          * built in an implementation-independent manner.
2039          */
2040         if (nfs_async_timeout == -1)
2041                 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2042 
2043         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2044 
2045         mutex_enter(&mi->mi_async_lock);
2046         for (;;) {
2047                 /*
2048                  * Find the next queue containing an entry.  We start
2049                  * at the current queue pointer and then round robin
2050                  * through all of them until we either find a non-empty
2051                  * queue or have looked through all of them.
2052                  */
2053                 for (i = 0; i < async_types; i++) {
2054                         args = *mi->mi_async_curr[async_queue];
2055                         if (args != NULL)
2056                                 break;
2057                         mi->mi_async_curr[async_queue]++;
2058                         if (mi->mi_async_curr[async_queue] ==
2059                             &mi->mi_async_reqs[async_types]) {
2060                                 mi->mi_async_curr[async_queue] =
2061                                     &mi->mi_async_reqs[0];
2062                         }
2063                 }
2064                 /*
2065                  * If we didn't find an entry, then block until woken up
2066                  * again and then look through the queues again.
2067                  */
2068                 if (args == NULL) {
2069                         /*
2070                          * Exiting is considered to be safe for CPR as well
2071                          */
2072                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2073 
2074                         /*
2075                          * Wakeup thread waiting to unmount the file
2076                          * system only if all async threads are inactive.
2077                          *
2078                          * If we've timed out and there's nothing to do,
2079                          * then get rid of this thread.
2080                          */
2081                         if (mi->mi_max_threads == 0 || time_left <= 0) {
2082                                 --mi->mi_threads[async_queue];
2083 
2084                                 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2085                                     mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2086                                         cv_signal(&mi->mi_async_cv);
2087                                 CALLB_CPR_EXIT(&cprinfo);
2088                                 VFS_RELE(vfsp); /* release thread's hold */
2089                                 zthread_exit();
2090                                 /* NOTREACHED */
2091                         }
2092                         time_left = cv_reltimedwait(async_work_cv,
2093                             &mi->mi_async_lock, nfs_async_timeout,
2094                             TR_CLOCK_TICK);
2095 
2096                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2097 
2098                         continue;
2099                 }
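                 /*
                  * We found a request to process; reset the idle timer so
                  * the timeout check above doesn't fire on the next empty
                  * pass.
                  */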
2100                 time_left = 1;
2101 
2102                 /*
2103                  * Remove the request from the async queue and then
2104                  * update the current async request queue pointer.  If
2105                  * the current queue is empty or we have removed enough
2106                  * consecutive entries from it, then reset the counter
2107                  * for this queue and then move the current pointer to
2108                  * the next queue.
2109                  */
2110                 *mi->mi_async_curr[async_queue] = args->a_next;
2111                 if (*mi->mi_async_curr[async_queue] == NULL ||
2112                     --mi->mi_async_clusters[args->a_io] == 0) {
2113                         mi->mi_async_clusters[args->a_io] =
2114                             mi->mi_async_init_clusters;
2115                         mi->mi_async_curr[async_queue]++;
2116                         if (mi->mi_async_curr[async_queue] ==
2117                             &mi->mi_async_reqs[async_types]) {
2118                                 mi->mi_async_curr[async_queue] =
2119                                     &mi->mi_async_reqs[0];
2120                         }
2121                 }
2122 
2123                 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2124                         mutex_enter(&mi->mi_lock);
2125                         kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2126                         mutex_exit(&mi->mi_lock);
2127                 }
2128 
2129                 mutex_exit(&mi->mi_async_lock);
2130 
2131                 /*
2132                  * Obtain arguments from the async request structure.
2133                  */
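                 /*
                  * Dispatch the request.  A queued read-ahead is simply
                  * dropped if async i/o was disabled (mi_max_threads set
                  * to zero) after it was queued, since read-ahead is only
                  * an optimization.
                  */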
2134                 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2135                         (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2136                             args->a_nfs_addr, args->a_nfs_seg,
2137                             args->a_cred);
2138                 } else if (args->a_io == NFS_PUTAPAGE) {
2139                         (void) (*args->a_nfs_putapage)(args->a_vp,
2140                             args->a_nfs_pp, args->a_nfs_off,
2141                             args->a_nfs_len, args->a_nfs_flags,
2142                             args->a_cred);
2143                 } else if (args->a_io == NFS_PAGEIO) {
2144                         (void) (*args->a_nfs_pageio)(args->a_vp,
2145                             args->a_nfs_pp, args->a_nfs_off,
2146                             args->a_nfs_len, args->a_nfs_flags,
2147                             args->a_cred);
2148                 } else if (args->a_io == NFS_READDIR) {
2149                         (void) ((*args->a_nfs_readdir)(args->a_vp,
2150                             args->a_nfs_rdc, args->a_cred));
2151                 } else if (args->a_io == NFS_COMMIT) {
2152                         (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2153                             args->a_nfs_offset, args->a_nfs_count,
2154                             args->a_cred);
2155                 } else if (args->a_io == NFS_INACTIVE) {
2156                         (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2157                 }
2158 
2159                 /*
2160                  * Now, release the vnode and free the credentials
2161                  * structure.
2162                  */
2163                 free_async_args(args);
2164                 /*
2165                  * Reacquire the mutex because it will be needed above.
2166                  */
2167                 mutex_enter(&mi->mi_async_lock);
2168         }
2169 }
2170 
2171 void
2172 nfs_async_stop(struct vfs *vfsp)
2173 {
2174         mntinfo_t *mi = VFTOMI(vfsp);
2175 
2176         /*
2177          * Wait for all outstanding async operations to complete and for the
2178          * worker threads to exit.
2179          */
2180         mutex_enter(&mi->mi_async_lock);
2181         mi->mi_max_threads = 0;
2182         NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2183         while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2184             mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2185                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2186         mutex_exit(&mi->mi_async_lock);
2187 }
2188 
2189 /*
2190  * nfs_async_stop_sig:
2191  * Wait for all outstanding putpage operations to complete. If a signal
2192  * is delivered we will abort and return non-zero. If we can put all the
2193  * pages we will return 0. This routine is called from nfs_unmount and
2194  * nfs3_unmount to make these operations interruptible.
2195  */
2196 int
2197 nfs_async_stop_sig(struct vfs *vfsp)
2198 {
2199         mntinfo_t *mi = VFTOMI(vfsp);
2200         ushort_t omax;
2201         int rval;
2202 
2203         /*
2204          * Wait for all outstanding async operations to complete and for the
2205          * worker threads to exit.
2206          */
2207         mutex_enter(&mi->mi_async_lock);
2208         omax = mi->mi_max_threads;
2209         mi->mi_max_threads = 0;
2210         /*
2211          * Tell all the worker threads to exit.
2212          */
2213         NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2214         while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2215             mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2216                 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2217                         break;
2218         }
2219         rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2220             mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2221         if (rval)
2222                 mi->mi_max_threads = omax;
2223         mutex_exit(&mi->mi_async_lock);
2224 
2225         return (rval);
2226 }
2227 
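     /*
      * writerp:
      *
      * Copy tcount bytes from the uio structure into the cached pages of
      * the file (through the kernel mapping at base, or via vpm).  Pages
      * that will be entirely overwritten, or that lie at or beyond the
      * current end of file, are created rather than read in, and r_size
      * is grown as data is written past the end of file.
      */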
2228 int
2229 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2230 {
2231         int pagecreate;
2232         int n;
2233         int saved_n;
2234         caddr_t saved_base;
2235         u_offset_t offset;
2236         int error;
2237         int sm_error;
2238         vnode_t *vp = RTOV(rp);
2239 
2240         ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2241         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2242         if (!vpm_enable) {
2243                 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2244         }
2245 
2246         /*
2247          * Move bytes in at most PAGESIZE chunks. We must avoid
2248          * spanning pages in uiomove() because page faults may cause
2249          * the cache to be invalidated out from under us. The r_size is not
2250          * updated until after the uiomove. If we push the last page of a
2251          * file before r_size is correct, we will lose the data written past
2252          * the current (and invalid) r_size.
2253          */
2254         do {
2255                 offset = uio->uio_loffset;
2256                 pagecreate = 0;
2257 
2258                 /*
2259                  * n is the number of bytes required to satisfy the request
2260                  *   or the number of bytes to fill out the page.
2261                  */
2262                 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2263 
2264                 /*
2265                  * Check to see if we can skip reading in the page
2266                  * and just allocate the memory.  We can do this
2267                  * if we are going to rewrite the entire mapping
2268                  * or if we are going to write to or beyond the current
2269                  * end of file from the beginning of the mapping.
2270                  *
2271                  * The read of r_size is now protected by r_statelock.
2272                  */
2273                 mutex_enter(&rp->r_statelock);
2274                 /*
2275                  * When pgcreated is nonzero the caller has already done
2276                  * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2277                  * segkpm this means we already have at least one page
2278                  * created and mapped at base.
2279                  */
2280                 pagecreate = pgcreated ||
2281                     ((offset & PAGEOFFSET) == 0 &&
2282                     (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2283 
2284                 mutex_exit(&rp->r_statelock);
2285                 if (!vpm_enable && pagecreate) {
2286                         /*
2287                          * The last argument tells segmap_pagecreate() to
2288                          * always lock the page, as opposed to sometimes
2289                          * returning with the page locked. This way we avoid a
2290                          * fault on the ensuing uiomove(), but also
2291                          * more importantly (to fix bug 1094402) we can
2292                          * call segmap_fault() to unlock the page in all
2293                          * cases. An alternative would be to modify
2294                          * segmap_pagecreate() to tell us when it is
2295                          * locking a page, but that's a fairly major
2296                          * interface change.
2297                          */
2298                         if (pgcreated == 0)
2299                                 (void) segmap_pagecreate(segkmap, base,
2300                                     (uint_t)n, 1);
2301                         saved_base = base;
2302                         saved_n = n;
2303                 }
2304 
2305                 /*
2306                  * The number of bytes of data in the last page cannot
2307                  * be accurately determined while the page is being
2308                  * uiomove'd to and the size of the file is being updated.
2309                  * Thus, inform threads which need to know accurately
2310                  * how much data is in the last page of the file.  They
2311                  * will not do the i/o immediately, but will arrange for
2312                  * the i/o to happen later when this modify operation
2313                  * has finished.
2314                  */
2315                 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2316                 mutex_enter(&rp->r_statelock);
2317                 rp->r_flags |= RMODINPROGRESS;
2318                 rp->r_modaddr = (offset & MAXBMASK);
2319                 mutex_exit(&rp->r_statelock);
2320 
2321                 if (vpm_enable) {
2322                         /*
2323                          * Copy data. If new pages are created, part of
2324                          * the page that is not written will be initialized
2325                          * with zeros.
2326                          */
2327                         error = vpm_data_copy(vp, offset, n, uio,
2328                             !pagecreate, NULL, 0, S_WRITE);
2329                 } else {
2330                         error = uiomove(base, n, UIO_WRITE, uio);
2331                 }
2332 
2333                 /*
2334                  * r_size is the maximum number of
2335                  * bytes known to be in the file.
2336                  * Make sure it is at least as high as the
2337                  * first unwritten byte pointed to by uio_loffset.
2338                  */
2339                 mutex_enter(&rp->r_statelock);
2340                 if (rp->r_size < uio->uio_loffset)
2341                         rp->r_size = uio->uio_loffset;
2342                 rp->r_flags &= ~RMODINPROGRESS;
2343                 rp->r_flags |= RDIRTY;
2344                 mutex_exit(&rp->r_statelock);
2345 
2346                 /* n = # of bytes written */
2347                 n = (int)(uio->uio_loffset - offset);
2348 
2349                 if (!vpm_enable) {
2350                         base += n;
2351                 }
2352                 tcount -= n;
2353                 /*
2354                  * If we created pages w/o initializing them completely,
2355                  * we need to zero the part that wasn't set up.
2356                  * This happens in most EOF write cases and if
2357                  * we had some sort of error during the uiomove.
2358                  */
2359                 if (!vpm_enable && pagecreate) {
2360                         if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2361                                 (void) kzero(base, PAGESIZE - n);
2362 
2363                         if (pgcreated) {
2364                                 /*
2365                                  * The caller is responsible for this page;
2366                                  * it was not created in this loop.
2367                                  */
2368                                 pgcreated = 0;
2369                         } else {
2370                                 /*
2371                                  * For bug 1094402: segmap_pagecreate locks
2372                                  * page. Unlock it. This also unlocks the
2373                                  * pages allocated by page_create_va() in
2374                                  * segmap_pagecreate().
2375                                  */
2376                                 sm_error = segmap_fault(kas.a_hat, segkmap,
2377                                     saved_base, saved_n,
2378                                     F_SOFTUNLOCK, S_WRITE);
2379                                 if (error == 0)
2380                                         error = sm_error;
2381                         }
2382                 }
2383         } while (tcount > 0 && error == 0);
2384 
2385         return (error);
2386 }
2387 
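     /*
      * nfs_putpages:
      *
      * Flush dirty pages for the given vnode.  If len is zero, all pages
      * from off to the end of the file are flushed; otherwise only the
      * range [off, off + len) is examined.
      */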
2388 int
2389 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2390 {
2391         rnode_t *rp;
2392         page_t *pp;
2393         u_offset_t eoff;
2394         u_offset_t io_off;
2395         size_t io_len;
2396         int error;
2397         int rdirty = 0;
2398         int err;
2399 
2400         rp = VTOR(vp);
2401         ASSERT(rp->r_count > 0);
2402 
2403         if (!vn_has_cached_data(vp))
2404                 return (0);
2405 
2406         ASSERT(vp->v_type != VCHR);
2407 
2408         /*
2409          * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2410          * writes.  B_FORCE is set to force the VM system to actually
2411          * invalidate the pages, even if the i/o failed.  The pages
2412          * need to get invalidated because they can't be written out
2413          * because there isn't any space left on either the server's
2414          * file system or in the user's disk quota.  The B_FREE bit
2415          * is cleared to avoid confusion as to whether this is a
2416          * request to place the page on the freelist or to destroy
2417          * it.
2418          */
2419         if ((rp->r_flags & ROUTOFSPACE) ||
2420             (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2421                 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2422 
2423         if (len == 0) {
2424                 /*
2425                  * If doing a full file synchronous operation, then clear
2426                  * the RDIRTY bit.  If a page gets dirtied while the flush
2427                  * is happening, then RDIRTY will get set again.  The
2428                  * RDIRTY bit must get cleared before the flush so that
2429                  * we don't lose this information.
2430                  *
2431                  * If there are no full file async write operations
2432                  * pending and RDIRTY bit is set, clear it.
2433                  */
2434                 if (off == (u_offset_t)0 &&
2435                     !(flags & B_ASYNC) &&
2436                     (rp->r_flags & RDIRTY)) {
2437                         mutex_enter(&rp->r_statelock);
2438                         rdirty = (rp->r_flags & RDIRTY);
2439                         rp->r_flags &= ~RDIRTY;
2440                         mutex_exit(&rp->r_statelock);
2441                 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2442                         mutex_enter(&rp->r_statelock);
2443                         if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2444                                 rdirty = (rp->r_flags & RDIRTY);
2445                                 rp->r_flags &= ~RDIRTY;
2446                         }
2447                         mutex_exit(&rp->r_statelock);
2448                 } else
2449                         rdirty = 0;
2450 
2451                 /*
2452                  * Search the entire vp list for pages >= off, and flush
2453                  * the dirty pages.
2454                  */
2455                 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2456                     flags, cr);
2457 
2458                 /*
2459                  * If an error occurred and the file was marked as dirty
2460                  * before and we aren't forcibly invalidating pages, then
2461                  * reset the RDIRTY flag.
2462                  */
2463                 if (error && rdirty &&
2464                     (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2465                         mutex_enter(&rp->r_statelock);
2466                         rp->r_flags |= RDIRTY;
2467                         mutex_exit(&rp->r_statelock);
2468                 }
2469         } else {
2470                 /*
2471                  * Do a range from [off...off + len) looking for pages
2472                  * to deal with.
2473                  */
2474                 error = 0;
2475 #ifdef lint
2476                 io_len = 0;
2477 #endif
2478                 eoff = off + len;
2479                 mutex_enter(&rp->r_statelock);
2480                 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2481                     io_off += io_len) {
2482                         mutex_exit(&rp->r_statelock);
2483                         /*
2484                          * If we are not invalidating, synchronously
2485                          * freeing, or writing pages, use the routine
2486                          * page_lookup_nowait() to prevent reclaiming
2487                          * them from the free list.
2488                          */
2489                         if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2490                                 pp = page_lookup(vp, io_off,
2491                                     (flags & (B_INVAL | B_FREE)) ?
2492                                     SE_EXCL : SE_SHARED);
2493                         } else {
2494                                 pp = page_lookup_nowait(vp, io_off,
2495                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2496                         }
2497 
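                             /*
                              * If there is no page at this offset, or the
                              * page isn't dirty, just advance by one page.
                              */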
2498                         if (pp == NULL || !pvn_getdirty(pp, flags))
2499                                 io_len = PAGESIZE;
2500                         else {
2501                                 err = (*rp->r_putapage)(vp, pp, &io_off,
2502                                     &io_len, flags, cr);
2503                                 if (!error)
2504                                         error = err;
2505                                 /*
2506                                  * "io_off" and "io_len" are returned as
2507                                  * the range of pages we actually wrote.
2508                                  * This allows us to skip ahead more quickly
2509                                  * since several pages may've been dealt
2510                                  * with by this iteration of the loop.
2511                                  */
2512                         }
2513                         mutex_enter(&rp->r_statelock);
2514                 }
2515                 mutex_exit(&rp->r_statelock);
2516         }
2517 
2518         return (error);
2519 }
2520 
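     /*
      * nfs_invalidate_pages:
      *
      * Invalidate the cached pages for the given vnode from offset off to
      * the end of the file, serializing with any truncation already in
      * progress on the rnode.
      */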
2521 void
2522 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2523 {
2524         rnode_t *rp;
2525 
2526         rp = VTOR(vp);
2527         mutex_enter(&rp->r_statelock);
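             /*
              * Only one truncation may be in progress at a time; wait for
              * any thread that already has RTRUNCATE set to finish before
              * setting it ourselves.
              */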
2528         while (rp->r_flags & RTRUNCATE)
2529                 cv_wait(&rp->r_cv, &rp->r_statelock);
2530         rp->r_flags |= RTRUNCATE;
2531         if (off == (u_offset_t)0) {
2532                 rp->r_flags &= ~RDIRTY;
2533                 if (!(rp->r_flags & RSTALE))
2534                         rp->r_error = 0;
2535         }
2536         rp->r_truncaddr = off;
2537         mutex_exit(&rp->r_statelock);
2538         (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2539             B_INVAL | B_TRUNC, cr);
2540         mutex_enter(&rp->r_statelock);
2541         rp->r_flags &= ~RTRUNCATE;
2542         cv_broadcast(&rp->r_cv);
2543         mutex_exit(&rp->r_statelock);
2544 }
2545 
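     /*
      * The error messages below are written with a leading '^', which
      * cmn_err() treats as "console only".  MSG() either keeps the '^'
      * (so the message goes only to the console) or skips past it (so
      * the message also reaches the system log), depending on
      * nfs_write_error_to_cons_only.
      */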
2546 static int nfs_write_error_to_cons_only = 0;
2547 #define MSG(x)  (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2548 
2549 /*
2550  * Print a file handle
2551  */
2552 void
2553 nfs_printfhandle(nfs_fhandle *fhp)
2554 {
2555         int *ip;
2556         char *buf;
2557         size_t bufsize;
2558         char *cp;
2559 
2560         /*
2561          * 13 == "(file handle:"
2562          * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2563          *      1 == ' '
2564          *      8 == maximum strlen of "%x"
2565          * 3 == ")\n\0"
2566          */
2567         bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2568         buf = kmem_alloc(bufsize, KM_NOSLEEP);
2569         if (buf == NULL)
2570                 return;
2571 
2572         cp = buf;
2573         (void) strcpy(cp, "(file handle:");
2574         while (*cp != '\0')
2575                 cp++;
2576         for (ip = (int *)fhp->fh_buf;
2577             ip < (int *)&fhp->fh_buf[fhp->fh_len];
2578             ip++) {
2579                 (void) sprintf(cp, " %x", *ip);
2580                 while (*cp != '\0')
2581                         cp++;
2582         }
2583         (void) strcpy(cp, ")\n");
2584 
2585         zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2586 
2587         kmem_free(buf, bufsize);
2588 }
2589 
2590 /*
2591  * Notify the system administrator that an NFS write error has
2592  * occurred.
2593  */
2594 
2595 /* seconds between ENOSPC/EDQUOT messages */
2596 clock_t nfs_write_error_interval = 5;
2597 
2598 void
2599 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2600 {
2601         mntinfo_t *mi;
2602         clock_t now;
2603 
2604         mi = VTOMI(vp);
2605         /*
2606          * In case of forced unmount or zone shutdown, do not print any
2607          * messages since it can flood the console with error messages.
2608          */
2609         if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2610                 return;
2611 
2612         /*
2613          * No use in flooding the console with ENOSPC
2614          * messages from the same file system.
2615          */
2616         now = ddi_get_lbolt();
2617         if ((error != ENOSPC && error != EDQUOT) ||
2618             now - mi->mi_printftime > 0) {
2619                 zoneid_t zoneid = mi->mi_zone->zone_id;
2620 
2621 #ifdef DEBUG
2622                 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2623                     mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2624 #else
2625                 nfs_perror(error, "NFS write error on host %s: %m.\n",
2626                     VTOR(vp)->r_server->sv_hostname, NULL);
2627 #endif
2628                 if (error == ENOSPC || error == EDQUOT) {
2629                         zcmn_err(zoneid, CE_CONT,
2630                             MSG("^File: userid=%d, groupid=%d\n"),
2631                             crgetuid(cr), crgetgid(cr));
2632                         if (crgetuid(CRED()) != crgetuid(cr) ||
2633                             crgetgid(CRED()) != crgetgid(cr)) {
2634                                 zcmn_err(zoneid, CE_CONT,
2635                                     MSG("^User: userid=%d, groupid=%d\n"),
2636                                     crgetuid(CRED()), crgetgid(CRED()));
2637                         }
2638                         mi->mi_printftime = now +
2639                             nfs_write_error_interval * hz;
2640                 }
2641                 nfs_printfhandle(&VTOR(vp)->r_fh);
2642 #ifdef DEBUG
2643                 if (error == EACCES) {
2644                         zcmn_err(zoneid, CE_CONT,
2645                             MSG("^nfs_bio: cred is%s kcred\n"),
2646                             cr == kcred ? "" : " not");
2647                 }
2648 #endif
2649         }
2650 }
2651 
2652 /* ARGSUSED */
2653 static void *
2654 nfs_mi_init(zoneid_t zoneid)
2655 {
2656         struct mi_globals *mig;
2657 
2658         mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2659         mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2660         list_create(&mig->mig_list, sizeof (mntinfo_t),
2661             offsetof(mntinfo_t, mi_zone_node));
2662         mig->mig_destructor_called = B_FALSE;
2663         return (mig);
2664 }
2665 
2666 /*
2667  * Callback routine to tell all NFS mounts in the zone to stop creating new
2668  * threads.  Existing threads should exit.
2669  */
2670 /* ARGSUSED */
2671 static void
2672 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2673 {
2674         struct mi_globals *mig = data;
2675         mntinfo_t *mi;
2676 
2677         ASSERT(mig != NULL);
2678 again:
2679         mutex_enter(&mig->mig_lock);
2680         for (mi = list_head(&mig->mig_list); mi != NULL;
2681             mi = list_next(&mig->mig_list, mi)) {
2682 
2683                 /*
2684                  * If we've done the shutdown work for this FS, skip.
2685                  * Once we go off the end of the list, we're done.
2686                  */
2687                 if (mi->mi_flags & MI_DEAD)
2688                         continue;
2689 
2690                 /*
2691                  * We will do work, so not done.  Get a hold on the FS.
2692                  */
2693                 VFS_HOLD(mi->mi_vfsp);
2694 
2695                 /*
2696                  * purge the DNLC for this filesystem
2697                  */
2698                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2699 
2700                 mutex_enter(&mi->mi_async_lock);
2701                 /*
2702                  * Tell existing async worker threads to exit.
2703                  */
2704                 mi->mi_max_threads = 0;
2705                 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2706                 /*
2707                  * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2708                  * getting ready to exit when it's done with its current work.
2709                  * Also set MI_DEAD to note we've acted on this FS.
2710                  */
2711                 mutex_enter(&mi->mi_lock);
2712                 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2713                 mutex_exit(&mi->mi_lock);
2714                 /*
2715                  * Wake up the async manager thread.
2716                  */
2717                 cv_broadcast(&mi->mi_async_reqs_cv);
2718                 mutex_exit(&mi->mi_async_lock);
2719 
2720                 /*
2721                  * Drop lock and release FS, which may change list, then repeat.
2722                  * We're done when every mi has been done or the list is empty.
2723                  */
2724                 mutex_exit(&mig->mig_lock);
2725                 VFS_RELE(mi->mi_vfsp);
2726                 goto again;
2727         }
2728         mutex_exit(&mig->mig_lock);
2729 }
2730 
2731 static void
2732 nfs_mi_free_globals(struct mi_globals *mig)
2733 {
2734         list_destroy(&mig->mig_list);    /* makes sure the list is empty */
2735         mutex_destroy(&mig->mig_lock);
2736         kmem_free(mig, sizeof (*mig));
2737 
2738 }
2739 
2740 /* ARGSUSED */
2741 static void
2742 nfs_mi_destroy(zoneid_t zoneid, void *data)
2743 {
2744         struct mi_globals *mig = data;
2745 
2746         ASSERT(mig != NULL);
2747         mutex_enter(&mig->mig_lock);
2748         if (list_head(&mig->mig_list) != NULL) {
2749                 /* Still waiting for VFS_FREEVFS() */
2750                 mig->mig_destructor_called = B_TRUE;
2751                 mutex_exit(&mig->mig_lock);
2752                 return;
2753         }
2754         nfs_mi_free_globals(mig);
2755 }
2756 
2757 /*
2758  * Add an NFS mount to the per-zone list of NFS mounts.
2759  */
2760 void
2761 nfs_mi_zonelist_add(mntinfo_t *mi)
2762 {
2763         struct mi_globals *mig;
2764 
2765         mig = zone_getspecific(mi_list_key, mi->mi_zone);
2766         mutex_enter(&mig->mig_lock);
2767         list_insert_head(&mig->mig_list, mi);
2768         mutex_exit(&mig->mig_lock);
2769 }
2770 
2771 /*
2772  * Remove an NFS mount from the per-zone list of NFS mounts.
2773  */
2774 static void
2775 nfs_mi_zonelist_remove(mntinfo_t *mi)
2776 {
2777         struct mi_globals *mig;
2778 
2779         mig = zone_getspecific(mi_list_key, mi->mi_zone);
2780         mutex_enter(&mig->mig_lock);
2781         list_remove(&mig->mig_list, mi);
2782         /*
2783          * We can be called asynchronously by VFS_FREEVFS() after the zone
2784          * shutdown/destroy callbacks have executed; if so, clean up the zone's
2785          * mi globals.
2786          */
2787         if (list_head(&mig->mig_list) == NULL &&
2788             mig->mig_destructor_called == B_TRUE) {
2789                 nfs_mi_free_globals(mig);
2790                 return;
2791         }
2792         mutex_exit(&mig->mig_lock);
2793 }
2794 
2795 /*
2796  * NFS Client initialization routine.  This routine should only be called
2797  * once.  It performs the following tasks:
2798  *      - Initialize all global locks
2799  *      - Call sub-initialization routines (localize access to variables)
2800  */
2801 int
2802 nfs_clntinit(void)
2803 {
2804 #ifdef DEBUG
2805         static boolean_t nfs_clntup = B_FALSE;
2806 #endif
2807         int error;
2808 
2809 #ifdef DEBUG
2810         ASSERT(nfs_clntup == B_FALSE);
2811 #endif
2812 
2813         error = nfs_subrinit();
2814         if (error)
2815                 return (error);
2816 
2817         error = nfs_vfsinit();
2818         if (error) {
2819                 /*
2820                  * Cleanup nfs_subrinit() work
2821                  */
2822                 nfs_subrfini();
2823                 return (error);
2824         }
2825         zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2826             nfs_mi_destroy);
2827 
2828         nfs4_clnt_init();
2829 
2830 #ifdef DEBUG
2831         nfs_clntup = B_TRUE;
2832 #endif
2833 
2834         return (0);
2835 }
2836 
2837 /*
2838  * This routine is only called if the NFS Client has been initialized but
2839  * the module failed to be installed. This routine will clean up the
2840  * previously allocated/initialized work.
2841  */
2842 void
2843 nfs_clntfini(void)
2844 {
2845         (void) zone_key_delete(mi_list_key);
2846         nfs_subrfini();
2847         nfs_vfsfini();
2848         nfs4_clnt_fini();
2849 }
2850 
2851 /*
2852  * nfs_lockrelease:
2853  *
2854  * Release any locks on the given vnode that are held by the current
2855  * process.
2856  */
2857 void
2858 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2859 {
2860         flock64_t ld;
2861         struct shrlock shr;
2862         char *buf;
2863         int remote_lock_possible;
2864         int ret;
2865 
2866         ASSERT((uintptr_t)vp > KERNELBASE);
2867 
2868         /*
2869          * Generate an explicit unlock operation for the entire file.  As a
2870          * partial optimization, only generate the unlock if there is a
2871          * lock registered for the file.  We could check whether this
2872          * particular process has any locks on the file, but that would
2873          * require the local locking code to provide yet another query
2874          * routine.  Note that no explicit synchronization is needed here.
2875          * At worst, flk_has_remote_locks() will return a false positive,
2876          * in which case the unlock call wastes time but doesn't harm
2877          * correctness.
2878          *
2879          * In addition, an unlock request is generated if the process
2880          * is listed as possibly having a lock on the file because the
2881          * server and client lock managers may have gotten out of sync.
2882          * N.B. It is important to make sure nfs_remove_locking_id() is
2883          * called here even if flk_has_remote_locks(vp) reports true.
2884          * If it is not called and there is an entry on the process id
2885          * list, that entry will never get removed.
2886          */
2887         remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2888             (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2889         if (remote_lock_possible || flk_has_remote_locks(vp)) {
2890                 ld.l_type = F_UNLCK;    /* set to unlock entire file */
2891                 ld.l_whence = 0;        /* unlock from start of file */
2892                 ld.l_start = 0;
2893                 ld.l_len = 0;           /* do entire file */
2894                 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2895                     NULL);
2896 
2897                 if (ret != 0) {
2898                         /*
2899                          * If VOP_FRLOCK fails, make sure we unregister
2900                          * local locks before we continue.
2901                          */
2902                         ld.l_pid = ttoproc(curthread)->p_pid;
2903                         lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2904 #ifdef DEBUG
2905                         nfs_perror(ret,
2906                             "NFS lock release error on vp %p: %m.\n",
2907                             (void *)vp, NULL);
2908 #endif
2909                 }
2910 
2911                 /*
2912                  * The call to VOP_FRLOCK may put the pid back on the
2913                  * list.  We need to remove it.
2914                  */
2915                 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2916                     (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2917         }
2918 
2919         /*
2920          * As long as the vp has a share matching our pid,
2921          * pluck it off and unshare it.  There are circumstances in
2922          * which the call to nfs_remove_locking_id() may put the
2923          * owner back on the list, in which case we simply do a
2924          * redundant and harmless unshare.
2925          */
2926         buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2927         while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2928             (char *)NULL, buf, &shr.s_own_len)) {
2929                 shr.s_owner = buf;
2930                 shr.s_access = 0;
2931                 shr.s_deny = 0;
2932                 shr.s_sysid = 0;
2933                 shr.s_pid = curproc->p_pid;
2934 
2935                 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2936 #ifdef DEBUG
2937                 if (ret != 0) {
2938                         nfs_perror(ret,
2939                             "NFS share release error on vp %p: %m.\n",
2940                             (void *)vp, NULL);
2941                 }
2942 #endif
2943         }
2944         kmem_free(buf, MAX_SHR_OWNER_LEN);
2945 }
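
/*
 * Illustrative sketch (not compiled): a VOP_CLOSE implementation might
 * choose between local and network lock cleanup roughly as follows,
 * releasing this process's locks and shares either locally or via
 * nfs_lockrelease().  This is only a sketch of a caller, not the actual
 * close path.
 */
#if 0
        if (VTOMI(vp)->mi_flags & MI_LLOCK) {
                pid_t pid = ttoproc(curthread)->p_pid;

                cleanlocks(vp, pid, 0);         /* local record locks */
                cleanshares(vp, pid);           /* local share reservations */
        } else {
                nfs_lockrelease(vp, flag, offset, cr);
        }
#endif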
2946 
2947 /*
2948  * nfs_lockcompletion:
2949  *
2950  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2951  * as non-cacheable (set the VNOCACHE bit).
2952  */
2953 
2954 void
2955 nfs_lockcompletion(vnode_t *vp, int cmd)
2956 {
2957 #ifdef DEBUG
2958         rnode_t *rp = VTOR(vp);
2959 
2960         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2961 #endif
2962 
2963         if (cmd == F_SETLK || cmd == F_SETLKW) {
2964                 if (!lm_safemap(vp)) {
2965                         mutex_enter(&vp->v_lock);
2966                         vp->v_flag |= VNOCACHE;
2967                         mutex_exit(&vp->v_lock);
2968                 } else {
2969                         mutex_enter(&vp->v_lock);
2970                         vp->v_flag &= ~VNOCACHE;
2971                         mutex_exit(&vp->v_lock);
2972                 }
2973         }
2974         /*
2975          * The cached attributes of the file may be stale after acquiring
2976          * the lock on the file: they were updated when the file was
2977          * opened, but not when the lock was acquired.  Therefore the
2978          * cached attributes are invalidated once the lock is obtained.
2979          */
2980         PURGE_ATTRCACHE(vp);
2981 }
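
/*
 * Illustrative sketch (not compiled): a caller in the file locking path
 * is expected to hold r_lkserlock as a writer across the lock request
 * and then call nfs_lockcompletion(), roughly as follows.  The names
 * "cmd", "bfp", "flag", "offset" and "cr" stand in for the caller's
 * arguments.
 */
#if 0
        rnode_t *rp = VTOR(vp);

        if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, INTR(vp)))
                return (EINTR);
        error = VOP_FRLOCK(vp, cmd, bfp, flag, offset, NULL, cr, NULL);
        if (error == 0)
                nfs_lockcompletion(vp, cmd);    /* VNOCACHE + attr purge */
        nfs_rw_exit(&rp->r_lkserlock);
#endif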
2982 
2983 /*
2984  * The lock manager holds state making it possible for the client
2985  * and server to be out of sync.  For example, if the response from
2986  * the server granting a lock request is lost, the server will think
2987  * the lock is granted and the client will think the lock is lost.
2988  * The client can tell when it is not certain whether it is in sync
2989  * with the server.
2990  *
2991  * To deal with this, a list of processes for which the client is
2992  * not sure if the server holds a lock is attached to the rnode.
2993  * When such a process closes the rnode, an unlock request is sent
2994  * to the server to unlock the entire file.
2995  *
2996  * The list is kept as a singly linked, NULL-terminated list.
2997  * Because it is only added to under extreme error conditions, the
2998  * list shouldn't get very big.  DEBUG kernels print a message if
2999  * the list gets bigger than nfs_lmpl_high_water.  That threshold is
3000  * arbitrary (currently 128) and can be tuned at runtime.
3001  */
3002 #ifdef DEBUG
3003 /* int nfs_lmpl_high_water = 8; */
3004 int nfs_lmpl_high_water = 128;
3005 int nfs_cnt_add_locking_id = 0;
3006 int nfs_len_add_locking_id = 0;
3007 #endif /* DEBUG */
3008 
3009 /*
3010  * Record that the nfs lock manager server may be holding a lock on
3011  * a vnode for a process.
3012  *
3013  * Because the nfs lock manager server holds state, it is possible
3014  * for the server to get out of sync with the client.  This routine is called
3015  * from the client when it is no longer sure whether the server is in
3016  * sync with the client.  nfs_lockrelease() will then notice this and
3017  * send an unlock request when the file is closed.
3018  */
3019 void
3020 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3021 {
3022         rnode_t *rp;
3023         lmpl_t *new;
3024         lmpl_t *cur;
3025         lmpl_t **lmplp;
3026 #ifdef DEBUG
3027         int list_len = 1;
3028 #endif /* DEBUG */
3029 
3030 #ifdef DEBUG
3031         ++nfs_cnt_add_locking_id;
3032 #endif /* DEBUG */
3033         /*
3034          * Allocate the new lmpl_t now so that we don't sleep
3035          * later, after grabbing mutexes.
3036          */
3037         ASSERT(len < MAX_SHR_OWNER_LEN);
3038         new = kmem_alloc(sizeof (*new), KM_SLEEP);
3039         new->lmpl_type = type;
3040         new->lmpl_pid = pid;
3041         new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3042         bcopy(id, new->lmpl_owner, len);
3043         new->lmpl_own_len = len;
3044         new->lmpl_next = (lmpl_t *)NULL;
3045 #ifdef DEBUG
3046         if (type == RLMPL_PID) {
3047                 ASSERT(len == sizeof (pid_t));
3048                 ASSERT(pid == *(pid_t *)new->lmpl_owner);
3049         } else {
3050                 ASSERT(type == RLMPL_OWNER);
3051         }
3052 #endif
3053 
3054         rp = VTOR(vp);
3055         mutex_enter(&rp->r_statelock);
3056 
3057         /*
3058          * Add this id to the list for this rnode only if the
3059          * rnode is active and the id is not already there.
3060          */
3061         ASSERT(rp->r_flags & RHASHED);
3062         lmplp = &(rp->r_lmpl);
3063         for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3064                 if (cur->lmpl_pid == pid &&
3065                     cur->lmpl_type == type &&
3066                     cur->lmpl_own_len == len &&
3067                     bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3068                         kmem_free(new->lmpl_owner, len);
3069                         kmem_free(new, sizeof (*new));
3070                         break;
3071                 }
3072                 lmplp = &cur->lmpl_next;
3073 #ifdef DEBUG
3074                 ++list_len;
3075 #endif /* DEBUG */
3076         }
3077         if (cur == (lmpl_t *)NULL) {
3078                 *lmplp = new;
3079 #ifdef DEBUG
3080                 if (list_len > nfs_len_add_locking_id) {
3081                         nfs_len_add_locking_id = list_len;
3082                 }
3083                 if (list_len > nfs_lmpl_high_water) {
3084                         cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3085                             "vp=%p is %d", (void *)vp, list_len);
3086                 }
3087 #endif /* DEBUG */
3088         }
3089 
3090 #ifdef DEBUG
3091         if (share_debug) {
3092                 int nitems = 0;
3093                 int npids = 0;
3094                 int nowners = 0;
3095 
3096                 /*
3097                  * Count the number of entries on r_lmpl after the add.
3098                  */
3099                 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3100                     cur = cur->lmpl_next) {
3101                         nitems++;
3102                         if (cur->lmpl_type == RLMPL_PID) {
3103                                 npids++;
3104                         } else if (cur->lmpl_type == RLMPL_OWNER) {
3105                                 nowners++;
3106                         } else {
3107                                 cmn_err(CE_PANIC, "nfs_add_locking_id: "
3108                                     "unrecognized lmpl_type %d",
3109                                     cur->lmpl_type);
3110                         }
3111                 }
3112 
3113                 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3114                     "OWNs = %d items left on r_lmpl\n",
3115                     (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3116         }
3117 #endif
3118 
3119         mutex_exit(&rp->r_statelock);
3120 }
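
/*
 * Illustrative sketch (not compiled): a caller that can no longer tell
 * whether the server acted on a lock request would record the process
 * roughly as follows, so that nfs_lockrelease() sends a whole-file
 * unlock at close time.
 */
#if 0
        pid_t pid = ttoproc(curthread)->p_pid;

        nfs_add_locking_id(vp, pid, RLMPL_PID, (char *)&pid, sizeof (pid));
#endif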
3121 
3122 /*
3123  * Remove an id from the lock manager id list.
3124  *
3125  * If the id is not in the list return 0.  If it was found and
3126  * removed, return 1.
3127  */
3128 static int
3129 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3130 {
3131         lmpl_t *cur;
3132         lmpl_t **lmplp;
3133         rnode_t *rp;
3134         int rv = 0;
3135 
3136         ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3137 
3138         rp = VTOR(vp);
3139 
3140         mutex_enter(&rp->r_statelock);
3141         ASSERT(rp->r_flags & RHASHED);
3142         lmplp = &(rp->r_lmpl);
3143 
3144         /*
3145          * Search through the list and remove the entry for this id
3146          * if it is there.  The special case id == NULL allows removal
3147          * of the first share on the r_lmpl list belonging to the
3148          * current process (if any), without regard to further details
3149          * of its identity.
3150          */
3151         for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3152                 if (cur->lmpl_type == type &&
3153                     cur->lmpl_pid == curproc->p_pid &&
3154                     (id == (char *)NULL ||
3155                     bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3156                         *lmplp = cur->lmpl_next;
3157                         ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3158                         if (rid != NULL) {
3159                                 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3160                                 *rlen = cur->lmpl_own_len;
3161                         }
3162                         kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3163                         kmem_free(cur, sizeof (*cur));
3164                         rv = 1;
3165                         break;
3166                 }
3167                 lmplp = &cur->lmpl_next;
3168         }
3169 
3170 #ifdef DEBUG
3171         if (share_debug) {
3172                 int nitems = 0;
3173                 int npids = 0;
3174                 int nowners = 0;
3175 
3176                 /*
3177                  * Count the number of things left on r_lmpl after the remove.
3178                  */
3179                 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3180                     cur = cur->lmpl_next) {
3181                         nitems++;
3182                         if (cur->lmpl_type == RLMPL_PID) {
3183                                 npids++;
3184                         } else if (cur->lmpl_type == RLMPL_OWNER) {
3185                                 nowners++;
3186                         } else {
3187                                 cmn_err(CE_PANIC,
3188                                     "nrli: unrecognized lmpl_type %d",
3189                                     cur->lmpl_type);
3190                         }
3191                 }
3192 
3193                 cmn_err(CE_CONT,
3194                 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3195                     (type == RLMPL_PID) ? "P" : "O",
3196                     npids,
3197                     nowners,
3198                     nitems);
3199         }
3200 #endif
3201 
3202         mutex_exit(&rp->r_statelock);
3203         return (rv);
3204 }
3205 
3206 void
3207 nfs_free_mi(mntinfo_t *mi)
3208 {
3209         ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3210         ASSERT(mi->mi_manager_thread == NULL);
3211         ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3212             mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3213 
3214         /*
3215          * Remove the node from the global list before we start tearing it down.
3216          */
3217         nfs_mi_zonelist_remove(mi);
3218         if (mi->mi_klmconfig) {
3219                 lm_free_config(mi->mi_klmconfig);
3220                 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3221         }
3222         mutex_destroy(&mi->mi_lock);
3223         mutex_destroy(&mi->mi_remap_lock);
3224         mutex_destroy(&mi->mi_async_lock);
3225         mutex_destroy(&mi->mi_rnodes_lock);
3226         cv_destroy(&mi->mi_failover_cv);
3227         cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3228         cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3229         cv_destroy(&mi->mi_async_reqs_cv);
3230         cv_destroy(&mi->mi_async_cv);
3231         list_destroy(&mi->mi_rnodes);
3232         zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3233         kmem_free(mi, sizeof (*mi));
3234 }
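
/*
 * Illustrative sketch (not compiled): the ASSERTs at the top of
 * nfs_free_mi() encode the expectation that the async manager thread
 * and its workers have already been shut down.  The shutdown side does
 * roughly the following before the final reference on the mntinfo goes
 * away; this is a sketch of the precondition, not the actual shutdown
 * code.
 */
#if 0
        mutex_enter(&mi->mi_async_lock);
        mutex_enter(&mi->mi_lock);
        mi->mi_flags |= MI_ASYNC_MGR_STOP;      /* checked by ASSERT above */
        mutex_exit(&mi->mi_lock);
        cv_broadcast(&mi->mi_async_reqs_cv);    /* wake the manager thread */
        while (mi->mi_manager_thread != NULL)
                cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
        mutex_exit(&mi->mi_async_lock);
#endif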
3235 
3236 static int
3237 mnt_kstat_update(kstat_t *ksp, int rw)
3238 {
3239         mntinfo_t *mi;
3240         struct mntinfo_kstat *mik;
3241         vfs_t *vfsp;
3242         int i;
3243 
3244         /* This is a read-only kstat; bail out on a write. */
3245         if (rw == KSTAT_WRITE)
3246                 return (EACCES);
3247 
3248         /*
3249          * We don't want to wait here as kstat_chain_lock could be held by
3250          * dounmount(). dounmount() takes vfs_reflock before the chain lock
3251          * and thus could lead to a deadlock.
3252          */
3253         vfsp = (struct vfs *)ksp->ks_private;
3254
3256         mi = VFTOMI(vfsp);
3257 
3258         mik = (struct mntinfo_kstat *)ksp->ks_data;
3259 
3260         (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3261         mik->mik_vers = (uint32_t)mi->mi_vers;
3262         mik->mik_flags = mi->mi_flags;
3263         mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3264         mik->mik_curread = (uint32_t)mi->mi_curread;
3265         mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3266         mik->mik_retrans = mi->mi_retrans;
3267         mik->mik_timeo = mi->mi_timeo;
3268         mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3269         mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3270         mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3271         mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3272         for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3273                 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3274                 mik->mik_timers[i].deviate =
3275                     (uint32_t)mi->mi_timers[i].rt_deviate;
3276                 mik->mik_timers[i].rtxcur =
3277                     (uint32_t)mi->mi_timers[i].rt_rtxcur;
3278         }
3279         mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3280         mik->mik_failover = (uint32_t)mi->mi_failover;
3281         mik->mik_remap = (uint32_t)mi->mi_remap;
3282         (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3283 
3284         return (0);
3285 }
3286 
3287 void
3288 nfs_mnt_kstat_init(struct vfs *vfsp)
3289 {
3290         mntinfo_t *mi = VFTOMI(vfsp);
3291 
3292         /*
3293          * Create the version specific kstats.
3294          *
3295          * PSARC 2001/697 Contract Private Interface
3296          * All nfs kstats are under SunMC contract
3297          * Please refer to the PSARC listed above and contact
3298          * SunMC before making any changes!
3299          *
3300          * Changes must be reviewed by Solaris File Sharing
3301          * Changes must be communicated to contract-2001-697@sun.com
3302          *
3303          */
3304 
3305         mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3306             NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3307         if (mi->mi_io_kstats) {
3308                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3309                         kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3310                 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3311                 kstat_install(mi->mi_io_kstats);
3312         }
3313 
3314         if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3315             getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3316             sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3317                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3318                         kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3319                 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3320                 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3321                 kstat_install(mi->mi_ro_kstats);
3322         }
3323 }
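
/*
 * Illustrative sketch (not compiled, user-level): the raw "mntinfo"
 * kstat created above can be read with libkstat roughly as follows.
 * "instance" stands for the instance number of the mount of interest
 * (derived from getminor(vfsp->vfs_dev) above).
 */
#if 0
        kstat_ctl_t *kc = kstat_open();
        kstat_t *ksp = kstat_lookup(kc, "nfs", instance, "mntinfo");

        if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
                struct mntinfo_kstat *mik = ksp->ks_data;

                (void) printf("%s vers=%u proto=%s\n",
                    mik->mik_curserver, mik->mik_vers, mik->mik_proto);
        }
        (void) kstat_close(kc);
#endif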
3324 
3325 nfs_delmapcall_t *
3326 nfs_init_delmapcall(void)
3327 {
3328         nfs_delmapcall_t        *delmap_call;
3329 
3330         delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3331         delmap_call->call_id = curthread;
3332         delmap_call->error = 0;
3333 
3334         return (delmap_call);
3335 }
3336 
3337 void
3338 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3339 {
3340         kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3341 }
3342 
3343 /*
3344  * Searches for the current delmap caller (based on curthread) in the list of
3345  * callers.  If it is found, we remove it and free the delmap caller.
3346  * Returns:
3347  *      0 if the caller wasn't found
3348  *      1 if the caller was found, removed, and freed.  *errp is set to
3349  *      the result of the delmap operation.
3350  */
3351 int
3352 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3353 {
3354         nfs_delmapcall_t        *delmap_call;
3355 
3356         /*
3357          * If the list doesn't exist yet, we create it and return
3358          * that the caller wasn't found.  No list = no callers.
3359          */
3360         mutex_enter(&rp->r_statelock);
3361         if (!(rp->r_flags & RDELMAPLIST)) {
3362                 /* The list does not exist */
3363                 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3364                     offsetof(nfs_delmapcall_t, call_node));
3365                 rp->r_flags |= RDELMAPLIST;
3366                 mutex_exit(&rp->r_statelock);
3367                 return (0);
3368         } else {
3369                 /* The list exists so search it */
3370                 for (delmap_call = list_head(&rp->r_indelmap);
3371                     delmap_call != NULL;
3372                     delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3373                         if (delmap_call->call_id == curthread) {
3374                                 /* current caller is in the list */
3375                                 *errp = delmap_call->error;
3376                                 list_remove(&rp->r_indelmap, delmap_call);
3377                                 mutex_exit(&rp->r_statelock);
3378                                 nfs_free_delmapcall(delmap_call);
3379                                 return (1);
3380                         }
3381                 }
3382         }
3383         mutex_exit(&rp->r_statelock);
3384         return (0);
3385 }
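
/*
 * Illustrative sketch (not compiled): a VOP_DELMAP implementation is
 * expected to use the helpers above roughly as follows.  If an earlier
 * asynchronous delmap issued by this thread has already completed, its
 * saved error is returned; otherwise a new record is queued and the
 * real work is deferred to an address space callback.
 * "nfs_delmap_callback" and "dmapp" are named here only for
 * illustration.
 */
#if 0
        if (nfs_find_and_delete_delmapcall(rp, &error))
                return (error == EAGAIN ? 0 : error);

        delmap_call = nfs_init_delmapcall();
        mutex_enter(&rp->r_statelock);
        list_insert_tail(&rp->r_indelmap, delmap_call);
        mutex_exit(&rp->r_statelock);

        /* Returning EAGAIN drives the address space callback machinery. */
        error = as_add_callback(as, nfs_delmap_callback, dmapp,
            AS_UNMAP_EVENT, addr, len, KM_SLEEP);
        return (error ? error : EAGAIN);
#endif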