1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  28  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/proc.h>
  36 #include <sys/user.h>
  37 #include <sys/time.h>
  38 #include <sys/buf.h>
  39 #include <sys/vfs.h>
  40 #include <sys/vnode.h>
  41 #include <sys/socket.h>
  42 #include <sys/uio.h>
  43 #include <sys/tiuser.h>
  44 #include <sys/swap.h>
  45 #include <sys/errno.h>
  46 #include <sys/debug.h>
  47 #include <sys/kmem.h>
  48 #include <sys/kstat.h>
  49 #include <sys/cmn_err.h>
  50 #include <sys/vtrace.h>
  51 #include <sys/session.h>
  52 #include <sys/dnlc.h>
  53 #include <sys/bitmap.h>
  54 #include <sys/acl.h>
  55 #include <sys/ddi.h>
  56 #include <sys/pathname.h>
  57 #include <sys/flock.h>
  58 #include <sys/dirent.h>
  60 #include <sys/callb.h>
  61 #include <sys/atomic.h>
  62 #include <sys/list.h>
  63 #include <sys/tsol/tnet.h>
  64 #include <sys/priv.h>
  65 #include <sys/sdt.h>
  66 #include <sys/attr.h>
  67 
  68 #include <inet/ip6.h>
  69 
  70 #include <rpc/types.h>
  71 #include <rpc/xdr.h>
  72 #include <rpc/auth.h>
  73 #include <rpc/clnt.h>
  74 
  75 #include <nfs/nfs.h>
  76 #include <nfs/nfs4.h>
  77 #include <nfs/nfs_clnt.h>
  78 #include <nfs/rnode.h>
  79 #include <nfs/nfs_acl.h>
  80 
  81 #include <sys/tsol/label.h>
  82 
  83 /*
 * The hash queues used to access active and cached rnodes are
 * organized as doubly linked lists.  A reader/writer lock for each
 * hash bucket is used to control access and to synchronize lookups,
 * additions, and deletions from the hash queue.
  88  *
  89  * The rnode freelist is organized as a doubly linked list with
  90  * a head pointer.  Additions and deletions are synchronized via
  91  * a single mutex.
  92  *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock for the hash queue must be
 * held.  If an rnode is not hashed into a hash queue, then it is
 * destroyed because it contains no reusable information about the
 * file.  The exclusive lock for the hash queue must be held in order
 * to prevent a lookup in the hash queue from finding the rnode,
 * using it, and assuming that it is not on the freelist.  The lookup
 * in the hash queue will have the hash queue locked, either shared
 * or exclusive.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode is
 * removed from the freelist and that reference is transferred to
 * the new reference, or the vnode reference count is incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test whether the rnode is on the freelist.  The hash
 * queue lock might be held shared, so two different threads may
 * race to remove the rnode from the freelist.  This race is
 * resolved by holding the freelist mutex.  Note that the freelist
 * mutex does not need to be held if the rnode is not on the
 * freelist: it cannot be placed on the freelist because the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * for the hash queue, while the thread doing the lookup holds the
 * hash queue lock either shared or exclusive.
 123  *
 124  * The lock ordering is:
 125  *
 126  *      hash bucket lock -> vnode lock
 127  *      hash bucket lock -> freelist lock
 128  */
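/*
 * rtable is the array of rnode hash buckets; rtablesize and rtablemask
 * give the number of buckets and the corresponding index mask.  The
 * rnode freelist is headed by rpfreelist and protected by
 * rpfreelist_lock, and rnode_cache is the kmem cache from which rnodes
 * are allocated.  These are sized and created at client
 * initialization, outside of this excerpt.
 */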
 129 static rhashq_t *rtable;
 130 
 131 static kmutex_t rpfreelist_lock;
 132 static rnode_t *rpfreelist = NULL;
 133 static long rnew = 0;
 134 long nrnode = 0;
 135 
 136 static int rtablesize;
 137 static int rtablemask;
 138 
 139 static int hashlen = 4;
 140 
 141 static struct kmem_cache *rnode_cache;
 142 
 143 /*
 144  * Mutex to protect the following variables:
 145  *      nfs_major
 146  *      nfs_minor
 147  */
 148 kmutex_t nfs_minor_lock;
 149 int nfs_major;
 150 int nfs_minor;
 151 
 152 /* Do we allow preepoch (negative) time values otw? */
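/*
 * Preepoch times can be allowed by setting this variable in
 * /etc/system, e.g. "set nfs:nfs_allow_preepoch_time = 1".
 */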
 153 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
 154 
 155 /*
 156  * Access cache
 157  */
 158 static acache_hash_t *acache;
 159 static long nacache;    /* used strictly to size the number of hash queues */
 160 
 161 static int acachesize;
 162 static int acachemask;
 163 static struct kmem_cache *acache_cache;
 164 
 165 /*
 166  * Client side utilities
 167  */
 168 
 169 /*
 170  * client side statistics
 171  */
 172 static const struct clstat clstat_tmpl = {
 173         { "calls",      KSTAT_DATA_UINT64 },
 174         { "badcalls",   KSTAT_DATA_UINT64 },
 175         { "clgets",     KSTAT_DATA_UINT64 },
 176         { "cltoomany",  KSTAT_DATA_UINT64 },
 177 #ifdef DEBUG
 178         { "clalloc",    KSTAT_DATA_UINT64 },
 179         { "noresponse", KSTAT_DATA_UINT64 },
 180         { "failover",   KSTAT_DATA_UINT64 },
 181         { "remap",      KSTAT_DATA_UINT64 },
 182 #endif
 183 };
 184 
 185 /*
 * The following are statistics that describe the behavior of the system
 * as a whole and do not correspond to any one particular zone.
 188  */
 189 #ifdef DEBUG
 190 static struct clstat_debug {
 191         kstat_named_t   nrnode;                 /* number of allocated rnodes */
 192         kstat_named_t   access;                 /* size of access cache */
 193         kstat_named_t   dirent;                 /* size of readdir cache */
 194         kstat_named_t   dirents;                /* size of readdir buf cache */
 195         kstat_named_t   reclaim;                /* number of reclaims */
 196         kstat_named_t   clreclaim;              /* number of cl reclaims */
 197         kstat_named_t   f_reclaim;              /* number of free reclaims */
 198         kstat_named_t   a_reclaim;              /* number of active reclaims */
 199         kstat_named_t   r_reclaim;              /* number of rnode reclaims */
 200         kstat_named_t   rpath;                  /* bytes used to store rpaths */
 201 } clstat_debug = {
 202         { "nrnode",     KSTAT_DATA_UINT64 },
 203         { "access",     KSTAT_DATA_UINT64 },
 204         { "dirent",     KSTAT_DATA_UINT64 },
 205         { "dirents",    KSTAT_DATA_UINT64 },
 206         { "reclaim",    KSTAT_DATA_UINT64 },
 207         { "clreclaim",  KSTAT_DATA_UINT64 },
 208         { "f_reclaim",  KSTAT_DATA_UINT64 },
 209         { "a_reclaim",  KSTAT_DATA_UINT64 },
 210         { "r_reclaim",  KSTAT_DATA_UINT64 },
 211         { "r_path",     KSTAT_DATA_UINT64 },
 212 };
 213 #endif  /* DEBUG */
 214 
 215 /*
 216  * We keep a global list of per-zone client data, so we can clean up all zones
 217  * if we get low on memory.
 218  */
 219 static list_t nfs_clnt_list;
 220 static kmutex_t nfs_clnt_list_lock;
 221 static zone_key_t nfsclnt_zone_key;
 222 
 223 static struct kmem_cache *chtab_cache;
 224 
 225 /*
 226  * Some servers do not properly update the attributes of the
 227  * directory when changes are made.  To allow interoperability
 228  * with these broken servers, the nfs_disable_rddir_cache
 229  * parameter must be set in /etc/system
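 * (for example, "set nfs:nfs_disable_rddir_cache = 1").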
 230  */
 231 int nfs_disable_rddir_cache = 0;
 232 
 233 int             clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
 234                     struct chtab **);
 235 void            clfree(CLIENT *, struct chtab *);
 236 static int      acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
 237                     struct chtab **, struct nfs_clnt *);
 238 static int      nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
 239                     struct chtab **, struct nfs_clnt *);
 240 static void     clreclaim(void *);
 241 static int      nfs_feedback(int, int, mntinfo_t *);
 242 static int      rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
 243                     caddr_t, cred_t *, int *, enum clnt_stat *, int,
 244                     failinfo_t *);
 245 static int      aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
 246                     caddr_t, cred_t *, int *, int, failinfo_t *);
 247 static void     rinactive(rnode_t *, cred_t *);
 248 static int      rtablehash(nfs_fhandle *);
 249 static vnode_t  *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
 250                     struct vnodeops *,
 251                     int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
 252                         cred_t *),
 253                     int (*)(const void *, const void *), int *, cred_t *,
 254                     char *, char *);
 255 static void     rp_rmfree(rnode_t *);
 256 static void     rp_addhash(rnode_t *);
 257 static void     rp_rmhash_locked(rnode_t *);
 258 static rnode_t  *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
 259 static void     destroy_rnode(rnode_t *);
 260 static void     rddir_cache_free(rddir_cache *);
 261 static int      nfs_free_data_reclaim(rnode_t *);
 262 static int      nfs_active_data_reclaim(rnode_t *);
 263 static int      nfs_free_reclaim(void);
 264 static int      nfs_active_reclaim(void);
 265 static int      nfs_rnode_reclaim(void);
 266 static void     nfs_reclaim(void *);
 267 static int      failover_safe(failinfo_t *);
 268 static void     failover_newserver(mntinfo_t *mi);
 269 static void     failover_thread(mntinfo_t *mi);
 270 static int      failover_wait(mntinfo_t *);
 271 static int      failover_remap(failinfo_t *);
 272 static int      failover_lookup(char *, vnode_t *,
 273                     int (*)(vnode_t *, char *, vnode_t **,
 274                         struct pathname *, int, vnode_t *, cred_t *, int),
 275                     int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
 276                     vnode_t **);
 277 static void     nfs_free_r_path(rnode_t *);
 278 static void     nfs_set_vroot(vnode_t *);
 279 static char     *nfs_getsrvnames(mntinfo_t *, size_t *);
 280 
 281 /*
 282  * from rpcsec module (common/rpcsec)
 283  */
 284 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
 285 extern void sec_clnt_freeh(AUTH *);
 286 extern void sec_clnt_freeinfo(struct sec_data *);
 287 
 288 /*
 289  * used in mount policy
 290  */
 291 extern ts_label_t *getflabel_cipso(vfs_t *);
 292 
 293 /*
 * Neither EIO nor EINTR is a recoverable error.
 295  */
 296 #define IS_RECOVERABLE_ERROR(error)     !((error == EINTR) || (error == EIO))
 297 
 298 #ifdef DEBUG
 299 #define SRV_QFULL_MSG   "send queue to NFS%d server %s is full; still trying\n"
 300 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
 301 #else
 302 #define SRV_QFULL_MSG   "send queue to NFS server %s is full still trying\n"
 303 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
 304 #endif
 305 /*
 306  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 307  */
 308 static int
 309 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 310     struct chtab **chp, struct nfs_clnt *nfscl)
 311 {
 312         struct chhead *ch, *newch;
 313         struct chhead **plistp;
 314         struct chtab *cp;
 315         int error;
 316         k_sigset_t smask;
 317 
 318         if (newcl == NULL || chp == NULL || ci == NULL)
 319                 return (EINVAL);
 320 
 321         *newcl = NULL;
 322         *chp = NULL;
 323 
 324         /*
 325          * Find an unused handle or create one
 326          */
 327         newch = NULL;
 328         nfscl->nfscl_stat.clgets.value.ui64++;
 329 top:
 330         /*
 331          * Find the correct entry in the cache to check for free
 332          * client handles.  The search is based on the RPC program
 333          * number, program version number, dev_t for the transport
 334          * device, and the protocol family.
 335          */
 336         mutex_enter(&nfscl->nfscl_chtable_lock);
 337         plistp = &nfscl->nfscl_chtable;
 338         for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
 339                 if (ch->ch_prog == ci->cl_prog &&
 340                     ch->ch_vers == ci->cl_vers &&
 341                     ch->ch_dev == svp->sv_knconf->knc_rdev &&
 342                     (strcmp(ch->ch_protofmly,
 343                     svp->sv_knconf->knc_protofmly) == 0))
 344                         break;
 345                 plistp = &ch->ch_next;
 346         }
 347 
 348         /*
 349          * If we didn't find a cache entry for this quadruple, then
 350          * create one.  If we don't have one already preallocated,
 351          * then drop the cache lock, create one, and then start over.
 352          * If we did have a preallocated entry, then just add it to
 353          * the front of the list.
 354          */
 355         if (ch == NULL) {
 356                 if (newch == NULL) {
 357                         mutex_exit(&nfscl->nfscl_chtable_lock);
 358                         newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
 359                         newch->ch_timesused = 0;
 360                         newch->ch_prog = ci->cl_prog;
 361                         newch->ch_vers = ci->cl_vers;
 362                         newch->ch_dev = svp->sv_knconf->knc_rdev;
 363                         newch->ch_protofmly = kmem_alloc(
 364                             strlen(svp->sv_knconf->knc_protofmly) + 1,
 365                             KM_SLEEP);
 366                         (void) strcpy(newch->ch_protofmly,
 367                             svp->sv_knconf->knc_protofmly);
 368                         newch->ch_list = NULL;
 369                         goto top;
 370                 }
 371                 ch = newch;
 372                 newch = NULL;
 373                 ch->ch_next = nfscl->nfscl_chtable;
 374                 nfscl->nfscl_chtable = ch;
 375         /*
 376          * We found a cache entry, but if it isn't on the front of the
 377          * list, then move it to the front of the list to try to take
 378          * advantage of locality of operations.
 379          */
 380         } else if (ch != nfscl->nfscl_chtable) {
 381                 *plistp = ch->ch_next;
 382                 ch->ch_next = nfscl->nfscl_chtable;
 383                 nfscl->nfscl_chtable = ch;
 384         }
 385 
 386         /*
 387          * If there was a free client handle cached, then remove it
 388          * from the list, init it, and use it.
 389          */
 390         if (ch->ch_list != NULL) {
 391                 cp = ch->ch_list;
 392                 ch->ch_list = cp->ch_list;
 393                 mutex_exit(&nfscl->nfscl_chtable_lock);
 394                 if (newch != NULL) {
 395                         kmem_free(newch->ch_protofmly,
 396                             strlen(newch->ch_protofmly) + 1);
 397                         kmem_free(newch, sizeof (*newch));
 398                 }
 399                 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
 400                     &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
 401                 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
 402                     &cp->ch_client->cl_auth);
 403                 if (error || cp->ch_client->cl_auth == NULL) {
 404                         CLNT_DESTROY(cp->ch_client);
 405                         kmem_cache_free(chtab_cache, cp);
 406                         return ((error != 0) ? error : EINTR);
 407                 }
 408                 ch->ch_timesused++;
 409                 *newcl = cp->ch_client;
 410                 *chp = cp;
 411                 return (0);
 412         }
 413 
 414         /*
 415          * There weren't any free client handles which fit, so allocate
 416          * a new one and use that.
 417          */
 418 #ifdef DEBUG
 419         atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
 420 #endif
 421         mutex_exit(&nfscl->nfscl_chtable_lock);
 422 
 423         nfscl->nfscl_stat.cltoomany.value.ui64++;
 424         if (newch != NULL) {
 425                 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
 426                 kmem_free(newch, sizeof (*newch));
 427         }
 428 
 429         cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
 430         cp->ch_head = ch;
 431 
 432         sigintr(&smask, (int)ci->cl_flags & MI_INT);
 433         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
 434             ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
 435         sigunintr(&smask);
 436 
 437         if (error != 0) {
 438                 kmem_cache_free(chtab_cache, cp);
 439 #ifdef DEBUG
 440                 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
 441 #endif
 442                 /*
 443                  * Warning is unnecessary if error is EINTR.
 444                  */
 445                 if (error != EINTR) {
 446                         nfs_cmn_err(error, CE_WARN,
 447                             "clget: couldn't create handle: %m\n");
 448                 }
 449                 return (error);
 450         }
 451         (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
 452         auth_destroy(cp->ch_client->cl_auth);
 453         error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
 454             &cp->ch_client->cl_auth);
 455         if (error || cp->ch_client->cl_auth == NULL) {
 456                 CLNT_DESTROY(cp->ch_client);
 457                 kmem_cache_free(chtab_cache, cp);
 458 #ifdef DEBUG
 459                 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
 460 #endif
 461                 return ((error != 0) ? error : EINTR);
 462         }
 463         ch->ch_timesused++;
 464         *newcl = cp->ch_client;
 465         ASSERT(cp->ch_client->cl_nosignal == FALSE);
 466         *chp = cp;
 467         return (0);
 468 }
 469 
 470 int
 471 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 472     struct chtab **chp)
 473 {
 474         struct nfs_clnt *nfscl;
 475 
 476         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
 477         ASSERT(nfscl != NULL);
 478 
 479         return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
 480 }
 481 
 482 static int
 483 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 484     struct chtab **chp, struct nfs_clnt *nfscl)
 485 {
 486         clinfo_t ci;
 487         int error;
 488 
 489         /*
         * Set the read buffer size to the maximum transfer size
         * (mi_tsize) and add room for RPC headers.
 492          */
 493         ci.cl_readsize = mi->mi_tsize;
 494         if (ci.cl_readsize != 0)
 495                 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
 496 
 497         /*
         * If this is a soft mount and the server is down, just try
         * once; that is, do not retransmit.
 500          */
 501         if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
 502                 ci.cl_retrans = 0;
 503         else
 504                 ci.cl_retrans = mi->mi_retrans;
 505 
 506         ci.cl_prog = NFS_ACL_PROGRAM;
 507         ci.cl_vers = mi->mi_vers;
 508         ci.cl_flags = mi->mi_flags;
 509 
 510         /*
 511          * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
 512          * security flavor, the client tries to establish a security context
 513          * by contacting the server. If the connection is timed out or reset,
 514          * e.g. server reboot, we will try again.
 515          */
 516         do {
 517                 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
 518 
 519                 if (error == 0)
 520                         break;
 521 
 522                 /*
 523                  * For forced unmount or zone shutdown, bail out, no retry.
 524                  */
 525                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
 526                         error = EIO;
 527                         break;
 528                 }
 529 
 530                 /* do not retry for softmount */
 531                 if (!(mi->mi_flags & MI_HARD))
 532                         break;
 533 
 534                 /* let the caller deal with the failover case */
 535                 if (FAILOVER_MOUNT(mi))
 536                         break;
 537 
 538         } while (error == ETIMEDOUT || error == ECONNRESET);
 539 
 540         return (error);
 541 }
 542 
 543 static int
 544 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 545     struct chtab **chp, struct nfs_clnt *nfscl)
 546 {
 547         clinfo_t ci;
 548         int error;
 549 
 550         /*
         * Set the read buffer size to the maximum transfer size
         * (mi_tsize) and add room for RPC headers.
 553          */
 554         ci.cl_readsize = mi->mi_tsize;
 555         if (ci.cl_readsize != 0)
 556                 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
 557 
 558         /*
         * If this is a soft mount and the server is down, just try
         * once; that is, do not retransmit.
 561          */
 562         if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
 563                 ci.cl_retrans = 0;
 564         else
 565                 ci.cl_retrans = mi->mi_retrans;
 566 
 567         ci.cl_prog = mi->mi_prog;
 568         ci.cl_vers = mi->mi_vers;
 569         ci.cl_flags = mi->mi_flags;
 570 
 571         /*
 572          * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
 573          * security flavor, the client tries to establish a security context
 574          * by contacting the server. If the connection is timed out or reset,
 575          * e.g. server reboot, we will try again.
 576          */
 577         do {
 578                 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
 579 
 580                 if (error == 0)
 581                         break;
 582 
 583                 /*
 584                  * For forced unmount or zone shutdown, bail out, no retry.
 585                  */
 586                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
 587                         error = EIO;
 588                         break;
 589                 }
 590 
 591                 /* do not retry for softmount */
 592                 if (!(mi->mi_flags & MI_HARD))
 593                         break;
 594 
 595                 /* let the caller deal with the failover case */
 596                 if (FAILOVER_MOUNT(mi))
 597                         break;
 598 
 599         } while (error == ETIMEDOUT || error == ECONNRESET);
 600 
 601         return (error);
 602 }
 603 
 604 static void
 605 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
 606 {
 607         if (cl->cl_auth != NULL) {
 608                 sec_clnt_freeh(cl->cl_auth);
 609                 cl->cl_auth = NULL;
 610         }
 611 
 612         /*
 613          * Timestamp this cache entry so that we know when it was last
 614          * used.
 615          */
 616         cp->ch_freed = gethrestime_sec();
 617 
 618         /*
 619          * Add the free client handle to the front of the list.
 620          * This way, the list will be sorted in youngest to oldest
 621          * order.
 622          */
 623         mutex_enter(&nfscl->nfscl_chtable_lock);
 624         cp->ch_list = cp->ch_head->ch_list;
 625         cp->ch_head->ch_list = cp;
 626         mutex_exit(&nfscl->nfscl_chtable_lock);
 627 }
 628 
 629 void
 630 clfree(CLIENT *cl, struct chtab *cp)
 631 {
 632         struct nfs_clnt *nfscl;
 633 
 634         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
 635         ASSERT(nfscl != NULL);
 636 
 637         clfree_impl(cl, cp, nfscl);
 638 }
 639 
#define CL_HOLDTIME     60      /* seconds to hold idle client handles */
 641 
 642 static void
 643 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
 644 {
 645         struct chhead *ch;
 646         struct chtab *cp;       /* list of objects that can be reclaimed */
 647         struct chtab *cpe;
 648         struct chtab *cpl;
 649         struct chtab **cpp;
 650 #ifdef DEBUG
 651         int n = 0;
 652 #endif
 653 
 654         /*
 655          * Need to reclaim some memory, so step through the cache
 656          * looking through the lists for entries which can be freed.
 657          */
 658         cp = NULL;
 659 
 660         mutex_enter(&nfscl->nfscl_chtable_lock);
 661 
 662         /*
 663          * Here we step through each non-NULL quadruple and start to
 664          * construct the reclaim list pointed to by cp.  Note that
 665          * cp will contain all eligible chtab entries.  When this traversal
 666          * completes, chtab entries from the last quadruple will be at the
 667          * front of cp and entries from previously inspected quadruples have
 668          * been appended to the rear of cp.
 669          */
 670         for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
 671                 if (ch->ch_list == NULL)
 672                         continue;
 673                 /*
                 * Search each list for entries older than
 675                  * cl_holdtime seconds.  The lists are maintained
 676                  * in youngest to oldest order so that when the
 677                  * first entry is found which is old enough, then
 678                  * all of the rest of the entries on the list will
 679                  * be old enough as well.
 680                  */
 681                 cpl = ch->ch_list;
 682                 cpp = &ch->ch_list;
 683                 while (cpl != NULL &&
 684                     cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
 685                         cpp = &cpl->ch_list;
 686                         cpl = cpl->ch_list;
 687                 }
 688                 if (cpl != NULL) {
 689                         *cpp = NULL;
 690                         if (cp != NULL) {
 691                                 cpe = cpl;
 692                                 while (cpe->ch_list != NULL)
 693                                         cpe = cpe->ch_list;
 694                                 cpe->ch_list = cp;
 695                         }
 696                         cp = cpl;
 697                 }
 698         }
 699 
 700         mutex_exit(&nfscl->nfscl_chtable_lock);
 701 
 702         /*
 703          * If cp is empty, then there is nothing to reclaim here.
 704          */
 705         if (cp == NULL)
 706                 return;
 707 
 708         /*
 709          * Step through the list of entries to free, destroying each client
 710          * handle and kmem_free'ing the memory for each entry.
 711          */
 712         while (cp != NULL) {
 713 #ifdef DEBUG
 714                 n++;
 715 #endif
 716                 CLNT_DESTROY(cp->ch_client);
 717                 cpl = cp->ch_list;
 718                 kmem_cache_free(chtab_cache, cp);
 719                 cp = cpl;
 720         }
 721 
 722 #ifdef DEBUG
 723         /*
 724          * Update clalloc so that nfsstat shows the current number
 725          * of allocated client handles.
 726          */
 727         atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
 728 #endif
 729 }
 730 
 731 /* ARGSUSED */
 732 static void
 733 clreclaim(void *all)
 734 {
 735         struct nfs_clnt *nfscl;
 736 
 737 #ifdef DEBUG
 738         clstat_debug.clreclaim.value.ui64++;
 739 #endif
 740         /*
 741          * The system is low on memory; go through and try to reclaim some from
 742          * every zone on the system.
 743          */
 744         mutex_enter(&nfs_clnt_list_lock);
 745         nfscl = list_head(&nfs_clnt_list);
 746         for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
 747                 clreclaim_zone(nfscl, CL_HOLDTIME);
 748         mutex_exit(&nfs_clnt_list_lock);
 749 }
 750 
 751 /*
 * Minimum time-out values indexed by call type.
 * These units are in eighths of a second, to avoid multiplies.
 754  */
 755 static unsigned int minimum_timeo[] = {
 756         6, 7, 10
 757 };
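
/*
 * For example, a minimum_timeo entry of 6 corresponds to 6/8 = 0.75
 * seconds; rfscall() converts an entry to clock ticks as
 * (minimum_timeo[type] * hz) >> 3.
 */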
 758 
 759 /*
 760  * Back off for retransmission timeout, MAXTIMO is in hz of a sec
 761  */
 762 #define MAXTIMO (20*hz)
 763 #define backoff(tim)    (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
 764 #define dobackoff(tim)  ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
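
/*
 * For example, with hz = 100 an initial timeout of 200 ticks (2
 * seconds) backs off to 400, 800, 1600, and then 2000 ticks (MAXTIMO,
 * i.e. 20 seconds), where it remains for subsequent retransmissions.
 */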
 765 
 766 #define MIN_NFS_TSIZE 512       /* minimum "chunk" of NFS IO */
 767 #define REDUCE_NFS_TIME (hz/2)  /* rtxcur we try to keep under */
 768 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
 769 
 770 /*
 771  * Function called when rfscall notices that we have been
 772  * re-transmitting, or when we get a response without retransmissions.
 773  * Return 1 if the transfer size was adjusted down - 0 if no change.
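 *
 * On FEEDBACK_REXMIT1 the current read and write transfer sizes are
 * halved (but never below MIN_NFS_TSIZE); on FEEDBACK_OK they are
 * increased by MIN_NFS_TSIZE toward the maximum transfer size, giving
 * a simple multiplicative-decrease, additive-increase adjustment.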
 774  */
 775 static int
 776 nfs_feedback(int flag, int which, mntinfo_t *mi)
 777 {
 778         int kind;
 779         int r = 0;
 780 
 781         mutex_enter(&mi->mi_lock);
 782         if (flag == FEEDBACK_REXMIT1) {
 783                 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
 784                     mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
 785                         goto done;
 786                 if (mi->mi_curread > MIN_NFS_TSIZE) {
 787                         mi->mi_curread /= 2;
 788                         if (mi->mi_curread < MIN_NFS_TSIZE)
 789                                 mi->mi_curread = MIN_NFS_TSIZE;
 790                         r = 1;
 791                 }
 792 
 793                 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
 794                         mi->mi_curwrite /= 2;
 795                         if (mi->mi_curwrite < MIN_NFS_TSIZE)
 796                                 mi->mi_curwrite = MIN_NFS_TSIZE;
 797                         r = 1;
 798                 }
 799         } else if (flag == FEEDBACK_OK) {
 800                 kind = mi->mi_timer_type[which];
 801                 if (kind == 0 ||
 802                     mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
 803                         goto done;
 804                 if (kind == 1) {
 805                         if (mi->mi_curread >= mi->mi_tsize)
 806                                 goto done;
 807                         mi->mi_curread +=  MIN_NFS_TSIZE;
 808                         if (mi->mi_curread > mi->mi_tsize/2)
 809                                 mi->mi_curread = mi->mi_tsize;
 810                 } else if (kind == 2) {
 811                         if (mi->mi_curwrite >= mi->mi_stsize)
 812                                 goto done;
 813                         mi->mi_curwrite += MIN_NFS_TSIZE;
 814                         if (mi->mi_curwrite > mi->mi_stsize/2)
 815                                 mi->mi_curwrite = mi->mi_stsize;
 816                 }
 817         }
 818 done:
 819         mutex_exit(&mi->mi_lock);
 820         return (r);
 821 }
 822 
 823 #ifdef DEBUG
 824 static int rfs2call_hits = 0;
 825 static int rfs2call_misses = 0;
 826 #endif
 827 
 828 int
 829 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
 830     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
 831     enum nfsstat *statusp, int flags, failinfo_t *fi)
 832 {
 833         int rpcerror;
 834         enum clnt_stat rpc_status;
 835 
 836         ASSERT(statusp != NULL);
 837 
 838         rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
 839             cr, douprintf, &rpc_status, flags, fi);
 840         if (!rpcerror) {
 841                 /*
 842                  * See crnetadjust() for comments.
 843                  */
 844                 if (*statusp == NFSERR_ACCES &&
 845                     (cr = crnetadjust(cr)) != NULL) {
 846 #ifdef DEBUG
 847                         rfs2call_hits++;
 848 #endif
 849                         rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
 850                             resp, cr, douprintf, NULL, flags, fi);
 851                         crfree(cr);
 852 #ifdef DEBUG
 853                         if (*statusp == NFSERR_ACCES)
 854                                 rfs2call_misses++;
 855 #endif
 856                 }
 857         } else if (rpc_status == RPC_PROCUNAVAIL) {
 858                 *statusp = NFSERR_OPNOTSUPP;
 859                 rpcerror = 0;
 860         }
 861 
 862         return (rpcerror);
 863 }
 864 
#define NFS3_JUKEBOX_DELAY      (10 * hz)
 866 
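/*
 * Delay (in clock ticks) between retries after an NFS3ERR_JUKEBOX
 * reply; it is expected to be initialized from NFS3_JUKEBOX_DELAY
 * during client setup, outside of this excerpt.
 */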
 867 static clock_t nfs3_jukebox_delay = 0;
 868 
 869 #ifdef DEBUG
 870 static int rfs3call_hits = 0;
 871 static int rfs3call_misses = 0;
 872 #endif
 873 
 874 int
 875 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
 876     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
 877     nfsstat3 *statusp, int flags, failinfo_t *fi)
 878 {
 879         int rpcerror;
 880         int user_informed;
 881 
 882         user_informed = 0;
 883         do {
 884                 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
 885                     cr, douprintf, NULL, flags, fi);
 886                 if (!rpcerror) {
 887                         cred_t *crr;
 888                         if (*statusp == NFS3ERR_JUKEBOX) {
 889                                 if (ttoproc(curthread) == &p0) {
 890                                         rpcerror = EAGAIN;
 891                                         break;
 892                                 }
 893                                 if (!user_informed) {
 894                                         user_informed = 1;
 895                                         uprintf(
 896                 "file temporarily unavailable on the server, retrying...\n");
 897                                 }
 898                                 delay(nfs3_jukebox_delay);
 899                         }
 900                         /*
 901                          * See crnetadjust() for comments.
 902                          */
 903                         else if (*statusp == NFS3ERR_ACCES &&
 904                             (crr = crnetadjust(cr)) != NULL) {
 905 #ifdef DEBUG
 906                                 rfs3call_hits++;
 907 #endif
 908                                 rpcerror = rfscall(mi, which, xdrargs, argsp,
 909                                     xdrres, resp, crr, douprintf,
 910                                     NULL, flags, fi);
 911 
 912                                 crfree(crr);
 913 #ifdef DEBUG
 914                                 if (*statusp == NFS3ERR_ACCES)
 915                                         rfs3call_misses++;
 916 #endif
 917                         }
 918                 }
 919         } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
 920 
 921         return (rpcerror);
 922 }
 923 
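/*
 * Failover support helpers used by rfscall() and aclcall(): VALID_FH
 * checks that the rnode's server still matches the mount's current
 * server, and INC_READERS/DEC_READERS track the threads that depend on
 * the current server so that it is not switched out from under them
 * while a request or remap is in progress.
 */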
 924 #define VALID_FH(fi)    (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
 925 #define INC_READERS(mi)         { \
 926         mi->mi_readers++; \
 927 }
 928 #define DEC_READERS(mi)         { \
 929         mi->mi_readers--; \
 930         if (mi->mi_readers == 0) \
 931                 cv_broadcast(&mi->mi_failover_cv); \
 932 }
 933 
 934 static int
 935 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
 936     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
 937     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
 938 {
 939         CLIENT *client;
 940         struct chtab *ch;
 941         cred_t *cr = icr;
 942         enum clnt_stat status;
 943         struct rpc_err rpcerr, rpcerr_tmp;
 944         struct timeval wait;
 945         int timeo;              /* in units of hz */
 946         int my_rsize, my_wsize;
 947         bool_t tryagain;
 948         bool_t cred_cloned = FALSE;
 949         k_sigset_t smask;
 950         servinfo_t *svp;
 951         struct nfs_clnt *nfscl;
 952         zoneid_t zoneid = getzoneid();
 953         char *msg;
 954 #ifdef DEBUG
 955         char *bufp;
 956 #endif
 957 
 958 
 959         TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
 960             "rfscall_start:which %d mi %p", which, mi);
 961 
 962         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
 963         ASSERT(nfscl != NULL);
 964 
 965         nfscl->nfscl_stat.calls.value.ui64++;
 966         mi->mi_reqs[which].value.ui64++;
 967 
 968         rpcerr.re_status = RPC_SUCCESS;
 969 
 970         /*
 971          * In case of forced unmount or zone shutdown, return EIO.
 972          */
 973 
 974         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
 975                 rpcerr.re_status = RPC_FAILED;
 976                 rpcerr.re_errno = EIO;
 977                 return (rpcerr.re_errno);
 978         }
 979 
 980         /*
 981          * Remember the transfer sizes in case
 982          * nfs_feedback changes them underneath us.
 983          */
 984         my_rsize = mi->mi_curread;
 985         my_wsize = mi->mi_curwrite;
 986 
 987         /*
 988          * NFS client failover support
 989          *
 990          * If this rnode is not in sync with the current server (VALID_FH),
 991          * we'd like to do a remap to get in sync.  We can be interrupted
 992          * in failover_remap(), and if so we'll bail.  Otherwise, we'll
 993          * use the best info we have to try the RPC.  Part of that is
 994          * unconditionally updating the filehandle copy kept for V3.
 995          *
         * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
         * rw_enter(); we're trying to keep the current server from being
         * changed on us until we're done with the remapping and have a
         * matching client handle.  We don't want to send a filehandle
         * to the wrong host.
1001          */
1002 failoverretry:
1003         if (FAILOVER_MOUNT(mi)) {
1004                 mutex_enter(&mi->mi_lock);
1005                 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1006                         if (failover_wait(mi)) {
1007                                 mutex_exit(&mi->mi_lock);
1008                                 return (EINTR);
1009                         }
1010                 }
1011                 INC_READERS(mi);
1012                 mutex_exit(&mi->mi_lock);
1013                 if (fi) {
1014                         if (!VALID_FH(fi) &&
1015                             !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1016                                 int remaperr;
1017 
1018                                 svp = mi->mi_curr_serv;
1019                                 remaperr = failover_remap(fi);
1020                                 if (remaperr != 0) {
1021 #ifdef DEBUG
1022                                         if (remaperr != EINTR)
1023                                                 nfs_cmn_err(remaperr, CE_WARN,
1024                                             "rfscall couldn't failover: %m");
1025 #endif
1026                                         mutex_enter(&mi->mi_lock);
1027                                         DEC_READERS(mi);
1028                                         mutex_exit(&mi->mi_lock);
1029                                         /*
                                         * If failover_remap returns a
                                         * recoverable error (such as
                                         * ETIMEDOUT) and the filesystem
                                         * is hard mounted, retry the
                                         * call with a new server.
1034                                          */
1035                                         if ((mi->mi_flags & MI_HARD) &&
1036                                             IS_RECOVERABLE_ERROR(remaperr)) {
1037                                                 if (svp == mi->mi_curr_serv)
1038                                                         failover_newserver(mi);
1039                                                 rpcerr.re_status = RPC_SUCCESS;
1040                                                 goto failoverretry;
1041                                         }
1042                                         rpcerr.re_errno = remaperr;
1043                                         return (remaperr);
1044                                 }
1045                         }
1046                         if (fi->fhp && fi->copyproc)
1047                                 (*fi->copyproc)(fi->fhp, fi->vp);
1048                 }
1049         }
1050 
1051         /* For TSOL, use a new cred which has net_mac_aware flag */
1052         if (!cred_cloned && is_system_labeled()) {
1053                 cred_cloned = TRUE;
1054                 cr = crdup(icr);
1055                 (void) setpflags(NET_MAC_AWARE, 1, cr);
1056         }
1057 
1058         /*
1059          * clget() calls clnt_tli_kinit() which clears the xid, so we
1060          * are guaranteed to reprocess the retry as a new request.
1061          */
1062         svp = mi->mi_curr_serv;
1063         rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1064 
1065         if (FAILOVER_MOUNT(mi)) {
1066                 mutex_enter(&mi->mi_lock);
1067                 DEC_READERS(mi);
1068                 mutex_exit(&mi->mi_lock);
1069 
1070                 if ((rpcerr.re_errno == ETIMEDOUT ||
1071                     rpcerr.re_errno == ECONNRESET) &&
1072                     failover_safe(fi)) {
1073                         if (svp == mi->mi_curr_serv)
1074                                 failover_newserver(mi);
1075                         goto failoverretry;
1076                 }
1077         }
1078         if (rpcerr.re_errno != 0)
1079                 return (rpcerr.re_errno);
1080 
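        /*
         * Pick the RPC timeout: connection-oriented transports use the
         * static mount timeout (mi_timeo, in tenths of a second), while
         * datagram transports use the adaptive retransmit timers that
         * are maintained per call type.
         */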
1081         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1082             svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1083                 timeo = (mi->mi_timeo * hz) / 10;
1084         } else {
1085                 mutex_enter(&mi->mi_lock);
1086                 timeo = CLNT_SETTIMERS(client,
1087                     &(mi->mi_timers[mi->mi_timer_type[which]]),
1088                     &(mi->mi_timers[NFS_CALLTYPES]),
1089                     (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1090                     (void (*)())NULL, (caddr_t)mi, 0);
1091                 mutex_exit(&mi->mi_lock);
1092         }
1093 
1094         /*
1095          * If hard mounted fs, retry call forever unless hard error occurs.
1096          */
1097         do {
1098                 tryagain = FALSE;
1099 
1100                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1101                         status = RPC_FAILED;
1102                         rpcerr.re_status = RPC_FAILED;
1103                         rpcerr.re_errno = EIO;
1104                         break;
1105                 }
1106 
1107                 TICK_TO_TIMEVAL(timeo, &wait);
1108 
1109                 /*
                 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT,
                 * and SIGTERM, preserving the existing signal masks.
                 * Also mask out SIGINT if the nointr mount option is
                 * specified.
1113                  */
1114                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1115                 if (!(mi->mi_flags & MI_INT))
1116                         client->cl_nosignal = TRUE;
1117 
1118                 /*
1119                  * If there is a current signal, then don't bother
1120                  * even trying to send out the request because we
1121                  * won't be able to block waiting for the response.
1122                  * Simply assume RPC_INTR and get on with it.
1123                  */
1124                 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1125                         status = RPC_INTR;
1126                 else {
1127                         status = CLNT_CALL(client, which, xdrargs, argsp,
1128                             xdrres, resp, wait);
1129                 }
1130 
1131                 if (!(mi->mi_flags & MI_INT))
1132                         client->cl_nosignal = FALSE;
1133                 /*
1134                  * restore original signal mask
1135                  */
1136                 sigunintr(&smask);
1137 
1138                 switch (status) {
1139                 case RPC_SUCCESS:
1140                         if ((mi->mi_flags & MI_DYNAMIC) &&
1141                             mi->mi_timer_type[which] != 0 &&
1142                             (mi->mi_curread != my_rsize ||
1143                             mi->mi_curwrite != my_wsize))
1144                                 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1145                         break;
1146 
1147                 case RPC_INTR:
1148                         /*
1149                          * There is no way to recover from this error,
1150                          * even if mount option nointr is specified.
1151                          * SIGKILL, for example, cannot be blocked.
1152                          */
1153                         rpcerr.re_status = RPC_INTR;
1154                         rpcerr.re_errno = EINTR;
1155                         break;
1156 
1157                 case RPC_UDERROR:
1158                         /*
                         * If the NFS server is local (vold) and it
                         * goes away, we get RPC_UDERROR.  This is a
                         * retryable error, so we would loop; check
                         * whether the specific error was ECONNRESET,
                         * indicating that the target did not exist at
                         * all.  If so, return with RPC_PROGUNAVAIL and
                         * ECONNRESET to indicate why.
1167                          */
1168                         CLNT_GETERR(client, &rpcerr);
1169                         if (rpcerr.re_errno == ECONNRESET) {
1170                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1171                                 rpcerr.re_errno = ECONNRESET;
1172                                 break;
1173                         }
1174                         /*FALLTHROUGH*/
1175 
1176                 default:                /* probably RPC_TIMEDOUT */
1177                         if (IS_UNRECOVERABLE_RPC(status))
1178                                 break;
1179 
1180                         /*
1181                          * increment server not responding count
1182                          */
1183                         mutex_enter(&mi->mi_lock);
1184                         mi->mi_noresponse++;
1185                         mutex_exit(&mi->mi_lock);
1186 #ifdef DEBUG
1187                         nfscl->nfscl_stat.noresponse.value.ui64++;
1188 #endif
1189 
1190                         if (!(mi->mi_flags & MI_HARD)) {
1191                                 if (!(mi->mi_flags & MI_SEMISOFT) ||
1192                                     (mi->mi_ss_call_type[which] == 0))
1193                                         break;
1194                         }
1195 
1196                         /*
1197                          * The call is in progress (over COTS).
1198                          * Try the CLNT_CALL again, but don't
1199                          * print a noisy error message.
1200                          */
1201                         if (status == RPC_INPROGRESS) {
1202                                 tryagain = TRUE;
1203                                 break;
1204                         }
1205 
1206                         if (flags & RFSCALL_SOFT)
1207                                 break;
1208 
1209                         /*
1210                          * On zone shutdown, just move on.
1211                          */
1212                         if (zone_status_get(curproc->p_zone) >=
1213                             ZONE_IS_SHUTTING_DOWN) {
1214                                 rpcerr.re_status = RPC_FAILED;
1215                                 rpcerr.re_errno = EIO;
1216                                 break;
1217                         }
1218 
1219                         /*
1220                          * NFS client failover support
1221                          *
1222                          * If the current server just failed us, we'll
1223                          * start the process of finding a new server.
1224                          * After that, we can just retry.
1225                          */
1226                         if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1227                                 if (svp == mi->mi_curr_serv)
1228                                         failover_newserver(mi);
1229                                 clfree_impl(client, ch, nfscl);
1230                                 goto failoverretry;
1231                         }
1232 
1233                         tryagain = TRUE;
1234                         timeo = backoff(timeo);
1235 
1236                         CLNT_GETERR(client, &rpcerr_tmp);
1237                         if ((status == RPC_CANTSEND) &&
1238                             (rpcerr_tmp.re_errno == ENOBUFS))
1239                                 msg = SRV_QFULL_MSG;
1240                         else
1241                                 msg = SRV_NOTRESP_MSG;
1242 
1243                         mutex_enter(&mi->mi_lock);
1244                         if (!(mi->mi_flags & MI_PRINTED)) {
1245                                 mi->mi_flags |= MI_PRINTED;
1246                                 mutex_exit(&mi->mi_lock);
1247 #ifdef DEBUG
1248                                 zprintf(zoneid, msg, mi->mi_vers,
1249                                     svp->sv_hostname);
1250 #else
1251                                 zprintf(zoneid, msg, svp->sv_hostname);
1252 #endif
1253                         } else
1254                                 mutex_exit(&mi->mi_lock);
1255                         if (*douprintf && nfs_has_ctty()) {
1256                                 *douprintf = 0;
1257                                 if (!(mi->mi_flags & MI_NOPRINT))
1258 #ifdef DEBUG
1259                                         uprintf(msg, mi->mi_vers,
1260                                             svp->sv_hostname);
1261 #else
1262                                         uprintf(msg, svp->sv_hostname);
1263 #endif
1264                         }
1265 
1266                         /*
1267                          * If doing dynamic adjustment of transfer
1268                          * size and if it's a read or write call
1269                          * and if the transfer size changed while
1270                          * retransmitting or if the feedback routine
1271                          * changed the transfer size,
1272                          * then exit rfscall so that the transfer
1273                          * size can be adjusted at the vnops level.
1274                          */
1275                         if ((mi->mi_flags & MI_DYNAMIC) &&
1276                             mi->mi_timer_type[which] != 0 &&
1277                             (mi->mi_curread != my_rsize ||
1278                             mi->mi_curwrite != my_wsize ||
1279                             nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1280                                 /*
1281                                  * On read or write calls, return
1282                                  * back to the vnode ops level if
1283                                  * the transfer size changed.
1284                                  */
1285                                 clfree_impl(client, ch, nfscl);
1286                                 if (cred_cloned)
1287                                         crfree(cr);
1288                                 return (ENFS_TRYAGAIN);
1289                         }
1290                 }
1291         } while (tryagain);
1292 
1293         if (status != RPC_SUCCESS) {
1294                 /*
1295                  * Let soft mounts use the timed out message.
1296                  */
1297                 if (status == RPC_INPROGRESS)
1298                         status = RPC_TIMEDOUT;
1299                 nfscl->nfscl_stat.badcalls.value.ui64++;
1300                 if (status != RPC_INTR) {
1301                         mutex_enter(&mi->mi_lock);
1302                         mi->mi_flags |= MI_DOWN;
1303                         mutex_exit(&mi->mi_lock);
1304                         CLNT_GETERR(client, &rpcerr);
1305 #ifdef DEBUG
1306                         bufp = clnt_sperror(client, svp->sv_hostname);
1307                         zprintf(zoneid, "NFS%d %s failed for %s\n",
1308                             mi->mi_vers, mi->mi_rfsnames[which], bufp);
1309                         if (nfs_has_ctty()) {
1310                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1311                                         uprintf("NFS%d %s failed for %s\n",
1312                                             mi->mi_vers, mi->mi_rfsnames[which],
1313                                             bufp);
1314                                 }
1315                         }
1316                         kmem_free(bufp, MAXPATHLEN);
1317 #else
1318                         zprintf(zoneid,
1319                             "NFS %s failed for server %s: error %d (%s)\n",
1320                             mi->mi_rfsnames[which], svp->sv_hostname,
1321                             status, clnt_sperrno(status));
1322                         if (nfs_has_ctty()) {
1323                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1324                                         uprintf(
1325                                 "NFS %s failed for server %s: error %d (%s)\n",
1326                                             mi->mi_rfsnames[which],
1327                                             svp->sv_hostname, status,
1328                                             clnt_sperrno(status));
1329                                 }
1330                         }
1331 #endif
1332                         /*
1333                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1334                          * re_errno is set appropriately depending on
1335                          * the authentication error
1336                          */
1337                         if (status == RPC_VERSMISMATCH ||
1338                             status == RPC_PROGVERSMISMATCH)
1339                                 rpcerr.re_errno = EIO;
1340                 }
1341         } else {
1342                 /*
1343                  * Test the value of mi_down and mi_printed without
1344                  * holding the mi_lock mutex.  If they are both zero,
1345                  * then it is okay to skip the down and printed
                 * processing.  This saves a mutex_enter and mutex_exit
                 * pair for a normal, successful RPC, which would
                 * otherwise be pure overhead.
1349                  */
1350                 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1351                         mutex_enter(&mi->mi_lock);
1352                         mi->mi_flags &= ~MI_DOWN;
1353                         if (mi->mi_flags & MI_PRINTED) {
1354                                 mi->mi_flags &= ~MI_PRINTED;
1355                                 mutex_exit(&mi->mi_lock);
1356 #ifdef DEBUG
1357                         if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358                                 zprintf(zoneid, "NFS%d server %s ok\n",
1359                                     mi->mi_vers, svp->sv_hostname);
1360 #else
1361                         if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362                                 zprintf(zoneid, "NFS server %s ok\n",
1363                                     svp->sv_hostname);
1364 #endif
1365                         } else
1366                                 mutex_exit(&mi->mi_lock);
1367                 }
1368 
1369                 if (*douprintf == 0) {
1370                         if (!(mi->mi_flags & MI_NOPRINT))
1371 #ifdef DEBUG
1372                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1373                                         uprintf("NFS%d server %s ok\n",
1374                                             mi->mi_vers, svp->sv_hostname);
1375 #else
1376                         if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377                                 uprintf("NFS server %s ok\n", svp->sv_hostname);
1378 #endif
1379                         *douprintf = 1;
1380                 }
1381         }
1382 
1383         clfree_impl(client, ch, nfscl);
1384         if (cred_cloned)
1385                 crfree(cr);
1386 
1387         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1388 
1389         if (rpc_status != NULL)
1390                 *rpc_status = rpcerr.re_status;
1391 
1392         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1393             rpcerr.re_errno);
1394 
1395         return (rpcerr.re_errno);
1396 }
1397 
1398 #ifdef DEBUG
1399 static int acl2call_hits = 0;
1400 static int acl2call_misses = 0;
1401 #endif
1402 
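     /*
      * NFS_ACL Version 2 RPC dispatch.  If the call succeeds but the
      * server returns NFSERR_ACCES, retry once with a credential
      * adjusted by crnetadjust().
      */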
1403 int
1404 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1405     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1406     enum nfsstat *statusp, int flags, failinfo_t *fi)
1407 {
1408         int rpcerror;
1409 
1410         rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1411             cr, douprintf, flags, fi);
1412         if (!rpcerror) {
1413                 /*
1414                  * See comments with crnetadjust().
1415                  */
1416                 if (*statusp == NFSERR_ACCES &&
1417                     (cr = crnetadjust(cr)) != NULL) {
1418 #ifdef DEBUG
1419                         acl2call_hits++;
1420 #endif
1421                         rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1422                             resp, cr, douprintf, flags, fi);
1423                         crfree(cr);
1424 #ifdef DEBUG
1425                         if (*statusp == NFSERR_ACCES)
1426                                 acl2call_misses++;
1427 #endif
1428                 }
1429         }
1430 
1431         return (rpcerror);
1432 }
1433 
1434 #ifdef DEBUG
1435 static int acl3call_hits = 0;
1436 static int acl3call_misses = 0;
1437 #endif
1438 
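     /*
      * NFS_ACL Version 3 RPC dispatch.  Retries for as long as the
      * server returns NFS3ERR_JUKEBOX, and on NFS3ERR_ACCES retries
      * once with a credential adjusted by crnetadjust().
      */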
1439 int
1440 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1441     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1442     nfsstat3 *statusp, int flags, failinfo_t *fi)
1443 {
1444         int rpcerror;
1445         int user_informed;
1446 
1447         user_informed = 0;
1448 
1449         do {
1450                 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1451                     cr, douprintf, flags, fi);
1452                 if (!rpcerror) {
1453                         cred_t *crr;
1454                         if (*statusp == NFS3ERR_JUKEBOX) {
1455                                 if (!user_informed) {
1456                                         user_informed = 1;
1457                                         uprintf(
1458                 "file temporarily unavailable on the server, retrying...\n");
1459                                 }
1460                                 delay(nfs3_jukebox_delay);
1461                         }
1462                         /*
1463                          * See crnetadjust() for comments.
1464                          */
1465                         else if (*statusp == NFS3ERR_ACCES &&
1466                             (crr = crnetadjust(cr)) != NULL) {
1467 #ifdef DEBUG
1468                                 acl3call_hits++;
1469 #endif
1470                                 rpcerror = aclcall(mi, which, xdrargs, argsp,
1471                                     xdrres, resp, crr, douprintf, flags, fi);
1472 
1473                                 crfree(crr);
1474 #ifdef DEBUG
1475                                 if (*statusp == NFS3ERR_ACCES)
1476                                         acl3call_misses++;
1477 #endif
1478                         }
1479                 }
1480         } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1481 
1482         return (rpcerror);
1483 }
1484 
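     /*
      * Common dispatch routine for NFS_ACL RPCs, analogous to rfscall().
      * Allocates a client handle, handles client failover and retransmit
      * backoff, and prints the "not responding"/"server ok" messages.
      */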
1485 static int
1486 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1487     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1488     int flags, failinfo_t *fi)
1489 {
1490         CLIENT *client;
1491         struct chtab *ch;
1492         cred_t *cr = icr;
1493         bool_t cred_cloned = FALSE;
1494         enum clnt_stat status;
1495         struct rpc_err rpcerr;
1496         struct timeval wait;
1497         int timeo;              /* in units of hz */
1498 #if 0 /* notyet */
1499         int my_rsize, my_wsize;
1500 #endif
1501         bool_t tryagain;
1502         k_sigset_t smask;
1503         servinfo_t *svp;
1504         struct nfs_clnt *nfscl;
1505         zoneid_t zoneid = getzoneid();
1506 #ifdef DEBUG
1507         char *bufp;
1508 #endif
1509 
1510 #if 0 /* notyet */
1511         TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1512             "rfscall_start:which %d mi %p", which, mi);
1513 #endif
1514 
1515         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1516         ASSERT(nfscl != NULL);
1517 
1518         nfscl->nfscl_stat.calls.value.ui64++;
1519         mi->mi_aclreqs[which].value.ui64++;
1520 
1521         rpcerr.re_status = RPC_SUCCESS;
1522 
1523         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1524                 rpcerr.re_status = RPC_FAILED;
1525                 rpcerr.re_errno = EIO;
1526                 return (rpcerr.re_errno);
1527         }
1528 
1529 #if 0 /* notyet */
1530         /*
1531          * Remember the transfer sizes in case
1532          * nfs_feedback changes them underneath us.
1533          */
1534         my_rsize = mi->mi_curread;
1535         my_wsize = mi->mi_curwrite;
1536 #endif
1537 
1538         /*
1539          * NFS client failover support
1540          *
1541          * If this rnode is not in sync with the current server (VALID_FH),
1542          * we'd like to do a remap to get in sync.  We can be interrupted
1543          * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1544          * use the best info we have to try the RPC.  Part of that is
1545          * unconditionally updating the filehandle copy kept for V3.
1546          *
1547          * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1548          * rw_enter(); we're trying to keep the current server from being
1549          * changed on us until we're done with the remapping and have a
1550          * matching client handle.  We don't want to send a filehandle
1551          * to the wrong host.
1552          */
1553 failoverretry:
1554         if (FAILOVER_MOUNT(mi)) {
1555                 mutex_enter(&mi->mi_lock);
1556                 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1557                         if (failover_wait(mi)) {
1558                                 mutex_exit(&mi->mi_lock);
1559                                 return (EINTR);
1560                         }
1561                 }
1562                 INC_READERS(mi);
1563                 mutex_exit(&mi->mi_lock);
1564                 if (fi) {
1565                         if (!VALID_FH(fi) &&
1566                             !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1567                                 int remaperr;
1568 
1569                                 svp = mi->mi_curr_serv;
1570                                 remaperr = failover_remap(fi);
1571                                 if (remaperr != 0) {
1572 #ifdef DEBUG
1573                                         if (remaperr != EINTR)
1574                                                 nfs_cmn_err(remaperr, CE_WARN,
1575                                             "aclcall couldn't failover: %m");
1576 #endif
1577                                         mutex_enter(&mi->mi_lock);
1578                                         DEC_READERS(mi);
1579                                         mutex_exit(&mi->mi_lock);
1580 
1581                                         /*
1582                                          * If failover_remap returns ETIMEDOUT
1583                                          * and the filesystem is hard mounted
1584                                          * we have to retry the call with a new
1585                                          * server.
1586                                          */
1587                                         if ((mi->mi_flags & MI_HARD) &&
1588                                             IS_RECOVERABLE_ERROR(remaperr)) {
1589                                                 if (svp == mi->mi_curr_serv)
1590                                                         failover_newserver(mi);
1591                                                 rpcerr.re_status = RPC_SUCCESS;
1592                                                 goto failoverretry;
1593                                         }
1594                                         return (remaperr);
1595                                 }
1596                         }
1597                         if (fi->fhp && fi->copyproc)
1598                                 (*fi->copyproc)(fi->fhp, fi->vp);
1599                 }
1600         }
1601 
1602         /* For TSOL, use a new cred which has net_mac_aware flag */
1603         if (!cred_cloned && is_system_labeled()) {
1604                 cred_cloned = TRUE;
1605                 cr = crdup(icr);
1606                 (void) setpflags(NET_MAC_AWARE, 1, cr);
1607         }
1608 
1609         /*
1610          * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1611          * are guaranteed to reprocess the retry as a new request.
1612          */
1613         svp = mi->mi_curr_serv;
1614         rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1615         if (FAILOVER_MOUNT(mi)) {
1616                 mutex_enter(&mi->mi_lock);
1617                 DEC_READERS(mi);
1618                 mutex_exit(&mi->mi_lock);
1619 
1620                 if ((rpcerr.re_errno == ETIMEDOUT ||
1621                     rpcerr.re_errno == ECONNRESET) &&
1622                     failover_safe(fi)) {
1623                         if (svp == mi->mi_curr_serv)
1624                                 failover_newserver(mi);
1625                         goto failoverretry;
1626                 }
1627         }
1628         if (rpcerr.re_errno != 0) {
1629                 if (cred_cloned)
1630                         crfree(cr);
1631                 return (rpcerr.re_errno);
1632         }
1633 
1634         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1635             svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1636                 timeo = (mi->mi_timeo * hz) / 10;
1637         } else {
1638                 mutex_enter(&mi->mi_lock);
1639                 timeo = CLNT_SETTIMERS(client,
1640                     &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1641                     &(mi->mi_timers[NFS_CALLTYPES]),
1642                     (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1643                     (void (*)()) 0, (caddr_t)mi, 0);
1644                 mutex_exit(&mi->mi_lock);
1645         }
1646 
1647         /*
1648          * If hard mounted fs, retry call forever unless hard error occurs.
1649          */
1650         do {
1651                 tryagain = FALSE;
1652 
1653                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1654                         status = RPC_FAILED;
1655                         rpcerr.re_status = RPC_FAILED;
1656                         rpcerr.re_errno = EIO;
1657                         break;
1658                 }
1659 
1660                 TICK_TO_TIMEVAL(timeo, &wait);
1661 
1662                 /*
1663                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT,
1664                  * and SIGTERM, preserving the existing signal masks.
1665                  * Mask out SIGINT if the nointr mount option is specified.
1666                  */
1667                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1668                 if (!(mi->mi_flags & MI_INT))
1669                         client->cl_nosignal = TRUE;
1670 
1671                 /*
1672                  * If there is a current signal, then don't bother
1673                  * even trying to send out the request because we
1674                  * won't be able to block waiting for the response.
1675                  * Simply assume RPC_INTR and get on with it.
1676                  */
1677                 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1678                         status = RPC_INTR;
1679                 else {
1680                         status = CLNT_CALL(client, which, xdrargs, argsp,
1681                             xdrres, resp, wait);
1682                 }
1683 
1684                 if (!(mi->mi_flags & MI_INT))
1685                         client->cl_nosignal = FALSE;
1686                 /*
1687                  * restore original signal mask
1688                  */
1689                 sigunintr(&smask);
1690 
1691                 switch (status) {
1692                 case RPC_SUCCESS:
1693 #if 0 /* notyet */
1694                         if ((mi->mi_flags & MI_DYNAMIC) &&
1695                             mi->mi_timer_type[which] != 0 &&
1696                             (mi->mi_curread != my_rsize ||
1697                             mi->mi_curwrite != my_wsize))
1698                                 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1699 #endif
1700                         break;
1701 
1702                 /*
1703                  * Unfortunately, there are servers in the world which
1704                  * are not coded correctly.  They are not prepared to
1705                  * handle RPC requests to the NFS port which are not
1706                  * NFS requests.  Thus, they may try to process the
1707                  * NFS_ACL request as if it were an NFS request.  This
1708                  * does not work.  Generally, an error will be generated
1709                  * on the client because it will not be able to decode
1710                  * the response from the server.  However, it seems
1711                  * possible that the server may not be able to decode
1712                  * the arguments.  Thus, the criterion for deciding
1713                  * whether the server supports NFS_ACL or not is whether
1714                  * the following RPC errors are returned from CLNT_CALL.
1715                  */
1716                 case RPC_CANTDECODERES:
1717                 case RPC_PROGUNAVAIL:
1718                 case RPC_CANTDECODEARGS:
1719                 case RPC_PROGVERSMISMATCH:
1720                         mutex_enter(&mi->mi_lock);
1721                         mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1722                         mutex_exit(&mi->mi_lock);
1723                         break;
1724 
1725                 /*
1726                  * If the server supports NFS_ACL but not the new ops
1727                  * for extended attributes, make sure we don't retry.
1728                  */
1729                 case RPC_PROCUNAVAIL:
1730                         mutex_enter(&mi->mi_lock);
1731                         mi->mi_flags &= ~MI_EXTATTR;
1732                         mutex_exit(&mi->mi_lock);
1733                         break;
1734 
1735                 case RPC_INTR:
1736                         /*
1737                          * There is no way to recover from this error,
1738                          * even if mount option nointr is specified.
1739                          * SIGKILL, for example, cannot be blocked.
1740                          */
1741                         rpcerr.re_status = RPC_INTR;
1742                         rpcerr.re_errno = EINTR;
1743                         break;
1744 
1745                 case RPC_UDERROR:
1746                         /*
1747                          * If the NFS server is local (vold) and
1748                          * it goes away then we get RPC_UDERROR.
1749                          * This is a retryable error, so we would
1750                          * loop; instead, check whether the specific
1751                          * error was ECONNRESET, indicating that the
1752                          * target did not exist at all.  If so,
1753                          * return with RPC_PROGUNAVAIL and
1754                          * ECONNRESET to indicate why.
1755                          */
1756                         CLNT_GETERR(client, &rpcerr);
1757                         if (rpcerr.re_errno == ECONNRESET) {
1758                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1759                                 rpcerr.re_errno = ECONNRESET;
1760                                 break;
1761                         }
1762                         /*FALLTHROUGH*/
1763 
1764                 default:                /* probably RPC_TIMEDOUT */
1765                         if (IS_UNRECOVERABLE_RPC(status))
1766                                 break;
1767 
1768                         /*
1769                          * increment server not responding count
1770                          */
1771                         mutex_enter(&mi->mi_lock);
1772                         mi->mi_noresponse++;
1773                         mutex_exit(&mi->mi_lock);
1774 #ifdef DEBUG
1775                         nfscl->nfscl_stat.noresponse.value.ui64++;
1776 #endif
1777 
1778                         if (!(mi->mi_flags & MI_HARD)) {
1779                                 if (!(mi->mi_flags & MI_SEMISOFT) ||
1780                                     (mi->mi_acl_ss_call_type[which] == 0))
1781                                         break;
1782                         }
1783 
1784                         /*
1785                          * The call is in progress (over COTS).
1786                          * Try the CLNT_CALL again, but don't
1787                          * print a noisy error message.
1788                          */
1789                         if (status == RPC_INPROGRESS) {
1790                                 tryagain = TRUE;
1791                                 break;
1792                         }
1793 
1794                         if (flags & RFSCALL_SOFT)
1795                                 break;
1796 
1797                         /*
1798                          * On zone shutdown, just move on.
1799                          */
1800                         if (zone_status_get(curproc->p_zone) >=
1801                             ZONE_IS_SHUTTING_DOWN) {
1802                                 rpcerr.re_status = RPC_FAILED;
1803                                 rpcerr.re_errno = EIO;
1804                                 break;
1805                         }
1806 
1807                         /*
1808                          * NFS client failover support
1809                          *
1810                          * If the current server just failed us, we'll
1811                          * start the process of finding a new server.
1812                          * After that, we can just retry.
1813                          */
1814                         if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1815                                 if (svp == mi->mi_curr_serv)
1816                                         failover_newserver(mi);
1817                                 clfree_impl(client, ch, nfscl);
1818                                 goto failoverretry;
1819                         }
1820 
1821                         tryagain = TRUE;
1822                         timeo = backoff(timeo);
1823                         mutex_enter(&mi->mi_lock);
1824                         if (!(mi->mi_flags & MI_PRINTED)) {
1825                                 mi->mi_flags |= MI_PRINTED;
1826                                 mutex_exit(&mi->mi_lock);
1827 #ifdef DEBUG
1828                                 zprintf(zoneid,
1829                         "NFS_ACL%d server %s not responding still trying\n",
1830                                     mi->mi_vers, svp->sv_hostname);
1831 #else
1832                                 zprintf(zoneid,
1833                             "NFS server %s not responding still trying\n",
1834                                     svp->sv_hostname);
1835 #endif
1836                         } else
1837                                 mutex_exit(&mi->mi_lock);
1838                         if (*douprintf && nfs_has_ctty()) {
1839                                 *douprintf = 0;
1840                                 if (!(mi->mi_flags & MI_NOPRINT))
1841 #ifdef DEBUG
1842                                         uprintf(
1843                         "NFS_ACL%d server %s not responding still trying\n",
1844                                             mi->mi_vers, svp->sv_hostname);
1845 #else
1846                                         uprintf(
1847                             "NFS server %s not responding still trying\n",
1848                                             svp->sv_hostname);
1849 #endif
1850                         }
1851 
1852 #if 0 /* notyet */
1853                         /*
1854                          * If doing dynamic adjustment of transfer
1855                          * size and if it's a read or write call
1856                          * and if the transfer size changed while
1857                          * retransmitting or if the feedback routine
1858                          * changed the transfer size,
1859                          * then exit rfscall so that the transfer
1860                          * size can be adjusted at the vnops level.
1861                          */
1862                         if ((mi->mi_flags & MI_DYNAMIC) &&
1863                             mi->mi_acl_timer_type[which] != 0 &&
1864                             (mi->mi_curread != my_rsize ||
1865                             mi->mi_curwrite != my_wsize ||
1866                             nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1867                                 /*
1868                                  * On read or write calls, return
1869                                  * back to the vnode ops level if
1870                                  * the transfer size changed.
1871                                  */
1872                                 clfree_impl(client, ch, nfscl);
1873                                 if (cred_cloned)
1874                                         crfree(cr);
1875                                 return (ENFS_TRYAGAIN);
1876                         }
1877 #endif
1878                 }
1879         } while (tryagain);
1880 
1881         if (status != RPC_SUCCESS) {
1882                 /*
1883                  * Let soft mounts use the timed out message.
1884                  */
1885                 if (status == RPC_INPROGRESS)
1886                         status = RPC_TIMEDOUT;
1887                 nfscl->nfscl_stat.badcalls.value.ui64++;
1888                 if (status == RPC_CANTDECODERES ||
1889                     status == RPC_PROGUNAVAIL ||
1890                     status == RPC_PROCUNAVAIL ||
1891                     status == RPC_CANTDECODEARGS ||
1892                     status == RPC_PROGVERSMISMATCH)
1893                         CLNT_GETERR(client, &rpcerr);
1894                 else if (status != RPC_INTR) {
1895                         mutex_enter(&mi->mi_lock);
1896                         mi->mi_flags |= MI_DOWN;
1897                         mutex_exit(&mi->mi_lock);
1898                         CLNT_GETERR(client, &rpcerr);
1899 #ifdef DEBUG
1900                         bufp = clnt_sperror(client, svp->sv_hostname);
1901                         zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1902                             mi->mi_vers, mi->mi_aclnames[which], bufp);
1903                         if (nfs_has_ctty()) {
1904                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1905                                         uprintf("NFS_ACL%d %s failed for %s\n",
1906                                             mi->mi_vers, mi->mi_aclnames[which],
1907                                             bufp);
1908                                 }
1909                         }
1910                         kmem_free(bufp, MAXPATHLEN);
1911 #else
1912                         zprintf(zoneid,
1913                             "NFS %s failed for server %s: error %d (%s)\n",
1914                             mi->mi_aclnames[which], svp->sv_hostname,
1915                             status, clnt_sperrno(status));
1916                         if (nfs_has_ctty()) {
1917                                 if (!(mi->mi_flags & MI_NOPRINT))
1918                                         uprintf(
1919                                 "NFS %s failed for server %s: error %d (%s)\n",
1920                                             mi->mi_aclnames[which],
1921                                             svp->sv_hostname, status,
1922                                             clnt_sperrno(status));
1923                         }
1924 #endif
1925                         /*
1926                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1927                          * re_errno is set appropriately depending on
1928                          * the authentication error
1929                          */
1930                         if (status == RPC_VERSMISMATCH ||
1931                             status == RPC_PROGVERSMISMATCH)
1932                                 rpcerr.re_errno = EIO;
1933                 }
1934         } else {
1935                 /*
1936                  * Test the value of mi_down and mi_printed without
1937                  * holding the mi_lock mutex.  If they are both zero,
1938                  * then it is okay to skip the down and printed
1939                  * processing.  This saves a mutex_enter and
1940                  * mutex_exit pair for a normal, successful RPC,
1941                  * which would otherwise be pure overhead.
1942                  */
1943                 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1944                         mutex_enter(&mi->mi_lock);
1945                         mi->mi_flags &= ~MI_DOWN;
1946                         if (mi->mi_flags & MI_PRINTED) {
1947                                 mi->mi_flags &= ~MI_PRINTED;
1948                                 mutex_exit(&mi->mi_lock);
1949 #ifdef DEBUG
1950                                 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1951                                     mi->mi_vers, svp->sv_hostname);
1952 #else
1953                                 zprintf(zoneid, "NFS server %s ok\n",
1954                                     svp->sv_hostname);
1955 #endif
1956                         } else
1957                                 mutex_exit(&mi->mi_lock);
1958                 }
1959 
1960                 if (*douprintf == 0) {
1961                         if (!(mi->mi_flags & MI_NOPRINT))
1962 #ifdef DEBUG
1963                                 uprintf("NFS_ACL%d server %s ok\n",
1964                                     mi->mi_vers, svp->sv_hostname);
1965 #else
1966                                 uprintf("NFS server %s ok\n", svp->sv_hostname);
1967 #endif
1968                         *douprintf = 1;
1969                 }
1970         }
1971 
1972         clfree_impl(client, ch, nfscl);
1973         if (cred_cloned)
1974                 crfree(cr);
1975 
1976         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1977 
1978 #if 0 /* notyet */
1979         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1980             rpcerr.re_errno);
1981 #endif
1982 
1983         return (rpcerr.re_errno);
1984 }
1985 
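     /*
      * Convert a vattr into an NFS Version 2 sattr.  Attributes not
      * selected by va_mask are set to -1 so that the server will leave
      * them unchanged.  Returns EOVERFLOW if a requested time cannot be
      * represented over the wire.
      */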
1986 int
1987 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1988 {
1989         uint_t mask = vap->va_mask;
1990 
1991         if (!(mask & AT_MODE))
1992                 sa->sa_mode = (uint32_t)-1;
1993         else
1994                 sa->sa_mode = vap->va_mode;
1995         if (!(mask & AT_UID))
1996                 sa->sa_uid = (uint32_t)-1;
1997         else
1998                 sa->sa_uid = (uint32_t)vap->va_uid;
1999         if (!(mask & AT_GID))
2000                 sa->sa_gid = (uint32_t)-1;
2001         else
2002                 sa->sa_gid = (uint32_t)vap->va_gid;
2003         if (!(mask & AT_SIZE))
2004                 sa->sa_size = (uint32_t)-1;
2005         else
2006                 sa->sa_size = (uint32_t)vap->va_size;
2007         if (!(mask & AT_ATIME))
2008                 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2009         else {
2010                 /* check time validity */
2011                 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2012                         return (EOVERFLOW);
2013                 }
2014                 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2015                 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2016         }
2017         if (!(mask & AT_MTIME))
2018                 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2019         else {
2020                 /* check time validity */
2021                 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2022                         return (EOVERFLOW);
2023                 }
2024                 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2025                 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2026         }
2027         return (0);
2028 }
2029 
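     /*
      * Convert a vattr into an NFS Version 3 sattr3.  For attributes not
      * selected by va_mask, set_it is cleared (DONT_CHANGE for the times)
      * so that the server will leave them unchanged.  Returns EOVERFLOW
      * if a requested time cannot be represented over the wire.
      */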
2030 int
2031 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2032 {
2033         uint_t mask = vap->va_mask;
2034 
2035         if (!(mask & AT_MODE))
2036                 sa->mode.set_it = FALSE;
2037         else {
2038                 sa->mode.set_it = TRUE;
2039                 sa->mode.mode = (mode3)vap->va_mode;
2040         }
2041         if (!(mask & AT_UID))
2042                 sa->uid.set_it = FALSE;
2043         else {
2044                 sa->uid.set_it = TRUE;
2045                 sa->uid.uid = (uid3)vap->va_uid;
2046         }
2047         if (!(mask & AT_GID))
2048                 sa->gid.set_it = FALSE;
2049         else {
2050                 sa->gid.set_it = TRUE;
2051                 sa->gid.gid = (gid3)vap->va_gid;
2052         }
2053         if (!(mask & AT_SIZE))
2054                 sa->size.set_it = FALSE;
2055         else {
2056                 sa->size.set_it = TRUE;
2057                 sa->size.size = (size3)vap->va_size;
2058         }
2059         if (!(mask & AT_ATIME))
2060                 sa->atime.set_it = DONT_CHANGE;
2061         else {
2062                 /* check time validity */
2063                 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2064                         return (EOVERFLOW);
2065                 }
2066                 sa->atime.set_it = SET_TO_CLIENT_TIME;
2067                 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2068                 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2069         }
2070         if (!(mask & AT_MTIME))
2071                 sa->mtime.set_it = DONT_CHANGE;
2072         else {
2073                 /* check time validity */
2074                 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2075                         return (EOVERFLOW);
2076                 }
2077                 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2078                 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2079                 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2080         }
2081         return (0);
2082 }
2083 
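     /*
      * Fill in the NFS Version 2 directory operation arguments from the
      * parent directory vnode and the component name.
      */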
2084 void
2085 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2086 {
2087 
2088         da->da_fhandle = VTOFH(dvp);
2089         da->da_name = nm;
2090         da->da_flags = 0;
2091 }
2092 
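     /*
      * Fill in the NFS Version 3 directory operation arguments from the
      * parent directory vnode and the component name.
      */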
2093 void
2094 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2095 {
2096 
2097         da->dirp = VTOFH3(dvp);
2098         da->name = nm;
2099 }
2100 
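     /*
      * Determine the expected group-id for a file created in the
      * directory dvp, honoring the BSD GRPID mount option and the
      * directory's set-gid bit.
      */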
2101 int
2102 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2103 {
2104         int error;
2105         rnode_t *rp;
2106         struct vattr va;
2107 
2108         va.va_mask = AT_MODE | AT_GID;
2109         error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2110         if (error)
2111                 return (error);
2112 
2113         /*
2114          * To determine the expected group-id of the created file:
2115          *  1)  If the filesystem was not mounted with the Old-BSD-compatible
2116          *      GRPID option, and the directory's set-gid bit is clear,
2117          *      then use the process's gid.
2118          *  2)  Otherwise, set the group-id to the gid of the parent directory.
2119          */
2120         rp = VTOR(dvp);
2121         mutex_enter(&rp->r_statelock);
2122         if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2123                 *gidp = crgetgid(cr);
2124         else
2125                 *gidp = va.va_gid;
2126         mutex_exit(&rp->r_statelock);
2127         return (0);
2128 }
2129 
2130 int
2131 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2132 {
2133         int error;
2134         struct vattr va;
2135 
2136         va.va_mask = AT_MODE;
2137         error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2138         if (error)
2139                 return (error);
2140 
2141         /*
2142          * Modify the expected mode (om) so that the set-gid bit matches
2143          * that of the parent directory (dvp).
2144          */
2145         if (va.va_mode & VSGID)
2146                 *omp |= VSGID;
2147         else
2148                 *omp &= ~VSGID;
2149         return (0);
2150 }
2151 
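     /*
      * Set or clear the VSWAPLIKE flag on the vnode.  A regular file
      * whose mode has the sticky bit set but the owner execute bit
      * clear is treated as swap-like.
      */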
2152 void
2153 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2154 {
2155 
2156         if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2157                 if (!(vp->v_flag & VSWAPLIKE)) {
2158                         mutex_enter(&vp->v_lock);
2159                         vp->v_flag |= VSWAPLIKE;
2160                         mutex_exit(&vp->v_lock);
2161                 }
2162         } else {
2163                 if (vp->v_flag & VSWAPLIKE) {
2164                         mutex_enter(&vp->v_lock);
2165                         vp->v_flag &= ~VSWAPLIKE;
2166                         mutex_exit(&vp->v_lock);
2167                 }
2168         }
2169 }
2170 
2171 /*
2172  * Free the resources associated with an rnode.
2173  */
2174 static void
2175 rinactive(rnode_t *rp, cred_t *cr)
2176 {
2177         vnode_t *vp;
2178         cred_t *cred;
2179         char *contents;
2180         int size;
2181         vsecattr_t *vsp;
2182         int error;
2183         nfs3_pathconf_info *info;
2184 
2185         /*
2186          * Before freeing anything, wait until all asynchronous
2187          * activity is done on this rnode.  This will allow all
2188          * asynchronous read ahead and write behind i/o's to
2189          * finish.
2190          */
2191         mutex_enter(&rp->r_statelock);
2192         while (rp->r_count > 0)
2193                 cv_wait(&rp->r_cv, &rp->r_statelock);
2194         mutex_exit(&rp->r_statelock);
2195 
2196         /*
2197          * Flush and invalidate all pages associated with the vnode.
2198          */
2199         vp = RTOV(rp);
2200         if (vn_has_cached_data(vp)) {
2201                 ASSERT(vp->v_type != VCHR);
2202                 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2203                         error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2204                         if (error && (error == ENOSPC || error == EDQUOT)) {
2205                                 mutex_enter(&rp->r_statelock);
2206                                 if (!rp->r_error)
2207                                         rp->r_error = error;
2208                                 mutex_exit(&rp->r_statelock);
2209                         }
2210                 }
2211                 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2212         }
2213 
2214         /*
2215          * Free any held credentials and caches which may be associated
2216          * with this rnode.
2217          */
2218         mutex_enter(&rp->r_statelock);
2219         cred = rp->r_cred;
2220         rp->r_cred = NULL;
2221         contents = rp->r_symlink.contents;
2222         size = rp->r_symlink.size;
2223         rp->r_symlink.contents = NULL;
2224         vsp = rp->r_secattr;
2225         rp->r_secattr = NULL;
2226         info = rp->r_pathconf;
2227         rp->r_pathconf = NULL;
2228         mutex_exit(&rp->r_statelock);
2229 
2230         /*
2231          * Free the held credential.
2232          */
2233         if (cred != NULL)
2234                 crfree(cred);
2235 
2236         /*
2237          * Free the access cache entries.
2238          */
2239         (void) nfs_access_purge_rp(rp);
2240 
2241         /*
2242          * Free the readdir cache entries.
2243          */
2244         if (HAVE_RDDIR_CACHE(rp))
2245                 nfs_purge_rddir_cache(vp);
2246 
2247         /*
2248          * Free the symbolic link cache.
2249          */
2250         if (contents != NULL) {
2251 
2252                 kmem_free((void *)contents, size);
2253         }
2254 
2255         /*
2256          * Free any cached ACL.
2257          */
2258         if (vsp != NULL)
2259                 nfs_acl_free(vsp);
2260 
2261         /*
2262          * Free any cached pathconf information.
2263          */
2264         if (info != NULL)
2265                 kmem_free(info, sizeof (*info));
2266 }
2267 
2268 /*
2269  * Return a vnode for the given NFS Version 2 file handle.
2270  * If no rnode exists for this fhandle, create one and put it
2271  * into the hash queues.  If the rnode for this fhandle
2272  * already exists, return it.
2273  *
2274  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2275  */
2276 vnode_t *
2277 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2278     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2279 {
2280         int newnode;
2281         int index;
2282         vnode_t *vp;
2283         nfs_fhandle nfh;
2284         vattr_t va;
2285 
2286         nfh.fh_len = NFS_FHSIZE;
2287         bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2288 
2289         index = rtablehash(&nfh);
2290         rw_enter(&rtable[index].r_lock, RW_READER);
2291 
2292         vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2293             nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2294 
2295         if (attr != NULL) {
2296                 if (!newnode) {
2297                         rw_exit(&rtable[index].r_lock);
2298                         (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2299                 } else {
2300                         if (attr->na_type < NFNON || attr->na_type > NFSOC)
2301                                 vp->v_type = VBAD;
2302                         else
2303                                 vp->v_type = n2v_type(attr);
2304                         /*
2305                          * A translation here seems to be necessary
2306                          * because this function can be called
2307                          * with `attr' that has come from the wire,
2308                          * and been operated on by vattr_to_nattr().
2309                          * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2310                          * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2311                          * ->makenfsnode().
2312                          */
2313                         if ((attr->na_rdev & 0xffff0000) == 0)
2314                                 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2315                         else
2316                                 vp->v_rdev = expldev(n2v_rdev(attr));
2317                         nfs_attrcache(vp, attr, t);
2318                         rw_exit(&rtable[index].r_lock);
2319                 }
2320         } else {
2321                 if (newnode) {
2322                         PURGE_ATTRCACHE(vp);
2323                 }
2324                 rw_exit(&rtable[index].r_lock);
2325         }
2326 
2327         return (vp);
2328 }
2329 
2330 /*
2331  * Return a vnode for the given NFS Version 3 file handle.
2332  * If no rnode exists for this fhandle, create one and put it
2333  * into the hash queues.  If the rnode for this fhandle
2334  * already exists, return it.
2335  *
2336  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2337  */
2338 vnode_t *
2339 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2340     cred_t *cr, char *dnm, char *nm)
2341 {
2342         int newnode;
2343         int index;
2344         vnode_t *vp;
2345 
2346         index = rtablehash((nfs_fhandle *)fh);
2347         rw_enter(&rtable[index].r_lock, RW_READER);
2348 
2349         vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2350             nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2351             dnm, nm);
2352 
2353         if (vap == NULL) {
2354                 if (newnode) {
2355                         PURGE_ATTRCACHE(vp);
2356                 }
2357                 rw_exit(&rtable[index].r_lock);
2358                 return (vp);
2359         }
2360 
2361         if (!newnode) {
2362                 rw_exit(&rtable[index].r_lock);
2363                 nfs_attr_cache(vp, vap, t, cr);
2364         } else {
2365                 rnode_t *rp = VTOR(vp);
2366 
2367                 vp->v_type = vap->va_type;
2368                 vp->v_rdev = vap->va_rdev;
2369 
2370                 mutex_enter(&rp->r_statelock);
2371                 if (rp->r_mtime <= t)
2372                         nfs_attrcache_va(vp, vap);
2373                 mutex_exit(&rp->r_statelock);
2374                 rw_exit(&rtable[index].r_lock);
2375         }
2376 
2377         return (vp);
2378 }
2379 
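     /*
      * Return a vnode for the given NFS Version 3 file handle and
      * wire-format attributes.  Behaves like makenfs3node_va(), but
      * takes a fattr3 rather than a vattr.
      */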
2380 vnode_t *
2381 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2382     cred_t *cr, char *dnm, char *nm)
2383 {
2384         int newnode;
2385         int index;
2386         vnode_t *vp;
2387         vattr_t va;
2388 
2389         index = rtablehash((nfs_fhandle *)fh);
2390         rw_enter(&rtable[index].r_lock, RW_READER);
2391 
2392         vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2393             nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2394             dnm, nm);
2395 
2396         if (attr == NULL) {
2397                 if (newnode) {
2398                         PURGE_ATTRCACHE(vp);
2399                 }
2400                 rw_exit(&rtable[index].r_lock);
2401                 return (vp);
2402         }
2403 
2404         if (!newnode) {
2405                 rw_exit(&rtable[index].r_lock);
2406                 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2407         } else {
2408                 if (attr->type < NF3REG || attr->type > NF3FIFO)
2409                         vp->v_type = VBAD;
2410                 else
2411                         vp->v_type = nf3_to_vt[attr->type];
2412                 vp->v_rdev = makedevice(attr->rdev.specdata1,
2413                     attr->rdev.specdata2);
2414                 nfs3_attrcache(vp, attr, t);
2415                 rw_exit(&rtable[index].r_lock);
2416         }
2417 
2418         return (vp);
2419 }
2420 
2421 /*
2422  * Read this comment before making changes to rtablehash()!
2423  * This is a hash function in which seemingly obvious and harmless
2424  * changes can cause escalations costing millions of dollars!
2425  * Know what you are doing.
2426  *
2427  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2428  * algorithm is currently detailed here:
2429  *
2430  *   http://burtleburtle.net/bob/hash/doobs.html
2431  *
2432  * Of course, the above link may not be valid by the time you are reading
2433  * this, but suffice it to say that the one-at-a-time algorithm works well in
2434  * almost all cases.  If you are changing the algorithm be sure to verify that
2435  * the hash algorithm still provides even distribution in all cases and with
2436  * any server returning filehandles in whatever order (sequential or random).
2437  */
2438 static int
2439 rtablehash(nfs_fhandle *fh)
2440 {
2441         ulong_t hash, len, i;
2442         char *key;
2443 
2444         key = fh->fh_buf;
2445         len = (ulong_t)fh->fh_len;
2446         for (hash = 0, i = 0; i < len; i++) {
2447                 hash += key[i];
2448                 hash += (hash << 10);
2449                 hash ^= (hash >> 6);
2450         }
2451         hash += (hash << 3);
2452         hash ^= (hash >> 11);
2453         hash += (hash << 15);
2454         return (hash & rtablemask);
2455 }
2456 
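     /*
      * Find the rnode for the given filehandle in the supplied hash
      * bucket, or create and initialize a new one, reclaiming an rnode
      * from the freelist when the allocation limit has been reached.
      * *newnode is set to indicate whether an existing rnode was found
      * or a new one was created.  Called with the hash bucket lock held;
      * the lock may be upgraded to exclusive.
      */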
2457 static vnode_t *
2458 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2459     struct vnodeops *vops,
2460     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2461     int (*compar)(const void *, const void *),
2462     int *newnode, cred_t *cr, char *dnm, char *nm)
2463 {
2464         rnode_t *rp;
2465         rnode_t *trp;
2466         vnode_t *vp;
2467         mntinfo_t *mi;
2468 
2469         ASSERT(RW_READ_HELD(&rhtp->r_lock));
2470 
2471         mi = VFTOMI(vfsp);
2472 start:
2473         if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2474                 vp = RTOV(rp);
2475                 nfs_set_vroot(vp);
2476                 *newnode = 0;
2477                 return (vp);
2478         }
2479         rw_exit(&rhtp->r_lock);
2480 
2481         mutex_enter(&rpfreelist_lock);
2482         if (rpfreelist != NULL && rnew >= nrnode) {
2483                 rp = rpfreelist;
2484                 rp_rmfree(rp);
2485                 mutex_exit(&rpfreelist_lock);
2486 
2487                 vp = RTOV(rp);
2488 
2489                 if (rp->r_flags & RHASHED) {
2490                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2491                         mutex_enter(&vp->v_lock);
2492                         if (vp->v_count > 1) {
2493                                 VN_RELE_LOCKED(vp);
2494                                 mutex_exit(&vp->v_lock);
2495                                 rw_exit(&rp->r_hashq->r_lock);
2496                                 rw_enter(&rhtp->r_lock, RW_READER);
2497                                 goto start;
2498                         }
2499                         mutex_exit(&vp->v_lock);
2500                         rp_rmhash_locked(rp);
2501                         rw_exit(&rp->r_hashq->r_lock);
2502                 }
2503 
2504                 rinactive(rp, cr);
2505 
2506                 mutex_enter(&vp->v_lock);
2507                 if (vp->v_count > 1) {
2508                         VN_RELE_LOCKED(vp);
2509                         mutex_exit(&vp->v_lock);
2510                         rw_enter(&rhtp->r_lock, RW_READER);
2511                         goto start;
2512                 }
2513                 mutex_exit(&vp->v_lock);
2514                 vn_invalid(vp);
2515                 /*
2516                  * destroy old locks before bzero'ing and
2517                  * recreating the locks below.
2518                  */
2519                 nfs_rw_destroy(&rp->r_rwlock);
2520                 nfs_rw_destroy(&rp->r_lkserlock);
2521                 mutex_destroy(&rp->r_statelock);
2522                 cv_destroy(&rp->r_cv);
2523                 cv_destroy(&rp->r_commit.c_cv);
2524                 nfs_free_r_path(rp);
2525                 avl_destroy(&rp->r_dir);
2526                 /*
2527                  * Make sure that if the rnode is recycled, the
2528                  * VFS reference count is decremented properly
2529                  * before reuse.
2530                  */
2531                 VFS_RELE(vp->v_vfsp);
2532                 vn_reinit(vp);
2533         } else {
2534                 vnode_t *new_vp;
2535 
2536                 mutex_exit(&rpfreelist_lock);
2537 
2538                 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2539                 new_vp = vn_alloc(KM_SLEEP);
2540 
2541                 atomic_inc_ulong((ulong_t *)&rnew);
2542 #ifdef DEBUG
2543                 clstat_debug.nrnode.value.ui64++;
2544 #endif
2545                 vp = new_vp;
2546         }
2547 
2548         bzero(rp, sizeof (*rp));
2549         rp->r_vnode = vp;
2550         nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2551         nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2552         mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2553         cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2554         cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2555         rp->r_fh.fh_len = fh->fh_len;
2556         bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2557         rp->r_server = mi->mi_curr_serv;
2558         if (FAILOVER_MOUNT(mi)) {
2559                 /*
2560                  * If replicated servers, stash pathnames
2561                  */
2562                 if (dnm != NULL && nm != NULL) {
2563                         char *s, *p;
2564                         uint_t len;
2565 
2566                         len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2567                         rp->r_path = kmem_alloc(len, KM_SLEEP);
2568 #ifdef DEBUG
2569                         clstat_debug.rpath.value.ui64 += len;
2570 #endif
2571                         s = rp->r_path;
2572                         for (p = dnm; *p; p++)
2573                                 *s++ = *p;
2574                         *s++ = '/';
2575                         for (p = nm; *p; p++)
2576                                 *s++ = *p;
2577                         *s = '\0';
2578                 } else {
2579                         /* special case for root */
2580                         rp->r_path = kmem_alloc(2, KM_SLEEP);
2581 #ifdef DEBUG
2582                         clstat_debug.rpath.value.ui64 += 2;
2583 #endif
2584                         *rp->r_path = '.';
2585                         *(rp->r_path + 1) = '\0';
2586                 }
2587         }
2588         VFS_HOLD(vfsp);
2589         rp->r_putapage = putapage;
2590         rp->r_hashq = rhtp;
2591         rp->r_flags = RREADDIRPLUS;
2592         avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2593             offsetof(rddir_cache, tree));
2594         vn_setops(vp, vops);
2595         vp->v_data = (caddr_t)rp;
2596         vp->v_vfsp = vfsp;
2597         vp->v_type = VNON;
2598         vp->v_flag |= VMODSORT;
2599         nfs_set_vroot(vp);
2600 
2601         /*
2602          * There is a race condition if someone else
2603          * allocates the rnode while no locks are held, so we
2604          * check again and recover if found.
2605          */
2606         rw_enter(&rhtp->r_lock, RW_WRITER);
2607         if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2608                 vp = RTOV(trp);
2609                 nfs_set_vroot(vp);
2610                 *newnode = 0;
2611                 rw_exit(&rhtp->r_lock);
2612                 rp_addfree(rp, cr);
2613                 rw_enter(&rhtp->r_lock, RW_READER);
2614                 return (vp);
2615         }
2616         rp_addhash(rp);
2617         *newnode = 1;
2618         return (vp);
2619 }
2620 
2621 /*
2622  * Callback function to check if the page should be marked as
2623  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2624  */
2625 int
2626 nfs_setmod_check(page_t *pp)
2627 {
2628         if (pp->p_fsdata != C_NOCOMMIT) {
2629                 pp->p_fsdata = C_NOCOMMIT;
2630                 return (1);
2631         }
2632         return (0);
2633 }
2634 
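     /*
      * Mark the vnode as the root of the filesystem if its filehandle
      * matches the filehandle of the server's root.
      */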
2635 static void
2636 nfs_set_vroot(vnode_t *vp)
2637 {
2638         rnode_t *rp;
2639         nfs_fhandle *rootfh;
2640 
2641         rp = VTOR(vp);
2642         rootfh = &rp->r_server->sv_fhandle;
2643         if (rootfh->fh_len == rp->r_fh.fh_len &&
2644             bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2645                 if (!(vp->v_flag & VROOT)) {
2646                         mutex_enter(&vp->v_lock);
2647                         vp->v_flag |= VROOT;
2648                         mutex_exit(&vp->v_lock);
2649                 }
2650         }
2651 }
2652 
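     /*
      * Free the pathname stashed in the rnode for failover remapping,
      * if any.
      */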
2653 static void
2654 nfs_free_r_path(rnode_t *rp)
2655 {
2656         char *path;
2657         size_t len;
2658 
2659         path = rp->r_path;
2660         if (path) {
2661                 rp->r_path = NULL;
2662                 len = strlen(path) + 1;
2663                 kmem_free(path, len);
2664 #ifdef DEBUG
2665                 clstat_debug.rpath.value.ui64 -= len;
2666 #endif
2667         }
2668 }
2669 
2670 /*
2671  * Put an rnode on the free list.
2672  *
2673  * Rnodes which were allocated above and beyond the normal limit
2674  * are immediately freed.
2675  */
2676 void
2677 rp_addfree(rnode_t *rp, cred_t *cr)
2678 {
2679         vnode_t *vp;
2680         struct vfs *vfsp;
2681 
2682         vp = RTOV(rp);
2683         ASSERT(vp->v_count >= 1);
2684         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2685 
2686         /*
2687          * If we have too many rnodes allocated and there are no
2688          * references to this rnode, or if the rnode is no longer
2689          * accessible because it does not reside in the hash queues,
2690          * or if an i/o error occurred while writing to the file,
2691          * then just free it instead of putting it on the rnode
2692          * freelist.
2693          */
2694         vfsp = vp->v_vfsp;
2695         if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2696             (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2697                 if (rp->r_flags & RHASHED) {
2698                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2699                         mutex_enter(&vp->v_lock);
2700                         if (vp->v_count > 1) {
2701                                 VN_RELE_LOCKED(vp);
2702                                 mutex_exit(&vp->v_lock);
2703                                 rw_exit(&rp->r_hashq->r_lock);
2704                                 return;
2705                         }
2706                         mutex_exit(&vp->v_lock);
2707                         rp_rmhash_locked(rp);
2708                         rw_exit(&rp->r_hashq->r_lock);
2709                 }
2710 
2711                 rinactive(rp, cr);
2712 
2713                 /*
2714                  * Recheck the vnode reference count.  We need to
2715                  * make sure that another reference has not been
2716                  * acquired while we were not holding v_lock.  The
2717                  * rnode is not in the rnode hash queues, so the
2718                  * only way for a reference to have been acquired
2719          * is via VOP_PUTPAGE, either because the rnode was
2720          * marked with RDIRTY or because of a modified page.  This
2721                  * reference may have been acquired before our call
2722                  * to rinactive.  The i/o may have been completed,
2723                  * thus allowing rinactive to complete, but the
2724                  * reference to the vnode may not have been released
2725                  * yet.  In any case, the rnode can not be destroyed
2726                  * until the other references to this vnode have been
2727                  * released.  The other references will take care of
2728                  * either destroying the rnode or placing it on the
2729                  * rnode freelist.  If there are no other references,
2730                  * then the rnode may be safely destroyed.
2731                  */
2732                 mutex_enter(&vp->v_lock);
2733                 if (vp->v_count > 1) {
2734                         VN_RELE_LOCKED(vp);
2735                         mutex_exit(&vp->v_lock);
2736                         return;
2737                 }
2738                 mutex_exit(&vp->v_lock);
2739 
2740                 destroy_rnode(rp);
2741                 return;
2742         }
2743 
2744         /*
2745          * Lock the hash queue and then recheck the reference count
2746          * to ensure that no other threads have acquired a reference,
2747          * which would indicate that the rnode should not be placed
2748          * on the freelist.  If another reference has been acquired,
2749          * then just release this one and let the other thread
2750          * complete the processing of adding this rnode to the freelist.
2751          */
2752         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2753 
2754         mutex_enter(&vp->v_lock);
2755         if (vp->v_count > 1) {
2756                 VN_RELE_LOCKED(vp);
2757                 mutex_exit(&vp->v_lock);
2758                 rw_exit(&rp->r_hashq->r_lock);
2759                 return;
2760         }
2761         mutex_exit(&vp->v_lock);
2762 
2763         /*
2764          * If there is no cached data or metadata for this file, then
2765          * put the rnode on the front of the freelist so that it will
2766          * be reused before other rnodes which may have cached data or
2767          * metadata associated with them.
2768          */
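        /*
         * Mechanically (as the code below shows), the freelist is circular
         * and doubly linked: the rnode is linked in just ahead of the
         * current head, i.e. at the back of the list, and rpfreelist is
         * then advanced to the new rnode when it has nothing cached, which
         * places it at the front instead.
         */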
2769         mutex_enter(&rpfreelist_lock);
2770         if (rpfreelist == NULL) {
2771                 rp->r_freef = rp;
2772                 rp->r_freeb = rp;
2773                 rpfreelist = rp;
2774         } else {
2775                 rp->r_freef = rpfreelist;
2776                 rp->r_freeb = rpfreelist->r_freeb;
2777                 rpfreelist->r_freeb->r_freef = rp;
2778                 rpfreelist->r_freeb = rp;
2779                 if (!vn_has_cached_data(vp) &&
2780                     !HAVE_RDDIR_CACHE(rp) &&
2781                     rp->r_symlink.contents == NULL &&
2782                     rp->r_secattr == NULL &&
2783                     rp->r_pathconf == NULL)
2784                         rpfreelist = rp;
2785         }
2786         mutex_exit(&rpfreelist_lock);
2787 
2788         rw_exit(&rp->r_hashq->r_lock);
2789 }
2790 
2791 /*
2792  * Remove an rnode from the free list.
2793  *
2794  * The caller must be holding rpfreelist_lock and the rnode
2795  * must be on the freelist.
2796  */
2797 static void
2798 rp_rmfree(rnode_t *rp)
2799 {
2800 
2801         ASSERT(MUTEX_HELD(&rpfreelist_lock));
2802         ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2803 
2804         if (rp == rpfreelist) {
2805                 rpfreelist = rp->r_freef;
2806                 if (rp == rpfreelist)
2807                         rpfreelist = NULL;
2808         }
2809 
2810         rp->r_freeb->r_freef = rp->r_freef;
2811         rp->r_freef->r_freeb = rp->r_freeb;
2812 
2813         rp->r_freef = rp->r_freeb = NULL;
2814 }
2815 
2816 /*
2817  * Put a rnode in the hash table.
2818  *
2819  * The caller must be holding the exclusive hash queue lock.
2820  */
2821 static void
2822 rp_addhash(rnode_t *rp)
2823 {
2824 
2825         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2826         ASSERT(!(rp->r_flags & RHASHED));
2827 
2828         rp->r_hashf = rp->r_hashq->r_hashf;
2829         rp->r_hashq->r_hashf = rp;
2830         rp->r_hashb = (rnode_t *)rp->r_hashq;
2831         rp->r_hashf->r_hashb = rp;
2832 
2833         mutex_enter(&rp->r_statelock);
2834         rp->r_flags |= RHASHED;
2835         mutex_exit(&rp->r_statelock);
2836 }
2837 
2838 /*
2839  * Remove a rnode from the hash table.
2840  *
2841  * The caller must be holding the hash queue lock.
2842  */
2843 static void
2844 rp_rmhash_locked(rnode_t *rp)
2845 {
2846 
2847         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2848         ASSERT(rp->r_flags & RHASHED);
2849 
2850         rp->r_hashb->r_hashf = rp->r_hashf;
2851         rp->r_hashf->r_hashb = rp->r_hashb;
2852 
2853         mutex_enter(&rp->r_statelock);
2854         rp->r_flags &= ~RHASHED;
2855         mutex_exit(&rp->r_statelock);
2856 }
2857 
2858 /*
2859  * Remove a rnode from the hash table.
2860  *
2861  * The caller must not be holding the hash queue lock.
2862  */
2863 void
2864 rp_rmhash(rnode_t *rp)
2865 {
2866 
2867         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2868         rp_rmhash_locked(rp);
2869         rw_exit(&rp->r_hashq->r_lock);
2870 }
2871 
2872 /*
2873  * Lookup a rnode by fhandle.
2874  *
2875  * The caller must be holding the hash queue lock, either shared or exclusive.
2876  */
2877 static rnode_t *
2878 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2879 {
2880         rnode_t *rp;
2881         vnode_t *vp;
2882 
2883         ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2884 
2885         for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2886                 vp = RTOV(rp);
2887                 if (vp->v_vfsp == vfsp &&
2888                     rp->r_fh.fh_len == fh->fh_len &&
2889                     bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2890                         /*
2891                          * Remove the rnode from the free list, if necessary.
2892                          */
2893                         if (rp->r_freef != NULL) {
2894                                 mutex_enter(&rpfreelist_lock);
2895                                 /*
2896                                  * If the rnode is on the freelist,
2897                                  * then remove it and use that reference
2898                                  * as the new reference.  Otherwise,
2899                                  * need to increment the reference count.
2900                                  */
2901                                 if (rp->r_freef != NULL) {
2902                                         rp_rmfree(rp);
2903                                         mutex_exit(&rpfreelist_lock);
2904                                 } else {
2905                                         mutex_exit(&rpfreelist_lock);
2906                                         VN_HOLD(vp);
2907                                 }
2908                         } else
2909                                 VN_HOLD(vp);
2910                         return (rp);
2911                 }
2912         }
2913         return (NULL);
2914 }
2915 
2916 /*
2917  * Return 1 if there is an active vnode belonging to this vfs in the
2918  * rtable cache.
2919  *
2920  * Several of these checks are done without holding the usual
2921  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2922  * etc. will redo the necessary checks before actually destroying
2923  * any rnodes.
2924  */
2925 int
2926 check_rtable(struct vfs *vfsp)
2927 {
2928         int index;
2929         rnode_t *rp;
2930         vnode_t *vp;
2931 
2932         for (index = 0; index < rtablesize; index++) {
2933                 rw_enter(&rtable[index].r_lock, RW_READER);
2934                 for (rp = rtable[index].r_hashf;
2935                     rp != (rnode_t *)(&rtable[index]);
2936                     rp = rp->r_hashf) {
2937                         vp = RTOV(rp);
2938                         if (vp->v_vfsp == vfsp) {
2939                                 if (rp->r_freef == NULL ||
2940                                     (vn_has_cached_data(vp) &&
2941                                     (rp->r_flags & RDIRTY)) ||
2942                                     rp->r_count > 0) {
2943                                         rw_exit(&rtable[index].r_lock);
2944                                         return (1);
2945                                 }
2946                         }
2947                 }
2948                 rw_exit(&rtable[index].r_lock);
2949         }
2950         return (0);
2951 }
2952 
2953 /*
2954  * Destroy inactive vnodes from the hash queues which belong to this
2955  * vfs.  It is essential that we destroy all inactive vnodes during a
2956  * forced unmount as well as during a normal unmount.
2957  */
2958 void
2959 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2960 {
2961         int index;
2962         rnode_t *rp;
2963         rnode_t *rlist;
2964         rnode_t *r_hashf;
2965         vnode_t *vp;
2966 
2967         rlist = NULL;
2968 
2969         for (index = 0; index < rtablesize; index++) {
2970                 rw_enter(&rtable[index].r_lock, RW_WRITER);
2971                 for (rp = rtable[index].r_hashf;
2972                     rp != (rnode_t *)(&rtable[index]);
2973                     rp = r_hashf) {
2974                         /* save the hash pointer before destroying */
2975                         r_hashf = rp->r_hashf;
2976                         vp = RTOV(rp);
2977                         if (vp->v_vfsp == vfsp) {
2978                                 mutex_enter(&rpfreelist_lock);
2979                                 if (rp->r_freef != NULL) {
2980                                         rp_rmfree(rp);
2981                                         mutex_exit(&rpfreelist_lock);
2982                                         rp_rmhash_locked(rp);
2983                                         rp->r_hashf = rlist;
2984                                         rlist = rp;
2985                                 } else
2986                                         mutex_exit(&rpfreelist_lock);
2987                         }
2988                 }
2989                 rw_exit(&rtable[index].r_lock);
2990         }
2991 
2992         for (rp = rlist; rp != NULL; rp = rlist) {
2993                 rlist = rp->r_hashf;
2994                 /*
2995                  * This call to rp_addfree will end up destroying the
2996                  * rnode, but in a safe way with the appropriate set
2997                  * of checks done.
2998                  */
2999                 rp_addfree(rp, cr);
3000         }
3001 
3002 }
3003 
3004 /*
3005  * This routine destroys all the resources associated with the rnode
3006  * and then the rnode itself.
3007  */
3008 static void
3009 destroy_rnode(rnode_t *rp)
3010 {
3011         vnode_t *vp;
3012         vfs_t *vfsp;
3013 
3014         vp = RTOV(rp);
3015         vfsp = vp->v_vfsp;
3016 
3017         ASSERT(vp->v_count == 1);
3018         ASSERT(rp->r_count == 0);
3019         ASSERT(rp->r_lmpl == NULL);
3020         ASSERT(rp->r_mapcnt == 0);
3021         ASSERT(!(rp->r_flags & RHASHED));
3022         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3023         atomic_dec_ulong((ulong_t *)&rnew);
3024 #ifdef DEBUG
3025         clstat_debug.nrnode.value.ui64--;
3026 #endif
3027         nfs_rw_destroy(&rp->r_rwlock);
3028         nfs_rw_destroy(&rp->r_lkserlock);
3029         mutex_destroy(&rp->r_statelock);
3030         cv_destroy(&rp->r_cv);
3031         cv_destroy(&rp->r_commit.c_cv);
3032         if (rp->r_flags & RDELMAPLIST)
3033                 list_destroy(&rp->r_indelmap);
3034         nfs_free_r_path(rp);
3035         avl_destroy(&rp->r_dir);
3036         vn_invalid(vp);
3037         vn_free(vp);
3038         kmem_cache_free(rnode_cache, rp);
3039         VFS_RELE(vfsp);
3040 }
3041 
3042 /*
3043  * Flush all vnodes in this (or every) vfs.
3044  * Used by nfs_sync and by nfs_unmount.
3045  */
3046 void
3047 rflush(struct vfs *vfsp, cred_t *cr)
3048 {
3049         int index;
3050         rnode_t *rp;
3051         vnode_t *vp, **vplist;
3052         long num, cnt;
3053 
3054         /*
3055          * Check to see whether there is anything to do.
3056          */
3057         num = rnew;
3058         if (num == 0)
3059                 return;
3060 
3061         /*
3062          * Allocate a slot for all currently active rnodes on the
3063          * supposition that they all may need flushing.
3064          */
3065         vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3066         cnt = 0;
3067 
3068         /*
3069          * Walk the hash queues looking for rnodes with page
3070          * lists associated with them.  Make a list of these
3071          * files.
3072          */
3073         for (index = 0; index < rtablesize; index++) {
3074                 rw_enter(&rtable[index].r_lock, RW_READER);
3075                 for (rp = rtable[index].r_hashf;
3076                     rp != (rnode_t *)(&rtable[index]);
3077                     rp = rp->r_hashf) {
3078                         vp = RTOV(rp);
3079                         /*
3080                          * Don't bother sync'ing a vp if it
3081                          * is part of the virtual swap device or
3082                          * if the VFS is read-only.
3083                          */
3084                         if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3085                                 continue;
3086                         /*
3087                          * If flushing all mounted file systems or
3088                          * the vnode belongs to this vfs, has pages
3089                          * and is marked as either dirty or mmap'd,
3090                          * hold and add this vnode to the list of
3091                          * vnodes to flush.
3092                          */
3093                         if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3094                             vn_has_cached_data(vp) &&
3095                             ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3096                                 VN_HOLD(vp);
3097                                 vplist[cnt++] = vp;
3098                                 if (cnt == num) {
3099                                         rw_exit(&rtable[index].r_lock);
3100                                         goto toomany;
3101                                 }
3102                         }
3103                 }
3104                 rw_exit(&rtable[index].r_lock);
3105         }
3106 toomany:
3107 
3108         /*
3109          * Flush and release all of the files on the list.
3110          */
3111         while (cnt-- > 0) {
3112                 vp = vplist[cnt];
3113                 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3114                 VN_RELE(vp);
3115         }
3116 
3117         /*
3118          * Free the space allocated to hold the list.
3119          */
3120         kmem_free(vplist, num * sizeof (*vplist));
3121 }
3122 
3123 /*
3124  * This probably needs to be larger than or equal to
3125  * log2(sizeof (struct rnode)) due to the way that rnodes are
3126  * allocated.
3127  */
3128 #define ACACHE_SHIFT_BITS       9
3129 
3130 static int
3131 acachehash(rnode_t *rp, cred_t *cr)
3132 {
3133 
3134         return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3135             acachemask);
3136 }
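
/*
 * A plausible reading, not stated explicitly here: rnodes come from a kmem
 * cache, so the low-order ACACHE_SHIFT_BITS bits of an rnode address carry
 * little information; shifting them away before adding crgetuid(cr) lets
 * both the rnode identity and the user identity contribute to the choice
 * of hash bucket.
 */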
3137 
3138 #ifdef DEBUG
3139 static long nfs_access_cache_hits = 0;
3140 static long nfs_access_cache_misses = 0;
3141 #endif
3142 
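/*
 * Usage sketch (hedged; not copied from any particular caller): access
 * checking code consults nfs_access_check() first and only goes over the
 * wire (e.g. the NFSv3 ACCESS procedure) when NFS_ACCESS_UNKNOWN is
 * returned, after which the server's answer is recorded with
 * nfs_access_cache().
 */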
3143 nfs_access_type_t
3144 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3145 {
3146         vnode_t *vp;
3147         acache_t *ap;
3148         acache_hash_t *hp;
3149         nfs_access_type_t all;
3150 
3151         vp = RTOV(rp);
3152         if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3153                 return (NFS_ACCESS_UNKNOWN);
3154 
3155         if (rp->r_acache != NULL) {
3156                 hp = &acache[acachehash(rp, cr)];
3157                 rw_enter(&hp->lock, RW_READER);
3158                 ap = hp->next;
3159                 while (ap != (acache_t *)hp) {
3160                         if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3161                                 if ((ap->known & acc) == acc) {
3162 #ifdef DEBUG
3163                                         nfs_access_cache_hits++;
3164 #endif
3165                                         if ((ap->allowed & acc) == acc)
3166                                                 all = NFS_ACCESS_ALLOWED;
3167                                         else
3168                                                 all = NFS_ACCESS_DENIED;
3169                                 } else {
3170 #ifdef DEBUG
3171                                         nfs_access_cache_misses++;
3172 #endif
3173                                         all = NFS_ACCESS_UNKNOWN;
3174                                 }
3175                                 rw_exit(&hp->lock);
3176                                 return (all);
3177                         }
3178                         ap = ap->next;
3179                 }
3180                 rw_exit(&hp->lock);
3181         }
3182 
3183 #ifdef DEBUG
3184         nfs_access_cache_misses++;
3185 #endif
3186         return (NFS_ACCESS_UNKNOWN);
3187 }
3188 
3189 void
3190 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3191 {
3192         acache_t *ap;
3193         acache_t *nap;
3194         acache_hash_t *hp;
3195 
3196         hp = &acache[acachehash(rp, cr)];
3197 
3198         /*
3199          * Allocate now, on the assumption that an allocation will
3200          * usually be required.  This allows the allocation to happen
3201          * without holding the hash bucket locked.
3202          */
3203         nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3204         if (nap != NULL) {
3205                 nap->known = acc;
3206                 nap->allowed = resacc;
3207                 nap->rnode = rp;
3208                 crhold(cr);
3209                 nap->cred = cr;
3210                 nap->hashq = hp;
3211         }
3212 
3213         rw_enter(&hp->lock, RW_WRITER);
3214 
3215         if (rp->r_acache != NULL) {
3216                 ap = hp->next;
3217                 while (ap != (acache_t *)hp) {
3218                         if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3219                                 ap->known |= acc;
3220                                 ap->allowed &= ~acc;
3221                                 ap->allowed |= resacc;
3222                                 rw_exit(&hp->lock);
3223                                 if (nap != NULL) {
3224                                         crfree(nap->cred);
3225                                         kmem_cache_free(acache_cache, nap);
3226                                 }
3227                                 return;
3228                         }
3229                         ap = ap->next;
3230                 }
3231         }
3232 
3233         if (nap != NULL) {
3234 #ifdef DEBUG
3235                 clstat_debug.access.value.ui64++;
3236 #endif
3237                 nap->next = hp->next;
3238                 hp->next = nap;
3239                 nap->next->prev = nap;
3240                 nap->prev = (acache_t *)hp;
3241 
3242                 mutex_enter(&rp->r_statelock);
3243                 nap->list = rp->r_acache;
3244                 rp->r_acache = nap;
3245                 mutex_exit(&rp->r_statelock);
3246         }
3247 
3248         rw_exit(&hp->lock);
3249 }
3250 
3251 int
3252 nfs_access_purge_rp(rnode_t *rp)
3253 {
3254         acache_t *ap;
3255         acache_t *tmpap;
3256         acache_t *rplist;
3257 
3258         /*
3259          * If there aren't any cached entries, then there is nothing
3260          * to free.
3261          */
3262         if (rp->r_acache == NULL)
3263                 return (0);
3264 
3265         mutex_enter(&rp->r_statelock);
3266         rplist = rp->r_acache;
3267         rp->r_acache = NULL;
3268         mutex_exit(&rp->r_statelock);
3269 
3270         /*
3271          * Loop through each entry in the list pointed to in the
3272          * rnode.  Remove each of these entries from the hash
3273          * queue that it is on and remove it from the list in
3274          * the rnode.
3275          */
3276         for (ap = rplist; ap != NULL; ap = tmpap) {
3277                 rw_enter(&ap->hashq->lock, RW_WRITER);
3278                 ap->prev->next = ap->next;
3279                 ap->next->prev = ap->prev;
3280                 rw_exit(&ap->hashq->lock);
3281 
3282                 tmpap = ap->list;
3283                 crfree(ap->cred);
3284                 kmem_cache_free(acache_cache, ap);
3285 #ifdef DEBUG
3286                 clstat_debug.access.value.ui64--;
3287 #endif
3288         }
3289 
3290         return (1);
3291 }
3292 
3293 static const char prefix[] = ".nfs";
3294 
3295 static kmutex_t newnum_lock;
3296 
3297 int
3298 newnum(void)
3299 {
3300         static uint_t newnum = 0;
3301         uint_t id;
3302 
3303         mutex_enter(&newnum_lock);
3304         if (newnum == 0)
3305                 newnum = gethrestime_sec() & 0xffff;
3306         id = newnum++;
3307         mutex_exit(&newnum_lock);
3308         return (id);
3309 }
3310 
3311 char *
3312 newname(void)
3313 {
3314         char *news;
3315         char *s;
3316         const char *p;
3317         uint_t id;
3318 
3319         id = newnum();
3320         news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3321         s = news;
3322         p = prefix;
3323         while (*p != '\0')
3324                 *s++ = *p++;
3325         while (id != 0) {
3326                 *s++ = "0123456789ABCDEF"[id & 0x0f];
3327                 id >>= 4;
3328         }
3329         *s = '\0';
3330         return (news);
3331 }
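
/*
 * For illustration (not derived from any particular caller): newname()
 * emits the id's hex nibbles least-significant first, so an id of 0x2F
 * yields the string ".nfsF2".  The returned buffer is MAXNAMELEN bytes
 * long and is presumably released with kmem_free(name, MAXNAMELEN).
 */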
3332 
3333 /*
3334  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3335  * framework.
3336  */
3337 static int
3338 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3339 {
3340         ksp->ks_snaptime = gethrtime();
3341         if (rw == KSTAT_WRITE) {
3342                 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3343 #ifdef DEBUG
3344                 /*
3345                  * Currently only the global zone can write to kstats, but we
3346                  * add the check just for paranoia.
3347                  */
3348                 if (INGLOBALZONE(curproc))
3349                         bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3350                             sizeof (clstat_debug));
3351 #endif
3352         } else {
3353                 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3354 #ifdef DEBUG
3355                 /*
3356                  * If we're displaying the "global" debug kstat values, we
3357                  * display them as-is to all zones since in fact they apply to
3358                  * the system as a whole.
3359                  */
3360                 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3361                     sizeof (clstat_debug));
3362 #endif
3363         }
3364         return (0);
3365 }
3366 
3367 static void *
3368 clinit_zone(zoneid_t zoneid)
3369 {
3370         kstat_t *nfs_client_kstat;
3371         struct nfs_clnt *nfscl;
3372         uint_t ndata;
3373 
3374         nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3375         mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3376         nfscl->nfscl_chtable = NULL;
3377         nfscl->nfscl_zoneid = zoneid;
3378 
3379         bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3380         ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3381 #ifdef DEBUG
3382         ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3383 #endif
3384         if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3385             "misc", KSTAT_TYPE_NAMED, ndata,
3386             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3387                 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3388                 nfs_client_kstat->ks_snapshot = cl_snapshot;
3389                 kstat_install(nfs_client_kstat);
3390         }
3391         mutex_enter(&nfs_clnt_list_lock);
3392         list_insert_head(&nfs_clnt_list, nfscl);
3393         mutex_exit(&nfs_clnt_list_lock);
3394         return (nfscl);
3395 }
3396 
3397 /*ARGSUSED*/
3398 static void
3399 clfini_zone(zoneid_t zoneid, void *arg)
3400 {
3401         struct nfs_clnt *nfscl = arg;
3402         chhead_t *chp, *next;
3403 
3404         if (nfscl == NULL)
3405                 return;
3406         mutex_enter(&nfs_clnt_list_lock);
3407         list_remove(&nfs_clnt_list, nfscl);
3408         mutex_exit(&nfs_clnt_list_lock);
3409         clreclaim_zone(nfscl, 0);
3410         for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3411                 ASSERT(chp->ch_list == NULL);
3412                 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3413                 next = chp->ch_next;
3414                 kmem_free(chp, sizeof (*chp));
3415         }
3416         kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3417         mutex_destroy(&nfscl->nfscl_chtable_lock);
3418         kmem_free(nfscl, sizeof (*nfscl));
3419 }
3420 
3421 /*
3422  * Called by endpnt_destructor to make sure the client handles are
3423  * cleaned up before the RPC endpoints.  This becomes a no-op if
3424  * clfini_zone (above) is called first.  This function is needed
3425  * (rather than relying on clfini_zone to clean up) because the ZSD
3426  * callbacks have no ordering mechanism, so we have no way to ensure
3427  * that clfini_zone is called before endpnt_destructor.
3428  */
3429 void
3430 clcleanup_zone(zoneid_t zoneid)
3431 {
3432         struct nfs_clnt *nfscl;
3433 
3434         mutex_enter(&nfs_clnt_list_lock);
3435         nfscl = list_head(&nfs_clnt_list);
3436         for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3437                 if (nfscl->nfscl_zoneid == zoneid) {
3438                         clreclaim_zone(nfscl, 0);
3439                         break;
3440                 }
3441         }
3442         mutex_exit(&nfs_clnt_list_lock);
3443 }
3444 
3445 int
3446 nfs_subrinit(void)
3447 {
3448         int i;
3449         ulong_t nrnode_max;
3450 
3451         /*
3452          * Allocate and initialize the rnode hash queues
3453          */
3454         if (nrnode <= 0)
3455                 nrnode = ncsize;
3456         nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3457         if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3458                 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3459                     "!setting nrnode to max value of %ld", nrnode_max);
3460                 nrnode = nrnode_max;
3461         }
3462 
3463         rtablesize = 1 << highbit(nrnode / hashlen);
3464         rtablemask = rtablesize - 1;
3465         rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3466         for (i = 0; i < rtablesize; i++) {
3467                 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3468                 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3469                 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3470         }
3471         rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3472             0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3473 
3474         /*
3475          * Allocate and initialize the access cache
3476          */
3477 
3478         /*
3479          * The initial guess is one access cache entry per rnode, unless
3480          * nacache is set to a non-zero value, in which case it is used
3481          * as the guess at the number of access cache entries.
3482          */
3483         if (nacache > 0)
3484                 acachesize = 1 << highbit(nacache / hashlen);
3485         else
3486                 acachesize = rtablesize;
3487         acachemask = acachesize - 1;
3488         acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3489         for (i = 0; i < acachesize; i++) {
3490                 acache[i].next = (acache_t *)&acache[i];
3491                 acache[i].prev = (acache_t *)&acache[i];
3492                 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3493         }
3494         acache_cache = kmem_cache_create("nfs_access_cache",
3495             sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3496         /*
3497          * Allocate and initialize the client handle cache
3498          */
3499         chtab_cache = kmem_cache_create("client_handle_cache",
3500             sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3501         /*
3502          * Initialize the list of per-zone client handles (and associated data).
3503          * This needs to be done before we call zone_key_create().
3504          */
3505         list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3506             offsetof(struct nfs_clnt, nfscl_node));
3507         /*
3508          * Initialize the zone_key for per-zone client handle lists.
3509          */
3510         zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3511         /*
3512          * Initialize the various mutexes and reader/writer locks
3513          */
3514         mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3515         mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3516         mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3517 
3518         /*
3519          * Assign unique major number for all nfs mounts
3520          */
3521         if ((nfs_major = getudev()) == -1) {
3522                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3523                     "nfs: init: can't get unique device number");
3524                 nfs_major = 0;
3525         }
3526         nfs_minor = 0;
3527 
3528         if (nfs3_jukebox_delay == 0)
3529                 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3530 
3531         return (0);
3532 }
3533 
3534 void
3535 nfs_subrfini(void)
3536 {
3537         int i;
3538 
3539         /*
3540          * Deallocate the rnode hash queues
3541          */
3542         kmem_cache_destroy(rnode_cache);
3543 
3544         for (i = 0; i < rtablesize; i++)
3545                 rw_destroy(&rtable[i].r_lock);
3546         kmem_free(rtable, rtablesize * sizeof (*rtable));
3547 
3548         /*
3549          * Deallocate the access cache
3550          */
3551         kmem_cache_destroy(acache_cache);
3552 
3553         for (i = 0; i < acachesize; i++)
3554                 rw_destroy(&acache[i].lock);
3555         kmem_free(acache, acachesize * sizeof (*acache));
3556 
3557         /*
3558          * Deallocate the client handle cache
3559          */
3560         kmem_cache_destroy(chtab_cache);
3561 
3562         /*
3563          * Destroy the various mutexes and reader/writer locks
3564          */
3565         mutex_destroy(&rpfreelist_lock);
3566         mutex_destroy(&newnum_lock);
3567         mutex_destroy(&nfs_minor_lock);
3568         (void) zone_key_delete(nfsclnt_zone_key);
3569 }
3570 
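/*
 * errno <-> NFS status conversions.  Most NFS status codes were chosen to
 * have the same numeric values as the corresponding UNIX errnos, so these
 * routines translate only the exceptions and fall through to a plain cast
 * for everything else; on DEBUG kernels puterrno3() and geterrno3() map
 * every value explicitly and warn about anything unexpected.
 */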
3571 enum nfsstat
3572 puterrno(int error)
3573 {
3574 
3575         switch (error) {
3576         case EOPNOTSUPP:
3577                 return (NFSERR_OPNOTSUPP);
3578         case ENAMETOOLONG:
3579                 return (NFSERR_NAMETOOLONG);
3580         case ENOTEMPTY:
3581                 return (NFSERR_NOTEMPTY);
3582         case EDQUOT:
3583                 return (NFSERR_DQUOT);
3584         case ESTALE:
3585                 return (NFSERR_STALE);
3586         case EREMOTE:
3587                 return (NFSERR_REMOTE);
3588         case ENOSYS:
3589                 return (NFSERR_OPNOTSUPP);
3590         case EOVERFLOW:
3591                 return (NFSERR_INVAL);
3592         default:
3593                 return ((enum nfsstat)error);
3594         }
3595         /* NOTREACHED */
3596 }
3597 
3598 int
3599 geterrno(enum nfsstat status)
3600 {
3601 
3602         switch (status) {
3603         case NFSERR_OPNOTSUPP:
3604                 return (EOPNOTSUPP);
3605         case NFSERR_NAMETOOLONG:
3606                 return (ENAMETOOLONG);
3607         case NFSERR_NOTEMPTY:
3608                 return (ENOTEMPTY);
3609         case NFSERR_DQUOT:
3610                 return (EDQUOT);
3611         case NFSERR_STALE:
3612                 return (ESTALE);
3613         case NFSERR_REMOTE:
3614                 return (EREMOTE);
3615         case NFSERR_WFLUSH:
3616                 return (EIO);
3617         default:
3618                 return ((int)status);
3619         }
3620         /* NOTREACHED */
3621 }
3622 
3623 enum nfsstat3
3624 puterrno3(int error)
3625 {
3626 
3627 #ifdef DEBUG
3628         switch (error) {
3629         case 0:
3630                 return (NFS3_OK);
3631         case EPERM:
3632                 return (NFS3ERR_PERM);
3633         case ENOENT:
3634                 return (NFS3ERR_NOENT);
3635         case EIO:
3636                 return (NFS3ERR_IO);
3637         case ENXIO:
3638                 return (NFS3ERR_NXIO);
3639         case EACCES:
3640                 return (NFS3ERR_ACCES);
3641         case EEXIST:
3642                 return (NFS3ERR_EXIST);
3643         case EXDEV:
3644                 return (NFS3ERR_XDEV);
3645         case ENODEV:
3646                 return (NFS3ERR_NODEV);
3647         case ENOTDIR:
3648                 return (NFS3ERR_NOTDIR);
3649         case EISDIR:
3650                 return (NFS3ERR_ISDIR);
3651         case EINVAL:
3652                 return (NFS3ERR_INVAL);
3653         case EFBIG:
3654                 return (NFS3ERR_FBIG);
3655         case ENOSPC:
3656                 return (NFS3ERR_NOSPC);
3657         case EROFS:
3658                 return (NFS3ERR_ROFS);
3659         case EMLINK:
3660                 return (NFS3ERR_MLINK);
3661         case ENAMETOOLONG:
3662                 return (NFS3ERR_NAMETOOLONG);
3663         case ENOTEMPTY:
3664                 return (NFS3ERR_NOTEMPTY);
3665         case EDQUOT:
3666                 return (NFS3ERR_DQUOT);
3667         case ESTALE:
3668                 return (NFS3ERR_STALE);
3669         case EREMOTE:
3670                 return (NFS3ERR_REMOTE);
3671         case ENOSYS:
3672         case EOPNOTSUPP:
3673                 return (NFS3ERR_NOTSUPP);
3674         case EOVERFLOW:
3675                 return (NFS3ERR_INVAL);
3676         default:
3677                 zcmn_err(getzoneid(), CE_WARN,
3678                     "puterrno3: got error %d", error);
3679                 return ((enum nfsstat3)error);
3680         }
3681 #else
3682         switch (error) {
3683         case ENAMETOOLONG:
3684                 return (NFS3ERR_NAMETOOLONG);
3685         case ENOTEMPTY:
3686                 return (NFS3ERR_NOTEMPTY);
3687         case EDQUOT:
3688                 return (NFS3ERR_DQUOT);
3689         case ESTALE:
3690                 return (NFS3ERR_STALE);
3691         case ENOSYS:
3692         case EOPNOTSUPP:
3693                 return (NFS3ERR_NOTSUPP);
3694         case EREMOTE:
3695                 return (NFS3ERR_REMOTE);
3696         case EOVERFLOW:
3697                 return (NFS3ERR_INVAL);
3698         default:
3699                 return ((enum nfsstat3)error);
3700         }
3701 #endif
3702 }
3703 
3704 int
3705 geterrno3(enum nfsstat3 status)
3706 {
3707 
3708 #ifdef DEBUG
3709         switch (status) {
3710         case NFS3_OK:
3711                 return (0);
3712         case NFS3ERR_PERM:
3713                 return (EPERM);
3714         case NFS3ERR_NOENT:
3715                 return (ENOENT);
3716         case NFS3ERR_IO:
3717                 return (EIO);
3718         case NFS3ERR_NXIO:
3719                 return (ENXIO);
3720         case NFS3ERR_ACCES:
3721                 return (EACCES);
3722         case NFS3ERR_EXIST:
3723                 return (EEXIST);
3724         case NFS3ERR_XDEV:
3725                 return (EXDEV);
3726         case NFS3ERR_NODEV:
3727                 return (ENODEV);
3728         case NFS3ERR_NOTDIR:
3729                 return (ENOTDIR);
3730         case NFS3ERR_ISDIR:
3731                 return (EISDIR);
3732         case NFS3ERR_INVAL:
3733                 return (EINVAL);
3734         case NFS3ERR_FBIG:
3735                 return (EFBIG);
3736         case NFS3ERR_NOSPC:
3737                 return (ENOSPC);
3738         case NFS3ERR_ROFS:
3739                 return (EROFS);
3740         case NFS3ERR_MLINK:
3741                 return (EMLINK);
3742         case NFS3ERR_NAMETOOLONG:
3743                 return (ENAMETOOLONG);
3744         case NFS3ERR_NOTEMPTY:
3745                 return (ENOTEMPTY);
3746         case NFS3ERR_DQUOT:
3747                 return (EDQUOT);
3748         case NFS3ERR_STALE:
3749                 return (ESTALE);
3750         case NFS3ERR_REMOTE:
3751                 return (EREMOTE);
3752         case NFS3ERR_BADHANDLE:
3753                 return (ESTALE);
3754         case NFS3ERR_NOT_SYNC:
3755                 return (EINVAL);
3756         case NFS3ERR_BAD_COOKIE:
3757                 return (ENOENT);
3758         case NFS3ERR_NOTSUPP:
3759                 return (EOPNOTSUPP);
3760         case NFS3ERR_TOOSMALL:
3761                 return (EINVAL);
3762         case NFS3ERR_SERVERFAULT:
3763                 return (EIO);
3764         case NFS3ERR_BADTYPE:
3765                 return (EINVAL);
3766         case NFS3ERR_JUKEBOX:
3767                 return (ENXIO);
3768         default:
3769                 zcmn_err(getzoneid(), CE_WARN,
3770                     "geterrno3: got status %d", status);
3771                 return ((int)status);
3772         }
3773 #else
3774         switch (status) {
3775         case NFS3ERR_NAMETOOLONG:
3776                 return (ENAMETOOLONG);
3777         case NFS3ERR_NOTEMPTY:
3778                 return (ENOTEMPTY);
3779         case NFS3ERR_DQUOT:
3780                 return (EDQUOT);
3781         case NFS3ERR_STALE:
3782         case NFS3ERR_BADHANDLE:
3783                 return (ESTALE);
3784         case NFS3ERR_NOTSUPP:
3785                 return (EOPNOTSUPP);
3786         case NFS3ERR_REMOTE:
3787                 return (EREMOTE);
3788         case NFS3ERR_NOT_SYNC:
3789         case NFS3ERR_TOOSMALL:
3790         case NFS3ERR_BADTYPE:
3791                 return (EINVAL);
3792         case NFS3ERR_BAD_COOKIE:
3793                 return (ENOENT);
3794         case NFS3ERR_SERVERFAULT:
3795                 return (EIO);
3796         case NFS3ERR_JUKEBOX:
3797                 return (ENXIO);
3798         default:
3799                 return ((int)status);
3800         }
3801 #endif
3802 }
3803 
3804 rddir_cache *
3805 rddir_cache_alloc(int flags)
3806 {
3807         rddir_cache *rc;
3808 
3809         rc = kmem_alloc(sizeof (*rc), flags);
3810         if (rc != NULL) {
3811                 rc->entries = NULL;
3812                 rc->flags = RDDIR;
3813                 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3814                 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3815                 rc->count = 1;
3816 #ifdef DEBUG
3817                 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3818 #endif
3819         }
3820         return (rc);
3821 }
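
/*
 * Note (derived from the code above and below): a rddir_cache entry is
 * created with a reference count of one; additional holders take
 * references with rddir_cache_hold(), and the final rddir_cache_rele()
 * frees the entry via rddir_cache_free().
 */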
3822 
3823 static void
3824 rddir_cache_free(rddir_cache *rc)
3825 {
3826 
3827 #ifdef DEBUG
3828         atomic_dec_64(&clstat_debug.dirent.value.ui64);
3829 #endif
3830         if (rc->entries != NULL) {
3831 #ifdef DEBUG
3832                 rddir_cache_buf_free(rc->entries, rc->buflen);
3833 #else
3834                 kmem_free(rc->entries, rc->buflen);
3835 #endif
3836         }
3837         cv_destroy(&rc->cv);
3838         mutex_destroy(&rc->lock);
3839         kmem_free(rc, sizeof (*rc));
3840 }
3841 
3842 void
3843 rddir_cache_hold(rddir_cache *rc)
3844 {
3845 
3846         mutex_enter(&rc->lock);
3847         rc->count++;
3848         mutex_exit(&rc->lock);
3849 }
3850 
3851 void
3852 rddir_cache_rele(rddir_cache *rc)
3853 {
3854 
3855         mutex_enter(&rc->lock);
3856         ASSERT(rc->count > 0);
3857         if (--rc->count == 0) {
3858                 mutex_exit(&rc->lock);
3859                 rddir_cache_free(rc);
3860         } else
3861                 mutex_exit(&rc->lock);
3862 }
3863 
3864 #ifdef DEBUG
3865 char *
3866 rddir_cache_buf_alloc(size_t size, int flags)
3867 {
3868         char *rc;
3869 
3870         rc = kmem_alloc(size, flags);
3871         if (rc != NULL)
3872                 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3873         return (rc);
3874 }
3875 
3876 void
3877 rddir_cache_buf_free(void *addr, size_t size)
3878 {
3879 
3880         atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3881         kmem_free(addr, size);
3882 }
3883 #endif
3884 
3885 static int
3886 nfs_free_data_reclaim(rnode_t *rp)
3887 {
3888         char *contents;
3889         int size;
3890         vsecattr_t *vsp;
3891         nfs3_pathconf_info *info;
3892         int freed;
3893         cred_t *cred;
3894 
3895         /*
3896          * Free any held credentials and caches which
3897          * may be associated with this rnode.
3898          */
3899         mutex_enter(&rp->r_statelock);
3900         cred = rp->r_cred;
3901         rp->r_cred = NULL;
3902         contents = rp->r_symlink.contents;
3903         size = rp->r_symlink.size;
3904         rp->r_symlink.contents = NULL;
3905         vsp = rp->r_secattr;
3906         rp->r_secattr = NULL;
3907         info = rp->r_pathconf;
3908         rp->r_pathconf = NULL;
3909         mutex_exit(&rp->r_statelock);
3910 
3911         if (cred != NULL)
3912                 crfree(cred);
3913 
3914         /*
3915          * Free the access cache entries.
3916          */
3917         freed = nfs_access_purge_rp(rp);
3918 
3919         if (!HAVE_RDDIR_CACHE(rp) &&
3920             contents == NULL &&
3921             vsp == NULL &&
3922             info == NULL)
3923                 return (freed);
3924 
3925         /*
3926          * Free the readdir cache entries
3927          */
3928         if (HAVE_RDDIR_CACHE(rp))
3929                 nfs_purge_rddir_cache(RTOV(rp));
3930 
3931         /*
3932          * Free the symbolic link cache.
3933          */
3934         if (contents != NULL) {
3935                 kmem_free((void *)contents, size);
3936         }
3938 
3939         /*
3940          * Free any cached ACL.
3941          */
3942         if (vsp != NULL)
3943                 nfs_acl_free(vsp);
3944 
3945         /*
3946          * Free any cached pathconf information.
3947          */
3948         if (info != NULL)
3949                 kmem_free(info, sizeof (*info));
3950 
3951         return (1);
3952 }
3953 
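/*
 * Like nfs_free_data_reclaim(), but intended for rnodes that may be in
 * active use: the state lock is only tried, never waited for, so the
 * reclaim gives up rather than block behind an active thread.
 */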
3954 static int
3955 nfs_active_data_reclaim(rnode_t *rp)
3956 {
3957         char *contents;
3958         int size;
3959         vsecattr_t *vsp;
3960         nfs3_pathconf_info *info;
3961         int freed;
3962 
3963         /*
3964          * Free any caches which may be associated with this
3965          * rnode (the held credential is left in place here).
3966          */
3967         if (!mutex_tryenter(&rp->r_statelock))
3968                 return (0);
3969         contents = rp->r_symlink.contents;
3970         size = rp->r_symlink.size;
3971         rp->r_symlink.contents = NULL;
3972         vsp = rp->r_secattr;
3973         rp->r_secattr = NULL;
3974         info = rp->r_pathconf;
3975         rp->r_pathconf = NULL;
3976         mutex_exit(&rp->r_statelock);
3977 
3978         /*
3979          * Free the access cache entries.
3980          */
3981         freed = nfs_access_purge_rp(rp);
3982 
3983         if (!HAVE_RDDIR_CACHE(rp) &&
3984             contents == NULL &&
3985             vsp == NULL &&
3986             info == NULL)
3987                 return (freed);
3988 
3989         /*
3990          * Free the readdir cache entries
3991          */
3992         if (HAVE_RDDIR_CACHE(rp))
3993                 nfs_purge_rddir_cache(RTOV(rp));
3994 
3995         /*
3996          * Free the symbolic link cache.
3997          */
3998         if (contents != NULL) {
3999                 kmem_free((void *)contents, size);
4000         }
4002 
4003         /*
4004          * Free any cached ACL.
4005          */
4006         if (vsp != NULL)
4007                 nfs_acl_free(vsp);
4008 
4009         /*
4010          * Free any cached pathconf information.
4011          */
4012         if (info != NULL)
4013                 kmem_free(info, sizeof (*info));
4014 
4015         return (1);
4016 }
4017 
4018 static int
4019 nfs_free_reclaim(void)
4020 {
4021         int freed;
4022         rnode_t *rp;
4023 
4024 #ifdef DEBUG
4025         clstat_debug.f_reclaim.value.ui64++;
4026 #endif
4027         freed = 0;
4028         mutex_enter(&rpfreelist_lock);
4029         rp = rpfreelist;
4030         if (rp != NULL) {
4031                 do {
4032                         if (nfs_free_data_reclaim(rp))
4033                                 freed = 1;
4034                 } while ((rp = rp->r_freef) != rpfreelist);
4035         }
4036         mutex_exit(&rpfreelist_lock);
4037         return (freed);
4038 }
4039 
4040 static int
4041 nfs_active_reclaim(void)
4042 {
4043         int freed;
4044         int index;
4045         rnode_t *rp;
4046 
4047 #ifdef DEBUG
4048         clstat_debug.a_reclaim.value.ui64++;
4049 #endif
4050         freed = 0;
4051         for (index = 0; index < rtablesize; index++) {
4052                 rw_enter(&rtable[index].r_lock, RW_READER);
4053                 for (rp = rtable[index].r_hashf;
4054                     rp != (rnode_t *)(&rtable[index]);
4055                     rp = rp->r_hashf) {
4056                         if (nfs_active_data_reclaim(rp))
4057                                 freed = 1;
4058                 }
4059                 rw_exit(&rtable[index].r_lock);
4060         }
4061         return (freed);
4062 }
4063 
4064 static int
4065 nfs_rnode_reclaim(void)
4066 {
4067         int freed;
4068         rnode_t *rp;
4069         vnode_t *vp;
4070 
4071 #ifdef DEBUG
4072         clstat_debug.r_reclaim.value.ui64++;
4073 #endif
4074         freed = 0;
4075         mutex_enter(&rpfreelist_lock);
4076         while ((rp = rpfreelist) != NULL) {
4077                 rp_rmfree(rp);
4078                 mutex_exit(&rpfreelist_lock);
4079                 if (rp->r_flags & RHASHED) {
4080                         vp = RTOV(rp);
4081                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4082                         mutex_enter(&vp->v_lock);
4083                         if (vp->v_count > 1) {
4084                                 VN_RELE_LOCKED(vp);
4085                                 mutex_exit(&vp->v_lock);
4086                                 rw_exit(&rp->r_hashq->r_lock);
4087                                 mutex_enter(&rpfreelist_lock);
4088                                 continue;
4089                         }
4090                         mutex_exit(&vp->v_lock);
4091                         rp_rmhash_locked(rp);
4092                         rw_exit(&rp->r_hashq->r_lock);
4093                 }
4094                 /*
4095                  * This call to rp_addfree will end up destroying the
4096                  * rnode, but in a safe way with the appropriate set
4097                  * of checks done.
4098                  */
4099                 rp_addfree(rp, CRED());
4100                 mutex_enter(&rpfreelist_lock);
4101         }
4102         mutex_exit(&rpfreelist_lock);
4103         return (freed);
4104 }
4105 
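/*
 * kmem reclaim callback for the rnode cache.  Reclamation is attempted in
 * order of increasing disruption: first strip cached data from rnodes on
 * the freelist, then from active (hashed) rnodes, and only as a last
 * resort tear down freelist rnodes entirely.
 */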
4106 /*ARGSUSED*/
4107 static void
4108 nfs_reclaim(void *cdrarg)
4109 {
4110 
4111 #ifdef DEBUG
4112         clstat_debug.reclaim.value.ui64++;
4113 #endif
4114         if (nfs_free_reclaim())
4115                 return;
4116 
4117         if (nfs_active_reclaim())
4118                 return;
4119 
4120         (void) nfs_rnode_reclaim();
4121 }
4122 
4123 /*
4124  * NFS client failover support
4125  *
4126  * Routines to copy filehandles
4127  */
4128 void
4129 nfscopyfh(caddr_t fhp, vnode_t *vp)
4130 {
4131         fhandle_t *dest = (fhandle_t *)fhp;
4132 
4133         if (dest != NULL)
4134                 *dest = *VTOFH(vp);
4135 }
4136 
4137 void
4138 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4139 {
4140         nfs_fh3 *dest = (nfs_fh3 *)fhp;
4141 
4142         if (dest != NULL)
4143                 *dest = *VTOFH3(vp);
4144 }
4145 
4146 /*
4147  * NFS client failover support
4148  *
4149  * failover_safe() will test various conditions to ensure that
4150  * failover is permitted for this vnode.  It will be denied
4151  * if:
4152  *      1) the operation in progress does not support failover (NULL fi)
4153  *      2) there are no available replicas (NULL mi_servers->sv_next)
4154  *      3) any locks are outstanding on this file (this check is
 *         currently disabled, since local locking has been forced)
 *      4) no partial path is recorded for the file (NULL r_path)
4155  */
4156 static int
4157 failover_safe(failinfo_t *fi)
4158 {
4159 
4160         /*
4161          * Does this op permit failover?
4162          */
4163         if (fi == NULL || fi->vp == NULL)
4164                 return (0);
4165 
4166         /*
4167          * Are there any alternates to failover to?
4168          */
4169         if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4170                 return (0);
4171 
4172         /*
4173          * Disable check; we've forced local locking
4174          *
4175          * if (flk_has_remote_locks(fi->vp))
4176          *      return (0);
4177          */
4178 
4179         /*
4180          * If we have no partial path, we can't do anything
4181          */
4182         if (VTOR(fi->vp)->r_path == NULL)
4183                 return (0);
4184 
4185         return (1);
4186 }
4187 
4188 #include <sys/thread.h>
4189 
4190 /*
4191  * NFS client failover support
4192  *
4193  * failover_newserver() will start a search for a new server,
4194  * preferably by starting an async thread to do the work.  If
4195  * someone is already doing this (recognizable by MI_BINDINPROG
4196  * being set), it will simply return and the calling thread
4197  * will queue on the mi_failover_cv condition variable.
4198  */
4199 static void
4200 failover_newserver(mntinfo_t *mi)
4201 {
4202         /*
4203          * Check if someone else is doing this already
4204          */
4205         mutex_enter(&mi->mi_lock);
4206         if (mi->mi_flags & MI_BINDINPROG) {
4207                 mutex_exit(&mi->mi_lock);
4208                 return;
4209         }
4210         mi->mi_flags |= MI_BINDINPROG;
4211 
4212         /*
4213          * Need to hold the vfs struct so that it can't be released
4214          * while the failover thread is selecting a new server.
4215          */
4216         VFS_HOLD(mi->mi_vfsp);
4217 
4218         /*
4219          * Start a thread to do the real searching.
4220          */
4221         (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4222 
4223         mutex_exit(&mi->mi_lock);
4224 }
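
/*
 * Illustrative caller pattern (a sketch, not lifted from any particular
 * call site): a thread that decides the current server is unusable calls
 * failover_newserver(mi) to start, or join, a search, then takes mi_lock
 * and blocks in failover_wait(mi) until failover_thread() clears
 * MI_BINDINPROG and broadcasts mi_failover_cv.
 */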
4225 
4226 /*
4227  * NFS client failover support
4228  *
4229  * failover_thread() will find a new server to replace the one
4230  * currently in use, wake up other threads waiting on this mount
4231  * point, and die.  It will start at the head of the server list
4232  * and poll servers until it finds one with an NFS server which is
4233  * registered and responds to a NULL procedure ping.
4234  *
4235  * XXX failover_thread is unsafe within the scope of the
4236  * present model defined for cpr to suspend the system.
4237  * Specifically, over-the-wire calls made by the thread
4238  * are unsafe. The thread needs to be reevaluated in case of
4239  * future updates to the cpr suspend model.
4240  */
4241 static void
4242 failover_thread(mntinfo_t *mi)
4243 {
4244         servinfo_t *svp = NULL;
4245         CLIENT *cl;
4246         enum clnt_stat status;
4247         struct timeval tv;
4248         int error;
4249         int oncethru = 0;
4250         callb_cpr_t cprinfo;
4251         rnode_t *rp;
4252         int index;
4253         char *srvnames;
4254         size_t srvnames_len;
4255         struct nfs_clnt *nfscl = NULL;
4256         zoneid_t zoneid = getzoneid();
4257 
4258 #ifdef DEBUG
4259         /*
4260          * This is currently only needed to access counters which exist on
4261          * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4262          * on non-DEBUG kernels.
4263          */
4264         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4265         ASSERT(nfscl != NULL);
4266 #endif
4267 
4268         /*
4269          * It's safe to piggyback on the mi_lock since failover_newserver()
4270          * guarantees that there will be only one failover thread
4271          * per mntinfo at any given time.
4272          */
4273         CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4274             "failover_thread");
4275 
4276         mutex_enter(&mi->mi_lock);
4277         while (mi->mi_readers) {
4278                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4279                 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4280                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4281         }
4282         mutex_exit(&mi->mi_lock);
4283 
4284         tv.tv_sec = 2;
4285         tv.tv_usec = 0;
4286 
4287         /*
4288          * Ping the null NFS procedure of every server in
4289          * the list until one responds.  We always start
4290          * at the head of the list; on the first pass we skip
4291          * the server that is current, since it has caused us a problem.
4292          */
4293         while (svp == NULL) {
4294                 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4295                         if (!oncethru && svp == mi->mi_curr_serv)
4296                                 continue;
4297 
4298                         /*
4299                          * If the file system was forcibly umounted
4300                          * while trying to do a failover, then just
4301                          * give up on the failover.  It won't matter
4302                          * what the server is.
4303                          */
4304                         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4305                                 svp = NULL;
4306                                 goto done;
4307                         }
4308 
4309                         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4310                             NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4311                         if (error)
4312                                 continue;
4313 
4314                         if (!(mi->mi_flags & MI_INT))
4315                                 cl->cl_nosignal = TRUE;
4316                         status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4317                             xdr_void, NULL, tv);
4318                         if (!(mi->mi_flags & MI_INT))
4319                                 cl->cl_nosignal = FALSE;
4320                         AUTH_DESTROY(cl->cl_auth);
4321                         CLNT_DESTROY(cl);
4322                         if (status == RPC_SUCCESS) {
4323                                 if (svp == mi->mi_curr_serv) {
4324 #ifdef DEBUG
4325                                         zcmn_err(zoneid, CE_NOTE,
4326                         "NFS%d: failing over: selecting original server %s",
4327                                             mi->mi_vers, svp->sv_hostname);
4328 #else
4329                                         zcmn_err(zoneid, CE_NOTE,
4330                         "NFS: failing over: selecting original server %s",
4331                                             svp->sv_hostname);
4332 #endif
4333                                 } else {
4334 #ifdef DEBUG
4335                                         zcmn_err(zoneid, CE_NOTE,
4336                                     "NFS%d: failing over from %s to %s",
4337                                             mi->mi_vers,
4338                                             mi->mi_curr_serv->sv_hostname,
4339                                             svp->sv_hostname);
4340 #else
4341                                         zcmn_err(zoneid, CE_NOTE,
4342                                     "NFS: failing over from %s to %s",
4343                                             mi->mi_curr_serv->sv_hostname,
4344                                             svp->sv_hostname);
4345 #endif
4346                                 }
4347                                 break;
4348                         }
4349                 }
4350 
4351                 if (svp == NULL) {
4352                         if (!oncethru) {
4353                                 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4354 #ifdef DEBUG
4355                                 zprintf(zoneid,
4356                                     "NFS%d servers %s not responding "
4357                                     "still trying\n", mi->mi_vers, srvnames);
4358 #else
4359                                 zprintf(zoneid, "NFS servers %s not responding "
4360                                     "still trying\n", srvnames);
4361 #endif
4362                                 oncethru = 1;
4363                         }
4364                         mutex_enter(&mi->mi_lock);
4365                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
4366                         mutex_exit(&mi->mi_lock);
4367                         delay(hz);
4368                         mutex_enter(&mi->mi_lock);
4369                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4370                         mutex_exit(&mi->mi_lock);
4371                 }
4372         }
4373 
4374         if (oncethru) {
4375 #ifdef DEBUG
4376                 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4377 #else
4378                 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4379 #endif
4380         }
4381 
4382         if (svp != mi->mi_curr_serv) {
4383                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4384                 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4385                 rw_enter(&rtable[index].r_lock, RW_WRITER);
4386                 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4387                     mi->mi_vfsp);
4388                 if (rp != NULL) {
4389                         if (rp->r_flags & RHASHED)
4390                                 rp_rmhash_locked(rp);
4391                         rw_exit(&rtable[index].r_lock);
4392                         rp->r_server = svp;
4393                         rp->r_fh = svp->sv_fhandle;
4394                         (void) nfs_free_data_reclaim(rp);
4395                         index = rtablehash(&rp->r_fh);
4396                         rp->r_hashq = &rtable[index];
4397                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4398                         vn_exists(RTOV(rp));
4399                         rp_addhash(rp);
4400                         rw_exit(&rp->r_hashq->r_lock);
4401                         VN_RELE(RTOV(rp));
4402                 } else
4403                         rw_exit(&rtable[index].r_lock);
4404         }
4405 
4406 done:
4407         if (oncethru)
4408                 kmem_free(srvnames, srvnames_len);
4409         mutex_enter(&mi->mi_lock);
4410         mi->mi_flags &= ~MI_BINDINPROG;
4411         if (svp != NULL) {
4412                 mi->mi_curr_serv = svp;
4413                 mi->mi_failover++;
4414 #ifdef DEBUG
4415         nfscl->nfscl_stat.failover.value.ui64++;
4416 #endif
4417         }
4418         cv_broadcast(&mi->mi_failover_cv);
4419         CALLB_CPR_EXIT(&cprinfo);
4420         VFS_RELE(mi->mi_vfsp);
4421         zthread_exit();
4422         /* NOTREACHED */
4423 }
4424 
4425 /*
4426  * NFS client failover support
4427  *
4428  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4429  * is cleared, meaning that failover is complete.  Called with
4430  * mi_lock mutex held.
4431  */
4432 static int
4433 failover_wait(mntinfo_t *mi)
4434 {
4435         k_sigset_t smask;
4436 
4437         /*
4438          * If someone else is hunting for a living server,
4439          * sleep until it's done.  After our sleep, we may
4440          * be bound to the right server and get off cheaply.
4441          */
4442         while (mi->mi_flags & MI_BINDINPROG) {
4443                 /*
4444                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4445                  * and SIGTERM (preserving the existing masks).  Mask out
4446                  * SIGINT if the nointr mount option is specified.
4447                  */
4448                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4449                 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4450                         /*
4451                          * restore original signal mask
4452                          */
4453                         sigunintr(&smask);
4454                         return (EINTR);
4455                 }
4456                 /*
4457                  * restore original signal mask
4458                  */
4459                 sigunintr(&smask);
4460         }
4461         return (0);
4462 }
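
     /*
      * Illustrative usage sketch (not part of this file): a typical
      * caller takes mi_lock, waits for any failover already in
      * progress to finish, and bails out if the wait was interrupted:
      *
      *	mutex_enter(&mi->mi_lock);
      *	if (failover_wait(mi)) {
      *		mutex_exit(&mi->mi_lock);
      *		return (EINTR);
      *	}
      *	mutex_exit(&mi->mi_lock);
      */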
4463 
4464 /*
4465  * NFS client failover support
4466  *
4467  * failover_remap() will do a partial pathname lookup and find the
4468  * desired vnode on the current server.  The interim vnode will be
4469  * discarded after we pilfer the new filehandle.
4470  *
4471  * Side effects:
4472  * - This routine will also update the filehandle in the args structure
4473  *    pointed to by the fi->fhp pointer if it is non-NULL.
4474  */
4475 
4476 static int
4477 failover_remap(failinfo_t *fi)
4478 {
4479         vnode_t *vp, *nvp, *rootvp;
4480         rnode_t *rp, *nrp;
4481         mntinfo_t *mi;
4482         int error;
4483 #ifdef DEBUG
4484         struct nfs_clnt *nfscl;
4485 
4486         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4487         ASSERT(nfscl != NULL);
4488 #endif
4489         /*
4490          * Sanity check
4491          */
4492         if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4493                 return (EINVAL);
4494         vp = fi->vp;
4495         rp = VTOR(vp);
4496         mi = VTOMI(vp);
4497 
4498         if (!(vp->v_flag & VROOT)) {
4499                 /*
4500                  * Given the root fh, use the path stored in
4501                  * the rnode to find the fh for the new server.
4502                  */
4503                 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4504                 if (error)
4505                         return (error);
4506 
4507                 error = failover_lookup(rp->r_path, rootvp,
4508                     fi->lookupproc, fi->xattrdirproc, &nvp);
4509 
4510                 VN_RELE(rootvp);
4511 
4512                 if (error)
4513                         return (error);
4514 
4515                 /*
4516                  * If we found the same rnode, we're done now
4517                  */
4518                 if (nvp == vp) {
4519                         /*
4520                          * The failed server and the new server may be the same
4521                          * physical machine or may share the same disk subsystem.
4522                          * In that case the filehandle for a given file path does
4523                          * not change, so the same filehandle lookup will always
4524                          * locate the same rnode as the existing one.  All we may
4525                          * need to do is update r_server with the current
4526                          * servinfo.
4527                          */
4528                         if (!VALID_FH(fi)) {
4529                                 rp->r_server = mi->mi_curr_serv;
4530                         }
4531                         VN_RELE(nvp);
4532                         return (0);
4533                 }
4534 
4535                 /*
4536                  * Try to make it so that no one else will find this
4537                  * vnode because it is just a temporary to hold the
4538                  * new file handle until that file handle can be
4539                  * copied to the original vnode/rnode.
4540                  */
4541                 nrp = VTOR(nvp);
4542                 mutex_enter(&mi->mi_remap_lock);
4543                 /*
4544                  * Some other thread could have raced in here and already
4545                  * done the remap for this particular rnode before we got
4546                  * here.  Compare rp->r_server with mi->mi_curr_serv and
4547                  * return if they are the same.
4548                  */
4549                 if (VALID_FH(fi)) {
4550                         mutex_exit(&mi->mi_remap_lock);
4551                         VN_RELE(nvp);
4552                         return (0);
4553                 }
4554 
4555                 if (nrp->r_flags & RHASHED)
4556                         rp_rmhash(nrp);
4557 
4558                 /*
4559                  * As a heuristic check on the validity of the new
4560                  * file, check that its size and type match those we
4561                  * remember from the old version.
4562                  */
4563                 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4564                         mutex_exit(&mi->mi_remap_lock);
4565                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4566                             "NFS replicas %s and %s: file %s not same.",
4567                             rp->r_server->sv_hostname,
4568                             nrp->r_server->sv_hostname, rp->r_path);
4569                         VN_RELE(nvp);
4570                         return (EINVAL);
4571                 }
4572 
4573                 /*
4574                  * Snarf the filehandle from the new rnode, then release
4575                  * it, updating the hash queues for the rnode along the
4576                  * way.
4577                  */
4578                 if (rp->r_flags & RHASHED)
4579                         rp_rmhash(rp);
4580                 rp->r_server = mi->mi_curr_serv;
4581                 rp->r_fh = nrp->r_fh;
4582                 rp->r_hashq = nrp->r_hashq;
4583                 /*
4584                  * Copy the attributes from the new rnode to the old
4585                  * rnode.  This will help to reduce unnecessary page
4586                  * cache flushes.
4587                  */
4588                 rp->r_attr = nrp->r_attr;
4589                 rp->r_attrtime = nrp->r_attrtime;
4590                 rp->r_mtime = nrp->r_mtime;
4591                 (void) nfs_free_data_reclaim(rp);
4592                 nfs_setswaplike(vp, &rp->r_attr);
4593                 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4594                 rp_addhash(rp);
4595                 rw_exit(&rp->r_hashq->r_lock);
4596                 mutex_exit(&mi->mi_remap_lock);
4597                 VN_RELE(nvp);
4598         }
4599 
4600         /*
4601          * Update successful failover remap count
4602          */
4603         mutex_enter(&mi->mi_lock);
4604         mi->mi_remap++;
4605         mutex_exit(&mi->mi_lock);
4606 #ifdef DEBUG
4607         nfscl->nfscl_stat.remap.value.ui64++;
4608 #endif
4609 
4610         /*
4611          * If we have a copied filehandle to update, do it now.
4612          */
4613         if (fi->fhp != NULL && fi->copyproc != NULL)
4614                 (*fi->copyproc)(fi->fhp, vp);
4615 
4616         return (0);
4617 }
4618 
4619 /*
4620  * NFS client failover support
4621  *
4622  * We want a simple pathname lookup routine to parse the pieces
4623  * of path in rp->r_path.  We know that the path was created
4624  * as rnodes were made, so we know we only have to deal with
4625  * paths that look like:
4626  *      dir1/dir2/dir3/file
4627  * Any evidence of anything like .., symlinks, or ENOTDIR
4628  * is a hard error, because it means something in this filesystem
4629  * is different from the one we came from, or has changed under
4630  * us in some way.  If so, we want the failure.
4631  *
4632  * Extended attributes: if the filesystem is mounted with extended
4633  * attributes enabled (-o xattr), the attribute directory will be
4634  * represented in the r_path as the magic name XATTR_RPATH. So if
4635  * we see that name in the pathname, it must be because this node
4636  * is an extended attribute.  Therefore, look it up that way.
4637  */
4638 static int
4639 failover_lookup(char *path, vnode_t *root,
4640     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4641     vnode_t *, cred_t *, int),
4642     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4643     vnode_t **new)
4644 {
4645         vnode_t *dvp, *nvp;
4646         int error = EINVAL;
4647         char *s, *p, *tmppath;
4648         size_t len;
4649         mntinfo_t *mi;
4650         bool_t xattr;
4651 
4652         /* Make local copy of path */
4653         len = strlen(path) + 1;
4654         tmppath = kmem_alloc(len, KM_SLEEP);
4655         (void) strcpy(tmppath, path);
4656         s = tmppath;
4657 
4658         dvp = root;
4659         VN_HOLD(dvp);
4660         mi = VTOMI(root);
4661         xattr = mi->mi_flags & MI_EXTATTR;
4662 
4663         do {
4664                 p = strchr(s, '/');
4665                 if (p != NULL)
4666                         *p = '\0';
4667                 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4668                         error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4669                             RFSCALL_SOFT);
4670                 } else {
4671                         error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4672                             CRED(), RFSCALL_SOFT);
4673                 }
4674                 if (p != NULL)
4675                         *p++ = '/';
4676                 if (error) {
4677                         VN_RELE(dvp);
4678                         kmem_free(tmppath, len);
4679                         return (error);
4680                 }
4681                 s = p;
4682                 VN_RELE(dvp);
4683                 dvp = nvp;
4684         } while (p != NULL);
4685 
4686         if (nvp != NULL && new != NULL)
4687                 *new = nvp;
4688         kmem_free(tmppath, len);
4689         return (0);
4690 }
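
     /*
      * Example (sketch only; v1, v2 and v3 are illustrative names):
      * for an r_path of "dir1/dir2/file" the loop above issues three
      * lookups relative to the root vnode,
      *
      *	(*lookupproc)(root, "dir1", &v1, NULL, 0, NULL, CRED(), RFSCALL_SOFT);
      *	(*lookupproc)(v1, "dir2", &v2, ...);
      *	(*lookupproc)(v2, "file", &v3, ...);
      *
      * releasing each intermediate directory vnode as it goes, so that
      * only the final vnode is still held when it is returned via *new.
      */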
4691 
4692 /*
4693  * NFS client failover support
4694  *
4695  * sv_free() frees the malloc'd portion of a "servinfo_t".
4696  */
4697 void
4698 sv_free(servinfo_t *svp)
4699 {
4700         servinfo_t *next;
4701         struct knetconfig *knconf;
4702 
4703         while (svp != NULL) {
4704                 next = svp->sv_next;
4705                 if (svp->sv_secdata)
4706                         sec_clnt_freeinfo(svp->sv_secdata);
4707                 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4708                         kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4709                 knconf = svp->sv_knconf;
4710                 if (knconf != NULL) {
4711                         if (knconf->knc_protofmly != NULL)
4712                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4713                         if (knconf->knc_proto != NULL)
4714                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4715                         kmem_free(knconf, sizeof (*knconf));
4716                 }
4717                 knconf = svp->sv_origknconf;
4718                 if (knconf != NULL) {
4719                         if (knconf->knc_protofmly != NULL)
4720                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4721                         if (knconf->knc_proto != NULL)
4722                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4723                         kmem_free(knconf, sizeof (*knconf));
4724                 }
4725                 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4726                         kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4727                 mutex_destroy(&svp->sv_lock);
4728                 kmem_free(svp, sizeof (*svp));
4729                 svp = next;
4730         }
4731 }
4732 
4733 /*
4734  * Can only return non-zero if intr != 0.
4735  */
4736 int
4737 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4738 {
4739 
4740         mutex_enter(&l->lock);
4741 
4742         /*
4743          * If this is a nested enter, then allow it.  There
4744          * must be as many exits as enters, though.
4745          */
4746         if (l->owner == curthread) {
4747                 /* lock is held for writing by current thread */
4748                 ASSERT(rw == RW_READER || rw == RW_WRITER);
4749                 l->count--;
4750         } else if (rw == RW_READER) {
4751                 /*
4752                  * While there is a writer active or writers waiting,
4753                  * then wait for them to finish up and move on.  Then,
4754                  * increment the count to indicate that a reader is
4755                  * active.
4756                  */
4757                 while (l->count < 0 || l->waiters > 0) {
4758                         if (intr) {
4759                                 klwp_t *lwp = ttolwp(curthread);
4760 
4761                                 if (lwp != NULL)
4762                                         lwp->lwp_nostop++;
4763                                 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4764                                         if (lwp != NULL)
4765                                                 lwp->lwp_nostop--;
4766                                         mutex_exit(&l->lock);
4767                                         return (EINTR);
4768                                 }
4769                                 if (lwp != NULL)
4770                                         lwp->lwp_nostop--;
4771                         } else
4772                                 cv_wait(&l->cv_rd, &l->lock);
4773                 }
4774                 ASSERT(l->count < INT_MAX);
4775 #ifdef  DEBUG
4776                 if ((l->count % 10000) == 9999)
4777                         cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4778                             "rwlock @ %p\n", l->count, (void *)l);
4779 #endif
4780                 l->count++;
4781         } else {
4782                 ASSERT(rw == RW_WRITER);
4783                 /*
4784                  * While there are readers active or a writer
4785                  * active, then wait for all of the readers
4786                  * to finish or for the writer to finish.
4787                  * Then, set the owner field to curthread and
4788                  * decrement count to indicate that a writer
4789                  * is active.
4790                  */
4791                 while (l->count != 0) {
4792                         l->waiters++;
4793                         if (intr) {
4794                                 klwp_t *lwp = ttolwp(curthread);
4795 
4796                                 if (lwp != NULL)
4797                                         lwp->lwp_nostop++;
4798                                 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4799                                         if (lwp != NULL)
4800                                                 lwp->lwp_nostop--;
4801                                         l->waiters--;
4802                                         /*
4803                                          * If there are readers active and no
4804                                          * writers waiting then wake up all of
4805                                          * the waiting readers (if any).
4806                                          */
4807                                         if (l->count > 0 && l->waiters == 0)
4808                                                 cv_broadcast(&l->cv_rd);
4809                                         mutex_exit(&l->lock);
4810                                         return (EINTR);
4811                                 }
4812                                 if (lwp != NULL)
4813                                         lwp->lwp_nostop--;
4814                         } else
4815                                 cv_wait(&l->cv, &l->lock);
4816                         l->waiters--;
4817                 }
4818                 ASSERT(l->owner == NULL);
4819                 l->owner = curthread;
4820                 l->count--;
4821         }
4822 
4823         mutex_exit(&l->lock);
4824 
4825         return (0);
4826 }
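
     /*
      * Usage sketch (illustrative only): callers typically bracket an
      * operation with enter/exit and honor an interrupted wait; e.g.,
      * on an rnode's r_rwlock, with "intr" standing for whatever
      * interruptibility flag the caller uses:
      *
      *	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, intr))
      *		return (EINTR);
      *	... read-side work on the rnode ...
      *	nfs_rw_exit(&rp->r_rwlock);
      */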
4827 
4828 /*
4829  * If the lock is available, obtain it and return non-zero.  If there is
4830  * already a conflicting lock, return 0 immediately.
4831  */
4832 
4833 int
4834 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4835 {
4836         mutex_enter(&l->lock);
4837 
4838         /*
4839          * If this is a nested enter, then allow it.  There
4840          * must be as many exits as enters, though.
4841          */
4842         if (l->owner == curthread) {
4843                 /* lock is held for writing by current thread */
4844                 ASSERT(rw == RW_READER || rw == RW_WRITER);
4845                 l->count--;
4846         } else if (rw == RW_READER) {
4847                 /*
4848                  * If there is a writer active or writers waiting, deny the
4849                  * lock.  Otherwise, bump the count of readers.
4850                  */
4851                 if (l->count < 0 || l->waiters > 0) {
4852                         mutex_exit(&l->lock);
4853                         return (0);
4854                 }
4855                 l->count++;
4856         } else {
4857                 ASSERT(rw == RW_WRITER);
4858                 /*
4859                  * If there are readers active or a writer active, deny the
4860                  * lock.  Otherwise, set the owner field to curthread and
4861                  * decrement count to indicate that a writer is active.
4862                  */
4863                 if (l->count != 0) {
4864                         mutex_exit(&l->lock);
4865                         return (0);
4866                 }
4867                 ASSERT(l->owner == NULL);
4868                 l->owner = curthread;
4869                 l->count--;
4870         }
4871 
4872         mutex_exit(&l->lock);
4873 
4874         return (1);
4875 }
4876 
4877 void
4878 nfs_rw_exit(nfs_rwlock_t *l)
4879 {
4880 
4881         mutex_enter(&l->lock);
4882 
4883         if (l->owner != NULL) {
4884                 ASSERT(l->owner == curthread);
4885 
4886                 /*
4887                  * To release a writer lock increment count to indicate that
4888                  * there is one less writer active.  If this was the last of
4889                  * possibly nested writer locks, then clear the owner field as
4890                  * well to indicate that there is no writer active.
4891                  */
4892                 ASSERT(l->count < 0);
4893                 l->count++;
4894                 if (l->count == 0) {
4895                         l->owner = NULL;
4896 
4897                         /*
4898                          * If there are no writers waiting then wakeup all of
4899                          * the waiting readers (if any).
4900                          */
4901                         if (l->waiters == 0)
4902                                 cv_broadcast(&l->cv_rd);
4903                 }
4904         } else {
4905                 /*
4906                  * To release a reader lock just decrement count to indicate
4907                  * that there is one less reader active.
4908                  */
4909                 ASSERT(l->count > 0);
4910                 l->count--;
4911         }
4912 
4913         /*
4914          * If there is neither a reader nor a writer active and a writer is
4915          * waiting, we need to wake it up.
4916          */
4917         if (l->count == 0 && l->waiters > 0)
4918                 cv_signal(&l->cv);
4919         mutex_exit(&l->lock);
4920 }
4921 
4922 int
4923 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4924 {
4925 
4926         if (rw == RW_READER)
4927                 return (l->count > 0);
4928         ASSERT(rw == RW_WRITER);
4929         return (l->count < 0);
4930 }
4931 
4932 /* ARGSUSED */
4933 void
4934 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4935 {
4936 
4937         l->count = 0;
4938         l->waiters = 0;
4939         l->owner = NULL;
4940         mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4941         cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4942         cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4943 }
4944 
4945 void
4946 nfs_rw_destroy(nfs_rwlock_t *l)
4947 {
4948 
4949         mutex_destroy(&l->lock);
4950         cv_destroy(&l->cv);
4951         cv_destroy(&l->cv_rd);
4952 }
4953 
4954 int
4955 nfs3_rddir_compar(const void *x, const void *y)
4956 {
4957         rddir_cache *a = (rddir_cache *)x;
4958         rddir_cache *b = (rddir_cache *)y;
4959 
4960         if (a->nfs3_cookie == b->nfs3_cookie) {
4961                 if (a->buflen == b->buflen)
4962                         return (0);
4963                 if (a->buflen < b->buflen)
4964                         return (-1);
4965                 return (1);
4966         }
4967 
4968         if (a->nfs3_cookie < b->nfs3_cookie)
4969                 return (-1);
4970 
4971         return (1);
4972 }
4973 
4974 int
4975 nfs_rddir_compar(const void *x, const void *y)
4976 {
4977         rddir_cache *a = (rddir_cache *)x;
4978         rddir_cache *b = (rddir_cache *)y;
4979 
4980         if (a->nfs_cookie == b->nfs_cookie) {
4981                 if (a->buflen == b->buflen)
4982                         return (0);
4983                 if (a->buflen < b->buflen)
4984                         return (-1);
4985                 return (1);
4986         }
4987 
4988         if (a->nfs_cookie < b->nfs_cookie)
4989                 return (-1);
4990 
4991         return (1);
4992 }
4993 
4994 static char *
4995 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4996 {
4997         servinfo_t *s;
4998         char *srvnames;
4999         char *namep;
5000         size_t length;
5001 
5002         /*
5003          * Calculate the length of the string required to hold all
5004          * of the server names plus either a comma or a null
5005          * character following each individual one.
5006          */
5007         length = 0;
5008         for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5009                 length += s->sv_hostnamelen;
5010 
5011         srvnames = kmem_alloc(length, KM_SLEEP);
5012 
5013         namep = srvnames;
5014         for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5015                 (void) strcpy(namep, s->sv_hostname);
5016                 namep += s->sv_hostnamelen - 1;
5017                 *namep++ = ',';
5018         }
5019         *--namep = '\0';
5020 
5021         *len = length;
5022 
5023         return (srvnames);
5024 }
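
     /*
      * For example (sketch only), with mi_servers naming hosts "srva"
      * and "srvb" (each sv_hostnamelen counting the terminating NUL),
      * the buffer built above holds
      *
      *	"srva,srvb\0"
      *
      * i.e. a comma after every name except the last, whose trailing
      * comma is overwritten by the null terminator.
      */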
5025 
5026 /*
5027  * These two functions are temporary and designed for the upgrade-workaround
5028  * only.  They cannot be used for general zone-crossing NFS client support, and
5029  * will be removed shortly.
5030  *
5031  * When the workaround is enabled, all NFS traffic is forced into the global
5032  * zone.  These functions are called when the code needs to refer to the state
5033  * of the underlying network connection.  They're not called when the function
5034  * needs to refer to the state of the process that invoked the system call.
5035  * (E.g., when checking whether the zone is shutting down during the mount()
5036  * call.)
5037  */
5038 
5039 struct zone *
5040 nfs_zone(void)
5041 {
5042         return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5043 }
5044 
5045 zoneid_t
5046 nfs_zoneid(void)
5047 {
5048         return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5049 }
5050 
5051 /*
5052  * nfs_mount_label_policy:
5053  *      Determine whether the mount is allowed according to the MAC check,
5054  *      by comparing (where appropriate) the label of the remote server
5055  *      against the label of the zone being mounted into.
5056  *
5057  *      Returns:
5058  *               0 :    access allowed
5059  *              -1 :    read-only access allowed (i.e., read-down)
5060  *              >0 :    error code, such as EACCES
5061  */
5062 int
5063 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5064     struct knetconfig *knconf, cred_t *cr)
5065 {
5066         int             addr_type;
5067         void            *ipaddr;
5068         bslabel_t       *server_sl, *mntlabel;
5069         zone_t          *mntzone = NULL;
5070         ts_label_t      *zlabel;
5071         tsol_tpc_t      *tp;
5072         ts_label_t      *tsl = NULL;
5073         int             retv;
5074 
5075         /*
5076          * Get the zone's label.  Each zone on a labeled system has a label.
5077          */
5078         mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5079         zlabel = mntzone->zone_slabel;
5080         ASSERT(zlabel != NULL);
5081         label_hold(zlabel);
5082 
5083         if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5084                 addr_type = IPV4_VERSION;
5085                 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5086         } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5087                 addr_type = IPV6_VERSION;
5088                 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5089         } else {
5090                 retv = 0;
5091                 goto out;
5092         }
5093 
5094         retv = EACCES;                          /* assume the worst */
5095 
5096         /*
5097          * Next, get the assigned label of the remote server.
5098          */
5099         tp = find_tpc(ipaddr, addr_type, B_FALSE);
5100         if (tp == NULL)
5101                 goto out;                       /* error getting host entry */
5102 
5103         if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5104                 goto rel_tpc;                   /* invalid domain */
5105         if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5106             (tp->tpc_tp.host_type != UNLABELED))
5107                 goto rel_tpc;                   /* invalid hosttype */
5108 
5109         if (tp->tpc_tp.host_type == SUN_CIPSO) {
5110                 tsl = getflabel_cipso(vfsp);
5111                 if (tsl == NULL)
5112                         goto rel_tpc;           /* error getting server lbl */
5113 
5114                 server_sl = label2bslabel(tsl);
5115         } else {        /* UNLABELED */
5116                 server_sl = &tp->tpc_tp.tp_def_label;
5117         }
5118 
5119         mntlabel = label2bslabel(zlabel);
5120 
5121         /*
5122          * Now compare labels to complete the MAC check.  If the labels
5123          * are equal or if the requestor is in the global zone and has
5124          * NET_MAC_AWARE, then allow read-write access.   (Except for
5125          * mounts into the global zone itself; restrict these to
5126          * read-only.)
5127          *
5128          * If the requestor is in some other zone, but their label
5129          * dominates the server, then allow read-down.
5130          *
5131          * Otherwise, access is denied.
5132          */
5133         if (blequal(mntlabel, server_sl) ||
5134             (crgetzoneid(cr) == GLOBAL_ZONEID &&
5135             getpflags(NET_MAC_AWARE, cr) != 0)) {
5136                 if ((mntzone == global_zone) ||
5137                     !blequal(mntlabel, server_sl))
5138                         retv = -1;              /* read-only */
5139                 else
5140                         retv = 0;               /* access OK */
5141         } else if (bldominates(mntlabel, server_sl)) {
5142                 retv = -1;                      /* read-only */
5143         } else {
5144                 retv = EACCES;
5145         }
5146 
5147         if (tsl != NULL)
5148                 label_rele(tsl);
5149 
5150 rel_tpc:
5151         TPC_RELE(tp);
5152 out:
5153         if (mntzone)
5154                 zone_rele(mntzone);
5155         label_rele(zlabel);
5156         return (retv);
5157 }
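
     /*
      * Illustrative caller sketch (not part of this file): mount code
      * is expected to map the tri-state result onto mount behavior,
      * roughly as follows (assuming the standard vfs_setmntopt()
      * interface):
      *
      *	error = nfs_mount_label_policy(vfsp, addr, knconf, cr);
      *	if (error > 0)
      *		return (error);
      *	if (error == -1)
      *		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
      */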
5158 
5159 boolean_t
5160 nfs_has_ctty(void)
5161 {
5162         boolean_t rv;
5163         mutex_enter(&curproc->p_splock);
5164         rv = (curproc->p_sessp->s_vp != NULL);
5165         mutex_exit(&curproc->p_splock);
5166         return (rv);
5167 }
5168 
5169 /*
5170  * Scan the xattr directory to see if it has any generic user attributes.
5171  */
5172 int
5173 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5174 {
5175         struct uio uio;
5176         struct iovec iov;
5177         char *dbuf;
5178         struct dirent64 *dp;
5179         size_t dlen = 8 * 1024;
5180         size_t dbuflen;
5181         int eof = 0;
5182         int error;
5183 
5184         *valp = 0;
5185         dbuf = kmem_alloc(dlen, KM_SLEEP);
5186         uio.uio_iov = &iov;
5187         uio.uio_iovcnt = 1;
5188         uio.uio_segflg = UIO_SYSSPACE;
5189         uio.uio_fmode = 0;
5190         uio.uio_extflg = UIO_COPY_CACHED;
5191         uio.uio_loffset = 0;
5192         uio.uio_resid = dlen;
5193         iov.iov_base = dbuf;
5194         iov.iov_len = dlen;
5195         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5196         error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5197         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5198 
5199         dbuflen = dlen - uio.uio_resid;
5200 
5201         if (error || dbuflen == 0) {
5202                 kmem_free(dbuf, dlen);
5203                 return (error);
5204         }
5205 
5206         dp = (dirent64_t *)dbuf;
5207 
5208         while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5209                 if (strcmp(dp->d_name, ".") == 0 ||
5210                     strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5211                     VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5212                     VIEW_READONLY) == 0) {
5213                         dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5214                         continue;
5215                 }
5216 
5217                 *valp = 1;
5218                 break;
5219         }
5220         kmem_free(dbuf, dlen);
5221         return (0);
5222 }
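
     /*
      * Usage sketch (illustrative only; xattr_dvp is a hypothetical
      * name for an already-held hidden attribute directory vnode): a
      * pathconf-style caller might ask whether any user-visible
      * entries exist in that directory:
      *
      *	ulong_t exists;
      *
      *	error = do_xattr_exists_check(xattr_dvp, &exists, cr);
      *	if (error == 0 && exists)
      *		... report that extended attributes are present ...
      */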