   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  29  * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
  30  */
  31 
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/proc.h>
  37 #include <sys/user.h>
  38 #include <sys/time.h>
  39 #include <sys/buf.h>
  40 #include <sys/vfs.h>
  41 #include <sys/vnode.h>
  42 #include <sys/socket.h>
  43 #include <sys/uio.h>
  44 #include <sys/tiuser.h>
  45 #include <sys/swap.h>
  46 #include <sys/errno.h>
  47 #include <sys/debug.h>
  48 #include <sys/kmem.h>
  49 #include <sys/kstat.h>
  50 #include <sys/cmn_err.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/session.h>
  53 #include <sys/dnlc.h>
  54 #include <sys/bitmap.h>
  55 #include <sys/acl.h>
  56 #include <sys/ddi.h>
  57 #include <sys/pathname.h>
  58 #include <sys/flock.h>
  59 #include <sys/dirent.h>
  61 #include <sys/callb.h>
  62 #include <sys/atomic.h>
  63 #include <sys/list.h>
  64 #include <sys/tsol/tnet.h>
  65 #include <sys/priv.h>
  66 #include <sys/sdt.h>
  67 #include <sys/attr.h>
  68 
  69 #include <inet/ip6.h>
  70 
  71 #include <rpc/types.h>
  72 #include <rpc/xdr.h>
  73 #include <rpc/auth.h>
  74 #include <rpc/clnt.h>
  75 
  76 #include <nfs/nfs.h>
  77 #include <nfs/nfs4.h>
  78 #include <nfs/nfs_clnt.h>
  79 #include <nfs/rnode.h>
  80 #include <nfs/nfs_acl.h>
  81 
  82 #include <sys/tsol/label.h>
  83 
  84 /*
  85  * The hash queues for access to active and cached rnodes
  86  * are organized as doubly linked lists.  A reader/writer lock
  87  * for each hash bucket is used to control access and to synchronize
  88  * lookups, additions, and deletions from the hash queue.
  89  *
  90  * The rnode freelist is organized as a doubly linked list with
  91  * a head pointer.  Additions and deletions are synchronized via
  92  * a single mutex.
  93  *
  94  * In order to add an rnode to the freelist, it must be hashed into
  95  * a hash queue and the exclusive lock for that hash queue must be
  96  * held.  If an rnode is not hashed into a hash queue, then it is
  97  * destroyed because it holds no information about the file that is
  98  * worth reusing.  The exclusive lock on the hash queue must be held
  99  * in order to prevent a lookup in the hash queue from finding the
 100  * rnode, using it, and assuming that the rnode is not on the
 101  * freelist.  The lookup in the hash queue will have the hash queue
 102  * locked, either exclusive or shared.
 103  *
 104  * The vnode reference count for each rnode is not allowed to drop
 105  * below 1.  This prevents external entities, such as the VM
 106  * subsystem, from acquiring references to vnodes already on the
 107  * freelist and then trying to place them back on the freelist
 108  * when their reference is released.  This means that when an
 109  * rnode is looked up in the hash queues, either the rnode is
 110  * removed from the freelist and that reference is transferred to
 111  * the new holder, or the vnode reference count must be incremented
 112  * accordingly.  The mutex for the freelist must be held in order to
 113  * accurately test whether the rnode is on the freelist or not.
 114  * The hash queue lock might be held shared, so it is possible that
 115  * two different threads may race to remove the rnode from the
 116  * freelist.  This race can be resolved by holding the mutex for the
 117  * freelist.  Please note that the mutex for the freelist does not
 118  * need to be held if the rnode is not on the freelist.  It cannot
 119  * be placed on the freelist due to the requirement that the thread
 120  * putting the rnode on the freelist must hold the exclusive lock
 121  * on the hash queue while the thread doing the lookup in the hash
 122  * queue holds either a shared or exclusive lock on the hash
 123  * queue.
 124  *
 125  * The lock ordering is:
 126  *
 127  *      hash bucket lock -> vnode lock
 128  *      hash bucket lock -> freelist lock
 129  */
 130 static rhashq_t *rtable;
 131 
 132 static kmutex_t rpfreelist_lock;
 133 static rnode_t *rpfreelist = NULL;
 134 static long rnew = 0;
 135 volatile long nrnode = 0;
 136 
 137 static int rtablesize;
 138 static int rtablemask;
 139 
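     /*
      * Desired average hash chain length.  The rnode hash table size
      * (rtablesize) is presumably derived from nrnode / hashlen at
      * initialization.
      */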
 140 static int hashlen = 4;
 141 
 142 static struct kmem_cache *rnode_cache;
 143 
 144 /*
 145  * Mutex to protect the following variables:
 146  *      nfs_major
 147  *      nfs_minor
 148  */
 149 kmutex_t nfs_minor_lock;
 150 int nfs_major;
 151 int nfs_minor;
 152 
 153 /*
 154  * Do we allow preepoch (negative) time values otw?
 155  * default: do not allow preepoch
 156  */
 157 volatile bool_t nfs_allow_preepoch_time = FALSE;
 158 
 159 /*
 160  * Access cache
 161  */
 162 static acache_hash_t *acache;
 163 volatile long nacache;  /* used strictly to size the number of hash queues */
 164 
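     /*
      * Number of buckets and index mask of the access cache hash table,
      * presumably sized from nacache at initialization in the same way
      * rtablesize/rtablemask are sized from nrnode.
      */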
 165 static int acachesize;
 166 static int acachemask;
 167 static struct kmem_cache *acache_cache;
 168 
 169 /*
 170  * Client side utilities
 171  */
 172 
 173 /*
 174  * client side statistics
 175  */
 176 static const struct clstat clstat_tmpl = {
 177         { "calls",      KSTAT_DATA_UINT64 },
 178         { "badcalls",   KSTAT_DATA_UINT64 },
 179         { "clgets",     KSTAT_DATA_UINT64 },
 180         { "cltoomany",  KSTAT_DATA_UINT64 },
 181 #ifdef DEBUG
 182         { "clalloc",    KSTAT_DATA_UINT64 },
 183         { "noresponse", KSTAT_DATA_UINT64 },
 184         { "failover",   KSTAT_DATA_UINT64 },
 185         { "remap",      KSTAT_DATA_UINT64 },
 186 #endif
 187 };
 188 
 189 /*
 190  * The following are statistics that describe the behavior of the
 191  * system as a whole and don't correspond to any one particular zone.
 192  */
 193 #ifdef DEBUG
 194 static struct clstat_debug {
 195         kstat_named_t   nrnode;                 /* number of allocated rnodes */
 196         kstat_named_t   access;                 /* size of access cache */
 197         kstat_named_t   dirent;                 /* size of readdir cache */
 198         kstat_named_t   dirents;                /* size of readdir buf cache */
 199         kstat_named_t   reclaim;                /* number of reclaims */
 200         kstat_named_t   clreclaim;              /* number of cl reclaims */
 201         kstat_named_t   f_reclaim;              /* number of free reclaims */
 202         kstat_named_t   a_reclaim;              /* number of active reclaims */
 203         kstat_named_t   r_reclaim;              /* number of rnode reclaims */
 204         kstat_named_t   rpath;                  /* bytes used to store rpaths */
 205 } clstat_debug = {
 206         { "nrnode",     KSTAT_DATA_UINT64 },
 207         { "access",     KSTAT_DATA_UINT64 },
 208         { "dirent",     KSTAT_DATA_UINT64 },
 209         { "dirents",    KSTAT_DATA_UINT64 },
 210         { "reclaim",    KSTAT_DATA_UINT64 },
 211         { "clreclaim",  KSTAT_DATA_UINT64 },
 212         { "f_reclaim",  KSTAT_DATA_UINT64 },
 213         { "a_reclaim",  KSTAT_DATA_UINT64 },
 214         { "r_reclaim",  KSTAT_DATA_UINT64 },
 215         { "r_path",     KSTAT_DATA_UINT64 },
 216 };
 217 #endif  /* DEBUG */
 218 
 219 /*
 220  * We keep a global list of per-zone client data, so we can clean up all zones
 221  * if we get low on memory.
 222  */
 223 static list_t nfs_clnt_list;
 224 static kmutex_t nfs_clnt_list_lock;
 225 static zone_key_t nfsclnt_zone_key;
 226 
 227 static struct kmem_cache *chtab_cache;
 228 
 229 /*
 230  * Some servers do not properly update the attributes of the
 231  * directory when changes are made.  To allow interoperability
 232  * with these broken servers, the nfs_disable_rddir_cache
 233  * parameter must be set in /etc/system
 234  */
 235 volatile int nfs_disable_rddir_cache = 0;
 236 
 237 int             clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
 238                     struct chtab **);
 239 void            clfree(CLIENT *, struct chtab *);
 240 static int      acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
 241                     struct chtab **, struct nfs_clnt *);
 242 static int      nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
 243                     struct chtab **, struct nfs_clnt *);
 244 static void     clreclaim(void *);
 245 static int      nfs_feedback(int, int, mntinfo_t *);
 246 static int      rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
 247                     caddr_t, cred_t *, int *, enum clnt_stat *, int,
 248                     failinfo_t *);
 249 static int      aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
 250                     caddr_t, cred_t *, int *, int, failinfo_t *);
 251 static void     rinactive(rnode_t *, cred_t *);
 252 static int      rtablehash(nfs_fhandle *);
 253 static vnode_t  *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
 254                     struct vnodeops *,
 255                     int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
 256                         cred_t *),
 257                     int (*)(const void *, const void *), int *, cred_t *,
 258                     char *, char *);
 259 static void     rp_rmfree(rnode_t *);
 260 static void     rp_addhash(rnode_t *);
 261 static void     rp_rmhash_locked(rnode_t *);
 262 static rnode_t  *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
 263 static void     destroy_rnode(rnode_t *);
 264 static void     rddir_cache_free(rddir_cache *);
 265 static int      nfs_free_data_reclaim(rnode_t *);
 266 static int      nfs_active_data_reclaim(rnode_t *);
 267 static int      nfs_free_reclaim(void);
 268 static int      nfs_active_reclaim(void);
 269 static int      nfs_rnode_reclaim(void);
 270 static void     nfs_reclaim(void *);
 271 static int      failover_safe(failinfo_t *);
 272 static void     failover_newserver(mntinfo_t *mi);
 273 static void     failover_thread(mntinfo_t *mi);
 274 static int      failover_wait(mntinfo_t *);
 275 static int      failover_remap(failinfo_t *);
 276 static int      failover_lookup(char *, vnode_t *,
 277                     int (*)(vnode_t *, char *, vnode_t **,
 278                         struct pathname *, int, vnode_t *, cred_t *, int),
 279                     int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
 280                     vnode_t **);
 281 static void     nfs_free_r_path(rnode_t *);
 282 static void     nfs_set_vroot(vnode_t *);
 283 static char     *nfs_getsrvnames(mntinfo_t *, size_t *);
 284 
 285 /*
 286  * from rpcsec module (common/rpcsec)
 287  */
 288 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
 289 extern void sec_clnt_freeh(AUTH *);
 290 extern void sec_clnt_freeinfo(struct sec_data *);
 291 
 292 /*
 293  * used in mount policy
 294  */
 295 extern ts_label_t *getflabel_cipso(vfs_t *);
 296 
 297 /*
 298  * EIO and EINTR are not recoverable errors.
 299  */
 300 #define IS_RECOVERABLE_ERROR(error)     !((error == EINTR) || (error == EIO))
 301 
 302 #ifdef DEBUG
 303 #define SRV_QFULL_MSG   "send queue to NFS%d server %s is full; still trying\n"
 304 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
 305 #else
 306 #define SRV_QFULL_MSG   "send queue to NFS server %s is full still trying\n"
 307 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
 308 #endif
 309 /*
 310  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 311  */
 312 static int
 313 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 314     struct chtab **chp, struct nfs_clnt *nfscl)
 315 {
 316         struct chhead *ch, *newch;
 317         struct chhead **plistp;
 318         struct chtab *cp;
 319         int error;
 320         k_sigset_t smask;
 321 
 322         if (newcl == NULL || chp == NULL || ci == NULL)
 323                 return (EINVAL);
 324 
 325         *newcl = NULL;
 326         *chp = NULL;
 327 
 328         /*
 329          * Find an unused handle or create one
 330          */
 331         newch = NULL;
 332         nfscl->nfscl_stat.clgets.value.ui64++;
 333 top:
 334         /*
 335          * Find the correct entry in the cache to check for free
 336          * client handles.  The search is based on the RPC program
 337          * number, program version number, dev_t for the transport
 338          * device, and the protocol family.
 339          */
 340         mutex_enter(&nfscl->nfscl_chtable_lock);
 341         plistp = &nfscl->nfscl_chtable;
 342         for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
 343                 if (ch->ch_prog == ci->cl_prog &&
 344                     ch->ch_vers == ci->cl_vers &&
 345                     ch->ch_dev == svp->sv_knconf->knc_rdev &&
 346                     (strcmp(ch->ch_protofmly,
 347                     svp->sv_knconf->knc_protofmly) == 0))
 348                         break;
 349                 plistp = &ch->ch_next;
 350         }
 351 
 352         /*
 353          * If we didn't find a cache entry for this quadruple, then
 354          * create one.  If we don't have one already preallocated,
 355          * then drop the cache lock, create one, and then start over.
 356          * If we did have a preallocated entry, then just add it to
 357          * the front of the list.
 358          */
 359         if (ch == NULL) {
 360                 if (newch == NULL) {
 361                         mutex_exit(&nfscl->nfscl_chtable_lock);
 362                         newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
 363                         newch->ch_timesused = 0;
 364                         newch->ch_prog = ci->cl_prog;
 365                         newch->ch_vers = ci->cl_vers;
 366                         newch->ch_dev = svp->sv_knconf->knc_rdev;
 367                         newch->ch_protofmly = kmem_alloc(
 368                             strlen(svp->sv_knconf->knc_protofmly) + 1,
 369                             KM_SLEEP);
 370                         (void) strcpy(newch->ch_protofmly,
 371                             svp->sv_knconf->knc_protofmly);
 372                         newch->ch_list = NULL;
 373                         goto top;
 374                 }
 375                 ch = newch;
 376                 newch = NULL;
 377                 ch->ch_next = nfscl->nfscl_chtable;
 378                 nfscl->nfscl_chtable = ch;
 379         /*
 380          * We found a cache entry, but if it isn't on the front of the
 381          * list, then move it to the front of the list to try to take
 382          * advantage of locality of operations.
 383          */
 384         } else if (ch != nfscl->nfscl_chtable) {
 385                 *plistp = ch->ch_next;
 386                 ch->ch_next = nfscl->nfscl_chtable;
 387                 nfscl->nfscl_chtable = ch;
 388         }
 389 
 390         /*
 391          * If there was a free client handle cached, then remove it
 392          * from the list, init it, and use it.
 393          */
 394         if (ch->ch_list != NULL) {
 395                 cp = ch->ch_list;
 396                 ch->ch_list = cp->ch_list;
 397                 mutex_exit(&nfscl->nfscl_chtable_lock);
 398                 if (newch != NULL) {
 399                         kmem_free(newch->ch_protofmly,
 400                             strlen(newch->ch_protofmly) + 1);
 401                         kmem_free(newch, sizeof (*newch));
 402                 }
 403                 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
 404                     &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
 405                 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
 406                     &cp->ch_client->cl_auth);
 407                 if (error || cp->ch_client->cl_auth == NULL) {
 408                         CLNT_DESTROY(cp->ch_client);
 409                         kmem_cache_free(chtab_cache, cp);
 410                         return ((error != 0) ? error : EINTR);
 411                 }
 412                 ch->ch_timesused++;
 413                 *newcl = cp->ch_client;
 414                 *chp = cp;
 415                 return (0);
 416         }
 417 
 418         /*
 419          * There weren't any free client handles which fit, so allocate
 420          * a new one and use that.
 421          */
 422 #ifdef DEBUG
 423         atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
 424 #endif
 425         mutex_exit(&nfscl->nfscl_chtable_lock);
 426 
 427         nfscl->nfscl_stat.cltoomany.value.ui64++;
 428         if (newch != NULL) {
 429                 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
 430                 kmem_free(newch, sizeof (*newch));
 431         }
 432 
 433         cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
 434         cp->ch_head = ch;
 435 
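             /*
              * Mask signals around the (potentially blocking) transport
              * create so that it can be interrupted only when the mount
              * is interruptible (MI_INT); sigunintr() restores the
              * caller's signal mask.
              */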
 436         sigintr(&smask, (int)ci->cl_flags & MI_INT);
 437         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
 438             ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
 439         sigunintr(&smask);
 440 
 441         if (error != 0) {
 442                 kmem_cache_free(chtab_cache, cp);
 443 #ifdef DEBUG
 444                 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
 445 #endif
 446                 /*
 447                  * Warning is unnecessary if error is EINTR.
 448                  */
 449                 if (error != EINTR) {
 450                         nfs_cmn_err(error, CE_WARN,
 451                             "clget: couldn't create handle: %m\n");
 452                 }
 453                 return (error);
 454         }
 455         (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
 456         auth_destroy(cp->ch_client->cl_auth);
 457         error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
 458             &cp->ch_client->cl_auth);
 459         if (error || cp->ch_client->cl_auth == NULL) {
 460                 CLNT_DESTROY(cp->ch_client);
 461                 kmem_cache_free(chtab_cache, cp);
 462 #ifdef DEBUG
 463                 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
 464 #endif
 465                 return ((error != 0) ? error : EINTR);
 466         }
 467         ch->ch_timesused++;
 468         *newcl = cp->ch_client;
 469         ASSERT(cp->ch_client->cl_nosignal == FALSE);
 470         *chp = cp;
 471         return (0);
 472 }
 473 
 474 int
 475 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 476     struct chtab **chp)
 477 {
 478         struct nfs_clnt *nfscl;
 479 
 480         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
 481         ASSERT(nfscl != NULL);
 482 
 483         return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
 484 }
 485 
 486 static int
 487 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 488     struct chtab **chp, struct nfs_clnt *nfscl)
 489 {
 490         clinfo_t ci;
 491         int error;
 492 
 493         /*
 494          * Set read buffer size to rsize
 495          * and add room for RPC headers.
 496          */
 497         ci.cl_readsize = mi->mi_tsize;
 498         if (ci.cl_readsize != 0)
 499                 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
 500 
 501         /*
 502          * If soft mount and server is down, just try once;
 503          * that is, do not retransmit.
 504          */
 505         if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
 506                 ci.cl_retrans = 0;
 507         else
 508                 ci.cl_retrans = mi->mi_retrans;
 509 
 510         ci.cl_prog = NFS_ACL_PROGRAM;
 511         ci.cl_vers = mi->mi_vers;
 512         ci.cl_flags = mi->mi_flags;
 513 
 514         /*
 515          * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
 516          * security flavor, the client tries to establish a security context
 517          * by contacting the server. If the connection is timed out or reset,
 518          * e.g. server reboot, we will try again.
 519          */
 520         do {
 521                 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
 522 
 523                 if (error == 0)
 524                         break;
 525 
 526                 /*
 527                  * For forced unmount or zone shutdown, bail out, no retry.
 528                  */
 529                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
 530                         error = EIO;
 531                         break;
 532                 }
 533 
 534                 /* do not retry for softmount */
 535                 if (!(mi->mi_flags & MI_HARD))
 536                         break;
 537 
 538                 /* let the caller deal with the failover case */
 539                 if (FAILOVER_MOUNT(mi))
 540                         break;
 541 
 542         } while (error == ETIMEDOUT || error == ECONNRESET);
 543 
 544         return (error);
 545 }
 546 
 547 static int
 548 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
 549     struct chtab **chp, struct nfs_clnt *nfscl)
 550 {
 551         clinfo_t ci;
 552         int error;
 553 
 554         /*
 555          * Set read buffer size to rsize
 556          * and add room for RPC headers.
 557          */
 558         ci.cl_readsize = mi->mi_tsize;
 559         if (ci.cl_readsize != 0)
 560                 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
 561 
 562         /*
 563          * If soft mount and server is down, just try once;
 564          * that is, do not retransmit.
 565          */
 566         if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
 567                 ci.cl_retrans = 0;
 568         else
 569                 ci.cl_retrans = mi->mi_retrans;
 570 
 571         ci.cl_prog = mi->mi_prog;
 572         ci.cl_vers = mi->mi_vers;
 573         ci.cl_flags = mi->mi_flags;
 574 
 575         /*
 576          * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
 577          * security flavor, the client tries to establish a security context
 578          * by contacting the server. If the connection is timed out or reset,
 579          * e.g. server reboot, we will try again.
 580          */
 581         do {
 582                 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
 583 
 584                 if (error == 0)
 585                         break;
 586 
 587                 /*
 588                  * For forced unmount or zone shutdown, bail out, no retry.
 589                  */
 590                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
 591                         error = EIO;
 592                         break;
 593                 }
 594 
 595                 /* do not retry for softmount */
 596                 if (!(mi->mi_flags & MI_HARD))
 597                         break;
 598 
 599                 /* let the caller deal with the failover case */
 600                 if (FAILOVER_MOUNT(mi))
 601                         break;
 602 
 603         } while (error == ETIMEDOUT || error == ECONNRESET);
 604 
 605         return (error);
 606 }
 607 
 608 static void
 609 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
 610 {
 611         if (cl->cl_auth != NULL) {
 612                 sec_clnt_freeh(cl->cl_auth);
 613                 cl->cl_auth = NULL;
 614         }
 615 
 616         /*
 617          * Timestamp this cache entry so that we know when it was last
 618          * used.
 619          */
 620         cp->ch_freed = gethrestime_sec();
 621 
 622         /*
 623          * Add the free client handle to the front of the list.
 624          * This way, the list will be sorted in youngest to oldest
 625          * order.
 626          */
 627         mutex_enter(&nfscl->nfscl_chtable_lock);
 628         cp->ch_list = cp->ch_head->ch_list;
 629         cp->ch_head->ch_list = cp;
 630         mutex_exit(&nfscl->nfscl_chtable_lock);
 631 }
 632 
 633 void
 634 clfree(CLIENT *cl, struct chtab *cp)
 635 {
 636         struct nfs_clnt *nfscl;
 637 
 638         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
 639         ASSERT(nfscl != NULL);
 640 
 641         clfree_impl(cl, cp, nfscl);
 642 }
 643 
 644 #define CL_HOLDTIME     60      /* seconds to hold unused client handles */
 645 
 646 static void
 647 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
 648 {
 649         struct chhead *ch;
 650         struct chtab *cp;       /* list of objects that can be reclaimed */
 651         struct chtab *cpe;
 652         struct chtab *cpl;
 653         struct chtab **cpp;
 654 #ifdef DEBUG
 655         int n = 0;
 656 #endif
 657 
 658         /*
 659          * Need to reclaim some memory, so step through the cache
 660          * looking through the lists for entries which can be freed.
 661          */
 662         cp = NULL;
 663 
 664         mutex_enter(&nfscl->nfscl_chtable_lock);
 665 
 666         /*
 667          * Here we step through each non-NULL quadruple and start to
 668          * construct the reclaim list pointed to by cp.  Note that
 669          * cp will contain all eligible chtab entries.  When this traversal
 670          * completes, chtab entries from the last quadruple will be at the
 671          * front of cp and entries from previously inspected quadruples have
 672          * been appended to the rear of cp.
 673          */
 674         for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
 675                 if (ch->ch_list == NULL)
 676                         continue;
 677                 /*
 678                  * Search each list for entries older than
 679                  * cl_holdtime seconds.  The lists are maintained
 680                  * in youngest to oldest order so that when the
 681                  * first entry is found which is old enough, then
 682                  * all of the rest of the entries on the list will
 683                  * be old enough as well.
 684                  */
 685                 cpl = ch->ch_list;
 686                 cpp = &ch->ch_list;
 687                 while (cpl != NULL &&
 688                     cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
 689                         cpp = &cpl->ch_list;
 690                         cpl = cpl->ch_list;
 691                 }
 692                 if (cpl != NULL) {
 693                         *cpp = NULL;
 694                         if (cp != NULL) {
 695                                 cpe = cpl;
 696                                 while (cpe->ch_list != NULL)
 697                                         cpe = cpe->ch_list;
 698                                 cpe->ch_list = cp;
 699                         }
 700                         cp = cpl;
 701                 }
 702         }
 703 
 704         mutex_exit(&nfscl->nfscl_chtable_lock);
 705 
 706         /*
 707          * If cp is empty, then there is nothing to reclaim here.
 708          */
 709         if (cp == NULL)
 710                 return;
 711 
 712         /*
 713          * Step through the list of entries to free, destroying each client
 714          * handle and kmem_free'ing the memory for each entry.
 715          */
 716         while (cp != NULL) {
 717 #ifdef DEBUG
 718                 n++;
 719 #endif
 720                 CLNT_DESTROY(cp->ch_client);
 721                 cpl = cp->ch_list;
 722                 kmem_cache_free(chtab_cache, cp);
 723                 cp = cpl;
 724         }
 725 
 726 #ifdef DEBUG
 727         /*
 728          * Update clalloc so that nfsstat shows the current number
 729          * of allocated client handles.
 730          */
 731         atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
 732 #endif
 733 }
 734 
 735 /* ARGSUSED */
 736 static void
 737 clreclaim(void *all)
 738 {
 739         struct nfs_clnt *nfscl;
 740 
 741 #ifdef DEBUG
 742         clstat_debug.clreclaim.value.ui64++;
 743 #endif
 744         /*
 745          * The system is low on memory; go through and try to reclaim some from
 746          * every zone on the system.
 747          */
 748         mutex_enter(&nfs_clnt_list_lock);
 749         nfscl = list_head(&nfs_clnt_list);
 750         for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
 751                 clreclaim_zone(nfscl, CL_HOLDTIME);
 752         mutex_exit(&nfs_clnt_list_lock);
 753 }
 754 
 755 /*
 756  * Minimum time-out values indexed by call type
 757  * These units are in "eighths" of a second to avoid multiplies
 758  */
 759 static unsigned int minimum_timeo[] = {
 760         6, 7, 10
 761 };
 762 
 763 /*
 764  * Back off for retransmission timeout; MAXTIMO is in clock ticks (hz)
 765  */
 766 #define MAXTIMO (20*hz)
 767 #define backoff(tim)    (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
 768 #define dobackoff(tim)  ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
 769 
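     /*
      * Constants used by nfs_feedback() for dynamic adjustment of the
      * read and write transfer sizes.
      */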
 770 #define MIN_NFS_TSIZE 512       /* minimum "chunk" of NFS IO */
 771 #define REDUCE_NFS_TIME (hz/2)  /* rtxcur we try to keep under */
 772 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
 773 
 774 /*
 775  * Function called when rfscall notices that we have been
 776  * re-transmitting, or when we get a response without retransmissions.
 777  * Return 1 if the transfer size was adjusted down - 0 if no change.
 778  */
 779 static int
 780 nfs_feedback(int flag, int which, mntinfo_t *mi)
 781 {
 782         int kind;
 783         int r = 0;
 784 
 785         mutex_enter(&mi->mi_lock);
 786         if (flag == FEEDBACK_REXMIT1) {
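                     /*
                      * We are retransmitting: halve both the read and write
                      * transfer sizes (but not below MIN_NFS_TSIZE) unless
                      * the current retransmit timeout is nonzero and already
                      * below REDUCE_NFS_TIME.
                      */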
 787                 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
 788                     mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
 789                         goto done;
 790                 if (mi->mi_curread > MIN_NFS_TSIZE) {
 791                         mi->mi_curread /= 2;
 792                         if (mi->mi_curread < MIN_NFS_TSIZE)
 793                                 mi->mi_curread = MIN_NFS_TSIZE;
 794                         r = 1;
 795                 }
 796 
 797                 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
 798                         mi->mi_curwrite /= 2;
 799                         if (mi->mi_curwrite < MIN_NFS_TSIZE)
 800                                 mi->mi_curwrite = MIN_NFS_TSIZE;
 801                         r = 1;
 802                 }
 803         } else if (flag == FEEDBACK_OK) {
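                     /*
                      * Got a response without retransmitting: grow the read
                      * (timer type 1) or write (timer type 2) transfer size
                      * back toward its maximum, provided the smoothed RTT is
                      * still below INCREASE_NFS_TIME.
                      */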
 804                 kind = mi->mi_timer_type[which];
 805                 if (kind == 0 ||
 806                     mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
 807                         goto done;
 808                 if (kind == 1) {
 809                         if (mi->mi_curread >= mi->mi_tsize)
 810                                 goto done;
 811                         mi->mi_curread +=  MIN_NFS_TSIZE;
 812                         if (mi->mi_curread > mi->mi_tsize/2)
 813                                 mi->mi_curread = mi->mi_tsize;
 814                 } else if (kind == 2) {
 815                         if (mi->mi_curwrite >= mi->mi_stsize)
 816                                 goto done;
 817                         mi->mi_curwrite += MIN_NFS_TSIZE;
 818                         if (mi->mi_curwrite > mi->mi_stsize/2)
 819                                 mi->mi_curwrite = mi->mi_stsize;
 820                 }
 821         }
 822 done:
 823         mutex_exit(&mi->mi_lock);
 824         return (r);
 825 }
 826 
 827 #ifdef DEBUG
 828 static int rfs2call_hits = 0;
 829 static int rfs2call_misses = 0;
 830 #endif
 831 
 832 int
 833 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
 834     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
 835     enum nfsstat *statusp, int flags, failinfo_t *fi)
 836 {
 837         int rpcerror;
 838         enum clnt_stat rpc_status;
 839 
 840         ASSERT(statusp != NULL);
 841 
 842         rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
 843             cr, douprintf, &rpc_status, flags, fi);
 844         if (!rpcerror) {
 845                 /*
 846                  * See crnetadjust() for comments.
 847                  */
 848                 if (*statusp == NFSERR_ACCES &&
 849                     (cr = crnetadjust(cr)) != NULL) {
 850 #ifdef DEBUG
 851                         rfs2call_hits++;
 852 #endif
 853                         rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
 854                             resp, cr, douprintf, NULL, flags, fi);
 855                         crfree(cr);
 856 #ifdef DEBUG
 857                         if (*statusp == NFSERR_ACCES)
 858                                 rfs2call_misses++;
 859 #endif
 860                 }
 861         } else if (rpc_status == RPC_PROCUNAVAIL) {
 862                 *statusp = NFSERR_OPNOTSUPP;
 863                 rpcerror = 0;
 864         }
 865 
 866         return (rpcerror);
 867 }
 868 
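     /*
      * Delay (in clock ticks) between retries of a request that the server
      * answered with NFS3ERR_JUKEBOX; nfs3_jukebox_delay is presumably
      * seeded from NFS3_JUKEBOX_DELAY during client initialization.
      */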
 869 #define NFS3_JUKEBOX_DELAY      (10 * hz)
 870 
 871 volatile clock_t nfs3_jukebox_delay = 0;
 872 
 873 #ifdef DEBUG
 874 static int rfs3call_hits = 0;
 875 static int rfs3call_misses = 0;
 876 #endif
 877 
 878 int
 879 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
 880     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
 881     nfsstat3 *statusp, int flags, failinfo_t *fi)
 882 {
 883         int rpcerror;
 884         int user_informed;
 885 
 886         user_informed = 0;
 887         do {
 888                 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
 889                     cr, douprintf, NULL, flags, fi);
 890                 if (!rpcerror) {
 891                         cred_t *crr;
 892                         if (*statusp == NFS3ERR_JUKEBOX) {
 893                                 if (ttoproc(curthread) == &p0) {
 894                                         rpcerror = EAGAIN;
 895                                         break;
 896                                 }
 897                                 if (!user_informed) {
 898                                         user_informed = 1;
 899                                         uprintf(
 900                 "file temporarily unavailable on the server, retrying...\n");
 901                                 }
 902                                 delay(nfs3_jukebox_delay);
 903                         }
 904                         /*
 905                          * See crnetadjust() for comments.
 906                          */
 907                         else if (*statusp == NFS3ERR_ACCES &&
 908                             (crr = crnetadjust(cr)) != NULL) {
 909 #ifdef DEBUG
 910                                 rfs3call_hits++;
 911 #endif
 912                                 rpcerror = rfscall(mi, which, xdrargs, argsp,
 913                                     xdrres, resp, crr, douprintf,
 914                                     NULL, flags, fi);
 915 
 916                                 crfree(crr);
 917 #ifdef DEBUG
 918                                 if (*statusp == NFS3ERR_ACCES)
 919                                         rfs3call_misses++;
 920 #endif
 921                         }
 922                 }
 923         } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
 924 
 925         return (rpcerror);
 926 }
 927 
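     /*
      * Failover support macros: VALID_FH checks that the rnode is still in
      * sync with the current server, and INC_READERS/DEC_READERS act as a
      * poor man's readers count (see the failover comments in rfscall()).
      */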
 928 #define VALID_FH(fi)    (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
 929 #define INC_READERS(mi)         { \
 930         mi->mi_readers++; \
 931 }
 932 #define DEC_READERS(mi)         { \
 933         mi->mi_readers--; \
 934         if (mi->mi_readers == 0) \
 935                 cv_broadcast(&mi->mi_failover_cv); \
 936 }
 937 
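     /*
      * rfscall:  common RPC call engine for the NFS client.  Obtains a
      * client handle, issues the call with the appropriate timeout and
      * signal handling, and deals with retransmission, failover, and the
      * console "not responding" messages.  Returns an errno value; the raw
      * RPC status is passed back through rpc_status when it is non-NULL.
      */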
 938 static int
 939 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
 940     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
 941     enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
 942 {
 943         CLIENT *client;
 944         struct chtab *ch;
 945         cred_t *cr = icr;
 946         enum clnt_stat status;
 947         struct rpc_err rpcerr, rpcerr_tmp;
 948         struct timeval wait;
 949         int timeo;              /* in units of hz */
 950         int my_rsize, my_wsize;
 951         bool_t tryagain;
 952         bool_t cred_cloned = FALSE;
 953         k_sigset_t smask;
 954         servinfo_t *svp;
 955         struct nfs_clnt *nfscl;
 956         zoneid_t zoneid = getzoneid();
 957         char *msg;
 958 #ifdef DEBUG
 959         char *bufp;
 960 #endif
 961 
 962 
 963         TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
 964             "rfscall_start:which %d mi %p", which, mi);
 965 
 966         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
 967         ASSERT(nfscl != NULL);
 968 
 969         nfscl->nfscl_stat.calls.value.ui64++;
 970         mi->mi_reqs[which].value.ui64++;
 971 
 972         rpcerr.re_status = RPC_SUCCESS;
 973 
 974         /*
 975          * In case of forced unmount or zone shutdown, return EIO.
 976          */
 977 
 978         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
 979                 rpcerr.re_status = RPC_FAILED;
 980                 rpcerr.re_errno = EIO;
 981                 return (rpcerr.re_errno);
 982         }
 983 
 984         /*
 985          * Remember the transfer sizes in case
 986          * nfs_feedback changes them underneath us.
 987          */
 988         my_rsize = mi->mi_curread;
 989         my_wsize = mi->mi_curwrite;
 990 
 991         /*
 992          * NFS client failover support
 993          *
 994          * If this rnode is not in sync with the current server (VALID_FH),
 995          * we'd like to do a remap to get in sync.  We can be interrupted
 996          * in failover_remap(), and if so we'll bail.  Otherwise, we'll
 997          * use the best info we have to try the RPC.  Part of that is
 998          * unconditionally updating the filehandle copy kept for V3.
 999          *
1000          * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1001          * rw_enter(); we're trying to keep the current server from being
1002          * changed on us until we're done with the remapping and have a
1003          * matching client handle.  We don't want to send a filehandle
1004          * to the wrong host.
1005          */
1006 failoverretry:
1007         if (FAILOVER_MOUNT(mi)) {
1008                 mutex_enter(&mi->mi_lock);
1009                 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1010                         if (failover_wait(mi)) {
1011                                 mutex_exit(&mi->mi_lock);
1012                                 return (EINTR);
1013                         }
1014                 }
1015                 INC_READERS(mi);
1016                 mutex_exit(&mi->mi_lock);
1017                 if (fi) {
1018                         if (!VALID_FH(fi) &&
1019                             !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1020                                 int remaperr;
1021 
1022                                 svp = mi->mi_curr_serv;
1023                                 remaperr = failover_remap(fi);
1024                                 if (remaperr != 0) {
1025 #ifdef DEBUG
1026                                         if (remaperr != EINTR)
1027                                                 nfs_cmn_err(remaperr, CE_WARN,
1028                                             "rfscall couldn't failover: %m");
1029 #endif
1030                                         mutex_enter(&mi->mi_lock);
1031                                         DEC_READERS(mi);
1032                                         mutex_exit(&mi->mi_lock);
1033                                         /*
1034                                          * If failover_remap returns ETIMEDOUT
1035                                          * and the filesystem is hard mounted
1036                                          * we have to retry the call with a new
1037                                          * server.
1038                                          */
1039                                         if ((mi->mi_flags & MI_HARD) &&
1040                                             IS_RECOVERABLE_ERROR(remaperr)) {
1041                                                 if (svp == mi->mi_curr_serv)
1042                                                         failover_newserver(mi);
1043                                                 rpcerr.re_status = RPC_SUCCESS;
1044                                                 goto failoverretry;
1045                                         }
1046                                         rpcerr.re_errno = remaperr;
1047                                         return (remaperr);
1048                                 }
1049                         }
1050                         if (fi->fhp && fi->copyproc)
1051                                 (*fi->copyproc)(fi->fhp, fi->vp);
1052                 }
1053         }
1054 
1055         /* For TSOL, use a new cred which has net_mac_aware flag */
1056         if (!cred_cloned && is_system_labeled()) {
1057                 cred_cloned = TRUE;
1058                 cr = crdup(icr);
1059                 (void) setpflags(NET_MAC_AWARE, 1, cr);
1060         }
1061 
1062         /*
1063          * clget() calls clnt_tli_kinit() which clears the xid, so we
1064          * are guaranteed to reprocess the retry as a new request.
1065          */
1066         svp = mi->mi_curr_serv;
1067         rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1068 
1069         if (FAILOVER_MOUNT(mi)) {
1070                 mutex_enter(&mi->mi_lock);
1071                 DEC_READERS(mi);
1072                 mutex_exit(&mi->mi_lock);
1073 
1074                 if ((rpcerr.re_errno == ETIMEDOUT ||
1075                     rpcerr.re_errno == ECONNRESET) &&
1076                     failover_safe(fi)) {
1077                         if (svp == mi->mi_curr_serv)
1078                                 failover_newserver(mi);
1079                         goto failoverretry;
1080                 }
1081         }
1082         if (rpcerr.re_errno != 0)
1083                 return (rpcerr.re_errno);
1084 
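             /*
              * Pick the initial RPC timeout.  For connection-oriented
              * transports use the static mount timeout (mi_timeo, in tenths
              * of a second); otherwise let the dynamic per-call-type
              * retransmit timers choose it.
              */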
1085         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1086             svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1087                 timeo = (mi->mi_timeo * hz) / 10;
1088         } else {
1089                 mutex_enter(&mi->mi_lock);
1090                 timeo = CLNT_SETTIMERS(client,
1091                     &(mi->mi_timers[mi->mi_timer_type[which]]),
1092                     &(mi->mi_timers[NFS_CALLTYPES]),
1093                     (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1094                     (void (*)())NULL, (caddr_t)mi, 0);
1095                 mutex_exit(&mi->mi_lock);
1096         }
1097 
1098         /*
1099          * If hard mounted fs, retry call forever unless hard error occurs.
1100          */
1101         do {
1102                 tryagain = FALSE;
1103 
1104                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1105                         status = RPC_FAILED;
1106                         rpcerr.re_status = RPC_FAILED;
1107                         rpcerr.re_errno = EIO;
1108                         break;
1109                 }
1110 
1111                 TICK_TO_TIMEVAL(timeo, &wait);
1112 
1113                 /*
1114                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1115                  * and SIGTERM. (Preserving the existing masks).
1116                  * Mask out SIGINT if mount option nointr is specified.
1117                  */
1118                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1119                 if (!(mi->mi_flags & MI_INT))
1120                         client->cl_nosignal = TRUE;
1121 
1122                 /*
1123                  * If there is a current signal, then don't bother
1124                  * even trying to send out the request because we
1125                  * won't be able to block waiting for the response.
1126                  * Simply assume RPC_INTR and get on with it.
1127                  */
1128                 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1129                         status = RPC_INTR;
1130                 else {
1131                         status = CLNT_CALL(client, which, xdrargs, argsp,
1132                             xdrres, resp, wait);
1133                 }
1134 
1135                 if (!(mi->mi_flags & MI_INT))
1136                         client->cl_nosignal = FALSE;
1137                 /*
1138                  * restore original signal mask
1139                  */
1140                 sigunintr(&smask);
1141 
1142                 switch (status) {
1143                 case RPC_SUCCESS:
1144                         if ((mi->mi_flags & MI_DYNAMIC) &&
1145                             mi->mi_timer_type[which] != 0 &&
1146                             (mi->mi_curread != my_rsize ||
1147                             mi->mi_curwrite != my_wsize))
1148                                 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1149                         break;
1150 
1151                 case RPC_INTR:
1152                         /*
1153                          * There is no way to recover from this error,
1154                          * even if mount option nointr is specified.
1155                          * SIGKILL, for example, cannot be blocked.
1156                          */
1157                         rpcerr.re_status = RPC_INTR;
1158                         rpcerr.re_errno = EINTR;
1159                         break;
1160 
1161                 case RPC_UDERROR:
1162                         /*
1163                          * If the NFS server is local (vold) and
1164                          * it goes away then we get RPC_UDERROR.
1165                          * This is a retryable error, so we would
1166                          * loop; check to see if the specific
1167                          * error was ECONNRESET, indicating that
1168                          * the target did not exist at all.  If so,
1169                          * return with RPC_PROGUNAVAIL and
1170                          * ECONNRESET to indicate why.
1171                          */
1172                         CLNT_GETERR(client, &rpcerr);
1173                         if (rpcerr.re_errno == ECONNRESET) {
1174                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1175                                 rpcerr.re_errno = ECONNRESET;
1176                                 break;
1177                         }
1178                         /*FALLTHROUGH*/
1179 
1180                 default:                /* probably RPC_TIMEDOUT */
1181                         if (IS_UNRECOVERABLE_RPC(status))
1182                                 break;
1183 
1184                         /*
1185                          * increment server not responding count
1186                          */
1187                         mutex_enter(&mi->mi_lock);
1188                         mi->mi_noresponse++;
1189                         mutex_exit(&mi->mi_lock);
1190 #ifdef DEBUG
1191                         nfscl->nfscl_stat.noresponse.value.ui64++;
1192 #endif
1193 
1194                         if (!(mi->mi_flags & MI_HARD)) {
1195                                 if (!(mi->mi_flags & MI_SEMISOFT) ||
1196                                     (mi->mi_ss_call_type[which] == 0))
1197                                         break;
1198                         }
1199 
1200                         /*
1201                          * The call is in progress (over COTS).
1202                          * Try the CLNT_CALL again, but don't
1203                          * print a noisy error message.
1204                          */
1205                         if (status == RPC_INPROGRESS) {
1206                                 tryagain = TRUE;
1207                                 break;
1208                         }
1209 
1210                         if (flags & RFSCALL_SOFT)
1211                                 break;
1212 
1213                         /*
1214                          * On zone shutdown, just move on.
1215                          */
1216                         if (zone_status_get(curproc->p_zone) >=
1217                             ZONE_IS_SHUTTING_DOWN) {
1218                                 rpcerr.re_status = RPC_FAILED;
1219                                 rpcerr.re_errno = EIO;
1220                                 break;
1221                         }
1222 
1223                         /*
1224                          * NFS client failover support
1225                          *
1226                          * If the current server just failed us, we'll
1227                          * start the process of finding a new server.
1228                          * After that, we can just retry.
1229                          */
1230                         if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1231                                 if (svp == mi->mi_curr_serv)
1232                                         failover_newserver(mi);
1233                                 clfree_impl(client, ch, nfscl);
1234                                 goto failoverretry;
1235                         }
1236 
1237                         tryagain = TRUE;
1238                         timeo = backoff(timeo);
1239 
1240                         CLNT_GETERR(client, &rpcerr_tmp);
1241                         if ((status == RPC_CANTSEND) &&
1242                             (rpcerr_tmp.re_errno == ENOBUFS))
1243                                 msg = SRV_QFULL_MSG;
1244                         else
1245                                 msg = SRV_NOTRESP_MSG;
1246 
1247                         mutex_enter(&mi->mi_lock);
1248                         if (!(mi->mi_flags & MI_PRINTED)) {
1249                                 mi->mi_flags |= MI_PRINTED;
1250                                 mutex_exit(&mi->mi_lock);
1251 #ifdef DEBUG
1252                                 zprintf(zoneid, msg, mi->mi_vers,
1253                                     svp->sv_hostname);
1254 #else
1255                                 zprintf(zoneid, msg, svp->sv_hostname);
1256 #endif
1257                         } else
1258                                 mutex_exit(&mi->mi_lock);
1259                         if (*douprintf && nfs_has_ctty()) {
1260                                 *douprintf = 0;
1261                                 if (!(mi->mi_flags & MI_NOPRINT))
1262 #ifdef DEBUG
1263                                         uprintf(msg, mi->mi_vers,
1264                                             svp->sv_hostname);
1265 #else
1266                                         uprintf(msg, svp->sv_hostname);
1267 #endif
1268                         }
1269 
1270                         /*
1271                          * If doing dynamic adjustment of transfer
1272                          * size and if it's a read or write call
1273                          * and if the transfer size changed while
1274                          * retransmitting or if the feedback routine
1275                          * changed the transfer size,
1276                          * then exit rfscall so that the transfer
1277                          * size can be adjusted at the vnops level.
1278                          */
1279                         if ((mi->mi_flags & MI_DYNAMIC) &&
1280                             mi->mi_timer_type[which] != 0 &&
1281                             (mi->mi_curread != my_rsize ||
1282                             mi->mi_curwrite != my_wsize ||
1283                             nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1284                                 /*
1285                                  * On read or write calls, return
1286                                  * back to the vnode ops level if
1287                                  * the transfer size changed.
1288                                  */
1289                                 clfree_impl(client, ch, nfscl);
1290                                 if (cred_cloned)
1291                                         crfree(cr);
1292                                 return (ENFS_TRYAGAIN);
1293                         }
1294                 }
1295         } while (tryagain);
1296 
1297         if (status != RPC_SUCCESS) {
1298                 /*
1299                  * Let soft mounts use the timed out message.
1300                  */
1301                 if (status == RPC_INPROGRESS)
1302                         status = RPC_TIMEDOUT;
1303                 nfscl->nfscl_stat.badcalls.value.ui64++;
1304                 if (status != RPC_INTR) {
1305                         mutex_enter(&mi->mi_lock);
1306                         mi->mi_flags |= MI_DOWN;
1307                         mutex_exit(&mi->mi_lock);
1308                         CLNT_GETERR(client, &rpcerr);
1309 #ifdef DEBUG
1310                         bufp = clnt_sperror(client, svp->sv_hostname);
1311                         zprintf(zoneid, "NFS%d %s failed for %s\n",
1312                             mi->mi_vers, mi->mi_rfsnames[which], bufp);
1313                         if (nfs_has_ctty()) {
1314                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1315                                         uprintf("NFS%d %s failed for %s\n",
1316                                             mi->mi_vers, mi->mi_rfsnames[which],
1317                                             bufp);
1318                                 }
1319                         }
1320                         kmem_free(bufp, MAXPATHLEN);
1321 #else
1322                         zprintf(zoneid,
1323                             "NFS %s failed for server %s: error %d (%s)\n",
1324                             mi->mi_rfsnames[which], svp->sv_hostname,
1325                             status, clnt_sperrno(status));
1326                         if (nfs_has_ctty()) {
1327                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1328                                         uprintf(
1329                                 "NFS %s failed for server %s: error %d (%s)\n",
1330                                             mi->mi_rfsnames[which],
1331                                             svp->sv_hostname, status,
1332                                             clnt_sperrno(status));
1333                                 }
1334                         }
1335 #endif
1336                         /*
1337                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1338                          * re_errno is set appropriately depending on
1339                          * the authentication error
1340                          */
1341                         if (status == RPC_VERSMISMATCH ||
1342                             status == RPC_PROGVERSMISMATCH)
1343                                 rpcerr.re_errno = EIO;
1344                 }
1345         } else {
1346                 /*
1347                  * Test the value of mi_down and mi_printed without
1348                  * holding the mi_lock mutex.  If they are both zero,
1349                  * then it is okay to skip the down and printed
1350                  * processing.  This saves on a mutex_enter and
1351                  * mutex_exit pair for a normal, successful RPC.
1352                  * This was just complete overhead.
1353                  */
1354                 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1355                         mutex_enter(&mi->mi_lock);
1356                         mi->mi_flags &= ~MI_DOWN;
1357                         if (mi->mi_flags & MI_PRINTED) {
1358                                 mi->mi_flags &= ~MI_PRINTED;
1359                                 mutex_exit(&mi->mi_lock);
1360 #ifdef DEBUG
1361                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362                                         zprintf(zoneid, "NFS%d server %s ok\n",
1363                                             mi->mi_vers, svp->sv_hostname);
1364 #else
1365                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1366                                         zprintf(zoneid, "NFS server %s ok\n",
1367                                             svp->sv_hostname);
1368 #endif
1369                         } else
1370                                 mutex_exit(&mi->mi_lock);
1371                 }
1372 
1373                 if (*douprintf == 0) {
1374                         if (!(mi->mi_flags & MI_NOPRINT))
1375 #ifdef DEBUG
1376                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377                                         uprintf("NFS%d server %s ok\n",
1378                                             mi->mi_vers, svp->sv_hostname);
1379 #else
1380                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1381                                         uprintf("NFS server %s ok\n",
                                                 svp->sv_hostname);
1382 #endif
1383                         *douprintf = 1;
1384                 }
1385         }
1386 
1387         clfree_impl(client, ch, nfscl);
1388         if (cred_cloned)
1389                 crfree(cr);
1390 
1391         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1392 
1393         if (rpc_status != NULL)
1394                 *rpc_status = rpcerr.re_status;
1395 
1396         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1397             rpcerr.re_errno);
1398 
1399         return (rpcerr.re_errno);
1400 }
1401 
1402 #ifdef DEBUG
1403 static int acl2call_hits = 0;
1404 static int acl2call_misses = 0;
1405 #endif
1406 
1407 int
1408 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1409     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1410     enum nfsstat *statusp, int flags, failinfo_t *fi)
1411 {
1412         int rpcerror;
1413 
1414         rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1415             cr, douprintf, flags, fi);
1416         if (!rpcerror) {
1417                 /*
1418                  * See comments with crnetadjust().
1419                  */
1420                 if (*statusp == NFSERR_ACCES &&
1421                     (cr = crnetadjust(cr)) != NULL) {
1422 #ifdef DEBUG
1423                         acl2call_hits++;
1424 #endif
1425                         rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1426                             resp, cr, douprintf, flags, fi);
1427                         crfree(cr);
1428 #ifdef DEBUG
1429                         if (*statusp == NFSERR_ACCES)
1430                                 acl2call_misses++;
1431 #endif
1432                 }
1433         }
1434 
1435         return (rpcerror);
1436 }
1437 
1438 #ifdef DEBUG
1439 static int acl3call_hits = 0;
1440 static int acl3call_misses = 0;
1441 #endif
1442 
1443 int
1444 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1445     xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1446     nfsstat3 *statusp, int flags, failinfo_t *fi)
1447 {
1448         int rpcerror;
1449         int user_informed;
1450 
1451         user_informed = 0;
1452 
1453         do {
1454                 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1455                     cr, douprintf, flags, fi);
1456                 if (!rpcerror) {
1457                         cred_t *crr;
1458                         if (*statusp == NFS3ERR_JUKEBOX) {
1459                                 if (!user_informed) {
1460                                         user_informed = 1;
1461                                         uprintf(
1462                 "file temporarily unavailable on the server, retrying...\n");
1463                                 }
1464                                 delay(nfs3_jukebox_delay);
1465                         }
1466                         /*
1467                          * See crnetadjust() for comments.
1468                          */
1469                         else if (*statusp == NFS3ERR_ACCES &&
1470                             (crr = crnetadjust(cr)) != NULL) {
1471 #ifdef DEBUG
1472                                 acl3call_hits++;
1473 #endif
1474                                 rpcerror = aclcall(mi, which, xdrargs, argsp,
1475                                     xdrres, resp, crr, douprintf, flags, fi);
1476 
1477                                 crfree(crr);
1478 #ifdef DEBUG
1479                                 if (*statusp == NFS3ERR_ACCES)
1480                                         acl3call_misses++;
1481 #endif
1482                         }
1483                 }
1484         } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1485 
1486         return (rpcerror);
1487 }
1488 
1489 static int
1490 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1491     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1492     int flags, failinfo_t *fi)
1493 {
1494         CLIENT *client;
1495         struct chtab *ch;
1496         cred_t *cr = icr;
1497         bool_t cred_cloned = FALSE;
1498         enum clnt_stat status;
1499         struct rpc_err rpcerr;
1500         struct timeval wait;
1501         int timeo;              /* in units of hz */
1502 #if 0 /* notyet */
1503         int my_rsize, my_wsize;
1504 #endif
1505         bool_t tryagain;
1506         k_sigset_t smask;
1507         servinfo_t *svp;
1508         struct nfs_clnt *nfscl;
1509         zoneid_t zoneid = getzoneid();
1510 #ifdef DEBUG
1511         char *bufp;
1512 #endif
1513 
1514 #if 0 /* notyet */
1515         TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1516             "rfscall_start:which %d mi %p", which, mi);
1517 #endif
1518 
1519         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1520         ASSERT(nfscl != NULL);
1521 
1522         nfscl->nfscl_stat.calls.value.ui64++;
1523         mi->mi_aclreqs[which].value.ui64++;
1524 
1525         rpcerr.re_status = RPC_SUCCESS;
1526 
1527         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1528                 rpcerr.re_status = RPC_FAILED;
1529                 rpcerr.re_errno = EIO;
1530                 return (rpcerr.re_errno);
1531         }
1532 
1533 #if 0 /* notyet */
1534         /*
1535          * Remember the transfer sizes in case
1536          * nfs_feedback changes them underneath us.
1537          */
1538         my_rsize = mi->mi_curread;
1539         my_wsize = mi->mi_curwrite;
1540 #endif
1541 
1542         /*
1543          * NFS client failover support
1544          *
1545          * If this rnode is not in sync with the current server (VALID_FH),
1546          * we'd like to do a remap to get in sync.  We can be interrupted
1547          * in failover_remap(), and if so we'll bail.  Otherwise, we'll
1548          * use the best info we have to try the RPC.  Part of that is
1549          * unconditionally updating the filehandle copy kept for V3.
1550          *
1551          * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1552          * rw_enter(); we're trying to keep the current server from being
1553          * changed on us until we're done with the remapping and have a
1554          * matching client handle.  We don't want to send a filehandle
1555          * to the wrong host.
1556          */
1557 failoverretry:
1558         if (FAILOVER_MOUNT(mi)) {
1559                 mutex_enter(&mi->mi_lock);
1560                 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1561                         if (failover_wait(mi)) {
1562                                 mutex_exit(&mi->mi_lock);
1563                                 return (EINTR);
1564                         }
1565                 }
1566                 INC_READERS(mi);
1567                 mutex_exit(&mi->mi_lock);
1568                 if (fi) {
1569                         if (!VALID_FH(fi) &&
1570                             !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1571                                 int remaperr;
1572 
1573                                 svp = mi->mi_curr_serv;
1574                                 remaperr = failover_remap(fi);
1575                                 if (remaperr != 0) {
1576 #ifdef DEBUG
1577                                         if (remaperr != EINTR)
1578                                                 nfs_cmn_err(remaperr, CE_WARN,
1579                                             "aclcall couldn't failover: %m");
1580 #endif
1581                                         mutex_enter(&mi->mi_lock);
1582                                         DEC_READERS(mi);
1583                                         mutex_exit(&mi->mi_lock);
1584 
1585                                         /*
1586                                          * If failover_remap returns ETIMEDOUT
1587                                          * and the filesystem is hard mounted
1588                                          * we have to retry the call with a new
1589                                          * server.
1590                                          */
1591                                         if ((mi->mi_flags & MI_HARD) &&
1592                                             IS_RECOVERABLE_ERROR(remaperr)) {
1593                                                 if (svp == mi->mi_curr_serv)
1594                                                         failover_newserver(mi);
1595                                                 rpcerr.re_status = RPC_SUCCESS;
1596                                                 goto failoverretry;
1597                                         }
1598                                         return (remaperr);
1599                                 }
1600                         }
1601                         if (fi->fhp && fi->copyproc)
1602                                 (*fi->copyproc)(fi->fhp, fi->vp);
1603                 }
1604         }
1605 
1606         /* For TSOL, use a new cred which has net_mac_aware flag */
1607         if (!cred_cloned && is_system_labeled()) {
1608                 cred_cloned = TRUE;
1609                 cr = crdup(icr);
1610                 (void) setpflags(NET_MAC_AWARE, 1, cr);
1611         }
1612 
1613         /*
1614          * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1615          * are guaranteed to reprocess the retry as a new request.
1616          */
1617         svp = mi->mi_curr_serv;
1618         rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1619         if (FAILOVER_MOUNT(mi)) {
1620                 mutex_enter(&mi->mi_lock);
1621                 DEC_READERS(mi);
1622                 mutex_exit(&mi->mi_lock);
1623 
1624                 if ((rpcerr.re_errno == ETIMEDOUT ||
1625                     rpcerr.re_errno == ECONNRESET) &&
1626                     failover_safe(fi)) {
1627                         if (svp == mi->mi_curr_serv)
1628                                 failover_newserver(mi);
1629                         goto failoverretry;
1630                 }
1631         }
1632         if (rpcerr.re_errno != 0) {
1633                 if (cred_cloned)
1634                         crfree(cr);
1635                 return (rpcerr.re_errno);
1636         }
1637 
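             /*
              * Choose the RPC timeout.  For connection-oriented transports
              * use a fixed timeout derived from mi_timeo (tenths of a
              * second, hence the multiply by hz and divide by 10);
              * otherwise let the RPC timer machinery compute an adaptive
              * retransmit timeout for this ACL call type.
              */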
1638         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1639             svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1640                 timeo = (mi->mi_timeo * hz) / 10;
1641         } else {
1642                 mutex_enter(&mi->mi_lock);
1643                 timeo = CLNT_SETTIMERS(client,
1644                     &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1645                     &(mi->mi_timers[NFS_CALLTYPES]),
1646                     (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1647                     (void (*)()) 0, (caddr_t)mi, 0);
1648                 mutex_exit(&mi->mi_lock);
1649         }
1650 
1651         /*
1652          * If hard mounted fs, retry call forever unless hard error occurs.
1653          */
1654         do {
1655                 tryagain = FALSE;
1656 
1657                 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1658                         status = RPC_FAILED;
1659                         rpcerr.re_status = RPC_FAILED;
1660                         rpcerr.re_errno = EIO;
1661                         break;
1662                 }
1663 
1664                 TICK_TO_TIMEVAL(timeo, &wait);
1665 
1666                 /*
1667                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1668                  * and SIGTERM, preserving the existing signal masks.
1669                  * Mask out SIGINT too if the nointr mount option is set.
1670                  */
1671                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1672                 if (!(mi->mi_flags & MI_INT))
1673                         client->cl_nosignal = TRUE;
1674 
1675                 /*
1676                  * If there is a current signal, then don't bother
1677                  * even trying to send out the request because we
1678                  * won't be able to block waiting for the response.
1679                  * Simply assume RPC_INTR and get on with it.
1680                  */
1681                 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1682                         status = RPC_INTR;
1683                 else {
1684                         status = CLNT_CALL(client, which, xdrargs, argsp,
1685                             xdrres, resp, wait);
1686                 }
1687 
1688                 if (!(mi->mi_flags & MI_INT))
1689                         client->cl_nosignal = FALSE;
1690                 /*
1691                  * restore original signal mask
1692                  */
1693                 sigunintr(&smask);
1694 
1695                 switch (status) {
1696                 case RPC_SUCCESS:
1697 #if 0 /* notyet */
1698                         if ((mi->mi_flags & MI_DYNAMIC) &&
1699                             mi->mi_timer_type[which] != 0 &&
1700                             (mi->mi_curread != my_rsize ||
1701                             mi->mi_curwrite != my_wsize))
1702                                 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1703 #endif
1704                         break;
1705 
1706                 /*
1707                  * Unfortunately, there are servers in the world which
1708                  * are not coded correctly.  They are not prepared to
1709                  * handle RPC requests to the NFS port which are not
1710                  * NFS requests.  Thus, they may try to process the
1711                  * NFS_ACL request as if it were an NFS request.  This
1712                  * does not work.  Generally, an error will be generated
1713                  * on the client because it will not be able to decode
1714                  * the response from the server.  However, it seems
1715                  * possible that the server may not be able to decode
1716                  * the arguments.  Thus, the criterion for deciding
1717                  * whether the server supports NFS_ACL is whether any of
1718                  * the following RPC errors are returned from CLNT_CALL.
1719                  */
1720                 case RPC_CANTDECODERES:
1721                 case RPC_PROGUNAVAIL:
1722                 case RPC_CANTDECODEARGS:
1723                 case RPC_PROGVERSMISMATCH:
1724                         mutex_enter(&mi->mi_lock);
1725                         mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1726                         mutex_exit(&mi->mi_lock);
1727                         break;
1728 
1729                 /*
1730                  * If the server supports NFS_ACL but not the new ops
1731                  * for extended attributes, make sure we don't retry.
1732                  */
1733                 case RPC_PROCUNAVAIL:
1734                         mutex_enter(&mi->mi_lock);
1735                         mi->mi_flags &= ~MI_EXTATTR;
1736                         mutex_exit(&mi->mi_lock);
1737                         break;
1738 
1739                 case RPC_INTR:
1740                         /*
1741                          * There is no way to recover from this error,
1742                          * even if mount option nointr is specified.
1743                          * SIGKILL, for example, cannot be blocked.
1744                          */
1745                         rpcerr.re_status = RPC_INTR;
1746                         rpcerr.re_errno = EINTR;
1747                         break;
1748 
1749                 case RPC_UDERROR:
1750                         /*
1751                          * If the NFS server is local (vold) and
1752                          * it goes away, then we get RPC_UDERROR.
1753                          * This is normally a retryable error which
1754                          * would make us loop, so check whether the
1755                          * specific error was ECONNRESET, indicating
1756                          * that the target did not exist at all.  If
1757                          * so, return with RPC_PROGUNAVAIL and
1758                          * ECONNRESET to indicate why.
1759                          */
1760                         CLNT_GETERR(client, &rpcerr);
1761                         if (rpcerr.re_errno == ECONNRESET) {
1762                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1763                                 rpcerr.re_errno = ECONNRESET;
1764                                 break;
1765                         }
1766                         /*FALLTHROUGH*/
1767 
1768                 default:                /* probably RPC_TIMEDOUT */
1769                         if (IS_UNRECOVERABLE_RPC(status))
1770                                 break;
1771 
1772                         /*
1773                          * increment server not responding count
1774                          */
1775                         mutex_enter(&mi->mi_lock);
1776                         mi->mi_noresponse++;
1777                         mutex_exit(&mi->mi_lock);
1778 #ifdef DEBUG
1779                         nfscl->nfscl_stat.noresponse.value.ui64++;
1780 #endif
1781 
1782                         if (!(mi->mi_flags & MI_HARD)) {
1783                                 if (!(mi->mi_flags & MI_SEMISOFT) ||
1784                                     (mi->mi_acl_ss_call_type[which] == 0))
1785                                         break;
1786                         }
1787 
1788                         /*
1789                          * The call is in progress (over COTS).
1790                          * Try the CLNT_CALL again, but don't
1791                          * print a noisy error message.
1792                          */
1793                         if (status == RPC_INPROGRESS) {
1794                                 tryagain = TRUE;
1795                                 break;
1796                         }
1797 
1798                         if (flags & RFSCALL_SOFT)
1799                                 break;
1800 
1801                         /*
1802                          * On zone shutdown, just move on.
1803                          */
1804                         if (zone_status_get(curproc->p_zone) >=
1805                             ZONE_IS_SHUTTING_DOWN) {
1806                                 rpcerr.re_status = RPC_FAILED;
1807                                 rpcerr.re_errno = EIO;
1808                                 break;
1809                         }
1810 
1811                         /*
1812                          * NFS client failover support
1813                          *
1814                          * If the current server just failed us, we'll
1815                          * start the process of finding a new server.
1816                          * After that, we can just retry.
1817                          */
1818                         if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1819                                 if (svp == mi->mi_curr_serv)
1820                                         failover_newserver(mi);
1821                                 clfree_impl(client, ch, nfscl);
1822                                 goto failoverretry;
1823                         }
1824 
1825                         tryagain = TRUE;
1826                         timeo = backoff(timeo);
1827                         mutex_enter(&mi->mi_lock);
1828                         if (!(mi->mi_flags & MI_PRINTED)) {
1829                                 mi->mi_flags |= MI_PRINTED;
1830                                 mutex_exit(&mi->mi_lock);
1831 #ifdef DEBUG
1832                                 zprintf(zoneid,
1833                         "NFS_ACL%d server %s not responding still trying\n",
1834                                     mi->mi_vers, svp->sv_hostname);
1835 #else
1836                                 zprintf(zoneid,
1837                             "NFS server %s not responding still trying\n",
1838                                     svp->sv_hostname);
1839 #endif
1840                         } else
1841                                 mutex_exit(&mi->mi_lock);
1842                         if (*douprintf && nfs_has_ctty()) {
1843                                 *douprintf = 0;
1844                                 if (!(mi->mi_flags & MI_NOPRINT))
1845 #ifdef DEBUG
1846                                         uprintf(
1847                         "NFS_ACL%d server %s not responding still trying\n",
1848                                             mi->mi_vers, svp->sv_hostname);
1849 #else
1850                                         uprintf(
1851                             "NFS server %s not responding still trying\n",
1852                                             svp->sv_hostname);
1853 #endif
1854                         }
1855 
1856 #if 0 /* notyet */
1857                         /*
1858                          * If doing dynamic adjustment of transfer
1859                          * size and if it's a read or write call
1860                          * and if the transfer size changed while
1861                          * retransmitting or if the feedback routine
1862                          * changed the transfer size,
1863                          * then exit rfscall so that the transfer
1864                          * size can be adjusted at the vnops level.
1865                          */
1866                         if ((mi->mi_flags & MI_DYNAMIC) &&
1867                             mi->mi_acl_timer_type[which] != 0 &&
1868                             (mi->mi_curread != my_rsize ||
1869                             mi->mi_curwrite != my_wsize ||
1870                             nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1871                                 /*
1872                                  * On read or write calls, return
1873                                  * back to the vnode ops level if
1874                                  * the transfer size changed.
1875                                  */
1876                                 clfree_impl(client, ch, nfscl);
1877                                 if (cred_cloned)
1878                                         crfree(cr);
1879                                 return (ENFS_TRYAGAIN);
1880                         }
1881 #endif
1882                 }
1883         } while (tryagain);
1884 
1885         if (status != RPC_SUCCESS) {
1886                 /*
1887                  * Let soft mounts use the timed out message.
1888                  */
1889                 if (status == RPC_INPROGRESS)
1890                         status = RPC_TIMEDOUT;
1891                 nfscl->nfscl_stat.badcalls.value.ui64++;
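                     /*
                      * These RPC errors were recognized above as meaning
                      * that the server does not support the NFS_ACL
                      * program (or this particular procedure), so just
                      * fetch the detailed error; don't mark the server
                      * down or log a failure message for them.
                      */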
1892                 if (status == RPC_CANTDECODERES ||
1893                     status == RPC_PROGUNAVAIL ||
1894                     status == RPC_PROCUNAVAIL ||
1895                     status == RPC_CANTDECODEARGS ||
1896                     status == RPC_PROGVERSMISMATCH)
1897                         CLNT_GETERR(client, &rpcerr);
1898                 else if (status != RPC_INTR) {
1899                         mutex_enter(&mi->mi_lock);
1900                         mi->mi_flags |= MI_DOWN;
1901                         mutex_exit(&mi->mi_lock);
1902                         CLNT_GETERR(client, &rpcerr);
1903 #ifdef DEBUG
1904                         bufp = clnt_sperror(client, svp->sv_hostname);
1905                         zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1906                             mi->mi_vers, mi->mi_aclnames[which], bufp);
1907                         if (nfs_has_ctty()) {
1908                                 if (!(mi->mi_flags & MI_NOPRINT)) {
1909                                         uprintf("NFS_ACL%d %s failed for %s\n",
1910                                             mi->mi_vers, mi->mi_aclnames[which],
1911                                             bufp);
1912                                 }
1913                         }
1914                         kmem_free(bufp, MAXPATHLEN);
1915 #else
1916                         zprintf(zoneid,
1917                             "NFS %s failed for server %s: error %d (%s)\n",
1918                             mi->mi_aclnames[which], svp->sv_hostname,
1919                             status, clnt_sperrno(status));
1920                         if (nfs_has_ctty()) {
1921                                 if (!(mi->mi_flags & MI_NOPRINT))
1922                                         uprintf(
1923                                 "NFS %s failed for server %s: error %d (%s)\n",
1924                                             mi->mi_aclnames[which],
1925                                             svp->sv_hostname, status,
1926                                             clnt_sperrno(status));
1927                         }
1928 #endif
1929                         /*
1930                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1931                          * re_errno is set appropriately depending on
1932                          * the authentication error
1933                          */
1934                         if (status == RPC_VERSMISMATCH ||
1935                             status == RPC_PROGVERSMISMATCH)
1936                                 rpcerr.re_errno = EIO;
1937                 }
1938         } else {
1939                 /*
1940                  * Test the MI_DOWN and MI_PRINTED flags without
1941                  * holding the mi_lock mutex.  If both are clear,
1942                  * then it is okay to skip the "server down" and
1943                  * "message printed" processing.  This saves a
1944                  * mutex_enter and mutex_exit pair for a normal,
1945                  * successful RPC, where that work is pure overhead.
1946                  */
1947                 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1948                         mutex_enter(&mi->mi_lock);
1949                         mi->mi_flags &= ~MI_DOWN;
1950                         if (mi->mi_flags & MI_PRINTED) {
1951                                 mi->mi_flags &= ~MI_PRINTED;
1952                                 mutex_exit(&mi->mi_lock);
1953 #ifdef DEBUG
1954                                 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1955                                     mi->mi_vers, svp->sv_hostname);
1956 #else
1957                                 zprintf(zoneid, "NFS server %s ok\n",
1958                                     svp->sv_hostname);
1959 #endif
1960                         } else
1961                                 mutex_exit(&mi->mi_lock);
1962                 }
1963 
1964                 if (*douprintf == 0) {
1965                         if (!(mi->mi_flags & MI_NOPRINT))
1966 #ifdef DEBUG
1967                                 uprintf("NFS_ACL%d server %s ok\n",
1968                                     mi->mi_vers, svp->sv_hostname);
1969 #else
1970                                 uprintf("NFS server %s ok\n", svp->sv_hostname);
1971 #endif
1972                         *douprintf = 1;
1973                 }
1974         }
1975 
1976         clfree_impl(client, ch, nfscl);
1977         if (cred_cloned)
1978                 crfree(cr);
1979 
1980         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1981 
1982 #if 0 /* notyet */
1983         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1984             rpcerr.re_errno);
1985 #endif
1986 
1987         return (rpcerr.re_errno);
1988 }
1989 
1990 int
1991 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1992 {
1993         uint_t mask = vap->va_mask;
1994 
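             /*
              * For the NFS Version 2 sattr, an attribute that should be
              * left unchanged is sent as (uint32_t)-1 (or -1 in both
              * timeval fields for the times); only the attributes
              * selected in va_mask are filled in with real values.
              */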
1995         if (!(mask & AT_MODE))
1996                 sa->sa_mode = (uint32_t)-1;
1997         else
1998                 sa->sa_mode = vap->va_mode;
1999         if (!(mask & AT_UID))
2000                 sa->sa_uid = (uint32_t)-1;
2001         else
2002                 sa->sa_uid = (uint32_t)vap->va_uid;
2003         if (!(mask & AT_GID))
2004                 sa->sa_gid = (uint32_t)-1;
2005         else
2006                 sa->sa_gid = (uint32_t)vap->va_gid;
2007         if (!(mask & AT_SIZE))
2008                 sa->sa_size = (uint32_t)-1;
2009         else
2010                 sa->sa_size = (uint32_t)vap->va_size;
2011         if (!(mask & AT_ATIME))
2012                 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2013         else {
2014                 /* check time validity */
2015                 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2016                         return (EOVERFLOW);
2017                 }
2018                 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2019                 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2020         }
2021         if (!(mask & AT_MTIME))
2022                 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2023         else {
2024                 /* check time validity */
2025                 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2026                         return (EOVERFLOW);
2027                 }
2028                 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2029                 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2030         }
2031         return (0);
2032 }
2033 
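     /*
      * Fill in an NFS Version 3 sattr3 from a vattr.  Unlike the V2 sattr,
      * each attribute carries an explicit set_it discriminator; times being
      * set are sent as SET_TO_CLIENT_TIME with the client's seconds and
      * nanoseconds.
      */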
2034 int
2035 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2036 {
2037         uint_t mask = vap->va_mask;
2038 
2039         if (!(mask & AT_MODE))
2040                 sa->mode.set_it = FALSE;
2041         else {
2042                 sa->mode.set_it = TRUE;
2043                 sa->mode.mode = (mode3)vap->va_mode;
2044         }
2045         if (!(mask & AT_UID))
2046                 sa->uid.set_it = FALSE;
2047         else {
2048                 sa->uid.set_it = TRUE;
2049                 sa->uid.uid = (uid3)vap->va_uid;
2050         }
2051         if (!(mask & AT_GID))
2052                 sa->gid.set_it = FALSE;
2053         else {
2054                 sa->gid.set_it = TRUE;
2055                 sa->gid.gid = (gid3)vap->va_gid;
2056         }
2057         if (!(mask & AT_SIZE))
2058                 sa->size.set_it = FALSE;
2059         else {
2060                 sa->size.set_it = TRUE;
2061                 sa->size.size = (size3)vap->va_size;
2062         }
2063         if (!(mask & AT_ATIME))
2064                 sa->atime.set_it = DONT_CHANGE;
2065         else {
2066                 /* check time validity */
2067                 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2068                         return (EOVERFLOW);
2069                 }
2070                 sa->atime.set_it = SET_TO_CLIENT_TIME;
2071                 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2072                 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2073         }
2074         if (!(mask & AT_MTIME))
2075                 sa->mtime.set_it = DONT_CHANGE;
2076         else {
2077                 /* check time validity */
2078                 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2079                         return (EOVERFLOW);
2080                 }
2081                 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2082                 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2083                 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2084         }
2085         return (0);
2086 }
2087 
2088 void
2089 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2090 {
2091 
2092         da->da_fhandle = VTOFH(dvp);
2093         da->da_name = nm;
2094         da->da_flags = 0;
2095 }
2096 
2097 void
2098 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2099 {
2100 
2101         da->dirp = VTOFH3(dvp);
2102         da->name = nm;
2103 }
2104 
2105 int
2106 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2107 {
2108         int error;
2109         rnode_t *rp;
2110         struct vattr va;
2111 
2112         va.va_mask = AT_MODE | AT_GID;
2113         error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2114         if (error)
2115                 return (error);
2116 
2117         /*
2118          * To determine the expected group-id of the created file:
2119          *  1)  If the filesystem was not mounted with the Old-BSD-compatible
2120          *      GRPID option, and the directory's set-gid bit is clear,
2121          *      then use the process's gid.
2122          *  2)  Otherwise, set the group-id to the gid of the parent directory.
2123          */
2124         rp = VTOR(dvp);
2125         mutex_enter(&rp->r_statelock);
2126         if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2127                 *gidp = crgetgid(cr);
2128         else
2129                 *gidp = va.va_gid;
2130         mutex_exit(&rp->r_statelock);
2131         return (0);
2132 }
2133 
2134 int
2135 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2136 {
2137         int error;
2138         struct vattr va;
2139 
2140         va.va_mask = AT_MODE;
2141         error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2142         if (error)
2143                 return (error);
2144 
2145         /*
2146          * Modify the expected mode (om) so that the set-gid bit matches
2147          * that of the parent directory (dvp).
2148          */
2149         if (va.va_mode & VSGID)
2150                 *omp |= VSGID;
2151         else
2152                 *omp &= ~VSGID;
2153         return (0);
2154 }
2155 
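     /*
      * A regular file whose mode has the sticky bit set but the owner
      * execute bit clear is treated as "swap-like"; set or clear VSWAPLIKE
      * on the vnode so that it matches the file's current mode.
      */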
2156 void
2157 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2158 {
2159 
2160         if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2161                 if (!(vp->v_flag & VSWAPLIKE)) {
2162                         mutex_enter(&vp->v_lock);
2163                         vp->v_flag |= VSWAPLIKE;
2164                         mutex_exit(&vp->v_lock);
2165                 }
2166         } else {
2167                 if (vp->v_flag & VSWAPLIKE) {
2168                         mutex_enter(&vp->v_lock);
2169                         vp->v_flag &= ~VSWAPLIKE;
2170                         mutex_exit(&vp->v_lock);
2171                 }
2172         }
2173 }
2174 
2175 /*
2176  * Free the resources associated with an rnode.
2177  */
2178 static void
2179 rinactive(rnode_t *rp, cred_t *cr)
2180 {
2181         vnode_t *vp;
2182         cred_t *cred;
2183         char *contents;
2184         int size;
2185         vsecattr_t *vsp;
2186         int error;
2187         nfs3_pathconf_info *info;
2188 
2189         /*
2190          * Before freeing anything, wait until all asynchronous
2191          * activity is done on this rnode.  This will allow all
2192          * asynchronous read ahead and write behind i/o's to
2193          * finish.
2194          */
2195         mutex_enter(&rp->r_statelock);
2196         while (rp->r_count > 0)
2197                 cv_wait(&rp->r_cv, &rp->r_statelock);
2198         mutex_exit(&rp->r_statelock);
2199 
2200         /*
2201          * Flush and invalidate all pages associated with the vnode.
2202          */
2203         vp = RTOV(rp);
2204         if (vn_has_cached_data(vp)) {
2205                 ASSERT(vp->v_type != VCHR);
2206                 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2207                         error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2208                         if (error && (error == ENOSPC || error == EDQUOT)) {
2209                                 mutex_enter(&rp->r_statelock);
2210                                 if (!rp->r_error)
2211                                         rp->r_error = error;
2212                                 mutex_exit(&rp->r_statelock);
2213                         }
2214                 }
2215                 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2216         }
2217 
2218         /*
2219          * Free any held credentials and caches which may be associated
2220          * with this rnode.
2221          */
2222         mutex_enter(&rp->r_statelock);
2223         cred = rp->r_cred;
2224         rp->r_cred = NULL;
2225         contents = rp->r_symlink.contents;
2226         size = rp->r_symlink.size;
2227         rp->r_symlink.contents = NULL;
2228         vsp = rp->r_secattr;
2229         rp->r_secattr = NULL;
2230         info = rp->r_pathconf;
2231         rp->r_pathconf = NULL;
2232         mutex_exit(&rp->r_statelock);
2233 
2234         /*
2235          * Free the held credential.
2236          */
2237         if (cred != NULL)
2238                 crfree(cred);
2239 
2240         /*
2241          * Free the access cache entries.
2242          */
2243         (void) nfs_access_purge_rp(rp);
2244 
2245         /*
2246          * Free the readdir cache entries.
2247          */
2248         if (HAVE_RDDIR_CACHE(rp))
2249                 nfs_purge_rddir_cache(vp);
2250 
2251         /*
2252          * Free the symbolic link cache.
2253          */
2254         if (contents != NULL) {
2256                 kmem_free((void *)contents, size);
2257         }
2258 
2259         /*
2260          * Free any cached ACL.
2261          */
2262         if (vsp != NULL)
2263                 nfs_acl_free(vsp);
2264 
2265         /*
2266          * Free any cached pathconf information.
2267          */
2268         if (info != NULL)
2269                 kmem_free(info, sizeof (*info));
2270 }
2271 
2272 /*
2273  * Return a vnode for the given NFS Version 2 file handle.
2274  * If no rnode exists for this fhandle, create one and put it
2275  * into the hash queues.  If the rnode for this fhandle
2276  * already exists, return it.
2277  *
2278  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2279  */
2280 vnode_t *
2281 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2282     hrtime_t t, cred_t *cr, char *dnm, char *nm)
2283 {
2284         int newnode;
2285         int index;
2286         vnode_t *vp;
2287         nfs_fhandle nfh;
2288         vattr_t va;
2289 
2290         nfh.fh_len = NFS_FHSIZE;
2291         bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2292 
2293         index = rtablehash(&nfh);
2294         rw_enter(&rtable[index].r_lock, RW_READER);
2295 
2296         vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2297             nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2298 
2299         if (attr != NULL) {
2300                 if (!newnode) {
2301                         rw_exit(&rtable[index].r_lock);
2302                         (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2303                 } else {
2304                         if (attr->na_type < NFNON || attr->na_type > NFSOC)
2305                                 vp->v_type = VBAD;
2306                         else
2307                                 vp->v_type = n2v_type(attr);
2308                         /*
2309                          * A translation here seems to be necessary
2310                          * because this function can be called
2311                          * with `attr' that has come from the wire,
2312                          * and been operated on by vattr_to_nattr().
2313                          * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2314                          * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2315                          * ->makenfsnode().
2316                          */
2317                         if ((attr->na_rdev & 0xffff0000) == 0)
2318                                 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2319                         else
2320                                 vp->v_rdev = expldev(n2v_rdev(attr));
2321                         nfs_attrcache(vp, attr, t);
2322                         rw_exit(&rtable[index].r_lock);
2323                 }
2324         } else {
2325                 if (newnode) {
2326                         PURGE_ATTRCACHE(vp);
2327                 }
2328                 rw_exit(&rtable[index].r_lock);
2329         }
2330 
2331         return (vp);
2332 }
2333 
2334 /*
2335  * Return a vnode for the given NFS Version 3 file handle.
2336  * If no rnode exists for this fhandle, create one and put it
2337  * into the hash queues.  If the rnode for this fhandle
2338  * already exists, return it.
2339  *
2340  * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2341  */
2342 vnode_t *
2343 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2344     cred_t *cr, char *dnm, char *nm)
2345 {
2346         int newnode;
2347         int index;
2348         vnode_t *vp;
2349 
2350         index = rtablehash((nfs_fhandle *)fh);
2351         rw_enter(&rtable[index].r_lock, RW_READER);
2352 
2353         vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2354             nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2355             dnm, nm);
2356 
2357         if (vap == NULL) {
2358                 if (newnode) {
2359                         PURGE_ATTRCACHE(vp);
2360                 }
2361                 rw_exit(&rtable[index].r_lock);
2362                 return (vp);
2363         }
2364 
2365         if (!newnode) {
2366                 rw_exit(&rtable[index].r_lock);
2367                 nfs_attr_cache(vp, vap, t, cr);
2368         } else {
2369                 rnode_t *rp = VTOR(vp);
2370 
2371                 vp->v_type = vap->va_type;
2372                 vp->v_rdev = vap->va_rdev;
2373 
2374                 mutex_enter(&rp->r_statelock);
2375                 if (rp->r_mtime <= t)
2376                         nfs_attrcache_va(vp, vap);
2377                 mutex_exit(&rp->r_statelock);
2378                 rw_exit(&rtable[index].r_lock);
2379         }
2380 
2381         return (vp);
2382 }
2383 
2384 vnode_t *
2385 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2386     cred_t *cr, char *dnm, char *nm)
2387 {
2388         int newnode;
2389         int index;
2390         vnode_t *vp;
2391         vattr_t va;
2392 
2393         index = rtablehash((nfs_fhandle *)fh);
2394         rw_enter(&rtable[index].r_lock, RW_READER);
2395 
2396         vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2397             nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2398             dnm, nm);
2399 
2400         if (attr == NULL) {
2401                 if (newnode) {
2402                         PURGE_ATTRCACHE(vp);
2403                 }
2404                 rw_exit(&rtable[index].r_lock);
2405                 return (vp);
2406         }
2407 
2408         if (!newnode) {
2409                 rw_exit(&rtable[index].r_lock);
2410                 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2411         } else {
2412                 if (attr->type < NF3REG || attr->type > NF3FIFO)
2413                         vp->v_type = VBAD;
2414                 else
2415                         vp->v_type = nf3_to_vt[attr->type];
2416                 vp->v_rdev = makedevice(attr->rdev.specdata1,
2417                     attr->rdev.specdata2);
2418                 nfs3_attrcache(vp, attr, t);
2419                 rw_exit(&rtable[index].r_lock);
2420         }
2421 
2422         return (vp);
2423 }
2424 
2425 /*
2426  * Read this comment before making changes to rtablehash()!
2427  * This is a hash function in which seemingly obvious and harmless
2428  * changes can cause escalations costing millions of dollars!
2429  * Know what you are doing.
2430  *
2431  * rtablehash() implements Jenkins' one-at-a-time hash algorithm.  The
2432  * algorithm is currently detailed here:
2433  *
2434  *   http://burtleburtle.net/bob/hash/doobs.html
2435  *
2436  * Of course, the above link may not be valid by the time you are reading
2437  * this, but suffice it to say that the one-at-a-time algorithm works well in
2438  * almost all cases.  If you are changing the algorithm be sure to verify that
2439  * the hash algorithm still provides even distribution in all cases and with
2440  * any server returning filehandles in whatever order (sequential or random).
2441  */
2442 static int
2443 rtablehash(nfs_fhandle *fh)
2444 {
2445         ulong_t hash, len, i;
2446         char *key;
2447 
2448         key = fh->fh_buf;
2449         len = (ulong_t)fh->fh_len;
2450         for (hash = 0, i = 0; i < len; i++) {
2451                 hash += key[i];
2452                 hash += (hash << 10);
2453                 hash ^= (hash >> 6);
2454         }
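             /*
              * Final avalanche pass, then fold the hash into the table.
              * rtablemask is presumed to be rtablesize - 1, so the table
              * size must be a power of two for this to distribute evenly.
              */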
2455         hash += (hash << 3);
2456         hash ^= (hash >> 11);
2457         hash += (hash << 15);
2458         return (hash & rtablemask);
2459 }
2460 
2461 static vnode_t *
2462 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2463     struct vnodeops *vops,
2464     int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2465     int (*compar)(const void *, const void *),
2466     int *newnode, cred_t *cr, char *dnm, char *nm)
2467 {
2468         rnode_t *rp;
2469         rnode_t *trp;
2470         vnode_t *vp;
2471         mntinfo_t *mi;
2472 
2473         ASSERT(RW_READ_HELD(&rhtp->r_lock));
2474 
2475         mi = VFTOMI(vfsp);
2476 start:
2477         if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2478                 vp = RTOV(rp);
2479                 nfs_set_vroot(vp);
2480                 *newnode = 0;
2481                 return (vp);
2482         }
2483         rw_exit(&rhtp->r_lock);
2484 
2485         mutex_enter(&rpfreelist_lock);
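             /*
              * If we are at (or above) the rnode allocation limit and the
              * freelist is not empty, recycle the rnode at the head of the
              * freelist; otherwise allocate a brand new rnode and vnode
              * below.
              */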
2486         if (rpfreelist != NULL && rnew >= nrnode) {
2487                 rp = rpfreelist;
2488                 rp_rmfree(rp);
2489                 mutex_exit(&rpfreelist_lock);
2490 
2491                 vp = RTOV(rp);
2492 
2493                 if (rp->r_flags & RHASHED) {
2494                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2495                         mutex_enter(&vp->v_lock);
2496                         if (vp->v_count > 1) {
2497                                 VN_RELE_LOCKED(vp);
2498                                 mutex_exit(&vp->v_lock);
2499                                 rw_exit(&rp->r_hashq->r_lock);
2500                                 rw_enter(&rhtp->r_lock, RW_READER);
2501                                 goto start;
2502                         }
2503                         mutex_exit(&vp->v_lock);
2504                         rp_rmhash_locked(rp);
2505                         rw_exit(&rp->r_hashq->r_lock);
2506                 }
2507 
2508                 rinactive(rp, cr);
2509 
2510                 mutex_enter(&vp->v_lock);
2511                 if (vp->v_count > 1) {
2512                         VN_RELE_LOCKED(vp);
2513                         mutex_exit(&vp->v_lock);
2514                         rw_enter(&rhtp->r_lock, RW_READER);
2515                         goto start;
2516                 }
2517                 mutex_exit(&vp->v_lock);
2518                 vn_invalid(vp);
2519                 /*
2520                  * destroy old locks before bzero'ing and
2521                  * recreating the locks below.
2522                  */
2523                 nfs_rw_destroy(&rp->r_rwlock);
2524                 nfs_rw_destroy(&rp->r_lkserlock);
2525                 mutex_destroy(&rp->r_statelock);
2526                 cv_destroy(&rp->r_cv);
2527                 cv_destroy(&rp->r_commit.c_cv);
2528                 nfs_free_r_path(rp);
2529                 avl_destroy(&rp->r_dir);
2530                 /*
2531                  * Make sure that if the rnode is recycled, the
2532                  * VFS hold count is decremented properly before
2533                  * reuse.
2534                  */
2535                 VFS_RELE(vp->v_vfsp);
2536                 vn_reinit(vp);
2537         } else {
2538                 vnode_t *new_vp;
2539 
2540                 mutex_exit(&rpfreelist_lock);
2541 
2542                 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2543                 new_vp = vn_alloc(KM_SLEEP);
2544 
2545                 atomic_inc_ulong((ulong_t *)&rnew);
2546 #ifdef DEBUG
2547                 clstat_debug.nrnode.value.ui64++;
2548 #endif
2549                 vp = new_vp;
2550         }
2551 
2552         bzero(rp, sizeof (*rp));
2553         rp->r_vnode = vp;
2554         nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2555         nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2556         mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2557         cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2558         cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2559         rp->r_fh.fh_len = fh->fh_len;
2560         bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2561         rp->r_server = mi->mi_curr_serv;
2562         if (FAILOVER_MOUNT(mi)) {
2563                 /*
2564                  * If replicated servers, stash pathnames
2565                  */
2566                 if (dnm != NULL && nm != NULL) {
2567                         char *s, *p;
2568                         uint_t len;
2569 
2570                         len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2571                         rp->r_path = kmem_alloc(len, KM_SLEEP);
2572 #ifdef DEBUG
2573                         clstat_debug.rpath.value.ui64 += len;
2574 #endif
2575                         s = rp->r_path;
2576                         for (p = dnm; *p; p++)
2577                                 *s++ = *p;
2578                         *s++ = '/';
2579                         for (p = nm; *p; p++)
2580                                 *s++ = *p;
2581                         *s = '\0';
2582                 } else {
2583                         /* special case for root */
2584                         rp->r_path = kmem_alloc(2, KM_SLEEP);
2585 #ifdef DEBUG
2586                         clstat_debug.rpath.value.ui64 += 2;
2587 #endif
2588                         *rp->r_path = '.';
2589                         *(rp->r_path + 1) = '\0';
2590                 }
2591         }
2592         VFS_HOLD(vfsp);
2593         rp->r_putapage = putapage;
2594         rp->r_hashq = rhtp;
2595         rp->r_flags = RREADDIRPLUS;
2596         avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2597             offsetof(rddir_cache, tree));
2598         vn_setops(vp, vops);
2599         vp->v_data = (caddr_t)rp;
2600         vp->v_vfsp = vfsp;
2601         vp->v_type = VNON;
2602         vp->v_flag |= VMODSORT;
2603         nfs_set_vroot(vp);
2604 
2605         /*
2606          * There is a race condition if someone else
2607          * allocates the rnode while no locks are held, so
2608          * check again and, if one is found, use it instead.
2609          */
2610         rw_enter(&rhtp->r_lock, RW_WRITER);
2611         if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2612                 vp = RTOV(trp);
2613                 nfs_set_vroot(vp);
2614                 *newnode = 0;
2615                 rw_exit(&rhtp->r_lock);
2616                 rp_addfree(rp, cr);
2617                 rw_enter(&rhtp->r_lock, RW_READER);
2618                 return (vp);
2619         }
2620         rp_addhash(rp);
2621         *newnode = 1;
2622         return (vp);
2623 }
2624 
2625 /*
2626  * Callback function to check if the page should be marked as
2627  * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2628  */
2629 int
2630 nfs_setmod_check(page_t *pp)
2631 {
2632         if (pp->p_fsdata != C_NOCOMMIT) {
2633                 pp->p_fsdata = C_NOCOMMIT;
2634                 return (1);
2635         }
2636         return (0);
2637 }
2638 
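     /*
      * Mark the vnode with VROOT if its filehandle matches the filehandle
      * of the server's root, i.e. this rnode represents the root of the
      * mounted filesystem.
      */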
2639 static void
2640 nfs_set_vroot(vnode_t *vp)
2641 {
2642         rnode_t *rp;
2643         nfs_fhandle *rootfh;
2644 
2645         rp = VTOR(vp);
2646         rootfh = &rp->r_server->sv_fhandle;
2647         if (rootfh->fh_len == rp->r_fh.fh_len &&
2648             bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2649                 if (!(vp->v_flag & VROOT)) {
2650                         mutex_enter(&vp->v_lock);
2651                         vp->v_flag |= VROOT;
2652                         mutex_exit(&vp->v_lock);
2653                 }
2654         }
2655 }
2656 
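     /*
      * Free the pathname stashed in the rnode for failover mounts, if any,
      * and (under DEBUG) adjust the cached-path statistics.
      */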
2657 static void
2658 nfs_free_r_path(rnode_t *rp)
2659 {
2660         char *path;
2661         size_t len;
2662 
2663         path = rp->r_path;
2664         if (path) {
2665                 rp->r_path = NULL;
2666                 len = strlen(path) + 1;
2667                 kmem_free(path, len);
2668 #ifdef DEBUG
2669                 clstat_debug.rpath.value.ui64 -= len;
2670 #endif
2671         }
2672 }
2673 
2674 /*
2675  * Put an rnode on the free list.
2676  *
2677  * Rnodes which were allocated above and beyond the normal limit
2678  * are immediately freed.
2679  */
2680 void
2681 rp_addfree(rnode_t *rp, cred_t *cr)
2682 {
2683         vnode_t *vp;
2684         struct vfs *vfsp;
2685 
2686         vp = RTOV(rp);
2687         ASSERT(vp->v_count >= 1);
2688         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2689 
2690         /*
2691          * If we have too many rnodes allocated and there are no
2692          * references to this rnode, or if the rnode is no longer
2693          * accessible because it does not reside in the hash queues,
2694          * or if an i/o error occurred while writing to the file,
2695          * then just free it instead of putting it on the rnode
2696          * freelist.
2697          */
2698         vfsp = vp->v_vfsp;
2699         if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2700             (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2701                 if (rp->r_flags & RHASHED) {
2702                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2703                         mutex_enter(&vp->v_lock);
2704                         if (vp->v_count > 1) {
2705                                 VN_RELE_LOCKED(vp);
2706                                 mutex_exit(&vp->v_lock);
2707                                 rw_exit(&rp->r_hashq->r_lock);
2708                                 return;
2709                         }
2710                         mutex_exit(&vp->v_lock);
2711                         rp_rmhash_locked(rp);
2712                         rw_exit(&rp->r_hashq->r_lock);
2713                 }
2714 
2715                 rinactive(rp, cr);
2716 
2717                 /*
2718                  * Recheck the vnode reference count.  We need to
2719                  * make sure that another reference has not been
2720                  * acquired while we were not holding v_lock.  The
2721                  * rnode is not in the rnode hash queues, so the
2722                  * only way for a reference to have been acquired
2723                  * is for a VOP_PUTPAGE because the rnode was marked
2724                  * with RDIRTY or for a modified page.  This
2725                  * reference may have been acquired before our call
2726                  * to rinactive.  The i/o may have been completed,
2727                  * thus allowing rinactive to complete, but the
2728                  * reference to the vnode may not have been released
2729                  * yet.  In any case, the rnode can not be destroyed
2730                  * until the other references to this vnode have been
2731                  * released.  The other references will take care of
2732                  * either destroying the rnode or placing it on the
2733                  * rnode freelist.  If there are no other references,
2734                  * then the rnode may be safely destroyed.
2735                  */
2736                 mutex_enter(&vp->v_lock);
2737                 if (vp->v_count > 1) {
2738                         VN_RELE_LOCKED(vp);
2739                         mutex_exit(&vp->v_lock);
2740                         return;
2741                 }
2742                 mutex_exit(&vp->v_lock);
2743 
2744                 destroy_rnode(rp);
2745                 return;
2746         }
2747 
2748         /*
2749          * Lock the hash queue and then recheck the reference count
2750          * to ensure that no other thread has acquired a reference
2751          * in the meantime; a new reference would indicate that the
2752          * rnode should not be placed on the freelist.  If another
2753          * reference has been acquired, just release this one and let
2754          * the holder of that reference add the rnode to the freelist.
2755          */
2756         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2757 
2758         mutex_enter(&vp->v_lock);
2759         if (vp->v_count > 1) {
2760                 VN_RELE_LOCKED(vp);
2761                 mutex_exit(&vp->v_lock);
2762                 rw_exit(&rp->r_hashq->r_lock);
2763                 return;
2764         }
2765         mutex_exit(&vp->v_lock);
2766 
2767         /*
2768          * If there is no cached data or metadata for this file, then
2769          * put the rnode on the front of the freelist so that it will
2770          * be reused before other rnodes which may have cached data or
2771          * metadata associated with them.  Otherwise, add it to the end.
2772          */
2773         mutex_enter(&rpfreelist_lock);
2774         if (rpfreelist == NULL) {
2775                 rp->r_freef = rp;
2776                 rp->r_freeb = rp;
2777                 rpfreelist = rp;
2778         } else {
2779                 rp->r_freef = rpfreelist;
2780                 rp->r_freeb = rpfreelist->r_freeb;
2781                 rpfreelist->r_freeb->r_freef = rp;
2782                 rpfreelist->r_freeb = rp;
2783                 if (!vn_has_cached_data(vp) &&
2784                     !HAVE_RDDIR_CACHE(rp) &&
2785                     rp->r_symlink.contents == NULL &&
2786                     rp->r_secattr == NULL &&
2787                     rp->r_pathconf == NULL)
2788                         rpfreelist = rp;
2789         }
2790         mutex_exit(&rpfreelist_lock);
2791 
2792         rw_exit(&rp->r_hashq->r_lock);
2793 }
2794 
2795 /*
2796  * Remove an rnode from the free list.
2797  *
2798  * The caller must be holding rpfreelist_lock and the rnode
2799  * must be on the freelist.
2800  */
2801 static void
2802 rp_rmfree(rnode_t *rp)
2803 {
2804 
2805         ASSERT(MUTEX_HELD(&rpfreelist_lock));
2806         ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2807 
2808         if (rp == rpfreelist) {
2809                 rpfreelist = rp->r_freef;
2810                 if (rp == rpfreelist)
2811                         rpfreelist = NULL;
2812         }
2813 
2814         rp->r_freeb->r_freef = rp->r_freef;
2815         rp->r_freef->r_freeb = rp->r_freeb;
2816 
2817         rp->r_freef = rp->r_freeb = NULL;
2818 }
2819 
2820 /*
2821  * Put an rnode into the hash table.
2822  *
2823  * The caller must be holding the exclusive hash queue lock.
2824  */
2825 static void
2826 rp_addhash(rnode_t *rp)
2827 {
2828 
2829         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2830         ASSERT(!(rp->r_flags & RHASHED));
2831 
2832         rp->r_hashf = rp->r_hashq->r_hashf;
2833         rp->r_hashq->r_hashf = rp;
2834         rp->r_hashb = (rnode_t *)rp->r_hashq;
2835         rp->r_hashf->r_hashb = rp;
2836 
2837         mutex_enter(&rp->r_statelock);
2838         rp->r_flags |= RHASHED;
2839         mutex_exit(&rp->r_statelock);
2840 }
2841 
2842 /*
2843  * Remove an rnode from the hash table.
2844  *
2845  * The caller must be holding the exclusive hash queue lock.
2846  */
2847 static void
2848 rp_rmhash_locked(rnode_t *rp)
2849 {
2850 
2851         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2852         ASSERT(rp->r_flags & RHASHED);
2853 
2854         rp->r_hashb->r_hashf = rp->r_hashf;
2855         rp->r_hashf->r_hashb = rp->r_hashb;
2856 
2857         mutex_enter(&rp->r_statelock);
2858         rp->r_flags &= ~RHASHED;
2859         mutex_exit(&rp->r_statelock);
2860 }
2861 
2862 /*
2863  * Remove an rnode from the hash table.
2864  *
2865  * The caller must not be holding the hash queue lock.
2866  */
2867 void
2868 rp_rmhash(rnode_t *rp)
2869 {
2870 
2871         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2872         rp_rmhash_locked(rp);
2873         rw_exit(&rp->r_hashq->r_lock);
2874 }
2875 
2876 /*
2877  * Look up an rnode by filehandle.
2878  *
2879  * The caller must be holding the hash queue lock, either shared or exclusive.
2880  */
2881 static rnode_t *
2882 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2883 {
2884         rnode_t *rp;
2885         vnode_t *vp;
2886 
2887         ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2888 
2889         for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2890                 vp = RTOV(rp);
2891                 if (vp->v_vfsp == vfsp &&
2892                     rp->r_fh.fh_len == fh->fh_len &&
2893                     bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2894                         /*
2895                          * remove rnode from free list, if necessary.
2896                          */
2897                         if (rp->r_freef != NULL) {
2898                                 mutex_enter(&rpfreelist_lock);
2899                                 /*
2900                                  * If the rnode is on the freelist,
2901                                  * then remove it and use that reference
2902                                  * as the new reference.  Otherwise,
2903                                  * need to increment the reference count.
2904                                  */
2905                                 if (rp->r_freef != NULL) {
2906                                         rp_rmfree(rp);
2907                                         mutex_exit(&rpfreelist_lock);
2908                                 } else {
2909                                         mutex_exit(&rpfreelist_lock);
2910                                         VN_HOLD(vp);
2911                                 }
2912                         } else
2913                                 VN_HOLD(vp);
2914                         return (rp);
2915                 }
2916         }
2917         return (NULL);
2918 }
2919 
2920 /*
2921  * Return 1 if there is an active vnode belonging to this vfs in the
2922  * rtable cache.
2923  *
2924  * Several of these checks are done without holding the usual
2925  * locks.  This is safe because destroy_rtable(), rp_addfree(),
2926  * etc. will redo the necessary checks before actually destroying
2927  * any rnodes.
2928  */
2929 int
2930 check_rtable(struct vfs *vfsp)
2931 {
2932         int index;
2933         rnode_t *rp;
2934         vnode_t *vp;
2935 
2936         for (index = 0; index < rtablesize; index++) {
2937                 rw_enter(&rtable[index].r_lock, RW_READER);
2938                 for (rp = rtable[index].r_hashf;
2939                     rp != (rnode_t *)(&rtable[index]);
2940                     rp = rp->r_hashf) {
2941                         vp = RTOV(rp);
2942                         if (vp->v_vfsp == vfsp) {
2943                                 if (rp->r_freef == NULL ||
2944                                     (vn_has_cached_data(vp) &&
2945                                     (rp->r_flags & RDIRTY)) ||
2946                                     rp->r_count > 0) {
2947                                         rw_exit(&rtable[index].r_lock);
2948                                         return (1);
2949                                 }
2950                         }
2951                 }
2952                 rw_exit(&rtable[index].r_lock);
2953         }
2954         return (0);
2955 }
2956 
2957 /*
2958  * Destroy inactive vnodes from the hash queues which belong to this
2959  * vfs.  It is essential that we destroy all inactive vnodes during a
2960  * forced unmount as well as during a normal unmount.
2961  */
2962 void
2963 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2964 {
2965         int index;
2966         rnode_t *rp;
2967         rnode_t *rlist;
2968         rnode_t *r_hashf;
2969         vnode_t *vp;
2970 
2971         rlist = NULL;
2972 
2973         for (index = 0; index < rtablesize; index++) {
2974                 rw_enter(&rtable[index].r_lock, RW_WRITER);
2975                 for (rp = rtable[index].r_hashf;
2976                     rp != (rnode_t *)(&rtable[index]);
2977                     rp = r_hashf) {
2978                         /* save the hash pointer before destroying */
2979                         r_hashf = rp->r_hashf;
2980                         vp = RTOV(rp);
2981                         if (vp->v_vfsp == vfsp) {
2982                                 mutex_enter(&rpfreelist_lock);
2983                                 if (rp->r_freef != NULL) {
2984                                         rp_rmfree(rp);
2985                                         mutex_exit(&rpfreelist_lock);
2986                                         rp_rmhash_locked(rp);
2987                                         rp->r_hashf = rlist;
2988                                         rlist = rp;
2989                                 } else
2990                                         mutex_exit(&rpfreelist_lock);
2991                         }
2992                 }
2993                 rw_exit(&rtable[index].r_lock);
2994         }
2995 
2996         for (rp = rlist; rp != NULL; rp = rlist) {
2997                 rlist = rp->r_hashf;
2998                 /*
2999                  * This call to rp_addfree will end up destroying the
3000                  * rnode, but in a safe way with the appropriate set
3001                  * of checks done.
3002                  */
3003                 rp_addfree(rp, cr);
3004         }
3005 
3006 }
3007 
3008 /*
3009  * This routine destroys all the resources associated with the rnode
3010  * and then the rnode itself.
3011  */
3012 static void
3013 destroy_rnode(rnode_t *rp)
3014 {
3015         vnode_t *vp;
3016         vfs_t *vfsp;
3017 
3018         vp = RTOV(rp);
3019         vfsp = vp->v_vfsp;
3020 
3021         ASSERT(vp->v_count == 1);
3022         ASSERT(rp->r_count == 0);
3023         ASSERT(rp->r_lmpl == NULL);
3024         ASSERT(rp->r_mapcnt == 0);
3025         ASSERT(!(rp->r_flags & RHASHED));
3026         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3027         atomic_dec_ulong((ulong_t *)&rnew);
3028 #ifdef DEBUG
3029         clstat_debug.nrnode.value.ui64--;
3030 #endif
3031         nfs_rw_destroy(&rp->r_rwlock);
3032         nfs_rw_destroy(&rp->r_lkserlock);
3033         mutex_destroy(&rp->r_statelock);
3034         cv_destroy(&rp->r_cv);
3035         cv_destroy(&rp->r_commit.c_cv);
3036         if (rp->r_flags & RDELMAPLIST)
3037                 list_destroy(&rp->r_indelmap);
3038         nfs_free_r_path(rp);
3039         avl_destroy(&rp->r_dir);
3040         vn_invalid(vp);
3041         vn_free(vp);
3042         kmem_cache_free(rnode_cache, rp);
3043         VFS_RELE(vfsp);
3044 }
3045 
3046 /*
3047  * Flush all vnodes in this (or every) vfs.
3048  * Used by nfs_sync and by nfs_unmount.
3049  */
3050 void
3051 rflush(struct vfs *vfsp, cred_t *cr)
3052 {
3053         int index;
3054         rnode_t *rp;
3055         vnode_t *vp, **vplist;
3056         long num, cnt;
3057 
3058         /*
3059          * Check to see whether there is anything to do.
3060          */
3061         num = rnew;
3062         if (num == 0)
3063                 return;
3064 
3065         /*
3066          * Allocate a slot for all currently active rnodes on the
3067          * supposition that they all may need flushing.
3068          */
3069         vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3070         cnt = 0;
3071 
3072         /*
3073          * Walk the hash queues looking for rnodes with page
3074          * lists associated with them.  Make a list of these
3075          * files.
3076          */
3077         for (index = 0; index < rtablesize; index++) {
3078                 rw_enter(&rtable[index].r_lock, RW_READER);
3079                 for (rp = rtable[index].r_hashf;
3080                     rp != (rnode_t *)(&rtable[index]);
3081                     rp = rp->r_hashf) {
3082                         vp = RTOV(rp);
3083                         /*
3084                          * Don't bother sync'ing a vp if it
3085                          * is part of a virtual swap device or
3086                          * if the VFS is read-only.
3087                          */
3088                         if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3089                                 continue;
3090                         /*
3091                          * If flushing all mounted file systems or
3092                          * the vnode belongs to this vfs, has pages
3093                          * and is marked as either dirty or mmap'd,
3094                          * hold and add this vnode to the list of
3095                          * vnodes to flush.
3096                          */
3097                         if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3098                             vn_has_cached_data(vp) &&
3099                             ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3100                                 VN_HOLD(vp);
3101                                 vplist[cnt++] = vp;
3102                                 if (cnt == num) {
3103                                         rw_exit(&rtable[index].r_lock);
3104                                         goto toomany;
3105                                 }
3106                         }
3107                 }
3108                 rw_exit(&rtable[index].r_lock);
3109         }
3110 toomany:
3111 
3112         /*
3113          * Flush and release all of the files on the list.
3114          */
3115         while (cnt-- > 0) {
3116                 vp = vplist[cnt];
3117                 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3118                 VN_RELE(vp);
3119         }
3120 
3121         /*
3122          * Free the space allocated to hold the list.
3123          */
3124         kmem_free(vplist, num * sizeof (*vplist));
3125 }
3126 
3127 /*
3128  * This probably needs to be larger than or equal to
3129  * log2(sizeof (struct rnode)) due to the way that rnodes are
3130  * allocated.
3131  */
3132 #define ACACHE_SHIFT_BITS       9
3133 
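     /*
      * Hash an rnode and cred pair to an index into the access cache
      * hash buckets.
      */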
3134 static int
3135 acachehash(rnode_t *rp, cred_t *cr)
3136 {
3137 
3138         return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3139             acachemask);
3140 }
3141 
3142 #ifdef DEBUG
3143 static long nfs_access_cache_hits = 0;
3144 static long nfs_access_cache_misses = 0;
3145 #endif
3146 
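     /*
      * Look in the access cache for a cached answer as to whether the
      * credentials in cr are allowed the access bits in acc.  Returns
      * NFS_ACCESS_ALLOWED or NFS_ACCESS_DENIED if a usable entry is
      * found, or NFS_ACCESS_UNKNOWN if the attribute cache is not valid
      * or no matching entry covers the requested bits.
      */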
3147 nfs_access_type_t
3148 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3149 {
3150         vnode_t *vp;
3151         acache_t *ap;
3152         acache_hash_t *hp;
3153         nfs_access_type_t all;
3154 
3155         vp = RTOV(rp);
3156         if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3157                 return (NFS_ACCESS_UNKNOWN);
3158 
3159         if (rp->r_acache != NULL) {
3160                 hp = &acache[acachehash(rp, cr)];
3161                 rw_enter(&hp->lock, RW_READER);
3162                 ap = hp->next;
3163                 while (ap != (acache_t *)hp) {
3164                         if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3165                                 if ((ap->known & acc) == acc) {
3166 #ifdef DEBUG
3167                                         nfs_access_cache_hits++;
3168 #endif
3169                                         if ((ap->allowed & acc) == acc)
3170                                                 all = NFS_ACCESS_ALLOWED;
3171                                         else
3172                                                 all = NFS_ACCESS_DENIED;
3173                                 } else {
3174 #ifdef DEBUG
3175                                         nfs_access_cache_misses++;
3176 #endif
3177                                         all = NFS_ACCESS_UNKNOWN;
3178                                 }
3179                                 rw_exit(&hp->lock);
3180                                 return (all);
3181                         }
3182                         ap = ap->next;
3183                 }
3184                 rw_exit(&hp->lock);
3185         }
3186 
3187 #ifdef DEBUG
3188         nfs_access_cache_misses++;
3189 #endif
3190         return (NFS_ACCESS_UNKNOWN);
3191 }
3192 
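     /*
      * Record the results of an access check in the access cache: acc is
      * the set of access bits that were checked and resacc the set that
      * the server allowed.  An existing entry for this rnode and cred is
      * updated in place; otherwise a new entry is added.
      */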
3193 void
3194 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3195 {
3196         acache_t *ap;
3197         acache_t *nap;
3198         acache_hash_t *hp;
3199 
3200         hp = &acache[acachehash(rp, cr)];
3201 
3202         /*
3203          * Allocate now, on the assumption that an allocation will
3204          * most likely be required.  This allows the allocation to
3205          * happen without holding the hash bucket lock.
3206          */
3207         nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3208         if (nap != NULL) {
3209                 nap->known = acc;
3210                 nap->allowed = resacc;
3211                 nap->rnode = rp;
3212                 crhold(cr);
3213                 nap->cred = cr;
3214                 nap->hashq = hp;
3215         }
3216 
3217         rw_enter(&hp->lock, RW_WRITER);
3218 
3219         if (rp->r_acache != NULL) {
3220                 ap = hp->next;
3221                 while (ap != (acache_t *)hp) {
3222                         if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3223                                 ap->known |= acc;
3224                                 ap->allowed &= ~acc;
3225                                 ap->allowed |= resacc;
3226                                 rw_exit(&hp->lock);
3227                                 if (nap != NULL) {
3228                                         crfree(nap->cred);
3229                                         kmem_cache_free(acache_cache, nap);
3230                                 }
3231                                 return;
3232                         }
3233                         ap = ap->next;
3234                 }
3235         }
3236 
3237         if (nap != NULL) {
3238 #ifdef DEBUG
3239                 clstat_debug.access.value.ui64++;
3240 #endif
3241                 nap->next = hp->next;
3242                 hp->next = nap;
3243                 nap->next->prev = nap;
3244                 nap->prev = (acache_t *)hp;
3245 
3246                 mutex_enter(&rp->r_statelock);
3247                 nap->list = rp->r_acache;
3248                 rp->r_acache = nap;
3249                 mutex_exit(&rp->r_statelock);
3250         }
3251 
3252         rw_exit(&hp->lock);
3253 }
3254 
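     /*
      * Purge all access cache entries associated with this rnode.
      * Returns 1 if any entries were freed, 0 if there was nothing
      * to free.
      */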
3255 int
3256 nfs_access_purge_rp(rnode_t *rp)
3257 {
3258         acache_t *ap;
3259         acache_t *tmpap;
3260         acache_t *rplist;
3261 
3262         /*
3263          * If there aren't any cached entries, then there is nothing
3264          * to free.
3265          */
3266         if (rp->r_acache == NULL)
3267                 return (0);
3268 
3269         mutex_enter(&rp->r_statelock);
3270         rplist = rp->r_acache;
3271         rp->r_acache = NULL;
3272         mutex_exit(&rp->r_statelock);
3273 
3274         /*
3275          * Loop through each entry in the list pointed to in the
3276          * rnode.  Remove each of these entries from the hash
3277          * queue that it is on and remove it from the list in
3278          * the rnode.
3279          */
3280         for (ap = rplist; ap != NULL; ap = tmpap) {
3281                 rw_enter(&ap->hashq->lock, RW_WRITER);
3282                 ap->prev->next = ap->next;
3283                 ap->next->prev = ap->prev;
3284                 rw_exit(&ap->hashq->lock);
3285 
3286                 tmpap = ap->list;
3287                 crfree(ap->cred);
3288                 kmem_cache_free(acache_cache, ap);
3289 #ifdef DEBUG
3290                 clstat_debug.access.value.ui64--;
3291 #endif
3292         }
3293 
3294         return (1);
3295 }
3296 
3297 static const char prefix[] = ".nfs";
3298 
3299 static kmutex_t newnum_lock;
3300 
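     /*
      * Generate a monotonically increasing id, seeded from the clock on
      * first use.  Used to construct the unique ".nfsXXXX" names below.
      */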
3301 int
3302 newnum(void)
3303 {
3304         static uint_t newnum = 0;
3305         uint_t id;
3306 
3307         mutex_enter(&newnum_lock);
3308         if (newnum == 0)
3309                 newnum = gethrestime_sec() & 0xffff;
3310         id = newnum++;
3311         mutex_exit(&newnum_lock);
3312         return (id);
3313 }
3314 
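     /*
      * Allocate and return a name of the form ".nfs" followed by the hex
      * encoding of an id from newnum().  The returned buffer is
      * MAXNAMELEN bytes and must be freed by the caller.
      */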
3315 char *
3316 newname(void)
3317 {
3318         char *news;
3319         char *s;
3320         const char *p;
3321         uint_t id;
3322 
3323         id = newnum();
3324         news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3325         s = news;
3326         p = prefix;
3327         while (*p != '\0')
3328                 *s++ = *p++;
3329         while (id != 0) {
3330                 *s++ = "0123456789ABCDEF"[id & 0x0f];
3331                 id >>= 4;
3332         }
3333         *s = '\0';
3334         return (news);
3335 }
3336 
3337 /*
3338  * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3339  * framework.
3340  */
3341 static int
3342 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3343 {
3344         ksp->ks_snaptime = gethrtime();
3345         if (rw == KSTAT_WRITE) {
3346                 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3347 #ifdef DEBUG
3348                 /*
3349                  * Currently only the global zone can write to kstats, but we
3350                  * add the check just for paranoia.
3351                  */
3352                 if (INGLOBALZONE(curproc))
3353                         bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3354                             sizeof (clstat_debug));
3355 #endif
3356         } else {
3357                 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3358 #ifdef DEBUG
3359                 /*
3360                  * If we're displaying the "global" debug kstat values, we
3361                  * display them as-is to all zones since in fact they apply to
3362                  * the system as a whole.
3363                  */
3364                 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3365                     sizeof (clstat_debug));
3366 #endif
3367         }
3368         return (0);
3369 }
3370 
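     /*
      * Zone key creation callback: allocate the per-zone NFS client data,
      * create the per-zone "nfs_client" kstat, and link the new structure
      * onto the global list of per-zone client data.
      */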
3371 static void *
3372 clinit_zone(zoneid_t zoneid)
3373 {
3374         kstat_t *nfs_client_kstat;
3375         struct nfs_clnt *nfscl;
3376         uint_t ndata;
3377 
3378         nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3379         mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3380         nfscl->nfscl_chtable = NULL;
3381         nfscl->nfscl_zoneid = zoneid;
3382 
3383         bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3384         ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3385 #ifdef DEBUG
3386         ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3387 #endif
3388         if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3389             "misc", KSTAT_TYPE_NAMED, ndata,
3390             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3391                 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3392                 nfs_client_kstat->ks_snapshot = cl_snapshot;
3393                 kstat_install(nfs_client_kstat);
3394         }
3395         mutex_enter(&nfs_clnt_list_lock);
3396         list_insert_head(&nfs_clnt_list, nfscl);
3397         mutex_exit(&nfs_clnt_list_lock);
3398         return (nfscl);
3399 }
3400 
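     /*
      * Zone key deletion callback: tear down the per-zone NFS client data
      * created by clinit_zone(), reclaiming client handles and deleting
      * the per-zone kstat.
      */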
3401 /*ARGSUSED*/
3402 static void
3403 clfini_zone(zoneid_t zoneid, void *arg)
3404 {
3405         struct nfs_clnt *nfscl = arg;
3406         chhead_t *chp, *next;
3407 
3408         if (nfscl == NULL)
3409                 return;
3410         mutex_enter(&nfs_clnt_list_lock);
3411         list_remove(&nfs_clnt_list, nfscl);
3412         mutex_exit(&nfs_clnt_list_lock);
3413         clreclaim_zone(nfscl, 0);
3414         for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3415                 ASSERT(chp->ch_list == NULL);
3416                 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3417                 next = chp->ch_next;
3418                 kmem_free(chp, sizeof (*chp));
3419         }
3420         kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3421         mutex_destroy(&nfscl->nfscl_chtable_lock);
3422         kmem_free(nfscl, sizeof (*nfscl));
3423 }
3424 
3425 /*
3426  * Called by endpnt_destructor to make sure the client handles are
3427  * cleaned up before the RPC endpoints.  This becomes a no-op if
3428  * clfini_zone (above) is called first.  This function is needed
3429  * (rather than relying on clfini_zone to clean up) because the ZSD
3430  * callbacks have no ordering mechanism, so we have no way to ensure
3431  * that clfini_zone is called before endpnt_destructor.
3432  */
3433 void
3434 clcleanup_zone(zoneid_t zoneid)
3435 {
3436         struct nfs_clnt *nfscl;
3437 
3438         mutex_enter(&nfs_clnt_list_lock);
3439         nfscl = list_head(&nfs_clnt_list);
3440         for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3441                 if (nfscl->nfscl_zoneid == zoneid) {
3442                         clreclaim_zone(nfscl, 0);
3443                         break;
3444                 }
3445         }
3446         mutex_exit(&nfs_clnt_list_lock);
3447 }
3448 
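     /*
      * Module initialization: set up the rnode hash queues, the access
      * cache, the client handle cache, the per-zone client data, and the
      * global locks, and reserve a device major number for NFS mounts.
      */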
3449 int
3450 nfs_subrinit(void)
3451 {
3452         int i;
3453         ulong_t nrnode_max;
3454 
3455         /*
3456          * Allocate and initialize the rnode hash queues
3457          */
3458         if (nrnode <= 0)
3459                 nrnode = ncsize;
3460         nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3461         if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3462                 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3463                     "!setting nrnode to max value of %ld", nrnode_max);
3464                 nrnode = nrnode_max;
3465         }
3466 
3467         rtablesize = 1 << highbit(nrnode / hashlen);
3468         rtablemask = rtablesize - 1;
3469         rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3470         for (i = 0; i < rtablesize; i++) {
3471                 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3472                 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3473                 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3474         }
3475         rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3476             0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3477 
3478         /*
3479          * Allocate and initialize the access cache
3480          */
3481 
3482         /*
3483          * The initial guess is one access cache entry per rnode, unless
3484          * nacache is set to a non-zero value, in which case nacache is
3485          * used as the guess at the number of access cache entries.
3486          */
3487         if (nacache > 0)
3488                 acachesize = 1 << highbit(nacache / hashlen);
3489         else
3490                 acachesize = rtablesize;
3491         acachemask = acachesize - 1;
3492         acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3493         for (i = 0; i < acachesize; i++) {
3494                 acache[i].next = (acache_t *)&acache[i];
3495                 acache[i].prev = (acache_t *)&acache[i];
3496                 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3497         }
3498         acache_cache = kmem_cache_create("nfs_access_cache",
3499             sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3500         /*
3501          * Allocate and initialize the client handle cache
3502          */
3503         chtab_cache = kmem_cache_create("client_handle_cache",
3504             sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3505         /*
3506          * Initialize the list of per-zone client handles (and associated data).
3507          * This needs to be done before we call zone_key_create().
3508          */
3509         list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3510             offsetof(struct nfs_clnt, nfscl_node));
3511         /*
3512          * Initialize the zone_key for per-zone client handle lists.
3513          */
3514         zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3515         /*
3516          * Initialize the various mutexes and reader/writer locks
3517          */
3518         mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3519         mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3520         mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3521 
3522         /*
3523          * Assign unique major number for all nfs mounts
3524          */
3525         if ((nfs_major = getudev()) == -1) {
3526                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3527                     "nfs: init: can't get unique device number");
3528                 nfs_major = 0;
3529         }
3530         nfs_minor = 0;
3531 
3532         if (nfs3_jukebox_delay == 0)
3533                 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3534 
3535         return (0);
3536 }
3537 
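     /*
      * Module teardown: release everything set up by nfs_subrinit().
      */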
3538 void
3539 nfs_subrfini(void)
3540 {
3541         int i;
3542 
3543         /*
3544          * Deallocate the rnode hash queues
3545          */
3546         kmem_cache_destroy(rnode_cache);
3547 
3548         for (i = 0; i < rtablesize; i++)
3549                 rw_destroy(&rtable[i].r_lock);
3550         kmem_free(rtable, rtablesize * sizeof (*rtable));
3551 
3552         /*
3553          * Deallocate the access cache
3554          */
3555         kmem_cache_destroy(acache_cache);
3556 
3557         for (i = 0; i < acachesize; i++)
3558                 rw_destroy(&acache[i].lock);
3559         kmem_free(acache, acachesize * sizeof (*acache));
3560 
3561         /*
3562          * Deallocate the client handle cache
3563          */
3564         kmem_cache_destroy(chtab_cache);
3565 
3566         /*
3567          * Destroy the various mutexes and reader/writer locks
3568          */
3569         mutex_destroy(&rpfreelist_lock);
3570         mutex_destroy(&newnum_lock);
3571         mutex_destroy(&nfs_minor_lock);
3572         (void) zone_key_delete(nfsclnt_zone_key);
3573 }
3574 
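     /*
      * Translate a local errno value into an NFS version 2 status.
      */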
3575 enum nfsstat
3576 puterrno(int error)
3577 {
3578 
3579         switch (error) {
3580         case EOPNOTSUPP:
3581                 return (NFSERR_OPNOTSUPP);
3582         case ENAMETOOLONG:
3583                 return (NFSERR_NAMETOOLONG);
3584         case ENOTEMPTY:
3585                 return (NFSERR_NOTEMPTY);
3586         case EDQUOT:
3587                 return (NFSERR_DQUOT);
3588         case ESTALE:
3589                 return (NFSERR_STALE);
3590         case EREMOTE:
3591                 return (NFSERR_REMOTE);
3592         case ENOSYS:
3593                 return (NFSERR_OPNOTSUPP);
3594         case EOVERFLOW:
3595                 return (NFSERR_INVAL);
3596         default:
3597                 return ((enum nfsstat)error);
3598         }
3599         /* NOTREACHED */
3600 }
3601 
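     /*
      * Translate an NFS version 2 status into a local errno value.
      */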
3602 int
3603 geterrno(enum nfsstat status)
3604 {
3605 
3606         switch (status) {
3607         case NFSERR_OPNOTSUPP:
3608                 return (EOPNOTSUPP);
3609         case NFSERR_NAMETOOLONG:
3610                 return (ENAMETOOLONG);
3611         case NFSERR_NOTEMPTY:
3612                 return (ENOTEMPTY);
3613         case NFSERR_DQUOT:
3614                 return (EDQUOT);
3615         case NFSERR_STALE:
3616                 return (ESTALE);
3617         case NFSERR_REMOTE:
3618                 return (EREMOTE);
3619         case NFSERR_WFLUSH:
3620                 return (EIO);
3621         default:
3622                 return ((int)status);
3623         }
3624         /* NOTREACHED */
3625 }
3626 
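     /*
      * Translate a local errno value into an NFS version 3 status.  On
      * DEBUG kernels, unexpected errno values are logged.
      */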
3627 enum nfsstat3
3628 puterrno3(int error)
3629 {
3630 
3631 #ifdef DEBUG
3632         switch (error) {
3633         case 0:
3634                 return (NFS3_OK);
3635         case EPERM:
3636                 return (NFS3ERR_PERM);
3637         case ENOENT:
3638                 return (NFS3ERR_NOENT);
3639         case EIO:
3640                 return (NFS3ERR_IO);
3641         case ENXIO:
3642                 return (NFS3ERR_NXIO);
3643         case EACCES:
3644                 return (NFS3ERR_ACCES);
3645         case EEXIST:
3646                 return (NFS3ERR_EXIST);
3647         case EXDEV:
3648                 return (NFS3ERR_XDEV);
3649         case ENODEV:
3650                 return (NFS3ERR_NODEV);
3651         case ENOTDIR:
3652                 return (NFS3ERR_NOTDIR);
3653         case EISDIR:
3654                 return (NFS3ERR_ISDIR);
3655         case EINVAL:
3656                 return (NFS3ERR_INVAL);
3657         case EFBIG:
3658                 return (NFS3ERR_FBIG);
3659         case ENOSPC:
3660                 return (NFS3ERR_NOSPC);
3661         case EROFS:
3662                 return (NFS3ERR_ROFS);
3663         case EMLINK:
3664                 return (NFS3ERR_MLINK);
3665         case ENAMETOOLONG:
3666                 return (NFS3ERR_NAMETOOLONG);
3667         case ENOTEMPTY:
3668                 return (NFS3ERR_NOTEMPTY);
3669         case EDQUOT:
3670                 return (NFS3ERR_DQUOT);
3671         case ESTALE:
3672                 return (NFS3ERR_STALE);
3673         case EREMOTE:
3674                 return (NFS3ERR_REMOTE);
3675         case ENOSYS:
3676         case EOPNOTSUPP:
3677                 return (NFS3ERR_NOTSUPP);
3678         case EOVERFLOW:
3679                 return (NFS3ERR_INVAL);
3680         default:
3681                 zcmn_err(getzoneid(), CE_WARN,
3682                     "puterrno3: got error %d", error);
3683                 return ((enum nfsstat3)error);
3684         }
3685 #else
3686         switch (error) {
3687         case ENAMETOOLONG:
3688                 return (NFS3ERR_NAMETOOLONG);
3689         case ENOTEMPTY:
3690                 return (NFS3ERR_NOTEMPTY);
3691         case EDQUOT:
3692                 return (NFS3ERR_DQUOT);
3693         case ESTALE:
3694                 return (NFS3ERR_STALE);
3695         case ENOSYS:
3696         case EOPNOTSUPP:
3697                 return (NFS3ERR_NOTSUPP);
3698         case EREMOTE:
3699                 return (NFS3ERR_REMOTE);
3700         case EOVERFLOW:
3701                 return (NFS3ERR_INVAL);
3702         default:
3703                 return ((enum nfsstat3)error);
3704         }
3705 #endif
3706 }
3707 
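     /*
      * Translate an NFS version 3 status into a local errno value.  On
      * DEBUG kernels, unexpected status values are logged.
      */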
3708 int
3709 geterrno3(enum nfsstat3 status)
3710 {
3711 
3712 #ifdef DEBUG
3713         switch (status) {
3714         case NFS3_OK:
3715                 return (0);
3716         case NFS3ERR_PERM:
3717                 return (EPERM);
3718         case NFS3ERR_NOENT:
3719                 return (ENOENT);
3720         case NFS3ERR_IO:
3721                 return (EIO);
3722         case NFS3ERR_NXIO:
3723                 return (ENXIO);
3724         case NFS3ERR_ACCES:
3725                 return (EACCES);
3726         case NFS3ERR_EXIST:
3727                 return (EEXIST);
3728         case NFS3ERR_XDEV:
3729                 return (EXDEV);
3730         case NFS3ERR_NODEV:
3731                 return (ENODEV);
3732         case NFS3ERR_NOTDIR:
3733                 return (ENOTDIR);
3734         case NFS3ERR_ISDIR:
3735                 return (EISDIR);
3736         case NFS3ERR_INVAL:
3737                 return (EINVAL);
3738         case NFS3ERR_FBIG:
3739                 return (EFBIG);
3740         case NFS3ERR_NOSPC:
3741                 return (ENOSPC);
3742         case NFS3ERR_ROFS:
3743                 return (EROFS);
3744         case NFS3ERR_MLINK:
3745                 return (EMLINK);
3746         case NFS3ERR_NAMETOOLONG:
3747                 return (ENAMETOOLONG);
3748         case NFS3ERR_NOTEMPTY:
3749                 return (ENOTEMPTY);
3750         case NFS3ERR_DQUOT:
3751                 return (EDQUOT);
3752         case NFS3ERR_STALE:
3753                 return (ESTALE);
3754         case NFS3ERR_REMOTE:
3755                 return (EREMOTE);
3756         case NFS3ERR_BADHANDLE:
3757                 return (ESTALE);
3758         case NFS3ERR_NOT_SYNC:
3759                 return (EINVAL);
3760         case NFS3ERR_BAD_COOKIE:
3761                 return (ENOENT);
3762         case NFS3ERR_NOTSUPP:
3763                 return (EOPNOTSUPP);
3764         case NFS3ERR_TOOSMALL:
3765                 return (EINVAL);
3766         case NFS3ERR_SERVERFAULT:
3767                 return (EIO);
3768         case NFS3ERR_BADTYPE:
3769                 return (EINVAL);
3770         case NFS3ERR_JUKEBOX:
3771                 return (ENXIO);
3772         default:
3773                 zcmn_err(getzoneid(), CE_WARN,
3774                     "geterrno3: got status %d", status);
3775                 return ((int)status);
3776         }
3777 #else
3778         switch (status) {
3779         case NFS3ERR_NAMETOOLONG:
3780                 return (ENAMETOOLONG);
3781         case NFS3ERR_NOTEMPTY:
3782                 return (ENOTEMPTY);
3783         case NFS3ERR_DQUOT:
3784                 return (EDQUOT);
3785         case NFS3ERR_STALE:
3786         case NFS3ERR_BADHANDLE:
3787                 return (ESTALE);
3788         case NFS3ERR_NOTSUPP:
3789                 return (EOPNOTSUPP);
3790         case NFS3ERR_REMOTE:
3791                 return (EREMOTE);
3792         case NFS3ERR_NOT_SYNC:
3793         case NFS3ERR_TOOSMALL:
3794         case NFS3ERR_BADTYPE:
3795                 return (EINVAL);
3796         case NFS3ERR_BAD_COOKIE:
3797                 return (ENOENT);
3798         case NFS3ERR_SERVERFAULT:
3799                 return (EIO);
3800         case NFS3ERR_JUKEBOX:
3801                 return (ENXIO);
3802         default:
3803                 return ((int)status);
3804         }
3805 #endif
3806 }
3807 
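     /*
      * Allocate a readdir cache entry, initialized with a single
      * reference held by the caller.
      */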
3808 rddir_cache *
3809 rddir_cache_alloc(int flags)
3810 {
3811         rddir_cache *rc;
3812 
3813         rc = kmem_alloc(sizeof (*rc), flags);
3814         if (rc != NULL) {
3815                 rc->entries = NULL;
3816                 rc->flags = RDDIR;
3817                 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3818                 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3819                 rc->count = 1;
3820 #ifdef DEBUG
3821                 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3822 #endif
3823         }
3824         return (rc);
3825 }
3826 
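     /*
      * Destroy a readdir cache entry and any buffer still attached to it.
      */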
3827 static void
3828 rddir_cache_free(rddir_cache *rc)
3829 {
3830 
3831 #ifdef DEBUG
3832         atomic_dec_64(&clstat_debug.dirent.value.ui64);
3833 #endif
3834         if (rc->entries != NULL) {
3835 #ifdef DEBUG
3836                 rddir_cache_buf_free(rc->entries, rc->buflen);
3837 #else
3838                 kmem_free(rc->entries, rc->buflen);
3839 #endif
3840         }
3841         cv_destroy(&rc->cv);
3842         mutex_destroy(&rc->lock);
3843         kmem_free(rc, sizeof (*rc));
3844 }
3845 
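     /*
      * Add a reference to a readdir cache entry.
      */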
3846 void
3847 rddir_cache_hold(rddir_cache *rc)
3848 {
3849 
3850         mutex_enter(&rc->lock);
3851         rc->count++;
3852         mutex_exit(&rc->lock);
3853 }
3854 
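     /*
      * Release a reference to a readdir cache entry, freeing the entry
      * when the last reference is dropped.
      */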
3855 void
3856 rddir_cache_rele(rddir_cache *rc)
3857 {
3858 
3859         mutex_enter(&rc->lock);
3860         ASSERT(rc->count > 0);
3861         if (--rc->count == 0) {
3862                 mutex_exit(&rc->lock);
3863                 rddir_cache_free(rc);
3864         } else
3865                 mutex_exit(&rc->lock);
3866 }
3867 
3868 #ifdef DEBUG
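     /*
      * DEBUG-only wrappers which track the total number of bytes of
      * readdir buffers outstanding in clstat_debug.dirents.
      */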
3869 char *
3870 rddir_cache_buf_alloc(size_t size, int flags)
3871 {
3872         char *rc;
3873 
3874         rc = kmem_alloc(size, flags);
3875         if (rc != NULL)
3876                 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3877         return (rc);
3878 }
3879 
3880 void
3881 rddir_cache_buf_free(void *addr, size_t size)
3882 {
3883 
3884         atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3885         kmem_free(addr, size);
3886 }
3887 #endif
3888 
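     /*
      * Release the cached state (credentials, symlink contents, ACL,
      * pathconf information, readdir entries, and access cache entries)
      * held by an rnode on the freelist.  Returns non-zero if anything
      * was freed.
      */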
3889 static int
3890 nfs_free_data_reclaim(rnode_t *rp)
3891 {
3892         char *contents;
3893         int size;
3894         vsecattr_t *vsp;
3895         nfs3_pathconf_info *info;
3896         int freed;
3897         cred_t *cred;
3898 
3899         /*
3900          * Free any held credentials and caches which
3901          * may be associated with this rnode.
3902          */
3903         mutex_enter(&rp->r_statelock);
3904         cred = rp->r_cred;
3905         rp->r_cred = NULL;
3906         contents = rp->r_symlink.contents;
3907         size = rp->r_symlink.size;
3908         rp->r_symlink.contents = NULL;
3909         vsp = rp->r_secattr;
3910         rp->r_secattr = NULL;
3911         info = rp->r_pathconf;
3912         rp->r_pathconf = NULL;
3913         mutex_exit(&rp->r_statelock);
3914 
3915         if (cred != NULL)
3916                 crfree(cred);
3917 
3918         /*
3919          * Free the access cache entries.
3920          */
3921         freed = nfs_access_purge_rp(rp);
3922 
3923         if (!HAVE_RDDIR_CACHE(rp) &&
3924             contents == NULL &&
3925             vsp == NULL &&
3926             info == NULL)
3927                 return (freed);
3928 
3929         /*
3930          * Free the readdir cache entries
3931          */
3932         if (HAVE_RDDIR_CACHE(rp))
3933                 nfs_purge_rddir_cache(RTOV(rp));
3934 
3935         /*
3936          * Free the symbolic link cache.
3937          */
3938         if (contents != NULL) {
3939 
3940                 kmem_free((void *)contents, size);
3941         }
3942 
3943         /*
3944          * Free any cached ACL.
3945          */
3946         if (vsp != NULL)
3947                 nfs_acl_free(vsp);
3948 
3949         /*
3950          * Free any cached pathconf information.
3951          */
3952         if (info != NULL)
3953                 kmem_free(info, sizeof (*info));
3954 
3955         return (1);
3956 }
3957 
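     /*
      * Like nfs_free_data_reclaim(), but for rnodes which may still be
      * active; uses mutex_tryenter() so that it never blocks on a busy
      * rnode.
      */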
3958 static int
3959 nfs_active_data_reclaim(rnode_t *rp)
3960 {
3961         char *contents;
3962         int size;
3963         vsecattr_t *vsp;
3964         nfs3_pathconf_info *info;
3965         int freed;
3966 
3967         /*
3968          * Free any held credentials and caches which
3969          * may be associated with this rnode.
3970          */
3971         if (!mutex_tryenter(&rp->r_statelock))
3972                 return (0);
3973         contents = rp->r_symlink.contents;
3974         size = rp->r_symlink.size;
3975         rp->r_symlink.contents = NULL;
3976         vsp = rp->r_secattr;
3977         rp->r_secattr = NULL;
3978         info = rp->r_pathconf;
3979         rp->r_pathconf = NULL;
3980         mutex_exit(&rp->r_statelock);
3981 
3982         /*
3983          * Free the access cache entries.
3984          */
3985         freed = nfs_access_purge_rp(rp);
3986 
3987         if (!HAVE_RDDIR_CACHE(rp) &&
3988             contents == NULL &&
3989             vsp == NULL &&
3990             info == NULL)
3991                 return (freed);
3992 
3993         /*
3994          * Free the readdir cache entries
3995          */
3996         if (HAVE_RDDIR_CACHE(rp))
3997                 nfs_purge_rddir_cache(RTOV(rp));
3998 
3999         /*
4000          * Free the symbolic link cache.
4001          */
4002         if (contents != NULL) {
4003 
4004                 kmem_free((void *)contents, size);
4005         }
4006 
4007         /*
4008          * Free any cached ACL.
4009          */
4010         if (vsp != NULL)
4011                 nfs_acl_free(vsp);
4012 
4013         /*
4014          * Free any cached pathconf information.
4015          */
4016         if (info != NULL)
4017                 kmem_free(info, sizeof (*info));
4018 
4019         return (1);
4020 }
4021 
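     /*
      * Walk the rnode freelist, releasing the cached data held by each
      * rnode.  Returns non-zero if any memory was reclaimed.
      */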
4022 static int
4023 nfs_free_reclaim(void)
4024 {
4025         int freed;
4026         rnode_t *rp;
4027 
4028 #ifdef DEBUG
4029         clstat_debug.f_reclaim.value.ui64++;
4030 #endif
4031         freed = 0;
4032         mutex_enter(&rpfreelist_lock);
4033         rp = rpfreelist;
4034         if (rp != NULL) {
4035                 do {
4036                         if (nfs_free_data_reclaim(rp))
4037                                 freed = 1;
4038                 } while ((rp = rp->r_freef) != rpfreelist);
4039         }
4040         mutex_exit(&rpfreelist_lock);
4041         return (freed);
4042 }
4043 
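     /*
      * Walk the rnode hash queues, releasing the cached data held by
      * each rnode.  Returns non-zero if any memory was reclaimed.
      */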
4044 static int
4045 nfs_active_reclaim(void)
4046 {
4047         int freed;
4048         int index;
4049         rnode_t *rp;
4050 
4051 #ifdef DEBUG
4052         clstat_debug.a_reclaim.value.ui64++;
4053 #endif
4054         freed = 0;
4055         for (index = 0; index < rtablesize; index++) {
4056                 rw_enter(&rtable[index].r_lock, RW_READER);
4057                 for (rp = rtable[index].r_hashf;
4058                     rp != (rnode_t *)(&rtable[index]);
4059                     rp = rp->r_hashf) {
4060                         if (nfs_active_data_reclaim(rp))
4061                                 freed = 1;
4062                 }
4063                 rw_exit(&rtable[index].r_lock);
4064         }
4065         return (freed);
4066 }
4067 
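     /*
      * Destroy rnodes from the freelist, unhashing each one and letting
      * rp_addfree() perform the final destruction.
      */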
4068 static int
4069 nfs_rnode_reclaim(void)
4070 {
4071         int freed;
4072         rnode_t *rp;
4073         vnode_t *vp;
4074 
4075 #ifdef DEBUG
4076         clstat_debug.r_reclaim.value.ui64++;
4077 #endif
4078         freed = 0;
4079         mutex_enter(&rpfreelist_lock);
4080         while ((rp = rpfreelist) != NULL) {
4081                 rp_rmfree(rp);
4082                 mutex_exit(&rpfreelist_lock);
4083                 if (rp->r_flags & RHASHED) {
4084                         vp = RTOV(rp);
4085                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4086                         mutex_enter(&vp->v_lock);
4087                         if (vp->v_count > 1) {
4088                                 VN_RELE_LOCKED(vp);
4089                                 mutex_exit(&vp->v_lock);
4090                                 rw_exit(&rp->r_hashq->r_lock);
4091                                 mutex_enter(&rpfreelist_lock);
4092                                 continue;
4093                         }
4094                         mutex_exit(&vp->v_lock);
4095                         rp_rmhash_locked(rp);
4096                         rw_exit(&rp->r_hashq->r_lock);
4097                 }
4098                 /*
4099                  * This call to rp_addfree will end up destroying the
4100                  * rnode, but in a safe way with the appropriate set
4101                  * of checks done.
4102                  */
4103                 rp_addfree(rp, CRED());
4104                 mutex_enter(&rpfreelist_lock);
4105         }
4106         mutex_exit(&rpfreelist_lock);
4107         return (freed);
4108 }
4109 
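     /*
      * Callback invoked by the kmem allocator when memory is low: try the
      * progressively more expensive reclaim passes until one frees
      * something.
      */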
4110 /*ARGSUSED*/
4111 static void
4112 nfs_reclaim(void *cdrarg)
4113 {
4114 
4115 #ifdef DEBUG
4116         clstat_debug.reclaim.value.ui64++;
4117 #endif
4118         if (nfs_free_reclaim())
4119                 return;
4120 
4121         if (nfs_active_reclaim())
4122                 return;
4123 
4124         (void) nfs_rnode_reclaim();
4125 }
4126 
4127 /*
4128  * NFS client failover support
4129  *
4130  * Routines to copy filehandles
4131  */
4132 void
4133 nfscopyfh(caddr_t fhp, vnode_t *vp)
4134 {
4135         fhandle_t *dest = (fhandle_t *)fhp;
4136 
4137         if (dest != NULL)
4138                 *dest = *VTOFH(vp);
4139 }
4140 
4141 void
4142 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4143 {
4144         nfs_fh3 *dest = (nfs_fh3 *)fhp;
4145 
4146         if (dest != NULL)
4147                 *dest = *VTOFH3(vp);
4148 }
4149 
4150 /*
4151  * NFS client failover support
4152  *
4153  * failover_safe() will test various conditions to ensure that
4154  * failover is permitted for this vnode.  It will be denied
4155  * if:
4156  *      1) the operation in progress does not support failover (NULL fi)
4157  *      2) there are no available replicas (NULL mi_servers->sv_next)
4158  *      3) any locks are outstanding on this file
4159  */
4160 static int
4161 failover_safe(failinfo_t *fi)
4162 {
4163 
4164         /*
4165          * Does this op permit failover?
4166          */
4167         if (fi == NULL || fi->vp == NULL)
4168                 return (0);
4169 
4170         /*
4171          * Are there any alternates to failover to?
4172          */
4173         if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4174                 return (0);
4175 
4176         /*
4177          * Disable check; we've forced local locking
4178          *
4179          * if (flk_has_remote_locks(fi->vp))
4180          *      return (0);
4181          */
4182 
4183         /*
4184          * If we have no partial path, we can't do anything
4185          */
4186         if (VTOR(fi->vp)->r_path == NULL)
4187                 return (0);
4188 
4189         return (1);
4190 }
4191 
4192 #include <sys/thread.h>
4193 
4194 /*
4195  * NFS client failover support
4196  *
4197  * failover_newserver() will start a search for a new server,
4198  * preferably by starting an async thread to do the work.  If
4199  * someone is already doing this (recognizable by MI_BINDINPROG
4200  * being set), it will simply return and the calling thread
4201  * will queue on the mi_failover_cv condition variable.
4202  */
4203 static void
4204 failover_newserver(mntinfo_t *mi)
4205 {
4206         /*
4207          * Check if someone else is doing this already
4208          */
4209         mutex_enter(&mi->mi_lock);
4210         if (mi->mi_flags & MI_BINDINPROG) {
4211                 mutex_exit(&mi->mi_lock);
4212                 return;
4213         }
4214         mi->mi_flags |= MI_BINDINPROG;
4215 
4216         /*
4217          * Need to hold the vfs struct so that it can't be released
4218          * while the failover thread is selecting a new server.
4219          */
4220         VFS_HOLD(mi->mi_vfsp);
4221 
4222         /*
4223          * Start a thread to do the real searching.
4224          */
4225         (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4226 
4227         mutex_exit(&mi->mi_lock);
4228 }
4229 
4230 /*
4231  * NFS client failover support
4232  *
4233  * failover_thread() will find a new server to replace the one
4234  * currently in use, wake up other threads waiting on this mount
4235  * point, and die.  It will start at the head of the server list
4236  * and poll servers until it finds one with an NFS server which is
4237  * registered and responds to a NULL procedure ping.
4238  *
4239  * XXX failover_thread is unsafe within the scope of the
4240  * present model defined for cpr to suspend the system.
4241  * Specifically, over-the-wire calls made by the thread
4242  * are unsafe. The thread needs to be reevaluated in case of
4243  * future updates to the cpr suspend model.
4244  */
4245 static void
4246 failover_thread(mntinfo_t *mi)
4247 {
4248         servinfo_t *svp = NULL;
4249         CLIENT *cl;
4250         enum clnt_stat status;
4251         struct timeval tv;
4252         int error;
4253         int oncethru = 0;
4254         callb_cpr_t cprinfo;
4255         rnode_t *rp;
4256         int index;
4257         char *srvnames;
4258         size_t srvnames_len;
4259         struct nfs_clnt *nfscl = NULL;
4260         zoneid_t zoneid = getzoneid();
4261 
4262 #ifdef DEBUG
4263         /*
4264          * This is currently only needed to access counters which exist on
4265          * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4266          * on non-DEBUG kernels.
4267          */
4268         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4269         ASSERT(nfscl != NULL);
4270 #endif
4271 
4272         /*
4273          * It's safe to piggyback on the mi_lock since failover_newserver()
4274          * code guarantees that there will be only one failover thread
4275          * per mountinfo at any instance.
4276          */
4277         CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4278             "failover_thread");
4279 
4280         mutex_enter(&mi->mi_lock);
4281         while (mi->mi_readers) {
4282                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4283                 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4284                 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4285         }
4286         mutex_exit(&mi->mi_lock);
4287 
4288         tv.tv_sec = 2;
4289         tv.tv_usec = 0;
4290 
4291         /*
4292          * Ping the null NFS procedure of every server in
4293          * the list until one responds.  We always start
4294          * at the head of the list and always skip the one
4295          * that is current, since it's caused us a problem.
4296          */
4297         while (svp == NULL) {
4298                 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4299                         if (!oncethru && svp == mi->mi_curr_serv)
4300                                 continue;
4301 
4302                         /*
4303                          * If the file system was forcibly umounted
4304                          * while trying to do a failover, then just
4305                          * give up on the failover.  It won't matter
4306                          * what the server is.
4307                          */
4308                         if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4309                                 svp = NULL;
4310                                 goto done;
4311                         }
4312 
4313                         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4314                             NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4315                         if (error)
4316                                 continue;
4317 
4318                         if (!(mi->mi_flags & MI_INT))
4319                                 cl->cl_nosignal = TRUE;
4320                         status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4321                             xdr_void, NULL, tv);
4322                         if (!(mi->mi_flags & MI_INT))
4323                                 cl->cl_nosignal = FALSE;
4324                         AUTH_DESTROY(cl->cl_auth);
4325                         CLNT_DESTROY(cl);
4326                         if (status == RPC_SUCCESS) {
4327                                 if (svp == mi->mi_curr_serv) {
4328 #ifdef DEBUG
4329                                         zcmn_err(zoneid, CE_NOTE,
4330                         "NFS%d: failing over: selecting original server %s",
4331                                             mi->mi_vers, svp->sv_hostname);
4332 #else
4333                                         zcmn_err(zoneid, CE_NOTE,
4334                         "NFS: failing over: selecting original server %s",
4335                                             svp->sv_hostname);
4336 #endif
4337                                 } else {
4338 #ifdef DEBUG
4339                                         zcmn_err(zoneid, CE_NOTE,
4340                                     "NFS%d: failing over from %s to %s",
4341                                             mi->mi_vers,
4342                                             mi->mi_curr_serv->sv_hostname,
4343                                             svp->sv_hostname);
4344 #else
4345                                         zcmn_err(zoneid, CE_NOTE,
4346                                     "NFS: failing over from %s to %s",
4347                                             mi->mi_curr_serv->sv_hostname,
4348                                             svp->sv_hostname);
4349 #endif
4350                                 }
4351                                 break;
4352                         }
4353                 }
4354 
4355                 if (svp == NULL) {
4356                         if (!oncethru) {
4357                                 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4358 #ifdef DEBUG
4359                                 zprintf(zoneid,
4360                                     "NFS%d servers %s not responding "
4361                                     "still trying\n", mi->mi_vers, srvnames);
4362 #else
4363                                 zprintf(zoneid, "NFS servers %s not responding "
4364                                     "still trying\n", srvnames);
4365 #endif
4366                                 oncethru = 1;
4367                         }
4368                         mutex_enter(&mi->mi_lock);
4369                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
4370                         mutex_exit(&mi->mi_lock);
4371                         delay(hz);
4372                         mutex_enter(&mi->mi_lock);
4373                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4374                         mutex_exit(&mi->mi_lock);
4375                 }
4376         }
4377 
4378         if (oncethru) {
4379 #ifdef DEBUG
4380                 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4381 #else
4382                 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4383 #endif
4384         }
4385 
4386         if (svp != mi->mi_curr_serv) {
4387                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4388                 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4389                 rw_enter(&rtable[index].r_lock, RW_WRITER);
4390                 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4391                     mi->mi_vfsp);
4392                 if (rp != NULL) {
4393                         if (rp->r_flags & RHASHED)
4394                                 rp_rmhash_locked(rp);
4395                         rw_exit(&rtable[index].r_lock);
4396                         rp->r_server = svp;
4397                         rp->r_fh = svp->sv_fhandle;
4398                         (void) nfs_free_data_reclaim(rp);
4399                         index = rtablehash(&rp->r_fh);
4400                         rp->r_hashq = &rtable[index];
4401                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4402                         vn_exists(RTOV(rp));
4403                         rp_addhash(rp);
4404                         rw_exit(&rp->r_hashq->r_lock);
4405                         VN_RELE(RTOV(rp));
4406                 } else
4407                         rw_exit(&rtable[index].r_lock);
4408         }
4409 
4410 done:
4411         if (oncethru)
4412                 kmem_free(srvnames, srvnames_len);
4413         mutex_enter(&mi->mi_lock);
4414         mi->mi_flags &= ~MI_BINDINPROG;
4415         if (svp != NULL) {
4416                 mi->mi_curr_serv = svp;
4417                 mi->mi_failover++;
4418 #ifdef DEBUG
4419                 nfscl->nfscl_stat.failover.value.ui64++;
4420 #endif
4421         }
4422         cv_broadcast(&mi->mi_failover_cv);
4423         CALLB_CPR_EXIT(&cprinfo);
4424         VFS_RELE(mi->mi_vfsp);
4425         zthread_exit();
4426         /* NOTREACHED */
4427 }
4428 
4429 /*
4430  * NFS client failover support
4431  *
4432  * failover_wait() will put the thread to sleep until MI_BINDINPROG
4433  * is cleared, meaning that failover is complete.  Called with
4434  * mi_lock mutex held.
4435  */
4436 static int
4437 failover_wait(mntinfo_t *mi)
4438 {
4439         k_sigset_t smask;
4440 
4441         /*
4442          * If someone else is hunting for a living server,
4443          * sleep until it's done.  After our sleep, we may
4444          * be bound to the right server and get off cheaply.
4445          */
4446         while (mi->mi_flags & MI_BINDINPROG) {
4447                 /*
4448                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4449                  * and SIGTERM. (Preserving the existing masks).
4450                  * Mask out SIGINT if mount option nointr is specified.
4451                  */
4452                 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4453                 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4454                         /*
4455                          * restore original signal mask
4456                          */
4457                         sigunintr(&smask);
4458                         return (EINTR);
4459                 }
4460                 /*
4461                  * restore original signal mask
4462                  */
4463                 sigunintr(&smask);
4464         }
4465         return (0);
4466 }
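
/*
 * Usage sketch (illustrative only, not part of the compiled code): a
 * caller that needs a stable server binding typically checks for an
 * in-progress failover and waits for it while holding mi_lock, as the
 * comment above requires.  The surrounding operation is hypothetical.
 *
 *	mutex_enter(&mi->mi_lock);
 *	if ((mi->mi_flags & MI_BINDINPROG) && failover_wait(mi)) {
 *		mutex_exit(&mi->mi_lock);
 *		return (EINTR);		interrupted while waiting
 *	}
 *	svp = mi->mi_curr_serv;		now safe to use the binding
 *	mutex_exit(&mi->mi_lock);
 */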
4467 
4468 /*
4469  * NFS client failover support
4470  *
4471  * failover_remap() will do a partial pathname lookup and find the
4472  * desired vnode on the current server.  The interim vnode will be
4473  * discarded after we pilfer the new filehandle.
4474  *
4475  * Side effects:
4476  * - This routine will also update the filehandle in the args structure
4477  *    pointed to by the fi->fhp pointer if it is non-NULL.
4478  */
4479 
4480 static int
4481 failover_remap(failinfo_t *fi)
4482 {
4483         vnode_t *vp, *nvp, *rootvp;
4484         rnode_t *rp, *nrp;
4485         mntinfo_t *mi;
4486         int error;
4487 #ifdef DEBUG
4488         struct nfs_clnt *nfscl;
4489 
4490         nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4491         ASSERT(nfscl != NULL);
4492 #endif
4493         /*
4494          * Sanity check
4495          */
4496         if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4497                 return (EINVAL);
4498         vp = fi->vp;
4499         rp = VTOR(vp);
4500         mi = VTOMI(vp);
4501 
4502         if (!(vp->v_flag & VROOT)) {
4503                 /*
4504                  * Given the root fh, use the path stored in
4505                  * the rnode to find the fh for the new server.
4506                  */
4507                 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4508                 if (error)
4509                         return (error);
4510 
4511                 error = failover_lookup(rp->r_path, rootvp,
4512                     fi->lookupproc, fi->xattrdirproc, &nvp);
4513 
4514                 VN_RELE(rootvp);
4515 
4516                 if (error)
4517                         return (error);
4518 
4519                 /*
4520                  * If we found the same rnode, we're done now
4521                  */
4522                 if (nvp == vp) {
4523                         /*
4524                          * The failed server and the new server may be the
4525                          * same physical machine or may share the same disk
4526                          * subsystem.  In that case the file handle for a
4527                          * given file path does not change, so a lookup with
4528                          * the same filehandle will locate the same rnode as
4529                          * the existing one.  All we may need to do is update
4530                          * r_server with the current servinfo.
4531                          */
4532                         if (!VALID_FH(fi)) {
4533                                 rp->r_server = mi->mi_curr_serv;
4534                         }
4535                         VN_RELE(nvp);
4536                         return (0);
4537                 }
4538 
4539                 /*
4540                  * Try to make it so that no one else will find this
4541                  * vnode because it is just a temporary to hold the
4542                  * new file handle until that file handle can be
4543                  * copied to the original vnode/rnode.
4544                  */
4545                 nrp = VTOR(nvp);
4546                 mutex_enter(&mi->mi_remap_lock);
4547                 /*
4548                  * Some other thread could have raced in and already
4549                  * done the remap for this particular rnode.  Compare
4550                  * rp->r_server with mi->mi_curr_serv and return if
4551                  * they are the same.
4552                  */
4553                 if (VALID_FH(fi)) {
4554                         mutex_exit(&mi->mi_remap_lock);
4555                         VN_RELE(nvp);
4556                         return (0);
4557                 }
4558 
4559                 if (nrp->r_flags & RHASHED)
4560                         rp_rmhash(nrp);
4561 
4562                 /*
4563                  * As a heuristic check on the validity of the new
4564                  * file, check that the size and type match those
4565                  * we remember from the old version.
4566                  */
4567                 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4568                         mutex_exit(&mi->mi_remap_lock);
4569                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4570                             "NFS replicas %s and %s: file %s not same.",
4571                             rp->r_server->sv_hostname,
4572                             nrp->r_server->sv_hostname, rp->r_path);
4573                         VN_RELE(nvp);
4574                         return (EINVAL);
4575                 }
4576 
4577                 /*
4578                  * Snarf the filehandle from the new rnode, then
4579                  * release it, updating the hash queues for the
4580                  * old rnode along the way.
4581                  */
4582                 if (rp->r_flags & RHASHED)
4583                         rp_rmhash(rp);
4584                 rp->r_server = mi->mi_curr_serv;
4585                 rp->r_fh = nrp->r_fh;
4586                 rp->r_hashq = nrp->r_hashq;
4587                 /*
4588                  * Copy the attributes from the new rnode to the old
4589                  * rnode.  This will help to reduce unnecessary page
4590                  * cache flushes.
4591                  */
4592                 rp->r_attr = nrp->r_attr;
4593                 rp->r_attrtime = nrp->r_attrtime;
4594                 rp->r_mtime = nrp->r_mtime;
4595                 (void) nfs_free_data_reclaim(rp);
4596                 nfs_setswaplike(vp, &rp->r_attr);
4597                 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4598                 rp_addhash(rp);
4599                 rw_exit(&rp->r_hashq->r_lock);
4600                 mutex_exit(&mi->mi_remap_lock);
4601                 VN_RELE(nvp);
4602         }
4603 
4604         /*
4605          * Update successful failover remap count
4606          */
4607         mutex_enter(&mi->mi_lock);
4608         mi->mi_remap++;
4609         mutex_exit(&mi->mi_lock);
4610 #ifdef DEBUG
4611         nfscl->nfscl_stat.remap.value.ui64++;
4612 #endif
4613 
4614         /*
4615          * If we have a copied filehandle to update, do it now.
4616          */
4617         if (fi->fhp != NULL && fi->copyproc != NULL)
4618                 (*fi->copyproc)(fi->fhp, vp);
4619 
4620         return (0);
4621 }
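
/*
 * Caller-side sketch (illustrative; the field names shown are the ones
 * this routine consumes, but the helper functions and args layout are
 * assumptions, not the actual call sites): an RPC wrapper that detects a
 * failover would fill in a failinfo_t and call failover_remap() before
 * retrying the operation with the refreshed filehandle.
 *
 *	failinfo_t fi;
 *
 *	fi.vp = vp;
 *	fi.fhp = (caddr_t)&args.file;		hypothetical args member
 *	fi.copyproc = my_copyfh;		hypothetical helpers
 *	fi.lookupproc = my_lookup;
 *	fi.xattrdirproc = my_getxattrdir;
 *
 *	if ((error = failover_remap(&fi)) == 0) {
 *		retry the RPC using the updated filehandle
 *	}
 */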
4622 
4623 /*
4624  * NFS client failover support
4625  *
4626  * We want a simple pathname lookup routine to parse the pieces
4627  * of path in rp->r_path.  We know that the path was created
4628  * as rnodes were made, so we know we have only to deal with
4629  * paths that look like:
4630  *      dir1/dir2/dir3/file
4631  * Any evidence of anything like .., symlinks, or ENOTDIR
4632  * is a hard error, because it means something in this filesystem
4633  * is different from the one we came from, or has changed under
4634  * us in some way.  If that is true, we want the failure.
4635  *
4636  * Extended attributes: if the filesystem is mounted with extended
4637  * attributes enabled (-o xattr), the attribute directory will be
4638  * represented in the r_path as the magic name XATTR_RPATH. So if
4639  * we see that name in the pathname, it must be because this node
4640  * is an extended attribute.  Therefore, look it up that way.
4641  */
4642 static int
4643 failover_lookup(char *path, vnode_t *root,
4644     int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4645     vnode_t *, cred_t *, int),
4646     int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4647     vnode_t **new)
4648 {
4649         vnode_t *dvp, *nvp;
4650         int error = EINVAL;
4651         char *s, *p, *tmppath;
4652         size_t len;
4653         mntinfo_t *mi;
4654         bool_t xattr;
4655 
4656         /* Make local copy of path */
4657         len = strlen(path) + 1;
4658         tmppath = kmem_alloc(len, KM_SLEEP);
4659         (void) strcpy(tmppath, path);
4660         s = tmppath;
4661 
4662         dvp = root;
4663         VN_HOLD(dvp);
4664         mi = VTOMI(root);
4665         xattr = mi->mi_flags & MI_EXTATTR;
4666 
4667         do {
4668                 p = strchr(s, '/');
4669                 if (p != NULL)
4670                         *p = '\0';
4671                 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4672                         error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4673                             RFSCALL_SOFT);
4674                 } else {
4675                         error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4676                             CRED(), RFSCALL_SOFT);
4677                 }
4678                 if (p != NULL)
4679                         *p++ = '/';
4680                 if (error) {
4681                         VN_RELE(dvp);
4682                         kmem_free(tmppath, len);
4683                         return (error);
4684                 }
4685                 s = p;
4686                 VN_RELE(dvp);
4687                 dvp = nvp;
4688         } while (p != NULL);
4689 
4690         if (nvp != NULL && new != NULL)
4691                 *new = nvp;
4692         kmem_free(tmppath, len);
4693         return (0);
4694 }
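
/*
 * Worked example of the loop above (names purely illustrative): for an
 * r_path of "dir1/dir2/file" the local copy is split at each '/',
 * producing three lookups, each relative to the vnode returned by the
 * previous one:
 *
 *	(*lookupproc)(rootvp, "dir1", &nvp, ...)
 *	(*lookupproc)(dir1vp, "dir2", &nvp, ...)
 *	(*lookupproc)(dir2vp, "file", &nvp, ...)
 *
 * and the final vnode is handed back through "new".
 */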
4695 
4696 /*
4697  * NFS client failover support
4698  *
4699  * sv_free() frees the malloc'd portion of a "servinfo_t".
4700  */
4701 void
4702 sv_free(servinfo_t *svp)
4703 {
4704         servinfo_t *next;
4705         struct knetconfig *knconf;
4706 
4707         while (svp != NULL) {
4708                 next = svp->sv_next;
4709                 if (svp->sv_secdata)
4710                         sec_clnt_freeinfo(svp->sv_secdata);
4711                 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4712                         kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4713                 knconf = svp->sv_knconf;
4714                 if (knconf != NULL) {
4715                         if (knconf->knc_protofmly != NULL)
4716                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4717                         if (knconf->knc_proto != NULL)
4718                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4719                         kmem_free(knconf, sizeof (*knconf));
4720                 }
4721                 knconf = svp->sv_origknconf;
4722                 if (knconf != NULL) {
4723                         if (knconf->knc_protofmly != NULL)
4724                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4725                         if (knconf->knc_proto != NULL)
4726                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4727                         kmem_free(knconf, sizeof (*knconf));
4728                 }
4729                 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4730                         kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4731                 mutex_destroy(&svp->sv_lock);
4732                 kmem_free(svp, sizeof (*svp));
4733                 svp = next;
4734         }
4735 }
4736 
4737 /*
4738  * Can only return non-zero if intr != 0.
4739  */
4740 int
4741 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4742 {
4743 
4744         mutex_enter(&l->lock);
4745 
4746         /*
4747          * If this is a nested enter, then allow it.  There
4748          * must be as many exits as there are enters.
4749          */
4750         if (l->owner == curthread) {
4751                 /* lock is held for writing by current thread */
4752                 ASSERT(rw == RW_READER || rw == RW_WRITER);
4753                 l->count--;
4754         } else if (rw == RW_READER) {
4755                 /*
4756                  * While there is a writer active or writers waiting,
4757                  * wait for them to finish up and move on.  Then,
4758                  * increment the count to indicate that a reader is
4759                  * active.
4760                  */
4761                 while (l->count < 0 || l->waiters > 0) {
4762                         if (intr) {
4763                                 klwp_t *lwp = ttolwp(curthread);
4764 
4765                                 if (lwp != NULL)
4766                                         lwp->lwp_nostop++;
4767                                 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4768                                         if (lwp != NULL)
4769                                                 lwp->lwp_nostop--;
4770                                         mutex_exit(&l->lock);
4771                                         return (EINTR);
4772                                 }
4773                                 if (lwp != NULL)
4774                                         lwp->lwp_nostop--;
4775                         } else
4776                                 cv_wait(&l->cv_rd, &l->lock);
4777                 }
4778                 ASSERT(l->count < INT_MAX);
4779 #ifdef  DEBUG
4780                 if ((l->count % 10000) == 9999)
4781                         cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4782                             "rwlock @ %p\n", l->count, (void *)l);
4783 #endif
4784                 l->count++;
4785         } else {
4786                 ASSERT(rw == RW_WRITER);
4787                 /*
4788                  * While there are readers active or a writer
4789                  * active, wait for all of the readers
4790                  * to finish or for the writer to finish.
4791                  * Then, set the owner field to curthread and
4792                  * decrement count to indicate that a writer
4793                  * is active.
4794                  */
4795                 while (l->count != 0) {
4796                         l->waiters++;
4797                         if (intr) {
4798                                 klwp_t *lwp = ttolwp(curthread);
4799 
4800                                 if (lwp != NULL)
4801                                         lwp->lwp_nostop++;
4802                                 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4803                                         if (lwp != NULL)
4804                                                 lwp->lwp_nostop--;
4805                                         l->waiters--;
4806                                         /*
4807                                          * If there are readers active and no
4808                                          * writers waiting then wake up all of
4809                                          * the waiting readers (if any).
4810                                          */
4811                                         if (l->count > 0 && l->waiters == 0)
4812                                                 cv_broadcast(&l->cv_rd);
4813                                         mutex_exit(&l->lock);
4814                                         return (EINTR);
4815                                 }
4816                                 if (lwp != NULL)
4817                                         lwp->lwp_nostop--;
4818                         } else
4819                                 cv_wait(&l->cv, &l->lock);
4820                         l->waiters--;
4821                 }
4822                 ASSERT(l->owner == NULL);
4823                 l->owner = curthread;
4824                 l->count--;
4825         }
4826 
4827         mutex_exit(&l->lock);
4828 
4829         return (0);
4830 }
4831 
4832 /*
4833  * If the lock is available, obtain it and return non-zero.  If there is
4834  * already a conflicting lock, return 0 immediately.
4835  */
4836 
4837 int
4838 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4839 {
4840         mutex_enter(&l->lock);
4841 
4842         /*
4843          * If this is a nested enter, then allow it.  There
4844          * must be as many exits as there are enters.
4845          */
4846         if (l->owner == curthread) {
4847                 /* lock is held for writing by current thread */
4848                 ASSERT(rw == RW_READER || rw == RW_WRITER);
4849                 l->count--;
4850         } else if (rw == RW_READER) {
4851                 /*
4852                  * If there is a writer active or writers waiting, deny the
4853                  * lock.  Otherwise, bump the count of readers.
4854                  */
4855                 if (l->count < 0 || l->waiters > 0) {
4856                         mutex_exit(&l->lock);
4857                         return (0);
4858                 }
4859                 l->count++;
4860         } else {
4861                 ASSERT(rw == RW_WRITER);
4862                 /*
4863                  * If there are readers active or a writer active, deny the
4864                  * lock.  Otherwise, set the owner field to curthread and
4865                  * decrement count to indicate that a writer is active.
4866                  */
4867                 if (l->count != 0) {
4868                         mutex_exit(&l->lock);
4869                         return (0);
4870                 }
4871                 ASSERT(l->owner == NULL);
4872                 l->owner = curthread;
4873                 l->count--;
4874         }
4875 
4876         mutex_exit(&l->lock);
4877 
4878         return (1);
4879 }
4880 
4881 void
4882 nfs_rw_exit(nfs_rwlock_t *l)
4883 {
4884 
4885         mutex_enter(&l->lock);
4886 
4887         if (l->owner != NULL) {
4888                 ASSERT(l->owner == curthread);
4889 
4890                 /*
4891                  * To release a writer lock increment count to indicate that
4892                  * there is one less writer active.  If this was the last of
4893                  * possibly nested writer locks, then clear the owner field as
4894                  * well to indicate that there is no writer active.
4895                  */
4896                 ASSERT(l->count < 0);
4897                 l->count++;
4898                 if (l->count == 0) {
4899                         l->owner = NULL;
4900 
4901                         /*
4902                          * If there are no writers waiting then wakeup all of
4903                          * the waiting readers (if any).
4904                          */
4905                         if (l->waiters == 0)
4906                                 cv_broadcast(&l->cv_rd);
4907                 }
4908         } else {
4909                 /*
4910                  * To release a reader lock just decrement count to indicate
4911                  * that there is one less reader active.
4912                  */
4913                 ASSERT(l->count > 0);
4914                 l->count--;
4915         }
4916 
4917         /*
4918          * If there is neither a reader nor a writer active and there is
4919          * a writer waiting, we need to wake it up.
4920          */
4921         if (l->count == 0 && l->waiters > 0)
4922                 cv_signal(&l->cv);
4923         mutex_exit(&l->lock);
4924 }
4925 
4926 int
4927 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4928 {
4929 
4930         if (rw == RW_READER)
4931                 return (l->count > 0);
4932         ASSERT(rw == RW_WRITER);
4933         return (l->count < 0);
4934 }
4935 
4936 /* ARGSUSED */
4937 void
4938 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4939 {
4940 
4941         l->count = 0;
4942         l->waiters = 0;
4943         l->owner = NULL;
4944         mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4945         cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4946         cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4947 }
4948 
4949 void
4950 nfs_rw_destroy(nfs_rwlock_t *l)
4951 {
4952 
4953         mutex_destroy(&l->lock);
4954         cv_destroy(&l->cv);
4955         cv_destroy(&l->cv_rd);
4956 }
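
/*
 * Usage sketch for the nfs_rwlock_t routines above (illustrative; the
 * lock, the "intr" flag and the surrounding function are hypothetical):
 * count is positive while readers hold the lock, negative while a
 * possibly nested writer holds it, and zero when it is free.  A typical
 * interruptible reader-side use looks like:
 *
 *	nfs_rwlock_t lk;
 *
 *	nfs_rw_init(&lk, NULL, RW_DEFAULT, NULL);
 *
 *	if (nfs_rw_enter_sig(&lk, RW_READER, intr) != 0)
 *		return (EINTR);		only possible when intr != 0
 *	... read-side work ...
 *	nfs_rw_exit(&lk);
 *
 *	nfs_rw_destroy(&lk);
 */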
4957 
4958 int
4959 nfs3_rddir_compar(const void *x, const void *y)
4960 {
4961         rddir_cache *a = (rddir_cache *)x;
4962         rddir_cache *b = (rddir_cache *)y;
4963 
4964         if (a->nfs3_cookie == b->nfs3_cookie) {
4965                 if (a->buflen == b->buflen)
4966                         return (0);
4967                 if (a->buflen < b->buflen)
4968                         return (-1);
4969                 return (1);
4970         }
4971 
4972         if (a->nfs3_cookie < b->nfs3_cookie)
4973                 return (-1);
4974 
4975         return (1);
4976 }
4977 
4978 int
4979 nfs_rddir_compar(const void *x, const void *y)
4980 {
4981         rddir_cache *a = (rddir_cache *)x;
4982         rddir_cache *b = (rddir_cache *)y;
4983 
4984         if (a->nfs_cookie == b->nfs_cookie) {
4985                 if (a->buflen == b->buflen)
4986                         return (0);
4987                 if (a->buflen < b->buflen)
4988                         return (-1);
4989                 return (1);
4990         }
4991 
4992         if (a->nfs_cookie < b->nfs_cookie)
4993                 return (-1);
4994 
4995         return (1);
4996 }
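
/*
 * Illustrative sketch only: the two comparators above have the shape
 * expected by avl_create(), ordering rddir_cache entries first by cookie
 * and then by buffer length.  A tree keyed this way could be set up as
 * below; the "r_dir" tree and the "tree" AVL node member are assumptions
 * made for the example, not a statement of how the rnode is laid out.
 *
 *	avl_tree_t r_dir;
 *
 *	avl_create(&r_dir, nfs_rddir_compar, sizeof (rddir_cache),
 *	    offsetof(rddir_cache, tree));
 */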
4997 
4998 static char *
4999 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5000 {
5001         servinfo_t *s;
5002         char *srvnames;
5003         char *namep;
5004         size_t length;
5005 
5006         /*
5007          * Calculate the length of the string required to hold all
5008          * of the server names plus either a comma or a null
5009          * character following each individual one.
5010          */
5011         length = 0;
5012         for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5013                 length += s->sv_hostnamelen;
5014 
5015         srvnames = kmem_alloc(length, KM_SLEEP);
5016 
5017         namep = srvnames;
5018         for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5019                 (void) strcpy(namep, s->sv_hostname);
5020                 namep += s->sv_hostnamelen - 1;
5021                 *namep++ = ',';
5022         }
5023         *--namep = '\0';
5024 
5025         *len = length;
5026 
5027         return (srvnames);
5028 }
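
/*
 * Worked example (assuming, as the arithmetic above relies on, that
 * sv_hostnamelen counts the terminating NUL): for two servers "alpha"
 * (sv_hostnamelen 6) and "beta" (sv_hostnamelen 5), length is 11 and the
 * buffer ends up holding "alpha,beta" plus a NUL, because each name's NUL
 * slot is overwritten with a comma and the final comma is backed out.
 */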
5029 
5030 /*
5031  * These two functions are temporary and designed for the upgrade-workaround
5032  * only.  They cannot be used for general zone-crossing NFS client support, and
5033  * will be removed shortly.
5034  *
5035  * When the workaround is enabled, all NFS traffic is forced into the global
5036  * zone.  These functions are called when the code needs to refer to the state
5037  * of the underlying network connection.  They're not called when the function
5038  * needs to refer to the state of the process that invoked the system call.
5039  * (E.g., when checking whether the zone is shutting down during the mount()
5040  * call.)
5041  */
5042 
5043 struct zone *
5044 nfs_zone(void)
5045 {
5046         return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5047 }
5048 
5049 zoneid_t
5050 nfs_zoneid(void)
5051 {
5052         return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5053 }
5054 
5055 /*
5056  * nfs_mount_label_policy:
5057  *      Determine whether the mount is allowed according to the MAC check,
5058  *      by comparing (where appropriate) label of the remote server
5059  *      against the label of the zone being mounted into.
5060  *
5061  *      Returns:
5062  *               0 :    access allowed
5063  *              -1 :    read-only access allowed (i.e., read-down)
5064  *              >0 :    error code, such as EACCES
5065  */
5066 int
5067 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5068     struct knetconfig *knconf, cred_t *cr)
5069 {
5070         int             addr_type;
5071         void            *ipaddr;
5072         bslabel_t       *server_sl, *mntlabel;
5073         zone_t          *mntzone = NULL;
5074         ts_label_t      *zlabel;
5075         tsol_tpc_t      *tp;
5076         ts_label_t      *tsl = NULL;
5077         int             retv;
5078 
5079         /*
5080          * Get the zone's label.  Each zone on a labeled system has a label.
5081          */
5082         mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5083         zlabel = mntzone->zone_slabel;
5084         ASSERT(zlabel != NULL);
5085         label_hold(zlabel);
5086 
5087         if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5088                 addr_type = IPV4_VERSION;
5089                 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5090         } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5091                 addr_type = IPV6_VERSION;
5092                 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5093         } else {
5094                 retv = 0;
5095                 goto out;
5096         }
5097 
5098         retv = EACCES;                          /* assume the worst */
5099 
5100         /*
5101          * Next, get the assigned label of the remote server.
5102          */
5103         tp = find_tpc(ipaddr, addr_type, B_FALSE);
5104         if (tp == NULL)
5105                 goto out;                       /* error getting host entry */
5106 
5107         if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5108                 goto rel_tpc;                   /* invalid domain */
5109         if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5110             (tp->tpc_tp.host_type != UNLABELED))
5111                 goto rel_tpc;                   /* invalid hosttype */
5112 
5113         if (tp->tpc_tp.host_type == SUN_CIPSO) {
5114                 tsl = getflabel_cipso(vfsp);
5115                 if (tsl == NULL)
5116                         goto rel_tpc;           /* error getting server lbl */
5117 
5118                 server_sl = label2bslabel(tsl);
5119         } else {        /* UNLABELED */
5120                 server_sl = &tp->tpc_tp.tp_def_label;
5121         }
5122 
5123         mntlabel = label2bslabel(zlabel);
5124 
5125         /*
5126          * Now compare labels to complete the MAC check.  If the labels
5127          * are equal or if the requestor is in the global zone and has
5128          * NET_MAC_AWARE, then allow read-write access.   (Except for
5129          * mounts into the global zone itself; restrict these to
5130          * read-only.)
5131          *
5132          * If the requestor is in some other zone, but their label
5133          * dominates the server, then allow read-down.
5134          *
5135          * Otherwise, access is denied.
5136          */
5137         if (blequal(mntlabel, server_sl) ||
5138             (crgetzoneid(cr) == GLOBAL_ZONEID &&
5139             getpflags(NET_MAC_AWARE, cr) != 0)) {
5140                 if ((mntzone == global_zone) ||
5141                     !blequal(mntlabel, server_sl))
5142                         retv = -1;              /* read-only */
5143                 else
5144                         retv = 0;               /* access OK */
5145         } else if (bldominates(mntlabel, server_sl)) {
5146                 retv = -1;                      /* read-only */
5147         } else {
5148                 retv = EACCES;
5149         }
5150 
5151         if (tsl != NULL)
5152                 label_rele(tsl);
5153 
5154 rel_tpc:
5155         TPC_RELE(tp);
5156 out:
5157         if (mntzone)
5158                 zone_rele(mntzone);
5159         label_rele(zlabel);
5160         return (retv);
5161 }
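
/*
 * Caller-side sketch (illustrative; the actual mount path is not part of
 * this file): the tri-state return value is typically consumed as shown,
 * where vfs_setmntopt() and MNTOPT_RO are the generic VFS interfaces for
 * forcing a mount read-only.
 *
 *	error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *	    svp->sv_knconf, cr);
 *	if (error > 0)
 *		return (error);			MAC check failed
 *	if (error == -1)
 *		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);   read-down only
 */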
5162 
5163 boolean_t
5164 nfs_has_ctty(void)
5165 {
5166         boolean_t rv;
5167         mutex_enter(&curproc->p_splock);
5168         rv = (curproc->p_sessp->s_vp != NULL);
5169         mutex_exit(&curproc->p_splock);
5170         return (rv);
5171 }
5172 
5173 /*
5174  * Scan the xattr directory to see if it has any generic user attributes
5175  */
5176 int
5177 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5178 {
5179         struct uio uio;
5180         struct iovec iov;
5181         char *dbuf;
5182         struct dirent64 *dp;
5183         size_t dlen = 8 * 1024;
5184         size_t dbuflen;
5185         int eof = 0;
5186         int error;
5187 
5188         *valp = 0;
5189         dbuf = kmem_alloc(dlen, KM_SLEEP);
5190         uio.uio_iov = &iov;
5191         uio.uio_iovcnt = 1;
5192         uio.uio_segflg = UIO_SYSSPACE;
5193         uio.uio_fmode = 0;
5194         uio.uio_extflg = UIO_COPY_CACHED;
5195         uio.uio_loffset = 0;
5196         uio.uio_resid = dlen;
5197         iov.iov_base = dbuf;
5198         iov.iov_len = dlen;
5199         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5200         error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5201         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5202 
5203         dbuflen = dlen - uio.uio_resid;
5204 
5205         if (error || dbuflen == 0) {
5206                 kmem_free(dbuf, dlen);
5207                 return (error);
5208         }
5209 
5210         dp = (dirent64_t *)dbuf;
5211 
5212         while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5213                 if (strcmp(dp->d_name, ".") == 0 ||
5214                     strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5215                     VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5216                     VIEW_READONLY) == 0) {
5217                         dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5218                         continue;
5219                 }
5220 
5221                 *valp = 1;
5222                 break;
5223         }
5224         kmem_free(dbuf, dlen);
5225         return (0);
5226 }
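
/*
 * Caller-side sketch (illustrative; the lookup of the xattr directory
 * vnode "avp" and the surrounding error handling are hypothetical): a
 * pathconf-style query for whether a file has any generic user
 * attributes could use the helper above directly.
 *
 *	ulong_t val;
 *
 *	error = do_xattr_exists_check(avp, &val, cr);
 *	if (error == 0 && val != 0) {
 *		the file has at least one generic user attribute
 *	}
 */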
5227 
5228 /*
5229  * Return non-zero in case the vp is an empty directory used as a ZFS mount
5230  * point.  The NFSv2 and NFSv3 servers should not allow writing to such
5231  * directories.
5232  */
5233 int
5234 protect_zfs_mntpt(vnode_t *vp)
5235 {
5236         int error;
5237         vfs_t *vfsp;
5238         struct uio uio;
5239         struct iovec iov;
5240         int eof;
5241         size_t len = 8 * 1024;
5242         char *buf;
5243 
5244         if (vp->v_type != VDIR || vn_ismntpt(vp) == 0)
5245                 return (0);
5246 
5247         error = vn_vfsrlock_wait(vp);
5248         if (error != 0)
5249                 return (error);
5250 
5251         /*
5252          * We protect ZFS mount points only
5253          */
5254         if ((vfsp = vn_mountedvfs(vp)) == NULL ||
5255             strncmp(vfssw[vfsp->vfs_fstype].vsw_name, "zfs", 3) != 0) {
5256                 vn_vfsunlock(vp);
5257                 return (0);
5258         }
5259 
5260         vn_vfsunlock(vp);
5261 
5262         buf = kmem_alloc(len, KM_SLEEP);
5263 
5264         uio.uio_iov = &iov;
5265         uio.uio_iovcnt = 1;
5266         uio.uio_segflg = UIO_SYSSPACE;
5267         uio.uio_fmode = 0;
5268         uio.uio_extflg = UIO_COPY_CACHED;
5269         uio.uio_loffset = 0;
5270         uio.uio_llimit = MAXOFFSET_T;
5271 
5272         eof = 0;
5273 
5274         do {
5275                 size_t rlen;
5276                 dirent64_t *dp;
5277 
5278                 uio.uio_resid = len;
5279                 iov.iov_base = buf;
5280                 iov.iov_len = len;
5281 
5282                 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5283                 error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
5284                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5285 
5286                 if (error != 0)
5287                         break;
5288 
5289                 error = EBUSY;
5290 
5291                 rlen = len - uio.uio_resid;
5292                 if (rlen == 0)
5293                         break;
5294 
5295                 for (dp = (dirent64_t *)buf;
5296                     (intptr_t)dp < (intptr_t)buf + rlen;
5297                     dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
5298                         if (strcmp(dp->d_name, ".") != 0 &&
5299                             strcmp(dp->d_name, "..") != 0) {
5300                                 error = 0;
5301                                 break;
5302                         }
5303                 }
5304         } while (eof == 0 && error != 0);
5305 
5306         kmem_free(buf, len);
5307 
5308         return (error);
5309 }
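
/*
 * Caller-side sketch (illustrative; the servers' actual call sites are
 * not part of this file): a write-class operation on directory dvp could
 * reject the request when the helper above reports an empty ZFS mount
 * point, propagating the EBUSY it returns.
 *
 *	error = protect_zfs_mntpt(dvp);
 *	if (error != 0)
 *		return (error);		refuse to modify the mount point
 */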