NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
--- old/usr/src/uts/common/fs/nfs/nfs_subr.c
+++ new/usr/src/uts/common/fs/nfs/nfs_subr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 +
21 22 /*
22 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 24 * Use is subject to license terms.
24 25 */
25 26
26 27 /*
27 - * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 + * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 29 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
29 30 */
30 31
31 32 #include <sys/param.h>
32 33 #include <sys/types.h>
33 34 #include <sys/systm.h>
34 35 #include <sys/cred.h>
35 36 #include <sys/proc.h>
36 37 #include <sys/user.h>
37 38 #include <sys/time.h>
38 39 #include <sys/buf.h>
39 40 #include <sys/vfs.h>
40 41 #include <sys/vnode.h>
41 42 #include <sys/socket.h>
42 43 #include <sys/uio.h>
43 44 #include <sys/tiuser.h>
44 45 #include <sys/swap.h>
45 46 #include <sys/errno.h>
46 47 #include <sys/debug.h>
47 48 #include <sys/kmem.h>
48 49 #include <sys/kstat.h>
49 50 #include <sys/cmn_err.h>
50 51 #include <sys/vtrace.h>
51 52 #include <sys/session.h>
52 53 #include <sys/dnlc.h>
53 54 #include <sys/bitmap.h>
54 55 #include <sys/acl.h>
55 56 #include <sys/ddi.h>
56 57 #include <sys/pathname.h>
57 58 #include <sys/flock.h>
58 59 #include <sys/dirent.h>
59 60 #include <sys/flock.h>
60 61 #include <sys/callb.h>
61 62 #include <sys/atomic.h>
62 63 #include <sys/list.h>
63 64 #include <sys/tsol/tnet.h>
64 65 #include <sys/priv.h>
65 66 #include <sys/sdt.h>
66 67 #include <sys/attr.h>
67 68
68 69 #include <inet/ip6.h>
69 70
70 71 #include <rpc/types.h>
71 72 #include <rpc/xdr.h>
72 73 #include <rpc/auth.h>
73 74 #include <rpc/clnt.h>
74 75
75 76 #include <nfs/nfs.h>
76 77 #include <nfs/nfs4.h>
77 78 #include <nfs/nfs_clnt.h>
78 79 #include <nfs/rnode.h>
79 80 #include <nfs/nfs_acl.h>
80 81
81 82 #include <sys/tsol/label.h>
82 83
83 84 /*
84 85 * The hash queues for the access to active and cached rnodes
85 86 * are organized as doubly linked lists. A reader/writer lock
86 87 * for each hash bucket is used to control access and to synchronize
87 88 * lookups, additions, and deletions from the hash queue.
88 89 *
89 90 * The rnode freelist is organized as a doubly linked list with
90 91 * a head pointer. Additions and deletions are synchronized via
91 92 * a single mutex.
92 93 *
93 94 * In order to add an rnode to the free list, it must be hashed into
94 95 * a hash queue and the exclusive lock to the hash queue be held.
95 96 * If an rnode is not hashed into a hash queue, then it is destroyed
96 97 * because it represents no valuable information that can be reused
97 98 * about the file. The exclusive lock to the hash queue must be
98 99 * held in order to prevent a lookup in the hash queue from finding
99 100 * the rnode and using it and assuming that the rnode is not on the
100 101 * freelist. The lookup in the hash queue will have the hash queue
101 102 * locked, either exclusive or shared.
102 103 *
103 104 * The vnode reference count for each rnode is not allowed to drop
104 105 * below 1. This prevents external entities, such as the VM
105 106 * subsystem, from acquiring references to vnodes already on the
106 107 * freelist and then trying to place them back on the freelist
107 108 * when their reference is released. This means that when an
108 109 * rnode is looked up in the hash queues, then either the rnode
109 110 * is removed from the freelist and that reference is transferred to
110 111 * the new reference or the vnode reference count must be incremented
111 112 * accordingly. The mutex for the freelist must be held in order to
112 113 * accurately test to see if the rnode is on the freelist or not.
113 114 * The hash queue lock might be held shared and it is possible that
114 115 * two different threads may race to remove the rnode from the
115 116 * freelist. This race can be resolved by holding the mutex for the
116 117 * freelist. Please note that the mutex for the freelist does not
117 118 * need to be held if the rnode is not on the freelist. It cannot be
118 119 * placed on the freelist due to the requirement that the thread
119 120 * putting the rnode on the freelist must hold the exclusive lock
120 121 * to the hash queue and the thread doing the lookup in the hash
121 122 * queue is holding either a shared or exclusive lock to the hash
122 123 * queue.
123 124 *
124 125 * The lock ordering is:
125 126 *
126 127 * hash bucket lock -> vnode lock
127 128 * hash bucket lock -> freelist lock
128 129 */
129 130 static rhashq_t *rtable;
130 131
131 132 static kmutex_t rpfreelist_lock;
132 133 static rnode_t *rpfreelist = NULL;
133 134 static long rnew = 0;
134 -long nrnode = 0;
135 +volatile long nrnode = 0;
135 136
136 137 static int rtablesize;
137 138 static int rtablemask;
138 139
139 140 static int hashlen = 4;
140 141
141 142 static struct kmem_cache *rnode_cache;
142 143
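To make the locking protocol described above concrete, here is a minimal sketch; it is an illustration only, not code from this file, and the field and helper names (r_lock in rhashq_t, r_freef, rp_rmfree(), RTOV()) follow the usual rnode conventions but should be treated as assumptions:

static void
example_claim_rnode(rhashq_t *hq, rnode_t *rp)
{
	rw_enter(&hq->r_lock, RW_READER);	/* hash bucket lock first */
	mutex_enter(&rpfreelist_lock);		/* then the freelist mutex */
	if (rp->r_freef != NULL) {
		/* On the freelist: reuse the freelist's reference. */
		rp_rmfree(rp);
	} else {
		/* Not on the freelist: take an additional vnode hold. */
		VN_HOLD(RTOV(rp));
	}
	mutex_exit(&rpfreelist_lock);
	rw_exit(&hq->r_lock);
}

The hash bucket lock is acquired before the freelist mutex, matching the lock ordering listed above.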
143 144 /*
144 145 * Mutex to protect the following variables:
145 146 * nfs_major
146 147 * nfs_minor
147 148 */
148 149 kmutex_t nfs_minor_lock;
149 150 int nfs_major;
150 151 int nfs_minor;
151 152
152 -/* Do we allow preepoch (negative) time values otw? */
153 -bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
153 +/*
154 + * Do we allow preepoch (negative) time values otw?
155 + * default: do not allow preepoch
156 + */
157 +volatile bool_t nfs_allow_preepoch_time = FALSE;
154 158
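The volatile qualifiers added to tunables throughout this change (see "Tunables needs volatile keyword" above) keep the compiler honest about external modification: a variable that is only ever patched from outside the module, at run time with mdb/kmdb or at boot via /etc/system, looks constant to the optimizer, which may then cache it in a register or fold in its initializer so the patched value is never observed. A minimal sketch of that failure mode, using a hypothetical tunable:

/* Hypothetical tunable, for illustration only; not part of this file. */
volatile int example_tunable = 0;

static void
example_wait_for_tunable(void)
{
	/*
	 * The volatile qualifier forces a fresh load of example_tunable
	 * on every iteration, so a run-time patch is observed promptly.
	 * Without it the compiler may hoist the load out of the loop and
	 * spin on a stale register copy forever.
	 */
	while (example_tunable == 0)
		delay(hz);
}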
155 159 /*
156 160 * Access cache
157 161 */
158 162 static acache_hash_t *acache;
159 -static long nacache; /* used strictly to size the number of hash queues */
163 +volatile long nacache; /* used strictly to size the number of hash queues */
160 164
161 165 static int acachesize;
162 166 static int acachemask;
163 167 static struct kmem_cache *acache_cache;
164 168
165 169 /*
166 170 * Client side utilities
167 171 */
168 172
169 173 /*
170 174 * client side statistics
171 175 */
172 176 static const struct clstat clstat_tmpl = {
173 177 { "calls", KSTAT_DATA_UINT64 },
174 178 { "badcalls", KSTAT_DATA_UINT64 },
175 179 { "clgets", KSTAT_DATA_UINT64 },
176 180 { "cltoomany", KSTAT_DATA_UINT64 },
177 181 #ifdef DEBUG
178 182 { "clalloc", KSTAT_DATA_UINT64 },
179 183 { "noresponse", KSTAT_DATA_UINT64 },
180 184 { "failover", KSTAT_DATA_UINT64 },
181 185 { "remap", KSTAT_DATA_UINT64 },
182 186 #endif
183 187 };
184 188
185 189 /*
186 190 * The following are statistics that describe behavior of the system as a whole
187 191 * and don't correspond to any one particular zone.
188 192 */
189 193 #ifdef DEBUG
190 194 static struct clstat_debug {
191 195 kstat_named_t nrnode; /* number of allocated rnodes */
192 196 kstat_named_t access; /* size of access cache */
193 197 kstat_named_t dirent; /* size of readdir cache */
194 198 kstat_named_t dirents; /* size of readdir buf cache */
195 199 kstat_named_t reclaim; /* number of reclaims */
196 200 kstat_named_t clreclaim; /* number of cl reclaims */
197 201 kstat_named_t f_reclaim; /* number of free reclaims */
198 202 kstat_named_t a_reclaim; /* number of active reclaims */
199 203 kstat_named_t r_reclaim; /* number of rnode reclaims */
200 204 kstat_named_t rpath; /* bytes used to store rpaths */
201 205 } clstat_debug = {
202 206 { "nrnode", KSTAT_DATA_UINT64 },
203 207 { "access", KSTAT_DATA_UINT64 },
204 208 { "dirent", KSTAT_DATA_UINT64 },
205 209 { "dirents", KSTAT_DATA_UINT64 },
206 210 { "reclaim", KSTAT_DATA_UINT64 },
207 211 { "clreclaim", KSTAT_DATA_UINT64 },
208 212 { "f_reclaim", KSTAT_DATA_UINT64 },
209 213 { "a_reclaim", KSTAT_DATA_UINT64 },
210 214 { "r_reclaim", KSTAT_DATA_UINT64 },
211 215 { "r_path", KSTAT_DATA_UINT64 },
212 216 };
213 217 #endif /* DEBUG */
214 218
215 219 /*
216 220 * We keep a global list of per-zone client data, so we can clean up all zones
217 221 * if we get low on memory.
218 222 */
219 223 static list_t nfs_clnt_list;
220 224 static kmutex_t nfs_clnt_list_lock;
221 225 static zone_key_t nfsclnt_zone_key;
222 226
223 227 static struct kmem_cache *chtab_cache;
224 228
225 229 /*
226 230 * Some servers do not properly update the attributes of the
227 231 * directory when changes are made. To allow interoperability
228 232 * with these broken servers, the nfs_disable_rddir_cache
229 233 * parameter must be set in /etc/system
230 234 */
231 -int nfs_disable_rddir_cache = 0;
235 +volatile int nfs_disable_rddir_cache = 0;
232 236
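For reference, such a setting normally uses the standard set module:variable syntax, for example a line like set nfs:nfs_disable_rddir_cache = 1 in /etc/system; the nfs module prefix is the conventional one for this code but is an assumption here, since the module name is not shown in this hunk.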
233 237 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 238 struct chtab **);
235 239 void clfree(CLIENT *, struct chtab *);
236 240 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
237 241 struct chtab **, struct nfs_clnt *);
238 242 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
239 243 struct chtab **, struct nfs_clnt *);
240 244 static void clreclaim(void *);
241 245 static int nfs_feedback(int, int, mntinfo_t *);
242 246 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243 247 caddr_t, cred_t *, int *, enum clnt_stat *, int,
244 248 failinfo_t *);
245 249 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
246 250 caddr_t, cred_t *, int *, int, failinfo_t *);
247 251 static void rinactive(rnode_t *, cred_t *);
248 252 static int rtablehash(nfs_fhandle *);
249 253 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
250 254 struct vnodeops *,
251 255 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
252 256 cred_t *),
253 257 int (*)(const void *, const void *), int *, cred_t *,
254 258 char *, char *);
255 259 static void rp_rmfree(rnode_t *);
256 260 static void rp_addhash(rnode_t *);
257 261 static void rp_rmhash_locked(rnode_t *);
258 262 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
259 263 static void destroy_rnode(rnode_t *);
260 264 static void rddir_cache_free(rddir_cache *);
261 265 static int nfs_free_data_reclaim(rnode_t *);
262 266 static int nfs_active_data_reclaim(rnode_t *);
263 267 static int nfs_free_reclaim(void);
264 268 static int nfs_active_reclaim(void);
265 269 static int nfs_rnode_reclaim(void);
266 270 static void nfs_reclaim(void *);
267 271 static int failover_safe(failinfo_t *);
268 272 static void failover_newserver(mntinfo_t *mi);
269 273 static void failover_thread(mntinfo_t *mi);
270 274 static int failover_wait(mntinfo_t *);
271 275 static int failover_remap(failinfo_t *);
272 276 static int failover_lookup(char *, vnode_t *,
273 277 int (*)(vnode_t *, char *, vnode_t **,
274 278 struct pathname *, int, vnode_t *, cred_t *, int),
275 279 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
276 280 vnode_t **);
277 281 static void nfs_free_r_path(rnode_t *);
278 282 static void nfs_set_vroot(vnode_t *);
279 283 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
280 284
281 285 /*
282 286 * from rpcsec module (common/rpcsec)
283 287 */
284 288 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
285 289 extern void sec_clnt_freeh(AUTH *);
286 290 extern void sec_clnt_freeinfo(struct sec_data *);
287 291
288 292 /*
289 293 * used in mount policy
290 294 */
291 295 extern ts_label_t *getflabel_cipso(vfs_t *);
292 296
293 297 /*
294 298 * EIO or EINTR are not recoverable errors.
295 299 */
296 300 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
297 301
298 302 #ifdef DEBUG
299 303 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
300 304 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
301 305 #else
302 306 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
303 307 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
304 308 #endif
305 309 /*
306 310 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
307 311 */
308 312 static int
309 313 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
310 314 struct chtab **chp, struct nfs_clnt *nfscl)
311 315 {
312 316 struct chhead *ch, *newch;
313 317 struct chhead **plistp;
314 318 struct chtab *cp;
315 319 int error;
316 320 k_sigset_t smask;
317 321
318 322 if (newcl == NULL || chp == NULL || ci == NULL)
319 323 return (EINVAL);
320 324
321 325 *newcl = NULL;
322 326 *chp = NULL;
323 327
324 328 /*
325 329 * Find an unused handle or create one
326 330 */
327 331 newch = NULL;
328 332 nfscl->nfscl_stat.clgets.value.ui64++;
329 333 top:
330 334 /*
331 335 * Find the correct entry in the cache to check for free
332 336 * client handles. The search is based on the RPC program
333 337 * number, program version number, dev_t for the transport
334 338 * device, and the protocol family.
335 339 */
336 340 mutex_enter(&nfscl->nfscl_chtable_lock);
337 341 plistp = &nfscl->nfscl_chtable;
338 342 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
339 343 if (ch->ch_prog == ci->cl_prog &&
340 344 ch->ch_vers == ci->cl_vers &&
341 345 ch->ch_dev == svp->sv_knconf->knc_rdev &&
342 346 (strcmp(ch->ch_protofmly,
343 347 svp->sv_knconf->knc_protofmly) == 0))
344 348 break;
345 349 plistp = &ch->ch_next;
346 350 }
347 351
348 352 /*
349 353 * If we didn't find a cache entry for this quadruple, then
350 354 * create one. If we don't have one already preallocated,
351 355 * then drop the cache lock, create one, and then start over.
352 356 * If we did have a preallocated entry, then just add it to
353 357 * the front of the list.
354 358 */
355 359 if (ch == NULL) {
356 360 if (newch == NULL) {
357 361 mutex_exit(&nfscl->nfscl_chtable_lock);
358 362 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
359 363 newch->ch_timesused = 0;
360 364 newch->ch_prog = ci->cl_prog;
361 365 newch->ch_vers = ci->cl_vers;
362 366 newch->ch_dev = svp->sv_knconf->knc_rdev;
363 367 newch->ch_protofmly = kmem_alloc(
364 368 strlen(svp->sv_knconf->knc_protofmly) + 1,
365 369 KM_SLEEP);
366 370 (void) strcpy(newch->ch_protofmly,
367 371 svp->sv_knconf->knc_protofmly);
368 372 newch->ch_list = NULL;
369 373 goto top;
370 374 }
371 375 ch = newch;
372 376 newch = NULL;
373 377 ch->ch_next = nfscl->nfscl_chtable;
374 378 nfscl->nfscl_chtable = ch;
375 379 /*
376 380 * We found a cache entry, but if it isn't on the front of the
377 381 * list, then move it to the front of the list to try to take
378 382 * advantage of locality of operations.
379 383 */
380 384 } else if (ch != nfscl->nfscl_chtable) {
381 385 *plistp = ch->ch_next;
382 386 ch->ch_next = nfscl->nfscl_chtable;
383 387 nfscl->nfscl_chtable = ch;
384 388 }
385 389
386 390 /*
387 391 * If there was a free client handle cached, then remove it
388 392 * from the list, init it, and use it.
389 393 */
390 394 if (ch->ch_list != NULL) {
391 395 cp = ch->ch_list;
392 396 ch->ch_list = cp->ch_list;
393 397 mutex_exit(&nfscl->nfscl_chtable_lock);
394 398 if (newch != NULL) {
395 399 kmem_free(newch->ch_protofmly,
396 400 strlen(newch->ch_protofmly) + 1);
397 401 kmem_free(newch, sizeof (*newch));
398 402 }
399 403 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
400 404 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
401 405 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
402 406 &cp->ch_client->cl_auth);
403 407 if (error || cp->ch_client->cl_auth == NULL) {
404 408 CLNT_DESTROY(cp->ch_client);
405 409 kmem_cache_free(chtab_cache, cp);
406 410 return ((error != 0) ? error : EINTR);
407 411 }
408 412 ch->ch_timesused++;
409 413 *newcl = cp->ch_client;
410 414 *chp = cp;
411 415 return (0);
412 416 }
413 417
414 418 /*
415 419 * There weren't any free client handles which fit, so allocate
416 420 * a new one and use that.
417 421 */
418 422 #ifdef DEBUG
419 423 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
420 424 #endif
421 425 mutex_exit(&nfscl->nfscl_chtable_lock);
422 426
423 427 nfscl->nfscl_stat.cltoomany.value.ui64++;
424 428 if (newch != NULL) {
425 429 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
426 430 kmem_free(newch, sizeof (*newch));
427 431 }
428 432
429 433 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
430 434 cp->ch_head = ch;
431 435
432 436 sigintr(&smask, (int)ci->cl_flags & MI_INT);
433 437 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
434 438 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
435 439 sigunintr(&smask);
436 440
437 441 if (error != 0) {
438 442 kmem_cache_free(chtab_cache, cp);
439 443 #ifdef DEBUG
440 444 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
441 445 #endif
442 446 /*
443 447 * Warning is unnecessary if error is EINTR.
444 448 */
445 449 if (error != EINTR) {
446 450 nfs_cmn_err(error, CE_WARN,
447 451 "clget: couldn't create handle: %m\n");
448 452 }
449 453 return (error);
450 454 }
451 455 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
452 456 auth_destroy(cp->ch_client->cl_auth);
453 457 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
454 458 &cp->ch_client->cl_auth);
455 459 if (error || cp->ch_client->cl_auth == NULL) {
456 460 CLNT_DESTROY(cp->ch_client);
457 461 kmem_cache_free(chtab_cache, cp);
458 462 #ifdef DEBUG
459 463 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
460 464 #endif
461 465 return ((error != 0) ? error : EINTR);
462 466 }
463 467 ch->ch_timesused++;
464 468 *newcl = cp->ch_client;
465 469 ASSERT(cp->ch_client->cl_nosignal == FALSE);
466 470 *chp = cp;
467 471 return (0);
468 472 }
469 473
470 474 int
471 475 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
472 476 struct chtab **chp)
473 477 {
474 478 struct nfs_clnt *nfscl;
475 479
476 480 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
477 481 ASSERT(nfscl != NULL);
478 482
479 483 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
480 484 }
481 485
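For context, callers pair clget() with clfree(). A minimal usage sketch follows; it is illustrative only, with the clinfo_t setup mirroring what nfs_clget() does further below and every value taken from the mount's mntinfo_t:

static int
example_use_handle_cache(mntinfo_t *mi, servinfo_t *svp, cred_t *cr)
{
	CLIENT *cl;
	struct chtab *ch;
	clinfo_t ci;
	int error;

	ci.cl_prog = mi->mi_prog;	/* RPC program, version, sizes */
	ci.cl_vers = mi->mi_vers;
	ci.cl_readsize = mi->mi_tsize;
	ci.cl_retrans = mi->mi_retrans;
	ci.cl_flags = mi->mi_flags;

	error = clget(&ci, svp, cr, &cl, &ch);
	if (error != 0)
		return (error);

	/* ... issue the RPC via CLNT_CALL(cl, ...) ... */

	clfree(cl, ch);		/* return the handle to the per-zone cache */
	return (0);
}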
482 486 static int
483 487 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
484 488 struct chtab **chp, struct nfs_clnt *nfscl)
485 489 {
486 490 clinfo_t ci;
487 491 int error;
488 492
489 493 /*
490 494 * Set read buffer size to rsize
491 495 * and add room for RPC headers.
492 496 */
493 497 ci.cl_readsize = mi->mi_tsize;
494 498 if (ci.cl_readsize != 0)
495 499 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
496 500
497 501 /*
498 502 * If soft mount and server is down just try once.
499 503 * meaning: do not retransmit.
500 504 */
501 505 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
502 506 ci.cl_retrans = 0;
503 507 else
504 508 ci.cl_retrans = mi->mi_retrans;
505 509
506 510 ci.cl_prog = NFS_ACL_PROGRAM;
507 511 ci.cl_vers = mi->mi_vers;
508 512 ci.cl_flags = mi->mi_flags;
509 513
510 514 /*
511 515 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
512 516 * security flavor, the client tries to establish a security context
513 517 * by contacting the server. If the connection is timed out or reset,
514 518 * e.g. server reboot, we will try again.
515 519 */
516 520 do {
517 521 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
518 522
519 523 if (error == 0)
520 524 break;
521 525
522 526 /*
523 527 * For forced unmount or zone shutdown, bail out, no retry.
524 528 */
525 529 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
526 530 error = EIO;
527 531 break;
528 532 }
529 533
530 534 /* do not retry for softmount */
531 535 if (!(mi->mi_flags & MI_HARD))
532 536 break;
533 537
534 538 /* let the caller deal with the failover case */
535 539 if (FAILOVER_MOUNT(mi))
536 540 break;
537 541
538 542 } while (error == ETIMEDOUT || error == ECONNRESET);
539 543
540 544 return (error);
541 545 }
542 546
543 547 static int
544 548 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
545 549 struct chtab **chp, struct nfs_clnt *nfscl)
546 550 {
547 551 clinfo_t ci;
548 552 int error;
549 553
550 554 /*
551 555 * Set read buffer size to rsize
552 556 * and add room for RPC headers.
553 557 */
554 558 ci.cl_readsize = mi->mi_tsize;
555 559 if (ci.cl_readsize != 0)
556 560 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
557 561
558 562 /*
559 563 * If soft mount and server is down just try once.
560 564 * meaning: do not retransmit.
561 565 */
562 566 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
563 567 ci.cl_retrans = 0;
564 568 else
565 569 ci.cl_retrans = mi->mi_retrans;
566 570
567 571 ci.cl_prog = mi->mi_prog;
568 572 ci.cl_vers = mi->mi_vers;
569 573 ci.cl_flags = mi->mi_flags;
570 574
571 575 /*
572 576 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
573 577 * security flavor, the client tries to establish a security context
574 578 * by contacting the server. If the connection is timed out or reset,
575 579 * e.g. server reboot, we will try again.
576 580 */
577 581 do {
578 582 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
579 583
580 584 if (error == 0)
581 585 break;
582 586
583 587 /*
584 588 * For forced unmount or zone shutdown, bail out, no retry.
585 589 */
586 590 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
587 591 error = EIO;
588 592 break;
589 593 }
590 594
591 595 /* do not retry for softmount */
592 596 if (!(mi->mi_flags & MI_HARD))
593 597 break;
594 598
595 599 /* let the caller deal with the failover case */
596 600 if (FAILOVER_MOUNT(mi))
597 601 break;
598 602
599 603 } while (error == ETIMEDOUT || error == ECONNRESET);
600 604
601 605 return (error);
602 606 }
603 607
604 608 static void
605 609 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
606 610 {
607 611 if (cl->cl_auth != NULL) {
608 612 sec_clnt_freeh(cl->cl_auth);
609 613 cl->cl_auth = NULL;
610 614 }
611 615
612 616 /*
613 617 * Timestamp this cache entry so that we know when it was last
614 618 * used.
615 619 */
616 620 cp->ch_freed = gethrestime_sec();
617 621
618 622 /*
619 623 * Add the free client handle to the front of the list.
620 624 * This way, the list will be sorted in youngest to oldest
621 625 * order.
622 626 */
623 627 mutex_enter(&nfscl->nfscl_chtable_lock);
624 628 cp->ch_list = cp->ch_head->ch_list;
625 629 cp->ch_head->ch_list = cp;
626 630 mutex_exit(&nfscl->nfscl_chtable_lock);
627 631 }
628 632
629 633 void
630 634 clfree(CLIENT *cl, struct chtab *cp)
631 635 {
632 636 struct nfs_clnt *nfscl;
633 637
634 638 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
635 639 ASSERT(nfscl != NULL);
636 640
637 641 clfree_impl(cl, cp, nfscl);
638 642 }
639 643
640 644 #define CL_HOLDTIME 60 /* time to hold client handles */
641 645
642 646 static void
643 647 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
644 648 {
645 649 struct chhead *ch;
646 650 struct chtab *cp; /* list of objects that can be reclaimed */
647 651 struct chtab *cpe;
648 652 struct chtab *cpl;
649 653 struct chtab **cpp;
650 654 #ifdef DEBUG
651 655 int n = 0;
652 656 #endif
653 657
654 658 /*
655 659 * Need to reclaim some memory, so step through the cache
656 660 * looking through the lists for entries which can be freed.
657 661 */
658 662 cp = NULL;
659 663
660 664 mutex_enter(&nfscl->nfscl_chtable_lock);
661 665
662 666 /*
663 667 * Here we step through each non-NULL quadruple and start to
664 668 * construct the reclaim list pointed to by cp. Note that
665 669 * cp will contain all eligible chtab entries. When this traversal
666 670 * completes, chtab entries from the last quadruple will be at the
667 671 * front of cp and entries from previously inspected quadruples have
668 672 * been appended to the rear of cp.
669 673 */
670 674 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
671 675 if (ch->ch_list == NULL)
672 676 continue;
673 677 /*
674 678 * Search each list for entries older than
675 679 * cl_holdtime seconds. The lists are maintained
676 680 * in youngest to oldest order so that when the
677 681 * first entry is found which is old enough, then
678 682 * all of the rest of the entries on the list will
679 683 * be old enough as well.
680 684 */
681 685 cpl = ch->ch_list;
682 686 cpp = &ch->ch_list;
683 687 while (cpl != NULL &&
684 688 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
685 689 cpp = &cpl->ch_list;
686 690 cpl = cpl->ch_list;
687 691 }
688 692 if (cpl != NULL) {
689 693 *cpp = NULL;
690 694 if (cp != NULL) {
691 695 cpe = cpl;
692 696 while (cpe->ch_list != NULL)
693 697 cpe = cpe->ch_list;
694 698 cpe->ch_list = cp;
695 699 }
696 700 cp = cpl;
697 701 }
698 702 }
699 703
700 704 mutex_exit(&nfscl->nfscl_chtable_lock);
701 705
702 706 /*
703 707 * If cp is empty, then there is nothing to reclaim here.
704 708 */
705 709 if (cp == NULL)
706 710 return;
707 711
708 712 /*
709 713 * Step through the list of entries to free, destroying each client
710 714 * handle and kmem_free'ing the memory for each entry.
711 715 */
712 716 while (cp != NULL) {
713 717 #ifdef DEBUG
714 718 n++;
715 719 #endif
716 720 CLNT_DESTROY(cp->ch_client);
717 721 cpl = cp->ch_list;
718 722 kmem_cache_free(chtab_cache, cp);
719 723 cp = cpl;
720 724 }
721 725
722 726 #ifdef DEBUG
723 727 /*
724 728 * Update clalloc so that nfsstat shows the current number
725 729 * of allocated client handles.
726 730 */
727 731 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
728 732 #endif
729 733 }
730 734
731 735 /* ARGSUSED */
732 736 static void
733 737 clreclaim(void *all)
734 738 {
735 739 struct nfs_clnt *nfscl;
736 740
737 741 #ifdef DEBUG
738 742 clstat_debug.clreclaim.value.ui64++;
739 743 #endif
740 744 /*
741 745 * The system is low on memory; go through and try to reclaim some from
742 746 * every zone on the system.
743 747 */
744 748 mutex_enter(&nfs_clnt_list_lock);
745 749 nfscl = list_head(&nfs_clnt_list);
746 750 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
747 751 clreclaim_zone(nfscl, CL_HOLDTIME);
748 752 mutex_exit(&nfs_clnt_list_lock);
749 753 }
750 754
751 755 /*
752 756 * Minimum time-out values indexed by call type
753 757 * These units are in "eights" of a second to avoid multiplies
754 758 */
755 759 static unsigned int minimum_timeo[] = {
756 760 6, 7, 10
757 761 };
758 762
759 763 /*
760 764 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
761 765 */
762 766 #define MAXTIMO (20*hz)
763 767 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
764 768 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
765 769
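A worked example of the timeout arithmetic above, assuming hz = 100: a minimum_timeo entry of 6 means six eighths of a second, so the (value * hz) >> 3 conversion used with CLNT_SETTIMERS below yields 75 ticks (0.75 s), and MAXTIMO is 20 * hz = 2000 ticks. A retransmit timeout that starts at, say, 125 ticks then doubles through backoff() until it pins at MAXTIMO (sketch only, values hypothetical):

static void
example_backoff_progression(void)
{
	int timeo = 125;	/* hypothetical starting timeout, in ticks */

	timeo = backoff(timeo);	/* 250 */
	timeo = backoff(timeo);	/* 500 */
	timeo = backoff(timeo);	/* 1000 */
	timeo = backoff(timeo);	/* 2000 == MAXTIMO when hz == 100 */
	timeo = backoff(timeo);	/* stays pinned at 2000 */
}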
766 770 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
767 771 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
768 772 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
769 773
770 774 /*
771 775 * Function called when rfscall notices that we have been
772 776 * re-transmitting, or when we get a response without retransmissions.
773 777 * Return 1 if the transfer size was adjusted down - 0 if no change.
774 778 */
775 779 static int
776 780 nfs_feedback(int flag, int which, mntinfo_t *mi)
777 781 {
778 782 int kind;
779 783 int r = 0;
780 784
781 785 mutex_enter(&mi->mi_lock);
782 786 if (flag == FEEDBACK_REXMIT1) {
783 787 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
784 788 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
785 789 goto done;
786 790 if (mi->mi_curread > MIN_NFS_TSIZE) {
787 791 mi->mi_curread /= 2;
788 792 if (mi->mi_curread < MIN_NFS_TSIZE)
789 793 mi->mi_curread = MIN_NFS_TSIZE;
790 794 r = 1;
791 795 }
792 796
793 797 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
794 798 mi->mi_curwrite /= 2;
795 799 if (mi->mi_curwrite < MIN_NFS_TSIZE)
796 800 mi->mi_curwrite = MIN_NFS_TSIZE;
797 801 r = 1;
798 802 }
799 803 } else if (flag == FEEDBACK_OK) {
800 804 kind = mi->mi_timer_type[which];
801 805 if (kind == 0 ||
802 806 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
803 807 goto done;
804 808 if (kind == 1) {
805 809 if (mi->mi_curread >= mi->mi_tsize)
806 810 goto done;
807 811 mi->mi_curread += MIN_NFS_TSIZE;
808 812 if (mi->mi_curread > mi->mi_tsize/2)
809 813 mi->mi_curread = mi->mi_tsize;
810 814 } else if (kind == 2) {
811 815 if (mi->mi_curwrite >= mi->mi_stsize)
812 816 goto done;
813 817 mi->mi_curwrite += MIN_NFS_TSIZE;
814 818 if (mi->mi_curwrite > mi->mi_stsize/2)
815 819 mi->mi_curwrite = mi->mi_stsize;
816 820 }
817 821 }
818 822 done:
819 823 mutex_exit(&mi->mi_lock);
820 824 return (r);
821 825 }
822 826
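As a concrete illustration of the feedback above (numbers hypothetical): with MIN_NFS_TSIZE at 512 bytes and a current read size of 32768, each FEEDBACK_REXMIT1 event halves mi_curread (16384, 8192, ... down to the 512 floor, and is skipped entirely when the current retransmit timer is nonzero and already under REDUCE_NFS_TIME), while FEEDBACK_OK events grow it back in 512-byte steps and snap it straight to mi_tsize once it exceeds mi_tsize / 2.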
823 827 #ifdef DEBUG
824 828 static int rfs2call_hits = 0;
825 829 static int rfs2call_misses = 0;
826 830 #endif
827 831
828 832 int
829 833 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
830 834 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
831 835 enum nfsstat *statusp, int flags, failinfo_t *fi)
832 836 {
833 837 int rpcerror;
834 838 enum clnt_stat rpc_status;
835 839
836 840 ASSERT(statusp != NULL);
837 841
838 842 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
839 843 cr, douprintf, &rpc_status, flags, fi);
840 844 if (!rpcerror) {
841 845 /*
842 846 * See crnetadjust() for comments.
843 847 */
844 848 if (*statusp == NFSERR_ACCES &&
845 849 (cr = crnetadjust(cr)) != NULL) {
846 850 #ifdef DEBUG
847 851 rfs2call_hits++;
848 852 #endif
849 853 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
850 854 resp, cr, douprintf, NULL, flags, fi);
851 855 crfree(cr);
852 856 #ifdef DEBUG
853 857 if (*statusp == NFSERR_ACCES)
854 858 rfs2call_misses++;
855 859 #endif
856 860 }
857 861 } else if (rpc_status == RPC_PROCUNAVAIL) {
858 862 *statusp = NFSERR_OPNOTSUPP;
859 863 rpcerror = 0;
860 864 }
861 865
862 866 return (rpcerror);
863 867 }
864 868
865 869 #define NFS3_JUKEBOX_DELAY 10 * hz
866 870
867 -static clock_t nfs3_jukebox_delay = 0;
871 +volatile clock_t nfs3_jukebox_delay = 0;
868 872
869 873 #ifdef DEBUG
870 874 static int rfs3call_hits = 0;
871 875 static int rfs3call_misses = 0;
872 876 #endif
873 877
874 878 int
875 879 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
876 880 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
877 881 nfsstat3 *statusp, int flags, failinfo_t *fi)
878 882 {
879 883 int rpcerror;
880 884 int user_informed;
881 885
882 886 user_informed = 0;
883 887 do {
884 888 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
885 889 cr, douprintf, NULL, flags, fi);
886 890 if (!rpcerror) {
887 891 cred_t *crr;
888 892 if (*statusp == NFS3ERR_JUKEBOX) {
889 893 if (ttoproc(curthread) == &p0) {
890 894 rpcerror = EAGAIN;
891 895 break;
892 896 }
893 897 if (!user_informed) {
894 898 user_informed = 1;
895 899 uprintf(
896 900 "file temporarily unavailable on the server, retrying...\n");
897 901 }
898 902 delay(nfs3_jukebox_delay);
899 903 }
900 904 /*
901 905 * See crnetadjust() for comments.
902 906 */
903 907 else if (*statusp == NFS3ERR_ACCES &&
904 908 (crr = crnetadjust(cr)) != NULL) {
905 909 #ifdef DEBUG
906 910 rfs3call_hits++;
907 911 #endif
908 912 rpcerror = rfscall(mi, which, xdrargs, argsp,
909 913 xdrres, resp, crr, douprintf,
910 914 NULL, flags, fi);
911 915
912 916 crfree(crr);
913 917 #ifdef DEBUG
914 918 if (*statusp == NFS3ERR_ACCES)
915 919 rfs3call_misses++;
916 920 #endif
917 921 }
918 922 }
919 923 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
920 924
921 925 return (rpcerror);
922 926 }
923 927
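In the jukebox path above, the client keeps retrying with a pause of nfs3_jukebox_delay ticks between attempts, except for threads belonging to p0, which get EAGAIN instead of sleeping. NFS3_JUKEBOX_DELAY is 10 * hz, i.e. ten seconds' worth of ticks; the tunable itself is declared 0 here and is presumably initialized from that macro elsewhere in the module, outside this hunk.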
924 928 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
925 929 #define INC_READERS(mi) { \
926 930 mi->mi_readers++; \
927 931 }
928 932 #define DEC_READERS(mi) { \
929 933 mi->mi_readers--; \
930 934 if (mi->mi_readers == 0) \
931 935 cv_broadcast(&mi->mi_failover_cv); \
932 936 }
933 937
934 938 static int
935 939 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
936 940 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
937 941 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
938 942 {
939 943 CLIENT *client;
940 944 struct chtab *ch;
941 945 cred_t *cr = icr;
942 946 enum clnt_stat status;
943 947 struct rpc_err rpcerr, rpcerr_tmp;
944 948 struct timeval wait;
945 949 int timeo; /* in units of hz */
946 950 int my_rsize, my_wsize;
947 951 bool_t tryagain;
948 952 bool_t cred_cloned = FALSE;
949 953 k_sigset_t smask;
950 954 servinfo_t *svp;
951 955 struct nfs_clnt *nfscl;
952 956 zoneid_t zoneid = getzoneid();
953 957 char *msg;
954 958 #ifdef DEBUG
955 959 char *bufp;
956 960 #endif
957 961
958 962
959 963 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
960 964 "rfscall_start:which %d mi %p", which, mi);
961 965
962 966 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
963 967 ASSERT(nfscl != NULL);
964 968
965 969 nfscl->nfscl_stat.calls.value.ui64++;
966 970 mi->mi_reqs[which].value.ui64++;
967 971
968 972 rpcerr.re_status = RPC_SUCCESS;
969 973
970 974 /*
971 975 * In case of forced unmount or zone shutdown, return EIO.
972 976 */
973 977
974 978 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
975 979 rpcerr.re_status = RPC_FAILED;
976 980 rpcerr.re_errno = EIO;
977 981 return (rpcerr.re_errno);
978 982 }
979 983
980 984 /*
981 985 * Remember the transfer sizes in case
982 986 * nfs_feedback changes them underneath us.
983 987 */
984 988 my_rsize = mi->mi_curread;
985 989 my_wsize = mi->mi_curwrite;
986 990
987 991 /*
988 992 * NFS client failover support
989 993 *
990 994 * If this rnode is not in sync with the current server (VALID_FH),
991 995 * we'd like to do a remap to get in sync. We can be interrupted
992 996 * in failover_remap(), and if so we'll bail. Otherwise, we'll
993 997 * use the best info we have to try the RPC. Part of that is
994 998 * unconditionally updating the filehandle copy kept for V3.
995 999 *
996 1000 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
997 1001 * rw_enter(); we're trying to keep the current server from being
998 1002 * changed on us until we're done with the remapping and have a
999 1003 * matching client handle. We don't want to send a filehandle
1000 1004 * to the wrong host.
1001 1005 */
1002 1006 failoverretry:
1003 1007 if (FAILOVER_MOUNT(mi)) {
1004 1008 mutex_enter(&mi->mi_lock);
1005 1009 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1006 1010 if (failover_wait(mi)) {
1007 1011 mutex_exit(&mi->mi_lock);
1008 1012 return (EINTR);
1009 1013 }
1010 1014 }
1011 1015 INC_READERS(mi);
1012 1016 mutex_exit(&mi->mi_lock);
1013 1017 if (fi) {
1014 1018 if (!VALID_FH(fi) &&
1015 1019 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1016 1020 int remaperr;
1017 1021
1018 1022 svp = mi->mi_curr_serv;
1019 1023 remaperr = failover_remap(fi);
1020 1024 if (remaperr != 0) {
1021 1025 #ifdef DEBUG
1022 1026 if (remaperr != EINTR)
1023 1027 nfs_cmn_err(remaperr, CE_WARN,
1024 1028 "rfscall couldn't failover: %m");
1025 1029 #endif
1026 1030 mutex_enter(&mi->mi_lock);
1027 1031 DEC_READERS(mi);
1028 1032 mutex_exit(&mi->mi_lock);
1029 1033 /*
1030 1034 * If failover_remap returns ETIMEDOUT
1031 1035 * and the filesystem is hard mounted
1032 1036 * we have to retry the call with a new
1033 1037 * server.
1034 1038 */
1035 1039 if ((mi->mi_flags & MI_HARD) &&
1036 1040 IS_RECOVERABLE_ERROR(remaperr)) {
1037 1041 if (svp == mi->mi_curr_serv)
1038 1042 failover_newserver(mi);
1039 1043 rpcerr.re_status = RPC_SUCCESS;
1040 1044 goto failoverretry;
1041 1045 }
1042 1046 rpcerr.re_errno = remaperr;
1043 1047 return (remaperr);
1044 1048 }
1045 1049 }
1046 1050 if (fi->fhp && fi->copyproc)
1047 1051 (*fi->copyproc)(fi->fhp, fi->vp);
1048 1052 }
1049 1053 }
1050 1054
1051 1055 /* For TSOL, use a new cred which has net_mac_aware flag */
1052 1056 if (!cred_cloned && is_system_labeled()) {
1053 1057 cred_cloned = TRUE;
1054 1058 cr = crdup(icr);
1055 1059 (void) setpflags(NET_MAC_AWARE, 1, cr);
1056 1060 }
1057 1061
1058 1062 /*
1059 1063 * clget() calls clnt_tli_kinit() which clears the xid, so we
1060 1064 * are guaranteed to reprocess the retry as a new request.
1061 1065 */
1062 1066 svp = mi->mi_curr_serv;
1063 1067 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1064 1068
1065 1069 if (FAILOVER_MOUNT(mi)) {
1066 1070 mutex_enter(&mi->mi_lock);
1067 1071 DEC_READERS(mi);
1068 1072 mutex_exit(&mi->mi_lock);
1069 1073
1070 1074 if ((rpcerr.re_errno == ETIMEDOUT ||
1071 1075 rpcerr.re_errno == ECONNRESET) &&
1072 1076 failover_safe(fi)) {
1073 1077 if (svp == mi->mi_curr_serv)
1074 1078 failover_newserver(mi);
1075 1079 goto failoverretry;
1076 1080 }
1077 1081 }
1078 1082 if (rpcerr.re_errno != 0)
1079 1083 return (rpcerr.re_errno);
1080 1084
1081 1085 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1082 1086 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1083 1087 timeo = (mi->mi_timeo * hz) / 10;
1084 1088 } else {
1085 1089 mutex_enter(&mi->mi_lock);
1086 1090 timeo = CLNT_SETTIMERS(client,
1087 1091 &(mi->mi_timers[mi->mi_timer_type[which]]),
1088 1092 &(mi->mi_timers[NFS_CALLTYPES]),
1089 1093 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1090 1094 (void (*)())NULL, (caddr_t)mi, 0);
1091 1095 mutex_exit(&mi->mi_lock);
1092 1096 }
1093 1097
1094 1098 /*
1095 1099 * If hard mounted fs, retry call forever unless hard error occurs.
1096 1100 */
1097 1101 do {
1098 1102 tryagain = FALSE;
1099 1103
1100 1104 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1101 1105 status = RPC_FAILED;
1102 1106 rpcerr.re_status = RPC_FAILED;
1103 1107 rpcerr.re_errno = EIO;
1104 1108 break;
1105 1109 }
1106 1110
1107 1111 TICK_TO_TIMEVAL(timeo, &wait);
1108 1112
1109 1113 /*
1110 1114 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1111 1115 * and SIGTERM. (Preserving the existing masks).
1112 1116 * Mask out SIGINT if mount option nointr is specified.
1113 1117 */
1114 1118 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1115 1119 if (!(mi->mi_flags & MI_INT))
1116 1120 client->cl_nosignal = TRUE;
1117 1121
1118 1122 /*
1119 1123 * If there is a current signal, then don't bother
1120 1124 * even trying to send out the request because we
1121 1125 * won't be able to block waiting for the response.
1122 1126 * Simply assume RPC_INTR and get on with it.
1123 1127 */
1124 1128 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1125 1129 status = RPC_INTR;
1126 1130 else {
1127 1131 status = CLNT_CALL(client, which, xdrargs, argsp,
1128 1132 xdrres, resp, wait);
1129 1133 }
1130 1134
1131 1135 if (!(mi->mi_flags & MI_INT))
1132 1136 client->cl_nosignal = FALSE;
1133 1137 /*
1134 1138 * restore original signal mask
1135 1139 */
1136 1140 sigunintr(&smask);
1137 1141
1138 1142 switch (status) {
1139 1143 case RPC_SUCCESS:
1140 1144 if ((mi->mi_flags & MI_DYNAMIC) &&
1141 1145 mi->mi_timer_type[which] != 0 &&
1142 1146 (mi->mi_curread != my_rsize ||
1143 1147 mi->mi_curwrite != my_wsize))
1144 1148 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1145 1149 break;
1146 1150
1147 1151 case RPC_INTR:
1148 1152 /*
1149 1153 * There is no way to recover from this error,
1150 1154 * even if mount option nointr is specified.
1151 1155 * SIGKILL, for example, cannot be blocked.
1152 1156 */
1153 1157 rpcerr.re_status = RPC_INTR;
1154 1158 rpcerr.re_errno = EINTR;
1155 1159 break;
1156 1160
1157 1161 case RPC_UDERROR:
1158 1162 /*
1159 1163 * If the NFS server is local (vold) and
1160 1164 * it goes away then we get RPC_UDERROR.
1161 1165 * This is a retryable error, so we would
1162 1166 * loop, so check to see if the specific
1163 1167 * error was ECONNRESET, indicating that
1164 1168 * target did not exist at all. If so,
1165 1169 * return with RPC_PROGUNAVAIL and
1166 1170 * ECONNRESET to indicate why.
1167 1171 */
1168 1172 CLNT_GETERR(client, &rpcerr);
1169 1173 if (rpcerr.re_errno == ECONNRESET) {
1170 1174 rpcerr.re_status = RPC_PROGUNAVAIL;
1171 1175 rpcerr.re_errno = ECONNRESET;
1172 1176 break;
1173 1177 }
1174 1178 /*FALLTHROUGH*/
1175 1179
1176 1180 default: /* probably RPC_TIMEDOUT */
1177 1181 if (IS_UNRECOVERABLE_RPC(status))
1178 1182 break;
1179 1183
1180 1184 /*
1181 1185 * increment server not responding count
1182 1186 */
1183 1187 mutex_enter(&mi->mi_lock);
1184 1188 mi->mi_noresponse++;
1185 1189 mutex_exit(&mi->mi_lock);
1186 1190 #ifdef DEBUG
1187 1191 nfscl->nfscl_stat.noresponse.value.ui64++;
1188 1192 #endif
1189 1193
1190 1194 if (!(mi->mi_flags & MI_HARD)) {
1191 1195 if (!(mi->mi_flags & MI_SEMISOFT) ||
1192 1196 (mi->mi_ss_call_type[which] == 0))
1193 1197 break;
1194 1198 }
1195 1199
1196 1200 /*
1197 1201 * The call is in progress (over COTS).
1198 1202 * Try the CLNT_CALL again, but don't
1199 1203 * print a noisy error message.
1200 1204 */
1201 1205 if (status == RPC_INPROGRESS) {
1202 1206 tryagain = TRUE;
1203 1207 break;
1204 1208 }
1205 1209
1206 1210 if (flags & RFSCALL_SOFT)
1207 1211 break;
1208 1212
1209 1213 /*
1210 1214 * On zone shutdown, just move on.
1211 1215 */
1212 1216 if (zone_status_get(curproc->p_zone) >=
1213 1217 ZONE_IS_SHUTTING_DOWN) {
1214 1218 rpcerr.re_status = RPC_FAILED;
1215 1219 rpcerr.re_errno = EIO;
1216 1220 break;
1217 1221 }
1218 1222
1219 1223 /*
1220 1224 * NFS client failover support
1221 1225 *
1222 1226 * If the current server just failed us, we'll
1223 1227 * start the process of finding a new server.
1224 1228 * After that, we can just retry.
1225 1229 */
1226 1230 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1227 1231 if (svp == mi->mi_curr_serv)
1228 1232 failover_newserver(mi);
1229 1233 clfree_impl(client, ch, nfscl);
1230 1234 goto failoverretry;
1231 1235 }
1232 1236
1233 1237 tryagain = TRUE;
1234 1238 timeo = backoff(timeo);
1235 1239
1236 1240 CLNT_GETERR(client, &rpcerr_tmp);
1237 1241 if ((status == RPC_CANTSEND) &&
1238 1242 (rpcerr_tmp.re_errno == ENOBUFS))
1239 1243 msg = SRV_QFULL_MSG;
1240 1244 else
1241 1245 msg = SRV_NOTRESP_MSG;
1242 1246
1243 1247 mutex_enter(&mi->mi_lock);
1244 1248 if (!(mi->mi_flags & MI_PRINTED)) {
1245 1249 mi->mi_flags |= MI_PRINTED;
1246 1250 mutex_exit(&mi->mi_lock);
1247 1251 #ifdef DEBUG
1248 1252 zprintf(zoneid, msg, mi->mi_vers,
1249 1253 svp->sv_hostname);
1250 1254 #else
1251 1255 zprintf(zoneid, msg, svp->sv_hostname);
1252 1256 #endif
1253 1257 } else
1254 1258 mutex_exit(&mi->mi_lock);
1255 1259 if (*douprintf && nfs_has_ctty()) {
1256 1260 *douprintf = 0;
1257 1261 if (!(mi->mi_flags & MI_NOPRINT))
1258 1262 #ifdef DEBUG
1259 1263 uprintf(msg, mi->mi_vers,
1260 1264 svp->sv_hostname);
1261 1265 #else
1262 1266 uprintf(msg, svp->sv_hostname);
1263 1267 #endif
1264 1268 }
1265 1269
1266 1270 /*
1267 1271 * If doing dynamic adjustment of transfer
1268 1272 * size and if it's a read or write call
1269 1273 * and if the transfer size changed while
1270 1274 * retransmitting or if the feedback routine
1271 1275 * changed the transfer size,
1272 1276 * then exit rfscall so that the transfer
1273 1277 * size can be adjusted at the vnops level.
1274 1278 */
1275 1279 if ((mi->mi_flags & MI_DYNAMIC) &&
1276 1280 mi->mi_timer_type[which] != 0 &&
1277 1281 (mi->mi_curread != my_rsize ||
1278 1282 mi->mi_curwrite != my_wsize ||
1279 1283 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1280 1284 /*
1281 1285 * On read or write calls, return
1282 1286 * back to the vnode ops level if
1283 1287 * the transfer size changed.
1284 1288 */
1285 1289 clfree_impl(client, ch, nfscl);
1286 1290 if (cred_cloned)
1287 1291 crfree(cr);
1288 1292 return (ENFS_TRYAGAIN);
1289 1293 }
1290 1294 }
1291 1295 } while (tryagain);
1292 1296
1293 1297 if (status != RPC_SUCCESS) {
1294 1298 /*
1295 1299 * Let soft mounts use the timed out message.
1296 1300 */
1297 1301 if (status == RPC_INPROGRESS)
1298 1302 status = RPC_TIMEDOUT;
1299 1303 nfscl->nfscl_stat.badcalls.value.ui64++;
1300 1304 if (status != RPC_INTR) {
1301 1305 mutex_enter(&mi->mi_lock);
1302 1306 mi->mi_flags |= MI_DOWN;
1303 1307 mutex_exit(&mi->mi_lock);
1304 1308 CLNT_GETERR(client, &rpcerr);
1305 1309 #ifdef DEBUG
1306 1310 bufp = clnt_sperror(client, svp->sv_hostname);
1307 1311 zprintf(zoneid, "NFS%d %s failed for %s\n",
1308 1312 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1309 1313 if (nfs_has_ctty()) {
1310 1314 if (!(mi->mi_flags & MI_NOPRINT)) {
1311 1315 uprintf("NFS%d %s failed for %s\n",
1312 1316 mi->mi_vers, mi->mi_rfsnames[which],
1313 1317 bufp);
1314 1318 }
1315 1319 }
1316 1320 kmem_free(bufp, MAXPATHLEN);
1317 1321 #else
1318 1322 zprintf(zoneid,
1319 1323 "NFS %s failed for server %s: error %d (%s)\n",
1320 1324 mi->mi_rfsnames[which], svp->sv_hostname,
1321 1325 status, clnt_sperrno(status));
1322 1326 if (nfs_has_ctty()) {
1323 1327 if (!(mi->mi_flags & MI_NOPRINT)) {
1324 1328 uprintf(
1325 1329 "NFS %s failed for server %s: error %d (%s)\n",
1326 1330 mi->mi_rfsnames[which],
1327 1331 svp->sv_hostname, status,
1328 1332 clnt_sperrno(status));
1329 1333 }
1330 1334 }
1331 1335 #endif
1332 1336 /*
1333 1337 * when CLNT_CALL() fails with RPC_AUTHERROR,
1334 1338 * re_errno is set appropriately depending on
1335 1339 * the authentication error
1336 1340 */
1337 1341 if (status == RPC_VERSMISMATCH ||
1338 1342 status == RPC_PROGVERSMISMATCH)
1339 1343 rpcerr.re_errno = EIO;
1340 1344 }
1341 1345 } else {
1342 1346 /*
1343 1347 * Test the value of mi_down and mi_printed without
1344 1348 * holding the mi_lock mutex. If they are both zero,
1345 1349 * then it is okay to skip the down and printed
1346 1350 * processing. This saves on a mutex_enter and
1347 1351 * mutex_exit pair for a normal, successful RPC.
1348 1352 * This was just complete overhead.
1349 1353 */
1350 1354 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1351 1355 mutex_enter(&mi->mi_lock);
1352 1356 mi->mi_flags &= ~MI_DOWN;
1353 1357 if (mi->mi_flags & MI_PRINTED) {
1354 1358 mi->mi_flags &= ~MI_PRINTED;
1355 1359 mutex_exit(&mi->mi_lock);
1356 1360 #ifdef DEBUG
1357 1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358 1362 zprintf(zoneid, "NFS%d server %s ok\n",
1359 1363 mi->mi_vers, svp->sv_hostname);
1360 1364 #else
1361 1365 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 1366 zprintf(zoneid, "NFS server %s ok\n",
1363 1367 svp->sv_hostname);
1364 1368 #endif
1365 1369 } else
1366 1370 mutex_exit(&mi->mi_lock);
1367 1371 }
1368 1372
1369 1373 if (*douprintf == 0) {
1370 1374 if (!(mi->mi_flags & MI_NOPRINT))
1371 1375 #ifdef DEBUG
1372 1376 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1373 1377 uprintf("NFS%d server %s ok\n",
1374 1378 mi->mi_vers, svp->sv_hostname);
1375 1379 #else
1376 1380 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377 1381 uprintf("NFS server %s ok\n", svp->sv_hostname);
1378 1382 #endif
1379 1383 *douprintf = 1;
1380 1384 }
1381 1385 }
1382 1386
1383 1387 clfree_impl(client, ch, nfscl);
1384 1388 if (cred_cloned)
1385 1389 crfree(cr);
1386 1390
1387 1391 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1388 1392
1389 1393 if (rpc_status != NULL)
1390 1394 *rpc_status = rpcerr.re_status;
1391 1395
1392 1396 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1393 1397 rpcerr.re_errno);
1394 1398
1395 1399 return (rpcerr.re_errno);
1396 1400 }
1397 1401
1398 1402 #ifdef DEBUG
1399 1403 static int acl2call_hits = 0;
1400 1404 static int acl2call_misses = 0;
1401 1405 #endif
1402 1406
1403 1407 int
1404 1408 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1405 1409 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1406 1410 enum nfsstat *statusp, int flags, failinfo_t *fi)
1407 1411 {
1408 1412 int rpcerror;
1409 1413
1410 1414 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1411 1415 cr, douprintf, flags, fi);
1412 1416 if (!rpcerror) {
1413 1417 /*
1414 1418 * See comments with crnetadjust().
1415 1419 */
1416 1420 if (*statusp == NFSERR_ACCES &&
1417 1421 (cr = crnetadjust(cr)) != NULL) {
1418 1422 #ifdef DEBUG
1419 1423 acl2call_hits++;
1420 1424 #endif
1421 1425 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1422 1426 resp, cr, douprintf, flags, fi);
1423 1427 crfree(cr);
1424 1428 #ifdef DEBUG
1425 1429 if (*statusp == NFSERR_ACCES)
1426 1430 acl2call_misses++;
1427 1431 #endif
1428 1432 }
1429 1433 }
1430 1434
1431 1435 return (rpcerror);
1432 1436 }
1433 1437
1434 1438 #ifdef DEBUG
1435 1439 static int acl3call_hits = 0;
1436 1440 static int acl3call_misses = 0;
1437 1441 #endif
1438 1442
1439 1443 int
1440 1444 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1441 1445 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1442 1446 nfsstat3 *statusp, int flags, failinfo_t *fi)
1443 1447 {
1444 1448 int rpcerror;
1445 1449 int user_informed;
1446 1450
1447 1451 user_informed = 0;
1448 1452
1449 1453 do {
1450 1454 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1451 1455 cr, douprintf, flags, fi);
1452 1456 if (!rpcerror) {
1453 1457 cred_t *crr;
1454 1458 if (*statusp == NFS3ERR_JUKEBOX) {
1455 1459 if (!user_informed) {
1456 1460 user_informed = 1;
1457 1461 uprintf(
1458 1462 "file temporarily unavailable on the server, retrying...\n");
1459 1463 }
1460 1464 delay(nfs3_jukebox_delay);
1461 1465 }
1462 1466 /*
1463 1467 * See crnetadjust() for comments.
1464 1468 */
1465 1469 else if (*statusp == NFS3ERR_ACCES &&
1466 1470 (crr = crnetadjust(cr)) != NULL) {
1467 1471 #ifdef DEBUG
1468 1472 acl3call_hits++;
1469 1473 #endif
1470 1474 rpcerror = aclcall(mi, which, xdrargs, argsp,
1471 1475 xdrres, resp, crr, douprintf, flags, fi);
1472 1476
1473 1477 crfree(crr);
1474 1478 #ifdef DEBUG
1475 1479 if (*statusp == NFS3ERR_ACCES)
1476 1480 acl3call_misses++;
1477 1481 #endif
1478 1482 }
1479 1483 }
1480 1484 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1481 1485
1482 1486 return (rpcerror);
1483 1487 }
1484 1488
1485 1489 static int
1486 1490 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1487 1491 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1488 1492 int flags, failinfo_t *fi)
1489 1493 {
1490 1494 CLIENT *client;
1491 1495 struct chtab *ch;
1492 1496 cred_t *cr = icr;
1493 1497 bool_t cred_cloned = FALSE;
1494 1498 enum clnt_stat status;
1495 1499 struct rpc_err rpcerr;
1496 1500 struct timeval wait;
1497 1501 int timeo; /* in units of hz */
1498 1502 #if 0 /* notyet */
1499 1503 int my_rsize, my_wsize;
1500 1504 #endif
1501 1505 bool_t tryagain;
1502 1506 k_sigset_t smask;
1503 1507 servinfo_t *svp;
1504 1508 struct nfs_clnt *nfscl;
1505 1509 zoneid_t zoneid = getzoneid();
1506 1510 #ifdef DEBUG
1507 1511 char *bufp;
1508 1512 #endif
1509 1513
1510 1514 #if 0 /* notyet */
1511 1515 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1512 1516 "rfscall_start:which %d mi %p", which, mi);
1513 1517 #endif
1514 1518
1515 1519 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1516 1520 ASSERT(nfscl != NULL);
1517 1521
1518 1522 nfscl->nfscl_stat.calls.value.ui64++;
1519 1523 mi->mi_aclreqs[which].value.ui64++;
1520 1524
1521 1525 rpcerr.re_status = RPC_SUCCESS;
1522 1526
1523 1527 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1524 1528 rpcerr.re_status = RPC_FAILED;
1525 1529 rpcerr.re_errno = EIO;
1526 1530 return (rpcerr.re_errno);
1527 1531 }
1528 1532
1529 1533 #if 0 /* notyet */
1530 1534 /*
1531 1535 * Remember the transfer sizes in case
1532 1536 * nfs_feedback changes them underneath us.
1533 1537 */
1534 1538 my_rsize = mi->mi_curread;
1535 1539 my_wsize = mi->mi_curwrite;
1536 1540 #endif
1537 1541
1538 1542 /*
1539 1543 * NFS client failover support
1540 1544 *
1541 1545 * If this rnode is not in sync with the current server (VALID_FH),
1542 1546 * we'd like to do a remap to get in sync. We can be interrupted
1543 1547 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1544 1548 * use the best info we have to try the RPC. Part of that is
1545 1549 * unconditionally updating the filehandle copy kept for V3.
1546 1550 *
1547 1551 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1548 1552 * rw_enter(); we're trying to keep the current server from being
1549 1553 * changed on us until we're done with the remapping and have a
1550 1554 * matching client handle. We don't want to send a filehandle
1551 1555 * to the wrong host.
1552 1556 */
1553 1557 failoverretry:
1554 1558 if (FAILOVER_MOUNT(mi)) {
1555 1559 mutex_enter(&mi->mi_lock);
1556 1560 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1557 1561 if (failover_wait(mi)) {
1558 1562 mutex_exit(&mi->mi_lock);
1559 1563 return (EINTR);
1560 1564 }
1561 1565 }
1562 1566 INC_READERS(mi);
1563 1567 mutex_exit(&mi->mi_lock);
1564 1568 if (fi) {
1565 1569 if (!VALID_FH(fi) &&
1566 1570 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1567 1571 int remaperr;
1568 1572
1569 1573 svp = mi->mi_curr_serv;
1570 1574 remaperr = failover_remap(fi);
1571 1575 if (remaperr != 0) {
1572 1576 #ifdef DEBUG
1573 1577 if (remaperr != EINTR)
1574 1578 nfs_cmn_err(remaperr, CE_WARN,
1575 1579 "aclcall couldn't failover: %m");
1576 1580 #endif
1577 1581 mutex_enter(&mi->mi_lock);
1578 1582 DEC_READERS(mi);
1579 1583 mutex_exit(&mi->mi_lock);
1580 1584
1581 1585 /*
1582 1586 * If failover_remap returns ETIMEDOUT
1583 1587 * and the filesystem is hard mounted
1584 1588 * we have to retry the call with a new
1585 1589 * server.
1586 1590 */
1587 1591 if ((mi->mi_flags & MI_HARD) &&
1588 1592 IS_RECOVERABLE_ERROR(remaperr)) {
1589 1593 if (svp == mi->mi_curr_serv)
1590 1594 failover_newserver(mi);
1591 1595 rpcerr.re_status = RPC_SUCCESS;
1592 1596 goto failoverretry;
1593 1597 }
1594 1598 return (remaperr);
1595 1599 }
1596 1600 }
1597 1601 if (fi->fhp && fi->copyproc)
1598 1602 (*fi->copyproc)(fi->fhp, fi->vp);
1599 1603 }
1600 1604 }
1601 1605
1602 1606 /* For TSOL, use a new cred which has net_mac_aware flag */
1603 1607 if (!cred_cloned && is_system_labeled()) {
1604 1608 cred_cloned = TRUE;
1605 1609 cr = crdup(icr);
1606 1610 (void) setpflags(NET_MAC_AWARE, 1, cr);
1607 1611 }
1608 1612
1609 1613 /*
1610 1614 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1611 1615 * are guaranteed to reprocess the retry as a new request.
1612 1616 */
1613 1617 svp = mi->mi_curr_serv;
1614 1618 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1615 1619 if (FAILOVER_MOUNT(mi)) {
1616 1620 mutex_enter(&mi->mi_lock);
1617 1621 DEC_READERS(mi);
1618 1622 mutex_exit(&mi->mi_lock);
1619 1623
1620 1624 if ((rpcerr.re_errno == ETIMEDOUT ||
1621 1625 rpcerr.re_errno == ECONNRESET) &&
1622 1626 failover_safe(fi)) {
1623 1627 if (svp == mi->mi_curr_serv)
1624 1628 failover_newserver(mi);
1625 1629 goto failoverretry;
1626 1630 }
1627 1631 }
1628 1632 if (rpcerr.re_errno != 0) {
1629 1633 if (cred_cloned)
1630 1634 crfree(cr);
1631 1635 return (rpcerr.re_errno);
1632 1636 }
1633 1637
1634 1638 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1635 1639 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1636 1640 timeo = (mi->mi_timeo * hz) / 10;
1637 1641 } else {
1638 1642 mutex_enter(&mi->mi_lock);
1639 1643 timeo = CLNT_SETTIMERS(client,
1640 1644 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1641 1645 &(mi->mi_timers[NFS_CALLTYPES]),
1642 1646 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1643 1647 (void (*)()) 0, (caddr_t)mi, 0);
1644 1648 mutex_exit(&mi->mi_lock);
1645 1649 }
1646 1650
1647 1651 /*
1648 1652 * If hard mounted fs, retry call forever unless hard error occurs.
1649 1653 */
1650 1654 do {
1651 1655 tryagain = FALSE;
1652 1656
1653 1657 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1654 1658 status = RPC_FAILED;
1655 1659 rpcerr.re_status = RPC_FAILED;
1656 1660 rpcerr.re_errno = EIO;
1657 1661 break;
1658 1662 }
1659 1663
1660 1664 TICK_TO_TIMEVAL(timeo, &wait);
1661 1665
1662 1666 /*
1663 1667 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1664 1668 * and SIGTERM. (Preserving the existing masks).
1665 1669 * Mask out SIGINT if mount option nointr is specified.
1666 1670 */
1667 1671 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1668 1672 if (!(mi->mi_flags & MI_INT))
1669 1673 client->cl_nosignal = TRUE;
1670 1674
1671 1675 /*
1672 1676 * If there is a current signal, then don't bother
1673 1677 * even trying to send out the request because we
1674 1678 * won't be able to block waiting for the response.
1675 1679 * Simply assume RPC_INTR and get on with it.
1676 1680 */
1677 1681 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1678 1682 status = RPC_INTR;
1679 1683 else {
1680 1684 status = CLNT_CALL(client, which, xdrargs, argsp,
1681 1685 xdrres, resp, wait);
1682 1686 }
1683 1687
1684 1688 if (!(mi->mi_flags & MI_INT))
1685 1689 client->cl_nosignal = FALSE;
1686 1690 /*
1687 1691 * restore original signal mask
1688 1692 */
1689 1693 sigunintr(&smask);
1690 1694
1691 1695 switch (status) {
1692 1696 case RPC_SUCCESS:
1693 1697 #if 0 /* notyet */
1694 1698 if ((mi->mi_flags & MI_DYNAMIC) &&
1695 1699 mi->mi_timer_type[which] != 0 &&
1696 1700 (mi->mi_curread != my_rsize ||
1697 1701 mi->mi_curwrite != my_wsize))
1698 1702 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1699 1703 #endif
1700 1704 break;
1701 1705
1702 1706 /*
1703 1707 * Unfortunately, there are servers in the world which
1704 1708 * are not coded correctly. They are not prepared to
1705 1709 * handle RPC requests to the NFS port which are not
1706 1710 * NFS requests. Thus, they may try to process the
1707 1711 * NFS_ACL request as if it were an NFS request. This
1708 1712 * does not work. Generally, an error will be generated
1709 1713 * on the client because it will not be able to decode
1710 1714 * the response from the server. However, it seems
1711 1715 * possible that the server may not be able to decode
1712 1716 * the arguments. Thus, the criteria for deciding
1713 1717 * whether the server supports NFS_ACL or not is whether
1714 1718 * the following RPC errors are returned from CLNT_CALL.
1715 1719 */
1716 1720 case RPC_CANTDECODERES:
1717 1721 case RPC_PROGUNAVAIL:
1718 1722 case RPC_CANTDECODEARGS:
1719 1723 case RPC_PROGVERSMISMATCH:
1720 1724 mutex_enter(&mi->mi_lock);
1721 1725 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1722 1726 mutex_exit(&mi->mi_lock);
1723 1727 break;
1724 1728
1725 1729 /*
1726 1730 * If the server supports NFS_ACL but not the new ops
1727 1731 * for extended attributes, make sure we don't retry.
1728 1732 */
1729 1733 case RPC_PROCUNAVAIL:
1730 1734 mutex_enter(&mi->mi_lock);
1731 1735 mi->mi_flags &= ~MI_EXTATTR;
1732 1736 mutex_exit(&mi->mi_lock);
1733 1737 break;
1734 1738
1735 1739 case RPC_INTR:
1736 1740 /*
1737 1741 * There is no way to recover from this error,
1738 1742 * even if mount option nointr is specified.
1739 1743 * SIGKILL, for example, cannot be blocked.
1740 1744 */
1741 1745 rpcerr.re_status = RPC_INTR;
1742 1746 rpcerr.re_errno = EINTR;
1743 1747 break;
1744 1748
1745 1749 case RPC_UDERROR:
1746 1750 /*
1747 1751 * If the NFS server is local (vold) and
1748 1752 * it goes away then we get RPC_UDERROR.
1749 1753 			 * This is a retryable error and we would
1750 1754 			 * otherwise loop, so check whether the specific
1751 1755 			 * error was ECONNRESET, indicating that the
1752 1756 			 * target did not exist at all.  If so,
1753 1757 * return with RPC_PROGUNAVAIL and
1754 1758 * ECONNRESET to indicate why.
1755 1759 */
1756 1760 CLNT_GETERR(client, &rpcerr);
1757 1761 if (rpcerr.re_errno == ECONNRESET) {
1758 1762 rpcerr.re_status = RPC_PROGUNAVAIL;
1759 1763 rpcerr.re_errno = ECONNRESET;
1760 1764 break;
1761 1765 }
1762 1766 /*FALLTHROUGH*/
1763 1767
1764 1768 default: /* probably RPC_TIMEDOUT */
1765 1769 if (IS_UNRECOVERABLE_RPC(status))
1766 1770 break;
1767 1771
1768 1772 /*
1769 1773 * increment server not responding count
1770 1774 */
1771 1775 mutex_enter(&mi->mi_lock);
1772 1776 mi->mi_noresponse++;
1773 1777 mutex_exit(&mi->mi_lock);
1774 1778 #ifdef DEBUG
1775 1779 nfscl->nfscl_stat.noresponse.value.ui64++;
1776 1780 #endif
1777 1781
1778 1782 if (!(mi->mi_flags & MI_HARD)) {
1779 1783 if (!(mi->mi_flags & MI_SEMISOFT) ||
1780 1784 (mi->mi_acl_ss_call_type[which] == 0))
1781 1785 break;
1782 1786 }
1783 1787
1784 1788 /*
1785 1789 * The call is in progress (over COTS).
1786 1790 * Try the CLNT_CALL again, but don't
1787 1791 * print a noisy error message.
1788 1792 */
1789 1793 if (status == RPC_INPROGRESS) {
1790 1794 tryagain = TRUE;
1791 1795 break;
1792 1796 }
1793 1797
1794 1798 if (flags & RFSCALL_SOFT)
1795 1799 break;
1796 1800
1797 1801 /*
1798 1802 * On zone shutdown, just move on.
1799 1803 */
1800 1804 if (zone_status_get(curproc->p_zone) >=
1801 1805 ZONE_IS_SHUTTING_DOWN) {
1802 1806 rpcerr.re_status = RPC_FAILED;
1803 1807 rpcerr.re_errno = EIO;
1804 1808 break;
1805 1809 }
1806 1810
1807 1811 /*
1808 1812 * NFS client failover support
1809 1813 *
1810 1814 * If the current server just failed us, we'll
1811 1815 * start the process of finding a new server.
1812 1816 * After that, we can just retry.
1813 1817 */
1814 1818 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1815 1819 if (svp == mi->mi_curr_serv)
1816 1820 failover_newserver(mi);
1817 1821 clfree_impl(client, ch, nfscl);
1818 1822 goto failoverretry;
1819 1823 }
1820 1824
1821 1825 tryagain = TRUE;
1822 1826 timeo = backoff(timeo);
1823 1827 mutex_enter(&mi->mi_lock);
1824 1828 if (!(mi->mi_flags & MI_PRINTED)) {
1825 1829 mi->mi_flags |= MI_PRINTED;
1826 1830 mutex_exit(&mi->mi_lock);
1827 1831 #ifdef DEBUG
1828 1832 zprintf(zoneid,
1829 1833 "NFS_ACL%d server %s not responding still trying\n",
1830 1834 mi->mi_vers, svp->sv_hostname);
1831 1835 #else
1832 1836 zprintf(zoneid,
1833 1837 "NFS server %s not responding still trying\n",
1834 1838 svp->sv_hostname);
1835 1839 #endif
1836 1840 } else
1837 1841 mutex_exit(&mi->mi_lock);
1838 1842 if (*douprintf && nfs_has_ctty()) {
1839 1843 *douprintf = 0;
1840 1844 if (!(mi->mi_flags & MI_NOPRINT))
1841 1845 #ifdef DEBUG
1842 1846 uprintf(
1843 1847 "NFS_ACL%d server %s not responding still trying\n",
1844 1848 mi->mi_vers, svp->sv_hostname);
1845 1849 #else
1846 1850 uprintf(
1847 1851 "NFS server %s not responding still trying\n",
1848 1852 svp->sv_hostname);
1849 1853 #endif
1850 1854 }
1851 1855
1852 1856 #if 0 /* notyet */
1853 1857 /*
1854 1858 * If doing dynamic adjustment of transfer
1855 1859 * size and if it's a read or write call
1856 1860 * and if the transfer size changed while
1857 1861 * retransmitting or if the feedback routine
1858 1862 * changed the transfer size,
1859 1863 * then exit rfscall so that the transfer
1860 1864 * size can be adjusted at the vnops level.
1861 1865 */
1862 1866 if ((mi->mi_flags & MI_DYNAMIC) &&
1863 1867 mi->mi_acl_timer_type[which] != 0 &&
1864 1868 (mi->mi_curread != my_rsize ||
1865 1869 mi->mi_curwrite != my_wsize ||
1866 1870 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1867 1871 /*
1868 1872 * On read or write calls, return
1869 1873 * back to the vnode ops level if
1870 1874 * the transfer size changed.
1871 1875 */
1872 1876 clfree_impl(client, ch, nfscl);
1873 1877 if (cred_cloned)
1874 1878 crfree(cr);
1875 1879 return (ENFS_TRYAGAIN);
1876 1880 }
1877 1881 #endif
1878 1882 }
1879 1883 } while (tryagain);
1880 1884
1881 1885 if (status != RPC_SUCCESS) {
1882 1886 /*
1883 1887 * Let soft mounts use the timed out message.
1884 1888 */
1885 1889 if (status == RPC_INPROGRESS)
1886 1890 status = RPC_TIMEDOUT;
1887 1891 nfscl->nfscl_stat.badcalls.value.ui64++;
1888 1892 if (status == RPC_CANTDECODERES ||
1889 1893 status == RPC_PROGUNAVAIL ||
1890 1894 status == RPC_PROCUNAVAIL ||
1891 1895 status == RPC_CANTDECODEARGS ||
1892 1896 status == RPC_PROGVERSMISMATCH)
1893 1897 CLNT_GETERR(client, &rpcerr);
1894 1898 else if (status != RPC_INTR) {
1895 1899 mutex_enter(&mi->mi_lock);
1896 1900 mi->mi_flags |= MI_DOWN;
1897 1901 mutex_exit(&mi->mi_lock);
1898 1902 CLNT_GETERR(client, &rpcerr);
1899 1903 #ifdef DEBUG
1900 1904 bufp = clnt_sperror(client, svp->sv_hostname);
1901 1905 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1902 1906 mi->mi_vers, mi->mi_aclnames[which], bufp);
1903 1907 if (nfs_has_ctty()) {
1904 1908 if (!(mi->mi_flags & MI_NOPRINT)) {
1905 1909 uprintf("NFS_ACL%d %s failed for %s\n",
1906 1910 mi->mi_vers, mi->mi_aclnames[which],
1907 1911 bufp);
1908 1912 }
1909 1913 }
1910 1914 kmem_free(bufp, MAXPATHLEN);
1911 1915 #else
1912 1916 zprintf(zoneid,
1913 1917 "NFS %s failed for server %s: error %d (%s)\n",
1914 1918 mi->mi_aclnames[which], svp->sv_hostname,
1915 1919 status, clnt_sperrno(status));
1916 1920 if (nfs_has_ctty()) {
1917 1921 if (!(mi->mi_flags & MI_NOPRINT))
1918 1922 uprintf(
1919 1923 "NFS %s failed for server %s: error %d (%s)\n",
1920 1924 mi->mi_aclnames[which],
1921 1925 svp->sv_hostname, status,
1922 1926 clnt_sperrno(status));
1923 1927 }
1924 1928 #endif
1925 1929 /*
1926 1930 * when CLNT_CALL() fails with RPC_AUTHERROR,
1927 1931 * re_errno is set appropriately depending on
1928 1932 * the authentication error
1929 1933 */
1930 1934 if (status == RPC_VERSMISMATCH ||
1931 1935 status == RPC_PROGVERSMISMATCH)
1932 1936 rpcerr.re_errno = EIO;
1933 1937 }
1934 1938 } else {
1935 1939 /*
1936 1940 * Test the value of mi_down and mi_printed without
1937 1941 * holding the mi_lock mutex. If they are both zero,
1938 1942 * then it is okay to skip the down and printed
1939 1943 * processing. This saves on a mutex_enter and
1940 1944 * mutex_exit pair for a normal, successful RPC.
1941 1945 * This was just complete overhead.
1942 1946 */
1943 1947 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1944 1948 mutex_enter(&mi->mi_lock);
1945 1949 mi->mi_flags &= ~MI_DOWN;
1946 1950 if (mi->mi_flags & MI_PRINTED) {
1947 1951 mi->mi_flags &= ~MI_PRINTED;
1948 1952 mutex_exit(&mi->mi_lock);
1949 1953 #ifdef DEBUG
1950 1954 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1951 1955 mi->mi_vers, svp->sv_hostname);
1952 1956 #else
1953 1957 zprintf(zoneid, "NFS server %s ok\n",
1954 1958 svp->sv_hostname);
1955 1959 #endif
1956 1960 } else
1957 1961 mutex_exit(&mi->mi_lock);
1958 1962 }
1959 1963
1960 1964 if (*douprintf == 0) {
1961 1965 if (!(mi->mi_flags & MI_NOPRINT))
1962 1966 #ifdef DEBUG
1963 1967 uprintf("NFS_ACL%d server %s ok\n",
1964 1968 mi->mi_vers, svp->sv_hostname);
1965 1969 #else
1966 1970 uprintf("NFS server %s ok\n", svp->sv_hostname);
1967 1971 #endif
1968 1972 *douprintf = 1;
1969 1973 }
1970 1974 }
1971 1975
1972 1976 clfree_impl(client, ch, nfscl);
1973 1977 if (cred_cloned)
1974 1978 crfree(cr);
1975 1979
1976 1980 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1977 1981
1978 1982 #if 0 /* notyet */
1979 1983 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1980 1984 rpcerr.re_errno);
1981 1985 #endif
1982 1986
1983 1987 return (rpcerr.re_errno);
1984 1988 }
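
The retry path above stretches the RPC timeout through backoff() before looping. backoff() itself is not shown in this section; the sketch below is only an illustration of the capped-doubling scheme such a helper typically implements, and SKETCH_MAXTIMEO is a hypothetical cap, not an actual kernel tunable.

/*
 * Minimal sketch of a capped-doubling backoff; the real backoff() used
 * by the retry loop above is defined elsewhere in the NFS client.
 * SKETCH_MAXTIMEO is a hypothetical cap, not an actual kernel tunable.
 */
#define	SKETCH_MAXTIMEO	600		/* assumed cap, in clock ticks */

static int
sketch_backoff(int timeo)
{
	int next = timeo << 1;		/* double the previous timeout */

	return (next > SKETCH_MAXTIMEO ? SKETCH_MAXTIMEO : next);
}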
1985 1989
1986 1990 int
1987 1991 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1988 1992 {
1989 1993 uint_t mask = vap->va_mask;
1990 1994
1991 1995 if (!(mask & AT_MODE))
1992 1996 sa->sa_mode = (uint32_t)-1;
1993 1997 else
1994 1998 sa->sa_mode = vap->va_mode;
1995 1999 if (!(mask & AT_UID))
1996 2000 sa->sa_uid = (uint32_t)-1;
1997 2001 else
1998 2002 sa->sa_uid = (uint32_t)vap->va_uid;
1999 2003 if (!(mask & AT_GID))
2000 2004 sa->sa_gid = (uint32_t)-1;
2001 2005 else
2002 2006 sa->sa_gid = (uint32_t)vap->va_gid;
2003 2007 if (!(mask & AT_SIZE))
2004 2008 sa->sa_size = (uint32_t)-1;
2005 2009 else
2006 2010 sa->sa_size = (uint32_t)vap->va_size;
2007 2011 if (!(mask & AT_ATIME))
2008 2012 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2009 2013 else {
2010 2014 /* check time validity */
2011 2015 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2012 2016 return (EOVERFLOW);
2013 2017 }
2014 2018 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2015 2019 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2016 2020 }
2017 2021 if (!(mask & AT_MTIME))
2018 2022 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2019 2023 else {
2020 2024 /* check time validity */
2021 2025 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2022 2026 return (EOVERFLOW);
2023 2027 }
2024 2028 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2025 2029 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2026 2030 }
2027 2031 return (0);
2028 2032 }
2029 2033
2030 2034 int
2031 2035 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2032 2036 {
2033 2037 uint_t mask = vap->va_mask;
2034 2038
2035 2039 if (!(mask & AT_MODE))
2036 2040 sa->mode.set_it = FALSE;
2037 2041 else {
2038 2042 sa->mode.set_it = TRUE;
2039 2043 sa->mode.mode = (mode3)vap->va_mode;
2040 2044 }
2041 2045 if (!(mask & AT_UID))
2042 2046 sa->uid.set_it = FALSE;
2043 2047 else {
2044 2048 sa->uid.set_it = TRUE;
2045 2049 sa->uid.uid = (uid3)vap->va_uid;
2046 2050 }
2047 2051 if (!(mask & AT_GID))
2048 2052 sa->gid.set_it = FALSE;
2049 2053 else {
2050 2054 sa->gid.set_it = TRUE;
2051 2055 sa->gid.gid = (gid3)vap->va_gid;
2052 2056 }
2053 2057 if (!(mask & AT_SIZE))
2054 2058 sa->size.set_it = FALSE;
2055 2059 else {
2056 2060 sa->size.set_it = TRUE;
2057 2061 sa->size.size = (size3)vap->va_size;
2058 2062 }
2059 2063 if (!(mask & AT_ATIME))
2060 2064 sa->atime.set_it = DONT_CHANGE;
2061 2065 else {
2062 2066 /* check time validity */
2063 2067 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2064 2068 return (EOVERFLOW);
2065 2069 }
2066 2070 sa->atime.set_it = SET_TO_CLIENT_TIME;
2067 2071 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2068 2072 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2069 2073 }
2070 2074 if (!(mask & AT_MTIME))
2071 2075 sa->mtime.set_it = DONT_CHANGE;
2072 2076 else {
2073 2077 /* check time validity */
2074 2078 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2075 2079 return (EOVERFLOW);
2076 2080 }
2077 2081 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2078 2082 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2079 2083 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2080 2084 }
2081 2085 return (0);
2082 2086 }
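
vattr_to_sattr3() only marks the attributes whose AT_* bits are present in va_mask, so a caller changes exactly one attribute by building a vattr with a single bit set. A hedged sketch of such a caller follows; the function itself is illustrative and not part of this file.

/*
 * Illustrative only: build a sattr3 that changes just the file size.
 * Because only AT_SIZE is set in va_mask, vattr_to_sattr3() leaves
 * every other field marked FALSE / DONT_CHANGE.
 */
static int
sketch_truncate_sattr3(sattr3 *sa, u_offset_t len)
{
	struct vattr va;

	va.va_mask = AT_SIZE;
	va.va_size = len;

	return (vattr_to_sattr3(&va, sa));
}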
2083 2087
2084 2088 void
2085 2089 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2086 2090 {
2087 2091
2088 2092 da->da_fhandle = VTOFH(dvp);
2089 2093 da->da_name = nm;
2090 2094 da->da_flags = 0;
2091 2095 }
2092 2096
2093 2097 void
2094 2098 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2095 2099 {
2096 2100
2097 2101 da->dirp = VTOFH3(dvp);
2098 2102 da->name = nm;
2099 2103 }
2100 2104
2101 2105 int
2102 2106 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2103 2107 {
2104 2108 int error;
2105 2109 rnode_t *rp;
2106 2110 struct vattr va;
2107 2111
2108 2112 va.va_mask = AT_MODE | AT_GID;
2109 2113 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2110 2114 if (error)
2111 2115 return (error);
2112 2116
2113 2117 /*
2114 2118 * To determine the expected group-id of the created file:
2115 2119 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2116 2120 * GRPID option, and the directory's set-gid bit is clear,
2117 2121 * then use the process's gid.
2118 2122 * 2) Otherwise, set the group-id to the gid of the parent directory.
2119 2123 */
2120 2124 rp = VTOR(dvp);
2121 2125 mutex_enter(&rp->r_statelock);
2122 2126 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2123 2127 *gidp = crgetgid(cr);
2124 2128 else
2125 2129 *gidp = va.va_gid;
2126 2130 mutex_exit(&rp->r_statelock);
2127 2131 return (0);
2128 2132 }
2129 2133
2130 2134 int
2131 2135 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2132 2136 {
2133 2137 int error;
2134 2138 struct vattr va;
2135 2139
2136 2140 va.va_mask = AT_MODE;
2137 2141 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2138 2142 if (error)
2139 2143 return (error);
2140 2144
2141 2145 /*
2142 2146 * Modify the expected mode (om) so that the set-gid bit matches
2143 2147 * that of the parent directory (dvp).
2144 2148 */
2145 2149 if (va.va_mode & VSGID)
2146 2150 *omp |= VSGID;
2147 2151 else
2148 2152 *omp &= ~VSGID;
2149 2153 return (0);
2150 2154 }
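
setdirgid() and setdirmode() are normally used together by a create path to pre-compute the gid and set-gid bit a new object is expected to end up with. A minimal sketch of that pairing is shown below; only the two helpers are from this file, the caller is hypothetical.

/*
 * Illustrative caller: seed the expected gid and mode of an object
 * about to be created under dvp.  Hypothetical, for illustration only.
 */
static int
sketch_expected_attrs(vnode_t *dvp, cred_t *cr, vattr_t *va)
{
	int error;

	/* gid: parent's gid with GRPID or a set-gid parent, else crgetgid() */
	error = setdirgid(dvp, &va->va_gid, cr);
	if (error)
		return (error);

	/* mode: make the set-gid bit track the parent directory */
	return (setdirmode(dvp, &va->va_mode, cr));
}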
2151 2155
2152 2156 void
2153 2157 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2154 2158 {
2155 2159
2156 2160 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2157 2161 if (!(vp->v_flag & VSWAPLIKE)) {
2158 2162 mutex_enter(&vp->v_lock);
2159 2163 vp->v_flag |= VSWAPLIKE;
2160 2164 mutex_exit(&vp->v_lock);
2161 2165 }
2162 2166 } else {
2163 2167 if (vp->v_flag & VSWAPLIKE) {
2164 2168 mutex_enter(&vp->v_lock);
2165 2169 vp->v_flag &= ~VSWAPLIKE;
2166 2170 mutex_exit(&vp->v_lock);
2167 2171 }
2168 2172 }
2169 2173 }
2170 2174
2171 2175 /*
2172 2176 * Free the resources associated with an rnode.
2173 2177 */
2174 2178 static void
2175 2179 rinactive(rnode_t *rp, cred_t *cr)
2176 2180 {
2177 2181 vnode_t *vp;
2178 2182 cred_t *cred;
2179 2183 char *contents;
2180 2184 int size;
2181 2185 vsecattr_t *vsp;
2182 2186 int error;
2183 2187 nfs3_pathconf_info *info;
2184 2188
2185 2189 /*
2186 2190 * Before freeing anything, wait until all asynchronous
2187 2191 * activity is done on this rnode. This will allow all
2188 2192 * asynchronous read ahead and write behind i/o's to
2189 2193 * finish.
2190 2194 */
2191 2195 mutex_enter(&rp->r_statelock);
2192 2196 while (rp->r_count > 0)
2193 2197 cv_wait(&rp->r_cv, &rp->r_statelock);
2194 2198 mutex_exit(&rp->r_statelock);
2195 2199
2196 2200 /*
2197 2201 * Flush and invalidate all pages associated with the vnode.
2198 2202 */
2199 2203 vp = RTOV(rp);
2200 2204 if (vn_has_cached_data(vp)) {
2201 2205 ASSERT(vp->v_type != VCHR);
2202 2206 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2203 2207 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2204 2208 if (error && (error == ENOSPC || error == EDQUOT)) {
2205 2209 mutex_enter(&rp->r_statelock);
2206 2210 if (!rp->r_error)
2207 2211 rp->r_error = error;
2208 2212 mutex_exit(&rp->r_statelock);
2209 2213 }
2210 2214 }
2211 2215 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2212 2216 }
2213 2217
2214 2218 /*
2215 2219 * Free any held credentials and caches which may be associated
2216 2220 * with this rnode.
2217 2221 */
2218 2222 mutex_enter(&rp->r_statelock);
2219 2223 cred = rp->r_cred;
2220 2224 rp->r_cred = NULL;
2221 2225 contents = rp->r_symlink.contents;
2222 2226 size = rp->r_symlink.size;
2223 2227 rp->r_symlink.contents = NULL;
2224 2228 vsp = rp->r_secattr;
2225 2229 rp->r_secattr = NULL;
2226 2230 info = rp->r_pathconf;
2227 2231 rp->r_pathconf = NULL;
2228 2232 mutex_exit(&rp->r_statelock);
2229 2233
2230 2234 /*
2231 2235 * Free the held credential.
2232 2236 */
2233 2237 if (cred != NULL)
2234 2238 crfree(cred);
2235 2239
2236 2240 /*
2237 2241 * Free the access cache entries.
2238 2242 */
2239 2243 (void) nfs_access_purge_rp(rp);
2240 2244
2241 2245 /*
2242 2246 * Free the readdir cache entries.
2243 2247 */
2244 2248 if (HAVE_RDDIR_CACHE(rp))
2245 2249 nfs_purge_rddir_cache(vp);
2246 2250
2247 2251 /*
2248 2252 * Free the symbolic link cache.
2249 2253 */
2250 2254 if (contents != NULL) {
2251 2255
2252 2256 kmem_free((void *)contents, size);
2253 2257 }
2254 2258
2255 2259 /*
2256 2260 * Free any cached ACL.
2257 2261 */
2258 2262 if (vsp != NULL)
2259 2263 nfs_acl_free(vsp);
2260 2264
2261 2265 /*
2262 2266 * Free any cached pathconf information.
2263 2267 */
2264 2268 if (info != NULL)
2265 2269 kmem_free(info, sizeof (*info));
2266 2270 }
2267 2271
2268 2272 /*
2269 2273 * Return a vnode for the given NFS Version 2 file handle.
2270 2274 * If no rnode exists for this fhandle, create one and put it
2271 2275 * into the hash queues. If the rnode for this fhandle
2272 2276 * already exists, return it.
2273 2277 *
2274 2278 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2275 2279 */
2276 2280 vnode_t *
2277 2281 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2278 2282 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2279 2283 {
2280 2284 int newnode;
2281 2285 int index;
2282 2286 vnode_t *vp;
2283 2287 nfs_fhandle nfh;
2284 2288 vattr_t va;
2285 2289
2286 2290 nfh.fh_len = NFS_FHSIZE;
2287 2291 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2288 2292
2289 2293 index = rtablehash(&nfh);
2290 2294 rw_enter(&rtable[index].r_lock, RW_READER);
2291 2295
2292 2296 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2293 2297 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2294 2298
2295 2299 if (attr != NULL) {
2296 2300 if (!newnode) {
2297 2301 rw_exit(&rtable[index].r_lock);
2298 2302 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2299 2303 } else {
2300 2304 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2301 2305 vp->v_type = VBAD;
2302 2306 else
2303 2307 vp->v_type = n2v_type(attr);
2304 2308 /*
2305 2309 * A translation here seems to be necessary
2306 2310 * because this function can be called
2307 2311 * with `attr' that has come from the wire,
2308 2312 * and been operated on by vattr_to_nattr().
2309 2313 			 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2310 2314 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2311 2315 * ->makenfsnode().
2312 2316 */
2313 2317 if ((attr->na_rdev & 0xffff0000) == 0)
2314 2318 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2315 2319 else
2316 2320 vp->v_rdev = expldev(n2v_rdev(attr));
2317 2321 nfs_attrcache(vp, attr, t);
2318 2322 rw_exit(&rtable[index].r_lock);
2319 2323 }
2320 2324 } else {
2321 2325 if (newnode) {
2322 2326 PURGE_ATTRCACHE(vp);
2323 2327 }
2324 2328 rw_exit(&rtable[index].r_lock);
2325 2329 }
2326 2330
2327 2331 return (vp);
2328 2332 }
2329 2333
2330 2334 /*
2331 2335 * Return a vnode for the given NFS Version 3 file handle.
2332 2336 * If no rnode exists for this fhandle, create one and put it
2333 2337 * into the hash queues. If the rnode for this fhandle
2334 2338 * already exists, return it.
2335 2339 *
2336 2340 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2337 2341 */
2338 2342 vnode_t *
2339 2343 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2340 2344 cred_t *cr, char *dnm, char *nm)
2341 2345 {
2342 2346 int newnode;
2343 2347 int index;
2344 2348 vnode_t *vp;
2345 2349
2346 2350 index = rtablehash((nfs_fhandle *)fh);
2347 2351 rw_enter(&rtable[index].r_lock, RW_READER);
2348 2352
2349 2353 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2350 2354 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2351 2355 dnm, nm);
2352 2356
2353 2357 if (vap == NULL) {
2354 2358 if (newnode) {
2355 2359 PURGE_ATTRCACHE(vp);
2356 2360 }
2357 2361 rw_exit(&rtable[index].r_lock);
2358 2362 return (vp);
2359 2363 }
2360 2364
2361 2365 if (!newnode) {
2362 2366 rw_exit(&rtable[index].r_lock);
2363 2367 nfs_attr_cache(vp, vap, t, cr);
2364 2368 } else {
2365 2369 rnode_t *rp = VTOR(vp);
2366 2370
2367 2371 vp->v_type = vap->va_type;
2368 2372 vp->v_rdev = vap->va_rdev;
2369 2373
2370 2374 mutex_enter(&rp->r_statelock);
2371 2375 if (rp->r_mtime <= t)
2372 2376 nfs_attrcache_va(vp, vap);
2373 2377 mutex_exit(&rp->r_statelock);
2374 2378 rw_exit(&rtable[index].r_lock);
2375 2379 }
2376 2380
2377 2381 return (vp);
2378 2382 }
2379 2383
2380 2384 vnode_t *
2381 2385 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2382 2386 cred_t *cr, char *dnm, char *nm)
2383 2387 {
2384 2388 int newnode;
2385 2389 int index;
2386 2390 vnode_t *vp;
2387 2391 vattr_t va;
2388 2392
2389 2393 index = rtablehash((nfs_fhandle *)fh);
2390 2394 rw_enter(&rtable[index].r_lock, RW_READER);
2391 2395
2392 2396 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2393 2397 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2394 2398 dnm, nm);
2395 2399
2396 2400 if (attr == NULL) {
2397 2401 if (newnode) {
2398 2402 PURGE_ATTRCACHE(vp);
2399 2403 }
2400 2404 rw_exit(&rtable[index].r_lock);
2401 2405 return (vp);
2402 2406 }
2403 2407
2404 2408 if (!newnode) {
2405 2409 rw_exit(&rtable[index].r_lock);
2406 2410 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2407 2411 } else {
2408 2412 if (attr->type < NF3REG || attr->type > NF3FIFO)
2409 2413 vp->v_type = VBAD;
2410 2414 else
2411 2415 vp->v_type = nf3_to_vt[attr->type];
2412 2416 vp->v_rdev = makedevice(attr->rdev.specdata1,
2413 2417 attr->rdev.specdata2);
2414 2418 nfs3_attrcache(vp, attr, t);
2415 2419 rw_exit(&rtable[index].r_lock);
2416 2420 }
2417 2421
2418 2422 return (vp);
2419 2423 }
2420 2424
2421 2425 /*
2422 2426 * Read this comment before making changes to rtablehash()!
2423 2427 * This is a hash function in which seemingly obvious and harmless
2424 2428  * changes can cause escalations costing millions of dollars!
2425 2429 * Know what you are doing.
2426 2430 *
2427 2431 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2428 2432 * algorithm is currently detailed here:
2429 2433 *
2430 2434 * http://burtleburtle.net/bob/hash/doobs.html
2431 2435 *
2432 2436 * Of course, the above link may not be valid by the time you are reading
2433 2437 * this, but suffice it to say that the one-at-a-time algorithm works well in
2434 2438 * almost all cases. If you are changing the algorithm be sure to verify that
2435 2439 * the hash algorithm still provides even distribution in all cases and with
2436 2440 * any server returning filehandles in whatever order (sequential or random).
2437 2441 */
2438 2442 static int
2439 2443 rtablehash(nfs_fhandle *fh)
2440 2444 {
2441 2445 ulong_t hash, len, i;
2442 2446 char *key;
2443 2447
2444 2448 key = fh->fh_buf;
2445 2449 len = (ulong_t)fh->fh_len;
2446 2450 for (hash = 0, i = 0; i < len; i++) {
2447 2451 hash += key[i];
2448 2452 hash += (hash << 10);
2449 2453 hash ^= (hash >> 6);
2450 2454 }
2451 2455 hash += (hash << 3);
2452 2456 hash ^= (hash >> 11);
2453 2457 hash += (hash << 15);
2454 2458 return (hash & rtablemask);
2455 2459 }
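
The warning above is worth taking literally: lookup cost in the rnode hash queues tracks the quality of this function directly. One low-risk way to evaluate a proposed change is to replay the one-at-a-time loop in user space over synthetic filehandles and inspect the bucket distribution. The harness below is a sketch of that idea only; the bucket count, filehandle shape, and iteration count are arbitrary choices, not kernel values.

/* Illustrative user-space harness; not kernel code. */
#include <stdio.h>
#include <string.h>

#define	NBUCKETS	256		/* arbitrary power of two */

static unsigned long
oaat_hash(const char *key, size_t len)
{
	unsigned long hash = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hash += (unsigned char)key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & (NBUCKETS - 1));
}

int
main(void)
{
	int counts[NBUCKETS] = { 0 };
	char fh[32];
	int i;

	/* Sequential "filehandles": the ordering the comment warns about. */
	for (i = 0; i < 100000; i++) {
		memset(fh, 0, sizeof (fh));
		memcpy(fh, &i, sizeof (i));
		counts[oaat_hash(fh, sizeof (fh))]++;
	}
	for (i = 0; i < NBUCKETS; i++)
		printf("%d %d\n", i, counts[i]);
	return (0);
}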
2456 2460
2457 2461 static vnode_t *
2458 2462 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2459 2463 struct vnodeops *vops,
2460 2464 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2461 2465 int (*compar)(const void *, const void *),
2462 2466 int *newnode, cred_t *cr, char *dnm, char *nm)
2463 2467 {
2464 2468 rnode_t *rp;
2465 2469 rnode_t *trp;
2466 2470 vnode_t *vp;
2467 2471 mntinfo_t *mi;
2468 2472
2469 2473 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2470 2474
2471 2475 mi = VFTOMI(vfsp);
2472 2476 start:
2473 2477 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2474 2478 vp = RTOV(rp);
2475 2479 nfs_set_vroot(vp);
2476 2480 *newnode = 0;
2477 2481 return (vp);
2478 2482 }
2479 2483 rw_exit(&rhtp->r_lock);
2480 2484
2481 2485 mutex_enter(&rpfreelist_lock);
2482 2486 if (rpfreelist != NULL && rnew >= nrnode) {
2483 2487 rp = rpfreelist;
2484 2488 rp_rmfree(rp);
2485 2489 mutex_exit(&rpfreelist_lock);
2486 2490
2487 2491 vp = RTOV(rp);
2488 2492
2489 2493 if (rp->r_flags & RHASHED) {
2490 2494 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2491 2495 mutex_enter(&vp->v_lock);
2492 2496 if (vp->v_count > 1) {
2493 2497 VN_RELE_LOCKED(vp);
2494 2498 mutex_exit(&vp->v_lock);
2495 2499 rw_exit(&rp->r_hashq->r_lock);
2496 2500 rw_enter(&rhtp->r_lock, RW_READER);
2497 2501 goto start;
2498 2502 }
2499 2503 mutex_exit(&vp->v_lock);
2500 2504 rp_rmhash_locked(rp);
2501 2505 rw_exit(&rp->r_hashq->r_lock);
2502 2506 }
2503 2507
2504 2508 rinactive(rp, cr);
2505 2509
2506 2510 mutex_enter(&vp->v_lock);
2507 2511 if (vp->v_count > 1) {
2508 2512 VN_RELE_LOCKED(vp);
2509 2513 mutex_exit(&vp->v_lock);
2510 2514 rw_enter(&rhtp->r_lock, RW_READER);
2511 2515 goto start;
2512 2516 }
2513 2517 mutex_exit(&vp->v_lock);
2514 2518 vn_invalid(vp);
2515 2519 /*
2516 2520 * destroy old locks before bzero'ing and
2517 2521 * recreating the locks below.
2518 2522 */
2519 2523 nfs_rw_destroy(&rp->r_rwlock);
2520 2524 nfs_rw_destroy(&rp->r_lkserlock);
2521 2525 mutex_destroy(&rp->r_statelock);
2522 2526 cv_destroy(&rp->r_cv);
2523 2527 cv_destroy(&rp->r_commit.c_cv);
2524 2528 nfs_free_r_path(rp);
2525 2529 avl_destroy(&rp->r_dir);
2526 2530 /*
2527 2531 		 * Make sure that if the rnode is recycled then
2528 2532 * VFS count is decremented properly before
2529 2533 * reuse.
2530 2534 */
2531 2535 VFS_RELE(vp->v_vfsp);
2532 2536 vn_reinit(vp);
2533 2537 } else {
2534 2538 vnode_t *new_vp;
2535 2539
2536 2540 mutex_exit(&rpfreelist_lock);
2537 2541
2538 2542 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2539 2543 new_vp = vn_alloc(KM_SLEEP);
2540 2544
2541 2545 atomic_inc_ulong((ulong_t *)&rnew);
2542 2546 #ifdef DEBUG
2543 2547 clstat_debug.nrnode.value.ui64++;
2544 2548 #endif
2545 2549 vp = new_vp;
2546 2550 }
2547 2551
2548 2552 bzero(rp, sizeof (*rp));
2549 2553 rp->r_vnode = vp;
2550 2554 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2551 2555 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2552 2556 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2553 2557 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2554 2558 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2555 2559 rp->r_fh.fh_len = fh->fh_len;
2556 2560 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2557 2561 rp->r_server = mi->mi_curr_serv;
2558 2562 if (FAILOVER_MOUNT(mi)) {
2559 2563 /*
2560 2564 * If replicated servers, stash pathnames
2561 2565 */
2562 2566 if (dnm != NULL && nm != NULL) {
2563 2567 char *s, *p;
2564 2568 uint_t len;
2565 2569
2566 2570 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2567 2571 rp->r_path = kmem_alloc(len, KM_SLEEP);
2568 2572 #ifdef DEBUG
2569 2573 clstat_debug.rpath.value.ui64 += len;
2570 2574 #endif
2571 2575 s = rp->r_path;
2572 2576 for (p = dnm; *p; p++)
2573 2577 *s++ = *p;
2574 2578 *s++ = '/';
2575 2579 for (p = nm; *p; p++)
2576 2580 *s++ = *p;
2577 2581 *s = '\0';
2578 2582 } else {
2579 2583 /* special case for root */
2580 2584 rp->r_path = kmem_alloc(2, KM_SLEEP);
2581 2585 #ifdef DEBUG
2582 2586 clstat_debug.rpath.value.ui64 += 2;
2583 2587 #endif
2584 2588 *rp->r_path = '.';
2585 2589 *(rp->r_path + 1) = '\0';
2586 2590 }
2587 2591 }
2588 2592 VFS_HOLD(vfsp);
2589 2593 rp->r_putapage = putapage;
2590 2594 rp->r_hashq = rhtp;
2591 2595 rp->r_flags = RREADDIRPLUS;
2592 2596 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2593 2597 offsetof(rddir_cache, tree));
2594 2598 vn_setops(vp, vops);
2595 2599 vp->v_data = (caddr_t)rp;
2596 2600 vp->v_vfsp = vfsp;
2597 2601 vp->v_type = VNON;
2598 2602 vp->v_flag |= VMODSORT;
2599 2603 nfs_set_vroot(vp);
2600 2604
2601 2605 /*
2602 2606 * There is a race condition if someone else
2603 2607 * alloc's the rnode while no locks are held, so we
2604 2608 * check again and recover if found.
2605 2609 */
2606 2610 rw_enter(&rhtp->r_lock, RW_WRITER);
2607 2611 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2608 2612 vp = RTOV(trp);
2609 2613 nfs_set_vroot(vp);
2610 2614 *newnode = 0;
2611 2615 rw_exit(&rhtp->r_lock);
2612 2616 rp_addfree(rp, cr);
2613 2617 rw_enter(&rhtp->r_lock, RW_READER);
2614 2618 return (vp);
2615 2619 }
2616 2620 rp_addhash(rp);
2617 2621 *newnode = 1;
2618 2622 return (vp);
2619 2623 }
2620 2624
2621 2625 /*
2622 2626 * Callback function to check if the page should be marked as
2623 2627 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2624 2628 */
2625 2629 int
2626 2630 nfs_setmod_check(page_t *pp)
2627 2631 {
2628 2632 if (pp->p_fsdata != C_NOCOMMIT) {
2629 2633 pp->p_fsdata = C_NOCOMMIT;
2630 2634 return (1);
2631 2635 }
2632 2636 return (0);
2633 2637 }
2634 2638
2635 2639 static void
2636 2640 nfs_set_vroot(vnode_t *vp)
2637 2641 {
2638 2642 rnode_t *rp;
2639 2643 nfs_fhandle *rootfh;
2640 2644
2641 2645 rp = VTOR(vp);
2642 2646 rootfh = &rp->r_server->sv_fhandle;
2643 2647 if (rootfh->fh_len == rp->r_fh.fh_len &&
2644 2648 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2645 2649 if (!(vp->v_flag & VROOT)) {
2646 2650 mutex_enter(&vp->v_lock);
2647 2651 vp->v_flag |= VROOT;
2648 2652 mutex_exit(&vp->v_lock);
2649 2653 }
2650 2654 }
2651 2655 }
2652 2656
2653 2657 static void
2654 2658 nfs_free_r_path(rnode_t *rp)
2655 2659 {
2656 2660 char *path;
2657 2661 size_t len;
2658 2662
2659 2663 path = rp->r_path;
2660 2664 if (path) {
2661 2665 rp->r_path = NULL;
2662 2666 len = strlen(path) + 1;
2663 2667 kmem_free(path, len);
2664 2668 #ifdef DEBUG
2665 2669 clstat_debug.rpath.value.ui64 -= len;
2666 2670 #endif
2667 2671 }
2668 2672 }
2669 2673
2670 2674 /*
2671 2675 * Put an rnode on the free list.
2672 2676 *
2673 2677 * Rnodes which were allocated above and beyond the normal limit
2674 2678 * are immediately freed.
2675 2679 */
2676 2680 void
2677 2681 rp_addfree(rnode_t *rp, cred_t *cr)
2678 2682 {
2679 2683 vnode_t *vp;
2680 2684 struct vfs *vfsp;
2681 2685
2682 2686 vp = RTOV(rp);
2683 2687 ASSERT(vp->v_count >= 1);
2684 2688 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2685 2689
2686 2690 /*
2687 2691 * If we have too many rnodes allocated and there are no
2688 2692 * references to this rnode, or if the rnode is no longer
2689 2693 * accessible by it does not reside in the hash queues,
2690 2694 	 * accessible because it does not reside in the hash queues,
2691 2695 * then just free it instead of putting it on the rnode
2692 2696 * freelist.
2693 2697 */
2694 2698 vfsp = vp->v_vfsp;
2695 2699 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2696 2700 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2697 2701 if (rp->r_flags & RHASHED) {
2698 2702 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2699 2703 mutex_enter(&vp->v_lock);
2700 2704 if (vp->v_count > 1) {
2701 2705 VN_RELE_LOCKED(vp);
2702 2706 mutex_exit(&vp->v_lock);
2703 2707 rw_exit(&rp->r_hashq->r_lock);
2704 2708 return;
2705 2709 }
2706 2710 mutex_exit(&vp->v_lock);
2707 2711 rp_rmhash_locked(rp);
2708 2712 rw_exit(&rp->r_hashq->r_lock);
2709 2713 }
2710 2714
2711 2715 rinactive(rp, cr);
2712 2716
2713 2717 /*
2714 2718 * Recheck the vnode reference count. We need to
2715 2719 * make sure that another reference has not been
2716 2720 * acquired while we were not holding v_lock. The
2717 2721 * rnode is not in the rnode hash queues, so the
2718 2722 * only way for a reference to have been acquired
2719 2723 * is for a VOP_PUTPAGE because the rnode was marked
2720 2724 * with RDIRTY or for a modified page. This
2721 2725 * reference may have been acquired before our call
2722 2726 * to rinactive. The i/o may have been completed,
2723 2727 * thus allowing rinactive to complete, but the
2724 2728 * reference to the vnode may not have been released
2725 2729 * yet. In any case, the rnode can not be destroyed
2726 2730 * until the other references to this vnode have been
2727 2731 * released. The other references will take care of
2728 2732 * either destroying the rnode or placing it on the
2729 2733 * rnode freelist. If there are no other references,
2730 2734 * then the rnode may be safely destroyed.
2731 2735 */
2732 2736 mutex_enter(&vp->v_lock);
2733 2737 if (vp->v_count > 1) {
2734 2738 VN_RELE_LOCKED(vp);
2735 2739 mutex_exit(&vp->v_lock);
2736 2740 return;
2737 2741 }
2738 2742 mutex_exit(&vp->v_lock);
2739 2743
2740 2744 destroy_rnode(rp);
2741 2745 return;
2742 2746 }
2743 2747
2744 2748 /*
2745 2749 * Lock the hash queue and then recheck the reference count
2746 2750 * to ensure that no other threads have acquired a reference
2747 2751 * to indicate that the rnode should not be placed on the
2748 2752 * freelist. If another reference has been acquired, then
2749 2753 * just release this one and let the other thread complete
2750 2754 * the processing of adding this rnode to the freelist.
2751 2755 */
2752 2756 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2753 2757
2754 2758 mutex_enter(&vp->v_lock);
2755 2759 if (vp->v_count > 1) {
2756 2760 VN_RELE_LOCKED(vp);
2757 2761 mutex_exit(&vp->v_lock);
2758 2762 rw_exit(&rp->r_hashq->r_lock);
2759 2763 return;
2760 2764 }
2761 2765 mutex_exit(&vp->v_lock);
2762 2766
2763 2767 /*
2764 2768 * If there is no cached data or metadata for this file, then
2765 2769 * put the rnode on the front of the freelist so that it will
2766 2770 * be reused before other rnodes which may have cached data or
2767 2771 * metadata associated with them.
2768 2772 */
2769 2773 mutex_enter(&rpfreelist_lock);
2770 2774 if (rpfreelist == NULL) {
2771 2775 rp->r_freef = rp;
2772 2776 rp->r_freeb = rp;
2773 2777 rpfreelist = rp;
2774 2778 } else {
2775 2779 rp->r_freef = rpfreelist;
2776 2780 rp->r_freeb = rpfreelist->r_freeb;
2777 2781 rpfreelist->r_freeb->r_freef = rp;
2778 2782 rpfreelist->r_freeb = rp;
2779 2783 if (!vn_has_cached_data(vp) &&
2780 2784 !HAVE_RDDIR_CACHE(rp) &&
2781 2785 rp->r_symlink.contents == NULL &&
2782 2786 rp->r_secattr == NULL &&
2783 2787 rp->r_pathconf == NULL)
2784 2788 rpfreelist = rp;
2785 2789 }
2786 2790 mutex_exit(&rpfreelist_lock);
2787 2791
2788 2792 rw_exit(&rp->r_hashq->r_lock);
2789 2793 }
2790 2794
2791 2795 /*
2792 2796 * Remove an rnode from the free list.
2793 2797 *
2794 2798 * The caller must be holding rpfreelist_lock and the rnode
2795 2799 * must be on the freelist.
2796 2800 */
2797 2801 static void
2798 2802 rp_rmfree(rnode_t *rp)
2799 2803 {
2800 2804
2801 2805 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2802 2806 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2803 2807
2804 2808 if (rp == rpfreelist) {
2805 2809 rpfreelist = rp->r_freef;
2806 2810 if (rp == rpfreelist)
2807 2811 rpfreelist = NULL;
2808 2812 }
2809 2813
2810 2814 rp->r_freeb->r_freef = rp->r_freef;
2811 2815 rp->r_freef->r_freeb = rp->r_freeb;
2812 2816
2813 2817 rp->r_freef = rp->r_freeb = NULL;
2814 2818 }
2815 2819
2816 2820 /*
2817 2821 * Put a rnode in the hash table.
2818 2822 *
2819 2823 * The caller must be holding the exclusive hash queue lock.
2820 2824 */
2821 2825 static void
2822 2826 rp_addhash(rnode_t *rp)
2823 2827 {
2824 2828
2825 2829 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2826 2830 ASSERT(!(rp->r_flags & RHASHED));
2827 2831
2828 2832 rp->r_hashf = rp->r_hashq->r_hashf;
2829 2833 rp->r_hashq->r_hashf = rp;
2830 2834 rp->r_hashb = (rnode_t *)rp->r_hashq;
2831 2835 rp->r_hashf->r_hashb = rp;
2832 2836
2833 2837 mutex_enter(&rp->r_statelock);
2834 2838 rp->r_flags |= RHASHED;
2835 2839 mutex_exit(&rp->r_statelock);
2836 2840 }
2837 2841
2838 2842 /*
2839 2843 * Remove a rnode from the hash table.
2840 2844 *
2841 2845 * The caller must be holding the hash queue lock.
2842 2846 */
2843 2847 static void
2844 2848 rp_rmhash_locked(rnode_t *rp)
2845 2849 {
2846 2850
2847 2851 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2848 2852 ASSERT(rp->r_flags & RHASHED);
2849 2853
2850 2854 rp->r_hashb->r_hashf = rp->r_hashf;
2851 2855 rp->r_hashf->r_hashb = rp->r_hashb;
2852 2856
2853 2857 mutex_enter(&rp->r_statelock);
2854 2858 rp->r_flags &= ~RHASHED;
2855 2859 mutex_exit(&rp->r_statelock);
2856 2860 }
2857 2861
2858 2862 /*
2859 2863 * Remove a rnode from the hash table.
2860 2864 *
2861 2865 * The caller must not be holding the hash queue lock.
2862 2866 */
2863 2867 void
2864 2868 rp_rmhash(rnode_t *rp)
2865 2869 {
2866 2870
2867 2871 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2868 2872 rp_rmhash_locked(rp);
2869 2873 rw_exit(&rp->r_hashq->r_lock);
2870 2874 }
2871 2875
2872 2876 /*
2873 2877 * Lookup a rnode by fhandle.
2874 2878 *
2875 2879 * The caller must be holding the hash queue lock, either shared or exclusive.
2876 2880 */
2877 2881 static rnode_t *
2878 2882 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2879 2883 {
2880 2884 rnode_t *rp;
2881 2885 vnode_t *vp;
2882 2886
2883 2887 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2884 2888
2885 2889 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2886 2890 vp = RTOV(rp);
2887 2891 if (vp->v_vfsp == vfsp &&
2888 2892 rp->r_fh.fh_len == fh->fh_len &&
2889 2893 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2890 2894 /*
2891 2895 * remove rnode from free list, if necessary.
2892 2896 */
2893 2897 if (rp->r_freef != NULL) {
2894 2898 mutex_enter(&rpfreelist_lock);
2895 2899 /*
2896 2900 * If the rnode is on the freelist,
2897 2901 * then remove it and use that reference
2898 2902 				 * as the new reference.  Otherwise, we
2899 2903 				 * need to increment the reference count.
2900 2904 */
2901 2905 if (rp->r_freef != NULL) {
2902 2906 rp_rmfree(rp);
2903 2907 mutex_exit(&rpfreelist_lock);
2904 2908 } else {
2905 2909 mutex_exit(&rpfreelist_lock);
2906 2910 VN_HOLD(vp);
2907 2911 }
2908 2912 } else
2909 2913 VN_HOLD(vp);
2910 2914 return (rp);
2911 2915 }
2912 2916 }
2913 2917 return (NULL);
2914 2918 }
2915 2919
2916 2920 /*
2917 2921  * Return 1 if there is an active vnode belonging to this vfs in the
2918 2922 * rtable cache.
2919 2923 *
2920 2924 * Several of these checks are done without holding the usual
2921 2925 * locks. This is safe because destroy_rtable(), rp_addfree(),
2922 2926 * etc. will redo the necessary checks before actually destroying
2923 2927 * any rnodes.
2924 2928 */
2925 2929 int
2926 2930 check_rtable(struct vfs *vfsp)
2927 2931 {
2928 2932 int index;
2929 2933 rnode_t *rp;
2930 2934 vnode_t *vp;
2931 2935
2932 2936 for (index = 0; index < rtablesize; index++) {
2933 2937 rw_enter(&rtable[index].r_lock, RW_READER);
2934 2938 for (rp = rtable[index].r_hashf;
2935 2939 rp != (rnode_t *)(&rtable[index]);
2936 2940 rp = rp->r_hashf) {
2937 2941 vp = RTOV(rp);
2938 2942 if (vp->v_vfsp == vfsp) {
2939 2943 if (rp->r_freef == NULL ||
2940 2944 (vn_has_cached_data(vp) &&
2941 2945 (rp->r_flags & RDIRTY)) ||
2942 2946 rp->r_count > 0) {
2943 2947 rw_exit(&rtable[index].r_lock);
2944 2948 return (1);
2945 2949 }
2946 2950 }
2947 2951 }
2948 2952 rw_exit(&rtable[index].r_lock);
2949 2953 }
2950 2954 return (0);
2951 2955 }
2952 2956
2953 2957 /*
2954 2958 * Destroy inactive vnodes from the hash queues which belong to this
2955 2959 * vfs. It is essential that we destroy all inactive vnodes during a
2956 2960 * forced unmount as well as during a normal unmount.
2957 2961 */
2958 2962 void
2959 2963 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2960 2964 {
2961 2965 int index;
2962 2966 rnode_t *rp;
2963 2967 rnode_t *rlist;
2964 2968 rnode_t *r_hashf;
2965 2969 vnode_t *vp;
2966 2970
2967 2971 rlist = NULL;
2968 2972
2969 2973 for (index = 0; index < rtablesize; index++) {
2970 2974 rw_enter(&rtable[index].r_lock, RW_WRITER);
2971 2975 for (rp = rtable[index].r_hashf;
2972 2976 rp != (rnode_t *)(&rtable[index]);
2973 2977 rp = r_hashf) {
2974 2978 /* save the hash pointer before destroying */
2975 2979 r_hashf = rp->r_hashf;
2976 2980 vp = RTOV(rp);
2977 2981 if (vp->v_vfsp == vfsp) {
2978 2982 mutex_enter(&rpfreelist_lock);
2979 2983 if (rp->r_freef != NULL) {
2980 2984 rp_rmfree(rp);
2981 2985 mutex_exit(&rpfreelist_lock);
2982 2986 rp_rmhash_locked(rp);
2983 2987 rp->r_hashf = rlist;
2984 2988 rlist = rp;
2985 2989 } else
2986 2990 mutex_exit(&rpfreelist_lock);
2987 2991 }
2988 2992 }
2989 2993 rw_exit(&rtable[index].r_lock);
2990 2994 }
2991 2995
2992 2996 for (rp = rlist; rp != NULL; rp = rlist) {
2993 2997 rlist = rp->r_hashf;
2994 2998 /*
2995 2999 * This call to rp_addfree will end up destroying the
2996 3000 * rnode, but in a safe way with the appropriate set
2997 3001 * of checks done.
2998 3002 */
2999 3003 rp_addfree(rp, cr);
3000 3004 }
3001 3005
3002 3006 }
3003 3007
3004 3008 /*
3005 3009 * This routine destroys all the resources associated with the rnode
3006 3010 * and then the rnode itself.
3007 3011 */
3008 3012 static void
3009 3013 destroy_rnode(rnode_t *rp)
3010 3014 {
3011 3015 vnode_t *vp;
3012 3016 vfs_t *vfsp;
3013 3017
3014 3018 vp = RTOV(rp);
3015 3019 vfsp = vp->v_vfsp;
3016 3020
3017 3021 ASSERT(vp->v_count == 1);
3018 3022 ASSERT(rp->r_count == 0);
3019 3023 ASSERT(rp->r_lmpl == NULL);
3020 3024 ASSERT(rp->r_mapcnt == 0);
3021 3025 ASSERT(!(rp->r_flags & RHASHED));
3022 3026 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3023 3027 atomic_dec_ulong((ulong_t *)&rnew);
3024 3028 #ifdef DEBUG
3025 3029 clstat_debug.nrnode.value.ui64--;
3026 3030 #endif
3027 3031 nfs_rw_destroy(&rp->r_rwlock);
3028 3032 nfs_rw_destroy(&rp->r_lkserlock);
3029 3033 mutex_destroy(&rp->r_statelock);
3030 3034 cv_destroy(&rp->r_cv);
3031 3035 cv_destroy(&rp->r_commit.c_cv);
3032 3036 if (rp->r_flags & RDELMAPLIST)
3033 3037 list_destroy(&rp->r_indelmap);
3034 3038 nfs_free_r_path(rp);
3035 3039 avl_destroy(&rp->r_dir);
3036 3040 vn_invalid(vp);
3037 3041 vn_free(vp);
3038 3042 kmem_cache_free(rnode_cache, rp);
3039 3043 VFS_RELE(vfsp);
3040 3044 }
3041 3045
3042 3046 /*
3043 3047 * Flush all vnodes in this (or every) vfs.
3044 3048 * Used by nfs_sync and by nfs_unmount.
3045 3049 */
3046 3050 void
3047 3051 rflush(struct vfs *vfsp, cred_t *cr)
3048 3052 {
3049 3053 int index;
3050 3054 rnode_t *rp;
3051 3055 vnode_t *vp, **vplist;
3052 3056 long num, cnt;
3053 3057
3054 3058 /*
3055 3059 * Check to see whether there is anything to do.
3056 3060 */
3057 3061 num = rnew;
3058 3062 if (num == 0)
3059 3063 return;
3060 3064
3061 3065 /*
3062 3066 * Allocate a slot for all currently active rnodes on the
3063 3067 * supposition that they all may need flushing.
3064 3068 */
3065 3069 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3066 3070 cnt = 0;
3067 3071
3068 3072 /*
3069 3073 * Walk the hash queues looking for rnodes with page
3070 3074 * lists associated with them. Make a list of these
3071 3075 * files.
3072 3076 */
3073 3077 for (index = 0; index < rtablesize; index++) {
3074 3078 rw_enter(&rtable[index].r_lock, RW_READER);
3075 3079 for (rp = rtable[index].r_hashf;
3076 3080 rp != (rnode_t *)(&rtable[index]);
3077 3081 rp = rp->r_hashf) {
3078 3082 vp = RTOV(rp);
3079 3083 /*
3080 3084 * Don't bother sync'ing a vp if it
3081 3085 			 * is part of a virtual swap device or
3082 3086 			 * if the VFS is read-only
3083 3087 */
3084 3088 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3085 3089 continue;
3086 3090 /*
3087 3091 			 * If we are flushing all mounted file systems or
3088 3092 			 * the vnode belongs to this vfs, and it has pages
3089 3093 * and is marked as either dirty or mmap'd,
3090 3094 * hold and add this vnode to the list of
3091 3095 * vnodes to flush.
3092 3096 */
3093 3097 if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3094 3098 vn_has_cached_data(vp) &&
3095 3099 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3096 3100 VN_HOLD(vp);
3097 3101 vplist[cnt++] = vp;
3098 3102 if (cnt == num) {
3099 3103 rw_exit(&rtable[index].r_lock);
3100 3104 goto toomany;
3101 3105 }
3102 3106 }
3103 3107 }
3104 3108 rw_exit(&rtable[index].r_lock);
3105 3109 }
3106 3110 toomany:
3107 3111
3108 3112 /*
3109 3113 * Flush and release all of the files on the list.
3110 3114 */
3111 3115 while (cnt-- > 0) {
3112 3116 vp = vplist[cnt];
3113 3117 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3114 3118 VN_RELE(vp);
3115 3119 }
3116 3120
3117 3121 /*
3118 3122 * Free the space allocated to hold the list.
3119 3123 */
3120 3124 kmem_free(vplist, num * sizeof (*vplist));
3121 3125 }
3122 3126
3123 3127 /*
3124 3128 * This probably needs to be larger than or equal to
3125 3129 * log2(sizeof (struct rnode)) due to the way that rnodes are
3126 3130 * allocated.
3127 3131 */
3128 3132 #define ACACHE_SHIFT_BITS 9
3129 3133
3130 3134 static int
3131 3135 acachehash(rnode_t *rp, cred_t *cr)
3132 3136 {
3133 3137
3134 3138 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3135 3139 acachemask);
3136 3140 }
3137 3141
3138 3142 #ifdef DEBUG
3139 3143 static long nfs_access_cache_hits = 0;
3140 3144 static long nfs_access_cache_misses = 0;
3141 3145 #endif
3142 3146
3143 3147 nfs_access_type_t
3144 3148 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3145 3149 {
3146 3150 vnode_t *vp;
3147 3151 acache_t *ap;
3148 3152 acache_hash_t *hp;
3149 3153 nfs_access_type_t all;
3150 3154
3151 3155 vp = RTOV(rp);
3152 3156 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3153 3157 return (NFS_ACCESS_UNKNOWN);
3154 3158
3155 3159 if (rp->r_acache != NULL) {
3156 3160 hp = &acache[acachehash(rp, cr)];
3157 3161 rw_enter(&hp->lock, RW_READER);
3158 3162 ap = hp->next;
3159 3163 while (ap != (acache_t *)hp) {
3160 3164 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3161 3165 if ((ap->known & acc) == acc) {
3162 3166 #ifdef DEBUG
3163 3167 nfs_access_cache_hits++;
3164 3168 #endif
3165 3169 if ((ap->allowed & acc) == acc)
3166 3170 all = NFS_ACCESS_ALLOWED;
3167 3171 else
3168 3172 all = NFS_ACCESS_DENIED;
3169 3173 } else {
3170 3174 #ifdef DEBUG
3171 3175 nfs_access_cache_misses++;
3172 3176 #endif
3173 3177 all = NFS_ACCESS_UNKNOWN;
3174 3178 }
3175 3179 rw_exit(&hp->lock);
3176 3180 return (all);
3177 3181 }
3178 3182 ap = ap->next;
3179 3183 }
3180 3184 rw_exit(&hp->lock);
3181 3185 }
3182 3186
3183 3187 #ifdef DEBUG
3184 3188 nfs_access_cache_misses++;
3185 3189 #endif
3186 3190 return (NFS_ACCESS_UNKNOWN);
3187 3191 }
3188 3192
3189 3193 void
3190 3194 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3191 3195 {
3192 3196 acache_t *ap;
3193 3197 acache_t *nap;
3194 3198 acache_hash_t *hp;
3195 3199
3196 3200 hp = &acache[acachehash(rp, cr)];
3197 3201
3198 3202 /*
3199 3203 	 * Allocate now, assuming that an allocation will usually be
3200 3204 * required. This allows the allocation to happen without
3201 3205 * holding the hash bucket locked.
3202 3206 */
3203 3207 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3204 3208 if (nap != NULL) {
3205 3209 nap->known = acc;
3206 3210 nap->allowed = resacc;
3207 3211 nap->rnode = rp;
3208 3212 crhold(cr);
3209 3213 nap->cred = cr;
3210 3214 nap->hashq = hp;
3211 3215 }
3212 3216
3213 3217 rw_enter(&hp->lock, RW_WRITER);
3214 3218
3215 3219 if (rp->r_acache != NULL) {
3216 3220 ap = hp->next;
3217 3221 while (ap != (acache_t *)hp) {
3218 3222 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3219 3223 ap->known |= acc;
3220 3224 ap->allowed &= ~acc;
3221 3225 ap->allowed |= resacc;
3222 3226 rw_exit(&hp->lock);
3223 3227 if (nap != NULL) {
3224 3228 crfree(nap->cred);
3225 3229 kmem_cache_free(acache_cache, nap);
3226 3230 }
3227 3231 return;
3228 3232 }
3229 3233 ap = ap->next;
3230 3234 }
3231 3235 }
3232 3236
3233 3237 if (nap != NULL) {
3234 3238 #ifdef DEBUG
3235 3239 clstat_debug.access.value.ui64++;
3236 3240 #endif
3237 3241 nap->next = hp->next;
3238 3242 hp->next = nap;
3239 3243 nap->next->prev = nap;
3240 3244 nap->prev = (acache_t *)hp;
3241 3245
3242 3246 mutex_enter(&rp->r_statelock);
3243 3247 nap->list = rp->r_acache;
3244 3248 rp->r_acache = nap;
3245 3249 mutex_exit(&rp->r_statelock);
3246 3250 }
3247 3251
3248 3252 rw_exit(&hp->lock);
3249 3253 }
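
nfs_access_check() and nfs_access_cache() are meant to bracket the over-the-wire ACCESS request: consult the cache first, go over the wire only on NFS_ACCESS_UNKNOWN, then record what the server answered. A hedged sketch of that pattern follows; the over-the-wire helper name is a hypothetical stand-in, not a function defined in this file.

/*
 * Illustrative sketch of the check/fill pattern; sketch_access_otw() is
 * a hypothetical stand-in for the real over-the-wire ACCESS call.
 */
static int
sketch_access(rnode_t *rp, uint32_t acc, cred_t *cr)
{
	uint32_t resacc;
	int error;

	switch (nfs_access_check(rp, acc, cr)) {
	case NFS_ACCESS_ALLOWED:
		return (0);
	case NFS_ACCESS_DENIED:
		return (EACCES);
	default:				/* NFS_ACCESS_UNKNOWN */
		break;
	}

	error = sketch_access_otw(rp, acc, cr, &resacc);	/* hypothetical */
	if (error == 0) {
		nfs_access_cache(rp, acc, resacc, cr);
		if ((resacc & acc) != acc)
			error = EACCES;
	}
	return (error);
}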
3250 3254
3251 3255 int
3252 3256 nfs_access_purge_rp(rnode_t *rp)
3253 3257 {
3254 3258 acache_t *ap;
3255 3259 acache_t *tmpap;
3256 3260 acache_t *rplist;
3257 3261
3258 3262 /*
3259 3263 * If there aren't any cached entries, then there is nothing
3260 3264 * to free.
3261 3265 */
3262 3266 if (rp->r_acache == NULL)
3263 3267 return (0);
3264 3268
3265 3269 mutex_enter(&rp->r_statelock);
3266 3270 rplist = rp->r_acache;
3267 3271 rp->r_acache = NULL;
3268 3272 mutex_exit(&rp->r_statelock);
3269 3273
3270 3274 /*
3271 3275 	 * Loop through each entry in the list pointed to by the
3272 3276 * rnode. Remove each of these entries from the hash
3273 3277 * queue that it is on and remove it from the list in
3274 3278 * the rnode.
3275 3279 */
3276 3280 for (ap = rplist; ap != NULL; ap = tmpap) {
3277 3281 rw_enter(&ap->hashq->lock, RW_WRITER);
3278 3282 ap->prev->next = ap->next;
3279 3283 ap->next->prev = ap->prev;
3280 3284 rw_exit(&ap->hashq->lock);
3281 3285
3282 3286 tmpap = ap->list;
3283 3287 crfree(ap->cred);
3284 3288 kmem_cache_free(acache_cache, ap);
3285 3289 #ifdef DEBUG
3286 3290 clstat_debug.access.value.ui64--;
3287 3291 #endif
3288 3292 }
3289 3293
3290 3294 return (1);
3291 3295 }
3292 3296
3293 3297 static const char prefix[] = ".nfs";
3294 3298
3295 3299 static kmutex_t newnum_lock;
3296 3300
3297 3301 int
3298 3302 newnum(void)
3299 3303 {
3300 3304 static uint_t newnum = 0;
3301 3305 uint_t id;
3302 3306
3303 3307 mutex_enter(&newnum_lock);
3304 3308 if (newnum == 0)
3305 3309 newnum = gethrestime_sec() & 0xffff;
3306 3310 id = newnum++;
3307 3311 mutex_exit(&newnum_lock);
3308 3312 return (id);
3309 3313 }
3310 3314
3311 3315 char *
3312 3316 newname(void)
3313 3317 {
3314 3318 char *news;
3315 3319 char *s;
3316 3320 const char *p;
3317 3321 uint_t id;
3318 3322
3319 3323 id = newnum();
3320 3324 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3321 3325 s = news;
3322 3326 p = prefix;
3323 3327 while (*p != '\0')
3324 3328 *s++ = *p++;
3325 3329 while (id != 0) {
3326 3330 *s++ = "0123456789ABCDEF"[id & 0x0f];
3327 3331 id >>= 4;
3328 3332 }
3329 3333 *s = '\0';
3330 3334 return (news);
3331 3335 }
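
One subtlety in newname(): the id's hex digits are emitted least-significant nibble first, so an id of 0x1234 produces ".nfs4321" rather than ".nfs1234". The names only need to be distinct, so the order is harmless; the user-space snippet below simply reproduces the loop to make the behaviour concrete.

/* Illustrative user-space reproduction of the digit loop in newname(). */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned int id = 0x1234;	/* example id */
	char buf[32];
	char *s;

	(void) strcpy(buf, ".nfs");
	s = buf + strlen(buf);
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
	(void) printf("%s\n", buf);	/* prints ".nfs4321" */
	return (0);
}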
3332 3336
3333 3337 /*
3334 3338 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3335 3339 * framework.
3336 3340 */
3337 3341 static int
3338 3342 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3339 3343 {
3340 3344 ksp->ks_snaptime = gethrtime();
3341 3345 if (rw == KSTAT_WRITE) {
3342 3346 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3343 3347 #ifdef DEBUG
3344 3348 /*
3345 3349 * Currently only the global zone can write to kstats, but we
3346 3350 * add the check just for paranoia.
3347 3351 */
3348 3352 if (INGLOBALZONE(curproc))
3349 3353 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3350 3354 sizeof (clstat_debug));
3351 3355 #endif
3352 3356 } else {
3353 3357 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3354 3358 #ifdef DEBUG
3355 3359 /*
3356 3360 * If we're displaying the "global" debug kstat values, we
3357 3361 * display them as-is to all zones since in fact they apply to
3358 3362 * the system as a whole.
3359 3363 */
3360 3364 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3361 3365 sizeof (clstat_debug));
3362 3366 #endif
3363 3367 }
3364 3368 return (0);
3365 3369 }
3366 3370
3367 3371 static void *
3368 3372 clinit_zone(zoneid_t zoneid)
3369 3373 {
3370 3374 kstat_t *nfs_client_kstat;
3371 3375 struct nfs_clnt *nfscl;
3372 3376 uint_t ndata;
3373 3377
3374 3378 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3375 3379 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3376 3380 nfscl->nfscl_chtable = NULL;
3377 3381 nfscl->nfscl_zoneid = zoneid;
3378 3382
3379 3383 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3380 3384 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3381 3385 #ifdef DEBUG
3382 3386 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3383 3387 #endif
3384 3388 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3385 3389 "misc", KSTAT_TYPE_NAMED, ndata,
3386 3390 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3387 3391 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3388 3392 nfs_client_kstat->ks_snapshot = cl_snapshot;
3389 3393 kstat_install(nfs_client_kstat);
3390 3394 }
3391 3395 mutex_enter(&nfs_clnt_list_lock);
3392 3396 list_insert_head(&nfs_clnt_list, nfscl);
3393 3397 mutex_exit(&nfs_clnt_list_lock);
3394 3398 return (nfscl);
3395 3399 }
3396 3400
3397 3401 /*ARGSUSED*/
3398 3402 static void
3399 3403 clfini_zone(zoneid_t zoneid, void *arg)
3400 3404 {
3401 3405 struct nfs_clnt *nfscl = arg;
3402 3406 chhead_t *chp, *next;
3403 3407
3404 3408 if (nfscl == NULL)
3405 3409 return;
3406 3410 mutex_enter(&nfs_clnt_list_lock);
3407 3411 list_remove(&nfs_clnt_list, nfscl);
3408 3412 mutex_exit(&nfs_clnt_list_lock);
3409 3413 clreclaim_zone(nfscl, 0);
3410 3414 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3411 3415 ASSERT(chp->ch_list == NULL);
3412 3416 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3413 3417 next = chp->ch_next;
3414 3418 kmem_free(chp, sizeof (*chp));
3415 3419 }
3416 3420 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3417 3421 mutex_destroy(&nfscl->nfscl_chtable_lock);
3418 3422 kmem_free(nfscl, sizeof (*nfscl));
3419 3423 }
3420 3424
3421 3425 /*
3422 3426 * Called by endpnt_destructor to make sure the client handles are
3423 3427 * cleaned up before the RPC endpoints. This becomes a no-op if
3424 3428 * clfini_zone (above) is called first. This function is needed
3425 3429 * (rather than relying on clfini_zone to clean up) because the ZSD
3426 3430 * callbacks have no ordering mechanism, so we have no way to ensure
3427 3431 * that clfini_zone is called before endpnt_destructor.
3428 3432 */
3429 3433 void
3430 3434 clcleanup_zone(zoneid_t zoneid)
3431 3435 {
3432 3436 struct nfs_clnt *nfscl;
3433 3437
3434 3438 mutex_enter(&nfs_clnt_list_lock);
3435 3439 nfscl = list_head(&nfs_clnt_list);
3436 3440 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3437 3441 if (nfscl->nfscl_zoneid == zoneid) {
3438 3442 clreclaim_zone(nfscl, 0);
3439 3443 break;
3440 3444 }
3441 3445 }
3442 3446 mutex_exit(&nfs_clnt_list_lock);
3443 3447 }
3444 3448
3445 3449 int
3446 3450 nfs_subrinit(void)
3447 3451 {
3448 3452 int i;
3449 3453 ulong_t nrnode_max;
3450 3454
3451 3455 /*
3452 3456 * Allocate and initialize the rnode hash queues
3453 3457 */
3454 3458 if (nrnode <= 0)
3455 3459 nrnode = ncsize;
3456 3460 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3457 3461 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3458 3462 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3459 3463 "!setting nrnode to max value of %ld", nrnode_max);
3460 3464 nrnode = nrnode_max;
3461 3465 }
3462 3466
3463 3467 rtablesize = 1 << highbit(nrnode / hashlen);
3464 3468 rtablemask = rtablesize - 1;
3465 3469 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3466 3470 for (i = 0; i < rtablesize; i++) {
3467 3471 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3468 3472 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3469 3473 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3470 3474 }
3471 3475 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3472 3476 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3473 3477
3474 3478 /*
3475 3479 * Allocate and initialize the access cache
3476 3480 */
3477 3481
3478 3482 /*
3479 3483 	 * Initial guess is one access cache entry per rnode.  If
3480 3484 	 * nacache is set to a non-zero value, it is used instead as
3481 3485 	 * the guess at the number of access cache entries.
3482 3486 */
3483 3487 if (nacache > 0)
3484 3488 acachesize = 1 << highbit(nacache / hashlen);
3485 3489 else
3486 3490 acachesize = rtablesize;
3487 3491 acachemask = acachesize - 1;
3488 3492 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3489 3493 for (i = 0; i < acachesize; i++) {
3490 3494 acache[i].next = (acache_t *)&acache[i];
3491 3495 acache[i].prev = (acache_t *)&acache[i];
3492 3496 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3493 3497 }
3494 3498 acache_cache = kmem_cache_create("nfs_access_cache",
3495 3499 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3496 3500 /*
3497 3501 * Allocate and initialize the client handle cache
3498 3502 */
3499 3503 chtab_cache = kmem_cache_create("client_handle_cache",
3500 3504 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3501 3505 /*
3502 3506 * Initialize the list of per-zone client handles (and associated data).
3503 3507 * This needs to be done before we call zone_key_create().
3504 3508 */
3505 3509 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3506 3510 offsetof(struct nfs_clnt, nfscl_node));
3507 3511 /*
3508 3512 * Initialize the zone_key for per-zone client handle lists.
3509 3513 */
3510 3514 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3511 3515 /*
3512 3516 * Initialize the various mutexes and reader/writer locks
3513 3517 */
3514 3518 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3515 3519 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3516 3520 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3517 3521
3518 3522 /*
3519 3523 * Assign unique major number for all nfs mounts
3520 3524 */
3521 3525 if ((nfs_major = getudev()) == -1) {
3522 3526 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3523 3527 "nfs: init: can't get unique device number");
3524 3528 nfs_major = 0;
3525 3529 }
3526 3530 nfs_minor = 0;
3527 3531
3528 3532 if (nfs3_jukebox_delay == 0)
3529 3533 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3530 3534
3531 3535 return (0);
3532 3536 }
3533 3537
3534 3538 void
3535 3539 nfs_subrfini(void)
3536 3540 {
3537 3541 int i;
3538 3542
3539 3543 /*
3540 3544 * Deallocate the rnode hash queues
3541 3545 */
3542 3546 kmem_cache_destroy(rnode_cache);
3543 3547
3544 3548 for (i = 0; i < rtablesize; i++)
3545 3549 rw_destroy(&rtable[i].r_lock);
3546 3550 kmem_free(rtable, rtablesize * sizeof (*rtable));
3547 3551
3548 3552 /*
3549 3553 	 * Deallocate the access cache
3550 3554 */
3551 3555 kmem_cache_destroy(acache_cache);
3552 3556
3553 3557 for (i = 0; i < acachesize; i++)
3554 3558 rw_destroy(&acache[i].lock);
3555 3559 kmem_free(acache, acachesize * sizeof (*acache));
3556 3560
3557 3561 /*
3558 3562 * Deallocate the client handle cache
3559 3563 */
3560 3564 kmem_cache_destroy(chtab_cache);
3561 3565
3562 3566 /*
3563 3567 * Destroy the various mutexes and reader/writer locks
3564 3568 */
3565 3569 mutex_destroy(&rpfreelist_lock);
3566 3570 mutex_destroy(&newnum_lock);
3567 3571 mutex_destroy(&nfs_minor_lock);
3568 3572 (void) zone_key_delete(nfsclnt_zone_key);
3569 3573 }
3570 3574
3571 3575 enum nfsstat
3572 3576 puterrno(int error)
3573 3577 {
3574 3578
3575 3579 switch (error) {
3576 3580 case EOPNOTSUPP:
3577 3581 return (NFSERR_OPNOTSUPP);
3578 3582 case ENAMETOOLONG:
3579 3583 return (NFSERR_NAMETOOLONG);
3580 3584 case ENOTEMPTY:
3581 3585 return (NFSERR_NOTEMPTY);
3582 3586 case EDQUOT:
3583 3587 return (NFSERR_DQUOT);
3584 3588 case ESTALE:
3585 3589 return (NFSERR_STALE);
3586 3590 case EREMOTE:
3587 3591 return (NFSERR_REMOTE);
3588 3592 case ENOSYS:
3589 3593 return (NFSERR_OPNOTSUPP);
3590 3594 case EOVERFLOW:
3591 3595 return (NFSERR_INVAL);
3592 3596 default:
3593 3597 return ((enum nfsstat)error);
3594 3598 }
3595 3599 /* NOTREACHED */
3596 3600 }
3597 3601
3598 3602 int
3599 3603 geterrno(enum nfsstat status)
3600 3604 {
3601 3605
3602 3606 switch (status) {
3603 3607 case NFSERR_OPNOTSUPP:
3604 3608 return (EOPNOTSUPP);
3605 3609 case NFSERR_NAMETOOLONG:
3606 3610 return (ENAMETOOLONG);
3607 3611 case NFSERR_NOTEMPTY:
3608 3612 return (ENOTEMPTY);
3609 3613 case NFSERR_DQUOT:
3610 3614 return (EDQUOT);
3611 3615 case NFSERR_STALE:
3612 3616 return (ESTALE);
3613 3617 case NFSERR_REMOTE:
3614 3618 return (EREMOTE);
3615 3619 case NFSERR_WFLUSH:
3616 3620 return (EIO);
3617 3621 default:
3618 3622 return ((int)status);
3619 3623 }
3620 3624 /* NOTREACHED */
3621 3625 }
3622 3626
3623 3627 enum nfsstat3
3624 3628 puterrno3(int error)
3625 3629 {
3626 3630
3627 3631 #ifdef DEBUG
3628 3632 switch (error) {
3629 3633 case 0:
3630 3634 return (NFS3_OK);
3631 3635 case EPERM:
3632 3636 return (NFS3ERR_PERM);
3633 3637 case ENOENT:
3634 3638 return (NFS3ERR_NOENT);
3635 3639 case EIO:
3636 3640 return (NFS3ERR_IO);
3637 3641 case ENXIO:
3638 3642 return (NFS3ERR_NXIO);
3639 3643 case EACCES:
3640 3644 return (NFS3ERR_ACCES);
3641 3645 case EEXIST:
3642 3646 return (NFS3ERR_EXIST);
3643 3647 case EXDEV:
3644 3648 return (NFS3ERR_XDEV);
3645 3649 case ENODEV:
3646 3650 return (NFS3ERR_NODEV);
3647 3651 case ENOTDIR:
3648 3652 return (NFS3ERR_NOTDIR);
3649 3653 case EISDIR:
3650 3654 return (NFS3ERR_ISDIR);
3651 3655 case EINVAL:
3652 3656 return (NFS3ERR_INVAL);
3653 3657 case EFBIG:
3654 3658 return (NFS3ERR_FBIG);
3655 3659 case ENOSPC:
3656 3660 return (NFS3ERR_NOSPC);
3657 3661 case EROFS:
3658 3662 return (NFS3ERR_ROFS);
3659 3663 case EMLINK:
3660 3664 return (NFS3ERR_MLINK);
3661 3665 case ENAMETOOLONG:
3662 3666 return (NFS3ERR_NAMETOOLONG);
3663 3667 case ENOTEMPTY:
3664 3668 return (NFS3ERR_NOTEMPTY);
3665 3669 case EDQUOT:
3666 3670 return (NFS3ERR_DQUOT);
3667 3671 case ESTALE:
3668 3672 return (NFS3ERR_STALE);
3669 3673 case EREMOTE:
3670 3674 return (NFS3ERR_REMOTE);
3671 3675 case ENOSYS:
3672 3676 case EOPNOTSUPP:
3673 3677 return (NFS3ERR_NOTSUPP);
3674 3678 case EOVERFLOW:
3675 3679 return (NFS3ERR_INVAL);
3676 3680 default:
3677 3681 zcmn_err(getzoneid(), CE_WARN,
3678 3682 "puterrno3: got error %d", error);
3679 3683 return ((enum nfsstat3)error);
3680 3684 }
3681 3685 #else
3682 3686 switch (error) {
3683 3687 case ENAMETOOLONG:
3684 3688 return (NFS3ERR_NAMETOOLONG);
3685 3689 case ENOTEMPTY:
3686 3690 return (NFS3ERR_NOTEMPTY);
3687 3691 case EDQUOT:
3688 3692 return (NFS3ERR_DQUOT);
3689 3693 case ESTALE:
3690 3694 return (NFS3ERR_STALE);
3691 3695 case ENOSYS:
3692 3696 case EOPNOTSUPP:
3693 3697 return (NFS3ERR_NOTSUPP);
3694 3698 case EREMOTE:
3695 3699 return (NFS3ERR_REMOTE);
3696 3700 case EOVERFLOW:
3697 3701 return (NFS3ERR_INVAL);
3698 3702 default:
3699 3703 return ((enum nfsstat3)error);
3700 3704 }
3701 3705 #endif
3702 3706 }
3703 3707
3704 3708 int
3705 3709 geterrno3(enum nfsstat3 status)
3706 3710 {
3707 3711
3708 3712 #ifdef DEBUG
3709 3713 switch (status) {
3710 3714 case NFS3_OK:
3711 3715 return (0);
3712 3716 case NFS3ERR_PERM:
3713 3717 return (EPERM);
3714 3718 case NFS3ERR_NOENT:
3715 3719 return (ENOENT);
3716 3720 case NFS3ERR_IO:
3717 3721 return (EIO);
3718 3722 case NFS3ERR_NXIO:
3719 3723 return (ENXIO);
3720 3724 case NFS3ERR_ACCES:
3721 3725 return (EACCES);
3722 3726 case NFS3ERR_EXIST:
3723 3727 return (EEXIST);
3724 3728 case NFS3ERR_XDEV:
3725 3729 return (EXDEV);
3726 3730 case NFS3ERR_NODEV:
3727 3731 return (ENODEV);
3728 3732 case NFS3ERR_NOTDIR:
3729 3733 return (ENOTDIR);
3730 3734 case NFS3ERR_ISDIR:
3731 3735 return (EISDIR);
3732 3736 case NFS3ERR_INVAL:
3733 3737 return (EINVAL);
3734 3738 case NFS3ERR_FBIG:
3735 3739 return (EFBIG);
3736 3740 case NFS3ERR_NOSPC:
3737 3741 return (ENOSPC);
3738 3742 case NFS3ERR_ROFS:
3739 3743 return (EROFS);
3740 3744 case NFS3ERR_MLINK:
3741 3745 return (EMLINK);
3742 3746 case NFS3ERR_NAMETOOLONG:
3743 3747 return (ENAMETOOLONG);
3744 3748 case NFS3ERR_NOTEMPTY:
3745 3749 return (ENOTEMPTY);
3746 3750 case NFS3ERR_DQUOT:
3747 3751 return (EDQUOT);
3748 3752 case NFS3ERR_STALE:
3749 3753 return (ESTALE);
3750 3754 case NFS3ERR_REMOTE:
3751 3755 return (EREMOTE);
3752 3756 case NFS3ERR_BADHANDLE:
3753 3757 return (ESTALE);
3754 3758 case NFS3ERR_NOT_SYNC:
3755 3759 return (EINVAL);
3756 3760 case NFS3ERR_BAD_COOKIE:
3757 3761 return (ENOENT);
3758 3762 case NFS3ERR_NOTSUPP:
3759 3763 return (EOPNOTSUPP);
3760 3764 case NFS3ERR_TOOSMALL:
3761 3765 return (EINVAL);
3762 3766 case NFS3ERR_SERVERFAULT:
3763 3767 return (EIO);
3764 3768 case NFS3ERR_BADTYPE:
3765 3769 return (EINVAL);
3766 3770 case NFS3ERR_JUKEBOX:
3767 3771 return (ENXIO);
3768 3772 default:
3769 3773 zcmn_err(getzoneid(), CE_WARN,
3770 3774 "geterrno3: got status %d", status);
3771 3775 return ((int)status);
3772 3776 }
3773 3777 #else
3774 3778 switch (status) {
3775 3779 case NFS3ERR_NAMETOOLONG:
3776 3780 return (ENAMETOOLONG);
3777 3781 case NFS3ERR_NOTEMPTY:
3778 3782 return (ENOTEMPTY);
3779 3783 case NFS3ERR_DQUOT:
3780 3784 return (EDQUOT);
3781 3785 case NFS3ERR_STALE:
3782 3786 case NFS3ERR_BADHANDLE:
3783 3787 return (ESTALE);
3784 3788 case NFS3ERR_NOTSUPP:
3785 3789 return (EOPNOTSUPP);
3786 3790 case NFS3ERR_REMOTE:
3787 3791 return (EREMOTE);
3788 3792 case NFS3ERR_NOT_SYNC:
3789 3793 case NFS3ERR_TOOSMALL:
3790 3794 case NFS3ERR_BADTYPE:
3791 3795 return (EINVAL);
3792 3796 case NFS3ERR_BAD_COOKIE:
3793 3797 return (ENOENT);
3794 3798 case NFS3ERR_SERVERFAULT:
3795 3799 return (EIO);
3796 3800 case NFS3ERR_JUKEBOX:
3797 3801 return (ENXIO);
3798 3802 default:
3799 3803 return ((int)status);
3800 3804 }
3801 3805 #endif
3802 3806 }
3803 3807
3804 3808 rddir_cache *
3805 3809 rddir_cache_alloc(int flags)
3806 3810 {
3807 3811 rddir_cache *rc;
3808 3812
3809 3813 rc = kmem_alloc(sizeof (*rc), flags);
3810 3814 if (rc != NULL) {
3811 3815 rc->entries = NULL;
3812 3816 rc->flags = RDDIR;
3813 3817 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3814 3818 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3815 3819 rc->count = 1;
3816 3820 #ifdef DEBUG
3817 3821 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3818 3822 #endif
3819 3823 }
3820 3824 return (rc);
3821 3825 }
3822 3826
3823 3827 static void
3824 3828 rddir_cache_free(rddir_cache *rc)
3825 3829 {
3826 3830
3827 3831 #ifdef DEBUG
3828 3832 atomic_dec_64(&clstat_debug.dirent.value.ui64);
3829 3833 #endif
3830 3834 if (rc->entries != NULL) {
3831 3835 #ifdef DEBUG
3832 3836 rddir_cache_buf_free(rc->entries, rc->buflen);
3833 3837 #else
3834 3838 kmem_free(rc->entries, rc->buflen);
3835 3839 #endif
3836 3840 }
3837 3841 cv_destroy(&rc->cv);
3838 3842 mutex_destroy(&rc->lock);
3839 3843 kmem_free(rc, sizeof (*rc));
3840 3844 }
3841 3845
3842 3846 void
3843 3847 rddir_cache_hold(rddir_cache *rc)
3844 3848 {
3845 3849
3846 3850 mutex_enter(&rc->lock);
3847 3851 rc->count++;
3848 3852 mutex_exit(&rc->lock);
3849 3853 }
3850 3854
3851 3855 void
3852 3856 rddir_cache_rele(rddir_cache *rc)
3853 3857 {
3854 3858
3855 3859 mutex_enter(&rc->lock);
3856 3860 ASSERT(rc->count > 0);
3857 3861 if (--rc->count == 0) {
3858 3862 mutex_exit(&rc->lock);
3859 3863 rddir_cache_free(rc);
3860 3864 } else
3861 3865 mutex_exit(&rc->lock);
3862 3866 }
3863 3867
3864 3868 #ifdef DEBUG
3865 3869 char *
3866 3870 rddir_cache_buf_alloc(size_t size, int flags)
3867 3871 {
3868 3872 char *rc;
3869 3873
3870 3874 rc = kmem_alloc(size, flags);
3871 3875 if (rc != NULL)
3872 3876 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3873 3877 return (rc);
3874 3878 }
3875 3879
3876 3880 void
3877 3881 rddir_cache_buf_free(void *addr, size_t size)
3878 3882 {
3879 3883
3880 3884 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3881 3885 kmem_free(addr, size);
3882 3886 }
3883 3887 #endif
3884 3888
3885 3889 static int
3886 3890 nfs_free_data_reclaim(rnode_t *rp)
3887 3891 {
3888 3892 char *contents;
3889 3893 int size;
3890 3894 vsecattr_t *vsp;
3891 3895 nfs3_pathconf_info *info;
3892 3896 int freed;
3893 3897 cred_t *cred;
3894 3898
3895 3899 /*
3896 3900 * Free any held credentials and caches which
3897 3901 * may be associated with this rnode.
3898 3902 */
3899 3903 mutex_enter(&rp->r_statelock);
3900 3904 cred = rp->r_cred;
3901 3905 rp->r_cred = NULL;
3902 3906 contents = rp->r_symlink.contents;
3903 3907 size = rp->r_symlink.size;
3904 3908 rp->r_symlink.contents = NULL;
3905 3909 vsp = rp->r_secattr;
3906 3910 rp->r_secattr = NULL;
3907 3911 info = rp->r_pathconf;
3908 3912 rp->r_pathconf = NULL;
3909 3913 mutex_exit(&rp->r_statelock);
3910 3914
3911 3915 if (cred != NULL)
3912 3916 crfree(cred);
3913 3917
3914 3918 /*
3915 3919 * Free the access cache entries.
3916 3920 */
3917 3921 freed = nfs_access_purge_rp(rp);
3918 3922
3919 3923 if (!HAVE_RDDIR_CACHE(rp) &&
3920 3924 contents == NULL &&
3921 3925 vsp == NULL &&
3922 3926 info == NULL)
3923 3927 return (freed);
3924 3928
3925 3929 /*
3926 3930 * Free the readdir cache entries
3927 3931 */
3928 3932 if (HAVE_RDDIR_CACHE(rp))
3929 3933 nfs_purge_rddir_cache(RTOV(rp));
3930 3934
3931 3935 /*
3932 3936 * Free the symbolic link cache.
3933 3937 */
3934 3938 if (contents != NULL) {
3935 3939
3936 3940 kmem_free((void *)contents, size);
3937 3941 }
3938 3942
3939 3943 /*
3940 3944 * Free any cached ACL.
3941 3945 */
3942 3946 if (vsp != NULL)
3943 3947 nfs_acl_free(vsp);
3944 3948
3945 3949 /*
3946 3950 * Free any cached pathconf information.
3947 3951 */
3948 3952 if (info != NULL)
3949 3953 kmem_free(info, sizeof (*info));
3950 3954
3951 3955 return (1);
3952 3956 }
3953 3957
3954 3958 static int
3955 3959 nfs_active_data_reclaim(rnode_t *rp)
3956 3960 {
3957 3961 char *contents;
3958 3962 int size;
3959 3963 vsecattr_t *vsp;
3960 3964 nfs3_pathconf_info *info;
3961 3965 int freed;
3962 3966
3963 3967 /*
3964 3968 * Free any held credentials and caches which
3965 3969 * may be associated with this rnode.
3966 3970 */
3967 3971 if (!mutex_tryenter(&rp->r_statelock))
3968 3972 return (0);
3969 3973 contents = rp->r_symlink.contents;
3970 3974 size = rp->r_symlink.size;
3971 3975 rp->r_symlink.contents = NULL;
3972 3976 vsp = rp->r_secattr;
3973 3977 rp->r_secattr = NULL;
3974 3978 info = rp->r_pathconf;
3975 3979 rp->r_pathconf = NULL;
3976 3980 mutex_exit(&rp->r_statelock);
3977 3981
3978 3982 /*
3979 3983 * Free the access cache entries.
3980 3984 */
3981 3985 freed = nfs_access_purge_rp(rp);
3982 3986
3983 3987 if (!HAVE_RDDIR_CACHE(rp) &&
3984 3988 contents == NULL &&
3985 3989 vsp == NULL &&
3986 3990 info == NULL)
3987 3991 return (freed);
3988 3992
3989 3993 /*
3990 3994 * Free the readdir cache entries
3991 3995 */
3992 3996 if (HAVE_RDDIR_CACHE(rp))
3993 3997 nfs_purge_rddir_cache(RTOV(rp));
3994 3998
3995 3999 /*
3996 4000 * Free the symbolic link cache.
3997 4001 */
3998 4002 if (contents != NULL) {
3999 4003
4000 4004 kmem_free((void *)contents, size);
4001 4005 }
4002 4006
4003 4007 /*
4004 4008 * Free any cached ACL.
4005 4009 */
4006 4010 if (vsp != NULL)
4007 4011 nfs_acl_free(vsp);
4008 4012
4009 4013 /*
4010 4014 * Free any cached pathconf information.
4011 4015 */
4012 4016 if (info != NULL)
4013 4017 kmem_free(info, sizeof (*info));
4014 4018
4015 4019 return (1);
4016 4020 }
4017 4021
4018 4022 static int
4019 4023 nfs_free_reclaim(void)
4020 4024 {
4021 4025 int freed;
4022 4026 rnode_t *rp;
4023 4027
4024 4028 #ifdef DEBUG
4025 4029 clstat_debug.f_reclaim.value.ui64++;
4026 4030 #endif
4027 4031 freed = 0;
4028 4032 mutex_enter(&rpfreelist_lock);
4029 4033 rp = rpfreelist;
4030 4034 if (rp != NULL) {
4031 4035 do {
4032 4036 if (nfs_free_data_reclaim(rp))
4033 4037 freed = 1;
4034 4038 } while ((rp = rp->r_freef) != rpfreelist);
4035 4039 }
4036 4040 mutex_exit(&rpfreelist_lock);
4037 4041 return (freed);
4038 4042 }
4039 4043
4040 4044 static int
4041 4045 nfs_active_reclaim(void)
4042 4046 {
4043 4047 int freed;
4044 4048 int index;
4045 4049 rnode_t *rp;
4046 4050
4047 4051 #ifdef DEBUG
4048 4052 clstat_debug.a_reclaim.value.ui64++;
4049 4053 #endif
4050 4054 freed = 0;
4051 4055 for (index = 0; index < rtablesize; index++) {
4052 4056 rw_enter(&rtable[index].r_lock, RW_READER);
4053 4057 for (rp = rtable[index].r_hashf;
4054 4058 rp != (rnode_t *)(&rtable[index]);
4055 4059 rp = rp->r_hashf) {
4056 4060 if (nfs_active_data_reclaim(rp))
4057 4061 freed = 1;
4058 4062 }
4059 4063 rw_exit(&rtable[index].r_lock);
4060 4064 }
4061 4065 return (freed);
4062 4066 }
4063 4067
4064 4068 static int
4065 4069 nfs_rnode_reclaim(void)
4066 4070 {
4067 4071 int freed;
4068 4072 rnode_t *rp;
4069 4073 vnode_t *vp;
4070 4074
4071 4075 #ifdef DEBUG
4072 4076 clstat_debug.r_reclaim.value.ui64++;
4073 4077 #endif
4074 4078 freed = 0;
4075 4079 mutex_enter(&rpfreelist_lock);
4076 4080 while ((rp = rpfreelist) != NULL) {
4077 4081 rp_rmfree(rp);
4078 4082 mutex_exit(&rpfreelist_lock);
4079 4083 if (rp->r_flags & RHASHED) {
4080 4084 vp = RTOV(rp);
4081 4085 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4082 4086 mutex_enter(&vp->v_lock);
4083 4087 if (vp->v_count > 1) {
4084 4088 VN_RELE_LOCKED(vp);
4085 4089 mutex_exit(&vp->v_lock);
4086 4090 rw_exit(&rp->r_hashq->r_lock);
4087 4091 mutex_enter(&rpfreelist_lock);
4088 4092 continue;
4089 4093 }
4090 4094 mutex_exit(&vp->v_lock);
4091 4095 rp_rmhash_locked(rp);
4092 4096 rw_exit(&rp->r_hashq->r_lock);
4093 4097 }
4094 4098 /*
4095 4099 * This call to rp_addfree will end up destroying the
4096 4100 * rnode, but in a safe way with the appropriate set
4097 4101 * of checks done.
4098 4102 */
4099 4103 rp_addfree(rp, CRED());
4100 4104 mutex_enter(&rpfreelist_lock);
4101 4105 }
4102 4106 mutex_exit(&rpfreelist_lock);
4103 4107 return (freed);
4104 4108 }
4105 4109
4106 4110 /*ARGSUSED*/
4107 4111 static void
4108 4112 nfs_reclaim(void *cdrarg)
4109 4113 {
4110 4114
4111 4115 #ifdef DEBUG
4112 4116 clstat_debug.reclaim.value.ui64++;
4113 4117 #endif
4114 4118 if (nfs_free_reclaim())
4115 4119 return;
4116 4120
4117 4121 if (nfs_active_reclaim())
4118 4122 return;
4119 4123
4120 4124 (void) nfs_rnode_reclaim();
4121 4125 }
4122 4126
4123 4127 /*
4124 4128 * NFS client failover support
4125 4129 *
4126 4130 * Routines to copy filehandles
4127 4131 */
4128 4132 void
4129 4133 nfscopyfh(caddr_t fhp, vnode_t *vp)
4130 4134 {
4131 4135 fhandle_t *dest = (fhandle_t *)fhp;
4132 4136
4133 4137 if (dest != NULL)
4134 4138 *dest = *VTOFH(vp);
4135 4139 }
4136 4140
4137 4141 void
4138 4142 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4139 4143 {
4140 4144 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4141 4145
4142 4146 if (dest != NULL)
4143 4147 *dest = *VTOFH3(vp);
4144 4148 }
4145 4149
4146 4150 /*
4147 4151 * NFS client failover support
4148 4152 *
4149 4153 * failover_safe() will test various conditions to ensure that
4150 4154 * failover is permitted for this vnode. It will be denied
4151 4155 * if:
4152 4156 * 1) the operation in progress does not support failover (NULL fi)
4153 4157 * 2) there are no available replicas (NULL mi_servers->sv_next)
4154 4158 * 3) any locks are outstanding on this file
4155 4159 */
4156 4160 static int
4157 4161 failover_safe(failinfo_t *fi)
4158 4162 {
4159 4163
4160 4164 /*
4161 4165 * Does this op permit failover?
4162 4166 */
4163 4167 if (fi == NULL || fi->vp == NULL)
4164 4168 return (0);
4165 4169
4166 4170 /*
4167 4171 * Are there any alternates to failover to?
4168 4172 */
4169 4173 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4170 4174 return (0);
4171 4175
4172 4176 /*
4173 4177 * Disable check; we've forced local locking
4174 4178 *
4175 4179 * if (flk_has_remote_locks(fi->vp))
4176 4180 * return (0);
4177 4181 */
4178 4182
4179 4183 /*
4180 4184 * If we have no partial path, we can't do anything
4181 4185 */
4182 4186 if (VTOR(fi->vp)->r_path == NULL)
4183 4187 return (0);
4184 4188
4185 4189 return (1);
4186 4190 }
4187 4191
4188 4192 #include <sys/thread.h>
4189 4193
4190 4194 /*
4191 4195 * NFS client failover support
4192 4196 *
4193 4197 * failover_newserver() will start a search for a new server,
4194 4198 * preferably by starting an async thread to do the work. If
4195 4199 * someone is already doing this (recognizable by MI_BINDINPROG
4196 4200 * being set), it will simply return and the calling thread
4197 4201 * will queue on the mi_failover_cv condition variable.
4198 4202 */
4199 4203 static void
4200 4204 failover_newserver(mntinfo_t *mi)
4201 4205 {
4202 4206 /*
4203 4207 * Check if someone else is doing this already
4204 4208 */
4205 4209 mutex_enter(&mi->mi_lock);
4206 4210 if (mi->mi_flags & MI_BINDINPROG) {
4207 4211 mutex_exit(&mi->mi_lock);
4208 4212 return;
4209 4213 }
4210 4214 mi->mi_flags |= MI_BINDINPROG;
4211 4215
4212 4216 /*
4213 4217 * Need to hold the vfs struct so that it can't be released
4214 4218 * while the failover thread is selecting a new server.
4215 4219 */
4216 4220 VFS_HOLD(mi->mi_vfsp);
4217 4221
4218 4222 /*
4219 4223 * Start a thread to do the real searching.
4220 4224 */
4221 4225 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4222 4226
4223 4227 mutex_exit(&mi->mi_lock);
4224 4228 }
4225 4229
4226 4230 /*
4227 4231 * NFS client failover support
4228 4232 *
4229 4233 * failover_thread() will find a new server to replace the one
4230 4234 * currently in use, wake up other threads waiting on this mount
4231 4235 * point, and die. It will start at the head of the server list
4232 4236 * and poll servers until it finds one with an NFS server which is
4233 4237 * registered and responds to a NULL procedure ping.
4234 4238 *
4235 4239 * XXX failover_thread is unsafe within the scope of the
4236 4240 * present model defined for cpr to suspend the system.
4237 4241 * Specifically, over-the-wire calls made by the thread
4238 4242 * are unsafe. The thread needs to be reevaluated in case of
4239 4243 * future updates to the cpr suspend model.
4240 4244 */
4241 4245 static void
4242 4246 failover_thread(mntinfo_t *mi)
4243 4247 {
4244 4248 servinfo_t *svp = NULL;
4245 4249 CLIENT *cl;
4246 4250 enum clnt_stat status;
4247 4251 struct timeval tv;
4248 4252 int error;
4249 4253 int oncethru = 0;
4250 4254 callb_cpr_t cprinfo;
4251 4255 rnode_t *rp;
4252 4256 int index;
4253 4257 char *srvnames;
4254 4258 size_t srvnames_len;
4255 4259 struct nfs_clnt *nfscl = NULL;
4256 4260 zoneid_t zoneid = getzoneid();
4257 4261
4258 4262 #ifdef DEBUG
4259 4263 /*
4260 4264 * This is currently only needed to access counters which exist on
4261 4265 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4262 4266 * on non-DEBUG kernels.
4263 4267 */
4264 4268 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4265 4269 ASSERT(nfscl != NULL);
4266 4270 #endif
4267 4271
4268 4272 /*
4269 4273 	 * It's safe to piggyback on the mi_lock since failover_newserver()
4270 4274 * code guarantees that there will be only one failover thread
4271 4275 * per mountinfo at any instance.
4272 4276 */
4273 4277 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4274 4278 "failover_thread");
4275 4279
4276 4280 mutex_enter(&mi->mi_lock);
4277 4281 while (mi->mi_readers) {
4278 4282 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4279 4283 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4280 4284 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4281 4285 }
4282 4286 mutex_exit(&mi->mi_lock);
4283 4287
4284 4288 tv.tv_sec = 2;
4285 4289 tv.tv_usec = 0;
4286 4290
4287 4291 /*
4288 4292 * Ping the null NFS procedure of every server in
4289 4293 * the list until one responds. We always start
4290 4294 * at the head of the list and always skip the one
4291 4295 * that is current, since it's caused us a problem.
4292 4296 */
4293 4297 while (svp == NULL) {
4294 4298 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4295 4299 if (!oncethru && svp == mi->mi_curr_serv)
4296 4300 continue;
4297 4301
4298 4302 /*
4299 4303 * If the file system was forcibly umounted
4300 4304 * while trying to do a failover, then just
4301 4305 * give up on the failover. It won't matter
4302 4306 * what the server is.
4303 4307 */
4304 4308 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4305 4309 svp = NULL;
4306 4310 goto done;
4307 4311 }
4308 4312
4309 4313 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4310 4314 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4311 4315 if (error)
4312 4316 continue;
4313 4317
4314 4318 if (!(mi->mi_flags & MI_INT))
4315 4319 cl->cl_nosignal = TRUE;
4316 4320 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4317 4321 xdr_void, NULL, tv);
4318 4322 if (!(mi->mi_flags & MI_INT))
4319 4323 cl->cl_nosignal = FALSE;
4320 4324 AUTH_DESTROY(cl->cl_auth);
4321 4325 CLNT_DESTROY(cl);
4322 4326 if (status == RPC_SUCCESS) {
4323 4327 if (svp == mi->mi_curr_serv) {
4324 4328 #ifdef DEBUG
4325 4329 zcmn_err(zoneid, CE_NOTE,
4326 4330 "NFS%d: failing over: selecting original server %s",
4327 4331 mi->mi_vers, svp->sv_hostname);
4328 4332 #else
4329 4333 zcmn_err(zoneid, CE_NOTE,
4330 4334 "NFS: failing over: selecting original server %s",
4331 4335 svp->sv_hostname);
4332 4336 #endif
4333 4337 } else {
4334 4338 #ifdef DEBUG
4335 4339 zcmn_err(zoneid, CE_NOTE,
4336 4340 "NFS%d: failing over from %s to %s",
4337 4341 mi->mi_vers,
4338 4342 mi->mi_curr_serv->sv_hostname,
4339 4343 svp->sv_hostname);
4340 4344 #else
4341 4345 zcmn_err(zoneid, CE_NOTE,
4342 4346 "NFS: failing over from %s to %s",
4343 4347 mi->mi_curr_serv->sv_hostname,
4344 4348 svp->sv_hostname);
4345 4349 #endif
4346 4350 }
4347 4351 break;
4348 4352 }
4349 4353 }
4350 4354
4351 4355 if (svp == NULL) {
4352 4356 if (!oncethru) {
4353 4357 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4354 4358 #ifdef DEBUG
4355 4359 zprintf(zoneid,
4356 4360 "NFS%d servers %s not responding "
4357 4361 "still trying\n", mi->mi_vers, srvnames);
4358 4362 #else
4359 4363 zprintf(zoneid, "NFS servers %s not responding "
4360 4364 "still trying\n", srvnames);
4361 4365 #endif
4362 4366 oncethru = 1;
4363 4367 }
4364 4368 mutex_enter(&mi->mi_lock);
4365 4369 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4366 4370 mutex_exit(&mi->mi_lock);
4367 4371 delay(hz);
4368 4372 mutex_enter(&mi->mi_lock);
4369 4373 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4370 4374 mutex_exit(&mi->mi_lock);
4371 4375 }
4372 4376 }
4373 4377
4374 4378 if (oncethru) {
4375 4379 #ifdef DEBUG
4376 4380 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4377 4381 #else
4378 4382 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4379 4383 #endif
4380 4384 }
4381 4385
4382 4386 if (svp != mi->mi_curr_serv) {
4383 4387 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4384 4388 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4385 4389 rw_enter(&rtable[index].r_lock, RW_WRITER);
4386 4390 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4387 4391 mi->mi_vfsp);
4388 4392 if (rp != NULL) {
4389 4393 if (rp->r_flags & RHASHED)
4390 4394 rp_rmhash_locked(rp);
4391 4395 rw_exit(&rtable[index].r_lock);
4392 4396 rp->r_server = svp;
4393 4397 rp->r_fh = svp->sv_fhandle;
4394 4398 (void) nfs_free_data_reclaim(rp);
4395 4399 index = rtablehash(&rp->r_fh);
4396 4400 rp->r_hashq = &rtable[index];
4397 4401 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4398 4402 vn_exists(RTOV(rp));
4399 4403 rp_addhash(rp);
4400 4404 rw_exit(&rp->r_hashq->r_lock);
4401 4405 VN_RELE(RTOV(rp));
4402 4406 } else
4403 4407 rw_exit(&rtable[index].r_lock);
4404 4408 }
4405 4409
4406 4410 done:
4407 4411 if (oncethru)
4408 4412 kmem_free(srvnames, srvnames_len);
4409 4413 mutex_enter(&mi->mi_lock);
4410 4414 mi->mi_flags &= ~MI_BINDINPROG;
4411 4415 if (svp != NULL) {
4412 4416 mi->mi_curr_serv = svp;
4413 4417 mi->mi_failover++;
4414 4418 #ifdef DEBUG
4415 4419 nfscl->nfscl_stat.failover.value.ui64++;
4416 4420 #endif
4417 4421 }
4418 4422 cv_broadcast(&mi->mi_failover_cv);
4419 4423 CALLB_CPR_EXIT(&cprinfo);
4420 4424 VFS_RELE(mi->mi_vfsp);
4421 4425 zthread_exit();
4422 4426 /* NOTREACHED */
4423 4427 }
4424 4428
4425 4429 /*
4426 4430 * NFS client failover support
4427 4431 *
4428 4432 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4429 4433 * is cleared, meaning that failover is complete. Called with
4430 4434 * mi_lock mutex held.
4431 4435 */
4432 4436 static int
4433 4437 failover_wait(mntinfo_t *mi)
4434 4438 {
4435 4439 k_sigset_t smask;
4436 4440
4437 4441 /*
4438 4442 * If someone else is hunting for a living server,
4439 4443 * sleep until it's done. After our sleep, we may
4440 4444 * be bound to the right server and get off cheaply.
4441 4445 */
4442 4446 while (mi->mi_flags & MI_BINDINPROG) {
4443 4447 /*
4444 4448 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4445 4449 * and SIGTERM. (Preserving the existing masks).
4446 4450 * Mask out SIGINT if mount option nointr is specified.
4447 4451 */
4448 4452 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4449 4453 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4450 4454 /*
4451 4455 * restore original signal mask
4452 4456 */
4453 4457 sigunintr(&smask);
4454 4458 return (EINTR);
4455 4459 }
4456 4460 /*
4457 4461 * restore original signal mask
4458 4462 */
4459 4463 sigunintr(&smask);
4460 4464 }
4461 4465 return (0);
4462 4466 }
4463 4467
4464 4468 /*
4465 4469 * NFS client failover support
4466 4470 *
4467 4471 * failover_remap() will do a partial pathname lookup and find the
4468 4472 * desired vnode on the current server. The interim vnode will be
4469 4473 * discarded after we pilfer the new filehandle.
4470 4474 *
4471 4475 * Side effects:
4472 4476 * - This routine will also update the filehandle in the args structure
4473 4477 * pointed to by the fi->fhp pointer if it is non-NULL.
4474 4478 */
4475 4479
4476 4480 static int
4477 4481 failover_remap(failinfo_t *fi)
4478 4482 {
4479 4483 vnode_t *vp, *nvp, *rootvp;
4480 4484 rnode_t *rp, *nrp;
4481 4485 mntinfo_t *mi;
4482 4486 int error;
4483 4487 #ifdef DEBUG
4484 4488 struct nfs_clnt *nfscl;
4485 4489
4486 4490 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4487 4491 ASSERT(nfscl != NULL);
4488 4492 #endif
4489 4493 /*
4490 4494 * Sanity check
4491 4495 */
4492 4496 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4493 4497 return (EINVAL);
4494 4498 vp = fi->vp;
4495 4499 rp = VTOR(vp);
4496 4500 mi = VTOMI(vp);
4497 4501
4498 4502 if (!(vp->v_flag & VROOT)) {
4499 4503 /*
4500 4504 * Given the root fh, use the path stored in
4501 4505 * the rnode to find the fh for the new server.
4502 4506 */
4503 4507 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4504 4508 if (error)
4505 4509 return (error);
4506 4510
4507 4511 error = failover_lookup(rp->r_path, rootvp,
4508 4512 fi->lookupproc, fi->xattrdirproc, &nvp);
4509 4513
4510 4514 VN_RELE(rootvp);
4511 4515
4512 4516 if (error)
4513 4517 return (error);
4514 4518
4515 4519 /*
4516 4520 * If we found the same rnode, we're done now
4517 4521 */
4518 4522 if (nvp == vp) {
4519 4523 /*
4520 4524 			 * Failover may have selected a server that is physically
4521 4525 			 * the same OR shares the same disk subsystem.  In this case
4522 4526 			 * the file handle for a particular file path is not going
4523 4527 			 * to change, given that the same filehandle lookup will
4524 4528 			 * always locate the same rnode as the existing one.
4525 4529 			 * All we might need to do is to update the r_server
4526 4530 			 * with the current servinfo.
4527 4531 */
4528 4532 if (!VALID_FH(fi)) {
4529 4533 rp->r_server = mi->mi_curr_serv;
4530 4534 }
4531 4535 VN_RELE(nvp);
4532 4536 return (0);
4533 4537 }
4534 4538
4535 4539 /*
4536 4540 * Try to make it so that no one else will find this
4537 4541 * vnode because it is just a temporary to hold the
4538 4542 * new file handle until that file handle can be
4539 4543 * copied to the original vnode/rnode.
4540 4544 */
4541 4545 nrp = VTOR(nvp);
4542 4546 mutex_enter(&mi->mi_remap_lock);
4543 4547 /*
4544 4548 * Some other thread could have raced in here and could
4545 4549 		 * Some other thread could have raced in here and already
4546 4550 		 * done the remap for this particular rnode before this
4547 4551 		 * thread got here.  Check rp->r_server against
4548 4552 		 * mi->mi_curr_serv and return if they are the same.
4549 4553 if (VALID_FH(fi)) {
4550 4554 mutex_exit(&mi->mi_remap_lock);
4551 4555 VN_RELE(nvp);
4552 4556 return (0);
4553 4557 }
4554 4558
4555 4559 if (nrp->r_flags & RHASHED)
4556 4560 rp_rmhash(nrp);
4557 4561
4558 4562 /*
4559 4563 * As a heuristic check on the validity of the new
4560 4564 		 * file, check that the size and type match what
4561 4565 		 * we remember from the old version.
4562 4566 */
4563 4567 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4564 4568 mutex_exit(&mi->mi_remap_lock);
4565 4569 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4566 4570 "NFS replicas %s and %s: file %s not same.",
4567 4571 rp->r_server->sv_hostname,
4568 4572 nrp->r_server->sv_hostname, rp->r_path);
4569 4573 VN_RELE(nvp);
4570 4574 return (EINVAL);
4571 4575 }
4572 4576
4573 4577 /*
4574 4578 * snarf the filehandle from the new rnode
4575 4579 * then release it, again while updating the
4576 4580 * hash queues for the rnode.
4577 4581 */
4578 4582 if (rp->r_flags & RHASHED)
4579 4583 rp_rmhash(rp);
4580 4584 rp->r_server = mi->mi_curr_serv;
4581 4585 rp->r_fh = nrp->r_fh;
4582 4586 rp->r_hashq = nrp->r_hashq;
4583 4587 /*
4584 4588 * Copy the attributes from the new rnode to the old
4585 4589 * rnode. This will help to reduce unnecessary page
4586 4590 * cache flushes.
4587 4591 */
4588 4592 rp->r_attr = nrp->r_attr;
4589 4593 rp->r_attrtime = nrp->r_attrtime;
4590 4594 rp->r_mtime = nrp->r_mtime;
4591 4595 (void) nfs_free_data_reclaim(rp);
4592 4596 nfs_setswaplike(vp, &rp->r_attr);
4593 4597 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4594 4598 rp_addhash(rp);
4595 4599 rw_exit(&rp->r_hashq->r_lock);
4596 4600 mutex_exit(&mi->mi_remap_lock);
4597 4601 VN_RELE(nvp);
4598 4602 }
4599 4603
4600 4604 /*
4601 4605 * Update successful failover remap count
4602 4606 */
4603 4607 mutex_enter(&mi->mi_lock);
4604 4608 mi->mi_remap++;
4605 4609 mutex_exit(&mi->mi_lock);
4606 4610 #ifdef DEBUG
4607 4611 nfscl->nfscl_stat.remap.value.ui64++;
4608 4612 #endif
4609 4613
4610 4614 /*
4611 4615 * If we have a copied filehandle to update, do it now.
4612 4616 */
4613 4617 if (fi->fhp != NULL && fi->copyproc != NULL)
4614 4618 (*fi->copyproc)(fi->fhp, vp);
4615 4619
4616 4620 return (0);
4617 4621 }
4618 4622
4619 4623 /*
4620 4624 * NFS client failover support
4621 4625 *
4622 4626 * We want a simple pathname lookup routine to parse the pieces
4623 4627 * of path in rp->r_path. We know that the path was a created
4624 4628 	 * of path in rp->r_path. We know that the path was created
4625 4629 * paths that look like:
4626 4630 * dir1/dir2/dir3/file
4627 4631 * Any evidence of anything like .., symlinks, and ENOTDIR
4628 4632 	 * Any evidence of anything like .., symlinks, or ENOTDIR
4629 4633 	 * is a hard error, because it means something in this filesystem
4630 4634 * us in some way. If this is true, we want the failure.
4631 4635 *
4632 4636 * Extended attributes: if the filesystem is mounted with extended
4633 4637 * attributes enabled (-o xattr), the attribute directory will be
4634 4638 * represented in the r_path as the magic name XATTR_RPATH. So if
4635 4639 	 * we see that name in the pathname, it must be because this node
4636 4640 * is an extended attribute. Therefore, look it up that way.
4637 4641 */
4638 4642 static int
4639 4643 failover_lookup(char *path, vnode_t *root,
4640 4644 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4641 4645 vnode_t *, cred_t *, int),
4642 4646 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4643 4647 vnode_t **new)
4644 4648 {
4645 4649 vnode_t *dvp, *nvp;
4646 4650 int error = EINVAL;
4647 4651 char *s, *p, *tmppath;
4648 4652 size_t len;
4649 4653 mntinfo_t *mi;
4650 4654 bool_t xattr;
4651 4655
4652 4656 /* Make local copy of path */
4653 4657 len = strlen(path) + 1;
4654 4658 tmppath = kmem_alloc(len, KM_SLEEP);
4655 4659 (void) strcpy(tmppath, path);
4656 4660 s = tmppath;
4657 4661
4658 4662 dvp = root;
4659 4663 VN_HOLD(dvp);
4660 4664 mi = VTOMI(root);
4661 4665 xattr = mi->mi_flags & MI_EXTATTR;
4662 4666
4663 4667 do {
4664 4668 p = strchr(s, '/');
4665 4669 if (p != NULL)
4666 4670 *p = '\0';
4667 4671 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4668 4672 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4669 4673 RFSCALL_SOFT);
4670 4674 } else {
4671 4675 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4672 4676 CRED(), RFSCALL_SOFT);
4673 4677 }
4674 4678 if (p != NULL)
4675 4679 *p++ = '/';
4676 4680 if (error) {
4677 4681 VN_RELE(dvp);
4678 4682 kmem_free(tmppath, len);
4679 4683 return (error);
4680 4684 }
4681 4685 s = p;
4682 4686 VN_RELE(dvp);
4683 4687 dvp = nvp;
4684 4688 } while (p != NULL);
4685 4689
4686 4690 if (nvp != NULL && new != NULL)
4687 4691 *new = nvp;
4688 4692 kmem_free(tmppath, len);
4689 4693 return (0);
4690 4694 }
4691 4695
4692 4696 /*
4693 4697 * NFS client failover support
4694 4698 *
4695 4699 * sv_free() frees the malloc'd portion of a "servinfo_t".
4696 4700 */
4697 4701 void
4698 4702 sv_free(servinfo_t *svp)
4699 4703 {
4700 4704 servinfo_t *next;
4701 4705 struct knetconfig *knconf;
4702 4706
4703 4707 while (svp != NULL) {
4704 4708 next = svp->sv_next;
4705 4709 if (svp->sv_secdata)
4706 4710 sec_clnt_freeinfo(svp->sv_secdata);
4707 4711 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4708 4712 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4709 4713 knconf = svp->sv_knconf;
4710 4714 if (knconf != NULL) {
4711 4715 if (knconf->knc_protofmly != NULL)
4712 4716 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4713 4717 if (knconf->knc_proto != NULL)
4714 4718 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4715 4719 kmem_free(knconf, sizeof (*knconf));
4716 4720 }
4717 4721 knconf = svp->sv_origknconf;
4718 4722 if (knconf != NULL) {
4719 4723 if (knconf->knc_protofmly != NULL)
4720 4724 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4721 4725 if (knconf->knc_proto != NULL)
4722 4726 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4723 4727 kmem_free(knconf, sizeof (*knconf));
4724 4728 }
4725 4729 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4726 4730 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4727 4731 mutex_destroy(&svp->sv_lock);
4728 4732 kmem_free(svp, sizeof (*svp));
4729 4733 svp = next;
4730 4734 }
4731 4735 }
4732 4736
4733 4737 /*
4734 4738 * Only can return non-zero if intr != 0.
4735 4739 */
4736 4740 int
4737 4741 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4738 4742 {
4739 4743
4740 4744 mutex_enter(&l->lock);
4741 4745
4742 4746 /*
4743 4747 * If this is a nested enter, then allow it. There
4744 4748 	 * must be as many exits as there were enters.
4745 4749 */
4746 4750 if (l->owner == curthread) {
4747 4751 /* lock is held for writing by current thread */
4748 4752 ASSERT(rw == RW_READER || rw == RW_WRITER);
4749 4753 l->count--;
4750 4754 } else if (rw == RW_READER) {
4751 4755 /*
4752 4756 * While there is a writer active or writers waiting,
4753 4757 * then wait for them to finish up and move on. Then,
4754 4758 * increment the count to indicate that a reader is
4755 4759 * active.
4756 4760 */
4757 4761 while (l->count < 0 || l->waiters > 0) {
4758 4762 if (intr) {
4759 4763 klwp_t *lwp = ttolwp(curthread);
4760 4764
4761 4765 if (lwp != NULL)
4762 4766 lwp->lwp_nostop++;
4763 4767 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4764 4768 if (lwp != NULL)
4765 4769 lwp->lwp_nostop--;
4766 4770 mutex_exit(&l->lock);
4767 4771 return (EINTR);
4768 4772 }
4769 4773 if (lwp != NULL)
4770 4774 lwp->lwp_nostop--;
4771 4775 } else
4772 4776 cv_wait(&l->cv_rd, &l->lock);
4773 4777 }
4774 4778 ASSERT(l->count < INT_MAX);
4775 4779 #ifdef DEBUG
4776 4780 if ((l->count % 10000) == 9999)
4777 4781 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4778 4782 "rwlock @ %p\n", l->count, (void *)&l);
4779 4783 #endif
4780 4784 l->count++;
4781 4785 } else {
4782 4786 ASSERT(rw == RW_WRITER);
4783 4787 /*
4784 4788 * While there are readers active or a writer
4785 4789 * active, then wait for all of the readers
4786 4790 * to finish or for the writer to finish.
4787 4791 * Then, set the owner field to curthread and
4788 4792 * decrement count to indicate that a writer
4789 4793 * is active.
4790 4794 */
4791 4795 while (l->count != 0) {
4792 4796 l->waiters++;
4793 4797 if (intr) {
4794 4798 klwp_t *lwp = ttolwp(curthread);
4795 4799
4796 4800 if (lwp != NULL)
4797 4801 lwp->lwp_nostop++;
4798 4802 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4799 4803 if (lwp != NULL)
4800 4804 lwp->lwp_nostop--;
4801 4805 l->waiters--;
4802 4806 /*
4803 4807 * If there are readers active and no
4804 4808 * writers waiting then wake up all of
4805 4809 * the waiting readers (if any).
4806 4810 */
4807 4811 if (l->count > 0 && l->waiters == 0)
4808 4812 cv_broadcast(&l->cv_rd);
4809 4813 mutex_exit(&l->lock);
4810 4814 return (EINTR);
4811 4815 }
4812 4816 if (lwp != NULL)
4813 4817 lwp->lwp_nostop--;
4814 4818 } else
4815 4819 cv_wait(&l->cv, &l->lock);
4816 4820 l->waiters--;
4817 4821 }
4818 4822 ASSERT(l->owner == NULL);
4819 4823 l->owner = curthread;
4820 4824 l->count--;
4821 4825 }
4822 4826
4823 4827 mutex_exit(&l->lock);
4824 4828
4825 4829 return (0);
4826 4830 }
4827 4831
4828 4832 /*
4829 4833 * If the lock is available, obtain it and return non-zero. If there is
4830 4834 * already a conflicting lock, return 0 immediately.
4831 4835 */
4832 4836
4833 4837 int
4834 4838 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4835 4839 {
4836 4840 mutex_enter(&l->lock);
4837 4841
4838 4842 /*
4839 4843 * If this is a nested enter, then allow it. There
4841 4845 	 * must be as many exits as there were enters.
4841 4845 */
4842 4846 if (l->owner == curthread) {
4843 4847 /* lock is held for writing by current thread */
4844 4848 ASSERT(rw == RW_READER || rw == RW_WRITER);
4845 4849 l->count--;
4846 4850 } else if (rw == RW_READER) {
4847 4851 /*
4848 4852 * If there is a writer active or writers waiting, deny the
4849 4853 * lock. Otherwise, bump the count of readers.
4850 4854 */
4851 4855 if (l->count < 0 || l->waiters > 0) {
4852 4856 mutex_exit(&l->lock);
4853 4857 return (0);
4854 4858 }
4855 4859 l->count++;
4856 4860 } else {
4857 4861 ASSERT(rw == RW_WRITER);
4858 4862 /*
4859 4863 * If there are readers active or a writer active, deny the
4860 4864 * lock. Otherwise, set the owner field to curthread and
4861 4865 * decrement count to indicate that a writer is active.
4862 4866 */
4863 4867 if (l->count != 0) {
4864 4868 mutex_exit(&l->lock);
4865 4869 return (0);
4866 4870 }
4867 4871 ASSERT(l->owner == NULL);
4868 4872 l->owner = curthread;
4869 4873 l->count--;
4870 4874 }
4871 4875
4872 4876 mutex_exit(&l->lock);
4873 4877
4874 4878 return (1);
4875 4879 }
4876 4880
4877 4881 void
4878 4882 nfs_rw_exit(nfs_rwlock_t *l)
4879 4883 {
4880 4884
4881 4885 mutex_enter(&l->lock);
4882 4886
4883 4887 if (l->owner != NULL) {
4884 4888 ASSERT(l->owner == curthread);
4885 4889
4886 4890 /*
4887 4891 * To release a writer lock increment count to indicate that
4888 4892 * there is one less writer active. If this was the last of
4889 4893 * possibly nested writer locks, then clear the owner field as
4890 4894 * well to indicate that there is no writer active.
4891 4895 */
4892 4896 ASSERT(l->count < 0);
4893 4897 l->count++;
4894 4898 if (l->count == 0) {
4895 4899 l->owner = NULL;
4896 4900
4897 4901 /*
4898 4902 * If there are no writers waiting then wakeup all of
4899 4903 * the waiting readers (if any).
4900 4904 */
4901 4905 if (l->waiters == 0)
4902 4906 cv_broadcast(&l->cv_rd);
4903 4907 }
4904 4908 } else {
4905 4909 /*
4906 4910 * To release a reader lock just decrement count to indicate
4907 4911 * that there is one less reader active.
4908 4912 */
4909 4913 ASSERT(l->count > 0);
4910 4914 l->count--;
4911 4915 }
4912 4916
4913 4917 /*
4914 4918 	 * If there is no reader or writer active and there is a
4915 4919 	 * writer waiting, we need to wake it up.
4916 4920 */
4917 4921 if (l->count == 0 && l->waiters > 0)
4918 4922 cv_signal(&l->cv);
4919 4923 mutex_exit(&l->lock);
4920 4924 }
4921 4925
4922 4926 int
4923 4927 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4924 4928 {
4925 4929
4926 4930 if (rw == RW_READER)
4927 4931 return (l->count > 0);
4928 4932 ASSERT(rw == RW_WRITER);
4929 4933 return (l->count < 0);
4930 4934 }
4931 4935
4932 4936 /* ARGSUSED */
4933 4937 void
4934 4938 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4935 4939 {
4936 4940
4937 4941 l->count = 0;
4938 4942 l->waiters = 0;
4939 4943 l->owner = NULL;
4940 4944 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4941 4945 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4942 4946 cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4943 4947 }
4944 4948
4945 4949 void
4946 4950 nfs_rw_destroy(nfs_rwlock_t *l)
4947 4951 {
4948 4952
4949 4953 mutex_destroy(&l->lock);
4950 4954 cv_destroy(&l->cv);
4951 4955 cv_destroy(&l->cv_rd);
4952 4956 }
4953 4957
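/*
 * Editor's sketch (not part of this webrev): how the nfs_rwlock_t
 * primitives above fit together.  A positive count means readers are
 * active, a negative count means a writer is active, and nested enters
 * by the owning writer are allowed.  The function name and the "intr"
 * argument below are illustrative only.
 */
static int
example_nfs_rwlock_usage(nfs_rwlock_t *l, int intr)
{
	int error;

	nfs_rw_init(l, NULL, RW_DEFAULT, NULL);

	/* A reader waits while a writer is active or writers are waiting. */
	error = nfs_rw_enter_sig(l, RW_READER, intr);
	if (error != 0) {
		/* EINTR is possible only when intr != 0. */
		nfs_rw_destroy(l);
		return (error);
	}
	ASSERT(nfs_rw_lock_held(l, RW_READER));
	nfs_rw_exit(l);

	/* A writer can also probe for the lock without sleeping. */
	if (nfs_rw_tryenter(l, RW_WRITER) != 0)
		nfs_rw_exit(l);

	nfs_rw_destroy(l);
	return (0);
}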
4954 4958 int
4955 4959 nfs3_rddir_compar(const void *x, const void *y)
4956 4960 {
4957 4961 rddir_cache *a = (rddir_cache *)x;
4958 4962 rddir_cache *b = (rddir_cache *)y;
4959 4963
4960 4964 if (a->nfs3_cookie == b->nfs3_cookie) {
4961 4965 if (a->buflen == b->buflen)
4962 4966 return (0);
4963 4967 if (a->buflen < b->buflen)
4964 4968 return (-1);
4965 4969 return (1);
4966 4970 }
4967 4971
4968 4972 if (a->nfs3_cookie < b->nfs3_cookie)
4969 4973 return (-1);
4970 4974
4971 4975 return (1);
4972 4976 }
4973 4977
4974 4978 int
4975 4979 nfs_rddir_compar(const void *x, const void *y)
4976 4980 {
4977 4981 rddir_cache *a = (rddir_cache *)x;
4978 4982 rddir_cache *b = (rddir_cache *)y;
4979 4983
4980 4984 if (a->nfs_cookie == b->nfs_cookie) {
4981 4985 if (a->buflen == b->buflen)
4982 4986 return (0);
4983 4987 if (a->buflen < b->buflen)
4984 4988 return (-1);
4985 4989 return (1);
4986 4990 }
4987 4991
4988 4992 if (a->nfs_cookie < b->nfs_cookie)
4989 4993 return (-1);
4990 4994
4991 4995 return (1);
4992 4996 }
4993 4997
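/*
 * Editor's sketch (not part of this webrev): the comparators above are
 * shaped for use with avl_create(), ordering rddir_cache entries first
 * by cookie and then by buffer length.  The tree argument and the name
 * of the avl_node_t linkage member ("tree") are assumptions made for
 * illustration only.
 */
static rddir_cache *
example_rddir_find(avl_tree_t *dir_cache, u_longlong_t cookie, size_t buflen)
{
	rddir_cache key;
	avl_index_t where;

	avl_create(dir_cache, nfs3_rddir_compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));	/* "tree" member is assumed */

	key.nfs3_cookie = cookie;
	key.buflen = buflen;

	/* Entries with the same (cookie, buflen) pair compare equal. */
	return (avl_find(dir_cache, &key, &where));
}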
4994 4998 static char *
4995 4999 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4996 5000 {
4997 5001 servinfo_t *s;
4998 5002 char *srvnames;
4999 5003 char *namep;
5000 5004 size_t length;
5001 5005
5002 5006 /*
5003 5007 * Calculate the length of the string required to hold all
5004 5008 * of the server names plus either a comma or a null
5005 5009 * character following each individual one.
5006 5010 */
5007 5011 length = 0;
5008 5012 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5009 5013 length += s->sv_hostnamelen;
5010 5014
5011 5015 srvnames = kmem_alloc(length, KM_SLEEP);
5012 5016
5013 5017 namep = srvnames;
5014 5018 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5015 5019 (void) strcpy(namep, s->sv_hostname);
5016 5020 namep += s->sv_hostnamelen - 1;
5017 5021 *namep++ = ',';
5018 5022 }
5019 5023 *--namep = '\0';
5020 5024
5021 5025 *len = length;
5022 5026
5023 5027 return (srvnames);
5024 5028 }
5025 5029
5026 5030 /*
5027 5031 * These two functions are temporary and designed for the upgrade-workaround
5028 5032 * only. They cannot be used for general zone-crossing NFS client support, and
5029 5033 * will be removed shortly.
5030 5034 *
5031 5035 * When the workaround is enabled, all NFS traffic is forced into the global
5032 5036 * zone. These functions are called when the code needs to refer to the state
5033 5037 * of the underlying network connection. They're not called when the function
5034 5038 * needs to refer to the state of the process that invoked the system call.
5035 5039 * (E.g., when checking whether the zone is shutting down during the mount()
5036 5040 * call.)
5037 5041 */
5038 5042
5039 5043 struct zone *
5040 5044 nfs_zone(void)
5041 5045 {
5042 5046 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5043 5047 }
5044 5048
5045 5049 zoneid_t
5046 5050 nfs_zoneid(void)
5047 5051 {
5048 5052 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5049 5053 }
5050 5054
5051 5055 /*
5052 5056 * nfs_mount_label_policy:
5053 5057  *	Determine whether the mount is allowed according to the MAC check,
5054 5058  *	by comparing (where appropriate) the label of the remote server
5055 5059  *	against the label of the zone being mounted into.
5056 5060 *
5057 5061 * Returns:
5058 5062 * 0 : access allowed
5059 5063 * -1 : read-only access allowed (i.e., read-down)
5060 5064 * >0 : error code, such as EACCES
5061 5065 */
5062 5066 int
5063 5067 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5064 5068 struct knetconfig *knconf, cred_t *cr)
5065 5069 {
5066 5070 int addr_type;
5067 5071 void *ipaddr;
5068 5072 bslabel_t *server_sl, *mntlabel;
5069 5073 zone_t *mntzone = NULL;
5070 5074 ts_label_t *zlabel;
5071 5075 tsol_tpc_t *tp;
5072 5076 ts_label_t *tsl = NULL;
5073 5077 int retv;
5074 5078
5075 5079 /*
5076 5080 * Get the zone's label. Each zone on a labeled system has a label.
5077 5081 */
5078 5082 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5079 5083 zlabel = mntzone->zone_slabel;
5080 5084 ASSERT(zlabel != NULL);
5081 5085 label_hold(zlabel);
5082 5086
5083 5087 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5084 5088 addr_type = IPV4_VERSION;
5085 5089 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5086 5090 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5087 5091 addr_type = IPV6_VERSION;
5088 5092 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5089 5093 } else {
5090 5094 retv = 0;
5091 5095 goto out;
5092 5096 }
5093 5097
5094 5098 retv = EACCES; /* assume the worst */
5095 5099
5096 5100 /*
5097 5101 * Next, get the assigned label of the remote server.
5098 5102 */
5099 5103 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5100 5104 if (tp == NULL)
5101 5105 goto out; /* error getting host entry */
5102 5106
5103 5107 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5104 5108 goto rel_tpc; /* invalid domain */
5105 5109 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5106 5110 (tp->tpc_tp.host_type != UNLABELED))
5107 5111 goto rel_tpc; /* invalid hosttype */
5108 5112
5109 5113 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5110 5114 tsl = getflabel_cipso(vfsp);
5111 5115 if (tsl == NULL)
5112 5116 goto rel_tpc; /* error getting server lbl */
5113 5117
5114 5118 server_sl = label2bslabel(tsl);
5115 5119 } else { /* UNLABELED */
5116 5120 server_sl = &tp->tpc_tp.tp_def_label;
5117 5121 }
5118 5122
5119 5123 mntlabel = label2bslabel(zlabel);
5120 5124
5121 5125 /*
5122 5126 * Now compare labels to complete the MAC check. If the labels
5123 5127 * are equal or if the requestor is in the global zone and has
5124 5128 * NET_MAC_AWARE, then allow read-write access. (Except for
5125 5129 * mounts into the global zone itself; restrict these to
5126 5130 * read-only.)
5127 5131 *
5128 5132 * If the requestor is in some other zone, but their label
5129 5133 * dominates the server, then allow read-down.
5130 5134 *
5131 5135 * Otherwise, access is denied.
5132 5136 */
5133 5137 if (blequal(mntlabel, server_sl) ||
5134 5138 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5135 5139 getpflags(NET_MAC_AWARE, cr) != 0)) {
5136 5140 if ((mntzone == global_zone) ||
5137 5141 !blequal(mntlabel, server_sl))
5138 5142 retv = -1; /* read-only */
5139 5143 else
5140 5144 retv = 0; /* access OK */
5141 5145 } else if (bldominates(mntlabel, server_sl)) {
5142 5146 retv = -1; /* read-only */
5143 5147 } else {
5144 5148 retv = EACCES;
5145 5149 }
5146 5150
5147 5151 if (tsl != NULL)
5148 5152 label_rele(tsl);
5149 5153
5150 5154 rel_tpc:
5151 5155 TPC_RELE(tp);
5152 5156 out:
5153 5157 if (mntzone)
5154 5158 zone_rele(mntzone);
5155 5159 label_rele(zlabel);
5156 5160 return (retv);
5157 5161 }
5158 5162
5159 5163 boolean_t
5160 5164 nfs_has_ctty(void)
5161 5165 {
5162 5166 boolean_t rv;
5163 5167 mutex_enter(&curproc->p_splock);
5164 5168 rv = (curproc->p_sessp->s_vp != NULL);
5165 5169 mutex_exit(&curproc->p_splock);
5166 5170 return (rv);
5167 5171 }
5168 5172
5169 5173 /*
5170 5174 * See if xattr directory to see if it has any generic user attributes
5171 5175 */
5172 5176 int
5173 5177 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5174 5178 {
5175 5179 struct uio uio;
5176 5180 struct iovec iov;
5177 5181 char *dbuf;
5178 5182 struct dirent64 *dp;
5179 5183 size_t dlen = 8 * 1024;
5180 5184 size_t dbuflen;
5181 5185 int eof = 0;
5182 5186 int error;
5183 5187
5184 5188 *valp = 0;
5185 5189 dbuf = kmem_alloc(dlen, KM_SLEEP);
5186 5190 uio.uio_iov = &iov;
5187 5191 uio.uio_iovcnt = 1;
5188 5192 uio.uio_segflg = UIO_SYSSPACE;
5189 5193 uio.uio_fmode = 0;
5190 5194 uio.uio_extflg = UIO_COPY_CACHED;
5191 5195 uio.uio_loffset = 0;
5192 5196 uio.uio_resid = dlen;
5193 5197 iov.iov_base = dbuf;
5194 5198 iov.iov_len = dlen;
5195 5199 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5196 5200 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5197 5201 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5198 5202
5199 5203 dbuflen = dlen - uio.uio_resid;
5200 5204
5201 5205 if (error || dbuflen == 0) {
5202 5206 kmem_free(dbuf, dlen);
5203 5207 return (error);
5204 5208 }
5205 5209
5206 5210 dp = (dirent64_t *)dbuf;
5207 5211
5208 5212 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5209 5213 if (strcmp(dp->d_name, ".") == 0 ||
5210 5214 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5211 5215 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5212 5216 VIEW_READONLY) == 0) {
5213 5217 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5214 5218 continue;
5215 5219 }
5216 5220
5217 5221 *valp = 1;
5218 5222 break;
5219 5223 }
5220 5224 kmem_free(dbuf, dlen);
5221 5225 return (0);
5226 +}
5227 +
5228 +/*
5229 + * Return non-zero in case the vp is an empty directory used as a ZFS mount
5230 + * point.  The NFSv2 and NFSv3 servers should not allow writes to such
5231 + * directories.
5232 + */
5233 +int
5234 +protect_zfs_mntpt(vnode_t *vp)
5235 +{
5236 + int error;
5237 + vfs_t *vfsp;
5238 + struct uio uio;
5239 + struct iovec iov;
5240 + int eof;
5241 + size_t len = 8 * 1024;
5242 + char *buf;
5243 +
5244 + if (vp->v_type != VDIR || vn_ismntpt(vp) == 0)
5245 + return (0);
5246 +
5247 + error = vn_vfsrlock_wait(vp);
5248 + if (error != 0)
5249 + return (error);
5250 +
5251 + /*
5252 + * We protect ZFS mount points only
5253 + */
5254 + if ((vfsp = vn_mountedvfs(vp)) == NULL ||
5255 + strncmp(vfssw[vfsp->vfs_fstype].vsw_name, "zfs", 3) != 0) {
5256 + vn_vfsunlock(vp);
5257 + return (0);
5258 + }
5259 +
5260 + vn_vfsunlock(vp);
5261 +
5262 + buf = kmem_alloc(len, KM_SLEEP);
5263 +
5264 + uio.uio_iov = &iov;
5265 + uio.uio_iovcnt = 1;
5266 + uio.uio_segflg = UIO_SYSSPACE;
5267 + uio.uio_fmode = 0;
5268 + uio.uio_extflg = UIO_COPY_CACHED;
5269 + uio.uio_loffset = 0;
5270 + uio.uio_llimit = MAXOFFSET_T;
5271 +
5272 + eof = 0;
5273 +
5274 + do {
5275 + size_t rlen;
5276 + dirent64_t *dp;
5277 +
5278 + uio.uio_resid = len;
5279 + iov.iov_base = buf;
5280 + iov.iov_len = len;
5281 +
5282 + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5283 + error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
5284 + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5285 +
5286 + if (error != 0)
5287 + break;
5288 +
5289 + error = EBUSY;
5290 +
5291 + rlen = len - uio.uio_resid;
5292 + if (rlen == 0)
5293 + break;
5294 +
5295 + for (dp = (dirent64_t *)buf;
5296 + (intptr_t)dp < (intptr_t)buf + rlen;
5297 + dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
5298 + if (strcmp(dp->d_name, ".") != 0 &&
5299 + strcmp(dp->d_name, "..") != 0) {
5300 + error = 0;
5301 + break;
5302 + }
5303 + }
5304 + } while (eof == 0 && error != 0);
5305 +
5306 + kmem_free(buf, len);
5307 +
5308 + return (error);
5222 5309 }
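/*
 * Editor's sketch (not part of this webrev): one way a directory-modifying
 * NFS server operation could consult protect_zfs_mntpt() before acting on
 * a directory vnode.  The caller below is hypothetical; the real call
 * sites are outside this file.
 */
static int
example_check_dir_writable(vnode_t *dvp)
{
	int error;

	/*
	 * Non-zero (typically EBUSY) means dvp is an empty directory
	 * covered by a mounted ZFS file system, so the modification
	 * should be refused instead of landing in the covered directory.
	 */
	error = protect_zfs_mntpt(dvp);
	if (error != 0)
		return (error);

	/* Safe to proceed with the directory-modifying operation. */
	return (0);
}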