1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  * Copyright (c) 2017 Joyent Inc
  26  * Copyright 2019 Nexenta by DDN, Inc.
  27  */
  28 
  29 /*
  30  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  31  *      All rights reserved.
  32  *      Use is subject to license terms.
  33  */
  34 
  35 /*
  36  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
  37  * Copyright (c) 2013 by Delphix. All rights reserved.
  38  * Copyright 2018 Nexenta Systems, Inc.
  39  * Copyright (c) 2017 Joyent Inc
  40  */
  41 
  42 #include <sys/param.h>
  43 #include <sys/types.h>
  44 #include <sys/systm.h>
  45 #include <sys/cred.h>
  46 #include <sys/proc.h>
  47 #include <sys/user.h>
  48 #include <sys/buf.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vnode.h>
  51 #include <sys/pathname.h>
  52 #include <sys/uio.h>
  53 #include <sys/file.h>
  54 #include <sys/stat.h>
  55 #include <sys/errno.h>
  56 #include <sys/socket.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/siginfo.h>
  59 #include <sys/tiuser.h>
  60 #include <sys/statvfs.h>
  61 #include <sys/stream.h>
  62 #include <sys/strsun.h>
  63 #include <sys/strsubr.h>
  64 #include <sys/stropts.h>
  65 #include <sys/timod.h>
  66 #include <sys/t_kuser.h>
  67 #include <sys/kmem.h>
  68 #include <sys/kstat.h>
  69 #include <sys/dirent.h>
  70 #include <sys/cmn_err.h>
  71 #include <sys/debug.h>
  72 #include <sys/unistd.h>
  73 #include <sys/vtrace.h>
  74 #include <sys/mode.h>
  75 #include <sys/acl.h>
  76 #include <sys/sdt.h>
  77 #include <sys/debug.h>
  78 
  79 #include <rpc/types.h>
  80 #include <rpc/auth.h>
  81 #include <rpc/auth_unix.h>
  82 #include <rpc/auth_des.h>
  83 #include <rpc/svc.h>
  84 #include <rpc/xdr.h>
  85 #include <rpc/rpc_rdma.h>
  86 
  87 #include <nfs/nfs.h>
  88 #include <nfs/export.h>
  89 #include <nfs/nfssys.h>
  90 #include <nfs/nfs_clnt.h>
  91 #include <nfs/nfs_acl.h>
  92 #include <nfs/nfs_log.h>
  93 #include <nfs/lm.h>
  94 #include <nfs/nfs_dispatch.h>
  95 #include <nfs/nfs4_drc.h>
  96 
  97 #include <sys/modctl.h>
  98 #include <sys/cladm.h>
  99 #include <sys/clconf.h>
 100 
 101 #include <sys/tsol/label.h>
 102 
/* NOTE(review): MAXHOST is not referenced in the visible part of this file. */
#define MAXHOST 32
/*
 * Declared here, defined elsewhere.  Presumably formats an IPv6 address
 * as text, by analogy with inet_ntop() -- confirm against its definition.
 */
const char *kinet_ntop6(uchar_t *, char *, size_t);
 105 
 106 /*
 107  * Module linkage information.
 108  */
 109 
 110 static struct modlmisc modlmisc = {
 111         &mod_miscops, "NFS server module"
 112 };
 113 
 114 static struct modlinkage modlinkage = {
 115         MODREV_1, (void *)&modlmisc, NULL
 116 };
 117 
/* Key handed to zone_getspecific() to fetch the current zone's nfs_globals_t. */
zone_key_t nfssrv_zone_key;
/* kmem cache of nfs_xuio_t objects; created in _init(). */
kmem_cache_t *nfs_xuio_cache;
/* NOTE(review): not referenced in the visible code; starts out as 0. */
int nfs_loaned_buffers = 0;
 121 
 122 int
 123 _init(void)
 124 {
 125         int status;
 126 
 127         nfs_srvinit();
 128 
 129         status = mod_install((struct modlinkage *)&modlinkage);
 130         if (status != 0) {
 131                 /*
 132                  * Could not load module, cleanup previous
 133                  * initialization work.
 134                  */
 135                 nfs_srvfini();
 136 
 137                 return (status);
 138         }
 139 
 140         /*
 141          * Initialise some placeholders for nfssys() calls. These have
 142          * to be declared by the nfs module, since that handles nfssys()
 143          * calls - also used by NFS clients - but are provided by this
 144          * nfssrv module. These also then serve as confirmation to the
 145          * relevant code in nfs that nfssrv has been loaded, as they're
 146          * initially NULL.
 147          */
 148         nfs_srv_quiesce_func = nfs_srv_quiesce_all;
 149         nfs_srv_dss_func = rfs4_dss_setpaths;
 150 
 151         /* setup DSS paths here; must be done before initial server startup */
 152         rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
 153 
 154         /* initialize the copy reduction caches */
 155 
 156         nfs_xuio_cache = kmem_cache_create("nfs_xuio_cache",
 157             sizeof (nfs_xuio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 158 
 159         return (status);
 160 }
 161 
 162 int
 163 _fini()
 164 {
 165         return (EBUSY);
 166 }
 167 
 168 int
 169 _info(struct modinfo *modinfop)
 170 {
 171         return (mod_info(&modlinkage, modinfop));
 172 }
 173 
 174 /*
 175  * PUBLICFH_CHECK() checks if the dispatch routine supports
 176  * RPC_PUBLICFH_OK, if the filesystem is exported public, and if the
 177  * incoming request is using the public filehandle. The check duplicates
 178  * the exportmatch() call done in checkexport(), and we should consider
 179  * modifying those routines to avoid the duplication. For now, we optimize
 180  * by calling exportmatch() only after checking that the dispatch routine
 181  * supports RPC_PUBLICFH_OK, and if the filesystem is explicitly exported
 182  * public (i.e., not the placeholder).
 183  */
 184 #define PUBLICFH_CHECK(ne, disp, exi, fsid, xfid) \
 185                 ((disp->dis_flags & RPC_PUBLICFH_OK) && \
 186                 ((exi->exi_export.ex_flags & EX_PUBLIC) || \
 187                 (exi == ne->exi_public && exportmatch(ne->exi_root, \
 188                 fsid, xfid))))
 189 
 190 static void     nfs_srv_shutdown_all(int);
 191 static void     rfs4_server_start(nfs_globals_t *, int);
 192 static void     nullfree(void);
 193 static void     rfs_dispatch(struct svc_req *, SVCXPRT *);
 194 static void     acl_dispatch(struct svc_req *, SVCXPRT *);
 195 static void     common_dispatch(struct svc_req *, SVCXPRT *,
 196                 rpcvers_t, rpcvers_t, char *,
 197                 struct rpc_disptable *);
 198 static  int     checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
 199                 bool_t, bool_t *);
 200 static char     *client_name(struct svc_req *req);
 201 static char     *client_addr(struct svc_req *req, char *buf);
 202 extern  int     sec_svc_getcred(struct svc_req *, cred_t *cr, char **, int *);
 203 extern  bool_t  sec_svc_inrootlist(int, caddr_t, int, caddr_t *);
 204 static void     *nfs_srv_zone_init(zoneid_t);
 205 static void     nfs_srv_zone_fini(zoneid_t, void *);
 206 
 207 #define NFSLOG_COPY_NETBUF(exi, xprt, nb)       {               \
 208         (nb)->maxlen = (xprt)->xp_rtaddr.maxlen;          \
 209         (nb)->len = (xprt)->xp_rtaddr.len;                        \
 210         (nb)->buf = kmem_alloc((nb)->len, KM_SLEEP);              \
 211         bcopy((xprt)->xp_rtaddr.buf, (nb)->buf, (nb)->len);    \
 212         }
 213 
/*
 * Public Filehandle common nfs routines
 *
 * Both helpers are defined outside the visible part of this file.
 * NOTE(review): judging by the names, MCLpath() handles a path and
 * URLparse() a URL string -- confirm against the definitions.
 */
static int      MCLpath(char **);
static void     URLparse(char *);
 219 
 220 /*
 221  * NFS callout table.
 222  * This table is used by svc_getreq() to dispatch a request with
 223  * a given prog/vers pair to an appropriate service provider
 224  * dispatch routine.
 225  *
 226  * NOTE: ordering is relied upon below when resetting the version min/max
 227  * for NFS_PROGRAM.  Careful, if this is ever changed.
 228  */
 229 static SVC_CALLOUT __nfs_sc_clts[] = {
 230         { NFS_PROGRAM,     NFS_VERSMIN,     NFS_VERSMAX,        rfs_dispatch },
 231         { NFS_ACL_PROGRAM, NFS_ACL_VERSMIN, NFS_ACL_VERSMAX,    acl_dispatch }
 232 };
 233 
 234 static SVC_CALLOUT_TABLE nfs_sct_clts = {
 235         sizeof (__nfs_sc_clts) / sizeof (__nfs_sc_clts[0]), FALSE,
 236         __nfs_sc_clts
 237 };
 238 
 239 static SVC_CALLOUT __nfs_sc_cots[] = {
 240         { NFS_PROGRAM,     NFS_VERSMIN,     NFS_VERSMAX,        rfs_dispatch },
 241         { NFS_ACL_PROGRAM, NFS_ACL_VERSMIN, NFS_ACL_VERSMAX,    acl_dispatch }
 242 };
 243 
 244 static SVC_CALLOUT_TABLE nfs_sct_cots = {
 245         sizeof (__nfs_sc_cots) / sizeof (__nfs_sc_cots[0]), FALSE, __nfs_sc_cots
 246 };
 247 
 248 static SVC_CALLOUT __nfs_sc_rdma[] = {
 249         { NFS_PROGRAM,     NFS_VERSMIN,     NFS_VERSMAX,        rfs_dispatch },
 250         { NFS_ACL_PROGRAM, NFS_ACL_VERSMIN, NFS_ACL_VERSMAX,    acl_dispatch }
 251 };
 252 
 253 static SVC_CALLOUT_TABLE nfs_sct_rdma = {
 254         sizeof (__nfs_sc_rdma) / sizeof (__nfs_sc_rdma[0]), FALSE, __nfs_sc_rdma
 255 };
 256 
/*
 * DSS: distributed stable storage
 * lists of all DSS paths: current, and before last warmstart
 *
 * Both lists are reset to NULL by _init().
 */
nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;

/* rfs4_* routines declared here; their definitions are outside this view. */
int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
bool_t rfs4_minorvers_mismatch(struct svc_req *, SVCXPRT *, void *);
 265 
 266 /*
 267  * Will be called at the point the server pool is being unregistered
 268  * from the pool list. From that point onwards, the pool is waiting
 269  * to be drained and as such the server state is stale and pertains
 270  * to the old instantiation of the NFS server pool.
 271  */
 272 void
 273 nfs_srv_offline(void)
 274 {
 275         nfs_globals_t *ng;
 276 
 277         ng = zone_getspecific(nfssrv_zone_key, curzone);
 278 
 279         mutex_enter(&ng->nfs_server_upordown_lock);
 280         if (ng->nfs_server_upordown == NFS_SERVER_RUNNING) {
 281                 ng->nfs_server_upordown = NFS_SERVER_OFFLINE;
 282         }
 283         mutex_exit(&ng->nfs_server_upordown_lock);
 284 }
 285 
 286 /*
 287  * Will be called at the point the server pool is being destroyed so
 288  * all transports have been closed and no service threads are in
 289  * existence.
 290  *
 291  * If we quiesce the server, we're shutting it down without destroying the
 292  * server state. This allows it to warm start subsequently.
 293  */
 294 void
 295 nfs_srv_stop_all(void)
 296 {
 297         int quiesce = 0;
 298         nfs_srv_shutdown_all(quiesce);
 299 }
 300 
 301 /*
 302  * This alternative shutdown routine can be requested via nfssys()
 303  */
 304 void
 305 nfs_srv_quiesce_all(void)
 306 {
 307         int quiesce = 1;
 308         nfs_srv_shutdown_all(quiesce);
 309 }
 310 
 311 static void
 312 nfs_srv_shutdown_all(int quiesce)
 313 {
 314         nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 315 
 316         mutex_enter(&ng->nfs_server_upordown_lock);
 317         if (quiesce) {
 318                 if (ng->nfs_server_upordown == NFS_SERVER_RUNNING ||
 319                     ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
 320                         ng->nfs_server_upordown = NFS_SERVER_QUIESCED;
 321                         cv_signal(&ng->nfs_server_upordown_cv);
 322 
 323                         /* reset DSS state */
 324                         rfs4_dss_numnewpaths = 0;
 325                         rfs4_dss_newpaths = NULL;
 326 
 327                         cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
 328                             "NFSv4 state has been preserved");
 329                 }
 330         } else {
 331                 if (ng->nfs_server_upordown == NFS_SERVER_OFFLINE) {
 332                         ng->nfs_server_upordown = NFS_SERVER_STOPPING;
 333                         mutex_exit(&ng->nfs_server_upordown_lock);
 334                         rfs4_state_zone_fini();
 335                         rfs4_fini_drc();
 336                         mutex_enter(&ng->nfs_server_upordown_lock);
 337                         ng->nfs_server_upordown = NFS_SERVER_STOPPED;
 338 
 339                         /* reset DSS state */
 340                         rfs4_dss_numnewpaths = 0;
 341                         rfs4_dss_newpaths = NULL;
 342 
 343                         cv_signal(&ng->nfs_server_upordown_cv);
 344                 }
 345         }
 346         mutex_exit(&ng->nfs_server_upordown_lock);
 347 }
 348 
 349 static int
 350 nfs_srv_set_sc_versions(struct file *fp, SVC_CALLOUT_TABLE **sctpp,
 351     rpcvers_t versmin, rpcvers_t versmax)
 352 {
 353         struct strioctl strioc;
 354         struct T_info_ack tinfo;
 355         int             error, retval;
 356 
 357         /*
 358          * Find out what type of transport this is.
 359          */
 360         strioc.ic_cmd = TI_GETINFO;
 361         strioc.ic_timout = -1;
 362         strioc.ic_len = sizeof (tinfo);
 363         strioc.ic_dp = (char *)&tinfo;
 364         tinfo.PRIM_type = T_INFO_REQ;
 365 
 366         error = strioctl(fp->f_vnode, I_STR, (intptr_t)&strioc, 0, K_TO_K,
 367             CRED(), &retval);
 368         if (error || retval)
 369                 return (error);
 370 
 371         /*
 372          * Based on our query of the transport type...
 373          *
 374          * Reset the min/max versions based on the caller's request
 375          * NOTE: This assumes that NFS_PROGRAM is first in the array!!
 376          * And the second entry is the NFS_ACL_PROGRAM.
 377          */
 378         switch (tinfo.SERV_type) {
 379         case T_CLTS:
 380                 if (versmax == NFS_V4)
 381                         return (EINVAL);
 382                 __nfs_sc_clts[0].sc_versmin = versmin;
 383                 __nfs_sc_clts[0].sc_versmax = versmax;
 384                 __nfs_sc_clts[1].sc_versmin = versmin;
 385                 __nfs_sc_clts[1].sc_versmax = versmax;
 386                 *sctpp = &nfs_sct_clts;
 387                 break;
 388         case T_COTS:
 389         case T_COTS_ORD:
 390                 __nfs_sc_cots[0].sc_versmin = versmin;
 391                 __nfs_sc_cots[0].sc_versmax = versmax;
 392                 /* For the NFS_ACL program, check the max version */
 393                 if (versmax > NFS_ACL_VERSMAX)
 394                         versmax = NFS_ACL_VERSMAX;
 395                 __nfs_sc_cots[1].sc_versmin = versmin;
 396                 __nfs_sc_cots[1].sc_versmax = versmax;
 397                 *sctpp = &nfs_sct_cots;
 398                 break;
 399         default:
 400                 error = EINVAL;
 401         }
 402 
 403         return (error);
 404 }
 405 
 406 /*
 407  * NFS Server system call.
 408  * Does all of the work of running a NFS server.
 409  * uap->fd is the fd of an open transport provider
 410  */
 411 int
 412 nfs_svc(struct nfs_svc_args *arg, model_t model)
 413 {
 414         nfs_globals_t *ng;
 415         file_t *fp;
 416         SVCMASTERXPRT *xprt;
 417         int error;
 418         int readsize;
 419         char buf[KNC_STRSIZE];
 420         size_t len;
 421         STRUCT_HANDLE(nfs_svc_args, uap);
 422         struct netbuf addrmask;
 423         SVC_CALLOUT_TABLE *sctp = NULL;
 424 
 425 #ifdef lint
 426         model = model;          /* STRUCT macros don't always refer to it */
 427 #endif
 428 
 429         ng = zone_getspecific(nfssrv_zone_key, curzone);
 430         STRUCT_SET_HANDLE(uap, model, arg);
 431 
 432         /* Check privileges in nfssys() */
 433 
 434         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
 435                 return (EBADF);
 436 
 437         /*
 438          * Set read buffer size to rsize
 439          * and add room for RPC headers.
 440          */
 441         readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
 442         if (readsize < RPC_MAXDATASIZE)
 443                 readsize = RPC_MAXDATASIZE;
 444 
 445         error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
 446             KNC_STRSIZE, &len);
 447         if (error) {
 448                 releasef(STRUCT_FGET(uap, fd));
 449                 return (error);
 450         }
 451 
 452         addrmask.len = STRUCT_FGET(uap, addrmask.len);
 453         addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
 454         addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
 455         error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
 456             addrmask.len);
 457         if (error) {
 458                 releasef(STRUCT_FGET(uap, fd));
 459                 kmem_free(addrmask.buf, addrmask.maxlen);
 460                 return (error);
 461         }
 462 
 463         ng->nfs_versmin = STRUCT_FGET(uap, versmin);
 464         ng->nfs_versmax = STRUCT_FGET(uap, versmax);
 465 
 466         /* Double check the vers min/max ranges */
 467         if ((ng->nfs_versmin > ng->nfs_versmax) ||
 468             (ng->nfs_versmin < NFS_VERSMIN) ||
 469             (ng->nfs_versmax > NFS_VERSMAX)) {
 470                 ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
 471                 ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
 472         }
 473 
 474         if (error = nfs_srv_set_sc_versions(fp, &sctp, ng->nfs_versmin,
 475             ng->nfs_versmax)) {
 476                 releasef(STRUCT_FGET(uap, fd));
 477                 kmem_free(addrmask.buf, addrmask.maxlen);
 478                 return (error);
 479         }
 480 
 481         /* Initialize nfsv4 server */
 482         if (ng->nfs_versmax == (rpcvers_t)NFS_V4)
 483                 rfs4_server_start(ng, STRUCT_FGET(uap, delegation));
 484 
 485         /* Create a transport handle. */
 486         error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &xprt,
 487             sctp, NULL, NFS_SVCPOOL_ID, TRUE);
 488 
 489         if (error)
 490                 kmem_free(addrmask.buf, addrmask.maxlen);
 491 
 492         releasef(STRUCT_FGET(uap, fd));
 493 
 494         /* HA-NFSv4: save the cluster nodeid */
 495         if (cluster_bootflags & CLUSTER_BOOTED)
 496                 lm_global_nlmid = clconf_get_nodeid();
 497 
 498         return (error);
 499 }
 500 
 501 static void
 502 rfs4_server_start(nfs_globals_t *ng, int nfs4_srv_delegation)
 503 {
 504         /*
 505          * Determine if the server has previously been "started" and
 506          * if not, do the per instance initialization
 507          */
 508         mutex_enter(&ng->nfs_server_upordown_lock);
 509 
 510         if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
 511                 /* Do we need to stop and wait on the previous server? */
 512                 while (ng->nfs_server_upordown == NFS_SERVER_STOPPING ||
 513                     ng->nfs_server_upordown == NFS_SERVER_OFFLINE)
 514                         cv_wait(&ng->nfs_server_upordown_cv,
 515                             &ng->nfs_server_upordown_lock);
 516 
 517                 if (ng->nfs_server_upordown != NFS_SERVER_RUNNING) {
 518                         (void) svc_pool_control(NFS_SVCPOOL_ID,
 519                             SVCPSET_UNREGISTER_PROC, (void *)&nfs_srv_offline);
 520                         (void) svc_pool_control(NFS_SVCPOOL_ID,
 521                             SVCPSET_SHUTDOWN_PROC, (void *)&nfs_srv_stop_all);
 522 
 523                         rfs4_do_server_start(ng->nfs_server_upordown,
 524                             nfs4_srv_delegation,
 525                             cluster_bootflags & CLUSTER_BOOTED);
 526 
 527                         ng->nfs_server_upordown = NFS_SERVER_RUNNING;
 528                 }
 529                 cv_signal(&ng->nfs_server_upordown_cv);
 530         }
 531         mutex_exit(&ng->nfs_server_upordown_lock);
 532 }
 533 
 534 /*
 535  * If RDMA device available,
 536  * start RDMA listener.
 537  */
 538 int
 539 rdma_start(struct rdma_svc_args *rsa)
 540 {
 541         nfs_globals_t *ng;
 542         int error;
 543         rdma_xprt_group_t started_rdma_xprts;
 544         rdma_stat stat;
 545         int svc_state = 0;
 546 
 547         /* Double check the vers min/max ranges */
 548         if ((rsa->nfs_versmin > rsa->nfs_versmax) ||
 549             (rsa->nfs_versmin < NFS_VERSMIN) ||
 550             (rsa->nfs_versmax > NFS_VERSMAX)) {
 551                 rsa->nfs_versmin = NFS_VERSMIN_DEFAULT;
 552                 rsa->nfs_versmax = NFS_VERSMAX_DEFAULT;
 553         }
 554 
 555         ng = zone_getspecific(nfssrv_zone_key, curzone);
 556         ng->nfs_versmin = rsa->nfs_versmin;
 557         ng->nfs_versmax = rsa->nfs_versmax;
 558 
 559         /* Set the versions in the callout table */
 560         __nfs_sc_rdma[0].sc_versmin = rsa->nfs_versmin;
 561         __nfs_sc_rdma[0].sc_versmax = rsa->nfs_versmax;
 562         /* For the NFS_ACL program, check the max version */
 563         __nfs_sc_rdma[1].sc_versmin = rsa->nfs_versmin;
 564         if (rsa->nfs_versmax > NFS_ACL_VERSMAX)
 565                 __nfs_sc_rdma[1].sc_versmax = NFS_ACL_VERSMAX;
 566         else
 567                 __nfs_sc_rdma[1].sc_versmax = rsa->nfs_versmax;
 568 
 569         /* Initialize nfsv4 server */
 570         if (rsa->nfs_versmax == (rpcvers_t)NFS_V4)
 571                 rfs4_server_start(ng, rsa->delegation);
 572 
 573         started_rdma_xprts.rtg_count = 0;
 574         started_rdma_xprts.rtg_listhead = NULL;
 575         started_rdma_xprts.rtg_poolid = rsa->poolid;
 576 
 577 restart:
 578         error = svc_rdma_kcreate(rsa->netid, &nfs_sct_rdma, rsa->poolid,
 579             &started_rdma_xprts);
 580 
 581         svc_state = !error;
 582 
 583         while (!error) {
 584 
 585                 /*
 586                  * wait till either interrupted by a signal on
 587                  * nfs service stop/restart or signalled by a
 588                  * rdma attach/detatch.
 589                  */
 590 
 591                 stat = rdma_kwait();
 592 
 593                 /*
 594                  * stop services if running -- either on a HCA detach event
 595                  * or if the nfs service is stopped/restarted.
 596                  */
 597 
 598                 if ((stat == RDMA_HCA_DETACH || stat == RDMA_INTR) &&
 599                     svc_state) {
 600                         rdma_stop(&started_rdma_xprts);
 601                         svc_state = 0;
 602                 }
 603 
 604                 /*
 605                  * nfs service stop/restart, break out of the
 606                  * wait loop and return;
 607                  */
 608                 if (stat == RDMA_INTR)
 609                         return (0);
 610 
 611                 /*
 612                  * restart stopped services on a HCA attach event
 613                  * (if not already running)
 614                  */
 615 
 616                 if ((stat == RDMA_HCA_ATTACH) && (svc_state == 0))
 617                         goto restart;
 618 
 619                 /*
 620                  * loop until a nfs service stop/restart
 621                  */
 622         }
 623 
 624         return (error);
 625 }
 626 
 627 /* ARGSUSED */
 628 void
 629 rpc_null(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
 630     struct svc_req *req, cred_t *cr, bool_t ro)
 631 {
 632 }
 633 
 634 /* ARGSUSED */
 635 void
 636 rpc_null_v3(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
 637     struct svc_req *req, cred_t *cr, bool_t ro)
 638 {
 639         DTRACE_NFSV3_4(op__null__start, struct svc_req *, req,
 640             cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
 641         DTRACE_NFSV3_4(op__null__done, struct svc_req *, req,
 642             cred_t *, cr, vnode_t *, NULL, struct exportinfo *, exi);
 643 }
 644 
 645 /* ARGSUSED */
 646 static void
 647 rfs_error(caddr_t *argp, caddr_t *resp, struct exportinfo *exi,
 648     struct svc_req *req, cred_t *cr, bool_t ro)
 649 {
 650         /* return (EOPNOTSUPP); */
 651 }
 652 
 653 static void
 654 nullfree(void)
 655 {
 656 }
 657 
 658 static char *rfscallnames_v2[] = {
 659         "RFS2_NULL",
 660         "RFS2_GETATTR",
 661         "RFS2_SETATTR",
 662         "RFS2_ROOT",
 663         "RFS2_LOOKUP",
 664         "RFS2_READLINK",
 665         "RFS2_READ",
 666         "RFS2_WRITECACHE",
 667         "RFS2_WRITE",
 668         "RFS2_CREATE",
 669         "RFS2_REMOVE",
 670         "RFS2_RENAME",
 671         "RFS2_LINK",
 672         "RFS2_SYMLINK",
 673         "RFS2_MKDIR",
 674         "RFS2_RMDIR",
 675         "RFS2_READDIR",
 676         "RFS2_STATFS"
 677 };
 678 
 679 static struct rpcdisp rfsdisptab_v2[] = {
 680         /*
 681          * NFS VERSION 2
 682          */
 683 
 684         /* RFS_NULL = 0 */
 685         {rpc_null,
 686             xdr_void, NULL_xdrproc_t, 0,
 687             xdr_void, NULL_xdrproc_t, 0,
 688             nullfree, RPC_IDEMPOTENT,
 689             0},
 690 
 691         /* RFS_GETATTR = 1 */
 692         {rfs_getattr,
 693             xdr_fhandle, xdr_fastfhandle, sizeof (fhandle_t),
 694             xdr_attrstat, xdr_fastattrstat, sizeof (struct nfsattrstat),
 695             nullfree, RPC_IDEMPOTENT|RPC_ALLOWANON|RPC_MAPRESP,
 696             rfs_getattr_getfh},
 697 
 698         /* RFS_SETATTR = 2 */
 699         {rfs_setattr,
 700             xdr_saargs, NULL_xdrproc_t, sizeof (struct nfssaargs),
 701             xdr_attrstat, xdr_fastattrstat, sizeof (struct nfsattrstat),
 702             nullfree, RPC_MAPRESP,
 703             rfs_setattr_getfh},
 704 
 705         /* RFS_ROOT = 3 *** NO LONGER SUPPORTED *** */
 706         {rfs_error,
 707             xdr_void, NULL_xdrproc_t, 0,
 708             xdr_void, NULL_xdrproc_t, 0,
 709             nullfree, RPC_IDEMPOTENT,
 710             0},
 711 
 712         /* RFS_LOOKUP = 4 */
 713         {rfs_lookup,
 714             xdr_diropargs, NULL_xdrproc_t, sizeof (struct nfsdiropargs),
 715             xdr_diropres, xdr_fastdiropres, sizeof (struct nfsdiropres),
 716             nullfree, RPC_IDEMPOTENT|RPC_MAPRESP|RPC_PUBLICFH_OK,
 717             rfs_lookup_getfh},
 718 
 719         /* RFS_READLINK = 5 */
 720         {rfs_readlink,
 721             xdr_fhandle, xdr_fastfhandle, sizeof (fhandle_t),
 722             xdr_rdlnres, NULL_xdrproc_t, sizeof (struct nfsrdlnres),
 723             rfs_rlfree, RPC_IDEMPOTENT,
 724             rfs_readlink_getfh},
 725 
 726         /* RFS_READ = 6 */
 727         {rfs_read,
 728             xdr_readargs, NULL_xdrproc_t, sizeof (struct nfsreadargs),
 729             xdr_rdresult, NULL_xdrproc_t, sizeof (struct nfsrdresult),
 730             rfs_rdfree, RPC_IDEMPOTENT,
 731             rfs_read_getfh},
 732 
 733         /* RFS_WRITECACHE = 7 *** NO LONGER SUPPORTED *** */
 734         {rfs_error,
 735             xdr_void, NULL_xdrproc_t, 0,
 736             xdr_void, NULL_xdrproc_t, 0,
 737             nullfree, RPC_IDEMPOTENT,
 738             0},
 739 
 740         /* RFS_WRITE = 8 */
 741         {rfs_write,
 742             xdr_writeargs, NULL_xdrproc_t, sizeof (struct nfswriteargs),
 743             xdr_attrstat, xdr_fastattrstat, sizeof (struct nfsattrstat),
 744             nullfree, RPC_MAPRESP,
 745             rfs_write_getfh},
 746 
 747         /* RFS_CREATE = 9 */
 748         {rfs_create,
 749             xdr_creatargs, NULL_xdrproc_t, sizeof (struct nfscreatargs),
 750             xdr_diropres, xdr_fastdiropres, sizeof (struct nfsdiropres),
 751             nullfree, RPC_MAPRESP,
 752             rfs_create_getfh},
 753 
 754         /* RFS_REMOVE = 10 */
 755         {rfs_remove,
 756             xdr_diropargs, NULL_xdrproc_t, sizeof (struct nfsdiropargs),
 757 #ifdef _LITTLE_ENDIAN
 758             xdr_enum, xdr_fastenum, sizeof (enum nfsstat),
 759 #else
 760             xdr_enum, NULL_xdrproc_t, sizeof (enum nfsstat),
 761 #endif
 762             nullfree, RPC_MAPRESP,
 763             rfs_remove_getfh},
 764 
 765         /* RFS_RENAME = 11 */
 766         {rfs_rename,
 767             xdr_rnmargs, NULL_xdrproc_t, sizeof (struct nfsrnmargs),
 768 #ifdef _LITTLE_ENDIAN
 769             xdr_enum, xdr_fastenum, sizeof (enum nfsstat),
 770 #else
 771             xdr_enum, NULL_xdrproc_t, sizeof (enum nfsstat),
 772 #endif
 773             nullfree, RPC_MAPRESP,
 774             rfs_rename_getfh},
 775 
 776         /* RFS_LINK = 12 */
 777         {rfs_link,
 778             xdr_linkargs, NULL_xdrproc_t, sizeof (struct nfslinkargs),
 779 #ifdef _LITTLE_ENDIAN
 780             xdr_enum, xdr_fastenum, sizeof (enum nfsstat),
 781 #else
 782             xdr_enum, NULL_xdrproc_t, sizeof (enum nfsstat),
 783 #endif
 784             nullfree, RPC_MAPRESP,
 785             rfs_link_getfh},
 786 
 787         /* RFS_SYMLINK = 13 */
 788         {rfs_symlink,
 789             xdr_slargs, NULL_xdrproc_t, sizeof (struct nfsslargs),
 790 #ifdef _LITTLE_ENDIAN
 791             xdr_enum, xdr_fastenum, sizeof (enum nfsstat),
 792 #else
 793             xdr_enum, NULL_xdrproc_t, sizeof (enum nfsstat),
 794 #endif
 795             nullfree, RPC_MAPRESP,
 796             rfs_symlink_getfh},
 797 
 798         /* RFS_MKDIR = 14 */
 799         {rfs_mkdir,
 800             xdr_creatargs, NULL_xdrproc_t, sizeof (struct nfscreatargs),
 801             xdr_diropres, xdr_fastdiropres, sizeof (struct nfsdiropres),
 802             nullfree, RPC_MAPRESP,
 803             rfs_mkdir_getfh},
 804 
 805         /* RFS_RMDIR = 15 */
 806         {rfs_rmdir,
 807             xdr_diropargs, NULL_xdrproc_t, sizeof (struct nfsdiropargs),
 808 #ifdef _LITTLE_ENDIAN
 809             xdr_enum, xdr_fastenum, sizeof (enum nfsstat),
 810 #else
 811             xdr_enum, NULL_xdrproc_t, sizeof (enum nfsstat),
 812 #endif
 813             nullfree, RPC_MAPRESP,
 814             rfs_rmdir_getfh},
 815 
 816         /* RFS_READDIR = 16 */
 817         {rfs_readdir,
 818             xdr_rddirargs, NULL_xdrproc_t, sizeof (struct nfsrddirargs),
 819             xdr_putrddirres, NULL_xdrproc_t, sizeof (struct nfsrddirres),
 820             rfs_rddirfree, RPC_IDEMPOTENT,
 821             rfs_readdir_getfh},
 822 
 823         /* RFS_STATFS = 17 */
 824         {rfs_statfs,
 825             xdr_fhandle, xdr_fastfhandle, sizeof (fhandle_t),
 826             xdr_statfs, xdr_faststatfs, sizeof (struct nfsstatfs),
 827             nullfree, RPC_IDEMPOTENT|RPC_ALLOWANON|RPC_MAPRESP,
 828             rfs_statfs_getfh},
 829 };
 830 
 831 static char *rfscallnames_v3[] = {
 832         "RFS3_NULL",
 833         "RFS3_GETATTR",
 834         "RFS3_SETATTR",
 835         "RFS3_LOOKUP",
 836         "RFS3_ACCESS",
 837         "RFS3_READLINK",
 838         "RFS3_READ",
 839         "RFS3_WRITE",
 840         "RFS3_CREATE",
 841         "RFS3_MKDIR",
 842         "RFS3_SYMLINK",
 843         "RFS3_MKNOD",
 844         "RFS3_REMOVE",
 845         "RFS3_RMDIR",
 846         "RFS3_RENAME",
 847         "RFS3_LINK",
 848         "RFS3_READDIR",
 849         "RFS3_READDIRPLUS",
 850         "RFS3_FSSTAT",
 851         "RFS3_FSINFO",
 852         "RFS3_PATHCONF",
 853         "RFS3_COMMIT"
 854 };
 855
     /*
      * Dispatch table for NFS version 3; entry N describes procedure N and
      * must stay in step with rfscallnames_v3.
      *
      * struct rpcdisp is defined outside this excerpt.  Judging by the fields
      * common_dispatch() reads, each initializer supplies, in order: the
      * service routine (dis_proc); the argument decoder, its optional "fast"
      * variant and the argument size (dis_xdrargs, dis_fastxdrargs,
      * dis_argsz); the result encoder, its optional "fast" variant and the
      * result size (dis_xdrres, dis_fastxdrres, dis_ressz); the routine that
      * frees a result (dis_resfree); the RPC_* flags (dis_flags); and the
      * routine that extracts the file handle from the decoded arguments
      * (dis_getfh, 0 for the NULL procedure).  Confirm the order against the
      * structure definition before editing.
      *
      * Every version 3 entry passes NULL_xdrproc_t for both fast XDR slots
      * and none sets RPC_MAPRESP, so for version 3 common_dispatch() always
      * decodes with dis_xdrargs and builds the result in its own buffer.
      */
 856 static struct rpcdisp rfsdisptab_v3[] = {
 857         /*
 858          * NFS VERSION 3
 859          */
 860
 861         /* RFS3_NULL = 0 */
 862         {rpc_null_v3,
 863             xdr_void, NULL_xdrproc_t, 0,
 864             xdr_void, NULL_xdrproc_t, 0,
 865             nullfree, RPC_IDEMPOTENT,
 866             0},
 867
 868         /* RFS3_GETATTR = 1 */
 869         {rfs3_getattr,
 870             xdr_nfs_fh3_server, NULL_xdrproc_t, sizeof (GETATTR3args),
 871             xdr_GETATTR3res, NULL_xdrproc_t, sizeof (GETATTR3res),
 872             nullfree, (RPC_IDEMPOTENT | RPC_ALLOWANON),
 873             rfs3_getattr_getfh},
 874
 875         /* RFS3_SETATTR = 2 */
 876         {rfs3_setattr,
 877             xdr_SETATTR3args, NULL_xdrproc_t, sizeof (SETATTR3args),
 878             xdr_SETATTR3res, NULL_xdrproc_t, sizeof (SETATTR3res),
 879             nullfree, 0,
 880             rfs3_setattr_getfh},
 881
 882         /* RFS3_LOOKUP = 3 */
 883         {rfs3_lookup,
 884             xdr_diropargs3, NULL_xdrproc_t, sizeof (LOOKUP3args),
 885             xdr_LOOKUP3res, NULL_xdrproc_t, sizeof (LOOKUP3res),
 886             nullfree, (RPC_IDEMPOTENT | RPC_PUBLICFH_OK),
 887             rfs3_lookup_getfh},
 888
 889         /* RFS3_ACCESS = 4 */
 890         {rfs3_access,
 891             xdr_ACCESS3args, NULL_xdrproc_t, sizeof (ACCESS3args),
 892             xdr_ACCESS3res, NULL_xdrproc_t, sizeof (ACCESS3res),
 893             nullfree, RPC_IDEMPOTENT,
 894             rfs3_access_getfh},
 895
 896         /* RFS3_READLINK = 5 */
 897         {rfs3_readlink,
 898             xdr_nfs_fh3_server, NULL_xdrproc_t, sizeof (READLINK3args),
 899             xdr_READLINK3res, NULL_xdrproc_t, sizeof (READLINK3res),
 900             rfs3_readlink_free, RPC_IDEMPOTENT,
 901             rfs3_readlink_getfh},
 902
 903         /* RFS3_READ = 6 */
 904         {rfs3_read,
 905             xdr_READ3args, NULL_xdrproc_t, sizeof (READ3args),
 906             xdr_READ3res, NULL_xdrproc_t, sizeof (READ3res),
 907             rfs3_read_free, RPC_IDEMPOTENT,
 908             rfs3_read_getfh},
 909
 910         /* RFS3_WRITE = 7 */
 911         {rfs3_write,
 912             xdr_WRITE3args, NULL_xdrproc_t, sizeof (WRITE3args),
 913             xdr_WRITE3res, NULL_xdrproc_t, sizeof (WRITE3res),
 914             nullfree, 0,
 915             rfs3_write_getfh},
 916
 917         /* RFS3_CREATE = 8 */
 918         {rfs3_create,
 919             xdr_CREATE3args, NULL_xdrproc_t, sizeof (CREATE3args),
 920             xdr_CREATE3res, NULL_xdrproc_t, sizeof (CREATE3res),
 921             nullfree, 0,
 922             rfs3_create_getfh},
 923
 924         /* RFS3_MKDIR = 9 */
 925         {rfs3_mkdir,
 926             xdr_MKDIR3args, NULL_xdrproc_t, sizeof (MKDIR3args),
 927             xdr_MKDIR3res, NULL_xdrproc_t, sizeof (MKDIR3res),
 928             nullfree, 0,
 929             rfs3_mkdir_getfh},
 930
 931         /* RFS3_SYMLINK = 10 */
 932         {rfs3_symlink,
 933             xdr_SYMLINK3args, NULL_xdrproc_t, sizeof (SYMLINK3args),
 934             xdr_SYMLINK3res, NULL_xdrproc_t, sizeof (SYMLINK3res),
 935             nullfree, 0,
 936             rfs3_symlink_getfh},
 937
 938         /* RFS3_MKNOD = 11 */
 939         {rfs3_mknod,
 940             xdr_MKNOD3args, NULL_xdrproc_t, sizeof (MKNOD3args),
 941             xdr_MKNOD3res, NULL_xdrproc_t, sizeof (MKNOD3res),
 942             nullfree, 0,
 943             rfs3_mknod_getfh},
 944
 945         /* RFS3_REMOVE = 12 */
 946         {rfs3_remove,
 947             xdr_diropargs3, NULL_xdrproc_t, sizeof (REMOVE3args),
 948             xdr_REMOVE3res, NULL_xdrproc_t, sizeof (REMOVE3res),
 949             nullfree, 0,
 950             rfs3_remove_getfh},
 951
 952         /* RFS3_RMDIR = 13 */
 953         {rfs3_rmdir,
 954             xdr_diropargs3, NULL_xdrproc_t, sizeof (RMDIR3args),
 955             xdr_RMDIR3res, NULL_xdrproc_t, sizeof (RMDIR3res),
 956             nullfree, 0,
 957             rfs3_rmdir_getfh},
 958
 959         /* RFS3_RENAME = 14 */
 960         {rfs3_rename,
 961             xdr_RENAME3args, NULL_xdrproc_t, sizeof (RENAME3args),
 962             xdr_RENAME3res, NULL_xdrproc_t, sizeof (RENAME3res),
 963             nullfree, 0,
 964             rfs3_rename_getfh},
 965
 966         /* RFS3_LINK = 15 */
 967         {rfs3_link,
 968             xdr_LINK3args, NULL_xdrproc_t, sizeof (LINK3args),
 969             xdr_LINK3res, NULL_xdrproc_t, sizeof (LINK3res),
 970             nullfree, 0,
 971             rfs3_link_getfh},
 972
 973         /* RFS3_READDIR = 16 */
 974         {rfs3_readdir,
 975             xdr_READDIR3args, NULL_xdrproc_t, sizeof (READDIR3args),
 976             xdr_READDIR3res, NULL_xdrproc_t, sizeof (READDIR3res),
 977             rfs3_readdir_free, RPC_IDEMPOTENT,
 978             rfs3_readdir_getfh},
 979
 980         /* RFS3_READDIRPLUS = 17 */
 981         {rfs3_readdirplus,
 982             xdr_READDIRPLUS3args, NULL_xdrproc_t, sizeof (READDIRPLUS3args),
 983             xdr_READDIRPLUS3res, NULL_xdrproc_t, sizeof (READDIRPLUS3res),
 984             rfs3_readdirplus_free, RPC_AVOIDWORK,
 985             rfs3_readdirplus_getfh},
 986
 987         /* RFS3_FSSTAT = 18 */
 988         {rfs3_fsstat,
 989             xdr_nfs_fh3_server, NULL_xdrproc_t, sizeof (FSSTAT3args),
 990             xdr_FSSTAT3res, NULL_xdrproc_t, sizeof (FSSTAT3res),
 991             nullfree, RPC_IDEMPOTENT,
 992             rfs3_fsstat_getfh},
 993
 994         /* RFS3_FSINFO = 19 */
 995         {rfs3_fsinfo,
 996             xdr_nfs_fh3_server, NULL_xdrproc_t, sizeof (FSINFO3args),
 997             xdr_FSINFO3res, NULL_xdrproc_t, sizeof (FSINFO3res),
 998             nullfree, RPC_IDEMPOTENT|RPC_ALLOWANON,
 999             rfs3_fsinfo_getfh},
1000
1001         /* RFS3_PATHCONF = 20 */
1002         {rfs3_pathconf,
1003             xdr_nfs_fh3_server, NULL_xdrproc_t, sizeof (PATHCONF3args),
1004             xdr_PATHCONF3res, NULL_xdrproc_t, sizeof (PATHCONF3res),
1005             nullfree, RPC_IDEMPOTENT,
1006             rfs3_pathconf_getfh},
1007
1008         /* RFS3_COMMIT = 21 */
1009         {rfs3_commit,
1010             xdr_COMMIT3args, NULL_xdrproc_t, sizeof (COMMIT3args),
1011             xdr_COMMIT3res, NULL_xdrproc_t, sizeof (COMMIT3res),
1012             nullfree, RPC_IDEMPOTENT,
1013             rfs3_commit_getfh},
1014 };
1015
     /*
      * Printable names for NFS version 4 procedures.  rfsdisptab_v4 only
      * defines procedures 0 (NULL) and 1 (COMPOUND), yet this array has
      * nine entries: slots 2-7 repeat "RFS4_NULL" and slot 8 is
      * "RFS4_CREATE".
      * NOTE(review): why the extra slots exist is not visible in this
      * excerpt -- presumably they pad a per-procedure statistics array;
      * confirm before trimming.
      */
1016 static char *rfscallnames_v4[] = {
1017         "RFS4_NULL",
1018         "RFS4_COMPOUND",
1019         "RFS4_NULL",
1020         "RFS4_NULL",
1021         "RFS4_NULL",
1022         "RFS4_NULL",
1023         "RFS4_NULL",
1024         "RFS4_NULL",
1025         "RFS4_CREATE"
1026 };
1027
     /*
      * Dispatch table for NFS version 4, which has only the NULL and
      * COMPOUND procedures.  Both entries give 0 for the file handle
      * routine: common_dispatch() hands every version 4 request to
      * rfs4_dispatch() before it would read dis_flags or dis_getfh, and
      * afterwards only uses dis_xdrargs to free the decoded arguments.
      */
1028 static struct rpcdisp rfsdisptab_v4[] = {
1029         /*
1030          * NFS VERSION 4
1031          */
1032
1033         /* RFS4_NULL = 0 */
1034         {rpc_null,
1035             xdr_void, NULL_xdrproc_t, 0,
1036             xdr_void, NULL_xdrproc_t, 0,
1037             nullfree, RPC_IDEMPOTENT, 0},
1038
1039         /* RFS4_COMPOUND = 1 */
1040         {rfs4_compound,
1041             xdr_COMPOUND4args_srv, NULL_xdrproc_t, sizeof (COMPOUND4args),
1042             xdr_COMPOUND4res_srv, NULL_xdrproc_t, sizeof (COMPOUND4res),
1043             rfs4_compound_free, 0, 0},
1044 };
1045
     /*
      * Union of the decoded-argument types of every NFS version 2, 3 and 4
      * procedure.  It is never accessed by member in the code shown here;
      * common_dispatch() embeds it in args_buf so that a single on-stack
      * buffer is large enough for any procedure's arguments.  Procedures
      * without a member (NULL, and the version 2 ROOT and WRITECACHE
      * procedures marked as no longer supported) contribute nothing to the
      * size.
      */
1046 union rfs_args {
1047         /*
1048          * NFS VERSION 2
1049          */
1050
1051         /* RFS_NULL = 0 */
1052
1053         /* RFS_GETATTR = 1 */
1054         fhandle_t nfs2_getattr_args;
1055
1056         /* RFS_SETATTR = 2 */
1057         struct nfssaargs nfs2_setattr_args;
1058
1059         /* RFS_ROOT = 3 *** NO LONGER SUPPORTED *** */
1060
1061         /* RFS_LOOKUP = 4 */
1062         struct nfsdiropargs nfs2_lookup_args;
1063
1064         /* RFS_READLINK = 5 */
1065         fhandle_t nfs2_readlink_args;
1066
1067         /* RFS_READ = 6 */
1068         struct nfsreadargs nfs2_read_args;
1069
1070         /* RFS_WRITECACHE = 7 *** NO LONGER SUPPORTED *** */
1071
1072         /* RFS_WRITE = 8 */
1073         struct nfswriteargs nfs2_write_args;
1074
1075         /* RFS_CREATE = 9 */
1076         struct nfscreatargs nfs2_create_args;
1077
1078         /* RFS_REMOVE = 10 */
1079         struct nfsdiropargs nfs2_remove_args;
1080
1081         /* RFS_RENAME = 11 */
1082         struct nfsrnmargs nfs2_rename_args;
1083
1084         /* RFS_LINK = 12 */
1085         struct nfslinkargs nfs2_link_args;
1086
1087         /* RFS_SYMLINK = 13 */
1088         struct nfsslargs nfs2_symlink_args;
1089
1090         /* RFS_MKDIR = 14 */
1091         struct nfscreatargs nfs2_mkdir_args;
1092
1093         /* RFS_RMDIR = 15 */
1094         struct nfsdiropargs nfs2_rmdir_args;
1095
1096         /* RFS_READDIR = 16 */
1097         struct nfsrddirargs nfs2_readdir_args;
1098
1099         /* RFS_STATFS = 17 */
1100         fhandle_t nfs2_statfs_args;
1101
1102         /*
1103          * NFS VERSION 3
1104          */
1105
1106         /* RFS3_NULL = 0 */
1107
1108         /* RFS3_GETATTR = 1 */
1109         GETATTR3args nfs3_getattr_args;
1110
1111         /* RFS3_SETATTR = 2 */
1112         SETATTR3args nfs3_setattr_args;
1113
1114         /* RFS3_LOOKUP = 3 */
1115         LOOKUP3args nfs3_lookup_args;
1116
1117         /* RFS3_ACCESS = 4 */
1118         ACCESS3args nfs3_access_args;
1119
1120         /* RFS3_READLINK = 5 */
1121         READLINK3args nfs3_readlink_args;
1122
1123         /* RFS3_READ = 6 */
1124         READ3args nfs3_read_args;
1125
1126         /* RFS3_WRITE = 7 */
1127         WRITE3args nfs3_write_args;
1128
1129         /* RFS3_CREATE = 8 */
1130         CREATE3args nfs3_create_args;
1131
1132         /* RFS3_MKDIR = 9 */
1133         MKDIR3args nfs3_mkdir_args;
1134
1135         /* RFS3_SYMLINK = 10 */
1136         SYMLINK3args nfs3_symlink_args;
1137
1138         /* RFS3_MKNOD = 11 */
1139         MKNOD3args nfs3_mknod_args;
1140
1141         /* RFS3_REMOVE = 12 */
1142         REMOVE3args nfs3_remove_args;
1143
1144         /* RFS3_RMDIR = 13 */
1145         RMDIR3args nfs3_rmdir_args;
1146
1147         /* RFS3_RENAME = 14 */
1148         RENAME3args nfs3_rename_args;
1149
1150         /* RFS3_LINK = 15 */
1151         LINK3args nfs3_link_args;
1152
1153         /* RFS3_READDIR = 16 */
1154         READDIR3args nfs3_readdir_args;
1155
1156         /* RFS3_READDIRPLUS = 17 */
1157         READDIRPLUS3args nfs3_readdirplus_args;
1158
1159         /* RFS3_FSSTAT = 18 */
1160         FSSTAT3args nfs3_fsstat_args;
1161
1162         /* RFS3_FSINFO = 19 */
1163         FSINFO3args nfs3_fsinfo_args;
1164
1165         /* RFS3_PATHCONF = 20 */
1166         PATHCONF3args nfs3_pathconf_args;
1167
1168         /* RFS3_COMMIT = 21 */
1169         COMMIT3args nfs3_commit_args;
1170
1171         /*
1172          * NFS VERSION 4
1173          */
1174
1175         /* RFS4_NULL = 0 */
1176
1177         /* RFS4_COMPOUND = 1 */
1178         COMPOUND4args nfs4_compound_args;
1179 };
1180
     /*
      * Union of the result types of every NFS version 2, 3 and 4 procedure.
      * Like rfs_args it is used only for its size in the code shown here:
      * common_dispatch() embeds it in res_buf, the on-stack buffer that
      * receives a procedure's result whenever the transport does not supply
      * a buffer, and that receives a copy of the result for NFS logging.
      */
1181 union rfs_res {
1182         /*
1183          * NFS VERSION 2
1184          */
1185
1186         /* RFS_NULL = 0 */
1187
1188         /* RFS_GETATTR = 1 */
1189         struct nfsattrstat nfs2_getattr_res;
1190
1191         /* RFS_SETATTR = 2 */
1192         struct nfsattrstat nfs2_setattr_res;
1193
1194         /* RFS_ROOT = 3 *** NO LONGER SUPPORTED *** */
1195
1196         /* RFS_LOOKUP = 4 */
1197         struct nfsdiropres nfs2_lookup_res;
1198
1199         /* RFS_READLINK = 5 */
1200         struct nfsrdlnres nfs2_readlink_res;
1201
1202         /* RFS_READ = 6 */
1203         struct nfsrdresult nfs2_read_res;
1204
1205         /* RFS_WRITECACHE = 7 *** NO LONGER SUPPORTED *** */
1206
1207         /* RFS_WRITE = 8 */
1208         struct nfsattrstat nfs2_write_res;
1209
1210         /* RFS_CREATE = 9 */
1211         struct nfsdiropres nfs2_create_res;
1212
1213         /* RFS_REMOVE = 10 */
1214         enum nfsstat nfs2_remove_res;
1215
1216         /* RFS_RENAME = 11 */
1217         enum nfsstat nfs2_rename_res;
1218
1219         /* RFS_LINK = 12 */
1220         enum nfsstat nfs2_link_res;
1221
1222         /* RFS_SYMLINK = 13 */
1223         enum nfsstat nfs2_symlink_res;
1224
1225         /* RFS_MKDIR = 14 */
1226         struct nfsdiropres nfs2_mkdir_res;
1227
1228         /* RFS_RMDIR = 15 */
1229         enum nfsstat nfs2_rmdir_res;
1230
1231         /* RFS_READDIR = 16 */
1232         struct nfsrddirres nfs2_readdir_res;
1233
1234         /* RFS_STATFS = 17 */
1235         struct nfsstatfs nfs2_statfs_res;
1236
1237         /*
1238          * NFS VERSION 3
1239          */
1240
1241         /* RFS3_NULL = 0 */
1242
1243         /* RFS3_GETATTR = 1 */
1244         GETATTR3res nfs3_getattr_res;
1245
1246         /* RFS3_SETATTR = 2 */
1247         SETATTR3res nfs3_setattr_res;
1248
1249         /* RFS3_LOOKUP = 3 */
1250         LOOKUP3res nfs3_lookup_res;
1251
1252         /* RFS3_ACCESS = 4 */
1253         ACCESS3res nfs3_access_res;
1254
1255         /* RFS3_READLINK = 5 */
1256         READLINK3res nfs3_readlink_res;
1257
1258         /* RFS3_READ = 6 */
1259         READ3res nfs3_read_res;
1260
1261         /* RFS3_WRITE = 7 */
1262         WRITE3res nfs3_write_res;
1263
1264         /* RFS3_CREATE = 8 */
1265         CREATE3res nfs3_create_res;
1266
1267         /* RFS3_MKDIR = 9 */
1268         MKDIR3res nfs3_mkdir_res;
1269
1270         /* RFS3_SYMLINK = 10 */
1271         SYMLINK3res nfs3_symlink_res;
1272
1273         /* RFS3_MKNOD = 11 */
1274         MKNOD3res nfs3_mknod_res;
1275
1276         /* RFS3_REMOVE = 12 */
1277         REMOVE3res nfs3_remove_res;
1278
1279         /* RFS3_RMDIR = 13 */
1280         RMDIR3res nfs3_rmdir_res;
1281
1282         /* RFS3_RENAME = 14 */
1283         RENAME3res nfs3_rename_res;
1284
1285         /* RFS3_LINK = 15 */
1286         LINK3res nfs3_link_res;
1287
1288         /* RFS3_READDIR = 16 */
1289         READDIR3res nfs3_readdir_res;
1290
1291         /* RFS3_READDIRPLUS = 17 */
1292         READDIRPLUS3res nfs3_readdirplus_res;
1293
1294         /* RFS3_FSSTAT = 18 */
1295         FSSTAT3res nfs3_fsstat_res;
1296
1297         /* RFS3_FSINFO = 19 */
1298         FSINFO3res nfs3_fsinfo_res;
1299
1300         /* RFS3_PATHCONF = 20 */
1301         PATHCONF3res nfs3_pathconf_res;
1302
1303         /* RFS3_COMMIT = 21 */
1304         COMMIT3res nfs3_commit_res;
1305
1306         /*
1307          * NFS VERSION 4
1308          */
1309
1310         /* RFS4_NULL = 0 */
1311
1312         /* RFS4_COMPOUND = 1 */
1313         COMPOUND4res nfs4_compound_res;
1314
1315 };
1316
     /*
      * Per-version dispatch information for the NFS program; rfs_dispatch()
      * passes it to common_dispatch(), which indexes it by
      * (request version - NFS_VERSMIN).  The entries are therefore ordered
      * version 2, 3, 4 and must cover NFS_VERSMIN..NFS_VERSMAX (presumably
      * 2..4 -- the constants are defined outside this excerpt).
      *
      * Each entry gives the number of procedures in the version's dispatch
      * table, the procedure name array, the address of the pointer to the
      * per-procedure counters, and the dispatch table itself.
      */
1317 static struct rpc_disptable rfs_disptable[] = {
1318         {sizeof (rfsdisptab_v2) / sizeof (rfsdisptab_v2[0]),
1319             rfscallnames_v2,
1320             &rfsproccnt_v2_ptr, rfsdisptab_v2},
1321         {sizeof (rfsdisptab_v3) / sizeof (rfsdisptab_v3[0]),
1322             rfscallnames_v3,
1323             &rfsproccnt_v3_ptr, rfsdisptab_v3},
1324         {sizeof (rfsdisptab_v4) / sizeof (rfsdisptab_v4[0]),
1325             rfscallnames_v4,
1326             &rfsproccnt_v4_ptr, rfsdisptab_v4},
1327 };
1328 
1329 /*
1330  * If nfs_portmon is set, then clients are required to use privileged
1331  * ports (ports < IPPORT_RESERVED) in order to get NFS services.
      * It is initialised to 0, i.e. not set; the code that tests it lies
      * outside this excerpt.
1332  *
1333  * N.B.: this attempt to carry forward the already ill-conceived notion
1334  * of privileged ports for TCP/UDP is really quite ineffectual.  Not only
1335  * is it transport-dependent, it's laughably easy to spoof.  If you're
1336  * really interested in security, you must start with secure RPC instead.
1337  */
1338 static int nfs_portmon = 0;
1339 
1340 #ifdef DEBUG
     /*
      * Debug-only counters maintained by common_dispatch(): cred_hits counts
      * requests for which crgetref() reported 1 for the credential cached on
      * the transport, so it was used as is; cred_misses counts requests for
      * which that credential was freed and replaced by a new one from
      * crget().
      */
1341 static int cred_hits = 0;
1342 static int cred_misses = 0;
1343 #endif
1344
1345 #ifdef DEBUG
1346 /*
1347  * Debug code to allow disabling of rfs_dispatch() use of
1348  * fastxdrargs() and fastxdrres() calls for testing purposes.
      * A non-zero rfs_no_fast_xdrargs makes common_dispatch() skip the fast
      * argument decoder; a non-zero rfs_no_fast_xdrres makes it send the
      * reply with the regular result encoder.
1349  */
1350 static int rfs_no_fast_xdrargs = 0;
1351 static int rfs_no_fast_xdrres = 0;
1352 #endif
1353
     /*
      * Union of the decoded-argument types of the NFS_ACL version 2 and 3
      * procedures.  common_dispatch() embeds it, next to union rfs_args, in
      * args_buf so the on-stack argument buffer is large enough for either
      * program.  The ACL dispatch tables themselves are outside this
      * excerpt.
      */
1354 union acl_args {
1355         /*
1356          * ACL VERSION 2
1357          */
1358
1359         /* ACL2_NULL = 0 */
1360
1361         /* ACL2_GETACL = 1 */
1362         GETACL2args acl2_getacl_args;
1363
1364         /* ACL2_SETACL = 2 */
1365         SETACL2args acl2_setacl_args;
1366
1367         /* ACL2_GETATTR = 3 */
1368         GETATTR2args acl2_getattr_args;
1369
1370         /* ACL2_ACCESS = 4 */
1371         ACCESS2args acl2_access_args;
1372
1373         /* ACL2_GETXATTRDIR = 5 */
1374         GETXATTRDIR2args acl2_getxattrdir_args;
1375
1376         /*
1377          * ACL VERSION 3
1378          */
1379
1380         /* ACL3_NULL = 0 */
1381
1382         /* ACL3_GETACL = 1 */
1383         GETACL3args acl3_getacl_args;
1384
1385         /*
              * ACL3_SETACL = 2
              * NOTE(review): unlike its siblings this member lacks the
              * "_args" suffix; left as is because the name may be
              * referenced elsewhere in the file.
              */
1386         SETACL3args acl3_setacl;
1387
1388         /* ACL3_GETXATTRDIR = 3 */
1389         GETXATTRDIR3args acl3_getxattrdir_args;
1390
1391 };
1392
     /*
      * Union of the result types of the NFS_ACL version 2 and 3 procedures.
      * common_dispatch() embeds it, next to union rfs_res, in res_buf, the
      * on-stack buffer that receives a procedure's result.  Because the
      * union exists to size that buffer, every member must be the
      * procedure's *result* type: a member declared with an argument type
      * would let the buffer be smaller than the result written into it.
      */
1393 union acl_res {
1394         /*
1395          * ACL VERSION 2
1396          */
1397
1398         /* ACL2_NULL = 0 */
1399
1400         /* ACL2_GETACL = 1 */
1401         GETACL2res acl2_getacl_res;
1402
1403         /* ACL2_SETACL = 2 */
1404         SETACL2res acl2_setacl_res;
1405
1406         /* ACL2_GETATTR = 3 */
1407         GETATTR2res acl2_getattr_res;
1408
1409         /* ACL2_ACCESS = 4 */
1410         ACCESS2res acl2_access_res;
1411
1412         /* ACL2_GETXATTRDIR = 5 */
1413         GETXATTRDIR2res acl2_getxattrdir_res;
1414
1415         /*
1416          * ACL VERSION 3
1417          */
1418
1419         /* ACL3_NULL = 0 */
1420
1421         /* ACL3_GETACL = 1 */
1422         GETACL3res acl3_getacl_res;
1423
1424         /* ACL3_SETACL = 2 */
1425         SETACL3res acl3_setacl_res;
1426
1427         /* ACL3_GETXATTRDIR = 3 */
1428         GETXATTRDIR3res acl3_getxattrdir_res;
1429
1430 };
1431
     /*
      * Tell whether a LOOKUP reply carries the WNFSERR_CLNT_FLAVOR status.
      * Only NFS_VERSION/RFS_LOOKUP and NFS_V3/NFSPROC3_LOOKUP results are
      * examined; every other request yields FALSE.  common_dispatch()
      * answers with svcerr_weakauth() instead of the normal reply when this
      * returns TRUE.
      *
      * res must point at the result structure that matches req's version
      * and procedure (struct nfsdiropres or LOOKUP3res respectively).
      */
1432 static bool_t
1433 auth_tooweak(struct svc_req *req, char *res)
1434 {
1435         enum wnfsstat status;
1436
1437         if (req->rq_vers == NFS_VERSION && req->rq_proc == RFS_LOOKUP)
1438                 status = (enum wnfsstat)
1439                     ((struct nfsdiropres *)res)->dr_status;
1440         else if (req->rq_vers == NFS_V3 && req->rq_proc == NFSPROC3_LOOKUP)
1441                 status = (enum wnfsstat)((LOOKUP3res *)res)->status;
1442         else
1443                 return (FALSE);
1444
1445         return (status == WNFSERR_CLNT_FLAVOR ? TRUE : FALSE);
1446 }
1447 
1448
     /*
      * Dispatch one RPC request using the per-version tables in disptable.
      * rfs_dispatch() calls this for the NFS program; the acl_args/acl_res
      * members of the local buffers show it is also meant to serve the
      * NFS_ACL program, whose caller lies outside this excerpt.
      *
      *   req, xprt          - the request and the transport it arrived on
      *   min_vers, max_vers - lowest and highest version accepted;
      *                        disptable[] is indexed by (version - min_vers)
      *   pgmname            - program name used in console messages
      *   disptable          - procedure count, names, counters and dispatch
      *                        entries for each version
      *
      * Order of work:
      *   1. Reject an out-of-range version or procedure number and bump the
      *      per-procedure counter.
      *   2. Decode the arguments, trying the fast XDR routine when the entry
      *      has one and the flavor is not RPCSEC_GSS, and falling back to
      *      dis_xdrargs.
      *   3. Hand version 4 requests to rfs4_dispatch() and finish.
      *   4. For entries with a dis_getfh routine, look up the export named by
      *      the file handle and run checkauth().
      *   5. Choose the result buffer and, for procedures that are not
      *      RPC_IDEMPOTENT, consult the duplicate request cache.
      *   6. Call the service routine, send the reply, write an NFS log record
      *      if logging applies, and free the result.
      *   7. At "done": free the decoded arguments, release the export and
      *      update the NFS_CALLS / NFS_BADCALLS statistics.
      *
      * Nothing is returned; failures are answered with an svcerr_*() reply
      * or by sending no reply at all, and are accumulated in "error".
      */
1449 static void
1450 common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
1451     rpcvers_t max_vers, char *pgmname, struct rpc_disptable *disptable)
1452 {
1453         int which;
1454         rpcvers_t vers;
1455         char *args;
1456         union {
1457                         union rfs_args ra;
1458                         union acl_args aa;
1459                 } args_buf;
1460         char *res;
1461         union {
1462                         union rfs_res rr;
1463                         union acl_res ar;
1464                 } res_buf;
1465         struct rpcdisp *disp = NULL;
1466         int dis_flags = 0;
1467         cred_t *cr;
1468         int error = 0;
1469         int anon_ok;
1470         struct exportinfo *exi = NULL;
1471         unsigned int nfslog_rec_id;
1472         int dupstat;
1473         struct dupreq *dr;
1474         int authres;
1475         bool_t publicfh_ok = FALSE;
1476         enum_t auth_flavor;
1477         bool_t dupcached = FALSE;
1478         struct netbuf   nb;
1479         bool_t logging_enabled = FALSE;
1480         struct exportinfo *nfslog_exi = NULL;
1481         char **procnames;
1482         char cbuf[INET6_ADDRSTRLEN];    /* to hold both IPv4 and IPv6 addr */
1483         bool_t ro = FALSE;
1484         nfs_export_t *ne = nfs_get_export();
1485
1486         vers = req->rq_vers;
1487
1488         if (vers < min_vers || vers > max_vers) {
1489                 svcerr_progvers(req->rq_xprt, min_vers, max_vers);
1490                 error++;
1491                 cmn_err(CE_NOTE, "%s: bad version number %u", pgmname, vers);
1492                 goto done;
1493         }
             /* From here on vers is the index into disptable[]. */
1494         vers -= min_vers;
1495
1496         which = req->rq_proc;
1497         if (which < 0 || which >= disptable[(int)vers].dis_nprocs) {
1498                 svcerr_noproc(req->rq_xprt);
1499                 error++;
1500                 goto done;
1501         }
1502
             /* Bump the per-procedure counter for this version. */
1503         (*(disptable[(int)vers].dis_proccntp))[which].value.ui64++;
1504
1505         disp = &disptable[(int)vers].dis_table[which];
1506         procnames = disptable[(int)vers].dis_procnames;
1507
1508         auth_flavor = req->rq_cred.oa_flavor;
1509
1510         /*
1511          * Deserialize into the args struct.
              *
              * The fast decoder is tried first unless it is disabled (DEBUG
              * only), the flavor is RPCSEC_GSS, or the entry has none.  It is
              * handed the address of the args pointer rather than the buffer.
              * NOTE(review): presumably so that it can redirect args to data
              * decoded in place -- confirm against the fast XDR routines.
              * If the fast path is skipped or fails, the buffer args points at
              * is zeroed and filled by the regular decoder.
1512          */
1513         args = (char *)&args_buf;
1514
1515 #ifdef DEBUG
1516         if (rfs_no_fast_xdrargs || (auth_flavor == RPCSEC_GSS) ||
1517             disp->dis_fastxdrargs == NULL_xdrproc_t ||
1518             !SVC_GETARGS(xprt, disp->dis_fastxdrargs, (char *)&args))
1519 #else
1520         if ((auth_flavor == RPCSEC_GSS) ||
1521             disp->dis_fastxdrargs == NULL_xdrproc_t ||
1522             !SVC_GETARGS(xprt, disp->dis_fastxdrargs, (char *)&args))
1523 #endif
1524         {
1525                 bzero(args, disp->dis_argsz);
1526                 if (!SVC_GETARGS(xprt, disp->dis_xdrargs, args)) {
1527                         error++;
1528                         /*
1529                          * Check if we are outside our capabilities.
1530                          */
1531                         if (rfs4_minorvers_mismatch(req, xprt, (void *)args))
1532                                 goto done;
1533
1534                         svcerr_decode(xprt);
1535                         cmn_err(CE_NOTE,
1536                             "Failed to decode arguments for %s version %u "
1537                             "procedure %s client %s%s",
1538                             pgmname, vers + min_vers, procnames[which],
1539                             client_name(req), client_addr(req, cbuf));
1540                         goto done;
1541                 }
1542         }
1543
1544         /*
1545          * If Version 4 use that specific dispatch function.
1546          */
1547         if (req->rq_vers == 4) {
1548                 error += rfs4_dispatch(disp, req, xprt, args);
1549                 goto done;
1550         }
1551
1552         dis_flags = disp->dis_flags;
1553
1554         /*
1555          * Find export information and check authentication,
1556          * setting the credential if everything is ok.
1557          */
1558         if (disp->dis_getfh != NULL) {
1559                 void *fh;
1560                 fsid_t *fsid;
1561                 fid_t *fid, *xfid;
1562                 fhandle_t *fh2;
1563                 nfs_fh3 *fh3;
1564
1565                 fh = (*disp->dis_getfh)(args);
                     /*
                      * Pick the fsid and the two fid pointers out of the file
                      * handle.  There is no default case, so fsid, fid and
                      * xfid are only assigned for NFS_VERSION and NFS_V3
                      * requests; version 4 has already left above.
                      */
1566                 switch (req->rq_vers) {
1567                 case NFS_VERSION:
1568                         fh2 = (fhandle_t *)fh;
1569                         fsid = &fh2->fh_fsid;
1570                         fid = (fid_t *)&fh2->fh_len;
1571                         xfid = (fid_t *)&fh2->fh_xlen;
1572                         break;
1573                 case NFS_V3:
1574                         fh3 = (nfs_fh3 *)fh;
1575                         fsid = &fh3->fh3_fsid;
1576                         fid = FH3TOFIDP(fh3);
1577                         xfid = FH3TOXFIDP(fh3);
1578                         break;
1579                 }
1580
1581                 /*
1582                  * Fix for bug 1038302 - corbin
1583                  * There is a problem here if anonymous access is
1584                  * disallowed.  If the current request is part of the
1585                  * client's mount process for the requested filesystem,
1586                  * then it will carry root (uid 0) credentials on it, and
1587                  * will be denied by checkauth if that client does not
1588                  * have explicit root=0 permission.  This will cause the
1589                  * client's mount operation to fail.  As a work-around,
1590                  * we check here to see if the request is a getattr or
1591                  * statfs operation on the exported vnode itself, and
1592                  * pass a flag to checkauth with the result of this test.
1593                  *
1594                  * The filehandle refers to the mountpoint itself if
1595                  * the fh_data and fh_xdata portions of the filehandle
1596                  * are equal.
1597                  *
1598                  * Added anon_ok argument to checkauth().
1599                  */
1600
1601                 if ((dis_flags & RPC_ALLOWANON) && EQFID(fid, xfid))
1602                         anon_ok = 1;
1603                 else
1604                         anon_ok = 0;
1605
                     /*
                      * Use the credential cached on the transport when
                      * crgetref() reports 1 for it; otherwise free it and
                      * cache a new one obtained from crget().
                      */
1606                 cr = xprt->xp_cred;
1607                 ASSERT(cr != NULL);
1608 #ifdef DEBUG
1609                 {
1610                         if (crgetref(cr) != 1) {
1611                                 crfree(cr);
1612                                 cr = crget();
1613                                 xprt->xp_cred = cr;
1614                                 cred_misses++;
1615                         } else
1616                                 cred_hits++;
1617                 }
1618 #else
1619                 if (crgetref(cr) != 1) {
1620                         crfree(cr);
1621                         cr = crget();
1622                         xprt->xp_cred = cr;
1623                 }
1624 #endif
1625
                     /*
                      * When checkexport() finds nothing, exi stays NULL, the
                      * checks below are skipped and the service routine is
                      * still called with exi == NULL.
                      */
1626                 exi = checkexport(fsid, xfid);
1627
1628                 if (exi != NULL) {
1629                         publicfh_ok = PUBLICFH_CHECK(ne, disp, exi, fsid, xfid);
1630
1631                         /*
1632                          * Don't allow non-V4 clients access
1633                          * to pseudo exports
1634                          */
1635                         if (PSEUDO(exi)) {
1636                                 svcerr_weakauth(xprt);
1637                                 error++;
1638                                 goto done;
1639                         }
1640
1641                         authres = checkauth(exi, req, cr, anon_ok, publicfh_ok,
1642                             &ro);
1643                         /*
1644                          * authres >  0: authentication OK - proceed
1645                          * authres == 0: authentication weak - return error
1646                          * authres <  0: authentication timeout - drop
1647                          */
1648                         if (authres <= 0) {
1649                                 if (authres == 0) {
1650                                         svcerr_weakauth(xprt);
1651                                         error++;
1652                                 }
1653                                 goto done;
1654                         }
1655                 }
1656         } else
1657                 cr = NULL;
1658
             /*
              * RPC_MAPRESP procedures ask the transport for a result buffer
              * (never with RPCSEC_GSS); in every other case, or when the
              * transport returns NULL, the on-stack res_buf is used.
              */
1659         if ((dis_flags & RPC_MAPRESP) && (auth_flavor != RPCSEC_GSS)) {
1660                 res = (char *)SVC_GETRES(xprt, disp->dis_ressz);
1661                 if (res == NULL)
1662                         res = (char *)&res_buf;
1663         } else
1664                 res = (char *)&res_buf;
1665
             /*
              * Procedures without RPC_IDEMPOTENT go through the duplicate
              * request cache; SVC_DUP_EXT() classifies the request:
              *   DUP_ERROR      - reply with a system error
              *   DUP_INPROGRESS - send no reply, count a bad call
              *   DUP_NEW/DROP   - execute the procedure
              *   DUP_DONE       - do not execute again; the reply below is
              *                    built from res.  NOTE(review): presumably
              *                    SVC_DUP_EXT() has filled res from the
              *                    cache -- confirm.
              */
1666         if (!(dis_flags & RPC_IDEMPOTENT)) {
1667                 dupstat = SVC_DUP_EXT(xprt, req, res, disp->dis_ressz, &dr,
1668                     &dupcached);
1669
1670                 switch (dupstat) {
1671                 case DUP_ERROR:
1672                         svcerr_systemerr(xprt);
1673                         error++;
1674                         goto done;
1675                         /* NOTREACHED */
1676                 case DUP_INPROGRESS:
1677                         if (res != (char *)&res_buf)
1678                                 SVC_FREERES(xprt);
1679                         error++;
1680                         goto done;
1681                         /* NOTREACHED */
1682                 case DUP_NEW:
1683                 case DUP_DROP:
1684                         curthread->t_flag |= T_DONTPEND;
1685
1686                         (*disp->dis_proc)(args, res, exi, req, cr, ro);
1687
1688                         curthread->t_flag &= ~T_DONTPEND;
                             /*
                              * The routine ran with T_DONTPEND set.  If it
                              * left T_WOULDBLOCK behind, mark the cache entry
                              * DUP_DROP and send no reply.
                              */
1689                         if (curthread->t_flag & T_WOULDBLOCK) {
1690                                 curthread->t_flag &= ~T_WOULDBLOCK;
1691                                 SVC_DUPDONE_EXT(xprt, dr, res, NULL,
1692                                     disp->dis_ressz, DUP_DROP);
1693                                 if (res != (char *)&res_buf)
1694                                         SVC_FREERES(xprt);
1695                                 error++;
1696                                 goto done;
1697                         }
                             /*
                              * RPC_AVOIDWORK results are marked DUP_DROP.
                              * All others are marked DUP_DONE together with
                              * their free routine, and dupcached is set so
                              * that the result is not freed again below.
                              */
1698                         if (dis_flags & RPC_AVOIDWORK) {
1699                                 SVC_DUPDONE_EXT(xprt, dr, res, NULL,
1700                                     disp->dis_ressz, DUP_DROP);
1701                         } else {
1702                                 SVC_DUPDONE_EXT(xprt, dr, res,
1703                                     disp->dis_resfree == nullfree ? NULL :
1704                                     disp->dis_resfree,
1705                                     disp->dis_ressz, DUP_DONE);
1706                                 dupcached = TRUE;
1707                         }
1708                         break;
1709                 case DUP_DONE:
1710                         break;
1711                 }
1712
1713         } else {
                     /*
                      * Idempotent procedure: run it directly.  As above, a
                      * routine that leaves T_WOULDBLOCK set gets no reply.
                      */
1714                 curthread->t_flag |= T_DONTPEND;
1715
1716                 (*disp->dis_proc)(args, res, exi, req, cr, ro);
1717
1718                 curthread->t_flag &= ~T_DONTPEND;
1719                 if (curthread->t_flag & T_WOULDBLOCK) {
1720                         curthread->t_flag &= ~T_WOULDBLOCK;
1721                         if (res != (char *)&res_buf)
1722                                 SVC_FREERES(xprt);
1723                         error++;
1724                         goto done;
1725                 }
1726         }
1727
             /*
              * A LOOKUP result carrying WNFSERR_CLNT_FLAVOR is answered with
              * a weak-authentication error instead of the normal reply.
              */
1728         if (auth_tooweak(req, res)) {
1729                 svcerr_weakauth(xprt);
1730                 error++;
1731                 goto done;
1732         }
1733
1734         /*
1735          * Check to see if logging has been enabled on the server.
1736          * If so, then obtain the export info struct to be used for
1737          * the later writing of the log record.  This is done for
1738          * the case that a lookup is done across a non-logged public
1739          * file system.
1740          */
1741         if (nfslog_buffer_list != NULL) {
1742                 nfslog_exi = nfslog_get_exi(ne, exi, req, res, &nfslog_rec_id);
1743                 /*
1744                  * Is logging enabled?
1745                  */
1746                 logging_enabled = (nfslog_exi != NULL);
1747
1748                 /*
1749                  * Copy the netbuf for logging purposes, before it is
1750                  * freed by svc_sendreply().
1751                  */
1752                 if (logging_enabled) {
1753                         NFSLOG_COPY_NETBUF(nfslog_exi, xprt, &nb);
1754                         /*
1755                          * If RPC_MAPRESP flag set (i.e. in V2 ops) the
1756                          * res gets copied directly into the mbuf and
1757                          * may be freed soon after the sendreply. So we
1758                          * must copy it here to a safe place...
1759                          */
1760                         if (res != (char *)&res_buf) {
1761                                 bcopy(res, (char *)&res_buf, disp->dis_ressz);
1762                         }
1763                 }
1764         }
1765
1766         /*
1767          * Serialize and send results struct
              *
              * A result held in a transport-supplied buffer
              * (res != &res_buf) is sent with the fast encoder, unless that
              * is disabled under DEBUG; otherwise the regular encoder is
              * used.
1768          */
1769 #ifdef DEBUG
1770         if (rfs_no_fast_xdrres == 0 && res != (char *)&res_buf)
1771 #else
1772         if (res != (char *)&res_buf)
1773 #endif
1774         {
1775                 if (!svc_sendreply(xprt, disp->dis_fastxdrres, res)) {
1776                         cmn_err(CE_NOTE, "%s: bad sendreply", pgmname);
1777                         svcerr_systemerr(xprt);
1778                         error++;
1779                 }
1780         } else {
1781                 if (!svc_sendreply(xprt, disp->dis_xdrres, res)) {
1782                         cmn_err(CE_NOTE, "%s: bad sendreply", pgmname);
1783                         svcerr_systemerr(xprt);
1784                         error++;
1785                 }
1786         }
1787
1788         /*
1789          * Log if needed
              *
              * The record is written from res_buf, which holds either the
              * result itself or the copy made above; the export reference
              * and the copied netbuf are released afterwards.
1790          */
1791         if (logging_enabled) {
1792                 nfslog_write_record(nfslog_exi, req, args, (char *)&res_buf,
1793                     cr, &nb, nfslog_rec_id, NFSLOG_ONE_BUFFER);
1794                 exi_rele(nfslog_exi);
1795                 kmem_free((&nb)->buf, (&nb)->len);
1796         }
1797
1798         /*
1799          * Free results struct. With the addition of NFS V4 we can
1800          * have non-idempotent procedures with functions.
1801          */
1802         if (disp->dis_resfree != nullfree && dupcached == FALSE) {
1803                 (*disp->dis_resfree)(res);
1804         }
1805
1806 done:
1807         /*
1808          * Free arguments struct
              *
              * disp is still NULL when the version or procedure check
              * failed before an entry was selected.
1809          */
1810         if (disp) {
1811                 if (!SVC_FREEARGS(xprt, disp->dis_xdrargs, args)) {
1812                         cmn_err(CE_NOTE, "%s: bad freeargs", pgmname);
1813                         error++;
1814                 }
1815         } else {
1816                 if (!SVC_FREEARGS(xprt, (xdrproc_t)0, (caddr_t)0)) {
1817                         cmn_err(CE_NOTE, "%s: bad freeargs", pgmname);
1818                         error++;
1819                 }
1820         }
1821
1822         if (exi != NULL)
1823                 exi_rele(exi);
1824
             /*
              * Account for the call: error holds the number of failures
              * recorded above.
              * NOTE(review): this point is also reached when the version
              * check at the top failed, so req->rq_vers may lie outside
              * [min_vers, max_vers] here.  The size of global_svstat_ptr is
              * not visible in this excerpt -- confirm the index cannot run
              * past it.
              */
1825         global_svstat_ptr[req->rq_vers][NFS_BADCALLS].value.ui64 += error;
1826
1827         global_svstat_ptr[req->rq_vers][NFS_CALLS].value.ui64++;
1828 }
1829 
1830 static void
1831 rfs_dispatch(struct svc_req *req, SVCXPRT *xprt)
1832 {
1833         common_dispatch(req, xprt, NFS_VERSMIN, NFS_VERSMAX,
1834             "NFS", rfs_disptable);
1835 }
1836 
/*
 * Names of the ACL version 2 procedures.  The array index is the
 * procedure number, so the order must match acldisptab_v2[].
 */
static char *aclcallnames_v2[] = {
        "ACL2_NULL",            /* 0 */
        "ACL2_GETACL",          /* 1 */
        "ACL2_SETACL",          /* 2 */
        "ACL2_GETATTR",         /* 3 */
        "ACL2_ACCESS",          /* 4 */
        "ACL2_GETXATTRDIR"      /* 5 */
};
1845 
1846 static struct rpcdisp acldisptab_v2[] = {
1847         /*
1848          * ACL VERSION 2
1849          */
1850 
1851         /* ACL2_NULL = 0 */
1852         {rpc_null,
1853             xdr_void, NULL_xdrproc_t, 0,
1854             xdr_void, NULL_xdrproc_t, 0,
1855             nullfree, RPC_IDEMPOTENT,
1856             0},
1857 
1858         /* ACL2_GETACL = 1 */
1859         {acl2_getacl,
1860             xdr_GETACL2args, xdr_fastGETACL2args, sizeof (GETACL2args),
1861             xdr_GETACL2res, NULL_xdrproc_t, sizeof (GETACL2res),
1862             acl2_getacl_free, RPC_IDEMPOTENT,
1863             acl2_getacl_getfh},
1864 
1865         /* ACL2_SETACL = 2 */
1866         {acl2_setacl,
1867             xdr_SETACL2args, NULL_xdrproc_t, sizeof (SETACL2args),
1868 #ifdef _LITTLE_ENDIAN
1869             xdr_SETACL2res, xdr_fastSETACL2res, sizeof (SETACL2res),
1870 #else
1871             xdr_SETACL2res, NULL_xdrproc_t, sizeof (SETACL2res),
1872 #endif
1873             nullfree, RPC_MAPRESP,
1874             acl2_setacl_getfh},
1875 
1876         /* ACL2_GETATTR = 3 */
1877         {acl2_getattr,
1878             xdr_GETATTR2args, xdr_fastGETATTR2args, sizeof (GETATTR2args),
1879 #ifdef _LITTLE_ENDIAN
1880             xdr_GETATTR2res, xdr_fastGETATTR2res, sizeof (GETATTR2res),
1881 #else
1882             xdr_GETATTR2res, NULL_xdrproc_t, sizeof (GETATTR2res),
1883 #endif
1884             nullfree, RPC_IDEMPOTENT|RPC_ALLOWANON|RPC_MAPRESP,
1885             acl2_getattr_getfh},
1886 
1887         /* ACL2_ACCESS = 4 */
1888         {acl2_access,
1889             xdr_ACCESS2args, xdr_fastACCESS2args, sizeof (ACCESS2args),
1890 #ifdef _LITTLE_ENDIAN
1891             xdr_ACCESS2res, xdr_fastACCESS2res, sizeof (ACCESS2res),
1892 #else
1893             xdr_ACCESS2res, NULL_xdrproc_t, sizeof (ACCESS2res),
1894 #endif
1895             nullfree, RPC_IDEMPOTENT|RPC_MAPRESP,
1896             acl2_access_getfh},
1897 
1898         /* ACL2_GETXATTRDIR = 5 */
1899         {acl2_getxattrdir,
1900             xdr_GETXATTRDIR2args, NULL_xdrproc_t, sizeof (GETXATTRDIR2args),
1901             xdr_GETXATTRDIR2res, NULL_xdrproc_t, sizeof (GETXATTRDIR2res),
1902             nullfree, RPC_IDEMPOTENT,
1903             acl2_getxattrdir_getfh},
1904 };
1905 
/*
 * Names of the ACL version 3 procedures.  The array index is the
 * procedure number, so the order must match acldisptab_v3[].
 */
static char *aclcallnames_v3[] = {
        "ACL3_NULL",            /* 0 */
        "ACL3_GETACL",          /* 1 */
        "ACL3_SETACL",          /* 2 */
        "ACL3_GETXATTRDIR"      /* 3 */
};
1912 
1913 static struct rpcdisp acldisptab_v3[] = {
1914         /*
1915          * ACL VERSION 3
1916          */
1917 
1918         /* ACL3_NULL = 0 */
1919         {rpc_null,
1920             xdr_void, NULL_xdrproc_t, 0,
1921             xdr_void, NULL_xdrproc_t, 0,
1922             nullfree, RPC_IDEMPOTENT,
1923             0},
1924 
1925         /* ACL3_GETACL = 1 */
1926         {acl3_getacl,
1927             xdr_GETACL3args, NULL_xdrproc_t, sizeof (GETACL3args),
1928             xdr_GETACL3res, NULL_xdrproc_t, sizeof (GETACL3res),
1929             acl3_getacl_free, RPC_IDEMPOTENT,
1930             acl3_getacl_getfh},
1931 
1932         /* ACL3_SETACL = 2 */
1933         {acl3_setacl,
1934             xdr_SETACL3args, NULL_xdrproc_t, sizeof (SETACL3args),
1935             xdr_SETACL3res, NULL_xdrproc_t, sizeof (SETACL3res),
1936             nullfree, 0,
1937             acl3_setacl_getfh},
1938 
1939         /* ACL3_GETXATTRDIR = 3 */
1940         {acl3_getxattrdir,
1941             xdr_GETXATTRDIR3args, NULL_xdrproc_t, sizeof (GETXATTRDIR3args),
1942             xdr_GETXATTRDIR3res, NULL_xdrproc_t, sizeof (GETXATTRDIR3res),
1943             nullfree, RPC_IDEMPOTENT,
1944             acl3_getxattrdir_getfh},
1945 };
1946 
1947 static struct rpc_disptable acl_disptable[] = {
1948         {sizeof (acldisptab_v2) / sizeof (acldisptab_v2[0]),
1949                 aclcallnames_v2,
1950                 &aclproccnt_v2_ptr, acldisptab_v2},
1951         {sizeof (acldisptab_v3) / sizeof (acldisptab_v3[0]),
1952                 aclcallnames_v3,
1953                 &aclproccnt_v3_ptr, acldisptab_v3},
1954 };
1955 
1956 static void
1957 acl_dispatch(struct svc_req *req, SVCXPRT *xprt)
1958 {
1959         common_dispatch(req, xprt, NFS_ACL_VERSMIN, NFS_ACL_VERSMAX,
1960             "ACL", acl_disptable);
1961 }
1962 
1963 int
1964 checkwin(int flavor, int window, struct svc_req *req)
1965 {
1966         struct authdes_cred *adc;
1967 
1968         switch (flavor) {
1969         case AUTH_DES:
1970                 adc = (struct authdes_cred *)req->rq_clntcred;
1971                 CTASSERT(sizeof (struct authdes_cred) <= RQCRED_SIZE);
1972                 if (adc->adc_fullname.window > window)
1973                         return (0);
1974                 break;
1975 
1976         default:
1977                 break;
1978         }
1979         return (1);
1980 }
1981 
1982 
1983 /*
1984  * checkauth() will check the access permission against the export
1985  * information.  Then map root uid/gid to appropriate uid/gid.
1986  *
1987  * This routine is used by NFS V3 and V2 code.
1988  */
1989 static int
1990 checkauth(struct exportinfo *exi, struct svc_req *req, cred_t *cr, int anon_ok,
1991     bool_t publicfh_ok, bool_t *ro)
1992 {
1993         int i, nfsflavor, rpcflavor, stat, access;
1994         struct secinfo *secp;
1995         caddr_t principal;
1996         char buf[INET6_ADDRSTRLEN]; /* to hold both IPv4 and IPv6 addr */
1997         int anon_res = 0;
1998 
1999         uid_t uid;
2000         gid_t gid;
2001         uint_t ngids;
2002         gid_t *gids;
2003 
2004         /*
2005          * Check for privileged port number
2006          * N.B.:  this assumes that we know the format of a netbuf.
2007          */
2008         if (nfs_portmon) {
2009                 struct sockaddr *ca;
2010                 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2011 
2012                 if (ca == NULL)
2013                         return (0);
2014 
2015                 if ((ca->sa_family == AF_INET &&
2016                     ntohs(((struct sockaddr_in *)ca)->sin_port) >=
2017                     IPPORT_RESERVED) ||
2018                     (ca->sa_family == AF_INET6 &&
2019                     ntohs(((struct sockaddr_in6 *)ca)->sin6_port) >=
2020                     IPPORT_RESERVED)) {
2021                         cmn_err(CE_NOTE,
2022                             "nfs_server: client %s%ssent NFS request from "
2023                             "unprivileged port",
2024                             client_name(req), client_addr(req, buf));
2025                         return (0);
2026                 }
2027         }
2028 
2029         /*
2030          *  return 1 on success or 0 on failure
2031          */
2032         stat = sec_svc_getcred(req, cr, &principal, &nfsflavor);
2033 
2034         /*
2035          * A failed AUTH_UNIX sec_svc_getcred() implies we couldn't set
2036          * the credentials; below we map that to anonymous.
2037          */
2038         if (!stat && nfsflavor != AUTH_UNIX) {
2039                 cmn_err(CE_NOTE,
2040                     "nfs_server: couldn't get unix cred for %s",
2041                     client_name(req));
2042                 return (0);
2043         }
2044 
2045         /*
2046          * Short circuit checkauth() on operations that support the
2047          * public filehandle, and if the request for that operation
2048          * is using the public filehandle. Note that we must call
2049          * sec_svc_getcred() first so that xp_cookie is set to the
2050          * right value. Normally xp_cookie is just the RPC flavor
2051          * of the the request, but in the case of RPCSEC_GSS it
2052          * could be a pseudo flavor.
2053          */
2054         if (publicfh_ok)
2055                 return (1);
2056 
2057         rpcflavor = req->rq_cred.oa_flavor;
2058         /*
2059          * Check if the auth flavor is valid for this export
2060          */
2061         access = nfsauth_access(exi, req, cr, &uid, &gid, &ngids, &gids);
2062         if (access & NFSAUTH_DROP)
2063                 return (-1);    /* drop the request */
2064 
2065         if (access & NFSAUTH_RO)
2066                 *ro = TRUE;
2067 
2068         if (access & NFSAUTH_DENIED) {
2069                 /*
2070                  * If anon_ok == 1 and we got NFSAUTH_DENIED, it was
2071                  * probably due to the flavor not matching during
2072                  * the mount attempt. So map the flavor to AUTH_NONE
2073                  * so that the credentials get mapped to the anonymous
2074                  * user.
2075                  */
2076                 if (anon_ok == 1)
2077                         rpcflavor = AUTH_NONE;
2078                 else
2079                         return (0);     /* deny access */
2080 
2081         } else if (access & NFSAUTH_MAPNONE) {
2082                 /*
2083                  * Access was granted even though the flavor mismatched
2084                  * because AUTH_NONE was one of the exported flavors.
2085                  */
2086                 rpcflavor = AUTH_NONE;
2087 
2088         } else if (access & NFSAUTH_WRONGSEC) {
2089                 /*
2090                  * NFSAUTH_WRONGSEC is used for NFSv4. If we get here,
2091                  * it means a client ignored the list of allowed flavors
2092                  * returned via the MOUNT protocol. So we just disallow it!
2093                  */
2094                 return (0);
2095         }
2096 
2097         if (rpcflavor != AUTH_SYS)
2098                 kmem_free(gids, ngids * sizeof (gid_t));
2099 
2100         switch (rpcflavor) {
2101         case AUTH_NONE:
2102                 anon_res = crsetugid(cr, exi->exi_export.ex_anon,
2103                     exi->exi_export.ex_anon);
2104                 (void) crsetgroups(cr, 0, NULL);
2105                 break;
2106 
2107         case AUTH_UNIX:
2108                 if (!stat || crgetuid(cr) == 0 && !(access & NFSAUTH_UIDMAP)) {
2109                         anon_res = crsetugid(cr, exi->exi_export.ex_anon,
2110                             exi->exi_export.ex_anon);
2111                         (void) crsetgroups(cr, 0, NULL);
2112                 } else if (crgetuid(cr) == 0 && access & NFSAUTH_ROOT) {
2113                         /*
2114                          * It is root, so apply rootid to get real UID
2115                          * Find the secinfo structure.  We should be able
2116                          * to find it by the time we reach here.
2117                          * nfsauth_access() has done the checking.
2118                          */
2119                         secp = NULL;
2120                         for (i = 0; i < exi->exi_export.ex_seccnt; i++) {
2121                                 struct secinfo *sptr;
2122                                 sptr = &exi->exi_export.ex_secinfo[i];
2123                                 if (sptr->s_secinfo.sc_nfsnum == nfsflavor) {
2124                                         secp = sptr;
2125                                         break;
2126                                 }
2127                         }
2128                         if (secp != NULL) {
2129                                 (void) crsetugid(cr, secp->s_rootid,
2130                                     secp->s_rootid);
2131                                 (void) crsetgroups(cr, 0, NULL);
2132                         }
2133                 } else if (crgetuid(cr) != uid || crgetgid(cr) != gid) {
2134                         if (crsetugid(cr, uid, gid) != 0)
2135                                 anon_res = crsetugid(cr,
2136                                     exi->exi_export.ex_anon,
2137                                     exi->exi_export.ex_anon);
2138                         (void) crsetgroups(cr, 0, NULL);
2139                 } else if (access & NFSAUTH_GROUPS) {
2140                         (void) crsetgroups(cr, ngids, gids);
2141                 }
2142 
2143                 kmem_free(gids, ngids * sizeof (gid_t));
2144 
2145                 break;
2146 
2147         case AUTH_DES:
2148         case RPCSEC_GSS:
2149                 /*
2150                  *  Find the secinfo structure.  We should be able
2151                  *  to find it by the time we reach here.
2152                  *  nfsauth_access() has done the checking.
2153                  */
2154                 secp = NULL;
2155                 for (i = 0; i < exi->exi_export.ex_seccnt; i++) {
2156                         if (exi->exi_export.ex_secinfo[i].s_secinfo.sc_nfsnum ==
2157                             nfsflavor) {
2158                                 secp = &exi->exi_export.ex_secinfo[i];
2159                                 break;
2160                         }
2161                 }
2162 
2163                 if (!secp) {
2164                         cmn_err(CE_NOTE, "nfs_server: client %s%shad "
2165                             "no secinfo data for flavor %d",
2166                             client_name(req), client_addr(req, buf),
2167                             nfsflavor);
2168                         return (0);
2169                 }
2170 
2171                 if (!checkwin(rpcflavor, secp->s_window, req)) {
2172                         cmn_err(CE_NOTE,
2173                             "nfs_server: client %s%sused invalid "
2174                             "auth window value",
2175                             client_name(req), client_addr(req, buf));
2176                         return (0);
2177                 }
2178 
2179                 /*
2180                  * Map root principals listed in the share's root= list to root,
2181                  * and map any others principals that were mapped to root by RPC
2182                  * to anon.
2183                  */
2184                 if (principal && sec_svc_inrootlist(rpcflavor, principal,
2185                     secp->s_rootcnt, secp->s_rootnames)) {
2186                         if (crgetuid(cr) == 0 && secp->s_rootid == 0)
2187                                 return (1);
2188 
2189 
2190                         (void) crsetugid(cr, secp->s_rootid, secp->s_rootid);
2191 
2192                         /*
2193                          * NOTE: If and when kernel-land privilege tracing is
2194                          * added this may have to be replaced with code that
2195                          * retrieves root's supplementary groups (e.g., using
2196                          * kgss_get_group_info().  In the meantime principals
2197                          * mapped to uid 0 get all privileges, so setting cr's
2198                          * supplementary groups for them does nothing.
2199                          */
2200                         (void) crsetgroups(cr, 0, NULL);
2201 
2202                         return (1);
2203                 }
2204 
2205                 /*
2206                  * Not a root princ, or not in root list, map UID 0/nobody to
2207                  * the anon ID for the share.  (RPC sets cr's UIDs and GIDs to
2208                  * UID_NOBODY and GID_NOBODY, respectively.)
2209                  */
2210                 if (crgetuid(cr) != 0 &&
2211                     (crgetuid(cr) != UID_NOBODY || crgetgid(cr) != GID_NOBODY))
2212                         return (1);
2213 
2214                 anon_res = crsetugid(cr, exi->exi_export.ex_anon,
2215                     exi->exi_export.ex_anon);
2216                 (void) crsetgroups(cr, 0, NULL);
2217                 break;
2218         default:
2219                 return (0);
2220         } /* switch on rpcflavor */
2221 
2222         /*
2223          * Even if anon access is disallowed via ex_anon == -1, we allow
2224          * this access if anon_ok is set.  So set creds to the default
2225          * "nobody" id.
2226          */
2227         if (anon_res != 0) {
2228                 if (anon_ok == 0) {
2229                         cmn_err(CE_NOTE,
2230                             "nfs_server: client %s%ssent wrong "
2231                             "authentication for %s",
2232                             client_name(req), client_addr(req, buf),
2233                             exi->exi_export.ex_path ?
2234                             exi->exi_export.ex_path : "?");
2235                         return (0);
2236                 }
2237 
2238                 if (crsetugid(cr, UID_NOBODY, GID_NOBODY) != 0)
2239                         return (0);
2240         }
2241 
2242         return (1);
2243 }
2244 
2245 /*
2246  * returns 0 on failure, -1 on a drop, -2 on wrong security flavor,
2247  * and 1 on success
2248  */
2249 int
2250 checkauth4(struct compound_state *cs, struct svc_req *req)
2251 {
2252         int i, rpcflavor, access;
2253         struct secinfo *secp;
2254         char buf[MAXHOST + 1];
2255         int anon_res = 0, nfsflavor;
2256         struct exportinfo *exi;
2257         cred_t  *cr;
2258         caddr_t principal;
2259 
2260         uid_t uid;
2261         gid_t gid;
2262         uint_t ngids;
2263         gid_t *gids;
2264 
2265         exi = cs->exi;
2266         cr = cs->cr;
2267         principal = cs->principal;
2268         nfsflavor = cs->nfsflavor;
2269 
2270         ASSERT(cr != NULL);
2271 
2272         rpcflavor = req->rq_cred.oa_flavor;
2273         cs->access &= ~CS_ACCESS_LIMITED;
2274 
2275         /*
2276          * Check for privileged port number
2277          * N.B.:  this assumes that we know the format of a netbuf.
2278          */
2279         if (nfs_portmon) {
2280                 struct sockaddr *ca;
2281                 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2282 
2283                 if (ca == NULL)
2284                         return (0);
2285 
2286                 if ((ca->sa_family == AF_INET &&
2287                     ntohs(((struct sockaddr_in *)ca)->sin_port) >=
2288                     IPPORT_RESERVED) ||
2289                     (ca->sa_family == AF_INET6 &&
2290                     ntohs(((struct sockaddr_in6 *)ca)->sin6_port) >=
2291                     IPPORT_RESERVED)) {
2292                         cmn_err(CE_NOTE,
2293                             "nfs_server: client %s%ssent NFSv4 request from "
2294                             "unprivileged port",
2295                             client_name(req), client_addr(req, buf));
2296                         return (0);
2297                 }
2298         }
2299 
2300         /*
2301          * Check the access right per auth flavor on the vnode of
2302          * this export for the given request.
2303          */
2304         access = nfsauth4_access(cs->exi, cs->vp, req, cr, &uid, &gid, &ngids,
2305             &gids);
2306 
2307         if (access & NFSAUTH_WRONGSEC)
2308                 return (-2);    /* no access for this security flavor */
2309 
2310         if (access & NFSAUTH_DROP)
2311                 return (-1);    /* drop the request */
2312 
2313         if (access & NFSAUTH_DENIED) {
2314 
2315                 if (exi->exi_export.ex_seccnt > 0)
2316                         return (0);     /* deny access */
2317 
2318         } else if (access & NFSAUTH_LIMITED) {
2319 
2320                 cs->access |= CS_ACCESS_LIMITED;
2321 
2322         } else if (access & NFSAUTH_MAPNONE) {
2323                 /*
2324                  * Access was granted even though the flavor mismatched
2325                  * because AUTH_NONE was one of the exported flavors.
2326                  */
2327                 rpcflavor = AUTH_NONE;
2328         }
2329 
2330         /*
2331          * XXX probably need to redo some of it for nfsv4?
2332          * return 1 on success or 0 on failure
2333          */
2334 
2335         if (rpcflavor != AUTH_SYS)
2336                 kmem_free(gids, ngids * sizeof (gid_t));
2337 
2338         switch (rpcflavor) {
2339         case AUTH_NONE:
2340                 anon_res = crsetugid(cr, exi->exi_export.ex_anon,
2341                     exi->exi_export.ex_anon);
2342                 (void) crsetgroups(cr, 0, NULL);
2343                 break;
2344 
2345         case AUTH_UNIX:
2346                 if (crgetuid(cr) == 0 && !(access & NFSAUTH_UIDMAP)) {
2347                         anon_res = crsetugid(cr, exi->exi_export.ex_anon,
2348                             exi->exi_export.ex_anon);
2349                         (void) crsetgroups(cr, 0, NULL);
2350                 } else if (crgetuid(cr) == 0 && access & NFSAUTH_ROOT) {
2351                         /*
2352                          * It is root, so apply rootid to get real UID
2353                          * Find the secinfo structure.  We should be able
2354                          * to find it by the time we reach here.
2355                          * nfsauth_access() has done the checking.
2356                          */
2357                         secp = NULL;
2358                         for (i = 0; i < exi->exi_export.ex_seccnt; i++) {
2359                                 struct secinfo *sptr;
2360                                 sptr = &exi->exi_export.ex_secinfo[i];
2361                                 if (sptr->s_secinfo.sc_nfsnum == nfsflavor) {
2362                                         secp = &exi->exi_export.ex_secinfo[i];
2363                                         break;
2364                                 }
2365                         }
2366                         if (secp != NULL) {
2367                                 (void) crsetugid(cr, secp->s_rootid,
2368                                     secp->s_rootid);
2369                                 (void) crsetgroups(cr, 0, NULL);
2370                         }
2371                 } else if (crgetuid(cr) != uid || crgetgid(cr) != gid) {
2372                         if (crsetugid(cr, uid, gid) != 0)
2373                                 anon_res = crsetugid(cr,
2374                                     exi->exi_export.ex_anon,
2375                                     exi->exi_export.ex_anon);
2376                         (void) crsetgroups(cr, 0, NULL);
2377                 } if (access & NFSAUTH_GROUPS) {
2378                         (void) crsetgroups(cr, ngids, gids);
2379                 }
2380 
2381                 kmem_free(gids, ngids * sizeof (gid_t));
2382 
2383                 break;
2384 
2385         default:
2386                 /*
2387                  *  Find the secinfo structure.  We should be able
2388                  *  to find it by the time we reach here.
2389                  *  nfsauth_access() has done the checking.
2390                  */
2391                 secp = NULL;
2392                 for (i = 0; i < exi->exi_export.ex_seccnt; i++) {
2393                         if (exi->exi_export.ex_secinfo[i].s_secinfo.sc_nfsnum ==
2394                             nfsflavor) {
2395                                 secp = &exi->exi_export.ex_secinfo[i];
2396                                 break;
2397                         }
2398                 }
2399 
2400                 if (!secp) {
2401                         cmn_err(CE_NOTE, "nfs_server: client %s%shad "
2402                             "no secinfo data for flavor %d",
2403                             client_name(req), client_addr(req, buf),
2404                             nfsflavor);
2405                         return (0);
2406                 }
2407 
2408                 if (!checkwin(rpcflavor, secp->s_window, req)) {
2409                         cmn_err(CE_NOTE,
2410                             "nfs_server: client %s%sused invalid "
2411                             "auth window value",
2412                             client_name(req), client_addr(req, buf));
2413                         return (0);
2414                 }
2415 
2416                 /*
2417                  * Map root principals listed in the share's root= list to root,
2418                  * and map any others principals that were mapped to root by RPC
2419                  * to anon. If not going to anon, set to rootid (root_mapping).
2420                  */
2421                 if (principal && sec_svc_inrootlist(rpcflavor, principal,
2422                     secp->s_rootcnt, secp->s_rootnames)) {
2423                         if (crgetuid(cr) == 0 && secp->s_rootid == 0)
2424                                 return (1);
2425 
2426                         (void) crsetugid(cr, secp->s_rootid, secp->s_rootid);
2427 
2428                         /*
2429                          * NOTE: If and when kernel-land privilege tracing is
2430                          * added this may have to be replaced with code that
2431                          * retrieves root's supplementary groups (e.g., using
2432                          * kgss_get_group_info().  In the meantime principals
2433                          * mapped to uid 0 get all privileges, so setting cr's
2434                          * supplementary groups for them does nothing.
2435                          */
2436                         (void) crsetgroups(cr, 0, NULL);
2437 
2438                         return (1);
2439                 }
2440 
2441                 /*
2442                  * Not a root princ, or not in root list, map UID 0/nobody to
2443                  * the anon ID for the share.  (RPC sets cr's UIDs and GIDs to
2444                  * UID_NOBODY and GID_NOBODY, respectively.)
2445                  */
2446                 if (crgetuid(cr) != 0 &&
2447                     (crgetuid(cr) != UID_NOBODY || crgetgid(cr) != GID_NOBODY))
2448                         return (1);
2449 
2450                 anon_res = crsetugid(cr, exi->exi_export.ex_anon,
2451                     exi->exi_export.ex_anon);
2452                 (void) crsetgroups(cr, 0, NULL);
2453                 break;
2454         } /* switch on rpcflavor */
2455 
2456         /*
2457          * Even if anon access is disallowed via ex_anon == -1, we allow
2458          * this access if anon_ok is set.  So set creds to the default
2459          * "nobody" id.
2460          */
2461 
2462         if (anon_res != 0) {
2463                 cmn_err(CE_NOTE,
2464                     "nfs_server: client %s%ssent wrong "
2465                     "authentication for %s",
2466                     client_name(req), client_addr(req, buf),
2467                     exi->exi_export.ex_path ?
2468                     exi->exi_export.ex_path : "?");
2469                 return (0);
2470         }
2471 
2472         return (1);
2473 }
2474 
2475 
2476 static char *
2477 client_name(struct svc_req *req)
2478 {
2479         char *hostname = NULL;
2480 
2481         /*
2482          * If it's a Unix cred then use the
2483          * hostname from the credential.
2484          */
2485         if (req->rq_cred.oa_flavor == AUTH_UNIX) {
2486                 hostname = ((struct authunix_parms *)
2487                     req->rq_clntcred)->aup_machname;
2488         }
2489         if (hostname == NULL)
2490                 hostname = "";
2491 
2492         return (hostname);
2493 }
2494 
2495 static char *
2496 client_addr(struct svc_req *req, char *buf)
2497 {
2498         struct sockaddr *ca;
2499         uchar_t *b;
2500         char *frontspace = "";
2501 
2502         /*
2503          * We assume we are called in tandem with client_name and the
2504          * format string looks like "...client %s%sblah blah..."
2505          *
2506          * If it's a Unix cred then client_name returned
2507          * a host name, so we need insert a space between host name
2508          * and IP address.
2509          */
2510         if (req->rq_cred.oa_flavor == AUTH_UNIX)
2511                 frontspace = " ";
2512 
2513         /*
2514          * Convert the caller's IP address to a dotted string
2515          */
2516         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2517 
2518         if (ca->sa_family == AF_INET) {
2519                 b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
2520                 (void) sprintf(buf, "%s(%d.%d.%d.%d) ", frontspace,
2521                     b[0] & 0xFF, b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
2522         } else if (ca->sa_family == AF_INET6) {
2523                 struct sockaddr_in6 *sin6;
2524                 sin6 = (struct sockaddr_in6 *)ca;
2525                 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
2526                     buf, INET6_ADDRSTRLEN);
2527 
2528         } else {
2529 
2530                 /*
2531                  * No IP address to print. If there was a host name
2532                  * printed, then we print a space.
2533                  */
2534                 (void) sprintf(buf, frontspace);
2535         }
2536 
2537         return (buf);
2538 }
2539 
2540 /*
2541  * NFS Server initialization routine.  This routine should only be called
2542  * once.  It performs the following tasks:
2543  *      - Call sub-initialization routines (localize access to variables)
2544  *      - Initialize all locks
2545  *      - initialize the version 3 write verifier
2546  */
2547 void
2548 nfs_srvinit(void)
2549 {
2550         /* NFS server zone-specific global variables */
2551         zone_key_create(&nfssrv_zone_key, nfs_srv_zone_init,
2552             NULL, nfs_srv_zone_fini);
2553 
2554         nfs_exportinit();
2555         rfs_srvrinit();
2556         rfs3_srvrinit();
2557         rfs4_srvrinit();
2558         nfsauth_init();
2559 }
2560 
2561 /*
2562  * NFS Server finalization routine. This routine is called to cleanup the
2563  * initialization work previously performed if the NFS server module could
2564  * not be loaded correctly.
2565  */
2566 void
2567 nfs_srvfini(void)
2568 {
2569         nfsauth_fini();
2570         rfs4_srvrfini();
2571         rfs3_srvrfini();
2572         rfs_srvrfini();
2573         nfs_exportfini();
2574 
2575         (void) zone_key_delete(nfssrv_zone_key);
2576 }
2577 
2578 /* ARGSUSED */
2579 static void *
2580 nfs_srv_zone_init(zoneid_t zoneid)
2581 {
2582         nfs_globals_t *ng;
2583 
2584         ng = kmem_zalloc(sizeof (*ng), KM_SLEEP);
2585 
2586         ng->nfs_versmin = NFS_VERSMIN_DEFAULT;
2587         ng->nfs_versmax = NFS_VERSMAX_DEFAULT;
2588 
2589         /* Init the stuff to control start/stop */
2590         ng->nfs_server_upordown = NFS_SERVER_STOPPED;
2591         mutex_init(&ng->nfs_server_upordown_lock, NULL, MUTEX_DEFAULT, NULL);
2592         cv_init(&ng->nfs_server_upordown_cv, NULL, CV_DEFAULT, NULL);
2593         mutex_init(&ng->rdma_wait_mutex, NULL, MUTEX_DEFAULT, NULL);
2594         cv_init(&ng->rdma_wait_cv, NULL, CV_DEFAULT, NULL);
2595 
2596         return (ng);
2597 }
2598 
2599 /* ARGSUSED */
2600 static void
2601 nfs_srv_zone_fini(zoneid_t zoneid, void *data)
2602 {
2603         nfs_globals_t *ng;
2604 
2605         ng = (nfs_globals_t *)data;
2606         mutex_destroy(&ng->nfs_server_upordown_lock);
2607         cv_destroy(&ng->nfs_server_upordown_cv);
2608         mutex_destroy(&ng->rdma_wait_mutex);
2609         cv_destroy(&ng->rdma_wait_cv);
2610 
2611         kmem_free(ng, sizeof (*ng));
2612 }
2613 
2614 /*
2615  * Set up an iovec array of up to cnt pointers.
2616  */
2617 void
2618 mblk_to_iov(mblk_t *m, int cnt, struct iovec *iovp)
2619 {
2620         while (m != NULL && cnt-- > 0) {
2621                 iovp->iov_base = (caddr_t)m->b_rptr;
2622                 iovp->iov_len = (m->b_wptr - m->b_rptr);
2623                 iovp++;
2624                 m = m->b_cont;
2625         }
2626 }
2627 
2628 /*
2629  * Common code between NFS Version 2 and NFS Version 3 for the public
2630  * filehandle multicomponent lookups.
2631  */
2632 
2633 /*
2634  * Public filehandle evaluation of a multi-component lookup, following
2635  * symbolic links, if necessary. This may result in a vnode in another
2636  * filesystem, which is OK as long as the other filesystem is exported.
2637  *
2638  * Note that the exi will be set either to NULL or a new reference to the
2639  * exportinfo struct that corresponds to the vnode of the multi-component path.
2640  * It is the callers responsibility to release this reference.
2641  */
2642 int
2643 rfs_publicfh_mclookup(char *p, vnode_t *dvp, cred_t *cr, vnode_t **vpp,
2644     struct exportinfo **exi, struct sec_ol *sec)
2645 {
2646         int pathflag;
2647         vnode_t *mc_dvp = NULL;
2648         vnode_t *realvp;
2649         int error;
2650 
2651         *exi = NULL;
2652 
2653         /*
2654          * check if the given path is a url or native path. Since p is
2655          * modified by MCLpath(), it may be empty after returning from
2656          * there, and should be checked.
2657          */
2658         if ((pathflag = MCLpath(&p)) == -1)
2659                 return (EIO);
2660 
2661         /*
2662          * If pathflag is SECURITY_QUERY, turn the SEC_QUERY bit
2663          * on in sec->sec_flags. This bit will later serve as an
2664          * indication in makefh_ol() or makefh3_ol() to overload the
2665          * filehandle to contain the sec modes used by the server for
2666          * the path.
2667          */
2668         if (pathflag == SECURITY_QUERY) {
2669                 if ((sec->sec_index = (uint_t)(*p)) > 0) {
2670                         sec->sec_flags |= SEC_QUERY;
2671                         p++;
2672                         if ((pathflag = MCLpath(&p)) == -1)
2673                                 return (EIO);
2674                 } else {
2675                         cmn_err(CE_NOTE,
2676                             "nfs_server: invalid security index %d, "
2677                             "violating WebNFS SNEGO protocol.", sec->sec_index);
2678                         return (EIO);
2679                 }
2680         }
2681 
2682         if (p[0] == '\0') {
2683                 error = ENOENT;
2684                 goto publicfh_done;
2685         }
2686 
2687         error = rfs_pathname(p, &mc_dvp, vpp, dvp, cr, pathflag);
2688 
2689         /*
2690          * If name resolves to "/" we get EINVAL since we asked for
2691          * the vnode of the directory that the file is in. Try again
2692          * with NULL directory vnode.
2693          */
2694         if (error == EINVAL) {
2695                 error = rfs_pathname(p, NULL, vpp, dvp, cr, pathflag);
2696                 if (!error) {
2697                         ASSERT(*vpp != NULL);
2698                         if ((*vpp)->v_type == VDIR) {
2699                                 VN_HOLD(*vpp);
2700                                 mc_dvp = *vpp;
2701                         } else {
2702                                 /*
2703                                  * This should not happen, the filesystem is
2704                                  * in an inconsistent state. Fail the lookup
2705                                  * at this point.
2706                                  */
2707                                 VN_RELE(*vpp);
2708                                 error = EINVAL;
2709                         }
2710                 }
2711         }
2712 
2713         if (error)
2714                 goto publicfh_done;
2715 
2716         if (*vpp == NULL) {
2717                 error = ENOENT;
2718                 goto publicfh_done;
2719         }
2720 
2721         ASSERT(mc_dvp != NULL);
2722         ASSERT(*vpp != NULL);
2723 
2724         if ((*vpp)->v_type == VDIR) {
2725                 do {
2726                         /*
2727                          * *vpp may be an AutoFS node, so we perform
2728                          * a VOP_ACCESS() to trigger the mount of the intended
2729                          * filesystem, so we can perform the lookup in the
2730                          * intended filesystem.
2731                          */
2732                         (void) VOP_ACCESS(*vpp, 0, 0, cr, NULL);
2733 
2734                         /*
2735                          * If vnode is covered, get the
2736                          * the topmost vnode.
2737                          */
2738                         if (vn_mountedvfs(*vpp) != NULL) {
2739                                 error = traverse(vpp);
2740                                 if (error) {
2741                                         VN_RELE(*vpp);
2742                                         goto publicfh_done;
2743                                 }
2744                         }
2745 
2746                         if (VOP_REALVP(*vpp, &realvp, NULL) == 0 &&
2747                             realvp != *vpp) {
2748                                 /*
2749                                  * If realvp is different from *vpp
2750                                  * then release our reference on *vpp, so that
2751                                  * the export access check be performed on the
2752                                  * real filesystem instead.
2753                                  */
2754                                 VN_HOLD(realvp);
2755                                 VN_RELE(*vpp);
2756                                 *vpp = realvp;
2757                         } else {
2758                                 break;
2759                         }
2760                 /* LINTED */
2761                 } while (TRUE);
2762 
2763                 /*
2764                  * Let nfs_vptexi() figure what the real parent is.
2765                  */
2766                 VN_RELE(mc_dvp);
2767                 mc_dvp = NULL;
2768 
2769         } else {
2770                 /*
2771                  * If vnode is covered, get the
2772                  * the topmost vnode.
2773                  */
2774                 if (vn_mountedvfs(mc_dvp) != NULL) {
2775                         error = traverse(&mc_dvp);
2776                         if (error) {
2777                                 VN_RELE(*vpp);
2778                                 goto publicfh_done;
2779                         }
2780                 }
2781 
2782                 if (VOP_REALVP(mc_dvp, &realvp, NULL) == 0 &&
2783                     realvp != mc_dvp) {
2784                         /*
2785                          * *vpp is a file, obtain realvp of the parent
2786                          * directory vnode.
2787                          */
2788                         VN_HOLD(realvp);
2789                         VN_RELE(mc_dvp);
2790                         mc_dvp = realvp;
2791                 }
2792         }
2793 
2794         /*
2795          * The pathname may take us from the public filesystem to another.
2796          * If that's the case then just set the exportinfo to the new export
2797          * and build filehandle for it. Thanks to per-access checking there's
2798          * no security issues with doing this. If the client is not allowed
2799          * access to this new export then it will get an access error when it
2800          * tries to use the filehandle
2801          */
2802         if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
2803                 VN_RELE(*vpp);
2804                 goto publicfh_done;
2805         }
2806 
2807         /*
2808          * Not allowed access to pseudo exports.
2809          */
2810         if (PSEUDO(*exi)) {
2811                 error = ENOENT;
2812                 VN_RELE(*vpp);
2813                 goto publicfh_done;
2814         }
2815 
2816         /*
2817          * Do a lookup for the index file. We know the index option doesn't
2818          * allow paths through handling in the share command, so mc_dvp will
2819          * be the parent for the index file vnode, if its present. Use
2820          * temporary pointers to preserve and reuse the vnode pointers of the
2821          * original directory in case there's no index file. Note that the
2822          * index file is a native path, and should not be interpreted by
2823          * the URL parser in rfs_pathname()
2824          */
2825         if (((*exi)->exi_export.ex_flags & EX_INDEX) &&
2826             ((*vpp)->v_type == VDIR) && (pathflag == URLPATH)) {
2827                 vnode_t *tvp, *tmc_dvp; /* temporary vnode pointers */
2828 
2829                 tmc_dvp = mc_dvp;
2830                 mc_dvp = tvp = *vpp;
2831 
2832                 error = rfs_pathname((*exi)->exi_export.ex_index, NULL, vpp,
2833                     mc_dvp, cr, NATIVEPATH);
2834 
2835                 if (error == ENOENT) {
2836                         *vpp = tvp;
2837                         mc_dvp = tmc_dvp;
2838                         error = 0;
2839                 } else {        /* ok or error other than ENOENT */
2840                         if (tmc_dvp)
2841                                 VN_RELE(tmc_dvp);
2842                         if (error)
2843                                 goto publicfh_done;
2844 
2845                         /*
2846                          * Found a valid vp for index "filename". Sanity check
2847                          * for odd case where a directory is provided as index
2848                          * option argument and leads us to another filesystem
2849                          */
2850 
2851                         /* Release the reference on the old exi value */
2852                         ASSERT(*exi != NULL);
2853                         exi_rele(*exi);
2854 
2855                         if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) {
2856                                 VN_RELE(*vpp);
2857                                 goto publicfh_done;
2858                         }
2859                 }
2860         }
2861 
2862 publicfh_done:
2863         if (mc_dvp)
2864                 VN_RELE(mc_dvp);
2865 
2866         return (error);
2867 }
2868 
2869 /*
2870  * Evaluate a multi-component path
2871  */
2872 int
2873 rfs_pathname(
2874         char *path,                     /* pathname to evaluate */
2875         vnode_t **dirvpp,               /* ret for ptr to parent dir vnode */
2876         vnode_t **compvpp,              /* ret for ptr to component vnode */
2877         vnode_t *startdvp,              /* starting vnode */
2878         cred_t *cr,                     /* user's credential */
2879         int pathflag)                   /* flag to identify path, e.g. URL */
2880 {
2881         char namebuf[TYPICALMAXPATHLEN];
2882         struct pathname pn;
2883         int error;
2884 
2885         /*
2886          * If pathname starts with '/', then set startdvp to root.
2887          */
2888         if (*path == '/') {
2889                 while (*path == '/')
2890                         path++;
2891 
2892                 startdvp = ZONE_ROOTVP();
2893         }
2894 
2895         error = pn_get_buf(path, UIO_SYSSPACE, &pn, namebuf, sizeof (namebuf));
2896         if (error == 0) {
2897                 /*
2898                  * Call the URL parser for URL paths to modify the original
2899                  * string to handle any '%' encoded characters that exist.
2900                  * Done here to avoid an extra bcopy in the lookup.
2901                  * We need to be careful about pathlen's. We know that
2902                  * rfs_pathname() is called with a non-empty path. However,
2903                  * it could be emptied due to the path simply being all /'s,
2904                  * which is valid to proceed with the lookup, or due to the
2905                  * URL parser finding an encoded null character at the
2906                  * beginning of path which should not proceed with the lookup.
2907                  */
2908                 if (pn.pn_pathlen != 0 && pathflag == URLPATH) {
2909                         URLparse(pn.pn_path);
2910                         if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0)
2911                                 return (ENOENT);
2912                 }
2913                 VN_HOLD(startdvp);
2914                 error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
2915                     ZONE_ROOTVP(), startdvp, cr);
2916         }
2917         if (error == ENAMETOOLONG) {
2918                 /*
2919                  * This thread used a pathname > TYPICALMAXPATHLEN bytes long.
2920                  */
2921                 if (error = pn_get(path, UIO_SYSSPACE, &pn))
2922                         return (error);
2923                 if (pn.pn_pathlen != 0 && pathflag == URLPATH) {
2924                         URLparse(pn.pn_path);
2925                         if ((pn.pn_pathlen = strlen(pn.pn_path)) == 0) {
2926                                 pn_free(&pn);
2927                                 return (ENOENT);
2928                         }
2929                 }
2930                 VN_HOLD(startdvp);
2931                 error = lookuppnvp(&pn, NULL, NO_FOLLOW, dirvpp, compvpp,
2932                     ZONE_ROOTVP(), startdvp, cr);
2933                 pn_free(&pn);
2934         }
2935 
2936         return (error);
2937 }
2938 
2939 /*
2940  * Adapt the multicomponent lookup path depending on the pathtype
2941  */
2942 static int
2943 MCLpath(char **path)
2944 {
2945         unsigned char c = (unsigned char)**path;
2946 
2947         /*
2948          * If the MCL path is between 0x20 and 0x7E (graphic printable
2949          * character of the US-ASCII coded character set), its a URL path,
2950          * per RFC 1738.
2951          */
2952         if (c >= 0x20 && c <= 0x7E)
2953                 return (URLPATH);
2954 
2955         /*
2956          * If the first octet of the MCL path is not an ASCII character
2957          * then it must be interpreted as a tag value that describes the
2958          * format of the remaining octets of the MCL path.
2959          *
2960          * If the first octet of the MCL path is 0x81 it is a query
2961          * for the security info.
2962          */
2963         switch (c) {
2964         case 0x80:      /* native path, i.e. MCL via mount protocol */
2965                 (*path)++;
2966                 return (NATIVEPATH);
2967         case 0x81:      /* security query */
2968                 (*path)++;
2969                 return (SECURITY_QUERY);
2970         default:
2971                 return (-1);
2972         }
2973 }
2974 
/*
 * Value of the hexadecimal digit character c, or 0 when c is not a
 * hexadecimal digit.  The argument is parenthesized so that an arbitrary
 * expression can be passed; note that it is still evaluated more than
 * once, so it must not have side effects.
 */
#define	fromhex(c)	(((c) >= '0' && (c) <= '9') ? ((c) - '0') : \
			(((c) >= 'A' && (c) <= 'F') ? ((c) - 'A' + 10) : \
			(((c) >= 'a' && (c) <= 'f') ? ((c) - 'a' + 10) : 0)))

/*
 * Decode '%' escapes in str, in place.
 *
 * Each '%' followed by two characters is replaced by the single character
 * whose value those two hexadecimal digits encode.  Since every escape
 * shrinks the string, the result always fits in the original buffer.
 *
 * Edge cases, as implemented:
 *	- a character that is not a hexadecimal digit counts as 0;
 *	- '%' followed by only one character before the end of the string
 *	  yields that digit as the high nibble (low nibble 0);
 *	- a '%' that is the last character is copied unchanged;
 *	- an escape that decodes to 0 stores a null character, so the
 *	  string as seen by strlen() ends there.
 */
static void
URLparse(char *str)
{
	char *p, *q;

	/* p reads ahead; q writes and never passes p. */
	p = q = str;
	while (*p) {
		*q = *p;
		if (*p++ == '%') {
			if (*p) {
				/* High nibble replaces the copied '%'. */
				*q = fromhex(*p) * 16;
				p++;
				if (*p) {
					*q += fromhex(*p);
					p++;
				}
			}
		}
		q++;
	}
	*q = '\0';
}
3006 
3007 
3008 /*
3009  * Get the export information for the lookup vnode, and verify its
3010  * useable.
3011  */
3012 int
3013 nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr,
3014     struct exportinfo **exi)
3015 {
3016         int walk;
3017         int error = 0;
3018 
3019         *exi = nfs_vptoexi(mc_dvp, vp, cr, &walk, NULL, FALSE);
3020         if (*exi == NULL)
3021                 error = EACCES;
3022         else {
3023                 /*
3024                  * If nosub is set for this export then
3025                  * a lookup relative to the public fh
3026                  * must not terminate below the
3027                  * exported directory.
3028                  */
3029                 if ((*exi)->exi_export.ex_flags & EX_NOSUB && walk > 0)
3030                         error = EACCES;
3031         }
3032 
3033         return (error);
3034 }
3035 
3036 /*
3037  * Used by NFSv3 and NFSv4 server to query label of
3038  * a pathname component during lookup/access ops.
3039  */
3040 ts_label_t *
3041 nfs_getflabel(vnode_t *vp, struct exportinfo *exi)
3042 {
3043         zone_t *zone;
3044         ts_label_t *zone_label;
3045         char *path;
3046 
3047         mutex_enter(&vp->v_lock);
3048         if (vp->v_path != vn_vpath_empty) {
3049                 zone = zone_find_by_any_path(vp->v_path, B_FALSE);
3050                 mutex_exit(&vp->v_lock);
3051         } else {
3052                 /*
3053                  * v_path not cached. Fall back on pathname of exported
3054                  * file system as we rely on pathname from which we can
3055                  * derive a label. The exported file system portion of
3056                  * path is sufficient to obtain a label.
3057                  */
3058                 path = exi->exi_export.ex_path;
3059                 if (path == NULL) {
3060                         mutex_exit(&vp->v_lock);
3061                         return (NULL);
3062                 }
3063                 zone = zone_find_by_any_path(path, B_FALSE);
3064                 mutex_exit(&vp->v_lock);
3065         }
3066         /*
3067          * Caller has verified that the file is either
3068          * exported or visible. So if the path falls in
3069          * global zone, admin_low is returned; otherwise
3070          * the zone's label is returned.
3071          */
3072         zone_label = zone->zone_slabel;
3073         label_hold(zone_label);
3074         zone_rele(zone);
3075         return (zone_label);
3076 }
3077 
3078 /*
3079  * TX NFS routine used by NFSv3 and NFSv4 to do label check
3080  * on client label and server's file object lable.
3081  */
3082 boolean_t
3083 do_rfs_label_check(bslabel_t *clabel, vnode_t *vp, int flag,
3084     struct exportinfo *exi)
3085 {
3086         bslabel_t *slabel;
3087         ts_label_t *tslabel;
3088         boolean_t result;
3089 
3090         if ((tslabel = nfs_getflabel(vp, exi)) == NULL) {
3091                 return (B_FALSE);
3092         }
3093         slabel = label2bslabel(tslabel);
3094         DTRACE_PROBE4(tx__rfs__log__info__labelcheck, char *,
3095             "comparing server's file label(1) with client label(2) (vp(3))",
3096             bslabel_t *, slabel, bslabel_t *, clabel, vnode_t *, vp);
3097 
3098         if (flag == EQUALITY_CHECK)
3099                 result = blequal(clabel, slabel);
3100         else
3101                 result = bldominates(clabel, slabel);
3102         label_rele(tslabel);
3103         return (result);
3104 }
3105 
3106 /*
3107  * Callback function to return the loaned buffers.
3108  * Calls VOP_RETZCBUF() only after all uio_iov[]
3109  * buffers are returned. nu_ref maintains the count.
3110  */
3111 void
3112 rfs_free_xuio(void *free_arg)
3113 {
3114         uint_t ref;
3115         nfs_xuio_t *nfsuiop = (nfs_xuio_t *)free_arg;
3116 
3117         ref = atomic_dec_uint_nv(&nfsuiop->nu_ref);
3118 
3119         /*
3120          * Call VOP_RETZCBUF() only when all the iov buffers
3121          * are sent OTW.
3122          */
3123         if (ref != 0)
3124                 return;
3125 
3126         if (((uio_t *)nfsuiop)->uio_extflg & UIO_XUIO) {
3127                 (void) VOP_RETZCBUF(nfsuiop->nu_vp, (xuio_t *)free_arg, NULL,
3128                     NULL);
3129                 VN_RELE(nfsuiop->nu_vp);
3130         }
3131 
3132         kmem_cache_free(nfs_xuio_cache, free_arg);
3133 }
3134 
3135 xuio_t *
3136 rfs_setup_xuio(vnode_t *vp)
3137 {
3138         nfs_xuio_t *nfsuiop;
3139 
3140         nfsuiop = kmem_cache_alloc(nfs_xuio_cache, KM_SLEEP);
3141 
3142         bzero(nfsuiop, sizeof (nfs_xuio_t));
3143         nfsuiop->nu_vp = vp;
3144 
3145         /*
3146          * ref count set to 1. more may be added
3147          * if multiple mblks refer to multiple iov's.
3148          * This is done in uio_to_mblk().
3149          */
3150 
3151         nfsuiop->nu_ref = 1;
3152 
3153         nfsuiop->nu_frtn.free_func = rfs_free_xuio;
3154         nfsuiop->nu_frtn.free_arg = (char *)nfsuiop;
3155 
3156         nfsuiop->nu_uio.xu_type = UIOTYPE_ZEROCOPY;
3157 
3158         return (&nfsuiop->nu_uio);
3159 }
3160 
3161 mblk_t *
3162 uio_to_mblk(uio_t *uiop)
3163 {
3164         struct iovec *iovp;
3165         int i;
3166         mblk_t *mp, *mp1;
3167         nfs_xuio_t *nfsuiop = (nfs_xuio_t *)uiop;
3168 
3169         if (uiop->uio_iovcnt == 0)
3170                 return (NULL);
3171 
3172         iovp = uiop->uio_iov;
3173         mp = mp1 = esballoca((uchar_t *)iovp->iov_base, iovp->iov_len,
3174             BPRI_MED, &nfsuiop->nu_frtn);
3175         ASSERT(mp != NULL);
3176 
3177         mp->b_wptr += iovp->iov_len;
3178         mp->b_datap->db_type = M_DATA;
3179 
3180         for (i = 1; i < uiop->uio_iovcnt; i++) {
3181                 iovp = (uiop->uio_iov + i);
3182 
3183                 mp1->b_cont = esballoca(
3184                     (uchar_t *)iovp->iov_base, iovp->iov_len, BPRI_MED,
3185                     &nfsuiop->nu_frtn);
3186 
3187                 mp1 = mp1->b_cont;
3188                 ASSERT(mp1 != NULL);
3189                 mp1->b_wptr += iovp->iov_len;
3190                 mp1->b_datap->db_type = M_DATA;
3191         }
3192 
3193         nfsuiop->nu_ref = uiop->uio_iovcnt;
3194 
3195         return (mp);
3196 }
3197 
3198 /*
3199  * Allocate memory to hold data for a read request of len bytes.
3200  *
3201  * We don't allocate buffers greater than kmem_max_cached in size to avoid
3202  * allocating memory from the kmem_oversized arena.  If we allocate oversized
3203  * buffers, we incur heavy cross-call activity when freeing these large buffers
3204  * in the TCP receive path. Note that we can't set b_wptr here since the
3205  * length of the data returned may differ from the length requested when
3206  * reading the end of a file; we set b_wptr in rfs_rndup_mblks() once the
3207  * length of the read is known.
3208  */
3209 mblk_t *
3210 rfs_read_alloc(uint_t len, struct iovec **iov, int *iovcnt)
3211 {
3212         struct iovec *iovarr;
3213         mblk_t *mp, **mpp = &mp;
3214         size_t mpsize;
3215         uint_t remain = len;
3216         int i, err = 0;
3217 
3218         *iovcnt = howmany(len, kmem_max_cached);
3219 
3220         iovarr = kmem_alloc(*iovcnt * sizeof (struct iovec), KM_SLEEP);
3221         *iov = iovarr;
3222 
3223         for (i = 0; i < *iovcnt; remain -= mpsize, i++) {
3224                 ASSERT(remain <= len);
3225                 /*
3226                  * We roundup the size we allocate to a multiple of
3227                  * BYTES_PER_XDR_UNIT (4 bytes) so that the call to
3228                  * xdrmblk_putmblk() never fails.
3229                  */
3230                 ASSERT(kmem_max_cached % BYTES_PER_XDR_UNIT == 0);
3231                 mpsize = MIN(kmem_max_cached, remain);
3232                 *mpp = allocb_wait(RNDUP(mpsize), BPRI_MED, STR_NOSIG, &err);
3233                 ASSERT(*mpp != NULL);
3234                 ASSERT(err == 0);
3235 
3236                 iovarr[i].iov_base = (caddr_t)(*mpp)->b_rptr;
3237                 iovarr[i].iov_len = mpsize;
3238                 mpp = &(*mpp)->b_cont;
3239         }
3240         return (mp);
3241 }
3242 
3243 void
3244 rfs_rndup_mblks(mblk_t *mp, uint_t len, int buf_loaned)
3245 {
3246         int i;
3247         int alloc_err = 0;
3248         mblk_t *rmp;
3249         uint_t mpsize, remainder;
3250 
3251         remainder = P2NPHASE(len, BYTES_PER_XDR_UNIT);
3252 
3253         /*
3254          * Non copy-reduction case.  This function assumes that blocks were
3255          * allocated in multiples of BYTES_PER_XDR_UNIT bytes, which makes this
3256          * padding safe without bounds checking.
3257          */
3258         if (!buf_loaned) {
3259                 /*
3260                  * Set the size of each mblk in the chain until we've consumed
3261                  * the specified length for all but the last one.
3262                  */
3263                 while ((mpsize = MBLKSIZE(mp)) < len) {
3264                         ASSERT(mpsize % BYTES_PER_XDR_UNIT == 0);
3265                         mp->b_wptr += mpsize;
3266                         len -= mpsize;
3267                         mp = mp->b_cont;
3268                         ASSERT(mp != NULL);
3269                 }
3270 
3271                 ASSERT(len + remainder <= mpsize);
3272                 mp->b_wptr += len;
3273                 for (i = 0; i < remainder; i++)
3274                         *mp->b_wptr++ = '\0';
3275                 return;
3276         }
3277 
3278         /*
3279          * No remainder mblk required.
3280          */
3281         if (remainder == 0)
3282                 return;
3283 
3284         /*
3285          * Get to the last mblk in the chain.
3286          */
3287         while (mp->b_cont != NULL)
3288                 mp = mp->b_cont;
3289 
3290         /*
3291          * In case of copy-reduction mblks, the size of the mblks are fixed
3292          * and are of the size of the loaned buffers.  Allocate a remainder
3293          * mblk and chain it to the data buffers. This is sub-optimal, but not
3294          * expected to happen commonly.
3295          */
3296         rmp = allocb_wait(remainder, BPRI_MED, STR_NOSIG, &alloc_err);
3297         ASSERT(rmp != NULL);
3298         ASSERT(alloc_err == 0);
3299 
3300         for (i = 0; i < remainder; i++)
3301                 *rmp->b_wptr++ = '\0';
3302 
3303         rmp->b_datap->db_type = M_DATA;
3304         mp->b_cont = rmp;
3305 }